author		Ingo Molnar <mingo@elte.hu>	2009-01-10 20:42:53 -0500
committer	Ingo Molnar <mingo@elte.hu>	2009-01-10 20:42:53 -0500
commit		506c10f26c481b7f8ef27c1c79290f68989b2e9e (patch)
tree		03de82e812f00957aa6276dac2fe51c3358e88d7 /fs
parent		e1df957670aef74ffd9a4ad93e6d2c90bf6b4845 (diff)
parent		c59765042f53a79a7a65585042ff463b69cb248c (diff)

Merge commit 'v2.6.29-rc1' into perfcounters/core

Conflicts:
	include/linux/kernel_stat.h
Diffstat (limited to 'fs')
-rw-r--r-- fs/Kconfig | 137
-rw-r--r-- fs/Kconfig.binfmt | 2
-rw-r--r-- fs/Makefile | 8
-rw-r--r-- fs/affs/file.c | 2
-rw-r--r-- fs/affs/inode.c | 3
-rw-r--r-- fs/afs/write.c | 2
-rw-r--r-- fs/aio.c | 100
-rw-r--r-- fs/anon_inodes.c | 7
-rw-r--r-- fs/autofs/inode.c | 2
-rw-r--r-- fs/autofs4/autofs_i.h | 2
-rw-r--r-- fs/autofs4/dev-ioctl.c | 75
-rw-r--r-- fs/autofs4/expire.c | 4
-rw-r--r-- fs/autofs4/inode.c | 18
-rw-r--r-- fs/autofs4/waitq.c | 8
-rw-r--r-- fs/bad_inode.c | 6
-rw-r--r-- fs/befs/linuxvfs.c | 5
-rw-r--r-- fs/bfs/inode.c | 45
-rw-r--r-- fs/binfmt_aout.c | 81
-rw-r--r-- fs/binfmt_elf.c | 12
-rw-r--r-- fs/binfmt_elf_fdpic.c | 35
-rw-r--r-- fs/binfmt_flat.c | 34
-rw-r--r-- fs/binfmt_misc.c | 5
-rw-r--r-- fs/bio-integrity.c | 2
-rw-r--r-- fs/bio.c | 356
-rw-r--r-- fs/block_dev.c | 42
-rw-r--r-- fs/btrfs/Makefile | 25
-rw-r--r-- fs/btrfs/acl.c | 351
-rw-r--r-- fs/btrfs/async-thread.c | 419
-rw-r--r-- fs/btrfs/async-thread.h | 101
-rw-r--r-- fs/btrfs/btrfs_inode.h | 131
-rw-r--r-- fs/btrfs/compat.h | 7
-rw-r--r-- fs/btrfs/compression.c | 709
-rw-r--r-- fs/btrfs/compression.h | 47
-rw-r--r-- fs/btrfs/crc32c.h | 29
-rw-r--r-- fs/btrfs/ctree.c | 3953
-rw-r--r-- fs/btrfs/ctree.h | 2129
-rw-r--r-- fs/btrfs/dir-item.c | 386
-rw-r--r-- fs/btrfs/disk-io.c | 2343
-rw-r--r-- fs/btrfs/disk-io.h | 102
-rw-r--r-- fs/btrfs/export.c | 203
-rw-r--r-- fs/btrfs/export.h | 19
-rw-r--r-- fs/btrfs/extent-tree.c | 5986
-rw-r--r-- fs/btrfs/extent_io.c | 3717
-rw-r--r-- fs/btrfs/extent_io.h | 269
-rw-r--r-- fs/btrfs/extent_map.c | 351
-rw-r--r-- fs/btrfs/extent_map.h | 62
-rw-r--r-- fs/btrfs/file-item.c | 831
-rw-r--r-- fs/btrfs/file.c | 1288
-rw-r--r-- fs/btrfs/free-space-cache.c | 495
-rw-r--r-- fs/btrfs/hash.h | 27
-rw-r--r-- fs/btrfs/inode-item.c | 206
-rw-r--r-- fs/btrfs/inode-map.c | 144
-rw-r--r-- fs/btrfs/inode.c | 5035
-rw-r--r-- fs/btrfs/ioctl.c | 1132
-rw-r--r-- fs/btrfs/ioctl.h | 67
-rw-r--r-- fs/btrfs/locking.c | 88
-rw-r--r-- fs/btrfs/locking.h | 27
-rw-r--r-- fs/btrfs/ordered-data.c | 730
-rw-r--r-- fs/btrfs/ordered-data.h | 158
-rw-r--r-- fs/btrfs/orphan.c | 67
-rw-r--r-- fs/btrfs/print-tree.c | 216
-rw-r--r-- fs/btrfs/print-tree.h | 23
-rw-r--r-- fs/btrfs/ref-cache.c | 230
-rw-r--r-- fs/btrfs/ref-cache.h | 77
-rw-r--r-- fs/btrfs/root-tree.c | 366
-rw-r--r-- fs/btrfs/struct-funcs.c | 139
-rw-r--r-- fs/btrfs/super.c | 722
-rw-r--r-- fs/btrfs/sysfs.c | 269
-rw-r--r-- fs/btrfs/transaction.c | 1097
-rw-r--r-- fs/btrfs/transaction.h | 106
-rw-r--r-- fs/btrfs/tree-defrag.c | 147
-rw-r--r-- fs/btrfs/tree-log.c | 2898
-rw-r--r-- fs/btrfs/tree-log.h | 41
-rw-r--r-- fs/btrfs/version.h | 4
-rw-r--r-- fs/btrfs/version.sh | 43
-rw-r--r-- fs/btrfs/volumes.c | 3218
-rw-r--r-- fs/btrfs/volumes.h | 162
-rw-r--r-- fs/btrfs/xattr.c | 322
-rw-r--r-- fs/btrfs/xattr.h | 39
-rw-r--r-- fs/btrfs/zlib.c | 632
-rw-r--r-- fs/buffer.c | 98
-rw-r--r-- fs/char_dev.c | 2
-rw-r--r-- fs/cifs/Makefile | 2
-rw-r--r-- fs/cifs/cifsfs.c | 7
-rw-r--r-- fs/cifs/cifsfs.h | 1
-rw-r--r-- fs/cifs/fcntl.c | 118
-rw-r--r-- fs/cifs/file.c | 2
-rw-r--r-- fs/cifs/inode.c | 2
-rw-r--r-- fs/coda/file.c | 12
-rw-r--r-- fs/coda/sysctl.c | 5
-rw-r--r-- fs/compat.c | 6
-rw-r--r-- fs/configfs/inode.c | 3
-rw-r--r-- fs/cramfs/inode.c | 2
-rw-r--r-- fs/dcache.c | 37
-rw-r--r-- fs/dcookies.c | 28
-rw-r--r-- fs/debugfs/file.c | 32
-rw-r--r-- fs/debugfs/inode.c | 3
-rw-r--r-- fs/devpts/inode.c | 472
-rw-r--r-- fs/direct-io.c | 13
-rw-r--r-- fs/dlm/ast.c | 56
-rw-r--r-- fs/dlm/ast.h | 4
-rw-r--r-- fs/dlm/debug_fs.c | 310
-rw-r--r-- fs/dlm/dir.c | 18
-rw-r--r-- fs/dlm/dlm_internal.h | 4
-rw-r--r-- fs/dlm/lock.c | 31
-rw-r--r-- fs/dlm/lowcomms.c | 8
-rw-r--r-- fs/dlm/memory.c | 6
-rw-r--r-- fs/dlm/midcomms.c | 2
-rw-r--r-- fs/dlm/netlink.c | 1
-rw-r--r-- fs/dlm/user.c | 4
-rw-r--r-- fs/dlm/user.h | 2
-rw-r--r-- fs/dquot.c | 438
-rw-r--r-- fs/ecryptfs/crypto.c | 514
-rw-r--r-- fs/ecryptfs/ecryptfs_kernel.h | 105
-rw-r--r-- fs/ecryptfs/file.c | 45
-rw-r--r-- fs/ecryptfs/inode.c | 303
-rw-r--r-- fs/ecryptfs/keystore.c | 651
-rw-r--r-- fs/ecryptfs/main.c | 126
-rw-r--r-- fs/ecryptfs/messaging.c | 4
-rw-r--r-- fs/ecryptfs/miscdev.c | 18
-rw-r--r-- fs/ecryptfs/mmap.c | 2
-rw-r--r-- fs/exec.c | 86
-rw-r--r-- fs/ext2/ialloc.c | 14
-rw-r--r-- fs/ext2/inode.c | 9
-rw-r--r-- fs/ext2/ioctl.c | 3
-rw-r--r-- fs/ext2/namei.c | 15
-rw-r--r-- fs/ext2/super.c | 10
-rw-r--r-- fs/ext3/hash.c | 77
-rw-r--r-- fs/ext3/ialloc.c | 14
-rw-r--r-- fs/ext3/inode.c | 9
-rw-r--r-- fs/ext3/ioctl.c | 3
-rw-r--r-- fs/ext3/namei.c | 33
-rw-r--r-- fs/ext3/super.c | 104
-rw-r--r-- fs/ext4/balloc.c | 293
-rw-r--r-- fs/ext4/bitmap.c | 5
-rw-r--r-- fs/ext4/dir.c | 10
-rw-r--r-- fs/ext4/ext4.h | 158
-rw-r--r-- fs/ext4/ext4_extents.h | 5
-rw-r--r-- fs/ext4/ext4_i.h | 16
-rw-r--r-- fs/ext4/ext4_jbd2.c | 83
-rw-r--r-- fs/ext4/ext4_jbd2.h | 87
-rw-r--r-- fs/ext4/ext4_sb.h | 12
-rw-r--r-- fs/ext4/extents.c | 62
-rw-r--r-- fs/ext4/file.c | 3
-rw-r--r-- fs/ext4/hash.c | 77
-rw-r--r-- fs/ext4/ialloc.c | 330
-rw-r--r-- fs/ext4/inode.c | 322
-rw-r--r-- fs/ext4/ioctl.c | 2
-rw-r--r-- fs/ext4/mballoc.c | 629
-rw-r--r-- fs/ext4/mballoc.h | 71
-rw-r--r-- fs/ext4/migrate.c | 19
-rw-r--r-- fs/ext4/namei.c | 113
-rw-r--r-- fs/ext4/resize.c | 113
-rw-r--r-- fs/ext4/super.c | 686
-rw-r--r-- fs/ext4/xattr.c | 25
-rw-r--r-- fs/fat/dir.c | 1
-rw-r--r-- fs/fat/inode.c | 2
-rw-r--r-- fs/fat/namei_vfat.c | 2
-rw-r--r-- fs/file_table.c | 10
-rw-r--r-- fs/filesystems.c | 23
-rw-r--r-- fs/freevxfs/vxfs_inode.c | 4
-rw-r--r-- fs/fs-writeback.c | 92
-rw-r--r-- fs/fuse/control.c | 6
-rw-r--r-- fs/fuse/dev.c | 113
-rw-r--r-- fs/fuse/dir.c | 48
-rw-r--r-- fs/fuse/file.c | 461
-rw-r--r-- fs/fuse/fuse_i.h | 83
-rw-r--r-- fs/fuse/inode.c | 157
-rw-r--r-- fs/gfs2/Kconfig | 2
-rw-r--r-- fs/gfs2/Makefile | 2
-rw-r--r-- fs/gfs2/acl.c | 2
-rw-r--r-- fs/gfs2/bmap.c | 77
-rw-r--r-- fs/gfs2/bmap.h | 34
-rw-r--r-- fs/gfs2/daemon.c | 136
-rw-r--r-- fs/gfs2/daemon.h | 17
-rw-r--r-- fs/gfs2/dir.c | 62
-rw-r--r-- fs/gfs2/dir.h | 1
-rw-r--r-- fs/gfs2/eattr.c | 40
-rw-r--r-- fs/gfs2/glock.c | 303
-rw-r--r-- fs/gfs2/glock.h | 2
-rw-r--r-- fs/gfs2/glops.c | 56
-rw-r--r-- fs/gfs2/incore.h | 55
-rw-r--r-- fs/gfs2/inode.c | 53
-rw-r--r-- fs/gfs2/inode.h | 13
-rw-r--r-- fs/gfs2/locking/dlm/mount.c | 12
-rw-r--r-- fs/gfs2/locking/dlm/sysfs.c | 16
-rw-r--r-- fs/gfs2/main.c | 15
-rw-r--r-- fs/gfs2/mount.c | 29
-rw-r--r-- fs/gfs2/ops_address.c | 35
-rw-r--r-- fs/gfs2/ops_dentry.c | 2
-rw-r--r-- fs/gfs2/ops_dentry.h | 17
-rw-r--r-- fs/gfs2/ops_export.c | 5
-rw-r--r-- fs/gfs2/ops_file.c | 24
-rw-r--r-- fs/gfs2/ops_fstype.c | 125
-rw-r--r-- fs/gfs2/ops_fstype.h | 19
-rw-r--r-- fs/gfs2/ops_inode.c | 75
-rw-r--r-- fs/gfs2/ops_inode.h | 25
-rw-r--r-- fs/gfs2/ops_super.c | 165
-rw-r--r-- fs/gfs2/ops_super.h | 17
-rw-r--r-- fs/gfs2/quota.c | 113
-rw-r--r-- fs/gfs2/quota.h | 24
-rw-r--r-- fs/gfs2/recovery.c | 48
-rw-r--r-- fs/gfs2/recovery.h | 14
-rw-r--r-- fs/gfs2/rgrp.c | 58
-rw-r--r-- fs/gfs2/super.c | 246
-rw-r--r-- fs/gfs2/super.h | 13
-rw-r--r-- fs/gfs2/sys.c | 66
-rw-r--r-- fs/gfs2/sys.h | 4
-rw-r--r-- fs/gfs2/util.c | 1
-rw-r--r-- fs/gfs2/util.h | 1
-rw-r--r-- fs/hostfs/hostfs_kern.c | 2
-rw-r--r-- fs/hugetlbfs/inode.c | 13
-rw-r--r-- fs/inode.c | 273
-rw-r--r-- fs/ioctl.c | 90
-rw-r--r-- fs/ioprio.c | 3
-rw-r--r-- fs/isofs/inode.c | 6
-rw-r--r-- fs/jbd/commit.c | 15
-rw-r--r-- fs/jbd/transaction.c | 39
-rw-r--r-- fs/jbd2/checkpoint.c | 24
-rw-r--r-- fs/jbd2/commit.c | 67
-rw-r--r-- fs/jbd2/journal.c | 143
-rw-r--r-- fs/jbd2/transaction.c | 107
-rw-r--r-- fs/jffs2/compr_rubin.c | 120
-rw-r--r-- fs/jffs2/erase.c | 5
-rw-r--r-- fs/jffs2/file.c | 2
-rw-r--r-- fs/jffs2/nodelist.h | 3
-rw-r--r-- fs/jfs/inode.c | 8
-rw-r--r-- fs/jfs/jfs_imap.c | 10
-rw-r--r-- fs/jfs/jfs_inode.c | 29
-rw-r--r-- fs/jfs/namei.c | 24
-rw-r--r-- fs/jfs/super.c | 10
-rw-r--r-- fs/libfs.c | 7
-rw-r--r-- fs/lockd/clntlock.c | 23
-rw-r--r-- fs/lockd/clntproc.c | 7
-rw-r--r-- fs/lockd/host.c | 180
-rw-r--r-- fs/lockd/mon.c | 569
-rw-r--r-- fs/lockd/svc.c | 78
-rw-r--r-- fs/lockd/svc4proc.c | 13
-rw-r--r-- fs/lockd/svcproc.c | 13
-rw-r--r-- fs/lockd/svcsubs.c | 1
-rw-r--r-- fs/lockd/xdr.c | 5
-rw-r--r-- fs/lockd/xdr4.c | 5
-rw-r--r-- fs/minix/dir.c | 2
-rw-r--r-- fs/mpage.c | 6
-rw-r--r-- fs/namei.c | 160
-rw-r--r-- fs/namespace.c | 2
-rw-r--r-- fs/ncpfs/getopt.c | 1
-rw-r--r-- fs/ncpfs/ioctl.c | 2
-rw-r--r-- fs/nfs/callback.c | 36
-rw-r--r-- fs/nfs/client.c | 95
-rw-r--r-- fs/nfs/delegation.c | 260
-rw-r--r-- fs/nfs/delegation.h | 33
-rw-r--r-- fs/nfs/dir.c | 24
-rw-r--r-- fs/nfs/file.c | 2
-rw-r--r-- fs/nfs/inode.c | 13
-rw-r--r-- fs/nfs/internal.h | 14
-rw-r--r-- fs/nfs/mount_clnt.c | 34
-rw-r--r-- fs/nfs/nfs4_fs.h | 32
-rw-r--r-- fs/nfs/nfs4proc.c | 431
-rw-r--r-- fs/nfs/nfs4renewd.c | 22
-rw-r--r-- fs/nfs/nfs4state.c | 415
-rw-r--r-- fs/nfs/nfs4xdr.c | 1235
-rw-r--r-- fs/nfs/nfsroot.c | 27
-rw-r--r-- fs/nfs/read.c | 6
-rw-r--r-- fs/nfs/super.c | 44
-rw-r--r-- fs/nfs_common/nfsacl.c | 4
-rw-r--r-- fs/nfsctl.c | 5
-rw-r--r-- fs/nfsd/auth.c | 4
-rw-r--r-- fs/nfsd/nfs4callback.c | 12
-rw-r--r-- fs/nfsd/nfs4proc.c | 5
-rw-r--r-- fs/nfsd/nfs4recover.c | 2
-rw-r--r-- fs/nfsd/nfs4state.c | 91
-rw-r--r-- fs/nfsd/nfs4xdr.c | 2
-rw-r--r-- fs/nfsd/nfsctl.c | 479
-rw-r--r-- fs/nfsd/nfsfh.c | 36
-rw-r--r-- fs/nfsd/nfsproc.c | 1
-rw-r--r-- fs/nfsd/vfs.c | 9
-rw-r--r-- fs/notify/Kconfig | 2
-rw-r--r-- fs/notify/Makefile | 2
-rw-r--r-- fs/notify/dnotify/Kconfig | 10
-rw-r--r-- fs/notify/dnotify/Makefile | 1
-rw-r--r-- fs/notify/dnotify/dnotify.c (renamed from fs/dnotify.c) | 3
-rw-r--r-- fs/notify/inotify/Kconfig | 27
-rw-r--r-- fs/notify/inotify/Makefile | 2
-rw-r--r-- fs/notify/inotify/inotify.c (renamed from fs/inotify.c) | 0
-rw-r--r-- fs/notify/inotify/inotify_user.c (renamed from fs/inotify_user.c) | 4
-rw-r--r-- fs/ntfs/inode.c | 3
-rw-r--r-- fs/ocfs2/Makefile | 7
-rw-r--r-- fs/ocfs2/acl.c | 479
-rw-r--r-- fs/ocfs2/acl.h | 58
-rw-r--r-- fs/ocfs2/alloc.c | 712
-rw-r--r-- fs/ocfs2/alloc.h | 30
-rw-r--r-- fs/ocfs2/aops.c | 59
-rw-r--r-- fs/ocfs2/blockcheck.c | 477
-rw-r--r-- fs/ocfs2/blockcheck.h | 82
-rw-r--r-- fs/ocfs2/buffer_head_io.c | 32
-rw-r--r-- fs/ocfs2/buffer_head_io.h | 27
-rw-r--r-- fs/ocfs2/cluster/heartbeat.c | 2
-rw-r--r-- fs/ocfs2/cluster/masklog.c | 1
-rw-r--r-- fs/ocfs2/cluster/masklog.h | 1
-rw-r--r-- fs/ocfs2/dir.c | 399
-rw-r--r-- fs/ocfs2/dir.h | 2
-rw-r--r-- fs/ocfs2/dlm/dlmast.c | 52
-rw-r--r-- fs/ocfs2/dlm/dlmcommon.h | 3
-rw-r--r-- fs/ocfs2/dlm/dlmdebug.c | 53
-rw-r--r-- fs/ocfs2/dlm/dlmdomain.c | 1
-rw-r--r-- fs/ocfs2/dlm/dlmfs.c | 2
-rw-r--r-- fs/ocfs2/dlm/dlmmaster.c | 42
-rw-r--r-- fs/ocfs2/dlm/dlmthread.c | 3
-rw-r--r-- fs/ocfs2/dlmglue.c | 172
-rw-r--r-- fs/ocfs2/dlmglue.h | 19
-rw-r--r-- fs/ocfs2/extent_map.c | 96
-rw-r--r-- fs/ocfs2/extent_map.h | 24
-rw-r--r-- fs/ocfs2/file.c | 211
-rw-r--r-- fs/ocfs2/file.h | 3
-rw-r--r-- fs/ocfs2/inode.c | 175
-rw-r--r-- fs/ocfs2/inode.h | 18
-rw-r--r-- fs/ocfs2/journal.c | 364
-rw-r--r-- fs/ocfs2/journal.h | 128
-rw-r--r-- fs/ocfs2/localalloc.c | 26
-rw-r--r-- fs/ocfs2/namei.c | 318
-rw-r--r-- fs/ocfs2/ocfs2.h | 46
-rw-r--r-- fs/ocfs2/ocfs2_fs.h | 213
-rw-r--r-- fs/ocfs2/ocfs2_jbd_compat.h | 82
-rw-r--r-- fs/ocfs2/ocfs2_lockid.h | 5
-rw-r--r-- fs/ocfs2/quota.h | 119
-rw-r--r-- fs/ocfs2/quota_global.c | 1025
-rw-r--r-- fs/ocfs2/quota_local.c | 1253
-rw-r--r-- fs/ocfs2/resize.c | 76
-rw-r--r-- fs/ocfs2/slot_map.c | 4
-rw-r--r-- fs/ocfs2/suballoc.c | 363
-rw-r--r-- fs/ocfs2/suballoc.h | 18
-rw-r--r-- fs/ocfs2/super.c | 328
-rw-r--r-- fs/ocfs2/symlink.c | 2
-rw-r--r-- fs/ocfs2/xattr.c | 2984
-rw-r--r-- fs/ocfs2/xattr.h | 45
-rw-r--r-- fs/omfs/inode.c | 1
-rw-r--r-- fs/open.c | 7
-rw-r--r-- fs/openpromfs/inode.c | 3
-rw-r--r-- fs/partitions/check.c | 12
-rw-r--r-- fs/pipe.c | 7
-rw-r--r-- fs/proc/base.c | 235
-rw-r--r-- fs/proc/generic.c | 8
-rw-r--r-- fs/proc/inode.c | 3
-rw-r--r-- fs/proc/internal.h | 2
-rw-r--r-- fs/proc/meminfo.c | 6
-rw-r--r-- fs/proc/nommu.c | 71
-rw-r--r-- fs/proc/proc_net.c | 2
-rw-r--r-- fs/proc/proc_sysctl.c | 1
-rw-r--r-- fs/proc/root.c | 8
-rw-r--r-- fs/proc/stat.c | 7
-rw-r--r-- fs/proc/task_mmu.c | 8
-rw-r--r-- fs/proc/task_nommu.c | 122
-rw-r--r-- fs/proc/vmcore.c | 2
-rw-r--r-- fs/quota.c | 11
-rw-r--r-- fs/quota_tree.c | 645
-rw-r--r-- fs/quota_tree.h | 25
-rw-r--r-- fs/quota_v1.c | 28
-rw-r--r-- fs/quota_v2.c | 631
-rw-r--r-- fs/quotaio_v1.h | 33
-rw-r--r-- fs/quotaio_v2.h | 60
-rw-r--r-- fs/ramfs/file-nommu.c | 21
-rw-r--r-- fs/ramfs/inode.c | 1
-rw-r--r-- fs/read_write.c | 13
-rw-r--r-- fs/reiserfs/inode.c | 30
-rw-r--r-- fs/reiserfs/namei.c | 8
-rw-r--r-- fs/reiserfs/super.c | 20
-rw-r--r-- fs/romfs/inode.c | 13
-rw-r--r-- fs/select.c | 76
-rw-r--r-- fs/seq_file.c | 13
-rw-r--r-- fs/smbfs/file.c | 2
-rw-r--r-- fs/splice.c | 1
-rw-r--r-- fs/squashfs/Makefile | 8
-rw-r--r-- fs/squashfs/block.c | 274
-rw-r--r-- fs/squashfs/cache.c | 412
-rw-r--r-- fs/squashfs/dir.c | 235
-rw-r--r-- fs/squashfs/export.c | 155
-rw-r--r-- fs/squashfs/file.c | 502
-rw-r--r-- fs/squashfs/fragment.c | 98
-rw-r--r-- fs/squashfs/id.c | 94
-rw-r--r-- fs/squashfs/inode.c | 346
-rw-r--r-- fs/squashfs/namei.c | 242
-rw-r--r-- fs/squashfs/squashfs.h | 90
-rw-r--r-- fs/squashfs/squashfs_fs.h | 381
-rw-r--r-- fs/squashfs/squashfs_fs_i.h | 45
-rw-r--r-- fs/squashfs/squashfs_fs_sb.h | 76
-rw-r--r-- fs/squashfs/super.c | 440
-rw-r--r-- fs/squashfs/symlink.c | 118
-rw-r--r-- fs/stat.c | 2
-rw-r--r-- fs/super.c | 12
-rw-r--r-- fs/sync.c | 50
-rw-r--r-- fs/sysfs/inode.c | 3
-rw-r--r-- fs/sysv/inode.c | 6
-rw-r--r-- fs/ubifs/Kconfig | 2
-rw-r--r-- fs/ubifs/budget.c | 212
-rw-r--r-- fs/ubifs/commit.c | 25
-rw-r--r-- fs/ubifs/compress.c | 18
-rw-r--r-- fs/ubifs/debug.c | 265
-rw-r--r-- fs/ubifs/debug.h | 117
-rw-r--r-- fs/ubifs/file.c | 17
-rw-r--r-- fs/ubifs/gc.c | 2
-rw-r--r-- fs/ubifs/ioctl.c | 2
-rw-r--r-- fs/ubifs/journal.c | 6
-rw-r--r-- fs/ubifs/key.h | 32
-rw-r--r-- fs/ubifs/lprops.c | 14
-rw-r--r-- fs/ubifs/lpt.c | 45
-rw-r--r-- fs/ubifs/lpt_commit.c | 210
-rw-r--r-- fs/ubifs/orphan.c | 2
-rw-r--r-- fs/ubifs/replay.c | 15
-rw-r--r-- fs/ubifs/sb.c | 20
-rw-r--r-- fs/ubifs/shrinker.c | 2
-rw-r--r-- fs/ubifs/super.c | 260
-rw-r--r-- fs/ubifs/tnc.c | 31
-rw-r--r-- fs/ubifs/tnc_commit.c | 9
-rw-r--r-- fs/ubifs/ubifs-media.h | 7
-rw-r--r-- fs/ubifs/ubifs.h | 111
-rw-r--r-- fs/xattr.c | 2
-rw-r--r-- fs/xfs/Makefile | 6
-rw-r--r-- fs/xfs/linux-2.6/sv.h | 22
-rw-r--r-- fs/xfs/linux-2.6/xfs_aops.c | 66
-rw-r--r-- fs/xfs/linux-2.6/xfs_aops.h | 3
-rw-r--r-- fs/xfs/linux-2.6/xfs_buf.c | 87
-rw-r--r-- fs/xfs/linux-2.6/xfs_buf.h | 30
-rw-r--r-- fs/xfs/linux-2.6/xfs_cred.h | 8
-rw-r--r-- fs/xfs/linux-2.6/xfs_export.c | 1
-rw-r--r-- fs/xfs/linux-2.6/xfs_file.c | 189
-rw-r--r-- fs/xfs/linux-2.6/xfs_fs_subr.c | 23
-rw-r--r-- fs/xfs/linux-2.6/xfs_globals.c | 8
-rw-r--r-- fs/xfs/linux-2.6/xfs_globals.h | 1
-rw-r--r-- fs/xfs/linux-2.6/xfs_ioctl.c | 238
-rw-r--r-- fs/xfs/linux-2.6/xfs_ioctl.h | 82
-rw-r--r-- fs/xfs/linux-2.6/xfs_ioctl32.c | 851
-rw-r--r-- fs/xfs/linux-2.6/xfs_ioctl32.h | 214
-rw-r--r-- fs/xfs/linux-2.6/xfs_iops.c | 122
-rw-r--r-- fs/xfs/linux-2.6/xfs_iops.h | 1
-rw-r--r-- fs/xfs/linux-2.6/xfs_linux.h | 13
-rw-r--r-- fs/xfs/linux-2.6/xfs_lrw.c | 50
-rw-r--r-- fs/xfs/linux-2.6/xfs_stats.c | 6
-rw-r--r-- fs/xfs/linux-2.6/xfs_stats.h | 65
-rw-r--r-- fs/xfs/linux-2.6/xfs_super.c | 894
-rw-r--r-- fs/xfs/linux-2.6/xfs_super.h | 15
-rw-r--r-- fs/xfs/linux-2.6/xfs_sync.c | 762
-rw-r--r-- fs/xfs/linux-2.6/xfs_sync.h | 55
-rw-r--r-- fs/xfs/linux-2.6/xfs_sysctl.c | 11
-rw-r--r-- fs/xfs/linux-2.6/xfs_sysctl.h | 3
-rw-r--r-- fs/xfs/linux-2.6/xfs_vfs.h | 77
-rw-r--r-- fs/xfs/linux-2.6/xfs_vnode.c | 145
-rw-r--r-- fs/xfs/linux-2.6/xfs_vnode.h | 72
-rw-r--r-- fs/xfs/quota/xfs_dquot.c | 39
-rw-r--r-- fs/xfs/quota/xfs_dquot.h | 4
-rw-r--r-- fs/xfs/quota/xfs_dquot_item.c | 45
-rw-r--r-- fs/xfs/quota/xfs_qm.c | 57
-rw-r--r-- fs/xfs/quota/xfs_qm.h | 3
-rw-r--r-- fs/xfs/quota/xfs_qm_bhv.c | 5
-rw-r--r-- fs/xfs/quota/xfs_qm_syscalls.c | 151
-rw-r--r-- fs/xfs/support/debug.c | 39
-rw-r--r-- fs/xfs/support/debug.h | 2
-rw-r--r-- fs/xfs/support/ktrace.c | 9
-rw-r--r-- fs/xfs/xfs.h | 2
-rw-r--r-- fs/xfs/xfs_acl.c | 2
-rw-r--r-- fs/xfs/xfs_ag.h | 15
-rw-r--r-- fs/xfs/xfs_alloc.c | 264
-rw-r--r-- fs/xfs/xfs_alloc.h | 27
-rw-r--r-- fs/xfs/xfs_alloc_btree.c | 2387
-rw-r--r-- fs/xfs/xfs_alloc_btree.h | 107
-rw-r--r-- fs/xfs/xfs_arch.h | 39
-rw-r--r-- fs/xfs/xfs_bit.h | 3
-rw-r--r-- fs/xfs/xfs_bmap.c | 410
-rw-r--r-- fs/xfs/xfs_bmap.h | 72
-rw-r--r-- fs/xfs/xfs_bmap_btree.c | 2617
-rw-r--r-- fs/xfs/xfs_bmap_btree.h | 171
-rw-r--r-- fs/xfs/xfs_btree.c | 3596
-rw-r--r-- fs/xfs/xfs_btree.h | 392
-rw-r--r-- fs/xfs/xfs_btree_trace.c | 249
-rw-r--r-- fs/xfs/xfs_btree_trace.h | 116
-rw-r--r-- fs/xfs/xfs_buf_item.c | 45
-rw-r--r-- fs/xfs/xfs_clnt.h | 105
-rw-r--r-- fs/xfs/xfs_da_btree.h | 24
-rw-r--r-- fs/xfs/xfs_dfrag.c | 8
-rw-r--r-- fs/xfs/xfs_dfrag.h | 2
-rw-r--r-- fs/xfs/xfs_dinode.h | 148
-rw-r--r-- fs/xfs/xfs_dir2_sf.h | 7
-rw-r--r-- fs/xfs/xfs_dmops.c | 5
-rw-r--r-- fs/xfs/xfs_error.c | 15
-rw-r--r-- fs/xfs/xfs_error.h | 12
-rw-r--r-- fs/xfs/xfs_extfree_item.c | 45
-rw-r--r-- fs/xfs/xfs_fs.h | 26
-rw-r--r-- fs/xfs/xfs_fsops.c | 41
-rw-r--r-- fs/xfs/xfs_fsops.h | 2
-rw-r--r-- fs/xfs/xfs_ialloc.c | 449
-rw-r--r-- fs/xfs/xfs_ialloc.h | 31
-rw-r--r-- fs/xfs/xfs_ialloc_btree.c | 2193
-rw-r--r-- fs/xfs/xfs_ialloc_btree.h | 111
-rw-r--r-- fs/xfs/xfs_iget.c | 735
-rw-r--r-- fs/xfs/xfs_imap.h | 40
-rw-r--r-- fs/xfs/xfs_inode.c | 587
-rw-r--r-- fs/xfs/xfs_inode.h | 375
-rw-r--r-- fs/xfs/xfs_inode_item.c | 45
-rw-r--r-- fs/xfs/xfs_inode_item.h | 41
-rw-r--r-- fs/xfs/xfs_iomap.c | 28
-rw-r--r-- fs/xfs/xfs_itable.c | 102
-rw-r--r-- fs/xfs/xfs_itable.h | 14
-rw-r--r-- fs/xfs/xfs_log.c | 81
-rw-r--r-- fs/xfs/xfs_log.h | 4
-rw-r--r-- fs/xfs/xfs_log_priv.h | 48
-rw-r--r-- fs/xfs/xfs_log_recover.c | 416
-rw-r--r-- fs/xfs/xfs_mount.c | 81
-rw-r--r-- fs/xfs/xfs_mount.h | 73
-rw-r--r-- fs/xfs/xfs_qmops.c | 5
-rw-r--r-- fs/xfs/xfs_quota.h | 8
-rw-r--r-- fs/xfs/xfs_rename.c | 61
-rw-r--r-- fs/xfs/xfs_rtalloc.c | 41
-rw-r--r-- fs/xfs/xfs_rw.c | 2
-rw-r--r-- fs/xfs/xfs_sb.h | 167
-rw-r--r-- fs/xfs/xfs_trans.c | 22
-rw-r--r-- fs/xfs/xfs_trans.h | 322
-rw-r--r-- fs/xfs/xfs_trans_ail.c | 362
-rw-r--r-- fs/xfs/xfs_trans_buf.c | 7
-rw-r--r-- fs/xfs/xfs_trans_inode.c | 30
-rw-r--r-- fs/xfs/xfs_trans_item.c | 10
-rw-r--r-- fs/xfs/xfs_trans_priv.h | 98
-rw-r--r-- fs/xfs/xfs_utils.c | 12
-rw-r--r-- fs/xfs/xfs_vfsops.c | 757
-rw-r--r-- fs/xfs/xfs_vfsops.h | 16
-rw-r--r-- fs/xfs/xfs_vnodeops.c | 354
-rw-r--r-- fs/xfs/xfs_vnodeops.h | 10
526 files changed, 77358 insertions, 22493 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 522469a7eca3..51307b0fdf0f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -189,6 +189,8 @@ config OCFS2_FS
 	select CONFIGFS_FS
 	select JBD2
 	select CRC32
+	select QUOTA
+	select QUOTA_TREE
 	help
 	  OCFS2 is a general purpose extent based shared disk cluster file
 	  system with many similarities to ext3. It supports 64 bit inode
@@ -258,56 +260,37 @@ config OCFS2_DEBUG_FS
 	  this option for debugging only as it is likely to decrease
 	  performance of the filesystem.
 
-config OCFS2_COMPAT_JBD
-	bool "Use JBD for compatibility"
+config OCFS2_FS_POSIX_ACL
+	bool "OCFS2 POSIX Access Control Lists"
 	depends on OCFS2_FS
+	select FS_POSIX_ACL
 	default n
-	select JBD
 	help
-	  The ocfs2 filesystem now uses JBD2 for its journalling. JBD2
-	  is backwards compatible with JBD. It is safe to say N here.
-	  However, if you really want to use the original JBD, say Y here.
-
-endif # BLOCK
+	  Posix Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
 
-config DNOTIFY
-	bool "Dnotify support"
-	default y
+config BTRFS_FS
+	tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
+	depends on EXPERIMENTAL
+	select LIBCRC32C
+	select ZLIB_INFLATE
+	select ZLIB_DEFLATE
 	help
-	  Dnotify is a directory-based per-fd file change notification system
-	  that uses signals to communicate events to user-space. There exist
-	  superior alternatives, but some applications may still rely on
-	  dnotify.
-
-	  If unsure, say Y.
-
-config INOTIFY
-	bool "Inotify file change notification support"
-	default y
-	---help---
-	  Say Y here to enable inotify support. Inotify is a file change
-	  notification system and a replacement for dnotify. Inotify fixes
-	  numerous shortcomings in dnotify and introduces several new features
-	  including multiple file events, one-shot support, and unmount
-	  notification.
+	  Btrfs is a new filesystem with extents, writable snapshotting,
+	  support for multiple devices and many more features.
 
-	  For more information, see <file:Documentation/filesystems/inotify.txt>
+	  Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET
+	  FINALIZED. You should say N here unless you are interested in
+	  testing Btrfs with non-critical data.
 
-	  If unsure, say Y.
+	  To compile this file system support as a module, choose M here. The
+	  module will be called btrfs.
 
-config INOTIFY_USER
-	bool "Inotify support for userspace"
-	depends on INOTIFY
-	default y
-	---help---
-	  Say Y here to enable inotify support for userspace, including the
-	  associated system calls. Inotify allows monitoring of both files and
-	  directories via a single open fd. Events are read from the file
-	  descriptor, which is also select()- and poll()-able.
+	  If unsure, say N.
 
-	  For more information, see <file:Documentation/filesystems/inotify.txt>
+endif # BLOCK
 
-	  If unsure, say Y.
+source "fs/notify/Kconfig"
 
 config QUOTA
 	bool "Quota support"
@@ -340,6 +323,10 @@ config PRINT_QUOTA_WARNING
 	  Note that this behavior is currently deprecated and may go away in
 	  future. Please use notification via netlink socket instead.
 
+# Generic support for tree structured quota files. Seleted when needed.
+config QUOTA_TREE
+	tristate
+
 config QFMT_V1
 	tristate "Old quota format support"
 	depends on QUOTA
@@ -351,6 +338,7 @@ config QFMT_V1
 config QFMT_V2
 	tristate "Quota format v2 support"
 	depends on QUOTA
+	select QUOTA_TREE
 	help
 	  This quota format allows using quotas with 32-bit UIDs/GIDs. If you
 	  need this functionality say Y here.
@@ -752,7 +740,20 @@ config CONFIGFS_FS
 
 endmenu
 
-menu "Miscellaneous filesystems"
+menuconfig MISC_FILESYSTEMS
+	bool "Miscellaneous filesystems"
+	default y
+	---help---
+	  Say Y here to get to see options for various miscellaneous
+	  filesystems, such as filesystems that came from other
+	  operating systems.
+
+	  This option alone does not add any kernel code.
+
+	  If you say N, all options in this submenu will be skipped and
+	  disabled; if unsure, say Y here.
+
+if MISC_FILESYSTEMS
 
 config ADFS_FS
 	tristate "ADFS file system support (EXPERIMENTAL)"
@@ -931,6 +932,58 @@ config CRAMFS
 
 	  If unsure, say N.
 
+config SQUASHFS
+	tristate "SquashFS 4.0 - Squashed file system support"
+	depends on BLOCK
+	select ZLIB_INFLATE
+	help
+	  Saying Y here includes support for SquashFS 4.0 (a Compressed
+	  Read-Only File System). Squashfs is a highly compressed read-only
+	  filesystem for Linux. It uses zlib compression to compress both
+	  files, inodes and directories. Inodes in the system are very small
+	  and all blocks are packed to minimise data overhead. Block sizes
+	  greater than 4K are supported up to a maximum of 1 Mbytes (default
+	  block size 128K). SquashFS 4.0 supports 64 bit filesystems and files
+	  (larger than 4GB), full uid/gid information, hard links and
+	  timestamps.
+
+	  Squashfs is intended for general read-only filesystem use, for
+	  archival use (i.e. in cases where a .tar.gz file may be used), and in
+	  embedded systems where low overhead is needed. Further information
+	  and tools are available from http://squashfs.sourceforge.net.
+
+	  If you want to compile this as a module ( = code which can be
+	  inserted in and removed from the running kernel whenever you want),
+	  say M here and read <file:Documentation/modules.txt>. The module
+	  will be called squashfs. Note that the root file system (the one
+	  containing the directory /) cannot be compiled as a module.
+
+	  If unsure, say N.
+
+config SQUASHFS_EMBEDDED
+
+	bool "Additional option for memory-constrained systems"
+	depends on SQUASHFS
+	default n
+	help
+	  Saying Y here allows you to specify cache size.
+
+	  If unsure, say N.
+
+config SQUASHFS_FRAGMENT_CACHE_SIZE
+	int "Number of fragments cached" if SQUASHFS_EMBEDDED
+	depends on SQUASHFS
+	default "3"
+	help
+	  By default SquashFS caches the last 3 fragments read from
+	  the filesystem. Increasing this amount may mean SquashFS
+	  has to re-read fragments less often from disk, at the expense
+	  of extra system memory. Decreasing this amount will mean
+	  SquashFS uses less memory at the expense of extra reads from disk.
+
+	  Note there must be at least one cached fragment. Anything
+	  much more than three will probably not make much difference.
+
 config VXFS_FS
 	tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
 	depends on BLOCK
@@ -1122,7 +1175,7 @@ config UFS_DEBUG
 	  Y here. This will result in _many_ additional debugging messages to be
 	  written to the system log.
 
-endmenu
+endif # MISC_FILESYSTEMS
 
 menuconfig NETWORK_FILESYSTEMS
 	bool "Network File Systems"
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index ce9fb3fbfae4..bb4cc5b8abc8 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -43,7 +43,7 @@ config BINFMT_ELF_FDPIC
 config CORE_DUMP_DEFAULT_ELF_HEADERS
 	bool "Write ELF core dumps with partial segments"
 	default n
-	depends on BINFMT_ELF
+	depends on BINFMT_ELF && ELF_CORE
 	help
 	  ELF core dump files describe each memory mapping of the crashed
 	  process, and can contain or omit the memory contents of each one.
diff --git a/fs/Makefile b/fs/Makefile
index d9f8afe6f0c4..38bc735c67ad 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -20,8 +20,7 @@ obj-y += no-block.o
 endif
 
 obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
-obj-$(CONFIG_INOTIFY)		+= inotify.o
-obj-$(CONFIG_INOTIFY_USER)	+= inotify_user.o
+obj-y				+= notify/
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
 obj-$(CONFIG_ANON_INODES)	+= anon_inodes.o
 obj-$(CONFIG_SIGNALFD)		+= signalfd.o
@@ -55,10 +54,9 @@ obj-$(CONFIG_GENERIC_ACL) += generic_acl.o
 obj-$(CONFIG_QUOTA)		+= dquot.o
 obj-$(CONFIG_QFMT_V1)		+= quota_v1.o
 obj-$(CONFIG_QFMT_V2)		+= quota_v2.o
+obj-$(CONFIG_QUOTA_TREE)	+= quota_tree.o
 obj-$(CONFIG_QUOTACTL)		+= quota.o
 
-obj-$(CONFIG_DNOTIFY)		+= dnotify.o
-
 obj-$(CONFIG_PROC_FS)		+= proc/
 obj-y				+= partitions/
 obj-$(CONFIG_SYSFS)		+= sysfs/
@@ -76,6 +74,7 @@ obj-$(CONFIG_JBD) += jbd/
 obj-$(CONFIG_JBD2)		+= jbd2/
 obj-$(CONFIG_EXT2_FS)		+= ext2/
 obj-$(CONFIG_CRAMFS)		+= cramfs/
+obj-$(CONFIG_SQUASHFS)		+= squashfs/
 obj-y				+= ramfs/
 obj-$(CONFIG_HUGETLBFS)		+= hugetlbfs/
 obj-$(CONFIG_CODA_FS)		+= coda/
@@ -121,4 +120,5 @@ obj-$(CONFIG_HOSTFS) += hostfs/
 obj-$(CONFIG_HPPFS)		+= hppfs/
 obj-$(CONFIG_DEBUG_FS)		+= debugfs/
 obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
+obj-$(CONFIG_BTRFS_FS)		+= btrfs/
 obj-$(CONFIG_GFS2_FS)		+= gfs2/
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 1377b1240b6e..9246cb4aa018 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -628,7 +628,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
 	}
 
 	index = pos >> PAGE_CACHE_SHIFT;
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
 	*pagep = page;
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 415d9c67ac16..3c4ec7d864c4 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -119,8 +119,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
 		goto bad_inode;
 #else
 		inode->i_mode |= S_IFDIR;
-		inode->i_op = NULL;
-		inode->i_fop = NULL;
+		/* ... and leave ->i_op and ->i_fop pointing to empty */
 		break;
 #endif
 	case ST_LINKFILE:
diff --git a/fs/afs/write.c b/fs/afs/write.c
index d6b85dab35fc..3fb36d433621 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -144,7 +144,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
 	candidate->state = AFS_WBACK_PENDING;
 	init_waitqueue_head(&candidate->waitq);
 
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page) {
 		kfree(candidate);
 		return -ENOMEM;
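
Both affs and afs hunks above pick up the same VFS API change: __grab_cache_page() is replaced by grab_cache_page_write_begin(), which additionally takes the ->write_begin() flags so the page allocation can honour flags such as AOP_FLAG_NOFS. A minimal sketch of the calling pattern, using a hypothetical myfs_write_begin() with error handling trimmed:

static int myfs_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned flags,
			    struct page **pagep, void **fsdata)
{
	pgoff_t index = pos >> PAGE_CACHE_SHIFT;	/* page covering pos */
	struct page *page;

	/* forward the AOP flags so NOFS callers avoid fs recursion */
	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;
	*pagep = page;
	return 0;
}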
diff --git a/fs/aio.c b/fs/aio.c
index f658441d5666..d6f89d3c15e8 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -191,6 +191,20 @@ static int aio_setup_ring(struct kioctx *ctx)
 	kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \
 } while(0)
 
+static void ctx_rcu_free(struct rcu_head *head)
+{
+	struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
+	unsigned nr_events = ctx->max_reqs;
+
+	kmem_cache_free(kioctx_cachep, ctx);
+
+	if (nr_events) {
+		spin_lock(&aio_nr_lock);
+		BUG_ON(aio_nr - nr_events > aio_nr);
+		aio_nr -= nr_events;
+		spin_unlock(&aio_nr_lock);
+	}
+}
 
 /* __put_ioctx
  *	Called when the last user of an aio context has gone away,
@@ -198,8 +212,6 @@ static int aio_setup_ring(struct kioctx *ctx)
  */
 static void __put_ioctx(struct kioctx *ctx)
 {
-	unsigned nr_events = ctx->max_reqs;
-
 	BUG_ON(ctx->reqs_active);
 
 	cancel_delayed_work(&ctx->wq);
@@ -208,14 +220,7 @@ static void __put_ioctx(struct kioctx *ctx)
 	mmdrop(ctx->mm);
 	ctx->mm = NULL;
 	pr_debug("__put_ioctx: freeing %p\n", ctx);
-	kmem_cache_free(kioctx_cachep, ctx);
-
-	if (nr_events) {
-		spin_lock(&aio_nr_lock);
-		BUG_ON(aio_nr - nr_events > aio_nr);
-		aio_nr -= nr_events;
-		spin_unlock(&aio_nr_lock);
-	}
+	call_rcu(&ctx->rcu_head, ctx_rcu_free);
 }
 
 #define get_ioctx(kioctx) do { \
@@ -235,6 +240,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 {
 	struct mm_struct *mm;
 	struct kioctx *ctx;
+	int did_sync = 0;
 
 	/* Prevent overflows */
 	if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||
@@ -267,21 +273,30 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 		goto out_freectx;
 
 	/* limit the number of system wide aios */
-	spin_lock(&aio_nr_lock);
-	if (aio_nr + ctx->max_reqs > aio_max_nr ||
-	    aio_nr + ctx->max_reqs < aio_nr)
-		ctx->max_reqs = 0;
-	else
-		aio_nr += ctx->max_reqs;
-	spin_unlock(&aio_nr_lock);
+	do {
+		spin_lock_bh(&aio_nr_lock);
+		if (aio_nr + nr_events > aio_max_nr ||
+		    aio_nr + nr_events < aio_nr)
+			ctx->max_reqs = 0;
+		else
+			aio_nr += ctx->max_reqs;
+		spin_unlock_bh(&aio_nr_lock);
+		if (ctx->max_reqs || did_sync)
+			break;
+
+		/* wait for rcu callbacks to have completed before giving up */
+		synchronize_rcu();
+		did_sync = 1;
+		ctx->max_reqs = nr_events;
+	} while (1);
+
 	if (ctx->max_reqs == 0)
 		goto out_cleanup;
 
 	/* now link into global list. */
-	write_lock(&mm->ioctx_list_lock);
-	ctx->next = mm->ioctx_list;
-	mm->ioctx_list = ctx;
-	write_unlock(&mm->ioctx_list_lock);
+	spin_lock(&mm->ioctx_lock);
+	hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
+	spin_unlock(&mm->ioctx_lock);
 
 	dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
 		ctx, ctx->user_id, current->mm, ctx->ring_info.nr);
@@ -375,11 +390,12 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
  */
 void exit_aio(struct mm_struct *mm)
 {
-	struct kioctx *ctx = mm->ioctx_list;
-	mm->ioctx_list = NULL;
-	while (ctx) {
-		struct kioctx *next = ctx->next;
-		ctx->next = NULL;
+	struct kioctx *ctx;
+
+	while (!hlist_empty(&mm->ioctx_list)) {
+		ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list);
+		hlist_del_rcu(&ctx->list);
+
 		aio_cancel_all(ctx);
 
 		wait_for_all_aios(ctx);
@@ -394,7 +410,6 @@ void exit_aio(struct mm_struct *mm)
 			atomic_read(&ctx->users), ctx->dead,
 			ctx->reqs_active);
 		put_ioctx(ctx);
-		ctx = next;
 	}
 }
 
@@ -555,19 +570,21 @@ int aio_put_req(struct kiocb *req)
 
 static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 {
-	struct kioctx *ioctx;
-	struct mm_struct *mm;
+	struct mm_struct *mm = current->mm;
+	struct kioctx *ctx = NULL;
+	struct hlist_node *n;
 
-	mm = current->mm;
-	read_lock(&mm->ioctx_list_lock);
-	for (ioctx = mm->ioctx_list; ioctx; ioctx = ioctx->next)
-		if (likely(ioctx->user_id == ctx_id && !ioctx->dead)) {
-			get_ioctx(ioctx);
+	rcu_read_lock();
+
+	hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) {
+		if (ctx->user_id == ctx_id && !ctx->dead) {
+			get_ioctx(ctx);
 			break;
 		}
-	read_unlock(&mm->ioctx_list_lock);
+	}
 
-	return ioctx;
+	rcu_read_unlock();
+	return ctx;
 }
 
 /*
@@ -1215,19 +1232,14 @@ out:
 static void io_destroy(struct kioctx *ioctx)
 {
 	struct mm_struct *mm = current->mm;
-	struct kioctx **tmp;
 	int was_dead;
 
 	/* delete the entry from the list is someone else hasn't already */
-	write_lock(&mm->ioctx_list_lock);
+	spin_lock(&mm->ioctx_lock);
 	was_dead = ioctx->dead;
 	ioctx->dead = 1;
-	for (tmp = &mm->ioctx_list; *tmp && *tmp != ioctx;
-	     tmp = &(*tmp)->next)
-		;
-	if (*tmp)
-		*tmp = ioctx->next;
-	write_unlock(&mm->ioctx_list_lock);
+	hlist_del_rcu(&ioctx->list);
+	spin_unlock(&mm->ioctx_lock);
 
 	dprintk("aio_release(%p)\n", ioctx);
 	if (likely(!was_dead))
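
The aio changes above convert the per-mm ioctx list from a rwlock-protected singly linked list to an RCU hlist: lookups walk the list under rcu_read_lock(), updaters serialize on mm->ioctx_lock, and a kioctx is only freed after a grace period via call_rcu(). A sketch of the same pattern with illustrative names (struct item, item_lookup() and friends are not from the patch; the four-argument hlist_for_each_entry_rcu() mirrors the signature used in the diff):

struct item {
	unsigned long		id;
	struct hlist_node	list;
	struct rcu_head		rcu_head;
};

static HLIST_HEAD(table);		/* single bucket, for brevity */
static DEFINE_SPINLOCK(table_lock);	/* taken by updaters only */

static struct item *item_lookup(unsigned long id)
{
	struct item *it, *found = NULL;
	struct hlist_node *n;

	rcu_read_lock();		/* readers take no lock */
	hlist_for_each_entry_rcu(it, n, &table, list) {
		if (it->id == id) {
			/* a real user takes a reference here, as
			 * lookup_ioctx() does with get_ioctx() */
			found = it;
			break;
		}
	}
	rcu_read_unlock();
	return found;
}

static void item_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct item, rcu_head));
}

static void item_del(struct item *it)
{
	spin_lock(&table_lock);
	hlist_del_rcu(&it->list);	/* readers may still see it ... */
	spin_unlock(&table_lock);
	call_rcu(&it->rcu_head, item_free_rcu); /* ... until a grace period */
}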
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index c16d9be1b017..3bbdb9d02376 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -79,9 +79,12 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
 	if (IS_ERR(anon_inode_inode))
 		return -ENODEV;
 
+	if (fops->owner && !try_module_get(fops->owner))
+		return -ENOENT;
+
 	error = get_unused_fd_flags(flags);
 	if (error < 0)
-		return error;
+		goto err_module;
 	fd = error;
 
 	/*
@@ -128,6 +131,8 @@ err_dput:
 	dput(dentry);
 err_put_unused_fd:
 	put_unused_fd(fd);
+err_module:
+	module_put(fops->owner);
 	return error;
 }
 EXPORT_SYMBOL_GPL(anon_inode_getfd);
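
The anon_inodes fix pins the module that provides the file_operations before the fd is installed: try_module_get() fails once the owning module has begun unloading, and the new err_module label drops the reference on the error path. Note that module_put() accepts a NULL module as a no-op, which is why fops->owner is not re-checked there. The pattern reduced to its core (install_file() and setup_the_file() are illustrative stand-ins):

static int install_file(const struct file_operations *fops)
{
	int error;

	/* refuse to hand out a file backed by a dying module */
	if (fops->owner && !try_module_get(fops->owner))
		return -ENOENT;

	error = setup_the_file();	/* stand-in for the real work */
	if (error < 0)
		goto err_module;
	return 0;

err_module:
	module_put(fops->owner);	/* NULL owner is a no-op */
	return error;
}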
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index c773680d5c60..e1734f2d6e26 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -251,13 +251,11 @@ struct inode *autofs_iget(struct super_block *sb, unsigned long ino)
 	inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
 	inode->i_nlink = 2;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-	inode->i_blocks = 0;
 
 	if (ino == AUTOFS_ROOT_INO) {
 		inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
 		inode->i_op = &autofs_root_inode_operations;
 		inode->i_fop = &autofs_root_operations;
-		inode->i_uid = inode->i_gid = 0; /* Changed in read_super */
 		goto done;
 	}
 
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index e0f16da00e54..a76803108d06 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -25,8 +25,6 @@
 #define AUTOFS_DEV_IOCTL_IOC_FIRST	(AUTOFS_DEV_IOCTL_VERSION)
 #define AUTOFS_DEV_IOCTL_IOC_COUNT	(AUTOFS_IOC_COUNT - 11)
 
-#define AUTOFS_TYPE_TRIGGER	(AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET)
-
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/time.h>
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 63b7c7afe8df..025e105bffea 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -124,7 +124,7 @@ static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
 
 /*
  * Check sanity of parameter control fields and if a path is present
- * check that it has a "/" and is terminated.
+ * check that it is terminated and contains at least one "/".
  */
 static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
 {
@@ -138,15 +138,16 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
 	}
 
 	if (param->size > sizeof(*param)) {
-		err = check_name(param->path);
+		err = invalid_str(param->path,
+				 (void *) ((size_t) param + param->size));
 		if (err) {
-			AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
-				    cmd);
+			AUTOFS_WARN(
+			  "path string terminator missing for cmd(0x%08x)",
+			  cmd);
 			goto out;
 		}
 
-		err = invalid_str(param->path,
-				 (void *) ((size_t) param + param->size));
+		err = check_name(param->path);
 		if (err) {
 			AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
 				    cmd);
@@ -180,7 +181,7 @@ static int autofs_dev_ioctl_protover(struct file *fp,
 				     struct autofs_sb_info *sbi,
 				     struct autofs_dev_ioctl *param)
 {
-	param->arg1 = sbi->version;
+	param->protover.version = sbi->version;
 	return 0;
 }
 
@@ -189,7 +190,7 @@ static int autofs_dev_ioctl_protosubver(struct file *fp,
 					struct autofs_sb_info *sbi,
 					struct autofs_dev_ioctl *param)
 {
-	param->arg1 = sbi->sub_version;
+	param->protosubver.sub_version = sbi->sub_version;
 	return 0;
 }
 
@@ -335,13 +336,13 @@ static int autofs_dev_ioctl_openmount(struct file *fp,
 	int err, fd;
 
 	/* param->path has already been checked */
-	if (!param->arg1)
+	if (!param->openmount.devid)
 		return -EINVAL;
 
 	param->ioctlfd = -1;
 
 	path = param->path;
-	devid = param->arg1;
+	devid = param->openmount.devid;
 
 	err = 0;
 	fd = autofs_dev_ioctl_open_mountpoint(path, devid);
@@ -373,7 +374,7 @@ static int autofs_dev_ioctl_ready(struct file *fp,
 {
 	autofs_wqt_t token;
 
-	token = (autofs_wqt_t) param->arg1;
+	token = (autofs_wqt_t) param->ready.token;
 	return autofs4_wait_release(sbi, token, 0);
 }
 
@@ -388,8 +389,8 @@ static int autofs_dev_ioctl_fail(struct file *fp,
 	autofs_wqt_t token;
 	int status;
 
-	token = (autofs_wqt_t) param->arg1;
-	status = param->arg2 ? param->arg2 : -ENOENT;
+	token = (autofs_wqt_t) param->fail.token;
+	status = param->fail.status ? param->fail.status : -ENOENT;
 	return autofs4_wait_release(sbi, token, status);
 }
 
@@ -412,10 +413,10 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
 	int pipefd;
 	int err = 0;
 
-	if (param->arg1 == -1)
+	if (param->setpipefd.pipefd == -1)
 		return -EINVAL;
 
-	pipefd = param->arg1;
+	pipefd = param->setpipefd.pipefd;
 
 	mutex_lock(&sbi->wq_mutex);
 	if (!sbi->catatonic) {
@@ -457,8 +458,8 @@ static int autofs_dev_ioctl_timeout(struct file *fp,
 {
 	unsigned long timeout;
 
-	timeout = param->arg1;
-	param->arg1 = sbi->exp_timeout / HZ;
+	timeout = param->timeout.timeout;
+	param->timeout.timeout = sbi->exp_timeout / HZ;
 	sbi->exp_timeout = timeout * HZ;
 	return 0;
 }
@@ -489,7 +490,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
 	path = param->path;
 	devid = sbi->sb->s_dev;
 
-	param->arg1 = param->arg2 = -1;
+	param->requester.uid = param->requester.gid = -1;
 
 	/* Get nameidata of the parent directory */
 	err = path_lookup(path, LOOKUP_PARENT, &nd);
@@ -505,8 +506,8 @@ static int autofs_dev_ioctl_requester(struct file *fp,
 		err = 0;
 		autofs4_expire_wait(nd.path.dentry);
 		spin_lock(&sbi->fs_lock);
-		param->arg1 = ino->uid;
-		param->arg2 = ino->gid;
+		param->requester.uid = ino->uid;
+		param->requester.gid = ino->gid;
 		spin_unlock(&sbi->fs_lock);
 	}
 
@@ -529,10 +530,10 @@ static int autofs_dev_ioctl_expire(struct file *fp,
 	int err = -EAGAIN;
 	int how;
 
-	how = param->arg1;
+	how = param->expire.how;
 	mnt = fp->f_path.mnt;
 
-	if (sbi->type & AUTOFS_TYPE_TRIGGER)
+	if (autofs_type_trigger(sbi->type))
 		dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how);
 	else
 		dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how);
@@ -565,9 +566,9 @@ static int autofs_dev_ioctl_askumount(struct file *fp,
 				      struct autofs_sb_info *sbi,
 				      struct autofs_dev_ioctl *param)
 {
-	param->arg1 = 0;
+	param->askumount.may_umount = 0;
 	if (may_umount(fp->f_path.mnt))
-		param->arg1 = 1;
+		param->askumount.may_umount = 1;
 	return 0;
 }
 
@@ -600,6 +601,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 	struct nameidata nd;
 	const char *path;
 	unsigned int type;
+	unsigned int devid, magic;
 	int err = -ENOENT;
 
 	if (param->size <= sizeof(*param)) {
@@ -608,13 +610,13 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 	}
 
 	path = param->path;
-	type = param->arg1;
+	type = param->ismountpoint.in.type;
 
-	param->arg1 = 0;
-	param->arg2 = 0;
+	param->ismountpoint.out.devid = devid = 0;
+	param->ismountpoint.out.magic = magic = 0;
 
 	if (!fp || param->ioctlfd == -1) {
-		if (type == AUTOFS_TYPE_ANY) {
+		if (autofs_type_any(type)) {
 			struct super_block *sb;
 
 			err = path_lookup(path, LOOKUP_FOLLOW, &nd);
@@ -622,7 +624,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 				goto out;
 
 			sb = nd.path.dentry->d_sb;
-			param->arg1 = new_encode_dev(sb->s_dev);
+			devid = new_encode_dev(sb->s_dev);
 		} else {
 			struct autofs_info *ino;
 
@@ -635,38 +637,41 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 				goto out_release;
 
 			ino = autofs4_dentry_ino(nd.path.dentry);
-			param->arg1 = autofs4_get_dev(ino->sbi);
+			devid = autofs4_get_dev(ino->sbi);
 		}
 
 		err = 0;
 		if (nd.path.dentry->d_inode &&
 		    nd.path.mnt->mnt_root == nd.path.dentry) {
 			err = 1;
-			param->arg2 = nd.path.dentry->d_inode->i_sb->s_magic;
+			magic = nd.path.dentry->d_inode->i_sb->s_magic;
 		}
 	} else {
-		dev_t devid = new_encode_dev(sbi->sb->s_dev);
+		dev_t dev = autofs4_get_dev(sbi);
 
 		err = path_lookup(path, LOOKUP_PARENT, &nd);
 		if (err)
 			goto out;
 
-		err = autofs_dev_ioctl_find_super(&nd, devid);
+		err = autofs_dev_ioctl_find_super(&nd, dev);
 		if (err)
 			goto out_release;
 
-		param->arg1 = autofs4_get_dev(sbi);
+		devid = dev;
 
 		err = have_submounts(nd.path.dentry);
 
 		if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) {
 			if (follow_down(&nd.path.mnt, &nd.path.dentry)) {
 				struct inode *inode = nd.path.dentry->d_inode;
-				param->arg2 = inode->i_sb->s_magic;
+				magic = inode->i_sb->s_magic;
 			}
 		}
 	}
 
+	param->ismountpoint.out.devid = devid;
+	param->ismountpoint.out.magic = magic;
+
 out_release:
 	path_put(&nd.path);
 out:
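
Throughout the dev-ioctl changes the untyped param->arg1/arg2 slots give way to named per-command members (param->timeout.timeout, param->requester.uid, param->ismountpoint.out.devid, ...), which makes each command's in/out contract explicit. In the real header these live in a union inside struct autofs_dev_ioctl; the sketch below is a reduced, hypothetical version showing only the shape of the idea:

/* Hypothetical, reduced illustration; not the real autofs header. */
struct demo_dev_ioctl {
	__u32 ver_major;
	__u32 ver_minor;
	__u32 size;		/* total size, including the path */
	__s32 ioctlfd;

	/* one named member per command instead of generic arg1/arg2 */
	union {
		struct { __u32 version; } protover;
		struct { __u64 token; } ready;
		struct { __u64 token; __s32 status; } fail;
		struct { __u64 timeout; } timeout;
		struct { __u32 uid; __u32 gid; } requester;
		struct {
			struct { __u32 type; } in;
			struct { __u32 devid; __u32 magic; } out;
		} ismountpoint;
	};

	char path[0];
};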
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 4b6fb3f628c0..e3bd50776f9e 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -63,7 +63,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 		struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 
 		/* This is an autofs submount, we can't expire it */
-		if (sbi->type == AUTOFS_TYPE_INDIRECT)
+		if (autofs_type_indirect(sbi->type))
 			goto done;
 
 		/*
@@ -490,7 +490,7 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 	if (arg && get_user(do_now, arg))
 		return -EFAULT;
 
-	if (sbi->type & AUTOFS_TYPE_TRIGGER)
+	if (autofs_type_trigger(sbi->type))
 		dentry = autofs4_expire_direct(sb, mnt, sbi, do_now);
 	else
 		dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 7b19802cfef4..716e12b627b2 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -197,9 +197,9 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
 	seq_printf(m, ",minproto=%d", sbi->min_proto);
 	seq_printf(m, ",maxproto=%d", sbi->max_proto);
 
-	if (sbi->type & AUTOFS_TYPE_OFFSET)
+	if (autofs_type_offset(sbi->type))
 		seq_printf(m, ",offset");
-	else if (sbi->type & AUTOFS_TYPE_DIRECT)
+	else if (autofs_type_direct(sbi->type))
 		seq_printf(m, ",direct");
 	else
 		seq_printf(m, ",indirect");
@@ -284,13 +284,13 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
 			*maxproto = option;
 			break;
 		case Opt_indirect:
-			*type = AUTOFS_TYPE_INDIRECT;
+			set_autofs_type_indirect(type);
 			break;
 		case Opt_direct:
-			*type = AUTOFS_TYPE_DIRECT;
+			set_autofs_type_direct(type);
 			break;
 		case Opt_offset:
-			*type = AUTOFS_TYPE_OFFSET;
+			set_autofs_type_offset(type);
 			break;
 		default:
 			return 1;
@@ -338,7 +338,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	sbi->sb = s;
 	sbi->version = 0;
 	sbi->sub_version = 0;
-	sbi->type = AUTOFS_TYPE_INDIRECT;
+	set_autofs_type_indirect(&sbi->type);
 	sbi->min_proto = 0;
 	sbi->max_proto = 0;
 	mutex_init(&sbi->wq_mutex);
@@ -380,7 +380,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	}
 
 	root_inode->i_fop = &autofs4_root_operations;
-	root_inode->i_op = sbi->type & AUTOFS_TYPE_TRIGGER ?
+	root_inode->i_op = autofs_type_trigger(sbi->type) ?
 			&autofs4_direct_root_inode_operations :
 			&autofs4_indirect_root_inode_operations;
 
@@ -455,11 +455,7 @@ struct inode *autofs4_get_inode(struct super_block *sb,
 	if (sb->s_root) {
 		inode->i_uid = sb->s_root->d_inode->i_uid;
 		inode->i_gid = sb->s_root->d_inode->i_gid;
-	} else {
-		inode->i_uid = 0;
-		inode->i_gid = 0;
 	}
-	inode->i_blocks = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 
 	if (S_ISDIR(inf->mode)) {
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index e02cc8ae5eb3..eeb246845909 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -337,7 +337,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 		 * is very similar for indirect mounts except only dentrys
 		 * in the root of the autofs file system may be negative.
 		 */
-		if (sbi->type & AUTOFS_TYPE_TRIGGER)
+		if (autofs_type_trigger(sbi->type))
 			return -ENOENT;
 		else if (!IS_ROOT(dentry->d_parent))
 			return -ENOENT;
@@ -348,7 +348,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 		return -ENOMEM;
 
 	/* If this is a direct mount request create a dummy name */
-	if (IS_ROOT(dentry) && sbi->type & AUTOFS_TYPE_TRIGGER)
+	if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type))
 		qstr.len = sprintf(name, "%p", dentry);
 	else {
 		qstr.len = autofs4_getpath(sbi, dentry, &name);
@@ -406,11 +406,11 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 			type = autofs_ptype_expire_multi;
 	} else {
 		if (notify == NFY_MOUNT)
-			type = (sbi->type & AUTOFS_TYPE_TRIGGER) ?
+			type = autofs_type_trigger(sbi->type) ?
 				autofs_ptype_missing_direct :
 				autofs_ptype_missing_indirect;
 		else
-			type = (sbi->type & AUTOFS_TYPE_TRIGGER) ?
+			type = autofs_type_trigger(sbi->type) ?
 				autofs_ptype_expire_direct :
 				autofs_ptype_expire_indirect;
 	}
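
All of the autofs4 hunks above replace open-coded tests against the AUTOFS_TYPE_* flags with small predicate and setter helpers (autofs_type_trigger(), set_autofs_type_indirect(), and so on). Their definitions are not part of this diff; a plausible sketch of what such helpers look like, assuming simple integer type codes:

/* Illustrative only; the real values and helpers live in the autofs headers. */
static inline unsigned int autofs_type_indirect(unsigned int type)
{
	return type == AUTOFS_TYPE_INDIRECT;
}

static inline unsigned int autofs_type_trigger(unsigned int type)
{
	/* direct and offset mounts both act as mount "triggers" */
	return type == AUTOFS_TYPE_DIRECT || type == AUTOFS_TYPE_OFFSET;
}

static inline void set_autofs_type_indirect(unsigned int *type)
{
	*type = AUTOFS_TYPE_INDIRECT;
}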
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 5f1538c03b1b..a05287a23f62 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -132,11 +132,6 @@ static int bad_file_check_flags(int flags)
 	return -EIO;
 }
 
-static int bad_file_dir_notify(struct file *file, unsigned long arg)
-{
-	return -EIO;
-}
-
 static int bad_file_flock(struct file *filp, int cmd, struct file_lock *fl)
 {
 	return -EIO;
@@ -179,7 +174,6 @@ static const struct file_operations bad_file_ops =
 	.sendpage	= bad_file_sendpage,
 	.get_unmapped_area = bad_file_get_unmapped_area,
 	.check_flags	= bad_file_check_flags,
-	.dir_notify	= bad_file_dir_notify,
 	.flock		= bad_file_flock,
 	.splice_write	= bad_file_splice_write,
 	.splice_read	= bad_file_splice_read,
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index b6dfee37c7b7..d06cb023ad02 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -378,7 +378,8 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
378 inode->i_size = 0; 378 inode->i_size = 0;
379 inode->i_blocks = befs_sb->block_size / VFS_BLOCK_SIZE; 379 inode->i_blocks = befs_sb->block_size / VFS_BLOCK_SIZE;
380 strncpy(befs_ino->i_data.symlink, raw_inode->data.symlink, 380 strncpy(befs_ino->i_data.symlink, raw_inode->data.symlink,
381 BEFS_SYMLINK_LEN); 381 BEFS_SYMLINK_LEN - 1);
382 befs_ino->i_data.symlink[BEFS_SYMLINK_LEN - 1] = '\0';
382 } else { 383 } else {
383 int num_blks; 384 int num_blks;
384 385
@@ -477,6 +478,8 @@ befs_follow_link(struct dentry *dentry, struct nameidata *nd)
477 kfree(link); 478 kfree(link);
478 befs_error(sb, "Failed to read entire long symlink"); 479 befs_error(sb, "Failed to read entire long symlink");
479 link = ERR_PTR(-EIO); 480 link = ERR_PTR(-EIO);
481 } else {
482 link[len - 1] = '\0';
480 } 483 }
481 } else { 484 } else {
482 link = befs_ino->i_data.symlink; 485 link = befs_ino->i_data.symlink;
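
Both befs hunks close the same class of hole: strncpy() does not NUL-terminate the destination when the source fills it, and a long symlink read from disk may carry no terminator at all. A minimal sketch of the resulting pattern, with size standing in for BEFS_SYMLINK_LEN or any other fixed buffer length:

	#include <string.h>

	/* copy at most size-1 bytes and always NUL-terminate; strncpy()
	 * alone leaves dst unterminated when src fills it completely */
	static void copy_symlink(char *dst, const char *src, size_t size)
	{
		strncpy(dst, src, size - 1);
		dst[size - 1] = '\0';
	}
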
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 0ed57b5ee012..cc4062d12ca2 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -213,6 +213,9 @@ static void bfs_put_super(struct super_block *s)
213{ 213{
214 struct bfs_sb_info *info = BFS_SB(s); 214 struct bfs_sb_info *info = BFS_SB(s);
215 215
216 if (!info)
217 return;
218
216 brelse(info->si_sbh); 219 brelse(info->si_sbh);
217 mutex_destroy(&info->bfs_lock); 220 mutex_destroy(&info->bfs_lock);
218 kfree(info->si_imap); 221 kfree(info->si_imap);
@@ -327,6 +330,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
327 unsigned i, imap_len; 330 unsigned i, imap_len;
328 struct bfs_sb_info *info; 331 struct bfs_sb_info *info;
329 long ret = -EINVAL; 332 long ret = -EINVAL;
333 unsigned long i_sblock, i_eblock, i_eoff, s_size;
330 334
331 info = kzalloc(sizeof(*info), GFP_KERNEL); 335 info = kzalloc(sizeof(*info), GFP_KERNEL);
332 if (!info) 336 if (!info)
@@ -350,6 +354,12 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
350 354
351 s->s_magic = BFS_MAGIC; 355 s->s_magic = BFS_MAGIC;
352 info->si_sbh = bh; 356 info->si_sbh = bh;
357
358 if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) {
359 printf("Superblock is corrupted\n");
360 goto out;
361 }
362
353 info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / 363 info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) /
354 sizeof(struct bfs_inode) 364 sizeof(struct bfs_inode)
355 + BFS_ROOT_INO - 1; 365 + BFS_ROOT_INO - 1;
@@ -380,6 +390,18 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
380 - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS; 390 - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS;
381 info->si_freei = 0; 391 info->si_freei = 0;
382 info->si_lf_eblk = 0; 392 info->si_lf_eblk = 0;
393
394 /* can we read the last block? */
395 bh = sb_bread(s, info->si_blocks - 1);
396 if (!bh) {
397 printf("Last block not available: %lu\n", info->si_blocks - 1);
398 iput(inode);
399 ret = -EIO;
400 kfree(info->si_imap);
401 goto out;
402 }
403 brelse(bh);
404
383 bh = NULL; 405 bh = NULL;
384 for (i = BFS_ROOT_INO; i <= info->si_lasti; i++) { 406 for (i = BFS_ROOT_INO; i <= info->si_lasti; i++) {
385 struct bfs_inode *di; 407 struct bfs_inode *di;
@@ -397,6 +419,29 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
397 419
398 di = (struct bfs_inode *)bh->b_data + off; 420 di = (struct bfs_inode *)bh->b_data + off;
399 421
422 /* test if filesystem is not corrupted */
423
424 i_eoff = le32_to_cpu(di->i_eoffset);
425 i_sblock = le32_to_cpu(di->i_sblock);
426 i_eblock = le32_to_cpu(di->i_eblock);
427 s_size = le32_to_cpu(bfs_sb->s_end);
428
429 if (i_sblock > info->si_blocks ||
430 i_eblock > info->si_blocks ||
431 i_sblock > i_eblock ||
432 i_eoff > s_size ||
433 i_sblock * BFS_BSIZE > i_eoff) {
434
435 printf("Inode 0x%08x corrupted\n", i);
436
437 brelse(bh);
438 s->s_root = NULL;
439 kfree(info->si_imap);
440 kfree(info);
441 s->s_fs_info = NULL;
442 return -EIO;
443 }
444
400 if (!di->i_ino) { 445 if (!di->i_ino) {
401 info->si_freei++; 446 info->si_freei++;
402 continue; 447 continue;
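
The bfs changes stop trusting on-disk metadata blindly: a NULL s_fs_info is tolerated in put_super, the superblock's start/end blocks must be ordered, the last data block must be readable, and every inode's extent is range-checked. A hedged sketch of that per-inode test factored into a predicate (field names follow the diff; BFS_BSIZE is the 512-byte bfs block size):

	static int bfs_inode_is_sane(unsigned long i_sblock, unsigned long i_eblock,
				     unsigned long i_eoff, unsigned long si_blocks,
				     unsigned long s_size)
	{
		/* extent blocks must lie inside the device and be ordered,
		 * and the end offset must stay within the superblock's s_end */
		return i_sblock <= si_blocks && i_eblock <= si_blocks &&
		       i_sblock <= i_eblock && i_eoff <= s_size &&
		       i_sblock * BFS_BSIZE <= i_eoff;
	}
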
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index f1f3f4192a60..b639dcf7c778 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -95,92 +95,55 @@ static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, u
95 int has_dumped = 0; 95 int has_dumped = 0;
96 unsigned long dump_start, dump_size; 96 unsigned long dump_start, dump_size;
97 struct user dump; 97 struct user dump;
98#if defined(__alpha__) 98#ifdef __alpha__
99# define START_DATA(u) (u.start_data) 99# define START_DATA(u) (u.start_data)
100#elif defined(__arm__) 100#else
101# define START_DATA(u) ((u.u_tsize << PAGE_SHIFT) + u.start_code) 101# define START_DATA(u) ((u.u_tsize << PAGE_SHIFT) + u.start_code)
102#elif defined(__sparc__)
103# define START_DATA(u) (u.u_tsize)
104#elif defined(__i386__) || defined(__mc68000__) || defined(__arch_um__)
105# define START_DATA(u) (u.u_tsize << PAGE_SHIFT)
106#endif 102#endif
107#ifdef __sparc__
108# define START_STACK(u) ((regs->u_regs[UREG_FP]) & ~(PAGE_SIZE - 1))
109#else
110# define START_STACK(u) (u.start_stack) 103# define START_STACK(u) (u.start_stack)
111#endif
112 104
113 fs = get_fs(); 105 fs = get_fs();
114 set_fs(KERNEL_DS); 106 set_fs(KERNEL_DS);
115 has_dumped = 1; 107 has_dumped = 1;
116 current->flags |= PF_DUMPCORE; 108 current->flags |= PF_DUMPCORE;
117 strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm)); 109 strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm));
118#ifndef __sparc__
119 dump.u_ar0 = offsetof(struct user, regs); 110 dump.u_ar0 = offsetof(struct user, regs);
120#endif
121 dump.signal = signr; 111 dump.signal = signr;
122 aout_dump_thread(regs, &dump); 112 aout_dump_thread(regs, &dump);
123 113
124/* If the size of the dump file exceeds the rlimit, then see what would happen 114/* If the size of the dump file exceeds the rlimit, then see what would happen
125 if we wrote the stack, but not the data area. */ 115 if we wrote the stack, but not the data area. */
126#ifdef __sparc__
127 if ((dump.u_dsize + dump.u_ssize) > limit)
128 dump.u_dsize = 0;
129#else
130 if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > limit) 116 if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > limit)
131 dump.u_dsize = 0; 117 dump.u_dsize = 0;
132#endif
133 118
134/* Make sure we have enough room to write the stack and data areas. */ 119/* Make sure we have enough room to write the stack and data areas. */
135#ifdef __sparc__
136 if (dump.u_ssize > limit)
137 dump.u_ssize = 0;
138#else
139 if ((dump.u_ssize + 1) * PAGE_SIZE > limit) 120 if ((dump.u_ssize + 1) * PAGE_SIZE > limit)
140 dump.u_ssize = 0; 121 dump.u_ssize = 0;
141#endif
142 122
143/* make sure we actually have a data and stack area to dump */ 123/* make sure we actually have a data and stack area to dump */
144 set_fs(USER_DS); 124 set_fs(USER_DS);
145#ifdef __sparc__
146 if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize))
147 dump.u_dsize = 0;
148 if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize))
149 dump.u_ssize = 0;
150#else
151 if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize << PAGE_SHIFT)) 125 if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
152 dump.u_dsize = 0; 126 dump.u_dsize = 0;
153 if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize << PAGE_SHIFT)) 127 if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
154 dump.u_ssize = 0; 128 dump.u_ssize = 0;
155#endif
156 129
157 set_fs(KERNEL_DS); 130 set_fs(KERNEL_DS);
158/* struct user */ 131/* struct user */
159 DUMP_WRITE(&dump,sizeof(dump)); 132 DUMP_WRITE(&dump,sizeof(dump));
160/* Now dump all of the user data. Include malloced stuff as well */ 133/* Now dump all of the user data. Include malloced stuff as well */
161#ifndef __sparc__
162 DUMP_SEEK(PAGE_SIZE); 134 DUMP_SEEK(PAGE_SIZE);
163#endif
164/* now we start writing out the user space info */ 135/* now we start writing out the user space info */
165 set_fs(USER_DS); 136 set_fs(USER_DS);
166/* Dump the data area */ 137/* Dump the data area */
167 if (dump.u_dsize != 0) { 138 if (dump.u_dsize != 0) {
168 dump_start = START_DATA(dump); 139 dump_start = START_DATA(dump);
169#ifdef __sparc__
170 dump_size = dump.u_dsize;
171#else
172 dump_size = dump.u_dsize << PAGE_SHIFT; 140 dump_size = dump.u_dsize << PAGE_SHIFT;
173#endif
174 DUMP_WRITE(dump_start,dump_size); 141 DUMP_WRITE(dump_start,dump_size);
175 } 142 }
176/* Now prepare to dump the stack area */ 143/* Now prepare to dump the stack area */
177 if (dump.u_ssize != 0) { 144 if (dump.u_ssize != 0) {
178 dump_start = START_STACK(dump); 145 dump_start = START_STACK(dump);
179#ifdef __sparc__
180 dump_size = dump.u_ssize;
181#else
182 dump_size = dump.u_ssize << PAGE_SHIFT; 146 dump_size = dump.u_ssize << PAGE_SHIFT;
183#endif
184 DUMP_WRITE(dump_start,dump_size); 147 DUMP_WRITE(dump_start,dump_size);
185 } 148 }
186/* Finally dump the task struct. Not used by gdb, but could be useful */ 149/* Finally dump the task struct. Not used by gdb, but could be useful */
@@ -205,29 +168,24 @@ static unsigned long __user *create_aout_tables(char __user *p, struct linux_bin
205 int envc = bprm->envc; 168 int envc = bprm->envc;
206 169
207 sp = (void __user *)((-(unsigned long)sizeof(char *)) & (unsigned long) p); 170 sp = (void __user *)((-(unsigned long)sizeof(char *)) & (unsigned long) p);
208#ifdef __sparc__
209 /* This imposes the proper stack alignment for a new process. */
210 sp = (void __user *) (((unsigned long) sp) & ~7);
211 if ((envc+argc+3)&1) --sp;
212#endif
213#ifdef __alpha__ 171#ifdef __alpha__
214/* whee.. test-programs are so much fun. */ 172/* whee.. test-programs are so much fun. */
215 put_user(0, --sp); 173 put_user(0, --sp);
216 put_user(0, --sp); 174 put_user(0, --sp);
217 if (bprm->loader) { 175 if (bprm->loader) {
218 put_user(0, --sp); 176 put_user(0, --sp);
219 put_user(0x3eb, --sp); 177 put_user(1003, --sp);
220 put_user(bprm->loader, --sp); 178 put_user(bprm->loader, --sp);
221 put_user(0x3ea, --sp); 179 put_user(1002, --sp);
222 } 180 }
223 put_user(bprm->exec, --sp); 181 put_user(bprm->exec, --sp);
224 put_user(0x3e9, --sp); 182 put_user(1001, --sp);
225#endif 183#endif
226 sp -= envc+1; 184 sp -= envc+1;
227 envp = (char __user * __user *) sp; 185 envp = (char __user * __user *) sp;
228 sp -= argc+1; 186 sp -= argc+1;
229 argv = (char __user * __user *) sp; 187 argv = (char __user * __user *) sp;
230#if defined(__i386__) || defined(__mc68000__) || defined(__arm__) || defined(__arch_um__) 188#ifndef __alpha__
231 put_user((unsigned long) envp,--sp); 189 put_user((unsigned long) envp,--sp);
232 put_user((unsigned long) argv,--sp); 190 put_user((unsigned long) argv,--sp);
233#endif 191#endif
@@ -300,13 +258,8 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
300 return retval; 258 return retval;
301 259
302 /* OK, This is the point of no return */ 260 /* OK, This is the point of no return */
303#if defined(__alpha__) 261#ifdef __alpha__
304 SET_AOUT_PERSONALITY(bprm, ex); 262 SET_AOUT_PERSONALITY(bprm, ex);
305#elif defined(__sparc__)
306 set_personality(PER_SUNOS);
307#if !defined(__sparc_v9__)
308 memcpy(&current->thread.core_exec, &ex, sizeof(struct exec));
309#endif
310#else 263#else
311 set_personality(PER_LINUX); 264 set_personality(PER_LINUX);
312#endif 265#endif
@@ -322,24 +275,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
322 275
323 install_exec_creds(bprm); 276 install_exec_creds(bprm);
324 current->flags &= ~PF_FORKNOEXEC; 277 current->flags &= ~PF_FORKNOEXEC;
325#ifdef __sparc__
326 if (N_MAGIC(ex) == NMAGIC) {
327 loff_t pos = fd_offset;
328 /* Fuck me plenty... */
329 /* <AOL></AOL> */
330 down_write(&current->mm->mmap_sem);
331 error = do_brk(N_TXTADDR(ex), ex.a_text);
332 up_write(&current->mm->mmap_sem);
333 bprm->file->f_op->read(bprm->file, (char *) N_TXTADDR(ex),
334 ex.a_text, &pos);
335 down_write(&current->mm->mmap_sem);
336 error = do_brk(N_DATADDR(ex), ex.a_data);
337 up_write(&current->mm->mmap_sem);
338 bprm->file->f_op->read(bprm->file, (char *) N_DATADDR(ex),
339 ex.a_data, &pos);
340 goto beyond_if;
341 }
342#endif
343 278
344 if (N_MAGIC(ex) == OMAGIC) { 279 if (N_MAGIC(ex) == OMAGIC) {
345 unsigned long text_addr, map_size; 280 unsigned long text_addr, map_size;
@@ -347,7 +282,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
347 282
348 text_addr = N_TXTADDR(ex); 283 text_addr = N_TXTADDR(ex);
349 284
350#if defined(__alpha__) || defined(__sparc__) 285#ifdef __alpha__
351 pos = fd_offset; 286 pos = fd_offset;
352 map_size = ex.a_text+ex.a_data + PAGE_SIZE - 1; 287 map_size = ex.a_text+ex.a_data + PAGE_SIZE - 1;
353#else 288#else
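
One easily misread detail in the alpha branch of create_aout_tables(): the loader markers changed radix only, not value, so the words pushed on the stack are identical. A trivial standalone check:

	#include <assert.h>

	int main(void)
	{
		/* 0x3e9..0x3eb are the same markers written in decimal */
		assert(0x3e9 == 1001 && 0x3ea == 1002 && 0x3eb == 1003);
		return 0;
	}
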
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index c41fa2af7677..e3ff2b9e602f 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -152,8 +152,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
152 elf_addr_t __user *sp; 152 elf_addr_t __user *sp;
153 elf_addr_t __user *u_platform; 153 elf_addr_t __user *u_platform;
154 elf_addr_t __user *u_base_platform; 154 elf_addr_t __user *u_base_platform;
155 elf_addr_t __user *u_rand_bytes;
155 const char *k_platform = ELF_PLATFORM; 156 const char *k_platform = ELF_PLATFORM;
156 const char *k_base_platform = ELF_BASE_PLATFORM; 157 const char *k_base_platform = ELF_BASE_PLATFORM;
158 unsigned char k_rand_bytes[16];
157 int items; 159 int items;
158 elf_addr_t *elf_info; 160 elf_addr_t *elf_info;
159 int ei_index = 0; 161 int ei_index = 0;
@@ -196,6 +198,15 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
196 return -EFAULT; 198 return -EFAULT;
197 } 199 }
198 200
201 /*
202 * Generate 16 random bytes for userspace PRNG seeding.
203 */
204 get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes));
205 u_rand_bytes = (elf_addr_t __user *)
206 STACK_ALLOC(p, sizeof(k_rand_bytes));
207 if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes)))
208 return -EFAULT;
209
199 /* Create the ELF interpreter info */ 210 /* Create the ELF interpreter info */
200 elf_info = (elf_addr_t *)current->mm->saved_auxv; 211 elf_info = (elf_addr_t *)current->mm->saved_auxv;
201 /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */ 212 /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */
@@ -228,6 +239,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
228 NEW_AUX_ENT(AT_GID, cred->gid); 239 NEW_AUX_ENT(AT_GID, cred->gid);
229 NEW_AUX_ENT(AT_EGID, cred->egid); 240 NEW_AUX_ENT(AT_EGID, cred->egid);
230 NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); 241 NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
242 NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes);
231 NEW_AUX_ENT(AT_EXECFN, bprm->exec); 243 NEW_AUX_ENT(AT_EXECFN, bprm->exec);
232 if (k_platform) { 244 if (k_platform) {
233 NEW_AUX_ENT(AT_PLATFORM, 245 NEW_AUX_ENT(AT_PLATFORM,
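
The new AT_RANDOM entry hands every freshly exec'd process 16 kernel-generated random bytes on its stack, letting userspace seed a PRNG without an extra syscall. A hedged userspace sketch of consuming it, assuming a libc that exposes getauxval(3):

	#include <stdio.h>
	#include <sys/auxv.h>

	int main(void)
	{
		/* AT_RANDOM points at the 16 bytes create_elf_tables() copied in */
		const unsigned char *r = (const unsigned char *)getauxval(AT_RANDOM);
		int i;

		if (!r)
			return 1;
		for (i = 0; i < 16; i++)
			printf("%02x", r[i]);
		printf("\n");
		return 0;
	}
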
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index aa5b43205e37..f3e72c5c19f5 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -168,9 +168,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
168 struct elf_fdpic_params exec_params, interp_params; 168 struct elf_fdpic_params exec_params, interp_params;
169 struct elf_phdr *phdr; 169 struct elf_phdr *phdr;
170 unsigned long stack_size, entryaddr; 170 unsigned long stack_size, entryaddr;
171#ifndef CONFIG_MMU
172 unsigned long fullsize;
173#endif
174#ifdef ELF_FDPIC_PLAT_INIT 171#ifdef ELF_FDPIC_PLAT_INIT
175 unsigned long dynaddr; 172 unsigned long dynaddr;
176#endif 173#endif
@@ -390,11 +387,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
390 goto error_kill; 387 goto error_kill;
391 } 388 }
392 389
393 /* expand the stack mapping to use up the entire allocation granule */
394 fullsize = kobjsize((char *) current->mm->start_brk);
395 if (!IS_ERR_VALUE(do_mremap(current->mm->start_brk, stack_size,
396 fullsize, 0, 0)))
397 stack_size = fullsize;
398 up_write(&current->mm->mmap_sem); 390 up_write(&current->mm->mmap_sem);
399 391
400 current->mm->brk = current->mm->start_brk; 392 current->mm->brk = current->mm->start_brk;
@@ -1567,11 +1559,9 @@ end_coredump:
1567static int elf_fdpic_dump_segments(struct file *file, size_t *size, 1559static int elf_fdpic_dump_segments(struct file *file, size_t *size,
1568 unsigned long *limit, unsigned long mm_flags) 1560 unsigned long *limit, unsigned long mm_flags)
1569{ 1561{
1570 struct vm_list_struct *vml; 1562 struct vm_area_struct *vma;
1571
1572 for (vml = current->mm->context.vmlist; vml; vml = vml->next) {
1573 struct vm_area_struct *vma = vml->vma;
1574 1563
1564 for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
1575 if (!maydump(vma, mm_flags)) 1565 if (!maydump(vma, mm_flags))
1576 continue; 1566 continue;
1577 1567
@@ -1617,9 +1607,6 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1617 elf_fpxregset_t *xfpu = NULL; 1607 elf_fpxregset_t *xfpu = NULL;
1618#endif 1608#endif
1619 int thread_status_size = 0; 1609 int thread_status_size = 0;
1620#ifndef CONFIG_MMU
1621 struct vm_list_struct *vml;
1622#endif
1623 elf_addr_t *auxv; 1610 elf_addr_t *auxv;
1624 unsigned long mm_flags; 1611 unsigned long mm_flags;
1625 1612
@@ -1685,13 +1672,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1685 fill_prstatus(prstatus, current, signr); 1672 fill_prstatus(prstatus, current, signr);
1686 elf_core_copy_regs(&prstatus->pr_reg, regs); 1673 elf_core_copy_regs(&prstatus->pr_reg, regs);
1687 1674
1688#ifdef CONFIG_MMU
1689 segs = current->mm->map_count; 1675 segs = current->mm->map_count;
1690#else
1691 segs = 0;
1692 for (vml = current->mm->context.vmlist; vml; vml = vml->next)
1693 segs++;
1694#endif
1695#ifdef ELF_CORE_EXTRA_PHDRS 1676#ifdef ELF_CORE_EXTRA_PHDRS
1696 segs += ELF_CORE_EXTRA_PHDRS; 1677 segs += ELF_CORE_EXTRA_PHDRS;
1697#endif 1678#endif
@@ -1766,20 +1747,10 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1766 mm_flags = current->mm->flags; 1747 mm_flags = current->mm->flags;
1767 1748
1768 /* write program headers for segments dump */ 1749 /* write program headers for segments dump */
1769 for ( 1750 for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
1770#ifdef CONFIG_MMU
1771 vma = current->mm->mmap; vma; vma = vma->vm_next
1772#else
1773 vml = current->mm->context.vmlist; vml; vml = vml->next
1774#endif
1775 ) {
1776 struct elf_phdr phdr; 1751 struct elf_phdr phdr;
1777 size_t sz; 1752 size_t sz;
1778 1753
1779#ifndef CONFIG_MMU
1780 vma = vml->vma;
1781#endif
1782
1783 sz = vma->vm_end - vma->vm_start; 1754 sz = vma->vm_end - vma->vm_start;
1784 1755
1785 phdr.p_type = PT_LOAD; 1756 phdr.p_type = PT_LOAD;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 7bbd5c6b3725..5cebf0b37798 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -417,8 +417,8 @@ static int load_flat_file(struct linux_binprm * bprm,
417 unsigned long textpos = 0, datapos = 0, result; 417 unsigned long textpos = 0, datapos = 0, result;
418 unsigned long realdatastart = 0; 418 unsigned long realdatastart = 0;
419 unsigned long text_len, data_len, bss_len, stack_len, flags; 419 unsigned long text_len, data_len, bss_len, stack_len, flags;
420 unsigned long len, reallen, memp = 0; 420 unsigned long len, memp = 0;
421 unsigned long extra, rlim; 421 unsigned long memp_size, extra, rlim;
422 unsigned long *reloc = 0, *rp; 422 unsigned long *reloc = 0, *rp;
423 struct inode *inode; 423 struct inode *inode;
424 int i, rev, relocs = 0; 424 int i, rev, relocs = 0;
@@ -543,17 +543,10 @@ static int load_flat_file(struct linux_binprm * bprm,
543 } 543 }
544 544
545 len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); 545 len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
546 len = PAGE_ALIGN(len);
546 down_write(&current->mm->mmap_sem); 547 down_write(&current->mm->mmap_sem);
547 realdatastart = do_mmap(0, 0, len, 548 realdatastart = do_mmap(0, 0, len,
548 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); 549 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
549 /* Remap to use all availabe slack region space */
550 if (realdatastart && (realdatastart < (unsigned long)-4096)) {
551 reallen = kobjsize((void *)realdatastart);
552 if (reallen > len) {
553 realdatastart = do_mremap(realdatastart, len,
554 reallen, MREMAP_FIXED, realdatastart);
555 }
556 }
557 up_write(&current->mm->mmap_sem); 550 up_write(&current->mm->mmap_sem);
558 551
559 if (realdatastart == 0 || realdatastart >= (unsigned long)-4096) { 552 if (realdatastart == 0 || realdatastart >= (unsigned long)-4096) {
@@ -591,21 +584,14 @@ static int load_flat_file(struct linux_binprm * bprm,
591 584
592 reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len)); 585 reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len));
593 memp = realdatastart; 586 memp = realdatastart;
594 587 memp_size = len;
595 } else { 588 } else {
596 589
597 len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); 590 len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
591 len = PAGE_ALIGN(len);
598 down_write(&current->mm->mmap_sem); 592 down_write(&current->mm->mmap_sem);
599 textpos = do_mmap(0, 0, len, 593 textpos = do_mmap(0, 0, len,
600 PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); 594 PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
601 /* Remap to use all availabe slack region space */
602 if (textpos && (textpos < (unsigned long) -4096)) {
603 reallen = kobjsize((void *)textpos);
604 if (reallen > len) {
605 textpos = do_mremap(textpos, len, reallen,
606 MREMAP_FIXED, textpos);
607 }
608 }
609 up_write(&current->mm->mmap_sem); 595 up_write(&current->mm->mmap_sem);
610 596
611 if (!textpos || textpos >= (unsigned long) -4096) { 597 if (!textpos || textpos >= (unsigned long) -4096) {
@@ -622,7 +608,7 @@ static int load_flat_file(struct linux_binprm * bprm,
622 reloc = (unsigned long *) (textpos + ntohl(hdr->reloc_start) + 608 reloc = (unsigned long *) (textpos + ntohl(hdr->reloc_start) +
623 MAX_SHARED_LIBS * sizeof(unsigned long)); 609 MAX_SHARED_LIBS * sizeof(unsigned long));
624 memp = textpos; 610 memp = textpos;
625 611 memp_size = len;
626#ifdef CONFIG_BINFMT_ZFLAT 612#ifdef CONFIG_BINFMT_ZFLAT
627 /* 613 /*
628 * load it all in and treat it like a RAM load from now on 614 * load it all in and treat it like a RAM load from now on
@@ -680,10 +666,12 @@ static int load_flat_file(struct linux_binprm * bprm,
680 * set up the brk stuff, uses any slack left in data/bss/stack 666 * set up the brk stuff, uses any slack left in data/bss/stack
681 * allocation. We put the brk after the bss (between the bss 667 * allocation. We put the brk after the bss (between the bss
682 * and stack) like other platforms. 668 * and stack) like other platforms.
669 * Userspace code relies on the stack pointer starting out at
670 * an address right at the end of a page.
683 */ 671 */
684 current->mm->start_brk = datapos + data_len + bss_len; 672 current->mm->start_brk = datapos + data_len + bss_len;
685 current->mm->brk = (current->mm->start_brk + 3) & ~3; 673 current->mm->brk = (current->mm->start_brk + 3) & ~3;
686 current->mm->context.end_brk = memp + kobjsize((void *) memp) - stack_len; 674 current->mm->context.end_brk = memp + memp_size - stack_len;
687 } 675 }
688 676
689 if (flags & FLAT_FLAG_KTRACE) 677 if (flags & FLAT_FLAG_KTRACE)
@@ -790,8 +778,8 @@ static int load_flat_file(struct linux_binprm * bprm,
790 778
791 /* zero the BSS, BRK and stack areas */ 779 /* zero the BSS, BRK and stack areas */
792 memset((void*)(datapos + data_len), 0, bss_len + 780 memset((void*)(datapos + data_len), 0, bss_len +
793 (memp + kobjsize((void *) memp) - stack_len - /* end brk */ 781 (memp + memp_size - stack_len - /* end brk */
794 libinfo->lib_list[id].start_brk) + /* start brk */ 782 libinfo->lib_list[id].start_brk) + /* start brk */
795 stack_len); 783 stack_len);
796 784
797 return 0; 785 return 0;
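
With the NOMMU mremap trick gone, load_flat_file() page-aligns the mapping length up front and carries it in memp_size rather than re-deriving it from kobjsize(); per the new comment, userspace expects the initial stack pointer to sit right at the end of a page. PAGE_ALIGN is the usual round-up helper; a sketch assuming its conventional definition (PAGE_SIZE a power of two):

	/* round len up to the next page boundary; with 4096-byte pages,
	 * PAGE_ALIGN(1) == 4096, PAGE_ALIGN(4096) == 4096,
	 * PAGE_ALIGN(4097) == 8192 */
	#define PAGE_ALIGN(len)	(((len) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
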
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index f2744ab4e5b3..c4e83537ead7 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -496,9 +496,6 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
496 496
497 if (inode) { 497 if (inode) {
498 inode->i_mode = mode; 498 inode->i_mode = mode;
499 inode->i_uid = 0;
500 inode->i_gid = 0;
501 inode->i_blocks = 0;
502 inode->i_atime = inode->i_mtime = inode->i_ctime = 499 inode->i_atime = inode->i_mtime = inode->i_ctime =
503 current_fs_time(inode->i_sb); 500 current_fs_time(inode->i_sb);
504 } 501 }
@@ -652,7 +649,7 @@ static const struct file_operations bm_register_operations = {
652static ssize_t 649static ssize_t
653bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) 650bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
654{ 651{
655 char *s = enabled ? "enabled" : "disabled"; 652 char *s = enabled ? "enabled\n" : "disabled\n";
656 653
657 return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s)); 654 return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s));
658} 655}
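
Appending the newline in the kernel means a plain cat of the status file now prints a clean line. A minimal userspace read, assuming binfmt_misc is mounted at its usual location:

	#include <stdio.h>

	int main(void)
	{
		char buf[16] = "";
		FILE *f = fopen("/proc/sys/fs/binfmt_misc/status", "r");

		if (!f)
			return 1;
		if (fgets(buf, sizeof(buf), f))
			fputs(buf, stdout);	/* "enabled\n" or "disabled\n" */
		fclose(f);
		return 0;
	}
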
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 19caf7c962ac..77ebc3c263d6 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -111,7 +111,7 @@ void bio_integrity_free(struct bio *bio, struct bio_set *bs)
111 && bip->bip_buf != NULL) 111 && bip->bip_buf != NULL)
112 kfree(bip->bip_buf); 112 kfree(bip->bip_buf);
113 113
114 mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]); 114 bvec_free_bs(bs, bip->bip_vec, bip->bip_pool);
115 mempool_free(bip, bs->bio_integrity_pool); 115 mempool_free(bip, bs->bio_integrity_pool);
116 116
117 bio->bi_integrity = NULL; 117 bio->bi_integrity = NULL;
diff --git a/fs/bio.c b/fs/bio.c
index df99c882b807..062299acbccd 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -31,7 +31,11 @@
31 31
32DEFINE_TRACE(block_split); 32DEFINE_TRACE(block_split);
33 33
34static struct kmem_cache *bio_slab __read_mostly; 34/*
35 * Test patch to inline a certain number of bi_io_vec's inside the bio
36 * itself, to shrink a bio data allocation from two mempool calls to one
37 */
38#define BIO_INLINE_VECS 4
35 39
36static mempool_t *bio_split_pool __read_mostly; 40static mempool_t *bio_split_pool __read_mostly;
37 41
@@ -40,9 +44,8 @@ static mempool_t *bio_split_pool __read_mostly;
40 * break badly! cannot be bigger than what you can fit into an 44 * break badly! cannot be bigger than what you can fit into an
41 * unsigned short 45 * unsigned short
42 */ 46 */
43
44#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) } 47#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
45static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { 48struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
46 BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES), 49 BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
47}; 50};
48#undef BV 51#undef BV
@@ -53,12 +56,121 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
53 */ 56 */
54struct bio_set *fs_bio_set; 57struct bio_set *fs_bio_set;
55 58
59/*
60 * Our slab pool management
61 */
62struct bio_slab {
63 struct kmem_cache *slab;
64 unsigned int slab_ref;
65 unsigned int slab_size;
66 char name[8];
67};
68static DEFINE_MUTEX(bio_slab_lock);
69static struct bio_slab *bio_slabs;
70static unsigned int bio_slab_nr, bio_slab_max;
71
72static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
73{
74 unsigned int sz = sizeof(struct bio) + extra_size;
75 struct kmem_cache *slab = NULL;
76 struct bio_slab *bslab;
77 unsigned int i, entry = -1;
78
79 mutex_lock(&bio_slab_lock);
80
81 i = 0;
82 while (i < bio_slab_nr) {
83 struct bio_slab *bslab = &bio_slabs[i];
84
85 if (!bslab->slab && entry == -1)
86 entry = i;
87 else if (bslab->slab_size == sz) {
88 slab = bslab->slab;
89 bslab->slab_ref++;
90 break;
91 }
92 i++;
93 }
94
95 if (slab)
96 goto out_unlock;
97
98 if (bio_slab_nr == bio_slab_max && entry == -1) {
99 bio_slab_max <<= 1;
100 bio_slabs = krealloc(bio_slabs,
101 bio_slab_max * sizeof(struct bio_slab),
102 GFP_KERNEL);
103 if (!bio_slabs)
104 goto out_unlock;
105 }
106 if (entry == -1)
107 entry = bio_slab_nr++;
108
109 bslab = &bio_slabs[entry];
110
111 snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
112 slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL);
113 if (!slab)
114 goto out_unlock;
115
116 printk("bio: create slab <%s> at %d\n", bslab->name, entry);
117 bslab->slab = slab;
118 bslab->slab_ref = 1;
119 bslab->slab_size = sz;
120out_unlock:
121 mutex_unlock(&bio_slab_lock);
122 return slab;
123}
124
125static void bio_put_slab(struct bio_set *bs)
126{
127 struct bio_slab *bslab = NULL;
128 unsigned int i;
129
130 mutex_lock(&bio_slab_lock);
131
132 for (i = 0; i < bio_slab_nr; i++) {
133 if (bs->bio_slab == bio_slabs[i].slab) {
134 bslab = &bio_slabs[i];
135 break;
136 }
137 }
138
139 if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
140 goto out;
141
142 WARN_ON(!bslab->slab_ref);
143
144 if (--bslab->slab_ref)
145 goto out;
146
147 kmem_cache_destroy(bslab->slab);
148 bslab->slab = NULL;
149
150out:
151 mutex_unlock(&bio_slab_lock);
152}
153
56unsigned int bvec_nr_vecs(unsigned short idx) 154unsigned int bvec_nr_vecs(unsigned short idx)
57{ 155{
58 return bvec_slabs[idx].nr_vecs; 156 return bvec_slabs[idx].nr_vecs;
59} 157}
60 158
61struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) 159void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx)
160{
161 BIO_BUG_ON(idx >= BIOVEC_NR_POOLS);
162
163 if (idx == BIOVEC_MAX_IDX)
164 mempool_free(bv, bs->bvec_pool);
165 else {
166 struct biovec_slab *bvs = bvec_slabs + idx;
167
168 kmem_cache_free(bvs->slab, bv);
169 }
170}
171
172struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx,
173 struct bio_set *bs)
62{ 174{
63 struct bio_vec *bvl; 175 struct bio_vec *bvl;
64 176
@@ -67,60 +179,85 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct
67 * If not, this is a bio_kmalloc() allocation and just do a 179 * If not, this is a bio_kmalloc() allocation and just do a
68 * kzalloc() for the exact number of vecs right away. 180 * kzalloc() for the exact number of vecs right away.
69 */ 181 */
70 if (bs) { 182 if (!bs)
183 bvl = kmalloc(nr * sizeof(struct bio_vec), gfp_mask);
184
185 /*
186 * see comment near bvec_array define!
187 */
188 switch (nr) {
189 case 1:
190 *idx = 0;
191 break;
192 case 2 ... 4:
193 *idx = 1;
194 break;
195 case 5 ... 16:
196 *idx = 2;
197 break;
198 case 17 ... 64:
199 *idx = 3;
200 break;
201 case 65 ... 128:
202 *idx = 4;
203 break;
204 case 129 ... BIO_MAX_PAGES:
205 *idx = 5;
206 break;
207 default:
208 return NULL;
209 }
210
211 /*
212 * idx now points to the pool we want to allocate from. only the
213 * 1-vec entry pool is mempool backed.
214 */
215 if (*idx == BIOVEC_MAX_IDX) {
216fallback:
217 bvl = mempool_alloc(bs->bvec_pool, gfp_mask);
218 } else {
219 struct biovec_slab *bvs = bvec_slabs + *idx;
220 gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
221
71 /* 222 /*
72 * see comment near bvec_array define! 223 * Make this allocation restricted and don't dump info on
224 * allocation failures, since we'll fallback to the mempool
225 * in case of failure.
73 */ 226 */
74 switch (nr) { 227 __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
75 case 1:
76 *idx = 0;
77 break;
78 case 2 ... 4:
79 *idx = 1;
80 break;
81 case 5 ... 16:
82 *idx = 2;
83 break;
84 case 17 ... 64:
85 *idx = 3;
86 break;
87 case 65 ... 128:
88 *idx = 4;
89 break;
90 case 129 ... BIO_MAX_PAGES:
91 *idx = 5;
92 break;
93 default:
94 return NULL;
95 }
96 228
97 /* 229 /*
98 * idx now points to the pool we want to allocate from 230 * Try a slab allocation. If this fails and __GFP_WAIT
231 * is set, retry with the 1-entry mempool
99 */ 232 */
100 bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask); 233 bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
101 if (bvl) 234 if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) {
102 memset(bvl, 0, 235 *idx = BIOVEC_MAX_IDX;
103 bvec_nr_vecs(*idx) * sizeof(struct bio_vec)); 236 goto fallback;
104 } else 237 }
105 bvl = kzalloc(nr * sizeof(struct bio_vec), gfp_mask); 238 }
106 239
107 return bvl; 240 return bvl;
108} 241}
109 242
110void bio_free(struct bio *bio, struct bio_set *bio_set) 243void bio_free(struct bio *bio, struct bio_set *bs)
111{ 244{
112 if (bio->bi_io_vec) { 245 void *p;
113 const int pool_idx = BIO_POOL_IDX(bio);
114
115 BIO_BUG_ON(pool_idx >= BIOVEC_NR_POOLS);
116 246
117 mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]); 247 if (bio_has_allocated_vec(bio))
118 } 248 bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
119 249
120 if (bio_integrity(bio)) 250 if (bio_integrity(bio))
121 bio_integrity_free(bio, bio_set); 251 bio_integrity_free(bio, bs);
252
253 /*
254 * If we have front padding, adjust the bio pointer before freeing
255 */
256 p = bio;
257 if (bs->front_pad)
258 p -= bs->front_pad;
122 259
123 mempool_free(bio, bio_set->bio_pool); 260 mempool_free(p, bs->bio_pool);
124} 261}
125 262
126/* 263/*
@@ -133,7 +270,8 @@ static void bio_fs_destructor(struct bio *bio)
133 270
134static void bio_kmalloc_destructor(struct bio *bio) 271static void bio_kmalloc_destructor(struct bio *bio)
135{ 272{
136 kfree(bio->bi_io_vec); 273 if (bio_has_allocated_vec(bio))
274 kfree(bio->bi_io_vec);
137 kfree(bio); 275 kfree(bio);
138} 276}
139 277
@@ -157,16 +295,20 @@ void bio_init(struct bio *bio)
157 * for a &struct bio to become free. If a %NULL @bs is passed in, we will 295 * for a &struct bio to become free. If a %NULL @bs is passed in, we will
158 * fall back to just using @kmalloc to allocate the required memory. 296 * fall back to just using @kmalloc to allocate the required memory.
159 * 297 *
 160 * allocate bio and iovecs from the memory pools specified by the 298 * Note that the caller must set ->bi_destructor on successful return
161 * bio_set structure, or @kmalloc if none given. 299 * of a bio, to do the appropriate freeing of the bio once the reference
300 * count drops to zero.
162 **/ 301 **/
163struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) 302struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
164{ 303{
165 struct bio *bio; 304 struct bio *bio = NULL;
305
306 if (bs) {
307 void *p = mempool_alloc(bs->bio_pool, gfp_mask);
166 308
167 if (bs) 309 if (p)
168 bio = mempool_alloc(bs->bio_pool, gfp_mask); 310 bio = p + bs->front_pad;
169 else 311 } else
170 bio = kmalloc(sizeof(*bio), gfp_mask); 312 bio = kmalloc(sizeof(*bio), gfp_mask);
171 313
172 if (likely(bio)) { 314 if (likely(bio)) {
@@ -176,7 +318,15 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
176 if (likely(nr_iovecs)) { 318 if (likely(nr_iovecs)) {
177 unsigned long uninitialized_var(idx); 319 unsigned long uninitialized_var(idx);
178 320
179 bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); 321 if (nr_iovecs <= BIO_INLINE_VECS) {
322 idx = 0;
323 bvl = bio->bi_inline_vecs;
324 nr_iovecs = BIO_INLINE_VECS;
325 } else {
326 bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx,
327 bs);
328 nr_iovecs = bvec_nr_vecs(idx);
329 }
180 if (unlikely(!bvl)) { 330 if (unlikely(!bvl)) {
181 if (bs) 331 if (bs)
182 mempool_free(bio, bs->bio_pool); 332 mempool_free(bio, bs->bio_pool);
@@ -186,7 +336,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
186 goto out; 336 goto out;
187 } 337 }
188 bio->bi_flags |= idx << BIO_POOL_OFFSET; 338 bio->bi_flags |= idx << BIO_POOL_OFFSET;
189 bio->bi_max_vecs = bvec_nr_vecs(idx); 339 bio->bi_max_vecs = nr_iovecs;
190 } 340 }
191 bio->bi_io_vec = bvl; 341 bio->bi_io_vec = bvl;
192 } 342 }
@@ -638,6 +788,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
638 int i, ret; 788 int i, ret;
639 int nr_pages = 0; 789 int nr_pages = 0;
640 unsigned int len = 0; 790 unsigned int len = 0;
791 unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0;
641 792
642 for (i = 0; i < iov_count; i++) { 793 for (i = 0; i < iov_count; i++) {
643 unsigned long uaddr; 794 unsigned long uaddr;
@@ -664,35 +815,42 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
664 bio->bi_rw |= (!write_to_vm << BIO_RW); 815 bio->bi_rw |= (!write_to_vm << BIO_RW);
665 816
666 ret = 0; 817 ret = 0;
667 i = 0; 818
819 if (map_data) {
820 nr_pages = 1 << map_data->page_order;
821 i = map_data->offset / PAGE_SIZE;
822 }
668 while (len) { 823 while (len) {
669 unsigned int bytes; 824 unsigned int bytes = PAGE_SIZE;
670 825
671 if (map_data) 826 bytes -= offset;
672 bytes = 1U << (PAGE_SHIFT + map_data->page_order);
673 else
674 bytes = PAGE_SIZE;
675 827
676 if (bytes > len) 828 if (bytes > len)
677 bytes = len; 829 bytes = len;
678 830
679 if (map_data) { 831 if (map_data) {
680 if (i == map_data->nr_entries) { 832 if (i == map_data->nr_entries * nr_pages) {
681 ret = -ENOMEM; 833 ret = -ENOMEM;
682 break; 834 break;
683 } 835 }
684 page = map_data->pages[i++]; 836
685 } else 837 page = map_data->pages[i / nr_pages];
838 page += (i % nr_pages);
839
840 i++;
841 } else {
686 page = alloc_page(q->bounce_gfp | gfp_mask); 842 page = alloc_page(q->bounce_gfp | gfp_mask);
687 if (!page) { 843 if (!page) {
688 ret = -ENOMEM; 844 ret = -ENOMEM;
689 break; 845 break;
846 }
690 } 847 }
691 848
692 if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) 849 if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes)
693 break; 850 break;
694 851
695 len -= bytes; 852 len -= bytes;
853 offset = 0;
696 } 854 }
697 855
698 if (ret) 856 if (ret)
@@ -701,7 +859,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
701 /* 859 /*
702 * success 860 * success
703 */ 861 */
704 if (!write_to_vm) { 862 if (!write_to_vm && (!map_data || !map_data->null_mapped)) {
705 ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0); 863 ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0);
706 if (ret) 864 if (ret)
707 goto cleanup; 865 goto cleanup;
@@ -1346,30 +1504,18 @@ EXPORT_SYMBOL(bio_sector_offset);
1346 */ 1504 */
1347static int biovec_create_pools(struct bio_set *bs, int pool_entries) 1505static int biovec_create_pools(struct bio_set *bs, int pool_entries)
1348{ 1506{
1349 int i; 1507 struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;
1350 1508
1351 for (i = 0; i < BIOVEC_NR_POOLS; i++) { 1509 bs->bvec_pool = mempool_create_slab_pool(pool_entries, bp->slab);
1352 struct biovec_slab *bp = bvec_slabs + i; 1510 if (!bs->bvec_pool)
1353 mempool_t **bvp = bs->bvec_pools + i; 1511 return -ENOMEM;
1354 1512
1355 *bvp = mempool_create_slab_pool(pool_entries, bp->slab);
1356 if (!*bvp)
1357 return -ENOMEM;
1358 }
1359 return 0; 1513 return 0;
1360} 1514}
1361 1515
1362static void biovec_free_pools(struct bio_set *bs) 1516static void biovec_free_pools(struct bio_set *bs)
1363{ 1517{
1364 int i; 1518 mempool_destroy(bs->bvec_pool);
1365
1366 for (i = 0; i < BIOVEC_NR_POOLS; i++) {
1367 mempool_t *bvp = bs->bvec_pools[i];
1368
1369 if (bvp)
1370 mempool_destroy(bvp);
1371 }
1372
1373} 1519}
1374 1520
1375void bioset_free(struct bio_set *bs) 1521void bioset_free(struct bio_set *bs)
@@ -1379,25 +1525,49 @@ void bioset_free(struct bio_set *bs)
1379 1525
1380 bioset_integrity_free(bs); 1526 bioset_integrity_free(bs);
1381 biovec_free_pools(bs); 1527 biovec_free_pools(bs);
1528 bio_put_slab(bs);
1382 1529
1383 kfree(bs); 1530 kfree(bs);
1384} 1531}
1385 1532
1386struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size) 1533/**
1534 * bioset_create - Create a bio_set
1535 * @pool_size: Number of bio and bio_vecs to cache in the mempool
1536 * @front_pad: Number of bytes to allocate in front of the returned bio
1537 *
1538 * Description:
1539 * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
1540 * to ask for a number of bytes to be allocated in front of the bio.
1541 * Front pad allocation is useful for embedding the bio inside
1542 * another structure, to avoid allocating extra data to go with the bio.
1543 * Note that the bio must be embedded at the END of that structure always,
1544 * or things will break badly.
1545 */
1546struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
1387{ 1547{
1388 struct bio_set *bs = kzalloc(sizeof(*bs), GFP_KERNEL); 1548 unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
1549 struct bio_set *bs;
1389 1550
1551 bs = kzalloc(sizeof(*bs), GFP_KERNEL);
1390 if (!bs) 1552 if (!bs)
1391 return NULL; 1553 return NULL;
1392 1554
1393 bs->bio_pool = mempool_create_slab_pool(bio_pool_size, bio_slab); 1555 bs->front_pad = front_pad;
1556
1557 bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
1558 if (!bs->bio_slab) {
1559 kfree(bs);
1560 return NULL;
1561 }
1562
1563 bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab);
1394 if (!bs->bio_pool) 1564 if (!bs->bio_pool)
1395 goto bad; 1565 goto bad;
1396 1566
1397 if (bioset_integrity_create(bs, bio_pool_size)) 1567 if (bioset_integrity_create(bs, pool_size))
1398 goto bad; 1568 goto bad;
1399 1569
1400 if (!biovec_create_pools(bs, bvec_pool_size)) 1570 if (!biovec_create_pools(bs, pool_size))
1401 return bs; 1571 return bs;
1402 1572
1403bad: 1573bad:
@@ -1421,12 +1591,16 @@ static void __init biovec_init_slabs(void)
1421 1591
1422static int __init init_bio(void) 1592static int __init init_bio(void)
1423{ 1593{
1424 bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC); 1594 bio_slab_max = 2;
1595 bio_slab_nr = 0;
1596 bio_slabs = kzalloc(bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL);
1597 if (!bio_slabs)
1598 panic("bio: can't allocate bios\n");
1425 1599
1426 bio_integrity_init_slab(); 1600 bio_integrity_init_slab();
1427 biovec_init_slabs(); 1601 biovec_init_slabs();
1428 1602
1429 fs_bio_set = bioset_create(BIO_POOL_SIZE, 2); 1603 fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
1430 if (!fs_bio_set) 1604 if (!fs_bio_set)
1431 panic("bio: can't allocate bios\n"); 1605 panic("bio: can't allocate bios\n");
1432 1606
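
Taken together, the fs/bio.c changes drop the single global bio slab in favour of per-bio_set slabs sized for caller-specified front padding, inline up to BIO_INLINE_VECS vectors in the bio itself, and mempool-back only the largest biovec slab. The front_pad argument is what lets a driver embed a bio at the end of its own structure and recover the container with container_of(). A hedged sketch of that usage pattern (my_request, my_bio_set and my_init are illustrative names, not kernel API; per the updated bio_alloc_bioset() comment, a real caller must also set ->bi_destructor):

	#include <linux/bio.h>
	#include <linux/init.h>

	struct my_request {
		unsigned long start_time;	/* caller-private state ... */
		struct bio bio;			/* ... the bio must come last */
	};

	static struct bio_set *my_bio_set;

	static int __init my_init(void)
	{
		/* ask for our private bytes to be allocated in front of the bio */
		my_bio_set = bioset_create(BIO_POOL_SIZE,
					   offsetof(struct my_request, bio));
		return my_bio_set ? 0 : -ENOMEM;
	}

	static struct my_request *my_alloc(gfp_t gfp)
	{
		struct bio *bio = bio_alloc_bioset(gfp, 1, my_bio_set);

		return bio ? container_of(bio, struct my_request, bio) : NULL;
	}
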
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 99e0ae1a4c78..b3c1efff5e1d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -285,6 +285,8 @@ static void init_once(void *foo)
285 INIT_LIST_HEAD(&bdev->bd_holder_list); 285 INIT_LIST_HEAD(&bdev->bd_holder_list);
286#endif 286#endif
287 inode_init_once(&ei->vfs_inode); 287 inode_init_once(&ei->vfs_inode);
288 /* Initialize mutex for freeze. */
289 mutex_init(&bdev->bd_fsfreeze_mutex);
288} 290}
289 291
290static inline void __bd_forget(struct inode *inode) 292static inline void __bd_forget(struct inode *inode)
@@ -326,12 +328,13 @@ static struct file_system_type bd_type = {
326 .kill_sb = kill_anon_super, 328 .kill_sb = kill_anon_super,
327}; 329};
328 330
329static struct vfsmount *bd_mnt __read_mostly; 331struct super_block *blockdev_superblock __read_mostly;
330struct super_block *blockdev_superblock;
331 332
332void __init bdev_cache_init(void) 333void __init bdev_cache_init(void)
333{ 334{
334 int err; 335 int err;
336 struct vfsmount *bd_mnt;
337
335 bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 338 bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
336 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 339 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
337 SLAB_MEM_SPREAD|SLAB_PANIC), 340 SLAB_MEM_SPREAD|SLAB_PANIC),
@@ -373,7 +376,7 @@ struct block_device *bdget(dev_t dev)
373 struct block_device *bdev; 376 struct block_device *bdev;
374 struct inode *inode; 377 struct inode *inode;
375 378
376 inode = iget5_locked(bd_mnt->mnt_sb, hash(dev), 379 inode = iget5_locked(blockdev_superblock, hash(dev),
377 bdev_test, bdev_set, &dev); 380 bdev_test, bdev_set, &dev);
378 381
379 if (!inode) 382 if (!inode)
@@ -463,7 +466,7 @@ void bd_forget(struct inode *inode)
463 466
464 spin_lock(&bdev_lock); 467 spin_lock(&bdev_lock);
465 if (inode->i_bdev) { 468 if (inode->i_bdev) {
466 if (inode->i_sb != blockdev_superblock) 469 if (!sb_is_blkdev_sb(inode->i_sb))
467 bdev = inode->i_bdev; 470 bdev = inode->i_bdev;
468 __bd_forget(inode); 471 __bd_forget(inode);
469 } 472 }
@@ -1004,6 +1007,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1004 } 1007 }
1005 1008
1006 lock_kernel(); 1009 lock_kernel();
1010 restart:
1007 1011
1008 ret = -ENXIO; 1012 ret = -ENXIO;
1009 disk = get_gendisk(bdev->bd_dev, &partno); 1013 disk = get_gendisk(bdev->bd_dev, &partno);
@@ -1024,6 +1028,19 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1024 1028
1025 if (disk->fops->open) { 1029 if (disk->fops->open) {
1026 ret = disk->fops->open(bdev, mode); 1030 ret = disk->fops->open(bdev, mode);
1031 if (ret == -ERESTARTSYS) {
1032 /* Lost a race with 'disk' being
1033 * deleted, try again.
1034 * See md.c
1035 */
1036 disk_put_part(bdev->bd_part);
1037 bdev->bd_part = NULL;
1038 module_put(disk->fops->owner);
1039 put_disk(disk);
1040 bdev->bd_disk = NULL;
1041 mutex_unlock(&bdev->bd_mutex);
1042 goto restart;
1043 }
1027 if (ret) 1044 if (ret)
1028 goto out_clear; 1045 goto out_clear;
1029 } 1046 }
@@ -1219,6 +1236,20 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
1219 return blkdev_ioctl(bdev, mode, cmd, arg); 1236 return blkdev_ioctl(bdev, mode, cmd, arg);
1220} 1237}
1221 1238
1239/*
1240 * Try to release a page associated with block device when the system
1241 * is under memory pressure.
1242 */
1243static int blkdev_releasepage(struct page *page, gfp_t wait)
1244{
1245 struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;
1246
1247 if (super && super->s_op->bdev_try_to_free_page)
1248 return super->s_op->bdev_try_to_free_page(super, page, wait);
1249
1250 return try_to_free_buffers(page);
1251}
1252
1222static const struct address_space_operations def_blk_aops = { 1253static const struct address_space_operations def_blk_aops = {
1223 .readpage = blkdev_readpage, 1254 .readpage = blkdev_readpage,
1224 .writepage = blkdev_writepage, 1255 .writepage = blkdev_writepage,
@@ -1226,6 +1257,7 @@ static const struct address_space_operations def_blk_aops = {
1226 .write_begin = blkdev_write_begin, 1257 .write_begin = blkdev_write_begin,
1227 .write_end = blkdev_write_end, 1258 .write_end = blkdev_write_end,
1228 .writepages = generic_writepages, 1259 .writepages = generic_writepages,
1260 .releasepage = blkdev_releasepage,
1229 .direct_IO = blkdev_direct_IO, 1261 .direct_IO = blkdev_direct_IO,
1230}; 1262};
1231 1263
@@ -1261,7 +1293,7 @@ EXPORT_SYMBOL(ioctl_by_bdev);
1261 1293
1262/** 1294/**
1263 * lookup_bdev - lookup a struct block_device by name 1295 * lookup_bdev - lookup a struct block_device by name
1264 * @path: special file representing the block device 1296 * @pathname: special file representing the block device
1265 * 1297 *
1266 * Get a reference to the blockdevice at @pathname in the current 1298 * Get a reference to the blockdevice at @pathname in the current
1267 * namespace if possible and return it. Return ERR_PTR(error) 1299 * namespace if possible and return it. Return ERR_PTR(error)
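
blkdev_releasepage() gives the filesystem mounted on a block device a veto over freeing that device's page-cache pages under memory pressure, via the new s_op->bdev_try_to_free_page hook. A hedged sketch of a filesystem wiring it up (myfs_* names are illustrative; a journaling filesystem would first release its own pins on the page's buffers):

	#include <linux/fs.h>
	#include <linux/buffer_head.h>

	static int myfs_bdev_try_to_free_page(struct super_block *sb,
					      struct page *page, gfp_t wait)
	{
		/* drop fs-private references to the page's buffers here,
		 * then fall back to generic buffer-head freeing */
		return try_to_free_buffers(page);
	}

	static const struct super_operations myfs_sops = {
		/* ... other operations ... */
		.bdev_try_to_free_page	= myfs_bdev_try_to_free_page,
	};
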
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
new file mode 100644
index 000000000000..d2cf5a54a4b8
--- /dev/null
+++ b/fs/btrfs/Makefile
@@ -0,0 +1,25 @@
1ifneq ($(KERNELRELEASE),)
2# kbuild part of makefile
3
4obj-$(CONFIG_BTRFS_FS) := btrfs.o
5btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
6 file-item.o inode-item.o inode-map.o disk-io.o \
7 transaction.o inode.o file.o tree-defrag.o \
8 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
9 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
10 ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
11 compression.o
12else
13
14# Normal Makefile
15
16KERNELDIR := /lib/modules/`uname -r`/build
17all:
18 $(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules
19
20modules_install:
21 $(MAKE) -C $(KERNELDIR) M=`pwd` modules_install
22clean:
23 $(MAKE) -C $(KERNELDIR) M=`pwd` clean
24
25endif
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
new file mode 100644
index 000000000000..1d53b62dbba5
--- /dev/null
+++ b/fs/btrfs/acl.c
@@ -0,0 +1,351 @@
1/*
2 * Copyright (C) 2007 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/string.h>
21#include <linux/xattr.h>
22#include <linux/posix_acl_xattr.h>
23#include <linux/posix_acl.h>
24#include <linux/sched.h>
25
26#include "ctree.h"
27#include "btrfs_inode.h"
28#include "xattr.h"
29
30#ifdef CONFIG_FS_POSIX_ACL
31
32static void btrfs_update_cached_acl(struct inode *inode,
33 struct posix_acl **p_acl,
34 struct posix_acl *acl)
35{
36 spin_lock(&inode->i_lock);
37 if (*p_acl && *p_acl != BTRFS_ACL_NOT_CACHED)
38 posix_acl_release(*p_acl);
39 *p_acl = posix_acl_dup(acl);
40 spin_unlock(&inode->i_lock);
41}
42
43static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
44{
45 int size;
46 const char *name;
47 char *value = NULL;
48 struct posix_acl *acl = NULL, **p_acl;
49
50 switch (type) {
51 case ACL_TYPE_ACCESS:
52 name = POSIX_ACL_XATTR_ACCESS;
53 p_acl = &BTRFS_I(inode)->i_acl;
54 break;
55 case ACL_TYPE_DEFAULT:
56 name = POSIX_ACL_XATTR_DEFAULT;
57 p_acl = &BTRFS_I(inode)->i_default_acl;
58 break;
59 default:
60 return ERR_PTR(-EINVAL);
61 }
62
63 spin_lock(&inode->i_lock);
64 if (*p_acl != BTRFS_ACL_NOT_CACHED)
65 acl = posix_acl_dup(*p_acl);
66 spin_unlock(&inode->i_lock);
67
68 if (acl)
69 return acl;
70
71
72 size = __btrfs_getxattr(inode, name, "", 0);
73 if (size > 0) {
74 value = kzalloc(size, GFP_NOFS);
75 if (!value)
76 return ERR_PTR(-ENOMEM);
77 size = __btrfs_getxattr(inode, name, value, size);
78 if (size > 0) {
79 acl = posix_acl_from_xattr(value, size);
80 btrfs_update_cached_acl(inode, p_acl, acl);
81 }
82 kfree(value);
83 } else if (size == -ENOENT) {
84 acl = NULL;
85 btrfs_update_cached_acl(inode, p_acl, acl);
86 }
87
88 return acl;
89}
90
91static int btrfs_xattr_get_acl(struct inode *inode, int type,
92 void *value, size_t size)
93{
94 struct posix_acl *acl;
95 int ret = 0;
96
97 acl = btrfs_get_acl(inode, type);
98
99 if (IS_ERR(acl))
100 return PTR_ERR(acl);
101 if (acl == NULL)
102 return -ENODATA;
103 ret = posix_acl_to_xattr(acl, value, size);
104 posix_acl_release(acl);
105
106 return ret;
107}
108
109/*
110 * Needs to be called with fs_mutex held
111 */
112static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
113{
114 int ret, size = 0;
115 const char *name;
116 struct posix_acl **p_acl;
117 char *value = NULL;
118 mode_t mode;
119
120 if (acl) {
121 ret = posix_acl_valid(acl);
122 if (ret < 0)
123 return ret;
124 ret = 0;
125 }
126
127 switch (type) {
128 case ACL_TYPE_ACCESS:
129 mode = inode->i_mode;
130 ret = posix_acl_equiv_mode(acl, &mode);
131 if (ret < 0)
132 return ret;
133 ret = 0;
134 inode->i_mode = mode;
135 name = POSIX_ACL_XATTR_ACCESS;
136 p_acl = &BTRFS_I(inode)->i_acl;
137 break;
138 case ACL_TYPE_DEFAULT:
139 if (!S_ISDIR(inode->i_mode))
140 return acl ? -EINVAL : 0;
141 name = POSIX_ACL_XATTR_DEFAULT;
142 p_acl = &BTRFS_I(inode)->i_default_acl;
143 break;
144 default:
145 return -EINVAL;
146 }
147
148 if (acl) {
149 size = posix_acl_xattr_size(acl->a_count);
150 value = kmalloc(size, GFP_NOFS);
151 if (!value) {
152 ret = -ENOMEM;
153 goto out;
154 }
155
156 ret = posix_acl_to_xattr(acl, value, size);
157 if (ret < 0)
158 goto out;
159 }
160
161 ret = __btrfs_setxattr(inode, name, value, size, 0);
162
163out:
164 kfree(value);
165
166 if (!ret)
167 btrfs_update_cached_acl(inode, p_acl, acl);
168
169 return ret;
170}
171
172static int btrfs_xattr_set_acl(struct inode *inode, int type,
173 const void *value, size_t size)
174{
175 int ret = 0;
176 struct posix_acl *acl = NULL;
177
178 if (value) {
179 acl = posix_acl_from_xattr(value, size);
180 if (acl == NULL) {
181 value = NULL;
182 size = 0;
183 } else if (IS_ERR(acl)) {
184 return PTR_ERR(acl);
185 }
186 }
187
188 ret = btrfs_set_acl(inode, acl, type);
189
190 posix_acl_release(acl);
191
192 return ret;
193}
194
195
196static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name,
197 void *value, size_t size)
198{
199 return btrfs_xattr_get_acl(inode, ACL_TYPE_ACCESS, value, size);
200}
201
202static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name,
203 const void *value, size_t size, int flags)
204{
205 return btrfs_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
206}
207
208static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name,
209 void *value, size_t size)
210{
211 return btrfs_xattr_get_acl(inode, ACL_TYPE_DEFAULT, value, size);
212}
213
214static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name,
215 const void *value, size_t size, int flags)
216{
217 return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
218}
219
220int btrfs_check_acl(struct inode *inode, int mask)
221{
222 struct posix_acl *acl;
223 int error = -EAGAIN;
224
225 acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
226
227 if (IS_ERR(acl))
228 return PTR_ERR(acl);
229 if (acl) {
230 error = posix_acl_permission(inode, acl, mask);
231 posix_acl_release(acl);
232 }
233
234 return error;
235}
236
237/*
238 * btrfs_init_acl is already generally called under fs_mutex, so the locking
239 * stuff has been fixed to work with that. If the locking stuff changes, we
240 * need to re-evaluate the acl locking stuff.
241 */
242int btrfs_init_acl(struct inode *inode, struct inode *dir)
243{
244 struct posix_acl *acl = NULL;
245 int ret = 0;
246
247 /* this happens with subvols */
248 if (!dir)
249 return 0;
250
251 if (!S_ISLNK(inode->i_mode)) {
252 if (IS_POSIXACL(dir)) {
253 acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT);
254 if (IS_ERR(acl))
255 return PTR_ERR(acl);
256 }
257
258 if (!acl)
259 inode->i_mode &= ~current->fs->umask;
260 }
261
262 if (IS_POSIXACL(dir) && acl) {
263 struct posix_acl *clone;
264 mode_t mode;
265
266 if (S_ISDIR(inode->i_mode)) {
267 ret = btrfs_set_acl(inode, acl, ACL_TYPE_DEFAULT);
268 if (ret)
269 goto failed;
270 }
271 clone = posix_acl_clone(acl, GFP_NOFS);
272 ret = -ENOMEM;
273 if (!clone)
274 goto failed;
275
276 mode = inode->i_mode;
277 ret = posix_acl_create_masq(clone, &mode);
278 if (ret >= 0) {
279 inode->i_mode = mode;
280 if (ret > 0) {
281 /* we need an acl */
282 ret = btrfs_set_acl(inode, clone,
283 ACL_TYPE_ACCESS);
284 }
285 }
286 }
287failed:
288 posix_acl_release(acl);
289
290 return ret;
291}
292
293int btrfs_acl_chmod(struct inode *inode)
294{
295 struct posix_acl *acl, *clone;
296 int ret = 0;
297
298 if (S_ISLNK(inode->i_mode))
299 return -EOPNOTSUPP;
300
301 if (!IS_POSIXACL(inode))
302 return 0;
303
304 acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
305 if (IS_ERR(acl) || !acl)
306 return PTR_ERR(acl);
307
308 clone = posix_acl_clone(acl, GFP_KERNEL);
309 posix_acl_release(acl);
310 if (!clone)
311 return -ENOMEM;
312
313 ret = posix_acl_chmod_masq(clone, inode->i_mode);
314 if (!ret)
315 ret = btrfs_set_acl(inode, clone, ACL_TYPE_ACCESS);
316
317 posix_acl_release(clone);
318
319 return ret;
320}
321
322struct xattr_handler btrfs_xattr_acl_default_handler = {
323 .prefix = POSIX_ACL_XATTR_DEFAULT,
324 .get = btrfs_xattr_acl_default_get,
325 .set = btrfs_xattr_acl_default_set,
326};
327
328struct xattr_handler btrfs_xattr_acl_access_handler = {
329 .prefix = POSIX_ACL_XATTR_ACCESS,
330 .get = btrfs_xattr_acl_access_get,
331 .set = btrfs_xattr_acl_access_set,
332};
333
334#else /* CONFIG_FS_POSIX_ACL */
335
336int btrfs_acl_chmod(struct inode *inode)
337{
338 return 0;
339}
340
341int btrfs_init_acl(struct inode *inode, struct inode *dir)
342{
343 return 0;
344}
345
346int btrfs_check_acl(struct inode *inode, int mask)
347{
348 return 0;
349}
350
351#endif /* CONFIG_FS_POSIX_ACL */
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
new file mode 100644
index 000000000000..8e2fec05dbe0
--- /dev/null
+++ b/fs/btrfs/async-thread.c
@@ -0,0 +1,419 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/version.h>
20#include <linux/kthread.h>
21#include <linux/list.h>
22#include <linux/spinlock.h>
23#include <linux/freezer.h>
24#include "async-thread.h"
25
26#define WORK_QUEUED_BIT 0
27#define WORK_DONE_BIT 1
28#define WORK_ORDER_DONE_BIT 2
29
30/*
31 * container for the kthread task pointer and the list of pending work.
32 * One of these is allocated per thread.
33 */
34struct btrfs_worker_thread {
35 /* pool we belong to */
36 struct btrfs_workers *workers;
37
38 /* list of struct btrfs_work that are waiting for service */
39 struct list_head pending;
40
41 /* list of worker threads from struct btrfs_workers */
42 struct list_head worker_list;
43
44 /* kthread */
45 struct task_struct *task;
46
47 /* number of things on the pending list */
48 atomic_t num_pending;
49
50 unsigned long sequence;
51
52 /* protects the pending list. */
53 spinlock_t lock;
54
55 /* set to non-zero when this thread is already awake and kicking */
56 int working;
57
58 /* are we currently idle */
59 int idle;
60};
61
62/*
63 * helper function to move a thread onto the idle list after it
64 * has finished some requests.
65 */
66static void check_idle_worker(struct btrfs_worker_thread *worker)
67{
68 if (!worker->idle && atomic_read(&worker->num_pending) <
69 worker->workers->idle_thresh / 2) {
70 unsigned long flags;
71 spin_lock_irqsave(&worker->workers->lock, flags);
72 worker->idle = 1;
73 list_move(&worker->worker_list, &worker->workers->idle_list);
74 spin_unlock_irqrestore(&worker->workers->lock, flags);
75 }
76}
77
78/*
79 * helper function to move a thread off the idle list after new
80 * pending work is added.
81 */
82static void check_busy_worker(struct btrfs_worker_thread *worker)
83{
84 if (worker->idle && atomic_read(&worker->num_pending) >=
85 worker->workers->idle_thresh) {
86 unsigned long flags;
87 spin_lock_irqsave(&worker->workers->lock, flags);
88 worker->idle = 0;
89 list_move_tail(&worker->worker_list,
90 &worker->workers->worker_list);
91 spin_unlock_irqrestore(&worker->workers->lock, flags);
92 }
93}
94
95static noinline int run_ordered_completions(struct btrfs_workers *workers,
96 struct btrfs_work *work)
97{
98 unsigned long flags;
99
100 if (!workers->ordered)
101 return 0;
102
103 set_bit(WORK_DONE_BIT, &work->flags);
104
105 spin_lock_irqsave(&workers->lock, flags);
106
107 while (!list_empty(&workers->order_list)) {
108 work = list_entry(workers->order_list.next,
109 struct btrfs_work, order_list);
110
111 if (!test_bit(WORK_DONE_BIT, &work->flags))
112 break;
113
114 /* we are going to call the ordered done function, but
115 * we leave the work item on the list as a barrier so
116 * that later work items that are done don't have their
117 * functions called before this one returns
118 */
119 if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
120 break;
121
122 spin_unlock_irqrestore(&workers->lock, flags);
123
124 work->ordered_func(work);
125
126 /* now take the lock again and call the freeing code */
127 spin_lock_irqsave(&workers->lock, flags);
128 list_del(&work->order_list);
129 work->ordered_free(work);
130 }
131
132 spin_unlock_irqrestore(&workers->lock, flags);
133 return 0;
134}
135
136/*
137 * main loop for servicing work items
138 */
139static int worker_loop(void *arg)
140{
141 struct btrfs_worker_thread *worker = arg;
142 struct list_head *cur;
143 struct btrfs_work *work;
144 do {
145 spin_lock_irq(&worker->lock);
146 while (!list_empty(&worker->pending)) {
147 cur = worker->pending.next;
148 work = list_entry(cur, struct btrfs_work, list);
149 list_del(&work->list);
150 clear_bit(WORK_QUEUED_BIT, &work->flags);
151
152 work->worker = worker;
153 spin_unlock_irq(&worker->lock);
154
155 work->func(work);
156
157 atomic_dec(&worker->num_pending);
158 /*
159 * unless this is an ordered work queue,
160 * 'work' was probably freed by func above.
161 */
162 run_ordered_completions(worker->workers, work);
163
164 spin_lock_irq(&worker->lock);
165 check_idle_worker(worker);
166
167 }
168 worker->working = 0;
169 if (freezing(current)) {
170 refrigerator();
171 } else {
172 set_current_state(TASK_INTERRUPTIBLE);
173 spin_unlock_irq(&worker->lock);
174 if (!kthread_should_stop())
175 schedule();
176 __set_current_state(TASK_RUNNING);
177 }
178 } while (!kthread_should_stop());
179 return 0;
180}
181
182/*
183 * this will wait for all the worker threads to shutdown
184 */
185int btrfs_stop_workers(struct btrfs_workers *workers)
186{
187 struct list_head *cur;
188 struct btrfs_worker_thread *worker;
189
190 list_splice_init(&workers->idle_list, &workers->worker_list);
191 while (!list_empty(&workers->worker_list)) {
192 cur = workers->worker_list.next;
193 worker = list_entry(cur, struct btrfs_worker_thread,
194 worker_list);
195 kthread_stop(worker->task);
196 list_del(&worker->worker_list);
197 kfree(worker);
198 }
199 return 0;
200}
201
202/*
203 * simple init on struct btrfs_workers
204 */
205void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
206{
207 workers->num_workers = 0;
208 INIT_LIST_HEAD(&workers->worker_list);
209 INIT_LIST_HEAD(&workers->idle_list);
210 INIT_LIST_HEAD(&workers->order_list);
211 spin_lock_init(&workers->lock);
212 workers->max_workers = max;
213 workers->idle_thresh = 32;
214 workers->name = name;
215 workers->ordered = 0;
216}
217
218/*
219 * starts new worker threads. This does not enforce the max worker
220 * count in case you need to temporarily go past it.
221 */
222int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
223{
224 struct btrfs_worker_thread *worker;
225 int ret = 0;
226 int i;
227
228 for (i = 0; i < num_workers; i++) {
229 worker = kzalloc(sizeof(*worker), GFP_NOFS);
230 if (!worker) {
231 ret = -ENOMEM;
232 goto fail;
233 }
234
235 INIT_LIST_HEAD(&worker->pending);
236 INIT_LIST_HEAD(&worker->worker_list);
237 spin_lock_init(&worker->lock);
238 atomic_set(&worker->num_pending, 0);
239 worker->task = kthread_run(worker_loop, worker,
240 "btrfs-%s-%d", workers->name,
241 workers->num_workers + i);
242 worker->workers = workers;
243 if (IS_ERR(worker->task)) {
244 kfree(worker);
245 ret = PTR_ERR(worker->task);
246 goto fail;
247 }
248
249 spin_lock_irq(&workers->lock);
250 list_add_tail(&worker->worker_list, &workers->idle_list);
251 worker->idle = 1;
252 workers->num_workers++;
253 spin_unlock_irq(&workers->lock);
254 }
255 return 0;
256fail:
257 btrfs_stop_workers(workers);
258 return ret;
259}
260
261/*
262 * run through the list and find a worker thread that doesn't have a lot
263 * to do right now. This can return NULL if we aren't yet at the thread
264 * count limit and all of the threads are busy.
265 */
266static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
267{
268 struct btrfs_worker_thread *worker;
269 struct list_head *next;
270 int enforce_min = workers->num_workers < workers->max_workers;
271
272 /*
273 * if we find an idle thread, don't move it to the end of the
274 * idle list. This improves the chance that the next submission
275 * will reuse the same thread, and maybe catch it while it is still
276 * working
277 */
278 if (!list_empty(&workers->idle_list)) {
279 next = workers->idle_list.next;
280 worker = list_entry(next, struct btrfs_worker_thread,
281 worker_list);
282 return worker;
283 }
284 if (enforce_min || list_empty(&workers->worker_list))
285 return NULL;
286
287 /*
288 * if we pick a busy task, move the task to the end of the list.
289 * hopefully this will keep things somewhat evenly balanced.
290 * Do the move in batches based on the sequence number. This groups
291 * requests submitted at roughly the same time onto the same worker.
292 */
293 next = workers->worker_list.next;
294 worker = list_entry(next, struct btrfs_worker_thread, worker_list);
295 atomic_inc(&worker->num_pending);
296 worker->sequence++;
297
298 if (worker->sequence % workers->idle_thresh == 0)
299 list_move_tail(next, &workers->worker_list);
300 return worker;
301}
302
303/*
304 * selects a worker thread to take the next job. This will either find
305 * an idle worker, start a new worker up to the max count, or just return
306 * one of the existing busy workers.
307 */
308static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
309{
310 struct btrfs_worker_thread *worker;
311 unsigned long flags;
312
313again:
314 spin_lock_irqsave(&workers->lock, flags);
315 worker = next_worker(workers);
316 spin_unlock_irqrestore(&workers->lock, flags);
317
318 if (!worker) {
319 spin_lock_irqsave(&workers->lock, flags);
320 if (workers->num_workers >= workers->max_workers) {
321 struct list_head *fallback = NULL;
322 /*
323 * we have failed to find any free workers,
324 * so fall back to one of the existing ones
325 */
326 if (!list_empty(&workers->worker_list))
327 fallback = workers->worker_list.next;
328 if (!list_empty(&workers->idle_list))
329 fallback = workers->idle_list.next;
330 BUG_ON(!fallback);
331 worker = list_entry(fallback,
332 struct btrfs_worker_thread, worker_list);
333 spin_unlock_irqrestore(&workers->lock, flags);
334 } else {
335 spin_unlock_irqrestore(&workers->lock, flags);
336 /* we're below the limit, start another worker */
337 btrfs_start_workers(workers, 1);
338 goto again;
339 }
340 }
341 return worker;
342}
343
344/*
345 * btrfs_requeue_work just puts the work item back on the tail of the list
346 * it was taken from. It is intended for use with long-running work functions
347 * that make some progress and want to give the cpu up for others.
348 */
349int btrfs_requeue_work(struct btrfs_work *work)
350{
351 struct btrfs_worker_thread *worker = work->worker;
352 unsigned long flags;
353
354 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
355 goto out;
356
357 spin_lock_irqsave(&worker->lock, flags);
358 atomic_inc(&worker->num_pending);
359 list_add_tail(&work->list, &worker->pending);
360
361 /* by definition we're busy, take ourselves off the idle
362 * list
363 */
364 if (worker->idle) {
365 spin_lock_irqsave(&worker->workers->lock, flags);
366 worker->idle = 0;
367 list_move_tail(&worker->worker_list,
368 &worker->workers->worker_list);
369 spin_unlock_irqrestore(&worker->workers->lock, flags);
370 }
371
372 spin_unlock_irqrestore(&worker->lock, flags);
373
374out:
375 return 0;
376}
377
378/*
379 * places a struct btrfs_work into the pending queue of one of the kthreads
380 */
381int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
382{
383 struct btrfs_worker_thread *worker;
384 unsigned long flags;
385 int wake = 0;
386
387 /* don't requeue something already on a list */
388 if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
389 goto out;
390
391 worker = find_worker(workers);
392 if (workers->ordered) {
393 spin_lock_irqsave(&workers->lock, flags);
394 list_add_tail(&work->order_list, &workers->order_list);
395 spin_unlock_irqrestore(&workers->lock, flags);
396 } else {
397 INIT_LIST_HEAD(&work->order_list);
398 }
399
400 spin_lock_irqsave(&worker->lock, flags);
401 atomic_inc(&worker->num_pending);
402 check_busy_worker(worker);
403 list_add_tail(&work->list, &worker->pending);
404
405 /*
406 * avoid calling into wake_up_process if this thread has already
407 * been kicked
408 */
409 if (!worker->working)
410 wake = 1;
411 worker->working = 1;
412
413 spin_unlock_irqrestore(&worker->lock, flags);
414
415 if (wake)
416 wake_up_process(worker->task);
417out:
418 return 0;
419}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
new file mode 100644
index 000000000000..31be4ed8b63e
--- /dev/null
+++ b/fs/btrfs/async-thread.h
@@ -0,0 +1,101 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_ASYNC_THREAD_
20#define __BTRFS_ASYNC_THREAD_
21
22struct btrfs_worker_thread;
23
24/*
25 * This is similar to a workqueue, but it is meant to spread the operations
26 * across all available cpus instead of just the CPU that was used to
27 * queue the work. There is also some batching introduced to try and
28 * cut down on context switches.
29 *
30 * By default threads are added on demand up to 2 * the number of cpus.
31 * Changing struct btrfs_workers->max_workers is one way to prevent
32 * demand creation of kthreads.
33 *
34 * the basic model of these worker threads is to embed a btrfs_work
35 * structure in your own data struct, and use container_of in a
36 * work function to get back to your data struct.
37 */
38struct btrfs_work {
39 /*
40 * func should be set to the function you want called
41 * your work struct is passed as the only arg
42 *
43 * ordered_func must be set for work sent to an ordered work queue,
44 * and it is called to complete a given work item in the same
45 * order they were sent to the queue.
46 */
47 void (*func)(struct btrfs_work *work);
48 void (*ordered_func)(struct btrfs_work *work);
49 void (*ordered_free)(struct btrfs_work *work);
50
51 /*
52 * flags should be set to zero. It is used to make sure the
53 * struct is only inserted once into the list.
54 */
55 unsigned long flags;
56
57 /* don't touch these */
58 struct btrfs_worker_thread *worker;
59 struct list_head list;
60 struct list_head order_list;
61};
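/*
 * Illustrative sketch of the embedding model described above; this is
 * not part of the original patch, and "my_job" and "my_func" are
 * made-up names. The point is the embedded btrfs_work member and the
 * container_of() call that recovers the enclosing struct:
 *
 *	struct my_job {
 *		int payload;
 *		struct btrfs_work work;
 *	};
 *
 *	static void my_func(struct btrfs_work *work)
 *	{
 *		struct my_job *job = container_of(work, struct my_job, work);
 *		// service job->payload, then free the containing struct
 *		kfree(job);
 *	}
 */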
62
63struct btrfs_workers {
64 /* current number of running workers */
65 int num_workers;
66
67 /* max number of workers allowed; callers may change this directly */
68 int max_workers;
69
70 /* once a worker has this many requests or fewer, it is idle */
71 int idle_thresh;
72
73 /* force completions in the order they were queued */
74 int ordered;
75
76 /* list with all the work threads. The workers on the idle list
77 * may be actively servicing jobs, but they haven't yet hit the
78 * idle thresh limit above.
79 */
80 struct list_head worker_list;
81 struct list_head idle_list;
82
83 /*
84 * when operating in ordered mode, this maintains the list
85 * of work items waiting for completion
86 */
87 struct list_head order_list;
88
89 /* lock for finding the next worker thread to queue on */
90 spinlock_t lock;
91
92 /* extra name for this worker, used to build the kthread names */
93 char *name;
94};
95
96int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
97int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
98int btrfs_stop_workers(struct btrfs_workers *workers);
99void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
100int btrfs_requeue_work(struct btrfs_work *work);
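/*
 * Typical lifecycle, sketched for illustration only ("pool", "job" and
 * "my_func" are hypothetical names, not from the original patch): init
 * the pool with a thread cap, seed it with one kthread, queue embedded
 * work items, and stop it when done.
 *
 *	struct btrfs_workers pool;
 *
 *	btrfs_init_workers(&pool, "example", 4);
 *	btrfs_start_workers(&pool, 1);
 *	job->work.func = my_func;
 *	job->work.flags = 0;
 *	btrfs_queue_worker(&pool, &job->work);
 *	...
 *	btrfs_stop_workers(&pool);
 *
 * find_worker() in async-thread.c spawns additional kthreads on demand
 * up to the cap passed to btrfs_init_workers().
 */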
101#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
new file mode 100644
index 000000000000..a8c9693b75ac
--- /dev/null
+++ b/fs/btrfs/btrfs_inode.h
@@ -0,0 +1,131 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_I__
20#define __BTRFS_I__
21
22#include "extent_map.h"
23#include "extent_io.h"
24#include "ordered-data.h"
25
26/* in memory btrfs inode */
27struct btrfs_inode {
28 /* which subvolume this inode belongs to */
29 struct btrfs_root *root;
30
31 /* key used to find this inode on disk. This is used by the code
32 * to read in roots of subvolumes
33 */
34 struct btrfs_key location;
35
36 /* the extent_tree has caches of all the extent mappings to disk */
37 struct extent_map_tree extent_tree;
38
39 /* the io_tree does range state (DIRTY, LOCKED etc) */
40 struct extent_io_tree io_tree;
41
42 /* special utility tree used to record which mirrors have already been
43 * tried when checksums fail for a given block
44 */
45 struct extent_io_tree io_failure_tree;
46
47 /* held while inserting or deleting extents from files */
48 struct mutex extent_mutex;
49
50 /* held while logging the inode in tree-log.c */
51 struct mutex log_mutex;
52
53 /* used to order data wrt metadata */
54 struct btrfs_ordered_inode_tree ordered_tree;
55
56 /* standard acl pointers */
57 struct posix_acl *i_acl;
58 struct posix_acl *i_default_acl;
59
60 /* for keeping track of orphaned inodes */
61 struct list_head i_orphan;
62
63 /* list of all the delalloc inodes in the FS. There are times we need
64 * to write all the delalloc pages to disk, and this list is used
65 * to walk them all.
66 */
67 struct list_head delalloc_inodes;
68
69 /* full 64 bit generation number, struct vfs_inode doesn't have a big
70 * enough field for this.
71 */
72 u64 generation;
73
74 /* sequence number for NFS changes */
75 u64 sequence;
76
77 /*
78 * transid of the trans_handle that last modified this inode
79 */
80 u64 last_trans;
81 /*
82 * transid that last logged this inode
83 */
84 u64 logged_trans;
85
86 /*
87 * trans that last made a change that should be fully fsync'd. This
88 * gets reset to zero each time the inode is logged
89 */
90 u64 log_dirty_trans;
91
92 /* total number of bytes pending delalloc, used by stat to calc the
93 * real block usage of the file
94 */
95 u64 delalloc_bytes;
96
97 /*
98 * the size of the file stored in the metadata on disk. data=ordered
99 * means the in-memory i_size might be larger than the size on disk
100 * because not all the blocks are written yet.
101 */
102 u64 disk_i_size;
103
104 /* flags field from the on disk inode */
105 u32 flags;
106
107 /*
108 * if this is a directory then index_cnt is the counter for the index
109 * number for new files that are created
110 */
111 u64 index_cnt;
112
113 /* the start of block group preferred for allocations. */
114 u64 block_group;
115
116 struct inode vfs_inode;
117};
118
119static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
120{
121 return container_of(inode, struct btrfs_inode, vfs_inode);
122}
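/*
 * For illustration (not in the original patch): given a VFS inode that
 * belongs to btrfs, the owning subvolume is reached with
 *
 *	struct btrfs_root *root = BTRFS_I(inode)->root;
 */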
123
124static inline void btrfs_i_size_write(struct inode *inode, u64 size)
125{
126 inode->i_size = size;
127 BTRFS_I(inode)->disk_i_size = size;
128}
129
130
131#endif
diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
new file mode 100644
index 000000000000..7c4503ef6efd
--- /dev/null
+++ b/fs/btrfs/compat.h
@@ -0,0 +1,7 @@
1#ifndef _COMPAT_H_
2#define _COMPAT_H_
3
4#define btrfs_drop_nlink(inode) drop_nlink(inode)
5#define btrfs_inc_nlink(inode) inc_nlink(inode)
6
7#endif /* _COMPAT_H_ */
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
new file mode 100644
index 000000000000..ee848d8585d9
--- /dev/null
+++ b/fs/btrfs/compression.c
@@ -0,0 +1,709 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/buffer_head.h>
22#include <linux/file.h>
23#include <linux/fs.h>
24#include <linux/pagemap.h>
25#include <linux/highmem.h>
26#include <linux/time.h>
27#include <linux/init.h>
28#include <linux/string.h>
29#include <linux/smp_lock.h>
30#include <linux/backing-dev.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/bit_spinlock.h>
35#include <linux/version.h>
36#include <linux/pagevec.h>
37#include "compat.h"
38#include "ctree.h"
39#include "disk-io.h"
40#include "transaction.h"
41#include "btrfs_inode.h"
42#include "volumes.h"
43#include "ordered-data.h"
44#include "compression.h"
45#include "extent_io.h"
46#include "extent_map.h"
47
48struct compressed_bio {
49 /* number of bios pending for this compressed extent */
50 atomic_t pending_bios;
51
52 /* the pages with the compressed data on them */
53 struct page **compressed_pages;
54
55 /* inode that owns this data */
56 struct inode *inode;
57
58 /* starting offset in the inode for our pages */
59 u64 start;
60
61 /* number of bytes in the inode we're working on */
62 unsigned long len;
63
64 /* number of bytes on disk */
65 unsigned long compressed_len;
66
67 /* number of compressed pages in the array */
68 unsigned long nr_pages;
69
70 /* IO errors */
71 int errors;
72 int mirror_num;
73
74 /* for reads, this is the bio we are copying the data into */
75 struct bio *orig_bio;
76
77 /*
78 * the start of a variable length array of checksums only
79 * used by reads
80 */
81 u32 sums;
82};
83
84static inline int compressed_bio_size(struct btrfs_root *root,
85 unsigned long disk_size)
86{
87 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
88 return sizeof(struct compressed_bio) +
89 ((disk_size + root->sectorsize - 1) / root->sectorsize) *
90 csum_size;
91}
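/*
 * Worked example, for illustration only: assuming 4K sectors and the
 * 4-byte crc32c checksums btrfs uses here, a 128K compressed extent
 * covers 32 sectors, so this allocates sizeof(struct compressed_bio)
 * plus 32 * 4 bytes, with the checksum array growing out of the "sums"
 * member above.
 */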
92
93static struct bio *compressed_bio_alloc(struct block_device *bdev,
94 u64 first_byte, gfp_t gfp_flags)
95{
96 struct bio *bio;
97 int nr_vecs;
98
99 nr_vecs = bio_get_nr_vecs(bdev);
100 bio = bio_alloc(gfp_flags, nr_vecs);
101
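 /* on allocation failure in reclaim context, retry with progressively
  * smaller bios (annotation added for illustration) */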
102 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
103 while (!bio && (nr_vecs /= 2))
104 bio = bio_alloc(gfp_flags, nr_vecs);
105 }
106
107 if (bio) {
108 bio->bi_size = 0;
109 bio->bi_bdev = bdev;
110 bio->bi_sector = first_byte >> 9;
111 }
112 return bio;
113}
114
115static int check_compressed_csum(struct inode *inode,
116 struct compressed_bio *cb,
117 u64 disk_start)
118{
119 int ret;
120 struct btrfs_root *root = BTRFS_I(inode)->root;
121 struct page *page;
122 unsigned long i;
123 char *kaddr;
124 u32 csum;
125 u32 *cb_sum = &cb->sums;
126
127 if (btrfs_test_flag(inode, NODATASUM))
128 return 0;
129
130 for (i = 0; i < cb->nr_pages; i++) {
131 page = cb->compressed_pages[i];
132 csum = ~(u32)0;
133
134 kaddr = kmap_atomic(page, KM_USER0);
135 csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE);
136 btrfs_csum_final(csum, (char *)&csum);
137 kunmap_atomic(kaddr, KM_USER0);
138
139 if (csum != *cb_sum) {
140 printk(KERN_INFO "btrfs csum failed ino %lu "
141 "extent %llu csum %u "
142 "wanted %u mirror %d\n", inode->i_ino,
143 (unsigned long long)disk_start,
144 csum, *cb_sum, cb->mirror_num);
145 ret = -EIO;
146 goto fail;
147 }
148 cb_sum++;
149
150 }
151 ret = 0;
152fail:
153 return ret;
154}
155
156/* when we finish reading compressed pages from the disk, we
157 * decompress them and then run the bio end_io routines on the
158 * decompressed pages (in the inode address space).
159 *
160 * This allows the checksumming and other IO error handling routines
161 * to work normally
162 *
163 * The compressed pages are freed here, and it must be run
164 * in process context
165 */
166static void end_compressed_bio_read(struct bio *bio, int err)
167{
168 struct extent_io_tree *tree;
169 struct compressed_bio *cb = bio->bi_private;
170 struct inode *inode;
171 struct page *page;
172 unsigned long index;
173 int ret;
174
175 if (err)
176 cb->errors = 1;
177
178 /* if there are more bios still pending for this compressed
179 * extent, just exit
180 */
181 if (!atomic_dec_and_test(&cb->pending_bios))
182 goto out;
183
184 inode = cb->inode;
185 ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9);
186 if (ret)
187 goto csum_failed;
188
189 /* ok, we're the last bio for this extent, let's start
190 * the decompression.
191 */
192 tree = &BTRFS_I(inode)->io_tree;
193 ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
194 cb->start,
195 cb->orig_bio->bi_io_vec,
196 cb->orig_bio->bi_vcnt,
197 cb->compressed_len);
198csum_failed:
199 if (ret)
200 cb->errors = 1;
201
202 /* release the compressed pages */
203 index = 0;
204 for (index = 0; index < cb->nr_pages; index++) {
205 page = cb->compressed_pages[index];
206 page->mapping = NULL;
207 page_cache_release(page);
208 }
209
210 /* do io completion on the original bio */
211 if (cb->errors) {
212 bio_io_error(cb->orig_bio);
213 } else {
214 int bio_index = 0;
215 struct bio_vec *bvec = cb->orig_bio->bi_io_vec;
216
217 /*
218 * we have verified the checksum already, set page
219 * checked so the end_io handlers know about it
220 */
221 while (bio_index < cb->orig_bio->bi_vcnt) {
222 SetPageChecked(bvec->bv_page);
223 bvec++;
224 bio_index++;
225 }
226 bio_endio(cb->orig_bio, 0);
227 }
228
229 /* finally free the cb struct */
230 kfree(cb->compressed_pages);
231 kfree(cb);
232out:
233 bio_put(bio);
234}
235
236/*
237 * Clear the writeback bits on all of the file
238 * pages for a compressed write
239 */
240static noinline int end_compressed_writeback(struct inode *inode, u64 start,
241 unsigned long ram_size)
242{
243 unsigned long index = start >> PAGE_CACHE_SHIFT;
244 unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
245 struct page *pages[16];
246 unsigned long nr_pages = end_index - index + 1;
247 int i;
248 int ret;
249
250 while (nr_pages > 0) {
251 ret = find_get_pages_contig(inode->i_mapping, index,
252 min_t(unsigned long,
253 nr_pages, ARRAY_SIZE(pages)), pages);
254 if (ret == 0) {
255 nr_pages -= 1;
256 index += 1;
257 continue;
258 }
259 for (i = 0; i < ret; i++) {
260 end_page_writeback(pages[i]);
261 page_cache_release(pages[i]);
262 }
263 nr_pages -= ret;
264 index += ret;
265 }
266 /* the inode may be gone now */
267 return 0;
268}
269
270/*
271 * do the cleanup once all the compressed pages hit the disk.
272 * This will clear writeback on the file pages and free the compressed
273 * pages.
274 *
275 * This also calls the writeback end hooks for the file pages so that
276 * metadata and checksums can be updated in the file.
277 */
278static void end_compressed_bio_write(struct bio *bio, int err)
279{
280 struct extent_io_tree *tree;
281 struct compressed_bio *cb = bio->bi_private;
282 struct inode *inode;
283 struct page *page;
284 unsigned long index;
285
286 if (err)
287 cb->errors = 1;
288
289 /* if there are more bios still pending for this compressed
290 * extent, just exit
291 */
292 if (!atomic_dec_and_test(&cb->pending_bios))
293 goto out;
294
295 /* ok, we're the last bio for this extent, step one is to
296 * call back into the FS and do all the end_io operations
297 */
298 inode = cb->inode;
299 tree = &BTRFS_I(inode)->io_tree;
300 cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
301 tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
302 cb->start,
303 cb->start + cb->len - 1,
304 NULL, 1);
305 cb->compressed_pages[0]->mapping = NULL;
306
307 end_compressed_writeback(inode, cb->start, cb->len);
308 /* note, our inode could be gone now */
309
310 /*
311 * release the compressed pages, these came from alloc_page and
312 * are not attached to the inode at all
313 */
314 index = 0;
315 for (index = 0; index < cb->nr_pages; index++) {
316 page = cb->compressed_pages[index];
317 page->mapping = NULL;
318 page_cache_release(page);
319 }
320
321 /* finally free the cb struct */
322 kfree(cb->compressed_pages);
323 kfree(cb);
324out:
325 bio_put(bio);
326}
327
328/*
329 * worker function to build and submit bios for previously compressed pages.
330 * The corresponding pages in the inode should be marked for writeback
331 * and the compressed pages should have a reference on them for dropping
332 * when the IO is complete.
333 *
334 * This also checksums the file bytes and gets things ready for
335 * the end io hooks.
336 */
337int btrfs_submit_compressed_write(struct inode *inode, u64 start,
338 unsigned long len, u64 disk_start,
339 unsigned long compressed_len,
340 struct page **compressed_pages,
341 unsigned long nr_pages)
342{
343 struct bio *bio = NULL;
344 struct btrfs_root *root = BTRFS_I(inode)->root;
345 struct compressed_bio *cb;
346 unsigned long bytes_left;
347 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
348 int page_index = 0;
349 struct page *page;
350 u64 first_byte = disk_start;
351 struct block_device *bdev;
352 int ret;
353
354 WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
355 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
356 atomic_set(&cb->pending_bios, 0);
357 cb->errors = 0;
358 cb->inode = inode;
359 cb->start = start;
360 cb->len = len;
361 cb->mirror_num = 0;
362 cb->compressed_pages = compressed_pages;
363 cb->compressed_len = compressed_len;
364 cb->orig_bio = NULL;
365 cb->nr_pages = nr_pages;
366
367 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
368
369 bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
370 bio->bi_private = cb;
371 bio->bi_end_io = end_compressed_bio_write;
372 atomic_inc(&cb->pending_bios);
373
374 /* create and submit bios for the compressed pages */
375 bytes_left = compressed_len;
376 for (page_index = 0; page_index < cb->nr_pages; page_index++) {
377 page = compressed_pages[page_index];
378 page->mapping = inode->i_mapping;
379 if (bio->bi_size)
380 ret = io_tree->ops->merge_bio_hook(page, 0,
381 PAGE_CACHE_SIZE,
382 bio, 0);
383 else
384 ret = 0;
385
386 page->mapping = NULL;
387 if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
388 PAGE_CACHE_SIZE) {
389 bio_get(bio);
390
391 /*
392 * inc the count before we submit the bio so
393 * we know the end IO handler won't happen before
394 * we inc the count. Otherwise, the cb might get
395 * freed before we're done setting it up
396 */
397 atomic_inc(&cb->pending_bios);
398 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
399 BUG_ON(ret);
400
401 ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
402 BUG_ON(ret);
403
404 ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
405 BUG_ON(ret);
406
407 bio_put(bio);
408
409 bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
410 bio->bi_private = cb;
411 bio->bi_end_io = end_compressed_bio_write;
412 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
413 }
414 if (bytes_left < PAGE_CACHE_SIZE) {
415 printk("bytes left %lu compress len %lu nr %lu\n",
416 bytes_left, cb->compressed_len, cb->nr_pages);
417 }
418 bytes_left -= PAGE_CACHE_SIZE;
419 first_byte += PAGE_CACHE_SIZE;
420 cond_resched();
421 }
422 bio_get(bio);
423
424 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
425 BUG_ON(ret);
426
427 ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
428 BUG_ON(ret);
429
430 ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
431 BUG_ON(ret);
432
433 bio_put(bio);
434 return 0;
435}
436
437static noinline int add_ra_bio_pages(struct inode *inode,
438 u64 compressed_end,
439 struct compressed_bio *cb)
440{
441 unsigned long end_index;
442 unsigned long page_index;
443 u64 last_offset;
444 u64 isize = i_size_read(inode);
445 int ret;
446 struct page *page;
447 unsigned long nr_pages = 0;
448 struct extent_map *em;
449 struct address_space *mapping = inode->i_mapping;
450 struct pagevec pvec;
451 struct extent_map_tree *em_tree;
452 struct extent_io_tree *tree;
453 u64 end;
454 int misses = 0;
455
456 page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page;
457 last_offset = (page_offset(page) + PAGE_CACHE_SIZE);
458 em_tree = &BTRFS_I(inode)->extent_tree;
459 tree = &BTRFS_I(inode)->io_tree;
460
461 if (isize == 0)
462 return 0;
463
464 end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
465
466 pagevec_init(&pvec, 0);
467 while (last_offset < compressed_end) {
468 page_index = last_offset >> PAGE_CACHE_SHIFT;
469
470 if (page_index > end_index)
471 break;
472
473 rcu_read_lock();
474 page = radix_tree_lookup(&mapping->page_tree, page_index);
475 rcu_read_unlock();
476 if (page) {
477 misses++;
478 if (misses > 4)
479 break;
480 goto next;
481 }
482
483 page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS);
484 if (!page)
485 break;
486
487 page->index = page_index;
488 /*
489 * what we want to do here is call add_to_page_cache_lru,
490 * but that isn't exported, so we reproduce it here
491 */
492 if (add_to_page_cache(page, mapping,
493 page->index, GFP_NOFS)) {
494 page_cache_release(page);
495 goto next;
496 }
497
498 /* open coding of lru_cache_add, also not exported */
499 page_cache_get(page);
500 if (!pagevec_add(&pvec, page))
501 __pagevec_lru_add_file(&pvec);
502
503 end = last_offset + PAGE_CACHE_SIZE - 1;
504 /*
505 * at this point, we have a locked page in the page cache
506 * for these bytes in the file. But, we have to make
507 * sure they map to this compressed extent on disk.
508 */
509 set_page_extent_mapped(page);
510 lock_extent(tree, last_offset, end, GFP_NOFS);
511 spin_lock(&em_tree->lock);
512 em = lookup_extent_mapping(em_tree, last_offset,
513 PAGE_CACHE_SIZE);
514 spin_unlock(&em_tree->lock);
515
516 if (!em || last_offset < em->start ||
517 (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
518 (em->block_start >> 9) != cb->orig_bio->bi_sector) {
519 free_extent_map(em);
520 unlock_extent(tree, last_offset, end, GFP_NOFS);
521 unlock_page(page);
522 page_cache_release(page);
523 break;
524 }
525 free_extent_map(em);
526
527 if (page->index == end_index) {
528 char *userpage;
529 size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1);
530
531 if (zero_offset) {
532 int zeros;
533 zeros = PAGE_CACHE_SIZE - zero_offset;
534 userpage = kmap_atomic(page, KM_USER0);
535 memset(userpage + zero_offset, 0, zeros);
536 flush_dcache_page(page);
537 kunmap_atomic(userpage, KM_USER0);
538 }
539 }
540
541 ret = bio_add_page(cb->orig_bio, page,
542 PAGE_CACHE_SIZE, 0);
543
544 if (ret == PAGE_CACHE_SIZE) {
545 nr_pages++;
546 page_cache_release(page);
547 } else {
548 unlock_extent(tree, last_offset, end, GFP_NOFS);
549 unlock_page(page);
550 page_cache_release(page);
551 break;
552 }
553next:
554 last_offset += PAGE_CACHE_SIZE;
555 }
556 if (pagevec_count(&pvec))
557 __pagevec_lru_add_file(&pvec);
558 return 0;
559}
560
561/*
562 * for a compressed read, the bio we get passed has all the inode pages
563 * in it. We don't actually do IO on those pages but allocate new ones
564 * to hold the compressed pages on disk.
565 *
566 * bio->bi_sector points to the compressed extent on disk
567 * bio->bi_io_vec points to all of the inode pages
568 * bio->bi_vcnt is a count of pages
569 *
570 * After the compressed pages are read, we copy the bytes into the
571 * bio we were passed and then call the bio end_io calls
572 */
573int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
574 int mirror_num, unsigned long bio_flags)
575{
576 struct extent_io_tree *tree;
577 struct extent_map_tree *em_tree;
578 struct compressed_bio *cb;
579 struct btrfs_root *root = BTRFS_I(inode)->root;
580 unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
581 unsigned long compressed_len;
582 unsigned long nr_pages;
583 unsigned long page_index;
584 struct page *page;
585 struct block_device *bdev;
586 struct bio *comp_bio;
587 u64 cur_disk_byte = (u64)bio->bi_sector << 9;
588 u64 em_len;
589 u64 em_start;
590 struct extent_map *em;
591 int ret;
592 u32 *sums;
593
594 tree = &BTRFS_I(inode)->io_tree;
595 em_tree = &BTRFS_I(inode)->extent_tree;
596
597 /* we need the actual starting offset of this extent in the file */
598 spin_lock(&em_tree->lock);
599 em = lookup_extent_mapping(em_tree,
600 page_offset(bio->bi_io_vec->bv_page),
601 PAGE_CACHE_SIZE);
602 spin_unlock(&em_tree->lock);
603
604 compressed_len = em->block_len;
605 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
606 atomic_set(&cb->pending_bios, 0);
607 cb->errors = 0;
608 cb->inode = inode;
609 cb->mirror_num = mirror_num;
610 sums = &cb->sums;
611
612 cb->start = em->orig_start;
613 em_len = em->len;
614 em_start = em->start;
615
616 free_extent_map(em);
617 em = NULL;
618
619 cb->len = uncompressed_len;
620 cb->compressed_len = compressed_len;
621 cb->orig_bio = bio;
622
623 nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
624 PAGE_CACHE_SIZE;
625 cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages,
626 GFP_NOFS);
627 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
628
629 for (page_index = 0; page_index < nr_pages; page_index++) {
630 cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
631 __GFP_HIGHMEM);
632 }
633 cb->nr_pages = nr_pages;
634
635 add_ra_bio_pages(inode, em_start + em_len, cb);
636
637 /* include any pages we added in add_ra_bio_pages */
638 uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
639 cb->len = uncompressed_len;
640
641 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
642 comp_bio->bi_private = cb;
643 comp_bio->bi_end_io = end_compressed_bio_read;
644 atomic_inc(&cb->pending_bios);
645
646 for (page_index = 0; page_index < nr_pages; page_index++) {
647 page = cb->compressed_pages[page_index];
648 page->mapping = inode->i_mapping;
649 page->index = em_start >> PAGE_CACHE_SHIFT;
650
651 if (comp_bio->bi_size)
652 ret = tree->ops->merge_bio_hook(page, 0,
653 PAGE_CACHE_SIZE,
654 comp_bio, 0);
655 else
656 ret = 0;
657
658 page->mapping = NULL;
659 if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
660 PAGE_CACHE_SIZE) {
661 bio_get(comp_bio);
662
663 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
664 BUG_ON(ret);
665
666 /*
667 * inc the count before we submit the bio so
668 * we know the end IO handler won't happen before
669 * we inc the count. Otherwise, the cb might get
670 * freed before we're done setting it up
671 */
672 atomic_inc(&cb->pending_bios);
673
674 if (!btrfs_test_flag(inode, NODATASUM)) {
675 btrfs_lookup_bio_sums(root, inode, comp_bio,
676 sums);
677 }
678 sums += (comp_bio->bi_size + root->sectorsize - 1) /
679 root->sectorsize;
680
681 ret = btrfs_map_bio(root, READ, comp_bio,
682 mirror_num, 0);
683 BUG_ON(ret);
684
685 bio_put(comp_bio);
686
687 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
688 GFP_NOFS);
689 comp_bio->bi_private = cb;
690 comp_bio->bi_end_io = end_compressed_bio_read;
691
692 bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0);
693 }
694 cur_disk_byte += PAGE_CACHE_SIZE;
695 }
696 bio_get(comp_bio);
697
698 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
699 BUG_ON(ret);
700
701 if (!btrfs_test_flag(inode, NODATASUM))
702 btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
703
704 ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
705 BUG_ON(ret);
706
707 bio_put(comp_bio);
708 return 0;
709}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
new file mode 100644
index 000000000000..421f5b4aa715
--- /dev/null
+++ b/fs/btrfs/compression.h
@@ -0,0 +1,47 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_COMPRESSION_
20#define __BTRFS_COMPRESSION_
21
22int btrfs_zlib_decompress(unsigned char *data_in,
23 struct page *dest_page,
24 unsigned long start_byte,
25 size_t srclen, size_t destlen);
26int btrfs_zlib_compress_pages(struct address_space *mapping,
27 u64 start, unsigned long len,
28 struct page **pages,
29 unsigned long nr_dest_pages,
30 unsigned long *out_pages,
31 unsigned long *total_in,
32 unsigned long *total_out,
33 unsigned long max_out);
34int btrfs_zlib_decompress_biovec(struct page **pages_in,
35 u64 disk_start,
36 struct bio_vec *bvec,
37 int vcnt,
38 size_t srclen);
39void btrfs_zlib_exit(void);
40int btrfs_submit_compressed_write(struct inode *inode, u64 start,
41 unsigned long len, u64 disk_start,
42 unsigned long compressed_len,
43 struct page **compressed_pages,
44 unsigned long nr_pages);
45int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
46 int mirror_num, unsigned long bio_flags);
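/*
 * Sketch, for illustration only (not from the original patch): filling
 * one page from the start of a compressed stream of srclen bytes could
 * look like
 *
 *	ret = btrfs_zlib_decompress(data_in, page, 0, srclen,
 *				    PAGE_CACHE_SIZE);
 *
 * with a nonzero return indicating a corrupt or truncated stream.
 */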
47#endif
diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h
new file mode 100644
index 000000000000..6e1b3de36700
--- /dev/null
+++ b/fs/btrfs/crc32c.h
@@ -0,0 +1,29 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_CRC32C__
20#define __BTRFS_CRC32C__
21#include <linux/crc32c.h>
22
23/*
24 * this file used to do more for selecting the HW version of crc32c,
25 * perhaps it will one day again soon.
26 */
27#define btrfs_crc32c(seed, data, length) crc32c(seed, data, length)
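/*
 * Usage sketch, illustrative only: callers seed the checksum with all
 * ones, as check_compressed_csum() in compression.c does, e.g.
 *
 *	u32 crc = btrfs_crc32c(~(u32)0, buf, len);
 */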
28#endif
29
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
new file mode 100644
index 000000000000..9e46c0776816
--- /dev/null
+++ b/fs/btrfs/ctree.c
@@ -0,0 +1,3953 @@
1/*
2 * Copyright (C) 2007,2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "disk-io.h"
22#include "transaction.h"
23#include "print-tree.h"
24#include "locking.h"
25
26static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
27 *root, struct btrfs_path *path, int level);
28static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
29 *root, struct btrfs_key *ins_key,
30 struct btrfs_path *path, int data_size, int extend);
31static int push_node_left(struct btrfs_trans_handle *trans,
32 struct btrfs_root *root, struct extent_buffer *dst,
33 struct extent_buffer *src, int empty);
34static int balance_node_right(struct btrfs_trans_handle *trans,
35 struct btrfs_root *root,
36 struct extent_buffer *dst_buf,
37 struct extent_buffer *src_buf);
38static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
39 struct btrfs_path *path, int level, int slot);
40
41inline void btrfs_init_path(struct btrfs_path *p)
42{
43 memset(p, 0, sizeof(*p));
44}
45
46struct btrfs_path *btrfs_alloc_path(void)
47{
48 struct btrfs_path *path;
49 path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
50 if (path) {
51 btrfs_init_path(path);
52 path->reada = 1;
53 }
54 return path;
55}
56
57/* this also releases the path */
58void btrfs_free_path(struct btrfs_path *p)
59{
60 btrfs_release_path(NULL, p);
61 kmem_cache_free(btrfs_path_cachep, p);
62}
63
64/*
65 * path release drops references on the extent buffers in the path
66 * and it drops any locks held by this path
67 *
68 * It is safe to call this on paths that have no locks or extent buffers held.
69 */
70noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
71{
72 int i;
73
74 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
75 p->slots[i] = 0;
76 if (!p->nodes[i])
77 continue;
78 if (p->locks[i]) {
79 btrfs_tree_unlock(p->nodes[i]);
80 p->locks[i] = 0;
81 }
82 free_extent_buffer(p->nodes[i]);
83 p->nodes[i] = NULL;
84 }
85}
86
87/*
88 * safely gets a reference on the root node of a tree. A lock
89 * is not taken, so a concurrent writer may put a different node
90 * at the root of the tree. See btrfs_lock_root_node for the
91 * looping required.
92 *
93 * The extent buffer returned by this has a reference taken, so
94 * it won't disappear. It may stop being the root of the tree
95 * at any time because there are no locks held.
96 */
97struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
98{
99 struct extent_buffer *eb;
100 spin_lock(&root->node_lock);
101 eb = root->node;
102 extent_buffer_get(eb);
103 spin_unlock(&root->node_lock);
104 return eb;
105}
106
107/* loop around taking references on and locking the root node of the
108 * tree until you end up with a lock on the root. A locked buffer
109 * is returned, with a reference held.
110 */
111struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
112{
113 struct extent_buffer *eb;
114
115 while (1) {
116 eb = btrfs_root_node(root);
117 btrfs_tree_lock(eb);
118
119 spin_lock(&root->node_lock);
120 if (eb == root->node) {
121 spin_unlock(&root->node_lock);
122 break;
123 }
124 spin_unlock(&root->node_lock);
125
126 btrfs_tree_unlock(eb);
127 free_extent_buffer(eb);
128 }
129 return eb;
130}
131
132/* cow-only roots (everything not a reference counted cow subvolume) just get
133 * put onto a simple dirty list. transaction.c walks this to make sure they
134 * get properly updated on disk.
135 */
136static void add_root_to_dirty_list(struct btrfs_root *root)
137{
138 if (root->track_dirty && list_empty(&root->dirty_list)) {
139 list_add(&root->dirty_list,
140 &root->fs_info->dirty_cowonly_roots);
141 }
142}
143
144/*
145 * used by snapshot creation to make a copy of a root for a tree with
146 * a given objectid. The buffer with the new root node is returned in
147 * cow_ret, and this func returns zero on success or a negative error code.
148 */
149int btrfs_copy_root(struct btrfs_trans_handle *trans,
150 struct btrfs_root *root,
151 struct extent_buffer *buf,
152 struct extent_buffer **cow_ret, u64 new_root_objectid)
153{
154 struct extent_buffer *cow;
155 u32 nritems;
156 int ret = 0;
157 int level;
158 struct btrfs_root *new_root;
159
160 new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
161 if (!new_root)
162 return -ENOMEM;
163
164 memcpy(new_root, root, sizeof(*new_root));
165 new_root->root_key.objectid = new_root_objectid;
166
167 WARN_ON(root->ref_cows && trans->transid !=
168 root->fs_info->running_transaction->transid);
169 WARN_ON(root->ref_cows && trans->transid != root->last_trans);
170
171 level = btrfs_header_level(buf);
172 nritems = btrfs_header_nritems(buf);
173
174 cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0,
175 new_root_objectid, trans->transid,
176 level, buf->start, 0);
177 if (IS_ERR(cow)) {
178 kfree(new_root);
179 return PTR_ERR(cow);
180 }
181
182 copy_extent_buffer(cow, buf, 0, 0, cow->len);
183 btrfs_set_header_bytenr(cow, cow->start);
184 btrfs_set_header_generation(cow, trans->transid);
185 btrfs_set_header_owner(cow, new_root_objectid);
186 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
187
188 write_extent_buffer(cow, root->fs_info->fsid,
189 (unsigned long)btrfs_header_fsid(cow),
190 BTRFS_FSID_SIZE);
191
192 WARN_ON(btrfs_header_generation(buf) > trans->transid);
193 ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL);
194 kfree(new_root);
195
196 if (ret)
197 return ret;
198
199 btrfs_mark_buffer_dirty(cow);
200 *cow_ret = cow;
201 return 0;
202}
203
204/*
205 * does the dirty work in cow of a single block. The parent block (if
206 * supplied) is updated to point to the new cow copy. The new buffer is marked
207 * dirty and returned locked. If you modify the block it needs to be marked
208 * dirty again.
209 *
210 * search_start -- an allocation hint for the new block
211 *
212 * empty_size -- a hint that you plan on doing more cow. This is the size in
213 * bytes the allocator should try to find free next to the block it returns.
214 * This is just a hint and may be ignored by the allocator.
215 *
216 * prealloc_dest -- if you have already reserved a destination for the cow,
217 * this uses that block instead of allocating a new one.
218 * btrfs_alloc_reserved_extent is used to finish the allocation.
219 */
220static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
221 struct btrfs_root *root,
222 struct extent_buffer *buf,
223 struct extent_buffer *parent, int parent_slot,
224 struct extent_buffer **cow_ret,
225 u64 search_start, u64 empty_size,
226 u64 prealloc_dest)
227{
228 u64 parent_start;
229 struct extent_buffer *cow;
230 u32 nritems;
231 int ret = 0;
232 int level;
233 int unlock_orig = 0;
234
235 if (*cow_ret == buf)
236 unlock_orig = 1;
237
238 WARN_ON(!btrfs_tree_locked(buf));
239
240 if (parent)
241 parent_start = parent->start;
242 else
243 parent_start = 0;
244
245 WARN_ON(root->ref_cows && trans->transid !=
246 root->fs_info->running_transaction->transid);
247 WARN_ON(root->ref_cows && trans->transid != root->last_trans);
248
249 level = btrfs_header_level(buf);
250 nritems = btrfs_header_nritems(buf);
251
252 if (prealloc_dest) {
253 struct btrfs_key ins;
254
255 ins.objectid = prealloc_dest;
256 ins.offset = buf->len;
257 ins.type = BTRFS_EXTENT_ITEM_KEY;
258
259 ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
260 root->root_key.objectid,
261 trans->transid, level, &ins);
262 BUG_ON(ret);
263 cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
264 buf->len);
265 } else {
266 cow = btrfs_alloc_free_block(trans, root, buf->len,
267 parent_start,
268 root->root_key.objectid,
269 trans->transid, level,
270 search_start, empty_size);
271 }
272 if (IS_ERR(cow))
273 return PTR_ERR(cow);
274
275 copy_extent_buffer(cow, buf, 0, 0, cow->len);
276 btrfs_set_header_bytenr(cow, cow->start);
277 btrfs_set_header_generation(cow, trans->transid);
278 btrfs_set_header_owner(cow, root->root_key.objectid);
279 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
280
281 write_extent_buffer(cow, root->fs_info->fsid,
282 (unsigned long)btrfs_header_fsid(cow),
283 BTRFS_FSID_SIZE);
284
285 WARN_ON(btrfs_header_generation(buf) > trans->transid);
286 if (btrfs_header_generation(buf) != trans->transid) {
287 u32 nr_extents;
288 ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
289 if (ret)
290 return ret;
291
292 ret = btrfs_cache_ref(trans, root, buf, nr_extents);
293 WARN_ON(ret);
294 } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
295 /*
296 * There are only two places that can drop reference to
297 * tree blocks owned by living reloc trees, one is here,
298 * the other place is btrfs_drop_subtree. In both places,
299 * we check reference count while tree block is locked.
300 * Furthermore, if reference count is one, it won't get
301 * increased by someone else.
302 */
303 u32 refs;
304 ret = btrfs_lookup_extent_ref(trans, root, buf->start,
305 buf->len, &refs);
306 BUG_ON(ret);
307 if (refs == 1) {
308 ret = btrfs_update_ref(trans, root, buf, cow,
309 0, nritems);
310 clean_tree_block(trans, root, buf);
311 } else {
312 ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
313 }
314 BUG_ON(ret);
315 } else {
316 ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
317 if (ret)
318 return ret;
319 clean_tree_block(trans, root, buf);
320 }
321
322 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
323 ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
324 WARN_ON(ret);
325 }
326
327 if (buf == root->node) {
328 WARN_ON(parent && parent != buf);
329
330 spin_lock(&root->node_lock);
331 root->node = cow;
332 extent_buffer_get(cow);
333 spin_unlock(&root->node_lock);
334
335 if (buf != root->commit_root) {
336 btrfs_free_extent(trans, root, buf->start,
337 buf->len, buf->start,
338 root->root_key.objectid,
339 btrfs_header_generation(buf),
340 level, 1);
341 }
342 free_extent_buffer(buf);
343 add_root_to_dirty_list(root);
344 } else {
345 btrfs_set_node_blockptr(parent, parent_slot,
346 cow->start);
347 WARN_ON(trans->transid == 0);
348 btrfs_set_node_ptr_generation(parent, parent_slot,
349 trans->transid);
350 btrfs_mark_buffer_dirty(parent);
351 WARN_ON(btrfs_header_generation(parent) != trans->transid);
352 btrfs_free_extent(trans, root, buf->start, buf->len,
353 parent_start, btrfs_header_owner(parent),
354 btrfs_header_generation(parent), level, 1);
355 }
356 if (unlock_orig)
357 btrfs_tree_unlock(buf);
358 free_extent_buffer(buf);
359 btrfs_mark_buffer_dirty(cow);
360 *cow_ret = cow;
361 return 0;
362}
363
364/*
365 * cows a single block, see __btrfs_cow_block for the real work.
366 * This version of it has extra checks so that a block isn't cow'd more than
367 * once per transaction, as long as it hasn't been written yet
368 */
369noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
370 struct btrfs_root *root, struct extent_buffer *buf,
371 struct extent_buffer *parent, int parent_slot,
372 struct extent_buffer **cow_ret, u64 prealloc_dest)
373{
374 u64 search_start;
375 int ret;
376
377 if (trans->transaction != root->fs_info->running_transaction) {
378 printk(KERN_CRIT "trans %llu running %llu\n",
379 (unsigned long long)trans->transid,
380 (unsigned long long)
381 root->fs_info->running_transaction->transid);
382 WARN_ON(1);
383 }
384 if (trans->transid != root->fs_info->generation) {
385 printk(KERN_CRIT "trans %llu running %llu\n",
386 (unsigned long long)trans->transid,
387 (unsigned long long)root->fs_info->generation);
388 WARN_ON(1);
389 }
390
391 spin_lock(&root->fs_info->hash_lock);
392 if (btrfs_header_generation(buf) == trans->transid &&
393 btrfs_header_owner(buf) == root->root_key.objectid &&
394 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
395 *cow_ret = buf;
396 spin_unlock(&root->fs_info->hash_lock);
397 WARN_ON(prealloc_dest);
398 return 0;
399 }
400 spin_unlock(&root->fs_info->hash_lock);
401 search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
402 ret = __btrfs_cow_block(trans, root, buf, parent,
403 parent_slot, cow_ret, search_start, 0,
404 prealloc_dest);
405 return ret;
406}
407
408/*
409 * helper function for defrag to decide if two blocks pointed to by a
410 * node are actually close by
411 */
412static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
413{
414 if (blocknr < other && other - (blocknr + blocksize) < 32768)
415 return 1;
416 if (blocknr > other && blocknr - (other + blocksize) < 32768)
417 return 1;
418 return 0;
419}
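/*
 * Worked example, illustrative only: with 4K blocks, blocknr 0 and
 * other 36864 leave a gap of exactly 32768 bytes and are not close,
 * while other 36863 would be.
 */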
420
421/*
422 * compare two keys in a memcmp fashion
423 */
424static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
425{
426 struct btrfs_key k1;
427
428 btrfs_disk_key_to_cpu(&k1, disk);
429
430 if (k1.objectid > k2->objectid)
431 return 1;
432 if (k1.objectid < k2->objectid)
433 return -1;
434 if (k1.type > k2->type)
435 return 1;
436 if (k1.type < k2->type)
437 return -1;
438 if (k1.offset > k2->offset)
439 return 1;
440 if (k1.offset < k2->offset)
441 return -1;
442 return 0;
443}
444
445/*
446 * same as comp_keys only with two btrfs_key's
447 */
448static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
449{
450 if (k1->objectid > k2->objectid)
451 return 1;
452 if (k1->objectid < k2->objectid)
453 return -1;
454 if (k1->type > k2->type)
455 return 1;
456 if (k1->type < k2->type)
457 return -1;
458 if (k1->offset > k2->offset)
459 return 1;
460 if (k1->offset < k2->offset)
461 return -1;
462 return 0;
463}
464
465/*
466 * this is used by the defrag code to go through all the
467 * leaves pointed to by a node and reallocate them so that
468 * disk order is close to key order
469 */
470int btrfs_realloc_node(struct btrfs_trans_handle *trans,
471 struct btrfs_root *root, struct extent_buffer *parent,
472 int start_slot, int cache_only, u64 *last_ret,
473 struct btrfs_key *progress)
474{
475 struct extent_buffer *cur;
476 u64 blocknr;
477 u64 gen;
478 u64 search_start = *last_ret;
479 u64 last_block = 0;
480 u64 other;
481 u32 parent_nritems;
482 int end_slot;
483 int i;
484 int err = 0;
485 int parent_level;
486 int uptodate;
487 u32 blocksize;
488 int progress_passed = 0;
489 struct btrfs_disk_key disk_key;
490
491 parent_level = btrfs_header_level(parent);
492 if (cache_only && parent_level != 1)
493 return 0;
494
495 if (trans->transaction != root->fs_info->running_transaction)
496 WARN_ON(1);
497 if (trans->transid != root->fs_info->generation)
498 WARN_ON(1);
499
500 parent_nritems = btrfs_header_nritems(parent);
501 blocksize = btrfs_level_size(root, parent_level - 1);
502 end_slot = parent_nritems;
503
504 if (parent_nritems == 1)
505 return 0;
506
507 for (i = start_slot; i < end_slot; i++) {
508 int close = 1;
509
510 if (!parent->map_token) {
511 map_extent_buffer(parent,
512 btrfs_node_key_ptr_offset(i),
513 sizeof(struct btrfs_key_ptr),
514 &parent->map_token, &parent->kaddr,
515 &parent->map_start, &parent->map_len,
516 KM_USER1);
517 }
518 btrfs_node_key(parent, &disk_key, i);
519 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
520 continue;
521
522 progress_passed = 1;
523 blocknr = btrfs_node_blockptr(parent, i);
524 gen = btrfs_node_ptr_generation(parent, i);
525 if (last_block == 0)
526 last_block = blocknr;
527
528 if (i > 0) {
529 other = btrfs_node_blockptr(parent, i - 1);
530 close = close_blocks(blocknr, other, blocksize);
531 }
532 if (!close && i < end_slot - 2) {
533 other = btrfs_node_blockptr(parent, i + 1);
534 close = close_blocks(blocknr, other, blocksize);
535 }
536 if (close) {
537 last_block = blocknr;
538 continue;
539 }
540 if (parent->map_token) {
541 unmap_extent_buffer(parent, parent->map_token,
542 KM_USER1);
543 parent->map_token = NULL;
544 }
545
546 cur = btrfs_find_tree_block(root, blocknr, blocksize);
547 if (cur)
548 uptodate = btrfs_buffer_uptodate(cur, gen);
549 else
550 uptodate = 0;
551 if (!cur || !uptodate) {
552 if (cache_only) {
553 free_extent_buffer(cur);
554 continue;
555 }
556 if (!cur) {
557 cur = read_tree_block(root, blocknr,
558 blocksize, gen);
559 } else if (!uptodate) {
560 btrfs_read_buffer(cur, gen);
561 }
562 }
563 if (search_start == 0)
564 search_start = last_block;
565
566 btrfs_tree_lock(cur);
567 err = __btrfs_cow_block(trans, root, cur, parent, i,
568 &cur, search_start,
569 min(16 * blocksize,
570 (end_slot - i) * blocksize), 0);
571 if (err) {
572 btrfs_tree_unlock(cur);
573 free_extent_buffer(cur);
574 break;
575 }
576 search_start = cur->start;
577 last_block = cur->start;
578 *last_ret = search_start;
579 btrfs_tree_unlock(cur);
580 free_extent_buffer(cur);
581 }
582 if (parent->map_token) {
583 unmap_extent_buffer(parent, parent->map_token,
584 KM_USER1);
585 parent->map_token = NULL;
586 }
587 return err;
588}
589
590/*
591 * The leaf data grows from end-to-front in the node.
592  * this returns the offset of the start of the last item,
593  * which is the current top of the leaf data stack
594 */
595static inline unsigned int leaf_data_end(struct btrfs_root *root,
596 struct extent_buffer *leaf)
597{
598 u32 nr = btrfs_header_nritems(leaf);
599 if (nr == 0)
600 return BTRFS_LEAF_DATA_SIZE(root);
601 return btrfs_item_offset_nr(leaf, nr - 1);
602}
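/*
 * Worked example (hypothetical numbers): in a leaf with a 4096 byte data
 * area holding two items of 100 and 50 data bytes, item 0 sits at offset
 * 3996 and item 1 at 3946, so leaf_data_end() returns 3946.  An empty
 * leaf returns the full BTRFS_LEAF_DATA_SIZE(root).
 */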
603
604/*
605  * extra debugging checks to make sure all the keys in a node are
606  * well formed and in the proper order
607 */
608static int check_node(struct btrfs_root *root, struct btrfs_path *path,
609 int level)
610{
611 struct extent_buffer *parent = NULL;
612 struct extent_buffer *node = path->nodes[level];
613 struct btrfs_disk_key parent_key;
614 struct btrfs_disk_key node_key;
615 int parent_slot;
616 int slot;
617 struct btrfs_key cpukey;
618 u32 nritems = btrfs_header_nritems(node);
619
620 if (path->nodes[level + 1])
621 parent = path->nodes[level + 1];
622
623 slot = path->slots[level];
624 BUG_ON(nritems == 0);
625 if (parent) {
626 parent_slot = path->slots[level + 1];
627 btrfs_node_key(parent, &parent_key, parent_slot);
628 btrfs_node_key(node, &node_key, 0);
629 BUG_ON(memcmp(&parent_key, &node_key,
630 sizeof(struct btrfs_disk_key)));
631 BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
632 btrfs_header_bytenr(node));
633 }
634 BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
635 if (slot != 0) {
636 btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
637 btrfs_node_key(node, &node_key, slot);
638 BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
639 }
640 if (slot < nritems - 1) {
641 btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
642 btrfs_node_key(node, &node_key, slot);
643 BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
644 }
645 return 0;
646}
647
648/*
649 * extra checking to make sure all the items in a leaf are
650 * well formed and in the proper order
651 */
652static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
653 int level)
654{
655 struct extent_buffer *leaf = path->nodes[level];
656 struct extent_buffer *parent = NULL;
657 int parent_slot;
658 struct btrfs_key cpukey;
659 struct btrfs_disk_key parent_key;
660 struct btrfs_disk_key leaf_key;
661 int slot = path->slots[0];
662
663 u32 nritems = btrfs_header_nritems(leaf);
664
665 if (path->nodes[level + 1])
666 parent = path->nodes[level + 1];
667
668 if (nritems == 0)
669 return 0;
670
671 if (parent) {
672 parent_slot = path->slots[level + 1];
673 btrfs_node_key(parent, &parent_key, parent_slot);
674 btrfs_item_key(leaf, &leaf_key, 0);
675
676 BUG_ON(memcmp(&parent_key, &leaf_key,
677 sizeof(struct btrfs_disk_key)));
678 BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
679 btrfs_header_bytenr(leaf));
680 }
681 if (slot != 0 && slot < nritems - 1) {
682 btrfs_item_key(leaf, &leaf_key, slot);
683 btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
684 if (comp_keys(&leaf_key, &cpukey) <= 0) {
685 btrfs_print_leaf(root, leaf);
686 printk(KERN_CRIT "slot %d offset bad key\n", slot);
687 BUG_ON(1);
688 }
689 if (btrfs_item_offset_nr(leaf, slot - 1) !=
690 btrfs_item_end_nr(leaf, slot)) {
691 btrfs_print_leaf(root, leaf);
692 printk(KERN_CRIT "slot %d offset bad\n", slot);
693 BUG_ON(1);
694 }
695 }
696 if (slot < nritems - 1) {
697 btrfs_item_key(leaf, &leaf_key, slot);
698 btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
699 BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
700 if (btrfs_item_offset_nr(leaf, slot) !=
701 btrfs_item_end_nr(leaf, slot + 1)) {
702 btrfs_print_leaf(root, leaf);
703 printk(KERN_CRIT "slot %d offset bad\n", slot);
704 BUG_ON(1);
705 }
706 }
707 BUG_ON(btrfs_item_offset_nr(leaf, 0) +
708 btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
709 return 0;
710}
711
712static noinline int check_block(struct btrfs_root *root,
713 struct btrfs_path *path, int level)
714{
715 	return 0;	/* the expensive checks below are currently disabled */
716 if (level == 0)
717 return check_leaf(root, path, level);
718 return check_node(root, path, level);
719}
720
721/*
722 * search for key in the extent_buffer. The items start at offset p,
723 * and they are item_size apart. There are 'max' items in p.
724 *
725 * the slot in the array is returned via slot, and it points to
726 * the place where you would insert key if it is not found in
727 * the array.
728 *
729 * slot may point to max if the key is bigger than all of the keys
730 */
731static noinline int generic_bin_search(struct extent_buffer *eb,
732 unsigned long p,
733 int item_size, struct btrfs_key *key,
734 int max, int *slot)
735{
736 int low = 0;
737 int high = max;
738 int mid;
739 int ret;
740 struct btrfs_disk_key *tmp = NULL;
741 struct btrfs_disk_key unaligned;
742 unsigned long offset;
743 char *map_token = NULL;
744 char *kaddr = NULL;
745 unsigned long map_start = 0;
746 unsigned long map_len = 0;
747 int err;
748
749 while (low < high) {
750 mid = (low + high) / 2;
751 offset = p + mid * item_size;
752
753 if (!map_token || offset < map_start ||
754 (offset + sizeof(struct btrfs_disk_key)) >
755 map_start + map_len) {
756 if (map_token) {
757 unmap_extent_buffer(eb, map_token, KM_USER0);
758 map_token = NULL;
759 }
760
761 err = map_private_extent_buffer(eb, offset,
762 sizeof(struct btrfs_disk_key),
763 &map_token, &kaddr,
764 &map_start, &map_len, KM_USER0);
765
766 if (!err) {
767 tmp = (struct btrfs_disk_key *)(kaddr + offset -
768 map_start);
769 } else {
770 read_extent_buffer(eb, &unaligned,
771 offset, sizeof(unaligned));
772 tmp = &unaligned;
773 }
774
775 } else {
776 tmp = (struct btrfs_disk_key *)(kaddr + offset -
777 map_start);
778 }
779 ret = comp_keys(tmp, key);
780
781 if (ret < 0)
782 low = mid + 1;
783 else if (ret > 0)
784 high = mid;
785 else {
786 *slot = mid;
787 if (map_token)
788 unmap_extent_buffer(eb, map_token, KM_USER0);
789 return 0;
790 }
791 }
792 *slot = low;
793 if (map_token)
794 unmap_extent_buffer(eb, map_token, KM_USER0);
795 return 1;
796}
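/*
 * Example (hypothetical keys): searching [2, 4, 6] for 5 returns 1 with
 * *slot == 2, the insertion point; searching for 4 returns 0 with
 * *slot == 1.
 */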
797
798/*
799 * simple bin_search frontend that does the right thing for
800 * leaves vs nodes
801 */
802static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
803 int level, int *slot)
804{
805 if (level == 0) {
806 return generic_bin_search(eb,
807 offsetof(struct btrfs_leaf, items),
808 sizeof(struct btrfs_item),
809 key, btrfs_header_nritems(eb),
810 slot);
811 } else {
812 return generic_bin_search(eb,
813 offsetof(struct btrfs_node, ptrs),
814 sizeof(struct btrfs_key_ptr),
815 key, btrfs_header_nritems(eb),
816 slot);
817 }
818 	return -1;	/* not reached */
819}
820
821 /* given a node and slot number, this reads the block it points to.  The
822 * extent buffer is returned with a reference taken (but unlocked).
823 * NULL is returned on error.
824 */
825static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
826 struct extent_buffer *parent, int slot)
827{
828 int level = btrfs_header_level(parent);
829 if (slot < 0)
830 return NULL;
831 if (slot >= btrfs_header_nritems(parent))
832 return NULL;
833
834 BUG_ON(level == 0);
835
836 return read_tree_block(root, btrfs_node_blockptr(parent, slot),
837 btrfs_level_size(root, level - 1),
838 btrfs_node_ptr_generation(parent, slot));
839}
840
841/*
842 * node level balancing, used to make sure nodes are in proper order for
843 * item deletion. We balance from the top down, so we have to make sure
844  * that a deletion won't leave a node completely empty later on.
845 */
846static noinline int balance_level(struct btrfs_trans_handle *trans,
847 struct btrfs_root *root,
848 struct btrfs_path *path, int level)
849{
850 struct extent_buffer *right = NULL;
851 struct extent_buffer *mid;
852 struct extent_buffer *left = NULL;
853 struct extent_buffer *parent = NULL;
854 int ret = 0;
855 int wret;
856 int pslot;
857 int orig_slot = path->slots[level];
858 int err_on_enospc = 0;
859 u64 orig_ptr;
860
861 if (level == 0)
862 return 0;
863
864 mid = path->nodes[level];
865 WARN_ON(!path->locks[level]);
866 WARN_ON(btrfs_header_generation(mid) != trans->transid);
867
868 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
869
870 	if (level < BTRFS_MAX_LEVEL - 1)
871 		parent = path->nodes[level + 1];
872 	pslot = parent ? path->slots[level + 1] : 0; /* don't index past slots[] */
873
874 /*
875 * deal with the case where there is only one pointer in the root
876 * by promoting the node below to a root
877 */
878 if (!parent) {
879 struct extent_buffer *child;
880
881 if (btrfs_header_nritems(mid) != 1)
882 return 0;
883
884 /* promote the child to a root */
885 		child = read_node_slot(root, mid, 0);
886 		BUG_ON(!child);	/* check the read result before locking */
887 		btrfs_tree_lock(child);
888 ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
889 BUG_ON(ret);
890
891 spin_lock(&root->node_lock);
892 root->node = child;
893 spin_unlock(&root->node_lock);
894
895 ret = btrfs_update_extent_ref(trans, root, child->start,
896 mid->start, child->start,
897 root->root_key.objectid,
898 trans->transid, level - 1);
899 BUG_ON(ret);
900
901 add_root_to_dirty_list(root);
902 btrfs_tree_unlock(child);
903 path->locks[level] = 0;
904 path->nodes[level] = NULL;
905 clean_tree_block(trans, root, mid);
906 btrfs_tree_unlock(mid);
907 /* once for the path */
908 free_extent_buffer(mid);
909 ret = btrfs_free_extent(trans, root, mid->start, mid->len,
910 mid->start, root->root_key.objectid,
911 btrfs_header_generation(mid),
912 level, 1);
913 /* once for the root ptr */
914 free_extent_buffer(mid);
915 return ret;
916 }
917 if (btrfs_header_nritems(mid) >
918 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
919 return 0;
920
921 if (btrfs_header_nritems(mid) < 2)
922 err_on_enospc = 1;
923
924 left = read_node_slot(root, parent, pslot - 1);
925 if (left) {
926 btrfs_tree_lock(left);
927 wret = btrfs_cow_block(trans, root, left,
928 parent, pslot - 1, &left, 0);
929 if (wret) {
930 ret = wret;
931 goto enospc;
932 }
933 }
934 right = read_node_slot(root, parent, pslot + 1);
935 if (right) {
936 btrfs_tree_lock(right);
937 wret = btrfs_cow_block(trans, root, right,
938 parent, pslot + 1, &right, 0);
939 if (wret) {
940 ret = wret;
941 goto enospc;
942 }
943 }
944
945 /* first, try to make some room in the middle buffer */
946 if (left) {
947 orig_slot += btrfs_header_nritems(left);
948 wret = push_node_left(trans, root, left, mid, 1);
949 if (wret < 0)
950 ret = wret;
951 if (btrfs_header_nritems(mid) < 2)
952 err_on_enospc = 1;
953 }
954
955 /*
956 * then try to empty the right most buffer into the middle
957 */
958 if (right) {
959 wret = push_node_left(trans, root, mid, right, 1);
960 if (wret < 0 && wret != -ENOSPC)
961 ret = wret;
962 if (btrfs_header_nritems(right) == 0) {
963 u64 bytenr = right->start;
964 u64 generation = btrfs_header_generation(parent);
965 u32 blocksize = right->len;
966
967 clean_tree_block(trans, root, right);
968 btrfs_tree_unlock(right);
969 free_extent_buffer(right);
970 right = NULL;
971 wret = del_ptr(trans, root, path, level + 1, pslot +
972 1);
973 if (wret)
974 ret = wret;
975 wret = btrfs_free_extent(trans, root, bytenr,
976 blocksize, parent->start,
977 btrfs_header_owner(parent),
978 generation, level, 1);
979 if (wret)
980 ret = wret;
981 } else {
982 struct btrfs_disk_key right_key;
983 btrfs_node_key(right, &right_key, 0);
984 btrfs_set_node_key(parent, &right_key, pslot + 1);
985 btrfs_mark_buffer_dirty(parent);
986 }
987 }
988 if (btrfs_header_nritems(mid) == 1) {
989 /*
990 * we're not allowed to leave a node with one item in the
991 * tree during a delete. A deletion from lower in the tree
992 * could try to delete the only pointer in this node.
993 * So, pull some keys from the left.
994 * There has to be a left pointer at this point because
995 * otherwise we would have pulled some pointers from the
996 * right
997 */
998 BUG_ON(!left);
999 wret = balance_node_right(trans, root, mid, left);
1000 if (wret < 0) {
1001 ret = wret;
1002 goto enospc;
1003 }
1004 if (wret == 1) {
1005 wret = push_node_left(trans, root, left, mid, 1);
1006 if (wret < 0)
1007 ret = wret;
1008 }
1009 BUG_ON(wret == 1);
1010 }
1011 if (btrfs_header_nritems(mid) == 0) {
1012 /* we've managed to empty the middle node, drop it */
1013 u64 root_gen = btrfs_header_generation(parent);
1014 u64 bytenr = mid->start;
1015 u32 blocksize = mid->len;
1016
1017 clean_tree_block(trans, root, mid);
1018 btrfs_tree_unlock(mid);
1019 free_extent_buffer(mid);
1020 mid = NULL;
1021 wret = del_ptr(trans, root, path, level + 1, pslot);
1022 if (wret)
1023 ret = wret;
1024 wret = btrfs_free_extent(trans, root, bytenr, blocksize,
1025 parent->start,
1026 btrfs_header_owner(parent),
1027 root_gen, level, 1);
1028 if (wret)
1029 ret = wret;
1030 } else {
1031 /* update the parent key to reflect our changes */
1032 struct btrfs_disk_key mid_key;
1033 btrfs_node_key(mid, &mid_key, 0);
1034 btrfs_set_node_key(parent, &mid_key, pslot);
1035 btrfs_mark_buffer_dirty(parent);
1036 }
1037
1038 /* update the path */
1039 if (left) {
1040 if (btrfs_header_nritems(left) > orig_slot) {
1041 extent_buffer_get(left);
1042 /* left was locked after cow */
1043 path->nodes[level] = left;
1044 path->slots[level + 1] -= 1;
1045 path->slots[level] = orig_slot;
1046 if (mid) {
1047 btrfs_tree_unlock(mid);
1048 free_extent_buffer(mid);
1049 }
1050 } else {
1051 orig_slot -= btrfs_header_nritems(left);
1052 path->slots[level] = orig_slot;
1053 }
1054 }
1055 /* double check we haven't messed things up */
1056 check_block(root, path, level);
1057 if (orig_ptr !=
1058 btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1059 BUG();
1060enospc:
1061 if (right) {
1062 btrfs_tree_unlock(right);
1063 free_extent_buffer(right);
1064 }
1065 if (left) {
1066 if (path->nodes[level] != left)
1067 btrfs_tree_unlock(left);
1068 free_extent_buffer(left);
1069 }
1070 return ret;
1071}
1072
1073/* Node balancing for insertion. Here we only split or push nodes around
1074 * when they are completely full. This is also done top down, so we
1075 * have to be pessimistic.
1076 */
1077static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1078 struct btrfs_root *root,
1079 struct btrfs_path *path, int level)
1080{
1081 struct extent_buffer *right = NULL;
1082 struct extent_buffer *mid;
1083 struct extent_buffer *left = NULL;
1084 struct extent_buffer *parent = NULL;
1085 int ret = 0;
1086 int wret;
1087 int pslot;
1088 int orig_slot = path->slots[level];
1089 u64 orig_ptr;
1090
1091 if (level == 0)
1092 return 1;
1093
1094 mid = path->nodes[level];
1095 WARN_ON(btrfs_header_generation(mid) != trans->transid);
1096 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1097
1098 	if (level < BTRFS_MAX_LEVEL - 1)
1099 		parent = path->nodes[level + 1];
1100 	pslot = parent ? path->slots[level + 1] : 0; /* don't index past slots[] */
1101
1102 if (!parent)
1103 return 1;
1104
1105 left = read_node_slot(root, parent, pslot - 1);
1106
1107 /* first, try to make some room in the middle buffer */
1108 if (left) {
1109 u32 left_nr;
1110
1111 btrfs_tree_lock(left);
1112 left_nr = btrfs_header_nritems(left);
1113 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1114 wret = 1;
1115 } else {
1116 ret = btrfs_cow_block(trans, root, left, parent,
1117 pslot - 1, &left, 0);
1118 if (ret)
1119 wret = 1;
1120 else {
1121 wret = push_node_left(trans, root,
1122 left, mid, 0);
1123 }
1124 }
1125 if (wret < 0)
1126 ret = wret;
1127 if (wret == 0) {
1128 struct btrfs_disk_key disk_key;
1129 orig_slot += left_nr;
1130 btrfs_node_key(mid, &disk_key, 0);
1131 btrfs_set_node_key(parent, &disk_key, pslot);
1132 btrfs_mark_buffer_dirty(parent);
1133 if (btrfs_header_nritems(left) > orig_slot) {
1134 path->nodes[level] = left;
1135 path->slots[level + 1] -= 1;
1136 path->slots[level] = orig_slot;
1137 btrfs_tree_unlock(mid);
1138 free_extent_buffer(mid);
1139 } else {
1140 orig_slot -=
1141 btrfs_header_nritems(left);
1142 path->slots[level] = orig_slot;
1143 btrfs_tree_unlock(left);
1144 free_extent_buffer(left);
1145 }
1146 return 0;
1147 }
1148 btrfs_tree_unlock(left);
1149 free_extent_buffer(left);
1150 }
1151 right = read_node_slot(root, parent, pslot + 1);
1152
1153 /*
1154 * then try to empty the right most buffer into the middle
1155 */
1156 if (right) {
1157 u32 right_nr;
1158 btrfs_tree_lock(right);
1159 right_nr = btrfs_header_nritems(right);
1160 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
1161 wret = 1;
1162 } else {
1163 ret = btrfs_cow_block(trans, root, right,
1164 parent, pslot + 1,
1165 &right, 0);
1166 if (ret)
1167 wret = 1;
1168 else {
1169 wret = balance_node_right(trans, root,
1170 right, mid);
1171 }
1172 }
1173 if (wret < 0)
1174 ret = wret;
1175 if (wret == 0) {
1176 struct btrfs_disk_key disk_key;
1177
1178 btrfs_node_key(right, &disk_key, 0);
1179 btrfs_set_node_key(parent, &disk_key, pslot + 1);
1180 btrfs_mark_buffer_dirty(parent);
1181
1182 if (btrfs_header_nritems(mid) <= orig_slot) {
1183 path->nodes[level] = right;
1184 path->slots[level + 1] += 1;
1185 path->slots[level] = orig_slot -
1186 btrfs_header_nritems(mid);
1187 btrfs_tree_unlock(mid);
1188 free_extent_buffer(mid);
1189 } else {
1190 btrfs_tree_unlock(right);
1191 free_extent_buffer(right);
1192 }
1193 return 0;
1194 }
1195 btrfs_tree_unlock(right);
1196 free_extent_buffer(right);
1197 }
1198 return 1;
1199}
1200
1201/*
1202 * readahead one full node of leaves, finding things that are close
1203  * to the block in 'slot', and triggering readahead on them.
1204 */
1205static noinline void reada_for_search(struct btrfs_root *root,
1206 struct btrfs_path *path,
1207 int level, int slot, u64 objectid)
1208{
1209 struct extent_buffer *node;
1210 struct btrfs_disk_key disk_key;
1211 u32 nritems;
1212 u64 search;
1213 u64 lowest_read;
1214 u64 highest_read;
1215 u64 nread = 0;
1216 int direction = path->reada;
1217 struct extent_buffer *eb;
1218 u32 nr;
1219 u32 blocksize;
1220 u32 nscan = 0;
1221
1222 if (level != 1)
1223 return;
1224
1225 if (!path->nodes[level])
1226 return;
1227
1228 node = path->nodes[level];
1229
1230 search = btrfs_node_blockptr(node, slot);
1231 blocksize = btrfs_level_size(root, level - 1);
1232 eb = btrfs_find_tree_block(root, search, blocksize);
1233 if (eb) {
1234 free_extent_buffer(eb);
1235 return;
1236 }
1237
1238 highest_read = search;
1239 lowest_read = search;
1240
1241 nritems = btrfs_header_nritems(node);
1242 nr = slot;
1243 while (1) {
1244 if (direction < 0) {
1245 if (nr == 0)
1246 break;
1247 nr--;
1248 } else if (direction > 0) {
1249 nr++;
1250 if (nr >= nritems)
1251 break;
1252 }
1253 if (path->reada < 0 && objectid) {
1254 btrfs_node_key(node, &disk_key, nr);
1255 if (btrfs_disk_key_objectid(&disk_key) != objectid)
1256 break;
1257 }
1258 search = btrfs_node_blockptr(node, nr);
1259 if ((search >= lowest_read && search <= highest_read) ||
1260 (search < lowest_read && lowest_read - search <= 16384) ||
1261 (search > highest_read && search - highest_read <= 16384)) {
1262 readahead_tree_block(root, search, blocksize,
1263 btrfs_node_ptr_generation(node, nr));
1264 nread += blocksize;
1265 }
1266 nscan++;
1267 if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32))
1268 break;
1269
1270 if (nread > (256 * 1024) || nscan > 128)
1271 break;
1272
1273 if (search < lowest_read)
1274 lowest_read = search;
1275 if (search > highest_read)
1276 highest_read = search;
1277 }
1278}
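/*
 * Illustrative effect of the limits above: with path->reada == 1 the
 * scan stops once 64K has been read ahead or 32 slots inspected; only
 * more aggressive settings (reada >= 2) continue to the hard limits of
 * 256K and 128 slots.
 */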
1279
1280/*
1281 * when we walk down the tree, it is usually safe to unlock the higher layers
1282 * in the tree. The exceptions are when our path goes through slot 0, because
1283 * operations on the tree might require changing key pointers higher up in the
1284 * tree.
1285 *
1286 * callers might also have set path->keep_locks, which tells this code to keep
1287 * the lock if the path points to the last slot in the block. This is part of
1288 * walking through the tree, and selecting the next slot in the higher block.
1289 *
1290 * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so
1291 * if lowest_unlock is 1, level 0 won't be unlocked
1292 */
1293static noinline void unlock_up(struct btrfs_path *path, int level,
1294 int lowest_unlock)
1295{
1296 int i;
1297 int skip_level = level;
1298 int no_skips = 0;
1299 struct extent_buffer *t;
1300
1301 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1302 if (!path->nodes[i])
1303 break;
1304 if (!path->locks[i])
1305 break;
1306 if (!no_skips && path->slots[i] == 0) {
1307 skip_level = i + 1;
1308 continue;
1309 }
1310 if (!no_skips && path->keep_locks) {
1311 u32 nritems;
1312 t = path->nodes[i];
1313 nritems = btrfs_header_nritems(t);
1314 if (nritems < 1 || path->slots[i] >= nritems - 1) {
1315 skip_level = i + 1;
1316 continue;
1317 }
1318 }
1319 if (skip_level < i && i >= lowest_unlock)
1320 no_skips = 1;
1321
1322 t = path->nodes[i];
1323 if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
1324 btrfs_tree_unlock(t);
1325 path->locks[i] = 0;
1326 }
1327 }
1328}
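/*
 * Example (illustrative): if the search lands in slot 0 of a leaf,
 * inserting a new smallest key would have to update the key pointer in
 * the parent, so the parent (and any further slot-0 ancestors) stays
 * locked; with a leaf slot greater than 0, everything above
 * lowest_unlock can be released.
 */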
1329
1330/*
1331  * look for key in the tree.  path is filled in with nodes along the way.
1332 * if key is found, we return zero and you can find the item in the leaf
1333 * level of the path (level 0)
1334 *
1335 * If the key isn't found, the path points to the slot where it should
1336 * be inserted, and 1 is returned. If there are other errors during the
1337 * search a negative error number is returned.
1338 *
1339 * if ins_len > 0, nodes and leaves will be split as we walk down the
1340 * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if
1341 * possible)
1342 */
1343int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1344 *root, struct btrfs_key *key, struct btrfs_path *p, int
1345 ins_len, int cow)
1346{
1347 struct extent_buffer *b;
1348 struct extent_buffer *tmp;
1349 int slot;
1350 int ret;
1351 int level;
1352 int should_reada = p->reada;
1353 int lowest_unlock = 1;
1354 int blocksize;
1355 u8 lowest_level = 0;
1356 u64 blocknr;
1357 u64 gen;
1358 struct btrfs_key prealloc_block;
1359
1360 lowest_level = p->lowest_level;
1361 WARN_ON(lowest_level && ins_len > 0);
1362 WARN_ON(p->nodes[0] != NULL);
1363
1364 if (ins_len < 0)
1365 lowest_unlock = 2;
1366
1367 prealloc_block.objectid = 0;
1368
1369again:
1370 if (p->skip_locking)
1371 b = btrfs_root_node(root);
1372 else
1373 b = btrfs_lock_root_node(root);
1374
1375 while (b) {
1376 level = btrfs_header_level(b);
1377
1378 /*
1379 * setup the path here so we can release it under lock
1380 * contention with the cow code
1381 */
1382 p->nodes[level] = b;
1383 if (!p->skip_locking)
1384 p->locks[level] = 1;
1385
1386 if (cow) {
1387 int wret;
1388
1389 			/* check whether we can skip the cow for this block */
1390 spin_lock(&root->fs_info->hash_lock);
1391 if (btrfs_header_generation(b) == trans->transid &&
1392 btrfs_header_owner(b) == root->root_key.objectid &&
1393 !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
1394 spin_unlock(&root->fs_info->hash_lock);
1395 goto cow_done;
1396 }
1397 spin_unlock(&root->fs_info->hash_lock);
1398
1399 /* ok, we have to cow, is our old prealloc the right
1400 * size?
1401 */
1402 if (prealloc_block.objectid &&
1403 prealloc_block.offset != b->len) {
1404 btrfs_free_reserved_extent(root,
1405 prealloc_block.objectid,
1406 prealloc_block.offset);
1407 prealloc_block.objectid = 0;
1408 }
1409
1410 /*
1411 * for higher level blocks, try not to allocate blocks
1412 * with the block and the parent locks held.
1413 */
1414 if (level > 1 && !prealloc_block.objectid &&
1415 btrfs_path_lock_waiting(p, level)) {
1416 u32 size = b->len;
1417 u64 hint = b->start;
1418
1419 btrfs_release_path(root, p);
1420 ret = btrfs_reserve_extent(trans, root,
1421 size, size, 0,
1422 hint, (u64)-1,
1423 &prealloc_block, 0);
1424 BUG_ON(ret);
1425 goto again;
1426 }
1427
1428 wret = btrfs_cow_block(trans, root, b,
1429 p->nodes[level + 1],
1430 p->slots[level + 1],
1431 &b, prealloc_block.objectid);
1432 prealloc_block.objectid = 0;
1433 if (wret) {
1434 free_extent_buffer(b);
1435 ret = wret;
1436 goto done;
1437 }
1438 }
1439cow_done:
1440 BUG_ON(!cow && ins_len);
1441 if (level != btrfs_header_level(b))
1442 WARN_ON(1);
1443 level = btrfs_header_level(b);
1444
1445 p->nodes[level] = b;
1446 if (!p->skip_locking)
1447 p->locks[level] = 1;
1448
1449 ret = check_block(root, p, level);
1450 if (ret) {
1451 ret = -1;
1452 goto done;
1453 }
1454
1455 ret = bin_search(b, key, level, &slot);
1456 if (level != 0) {
1457 if (ret && slot > 0)
1458 slot -= 1;
1459 p->slots[level] = slot;
1460 if ((p->search_for_split || ins_len > 0) &&
1461 btrfs_header_nritems(b) >=
1462 BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1463 int sret = split_node(trans, root, p, level);
1464 BUG_ON(sret > 0);
1465 if (sret) {
1466 ret = sret;
1467 goto done;
1468 }
1469 b = p->nodes[level];
1470 slot = p->slots[level];
1471 } else if (ins_len < 0) {
1472 int sret = balance_level(trans, root, p,
1473 level);
1474 if (sret) {
1475 ret = sret;
1476 goto done;
1477 }
1478 b = p->nodes[level];
1479 if (!b) {
1480 btrfs_release_path(NULL, p);
1481 goto again;
1482 }
1483 slot = p->slots[level];
1484 BUG_ON(btrfs_header_nritems(b) == 1);
1485 }
1486 unlock_up(p, level, lowest_unlock);
1487
1488 /* this is only true while dropping a snapshot */
1489 if (level == lowest_level) {
1490 ret = 0;
1491 goto done;
1492 }
1493
1494 blocknr = btrfs_node_blockptr(b, slot);
1495 gen = btrfs_node_ptr_generation(b, slot);
1496 blocksize = btrfs_level_size(root, level - 1);
1497
1498 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1499 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1500 b = tmp;
1501 } else {
1502 /*
1503 * reduce lock contention at high levels
1504 * of the btree by dropping locks before
1505 * we read.
1506 */
1507 if (level > 1) {
1508 btrfs_release_path(NULL, p);
1509 if (tmp)
1510 free_extent_buffer(tmp);
1511 if (should_reada)
1512 reada_for_search(root, p,
1513 level, slot,
1514 key->objectid);
1515
1516 tmp = read_tree_block(root, blocknr,
1517 blocksize, gen);
1518 if (tmp)
1519 free_extent_buffer(tmp);
1520 goto again;
1521 } else {
1522 if (tmp)
1523 free_extent_buffer(tmp);
1524 if (should_reada)
1525 reada_for_search(root, p,
1526 level, slot,
1527 key->objectid);
1528 b = read_node_slot(root, b, slot);
1529 }
1530 }
1531 if (!p->skip_locking)
1532 btrfs_tree_lock(b);
1533 } else {
1534 p->slots[level] = slot;
1535 if (ins_len > 0 &&
1536 btrfs_leaf_free_space(root, b) < ins_len) {
1537 int sret = split_leaf(trans, root, key,
1538 p, ins_len, ret == 0);
1539 BUG_ON(sret > 0);
1540 if (sret) {
1541 ret = sret;
1542 goto done;
1543 }
1544 }
1545 if (!p->search_for_split)
1546 unlock_up(p, level, lowest_unlock);
1547 goto done;
1548 }
1549 }
1550 ret = 1;
1551done:
1552 if (prealloc_block.objectid) {
1553 btrfs_free_reserved_extent(root,
1554 prealloc_block.objectid,
1555 prealloc_block.offset);
1556 }
1557
1558 return ret;
1559}
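/*
 * Typical caller pattern, as an illustrative sketch only (inode_objectid
 * is a made-up variable, not something defined in this file):
 *
 *	struct btrfs_path *path = btrfs_alloc_path();
 *	struct btrfs_key key;
 *	int ret;
 *
 *	key.objectid = inode_objectid;
 *	key.type = BTRFS_INODE_ITEM_KEY;
 *	key.offset = 0;
 *	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 *
 * ret == 0 means the item is at path->nodes[0], slot path->slots[0];
 * ret == 1 means path points at the slot where the key would be
 * inserted.  Either way the caller releases the path with
 * btrfs_free_path() when done.
 */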
1560
1561int btrfs_merge_path(struct btrfs_trans_handle *trans,
1562 struct btrfs_root *root,
1563 struct btrfs_key *node_keys,
1564 u64 *nodes, int lowest_level)
1565{
1566 struct extent_buffer *eb;
1567 struct extent_buffer *parent;
1568 struct btrfs_key key;
1569 u64 bytenr;
1570 u64 generation;
1571 u32 blocksize;
1572 int level;
1573 int slot;
1574 int key_match;
1575 int ret;
1576
1577 eb = btrfs_lock_root_node(root);
1578 ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
1579 BUG_ON(ret);
1580
1581 parent = eb;
1582 while (1) {
1583 level = btrfs_header_level(parent);
1584 if (level == 0 || level <= lowest_level)
1585 break;
1586
1587 ret = bin_search(parent, &node_keys[lowest_level], level,
1588 &slot);
1589 if (ret && slot > 0)
1590 slot--;
1591
1592 bytenr = btrfs_node_blockptr(parent, slot);
1593 if (nodes[level - 1] == bytenr)
1594 break;
1595
1596 blocksize = btrfs_level_size(root, level - 1);
1597 generation = btrfs_node_ptr_generation(parent, slot);
1598 btrfs_node_key_to_cpu(eb, &key, slot);
1599 key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));
1600
1601 if (generation == trans->transid) {
1602 eb = read_tree_block(root, bytenr, blocksize,
1603 generation);
1604 btrfs_tree_lock(eb);
1605 }
1606
1607 /*
1608 * if node keys match and node pointer hasn't been modified
1609 * in the running transaction, we can merge the path. for
1610 		 * blocks owned by reloc trees, the node pointer check is
1611 * skipped, this is because these blocks are fully controlled
1612 * by the space balance code, no one else can modify them.
1613 */
1614 if (!nodes[level - 1] || !key_match ||
1615 (generation == trans->transid &&
1616 btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) {
1617 if (level == 1 || level == lowest_level + 1) {
1618 if (generation == trans->transid) {
1619 btrfs_tree_unlock(eb);
1620 free_extent_buffer(eb);
1621 }
1622 break;
1623 }
1624
1625 if (generation != trans->transid) {
1626 eb = read_tree_block(root, bytenr, blocksize,
1627 generation);
1628 btrfs_tree_lock(eb);
1629 }
1630
1631 ret = btrfs_cow_block(trans, root, eb, parent, slot,
1632 &eb, 0);
1633 BUG_ON(ret);
1634
1635 if (root->root_key.objectid ==
1636 BTRFS_TREE_RELOC_OBJECTID) {
1637 if (!nodes[level - 1]) {
1638 nodes[level - 1] = eb->start;
1639 memcpy(&node_keys[level - 1], &key,
1640 sizeof(node_keys[0]));
1641 } else {
1642 WARN_ON(1);
1643 }
1644 }
1645
1646 btrfs_tree_unlock(parent);
1647 free_extent_buffer(parent);
1648 parent = eb;
1649 continue;
1650 }
1651
1652 btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
1653 btrfs_set_node_ptr_generation(parent, slot, trans->transid);
1654 btrfs_mark_buffer_dirty(parent);
1655
1656 ret = btrfs_inc_extent_ref(trans, root,
1657 nodes[level - 1],
1658 blocksize, parent->start,
1659 btrfs_header_owner(parent),
1660 btrfs_header_generation(parent),
1661 level - 1);
1662 BUG_ON(ret);
1663
1664 /*
1665 * If the block was created in the running transaction,
1666 * it's possible this is the last reference to it, so we
1667 * should drop the subtree.
1668 */
1669 if (generation == trans->transid) {
1670 ret = btrfs_drop_subtree(trans, root, eb, parent);
1671 BUG_ON(ret);
1672 btrfs_tree_unlock(eb);
1673 free_extent_buffer(eb);
1674 } else {
1675 ret = btrfs_free_extent(trans, root, bytenr,
1676 blocksize, parent->start,
1677 btrfs_header_owner(parent),
1678 btrfs_header_generation(parent),
1679 level - 1, 1);
1680 BUG_ON(ret);
1681 }
1682 break;
1683 }
1684 btrfs_tree_unlock(parent);
1685 free_extent_buffer(parent);
1686 return 0;
1687}
1688
1689/*
1690 * adjust the pointers going up the tree, starting at level
1691  * making sure the right key of each node points to 'key'.
1692 * This is used after shifting pointers to the left, so it stops
1693 * fixing up pointers when a given leaf/node is not in slot 0 of the
1694 * higher levels
1695 *
1696 * If this fails to write a tree block, it returns -1, but continues
1697 * fixing up the blocks in ram so the tree is consistent.
1698 */
1699static int fixup_low_keys(struct btrfs_trans_handle *trans,
1700 struct btrfs_root *root, struct btrfs_path *path,
1701 struct btrfs_disk_key *key, int level)
1702{
1703 int i;
1704 int ret = 0;
1705 struct extent_buffer *t;
1706
1707 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1708 int tslot = path->slots[i];
1709 if (!path->nodes[i])
1710 break;
1711 t = path->nodes[i];
1712 btrfs_set_node_key(t, key, tslot);
1713 btrfs_mark_buffer_dirty(path->nodes[i]);
1714 if (tslot != 0)
1715 break;
1716 }
1717 return ret;
1718}
1719
1720/*
1721 * update item key.
1722 *
1723  * This function isn't completely safe.  It's the caller's responsibility
1724  * to ensure that the new key won't break the sort order
1725 */
1726int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1727 struct btrfs_root *root, struct btrfs_path *path,
1728 struct btrfs_key *new_key)
1729{
1730 struct btrfs_disk_key disk_key;
1731 struct extent_buffer *eb;
1732 int slot;
1733
1734 eb = path->nodes[0];
1735 slot = path->slots[0];
1736 if (slot > 0) {
1737 btrfs_item_key(eb, &disk_key, slot - 1);
1738 if (comp_keys(&disk_key, new_key) >= 0)
1739 return -1;
1740 }
1741 if (slot < btrfs_header_nritems(eb) - 1) {
1742 btrfs_item_key(eb, &disk_key, slot + 1);
1743 if (comp_keys(&disk_key, new_key) <= 0)
1744 return -1;
1745 }
1746
1747 btrfs_cpu_key_to_disk(&disk_key, new_key);
1748 btrfs_set_item_key(eb, &disk_key, slot);
1749 btrfs_mark_buffer_dirty(eb);
1750 if (slot == 0)
1751 fixup_low_keys(trans, root, path, &disk_key, 1);
1752 return 0;
1753}
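/*
 * Example (made-up keys): in a leaf holding (5, 1, 0), (5, 1, 4096) and
 * (5, 1, 8192) at slots 0-2, the slot 1 key may be changed to anything
 * strictly between its neighbours, e.g. (5, 1, 6000); (5, 1, 9000) would
 * break the ordering and is rejected with -1.
 */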
1754
1755/*
1756 * try to push data from one node into the next node left in the
1757 * tree.
1758 *
1759 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
1760 * error, and > 0 if there was no room in the left hand block.
1761 */
1762static int push_node_left(struct btrfs_trans_handle *trans,
1763 struct btrfs_root *root, struct extent_buffer *dst,
1764 struct extent_buffer *src, int empty)
1765{
1766 int push_items = 0;
1767 int src_nritems;
1768 int dst_nritems;
1769 int ret = 0;
1770
1771 src_nritems = btrfs_header_nritems(src);
1772 dst_nritems = btrfs_header_nritems(dst);
1773 push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1774 WARN_ON(btrfs_header_generation(src) != trans->transid);
1775 WARN_ON(btrfs_header_generation(dst) != trans->transid);
1776
1777 if (!empty && src_nritems <= 8)
1778 return 1;
1779
1780 if (push_items <= 0)
1781 return 1;
1782
1783 if (empty) {
1784 push_items = min(src_nritems, push_items);
1785 if (push_items < src_nritems) {
1786 /* leave at least 8 pointers in the node if
1787 * we aren't going to empty it
1788 */
1789 if (src_nritems - push_items < 8) {
1790 if (push_items <= 8)
1791 return 1;
1792 push_items -= 8;
1793 }
1794 }
1795 } else
1796 push_items = min(src_nritems - 8, push_items);
1797
1798 copy_extent_buffer(dst, src,
1799 btrfs_node_key_ptr_offset(dst_nritems),
1800 btrfs_node_key_ptr_offset(0),
1801 push_items * sizeof(struct btrfs_key_ptr));
1802
1803 if (push_items < src_nritems) {
1804 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
1805 btrfs_node_key_ptr_offset(push_items),
1806 (src_nritems - push_items) *
1807 sizeof(struct btrfs_key_ptr));
1808 }
1809 btrfs_set_header_nritems(src, src_nritems - push_items);
1810 btrfs_set_header_nritems(dst, dst_nritems + push_items);
1811 btrfs_mark_buffer_dirty(src);
1812 btrfs_mark_buffer_dirty(dst);
1813
1814 ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
1815 BUG_ON(ret);
1816
1817 return ret;
1818}
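/*
 * Worked example (hypothetical sizes): if BTRFS_NODEPTRS_PER_BLOCK(root)
 * is 121, dst holds 100 pointers and src holds 30, push_items starts at
 * 21; the non-empty case caps it at src_nritems - 8 == 22, so 21
 * pointers move left and 9 remain in src.
 */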
1819
1820/*
1821 * try to push data from one node into the next node right in the
1822 * tree.
1823 *
1824 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
1825 * error, and > 0 if there was no room in the right hand block.
1826 *
1827 * this will only push up to 1/2 the contents of the left node over
1828 */
1829static int balance_node_right(struct btrfs_trans_handle *trans,
1830 struct btrfs_root *root,
1831 struct extent_buffer *dst,
1832 struct extent_buffer *src)
1833{
1834 int push_items = 0;
1835 int max_push;
1836 int src_nritems;
1837 int dst_nritems;
1838 int ret = 0;
1839
1840 WARN_ON(btrfs_header_generation(src) != trans->transid);
1841 WARN_ON(btrfs_header_generation(dst) != trans->transid);
1842
1843 src_nritems = btrfs_header_nritems(src);
1844 dst_nritems = btrfs_header_nritems(dst);
1845 push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1846 if (push_items <= 0)
1847 return 1;
1848
1849 if (src_nritems < 4)
1850 return 1;
1851
1852 max_push = src_nritems / 2 + 1;
1853 /* don't try to empty the node */
1854 if (max_push >= src_nritems)
1855 return 1;
1856
1857 if (max_push < push_items)
1858 push_items = max_push;
1859
1860 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
1861 btrfs_node_key_ptr_offset(0),
1862 (dst_nritems) *
1863 sizeof(struct btrfs_key_ptr));
1864
1865 copy_extent_buffer(dst, src,
1866 btrfs_node_key_ptr_offset(0),
1867 btrfs_node_key_ptr_offset(src_nritems - push_items),
1868 push_items * sizeof(struct btrfs_key_ptr));
1869
1870 btrfs_set_header_nritems(src, src_nritems - push_items);
1871 btrfs_set_header_nritems(dst, dst_nritems + push_items);
1872
1873 btrfs_mark_buffer_dirty(src);
1874 btrfs_mark_buffer_dirty(dst);
1875
1876 ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
1877 BUG_ON(ret);
1878
1879 return ret;
1880}
1881
1882/*
1883 * helper function to insert a new root level in the tree.
1884 * A new node is allocated, and a single item is inserted to
1885 * point to the existing root
1886 *
1887 * returns zero on success or < 0 on failure.
1888 */
1889static noinline int insert_new_root(struct btrfs_trans_handle *trans,
1890 struct btrfs_root *root,
1891 struct btrfs_path *path, int level)
1892{
1893 u64 lower_gen;
1894 struct extent_buffer *lower;
1895 struct extent_buffer *c;
1896 struct extent_buffer *old;
1897 struct btrfs_disk_key lower_key;
1898 int ret;
1899
1900 BUG_ON(path->nodes[level]);
1901 BUG_ON(path->nodes[level-1] != root->node);
1902
1903 lower = path->nodes[level-1];
1904 if (level == 1)
1905 btrfs_item_key(lower, &lower_key, 0);
1906 else
1907 btrfs_node_key(lower, &lower_key, 0);
1908
1909 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
1910 root->root_key.objectid, trans->transid,
1911 level, root->node->start, 0);
1912 if (IS_ERR(c))
1913 return PTR_ERR(c);
1914
1915 memset_extent_buffer(c, 0, 0, root->nodesize);
1916 btrfs_set_header_nritems(c, 1);
1917 btrfs_set_header_level(c, level);
1918 btrfs_set_header_bytenr(c, c->start);
1919 btrfs_set_header_generation(c, trans->transid);
1920 btrfs_set_header_owner(c, root->root_key.objectid);
1921
1922 write_extent_buffer(c, root->fs_info->fsid,
1923 (unsigned long)btrfs_header_fsid(c),
1924 BTRFS_FSID_SIZE);
1925
1926 write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
1927 (unsigned long)btrfs_header_chunk_tree_uuid(c),
1928 BTRFS_UUID_SIZE);
1929
1930 btrfs_set_node_key(c, &lower_key, 0);
1931 btrfs_set_node_blockptr(c, 0, lower->start);
1932 lower_gen = btrfs_header_generation(lower);
1933 WARN_ON(lower_gen != trans->transid);
1934
1935 btrfs_set_node_ptr_generation(c, 0, lower_gen);
1936
1937 btrfs_mark_buffer_dirty(c);
1938
1939 spin_lock(&root->node_lock);
1940 old = root->node;
1941 root->node = c;
1942 spin_unlock(&root->node_lock);
1943
1944 ret = btrfs_update_extent_ref(trans, root, lower->start,
1945 lower->start, c->start,
1946 root->root_key.objectid,
1947 trans->transid, level - 1);
1948 BUG_ON(ret);
1949
1950 /* the super has an extra ref to root->node */
1951 free_extent_buffer(old);
1952
1953 add_root_to_dirty_list(root);
1954 extent_buffer_get(c);
1955 path->nodes[level] = c;
1956 path->locks[level] = 1;
1957 path->slots[level] = 0;
1958 return 0;
1959}
1960
1961/*
1962 * worker function to insert a single pointer in a node.
1963 * the node should have enough room for the pointer already
1964 *
1965 * slot and level indicate where you want the key to go, and
1966 * blocknr is the block the key points to.
1967 *
1968 * returns zero on success and < 0 on any error
1969 */
1970static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
1971 *root, struct btrfs_path *path, struct btrfs_disk_key
1972 *key, u64 bytenr, int slot, int level)
1973{
1974 struct extent_buffer *lower;
1975 int nritems;
1976
1977 BUG_ON(!path->nodes[level]);
1978 lower = path->nodes[level];
1979 nritems = btrfs_header_nritems(lower);
1980 if (slot > nritems)
1981 BUG();
1982 if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
1983 BUG();
1984 if (slot != nritems) {
1985 memmove_extent_buffer(lower,
1986 btrfs_node_key_ptr_offset(slot + 1),
1987 btrfs_node_key_ptr_offset(slot),
1988 (nritems - slot) * sizeof(struct btrfs_key_ptr));
1989 }
1990 btrfs_set_node_key(lower, key, slot);
1991 btrfs_set_node_blockptr(lower, slot, bytenr);
1992 WARN_ON(trans->transid == 0);
1993 btrfs_set_node_ptr_generation(lower, slot, trans->transid);
1994 btrfs_set_header_nritems(lower, nritems + 1);
1995 btrfs_mark_buffer_dirty(lower);
1996 return 0;
1997}
1998
1999/*
2000 * split the node at the specified level in path in two.
2001 * The path is corrected to point to the appropriate node after the split
2002 *
2003 * Before splitting this tries to make some room in the node by pushing
2004 * left and right, if either one works, it returns right away.
2005 *
2006 * returns 0 on success and < 0 on failure
2007 */
2008static noinline int split_node(struct btrfs_trans_handle *trans,
2009 struct btrfs_root *root,
2010 struct btrfs_path *path, int level)
2011{
2012 struct extent_buffer *c;
2013 struct extent_buffer *split;
2014 struct btrfs_disk_key disk_key;
2015 int mid;
2016 int ret;
2017 int wret;
2018 u32 c_nritems;
2019
2020 c = path->nodes[level];
2021 WARN_ON(btrfs_header_generation(c) != trans->transid);
2022 if (c == root->node) {
2023 		/* trying to split the root, let's make a new one */
2024 ret = insert_new_root(trans, root, path, level + 1);
2025 if (ret)
2026 return ret;
2027 } else {
2028 ret = push_nodes_for_insert(trans, root, path, level);
2029 c = path->nodes[level];
2030 if (!ret && btrfs_header_nritems(c) <
2031 BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
2032 return 0;
2033 if (ret < 0)
2034 return ret;
2035 }
2036
2037 c_nritems = btrfs_header_nritems(c);
2038
2039 split = btrfs_alloc_free_block(trans, root, root->nodesize,
2040 path->nodes[level + 1]->start,
2041 root->root_key.objectid,
2042 trans->transid, level, c->start, 0);
2043 if (IS_ERR(split))
2044 return PTR_ERR(split);
2045
2046 btrfs_set_header_flags(split, btrfs_header_flags(c));
2047 btrfs_set_header_level(split, btrfs_header_level(c));
2048 btrfs_set_header_bytenr(split, split->start);
2049 btrfs_set_header_generation(split, trans->transid);
2050 btrfs_set_header_owner(split, root->root_key.objectid);
2051 btrfs_set_header_flags(split, 0);
2052 write_extent_buffer(split, root->fs_info->fsid,
2053 (unsigned long)btrfs_header_fsid(split),
2054 BTRFS_FSID_SIZE);
2055 write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
2056 (unsigned long)btrfs_header_chunk_tree_uuid(split),
2057 BTRFS_UUID_SIZE);
2058
2059 mid = (c_nritems + 1) / 2;
2060
2061 copy_extent_buffer(split, c,
2062 btrfs_node_key_ptr_offset(0),
2063 btrfs_node_key_ptr_offset(mid),
2064 (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2065 btrfs_set_header_nritems(split, c_nritems - mid);
2066 btrfs_set_header_nritems(c, mid);
2067 ret = 0;
2068
2069 btrfs_mark_buffer_dirty(c);
2070 btrfs_mark_buffer_dirty(split);
2071
2072 btrfs_node_key(split, &disk_key, 0);
2073 wret = insert_ptr(trans, root, path, &disk_key, split->start,
2074 path->slots[level + 1] + 1,
2075 level + 1);
2076 if (wret)
2077 ret = wret;
2078
2079 ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid);
2080 BUG_ON(ret);
2081
2082 if (path->slots[level] >= mid) {
2083 path->slots[level] -= mid;
2084 btrfs_tree_unlock(c);
2085 free_extent_buffer(c);
2086 path->nodes[level] = split;
2087 path->slots[level + 1] += 1;
2088 } else {
2089 btrfs_tree_unlock(split);
2090 free_extent_buffer(split);
2091 }
2092 return ret;
2093}
2094
2095/*
2096 * how many bytes are required to store the items in a leaf. start
2097 * and nr indicate which items in the leaf to check. This totals up the
2098 * space used both by the item structs and the item data
2099 */
2100static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2101{
2102 int data_len;
2103 int nritems = btrfs_header_nritems(l);
2104 int end = min(nritems, start + nr) - 1;
2105
2106 if (!nr)
2107 return 0;
2108 data_len = btrfs_item_end_nr(l, start);
2109 data_len = data_len - btrfs_item_offset_nr(l, end);
2110 data_len += sizeof(struct btrfs_item) * nr;
2111 WARN_ON(data_len < 0);
2112 return data_len;
2113}
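/*
 * Worked example (hypothetical leaf): with a 4096 byte data area and two
 * items of 100 and 50 data bytes, leaf_space_used(l, 0, 2) is
 * (4096 - 3946) + 2 * sizeof(struct btrfs_item) (25 bytes each in this
 * format) = 150 + 50 = 200, so btrfs_leaf_free_space() below would
 * report 4096 - 200 = 3896 bytes.
 */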
2114
2115/*
2116 * The space between the end of the leaf items and
2117 * the start of the leaf data. IOW, how much room
2118 * the leaf has left for both items and data
2119 */
2120noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2121 struct extent_buffer *leaf)
2122{
2123 int nritems = btrfs_header_nritems(leaf);
2124 int ret;
2125 ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
2126 if (ret < 0) {
2127 printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
2128 "used %d nritems %d\n",
2129 ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
2130 leaf_space_used(leaf, 0, nritems), nritems);
2131 }
2132 return ret;
2133}
2134
2135/*
2136 * push some data in the path leaf to the right, trying to free up at
2137  * least data_size bytes.
2138 *
2139 * returns 1 if the push failed because the other node didn't have enough
2140 * room, 0 if everything worked out and < 0 if there were major errors.
2141 */
2142static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2143 *root, struct btrfs_path *path, int data_size,
2144 int empty)
2145{
2146 struct extent_buffer *left = path->nodes[0];
2147 struct extent_buffer *right;
2148 struct extent_buffer *upper;
2149 struct btrfs_disk_key disk_key;
2150 int slot;
2151 u32 i;
2152 int free_space;
2153 int push_space = 0;
2154 int push_items = 0;
2155 struct btrfs_item *item;
2156 u32 left_nritems;
2157 u32 nr;
2158 u32 right_nritems;
2159 u32 data_end;
2160 u32 this_item_size;
2161 int ret;
2162
2163 slot = path->slots[1];
2164 if (!path->nodes[1])
2165 return 1;
2166
2167 upper = path->nodes[1];
2168 if (slot >= btrfs_header_nritems(upper) - 1)
2169 return 1;
2170
2171 WARN_ON(!btrfs_tree_locked(path->nodes[1]));
2172
2173 right = read_node_slot(root, upper, slot + 1);
2174 btrfs_tree_lock(right);
2175 free_space = btrfs_leaf_free_space(root, right);
2176 if (free_space < data_size)
2177 goto out_unlock;
2178
2179 /* cow and double check */
2180 ret = btrfs_cow_block(trans, root, right, upper,
2181 slot + 1, &right, 0);
2182 if (ret)
2183 goto out_unlock;
2184
2185 free_space = btrfs_leaf_free_space(root, right);
2186 if (free_space < data_size)
2187 goto out_unlock;
2188
2189 left_nritems = btrfs_header_nritems(left);
2190 if (left_nritems == 0)
2191 goto out_unlock;
2192
2193 if (empty)
2194 nr = 0;
2195 else
2196 nr = 1;
2197
2198 if (path->slots[0] >= left_nritems)
2199 push_space += data_size;
2200
2201 i = left_nritems - 1;
2202 while (i >= nr) {
2203 item = btrfs_item_nr(left, i);
2204
2205 if (!empty && push_items > 0) {
2206 if (path->slots[0] > i)
2207 break;
2208 if (path->slots[0] == i) {
2209 int space = btrfs_leaf_free_space(root, left);
2210 if (space + push_space * 2 > free_space)
2211 break;
2212 }
2213 }
2214
2215 if (path->slots[0] == i)
2216 push_space += data_size;
2217
2218 if (!left->map_token) {
2219 map_extent_buffer(left, (unsigned long)item,
2220 sizeof(struct btrfs_item),
2221 &left->map_token, &left->kaddr,
2222 &left->map_start, &left->map_len,
2223 KM_USER1);
2224 }
2225
2226 this_item_size = btrfs_item_size(left, item);
2227 if (this_item_size + sizeof(*item) + push_space > free_space)
2228 break;
2229
2230 push_items++;
2231 push_space += this_item_size + sizeof(*item);
2232 if (i == 0)
2233 break;
2234 i--;
2235 }
2236 if (left->map_token) {
2237 unmap_extent_buffer(left, left->map_token, KM_USER1);
2238 left->map_token = NULL;
2239 }
2240
2241 if (push_items == 0)
2242 goto out_unlock;
2243
2244 if (!empty && push_items == left_nritems)
2245 WARN_ON(1);
2246
2247 /* push left to right */
2248 right_nritems = btrfs_header_nritems(right);
2249
2250 push_space = btrfs_item_end_nr(left, left_nritems - push_items);
2251 push_space -= leaf_data_end(root, left);
2252
2253 /* make room in the right data area */
2254 data_end = leaf_data_end(root, right);
2255 memmove_extent_buffer(right,
2256 btrfs_leaf_data(right) + data_end - push_space,
2257 btrfs_leaf_data(right) + data_end,
2258 BTRFS_LEAF_DATA_SIZE(root) - data_end);
2259
2260 /* copy from the left data area */
2261 copy_extent_buffer(right, left, btrfs_leaf_data(right) +
2262 BTRFS_LEAF_DATA_SIZE(root) - push_space,
2263 btrfs_leaf_data(left) + leaf_data_end(root, left),
2264 push_space);
2265
2266 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
2267 btrfs_item_nr_offset(0),
2268 right_nritems * sizeof(struct btrfs_item));
2269
2270 /* copy the items from left to right */
2271 copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
2272 btrfs_item_nr_offset(left_nritems - push_items),
2273 push_items * sizeof(struct btrfs_item));
2274
2275 /* update the item pointers */
2276 right_nritems += push_items;
2277 btrfs_set_header_nritems(right, right_nritems);
2278 push_space = BTRFS_LEAF_DATA_SIZE(root);
2279 for (i = 0; i < right_nritems; i++) {
2280 item = btrfs_item_nr(right, i);
2281 if (!right->map_token) {
2282 map_extent_buffer(right, (unsigned long)item,
2283 sizeof(struct btrfs_item),
2284 &right->map_token, &right->kaddr,
2285 &right->map_start, &right->map_len,
2286 KM_USER1);
2287 }
2288 push_space -= btrfs_item_size(right, item);
2289 btrfs_set_item_offset(right, item, push_space);
2290 }
2291
2292 if (right->map_token) {
2293 unmap_extent_buffer(right, right->map_token, KM_USER1);
2294 right->map_token = NULL;
2295 }
2296 left_nritems -= push_items;
2297 btrfs_set_header_nritems(left, left_nritems);
2298
2299 if (left_nritems)
2300 btrfs_mark_buffer_dirty(left);
2301 btrfs_mark_buffer_dirty(right);
2302
2303 ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
2304 BUG_ON(ret);
2305
2306 btrfs_item_key(right, &disk_key, 0);
2307 btrfs_set_node_key(upper, &disk_key, slot + 1);
2308 btrfs_mark_buffer_dirty(upper);
2309
2310 /* then fixup the leaf pointer in the path */
2311 if (path->slots[0] >= left_nritems) {
2312 path->slots[0] -= left_nritems;
2313 if (btrfs_header_nritems(path->nodes[0]) == 0)
2314 clean_tree_block(trans, root, path->nodes[0]);
2315 btrfs_tree_unlock(path->nodes[0]);
2316 free_extent_buffer(path->nodes[0]);
2317 path->nodes[0] = right;
2318 path->slots[1] += 1;
2319 } else {
2320 btrfs_tree_unlock(right);
2321 free_extent_buffer(right);
2322 }
2323 return 0;
2324
2325out_unlock:
2326 btrfs_tree_unlock(right);
2327 free_extent_buffer(right);
2328 return 1;
2329}
2330
2331/*
2332 * push some data in the path leaf to the left, trying to free up at
2333 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2334 */
2335static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2336 *root, struct btrfs_path *path, int data_size,
2337 int empty)
2338{
2339 struct btrfs_disk_key disk_key;
2340 struct extent_buffer *right = path->nodes[0];
2341 struct extent_buffer *left;
2342 int slot;
2343 int i;
2344 int free_space;
2345 int push_space = 0;
2346 int push_items = 0;
2347 struct btrfs_item *item;
2348 u32 old_left_nritems;
2349 u32 right_nritems;
2350 u32 nr;
2351 int ret = 0;
2352 int wret;
2353 u32 this_item_size;
2354 u32 old_left_item_size;
2355
2356 slot = path->slots[1];
2357 if (slot == 0)
2358 return 1;
2359 if (!path->nodes[1])
2360 return 1;
2361
2362 right_nritems = btrfs_header_nritems(right);
2363 if (right_nritems == 0)
2364 return 1;
2365
2366 WARN_ON(!btrfs_tree_locked(path->nodes[1]));
2367
2368 left = read_node_slot(root, path->nodes[1], slot - 1);
2369 btrfs_tree_lock(left);
2370 free_space = btrfs_leaf_free_space(root, left);
2371 if (free_space < data_size) {
2372 ret = 1;
2373 goto out;
2374 }
2375
2376 /* cow and double check */
2377 ret = btrfs_cow_block(trans, root, left,
2378 path->nodes[1], slot - 1, &left, 0);
2379 if (ret) {
2380 /* we hit -ENOSPC, but it isn't fatal here */
2381 ret = 1;
2382 goto out;
2383 }
2384
2385 free_space = btrfs_leaf_free_space(root, left);
2386 if (free_space < data_size) {
2387 ret = 1;
2388 goto out;
2389 }
2390
2391 if (empty)
2392 nr = right_nritems;
2393 else
2394 nr = right_nritems - 1;
2395
2396 for (i = 0; i < nr; i++) {
2397 item = btrfs_item_nr(right, i);
2398 if (!right->map_token) {
2399 map_extent_buffer(right, (unsigned long)item,
2400 sizeof(struct btrfs_item),
2401 &right->map_token, &right->kaddr,
2402 &right->map_start, &right->map_len,
2403 KM_USER1);
2404 }
2405
2406 if (!empty && push_items > 0) {
2407 if (path->slots[0] < i)
2408 break;
2409 if (path->slots[0] == i) {
2410 int space = btrfs_leaf_free_space(root, right);
2411 if (space + push_space * 2 > free_space)
2412 break;
2413 }
2414 }
2415
2416 if (path->slots[0] == i)
2417 push_space += data_size;
2418
2419 this_item_size = btrfs_item_size(right, item);
2420 if (this_item_size + sizeof(*item) + push_space > free_space)
2421 break;
2422
2423 push_items++;
2424 push_space += this_item_size + sizeof(*item);
2425 }
2426
2427 if (right->map_token) {
2428 unmap_extent_buffer(right, right->map_token, KM_USER1);
2429 right->map_token = NULL;
2430 }
2431
2432 if (push_items == 0) {
2433 ret = 1;
2434 goto out;
2435 }
2436 if (!empty && push_items == btrfs_header_nritems(right))
2437 WARN_ON(1);
2438
2439 /* push data from right to left */
2440 copy_extent_buffer(left, right,
2441 btrfs_item_nr_offset(btrfs_header_nritems(left)),
2442 btrfs_item_nr_offset(0),
2443 push_items * sizeof(struct btrfs_item));
2444
2445 push_space = BTRFS_LEAF_DATA_SIZE(root) -
2446 btrfs_item_offset_nr(right, push_items - 1);
2447
2448 copy_extent_buffer(left, right, btrfs_leaf_data(left) +
2449 leaf_data_end(root, left) - push_space,
2450 btrfs_leaf_data(right) +
2451 btrfs_item_offset_nr(right, push_items - 1),
2452 push_space);
2453 old_left_nritems = btrfs_header_nritems(left);
2454 BUG_ON(old_left_nritems <= 0);
2455
2456 old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
2457 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
2458 u32 ioff;
2459
2460 item = btrfs_item_nr(left, i);
2461 if (!left->map_token) {
2462 map_extent_buffer(left, (unsigned long)item,
2463 sizeof(struct btrfs_item),
2464 &left->map_token, &left->kaddr,
2465 &left->map_start, &left->map_len,
2466 KM_USER1);
2467 }
2468
2469 ioff = btrfs_item_offset(left, item);
2470 btrfs_set_item_offset(left, item,
2471 ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
2472 }
2473 btrfs_set_header_nritems(left, old_left_nritems + push_items);
2474 if (left->map_token) {
2475 unmap_extent_buffer(left, left->map_token, KM_USER1);
2476 left->map_token = NULL;
2477 }
2478
2479 /* fixup right node */
2480 if (push_items > right_nritems) {
2481 printk(KERN_CRIT "push items %d nr %u\n", push_items,
2482 right_nritems);
2483 WARN_ON(1);
2484 }
2485
2486 if (push_items < right_nritems) {
2487 push_space = btrfs_item_offset_nr(right, push_items - 1) -
2488 leaf_data_end(root, right);
2489 memmove_extent_buffer(right, btrfs_leaf_data(right) +
2490 BTRFS_LEAF_DATA_SIZE(root) - push_space,
2491 btrfs_leaf_data(right) +
2492 leaf_data_end(root, right), push_space);
2493
2494 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
2495 btrfs_item_nr_offset(push_items),
2496 (btrfs_header_nritems(right) - push_items) *
2497 sizeof(struct btrfs_item));
2498 }
2499 right_nritems -= push_items;
2500 btrfs_set_header_nritems(right, right_nritems);
2501 push_space = BTRFS_LEAF_DATA_SIZE(root);
2502 for (i = 0; i < right_nritems; i++) {
2503 item = btrfs_item_nr(right, i);
2504
2505 if (!right->map_token) {
2506 map_extent_buffer(right, (unsigned long)item,
2507 sizeof(struct btrfs_item),
2508 &right->map_token, &right->kaddr,
2509 &right->map_start, &right->map_len,
2510 KM_USER1);
2511 }
2512
2513 push_space = push_space - btrfs_item_size(right, item);
2514 btrfs_set_item_offset(right, item, push_space);
2515 }
2516 if (right->map_token) {
2517 unmap_extent_buffer(right, right->map_token, KM_USER1);
2518 right->map_token = NULL;
2519 }
2520
2521 btrfs_mark_buffer_dirty(left);
2522 if (right_nritems)
2523 btrfs_mark_buffer_dirty(right);
2524
2525 ret = btrfs_update_ref(trans, root, right, left,
2526 old_left_nritems, push_items);
2527 BUG_ON(ret);
2528
2529 btrfs_item_key(right, &disk_key, 0);
2530 wret = fixup_low_keys(trans, root, path, &disk_key, 1);
2531 if (wret)
2532 ret = wret;
2533
2534 /* then fixup the leaf pointer in the path */
2535 if (path->slots[0] < push_items) {
2536 path->slots[0] += old_left_nritems;
2537 if (btrfs_header_nritems(path->nodes[0]) == 0)
2538 clean_tree_block(trans, root, path->nodes[0]);
2539 btrfs_tree_unlock(path->nodes[0]);
2540 free_extent_buffer(path->nodes[0]);
2541 path->nodes[0] = left;
2542 path->slots[1] -= 1;
2543 } else {
2544 btrfs_tree_unlock(left);
2545 free_extent_buffer(left);
2546 path->slots[0] -= push_items;
2547 }
2548 BUG_ON(path->slots[0] < 0);
2549 return ret;
2550out:
2551 btrfs_tree_unlock(left);
2552 free_extent_buffer(left);
2553 return ret;
2554}
2555
2556/*
2557 * split the path's leaf in two, making sure there is at least data_size
2558 * available for the resulting leaf level of the path.
2559 *
2560 * returns 0 if all went well and < 0 on failure.
2561 */
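/*
 * The split first tries to push items into a neighboring leaf; if
 * that frees enough room no new block is needed.  Otherwise a new
 * right leaf is allocated and the items past the split point 'mid'
 * are moved into it.  Inserts that land exactly at the first or last
 * slot are special cased: the new leaf is left empty and the caller
 * inserts into it directly.
 */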
2562static noinline int split_leaf(struct btrfs_trans_handle *trans,
2563 struct btrfs_root *root,
2564 struct btrfs_key *ins_key,
2565 struct btrfs_path *path, int data_size,
2566 int extend)
2567{
2568 struct extent_buffer *l;
2569 u32 nritems;
2570 int mid;
2571 int slot;
2572 struct extent_buffer *right;
2573 int data_copy_size;
2574 int rt_data_off;
2575 int i;
2576 int ret = 0;
2577 int wret;
2578 int double_split;
2579 int num_doubles = 0;
2580 struct btrfs_disk_key disk_key;
2581
2582 /* first try to make some room by pushing left and right */
2583 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
2584 wret = push_leaf_right(trans, root, path, data_size, 0);
2585 if (wret < 0)
2586 return wret;
2587 if (wret) {
2588 wret = push_leaf_left(trans, root, path, data_size, 0);
2589 if (wret < 0)
2590 return wret;
2591 }
2592 l = path->nodes[0];
2593
2594 /* did the pushes work? */
2595 if (btrfs_leaf_free_space(root, l) >= data_size)
2596 return 0;
2597 }
2598
2599 if (!path->nodes[1]) {
2600 ret = insert_new_root(trans, root, path, 1);
2601 if (ret)
2602 return ret;
2603 }
2604again:
2605 double_split = 0;
2606 l = path->nodes[0];
2607 slot = path->slots[0];
2608 nritems = btrfs_header_nritems(l);
2609 mid = (nritems + 1) / 2;
2610
2611 right = btrfs_alloc_free_block(trans, root, root->leafsize,
2612 path->nodes[1]->start,
2613 root->root_key.objectid,
2614 trans->transid, 0, l->start, 0);
2615 if (IS_ERR(right)) {
2616 BUG_ON(1);
2617 return PTR_ERR(right);
2618 }
2619
2620 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2621 btrfs_set_header_bytenr(right, right->start);
2622 btrfs_set_header_generation(right, trans->transid);
2623 btrfs_set_header_owner(right, root->root_key.objectid);
2624 btrfs_set_header_level(right, 0);
2625 write_extent_buffer(right, root->fs_info->fsid,
2626 (unsigned long)btrfs_header_fsid(right),
2627 BTRFS_FSID_SIZE);
2628
2629 write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
2630 (unsigned long)btrfs_header_chunk_tree_uuid(right),
2631 BTRFS_UUID_SIZE);
2632 if (mid <= slot) {
2633 if (nritems == 1 ||
2634 leaf_space_used(l, mid, nritems - mid) + data_size >
2635 BTRFS_LEAF_DATA_SIZE(root)) {
2636 if (slot >= nritems) {
2637 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2638 btrfs_set_header_nritems(right, 0);
2639 wret = insert_ptr(trans, root, path,
2640 &disk_key, right->start,
2641 path->slots[1] + 1, 1);
2642 if (wret)
2643 ret = wret;
2644
2645 btrfs_tree_unlock(path->nodes[0]);
2646 free_extent_buffer(path->nodes[0]);
2647 path->nodes[0] = right;
2648 path->slots[0] = 0;
2649 path->slots[1] += 1;
2650 btrfs_mark_buffer_dirty(right);
2651 return ret;
2652 }
2653 mid = slot;
2654 if (mid != nritems &&
2655 leaf_space_used(l, mid, nritems - mid) +
2656 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2657 double_split = 1;
2658 }
2659 }
2660 } else {
2661 if (leaf_space_used(l, 0, mid) + data_size >
2662 BTRFS_LEAF_DATA_SIZE(root)) {
2663 if (!extend && data_size && slot == 0) {
2664 btrfs_cpu_key_to_disk(&disk_key, ins_key);
2665 btrfs_set_header_nritems(right, 0);
2666 wret = insert_ptr(trans, root, path,
2667 &disk_key,
2668 right->start,
2669 path->slots[1], 1);
2670 if (wret)
2671 ret = wret;
2672 btrfs_tree_unlock(path->nodes[0]);
2673 free_extent_buffer(path->nodes[0]);
2674 path->nodes[0] = right;
2675 path->slots[0] = 0;
2676 if (path->slots[1] == 0) {
2677 wret = fixup_low_keys(trans, root,
2678 path, &disk_key, 1);
2679 if (wret)
2680 ret = wret;
2681 }
2682 btrfs_mark_buffer_dirty(right);
2683 return ret;
2684 } else if ((extend || !data_size) && slot == 0) {
2685 mid = 1;
2686 } else {
2687 mid = slot;
2688 if (mid != nritems &&
2689 leaf_space_used(l, mid, nritems - mid) +
2690 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
2691 double_split = 1;
2692 }
2693 }
2694 }
2695 }
2696 nritems = nritems - mid;
2697 btrfs_set_header_nritems(right, nritems);
2698 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
2699
2700 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
2701 btrfs_item_nr_offset(mid),
2702 nritems * sizeof(struct btrfs_item));
2703
2704 copy_extent_buffer(right, l,
2705 btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
2706 data_copy_size, btrfs_leaf_data(l) +
2707 leaf_data_end(root, l), data_copy_size);
2708
2709 rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2710 btrfs_item_end_nr(l, mid);
2711
2712 for (i = 0; i < nritems; i++) {
2713 struct btrfs_item *item = btrfs_item_nr(right, i);
2714 u32 ioff;
2715
2716 if (!right->map_token) {
2717 map_extent_buffer(right, (unsigned long)item,
2718 sizeof(struct btrfs_item),
2719 &right->map_token, &right->kaddr,
2720 &right->map_start, &right->map_len,
2721 KM_USER1);
2722 }
2723
2724 ioff = btrfs_item_offset(right, item);
2725 btrfs_set_item_offset(right, item, ioff + rt_data_off);
2726 }
2727
2728 if (right->map_token) {
2729 unmap_extent_buffer(right, right->map_token, KM_USER1);
2730 right->map_token = NULL;
2731 }
2732
2733 btrfs_set_header_nritems(l, mid);
2734 ret = 0;
2735 btrfs_item_key(right, &disk_key, 0);
2736 wret = insert_ptr(trans, root, path, &disk_key, right->start,
2737 path->slots[1] + 1, 1);
2738 if (wret)
2739 ret = wret;
2740
2741 btrfs_mark_buffer_dirty(right);
2742 btrfs_mark_buffer_dirty(l);
2743 BUG_ON(path->slots[0] != slot);
2744
2745 ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
2746 BUG_ON(ret);
2747
2748 if (mid <= slot) {
2749 btrfs_tree_unlock(path->nodes[0]);
2750 free_extent_buffer(path->nodes[0]);
2751 path->nodes[0] = right;
2752 path->slots[0] -= mid;
2753 path->slots[1] += 1;
2754 } else {
2755 btrfs_tree_unlock(right);
2756 free_extent_buffer(right);
2757 }
2758
2759 BUG_ON(path->slots[0] < 0);
2760
2761 if (double_split) {
2762 BUG_ON(num_doubles != 0);
2763 num_doubles++;
2764 goto again;
2765 }
2766 return ret;
2767}
2768
2769/*
2770 * This function splits a single item into two items,
2771 * giving 'new_key' to the new item and splitting the
2772 * old one at split_offset (from the start of the item).
2773 *
2774 * The path may be released by this operation. After
2775 * the split, the path is pointing to the old item. The
2776 * new item is going to be in the same node as the old one.
2777 *
2778 * Note, the item being split must be small enough to live alone on
2779 * a tree block with room for one extra struct btrfs_item
2780 *
2781 * This allows us to split the item in place, keeping a lock on the
2782 * leaf the entire time.
2783 */
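/*
 * For example, splitting a 100 byte item at split_offset 40 leaves
 * the original item holding bytes 0-39 and creates a new item, keyed
 * by new_key, holding bytes 40-99 of the original data.
 */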
2784int btrfs_split_item(struct btrfs_trans_handle *trans,
2785 struct btrfs_root *root,
2786 struct btrfs_path *path,
2787 struct btrfs_key *new_key,
2788 unsigned long split_offset)
2789{
2790 u32 item_size;
2791 struct extent_buffer *leaf;
2792 struct btrfs_key orig_key;
2793 struct btrfs_item *item;
2794 struct btrfs_item *new_item;
2795 int ret = 0;
2796 int slot;
2797 u32 nritems;
2798 u32 orig_offset;
2799 struct btrfs_disk_key disk_key;
2800 char *buf;
2801
2802 leaf = path->nodes[0];
2803 btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]);
2804 if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item))
2805 goto split;
2806
2807 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2808 btrfs_release_path(root, path);
2809
2810 path->search_for_split = 1;
2811 path->keep_locks = 1;
2812
2813 ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1);
2814 path->search_for_split = 0;
2815
2816 /* if our item isn't there or got smaller, return now */
2817 if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0],
2818 path->slots[0])) {
2819 path->keep_locks = 0;
2820 return -EAGAIN;
2821 }
2822
2823 ret = split_leaf(trans, root, &orig_key, path,
2824 sizeof(struct btrfs_item), 1);
2825 path->keep_locks = 0;
2826 BUG_ON(ret);
2827
2828 leaf = path->nodes[0];
2829 BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
2830
2831split:
2832 item = btrfs_item_nr(leaf, path->slots[0]);
2833 orig_offset = btrfs_item_offset(leaf, item);
2834 item_size = btrfs_item_size(leaf, item);
2835
2836
2837 buf = kmalloc(item_size, GFP_NOFS);
if (!buf)
return -ENOMEM;
2838 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
2839 path->slots[0]), item_size);
2840 slot = path->slots[0] + 1;
2841 leaf = path->nodes[0];
2842
2843 nritems = btrfs_header_nritems(leaf);
2844
2845 if (slot != nritems) {
2846 /* shift the items */
2847 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
2848 btrfs_item_nr_offset(slot),
2849 (nritems - slot) * sizeof(struct btrfs_item));
2850
2851 }
2852
2853 btrfs_cpu_key_to_disk(&disk_key, new_key);
2854 btrfs_set_item_key(leaf, &disk_key, slot);
2855
2856 new_item = btrfs_item_nr(leaf, slot);
2857
2858 btrfs_set_item_offset(leaf, new_item, orig_offset);
2859 btrfs_set_item_size(leaf, new_item, item_size - split_offset);
2860
2861 btrfs_set_item_offset(leaf, item,
2862 orig_offset + item_size - split_offset);
2863 btrfs_set_item_size(leaf, item, split_offset);
2864
2865 btrfs_set_header_nritems(leaf, nritems + 1);
2866
2867 /* write the data for the start of the original item */
2868 write_extent_buffer(leaf, buf,
2869 btrfs_item_ptr_offset(leaf, path->slots[0]),
2870 split_offset);
2871
2872 /* write the data for the new item */
2873 write_extent_buffer(leaf, buf + split_offset,
2874 btrfs_item_ptr_offset(leaf, slot),
2875 item_size - split_offset);
2876 btrfs_mark_buffer_dirty(leaf);
2877
2878 ret = 0;
2879 if (btrfs_leaf_free_space(root, leaf) < 0) {
2880 btrfs_print_leaf(root, leaf);
2881 BUG();
2882 }
2883 kfree(buf);
2884 return ret;
2885}
2886
2887/*
2888 * make the item pointed to by the path smaller. new_size indicates
2889 * how small to make it, and from_end tells us if we just chop bytes
2890 * off the end of the item or if we shift the item to chop bytes off
2891 * the front.
2892 */
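/*
 * When trimming from the front, the surviving data is slid toward
 * the end of the leaf and the key offset is bumped by the number of
 * bytes removed, so the key still describes where the remaining data
 * starts.
 */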
2893int btrfs_truncate_item(struct btrfs_trans_handle *trans,
2894 struct btrfs_root *root,
2895 struct btrfs_path *path,
2896 u32 new_size, int from_end)
2897{
2898 int ret = 0;
2899 int slot;
2900 int slot_orig;
2901 struct extent_buffer *leaf;
2902 struct btrfs_item *item;
2903 u32 nritems;
2904 unsigned int data_end;
2905 unsigned int old_data_start;
2906 unsigned int old_size;
2907 unsigned int size_diff;
2908 int i;
2909
2910 slot_orig = path->slots[0];
2911 leaf = path->nodes[0];
2912 slot = path->slots[0];
2913
2914 old_size = btrfs_item_size_nr(leaf, slot);
2915 if (old_size == new_size)
2916 return 0;
2917
2918 nritems = btrfs_header_nritems(leaf);
2919 data_end = leaf_data_end(root, leaf);
2920
2921 old_data_start = btrfs_item_offset_nr(leaf, slot);
2922
2923 size_diff = old_size - new_size;
2924
2925 BUG_ON(slot < 0);
2926 BUG_ON(slot >= nritems);
2927
2928 /*
2929 * item0..itemN ... dataN.offset..dataN.size .. data0.size
2930 */
2931 /* first correct the data pointers */
2932 for (i = slot; i < nritems; i++) {
2933 u32 ioff;
2934 item = btrfs_item_nr(leaf, i);
2935
2936 if (!leaf->map_token) {
2937 map_extent_buffer(leaf, (unsigned long)item,
2938 sizeof(struct btrfs_item),
2939 &leaf->map_token, &leaf->kaddr,
2940 &leaf->map_start, &leaf->map_len,
2941 KM_USER1);
2942 }
2943
2944 ioff = btrfs_item_offset(leaf, item);
2945 btrfs_set_item_offset(leaf, item, ioff + size_diff);
2946 }
2947
2948 if (leaf->map_token) {
2949 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2950 leaf->map_token = NULL;
2951 }
2952
2953 /* shift the data */
2954 if (from_end) {
2955 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
2956 data_end + size_diff, btrfs_leaf_data(leaf) +
2957 data_end, old_data_start + new_size - data_end);
2958 } else {
2959 struct btrfs_disk_key disk_key;
2960 u64 offset;
2961
2962 btrfs_item_key(leaf, &disk_key, slot);
2963
2964 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
2965 unsigned long ptr;
2966 struct btrfs_file_extent_item *fi;
2967
2968 fi = btrfs_item_ptr(leaf, slot,
2969 struct btrfs_file_extent_item);
2970 fi = (struct btrfs_file_extent_item *)(
2971 (unsigned long)fi - size_diff);
2972
2973 if (btrfs_file_extent_type(leaf, fi) ==
2974 BTRFS_FILE_EXTENT_INLINE) {
2975 ptr = btrfs_item_ptr_offset(leaf, slot);
2976 memmove_extent_buffer(leaf, ptr,
2977 (unsigned long)fi,
2978 offsetof(struct btrfs_file_extent_item,
2979 disk_bytenr));
2980 }
2981 }
2982
2983 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
2984 data_end + size_diff, btrfs_leaf_data(leaf) +
2985 data_end, old_data_start - data_end);
2986
2987 offset = btrfs_disk_key_offset(&disk_key);
2988 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
2989 btrfs_set_item_key(leaf, &disk_key, slot);
2990 if (slot == 0)
2991 fixup_low_keys(trans, root, path, &disk_key, 1);
2992 }
2993
2994 item = btrfs_item_nr(leaf, slot);
2995 btrfs_set_item_size(leaf, item, new_size);
2996 btrfs_mark_buffer_dirty(leaf);
2997
2998 ret = 0;
2999 if (btrfs_leaf_free_space(root, leaf) < 0) {
3000 btrfs_print_leaf(root, leaf);
3001 BUG();
3002 }
3003 return ret;
3004}
3005
3006/*
3007 * make the item pointed to by the path bigger, data_size is the new size.
3008 */
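/*
 * Growing an item means its data (and the data of every item after
 * it) slides toward the front of the data area, so all of the
 * affected item offsets shrink by data_size.
 */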
3009int btrfs_extend_item(struct btrfs_trans_handle *trans,
3010 struct btrfs_root *root, struct btrfs_path *path,
3011 u32 data_size)
3012{
3013 int ret = 0;
3014 int slot;
3015 int slot_orig;
3016 struct extent_buffer *leaf;
3017 struct btrfs_item *item;
3018 u32 nritems;
3019 unsigned int data_end;
3020 unsigned int old_data;
3021 unsigned int old_size;
3022 int i;
3023
3024 slot_orig = path->slots[0];
3025 leaf = path->nodes[0];
3026
3027 nritems = btrfs_header_nritems(leaf);
3028 data_end = leaf_data_end(root, leaf);
3029
3030 if (btrfs_leaf_free_space(root, leaf) < data_size) {
3031 btrfs_print_leaf(root, leaf);
3032 BUG();
3033 }
3034 slot = path->slots[0];
3035 old_data = btrfs_item_end_nr(leaf, slot);
3036
3037 BUG_ON(slot < 0);
3038 if (slot >= nritems) {
3039 btrfs_print_leaf(root, leaf);
3040 printk(KERN_CRIT "slot %d too large, nritems %d\n",
3041 slot, nritems);
3042 BUG_ON(1);
3043 }
3044
3045 /*
3046 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3047 */
3048 /* first correct the data pointers */
3049 for (i = slot; i < nritems; i++) {
3050 u32 ioff;
3051 item = btrfs_item_nr(leaf, i);
3052
3053 if (!leaf->map_token) {
3054 map_extent_buffer(leaf, (unsigned long)item,
3055 sizeof(struct btrfs_item),
3056 &leaf->map_token, &leaf->kaddr,
3057 &leaf->map_start, &leaf->map_len,
3058 KM_USER1);
3059 }
3060 ioff = btrfs_item_offset(leaf, item);
3061 btrfs_set_item_offset(leaf, item, ioff - data_size);
3062 }
3063
3064 if (leaf->map_token) {
3065 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3066 leaf->map_token = NULL;
3067 }
3068
3069 /* shift the data */
3070 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3071 data_end - data_size, btrfs_leaf_data(leaf) +
3072 data_end, old_data - data_end);
3073
3074 data_end = old_data;
3075 old_size = btrfs_item_size_nr(leaf, slot);
3076 item = btrfs_item_nr(leaf, slot);
3077 btrfs_set_item_size(leaf, item, old_size + data_size);
3078 btrfs_mark_buffer_dirty(leaf);
3079
3080 ret = 0;
3081 if (btrfs_leaf_free_space(root, leaf) < 0) {
3082 btrfs_print_leaf(root, leaf);
3083 BUG();
3084 }
3085 return ret;
3086}
3087
3088/*
3089 * Given a key and some data, insert items into the tree.
3090 * This does all the path init required, making room in the tree if needed.
3091 * Returns the number of keys that were inserted.
3092 */
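/*
 * Note that fewer than nr items may be inserted: the batch is
 * trimmed to what fits in one leaf, and when inserting in the middle
 * of a leaf it is further trimmed to the keys that sort before the
 * item already occupying the slot.
 */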
3093int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
3094 struct btrfs_root *root,
3095 struct btrfs_path *path,
3096 struct btrfs_key *cpu_key, u32 *data_size,
3097 int nr)
3098{
3099 struct extent_buffer *leaf;
3100 struct btrfs_item *item;
3101 int ret = 0;
3102 int slot;
3103 int i;
3104 u32 nritems;
3105 u32 total_data = 0;
3106 u32 total_size = 0;
3107 unsigned int data_end;
3108 struct btrfs_disk_key disk_key;
3109 struct btrfs_key found_key;
3110
3111 for (i = 0; i < nr; i++) {
3112 if (total_size + data_size[i] + sizeof(struct btrfs_item) >
3113 BTRFS_LEAF_DATA_SIZE(root)) {
3114 nr = i;
3115 break;
3116 }
3117 total_data += data_size[i];
3118 total_size += data_size[i] + sizeof(struct btrfs_item);
3119 }
3120 BUG_ON(nr == 0);
3121
3122 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3123 if (ret == 0)
3124 return -EEXIST;
3125 if (ret < 0)
3126 goto out;
3127
3128 leaf = path->nodes[0];
3129
3130 nritems = btrfs_header_nritems(leaf);
3131 data_end = leaf_data_end(root, leaf);
3132
3133 if (btrfs_leaf_free_space(root, leaf) < total_size) {
3134 for (i = nr - 1; i >= 0; i--) {
3135 total_data -= data_size[i];
3136 total_size -= data_size[i] + sizeof(struct btrfs_item);
3137 if (total_size < btrfs_leaf_free_space(root, leaf))
3138 break;
3139 }
3140 nr = i;
3141 }
3142
3143 slot = path->slots[0];
3144 BUG_ON(slot < 0);
3145
3146 if (slot != nritems) {
3147 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3148
3149 item = btrfs_item_nr(leaf, slot);
3150 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3151
3152 /* figure out how many keys we can insert in here */
3153 total_data = data_size[0];
3154 for (i = 1; i < nr; i++) {
3155 if (comp_cpu_keys(&found_key, cpu_key + i) <= 0)
3156 break;
3157 total_data += data_size[i];
3158 }
3159 nr = i;
3160
3161 if (old_data < data_end) {
3162 btrfs_print_leaf(root, leaf);
3163 printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3164 slot, old_data, data_end);
3165 BUG_ON(1);
3166 }
3167 /*
3168 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3169 */
3170 /* first correct the data pointers */
3171 WARN_ON(leaf->map_token);
3172 for (i = slot; i < nritems; i++) {
3173 u32 ioff;
3174
3175 item = btrfs_item_nr(leaf, i);
3176 if (!leaf->map_token) {
3177 map_extent_buffer(leaf, (unsigned long)item,
3178 sizeof(struct btrfs_item),
3179 &leaf->map_token, &leaf->kaddr,
3180 &leaf->map_start, &leaf->map_len,
3181 KM_USER1);
3182 }
3183
3184 ioff = btrfs_item_offset(leaf, item);
3185 btrfs_set_item_offset(leaf, item, ioff - total_data);
3186 }
3187 if (leaf->map_token) {
3188 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3189 leaf->map_token = NULL;
3190 }
3191
3192 /* shift the items */
3193 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3194 btrfs_item_nr_offset(slot),
3195 (nritems - slot) * sizeof(struct btrfs_item));
3196
3197 /* shift the data */
3198 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3199 data_end - total_data, btrfs_leaf_data(leaf) +
3200 data_end, old_data - data_end);
3201 data_end = old_data;
3202 } else {
3203 /*
3204 * this sucks, but it has to be done: if we are inserting at
3205 * the end of the leaf, only insert 1 of the items, since we
3206 * have no way of knowing what's on the next leaf and we'd have
3207 * to drop our current locks to figure it out
3208 */
3209 nr = 1;
3210 }
3211
3212 /* setup the item for the new data */
3213 for (i = 0; i < nr; i++) {
3214 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3215 btrfs_set_item_key(leaf, &disk_key, slot + i);
3216 item = btrfs_item_nr(leaf, slot + i);
3217 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3218 data_end -= data_size[i];
3219 btrfs_set_item_size(leaf, item, data_size[i]);
3220 }
3221 btrfs_set_header_nritems(leaf, nritems + nr);
3222 btrfs_mark_buffer_dirty(leaf);
3223
3224 ret = 0;
3225 if (slot == 0) {
3226 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3227 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3228 }
3229
3230 if (btrfs_leaf_free_space(root, leaf) < 0) {
3231 btrfs_print_leaf(root, leaf);
3232 BUG();
3233 }
3234out:
3235 if (!ret)
3236 ret = nr;
3237 return ret;
3238}
3239
3240/*
3241 * Given a key and some data, insert items into the tree.
3242 * This does all the path init required, making room in the tree if needed.
3243 */
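/*
 * The items are created empty: only the keys and the offset/size
 * headers are filled in, and the caller is expected to write the
 * actual item data afterwards (see btrfs_insert_item below).
 */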
3244int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3245 struct btrfs_root *root,
3246 struct btrfs_path *path,
3247 struct btrfs_key *cpu_key, u32 *data_size,
3248 int nr)
3249{
3250 struct extent_buffer *leaf;
3251 struct btrfs_item *item;
3252 int ret = 0;
3253 int slot;
3254 int slot_orig;
3255 int i;
3256 u32 nritems;
3257 u32 total_size = 0;
3258 u32 total_data = 0;
3259 unsigned int data_end;
3260 struct btrfs_disk_key disk_key;
3261
3262 for (i = 0; i < nr; i++)
3263 total_data += data_size[i];
3264
3265 total_size = total_data + (nr * sizeof(struct btrfs_item));
3266 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3267 if (ret == 0)
3268 return -EEXIST;
3269 if (ret < 0)
3270 goto out;
3271
3272 slot_orig = path->slots[0];
3273 leaf = path->nodes[0];
3274
3275 nritems = btrfs_header_nritems(leaf);
3276 data_end = leaf_data_end(root, leaf);
3277
3278 if (btrfs_leaf_free_space(root, leaf) < total_size) {
3279 btrfs_print_leaf(root, leaf);
3280 printk(KERN_CRIT "not enough freespace need %u have %d\n",
3281 total_size, btrfs_leaf_free_space(root, leaf));
3282 BUG();
3283 }
3284
3285 slot = path->slots[0];
3286 BUG_ON(slot < 0);
3287
3288 if (slot != nritems) {
3289 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3290
3291 if (old_data < data_end) {
3292 btrfs_print_leaf(root, leaf);
3293 printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
3294 slot, old_data, data_end);
3295 BUG_ON(1);
3296 }
3297 /*
3298 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3299 */
3300 /* first correct the data pointers */
3301 WARN_ON(leaf->map_token);
3302 for (i = slot; i < nritems; i++) {
3303 u32 ioff;
3304
3305 item = btrfs_item_nr(leaf, i);
3306 if (!leaf->map_token) {
3307 map_extent_buffer(leaf, (unsigned long)item,
3308 sizeof(struct btrfs_item),
3309 &leaf->map_token, &leaf->kaddr,
3310 &leaf->map_start, &leaf->map_len,
3311 KM_USER1);
3312 }
3313
3314 ioff = btrfs_item_offset(leaf, item);
3315 btrfs_set_item_offset(leaf, item, ioff - total_data);
3316 }
3317 if (leaf->map_token) {
3318 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3319 leaf->map_token = NULL;
3320 }
3321
3322 /* shift the items */
3323 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3324 btrfs_item_nr_offset(slot),
3325 (nritems - slot) * sizeof(struct btrfs_item));
3326
3327 /* shift the data */
3328 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3329 data_end - total_data, btrfs_leaf_data(leaf) +
3330 data_end, old_data - data_end);
3331 data_end = old_data;
3332 }
3333
3334 /* setup the item for the new data */
3335 for (i = 0; i < nr; i++) {
3336 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3337 btrfs_set_item_key(leaf, &disk_key, slot + i);
3338 item = btrfs_item_nr(leaf, slot + i);
3339 btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
3340 data_end -= data_size[i];
3341 btrfs_set_item_size(leaf, item, data_size[i]);
3342 }
3343 btrfs_set_header_nritems(leaf, nritems + nr);
3344 btrfs_mark_buffer_dirty(leaf);
3345
3346 ret = 0;
3347 if (slot == 0) {
3348 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3349 ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3350 }
3351
3352 if (btrfs_leaf_free_space(root, leaf) < 0) {
3353 btrfs_print_leaf(root, leaf);
3354 BUG();
3355 }
3356out:
3357 return ret;
3358}
3359
3360/*
3361 * Given a key and some data, insert an item into the tree.
3362 * This does all the path init required, making room in the tree if needed.
3363 */
3364int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
3365 *root, struct btrfs_key *cpu_key, void *data, u32
3366 data_size)
3367{
3368 int ret = 0;
3369 struct btrfs_path *path;
3370 struct extent_buffer *leaf;
3371 unsigned long ptr;
3372
3373 path = btrfs_alloc_path();
3374 BUG_ON(!path);
3375 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3376 if (!ret) {
3377 leaf = path->nodes[0];
3378 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3379 write_extent_buffer(leaf, data, ptr, data_size);
3380 btrfs_mark_buffer_dirty(leaf);
3381 }
3382 btrfs_free_path(path);
3383 return ret;
3384}
3385
3386/*
3387 * delete the pointer from a given node.
3388 *
3389 * the tree should have been previously balanced so the deletion does not
3390 * empty a node.
3391 */
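/*
 * the one exception is the root itself: if deleting the last pointer
 * would empty the root node, the root is demoted to an empty leaf
 * instead
 */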
3392static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3393 struct btrfs_path *path, int level, int slot)
3394{
3395 struct extent_buffer *parent = path->nodes[level];
3396 u32 nritems;
3397 int ret = 0;
3398 int wret;
3399
3400 nritems = btrfs_header_nritems(parent);
3401 if (slot != nritems - 1) {
3402 memmove_extent_buffer(parent,
3403 btrfs_node_key_ptr_offset(slot),
3404 btrfs_node_key_ptr_offset(slot + 1),
3405 sizeof(struct btrfs_key_ptr) *
3406 (nritems - slot - 1));
3407 }
3408 nritems--;
3409 btrfs_set_header_nritems(parent, nritems);
3410 if (nritems == 0 && parent == root->node) {
3411 BUG_ON(btrfs_header_level(root->node) != 1);
3412 /* just turn the root into a leaf and break */
3413 btrfs_set_header_level(root->node, 0);
3414 } else if (slot == 0) {
3415 struct btrfs_disk_key disk_key;
3416
3417 btrfs_node_key(parent, &disk_key, 0);
3418 wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
3419 if (wret)
3420 ret = wret;
3421 }
3422 btrfs_mark_buffer_dirty(parent);
3423 return ret;
3424}
3425
3426/*
3427 * a helper function to delete the leaf pointed to by path->slots[1] and
3428 * path->nodes[1]. bytenr is the node block pointer, but since the callers
3429 * already know it, it is faster to have them pass it down than to
3430 * read it out of the node again.
3431 *
3432 * This deletes the pointer in path->nodes[1] and frees the leaf
3433 * block extent. zero is returned if it all worked out, < 0 otherwise.
3434 *
3435 * The path must have already been setup for deleting the leaf, including
3436 * all the proper balancing. path->nodes[1] must be locked.
3437 */
3438noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3439 struct btrfs_root *root,
3440 struct btrfs_path *path, u64 bytenr)
3441{
3442 int ret;
3443 u64 root_gen = btrfs_header_generation(path->nodes[1]);
3444
3445 ret = del_ptr(trans, root, path, 1, path->slots[1]);
3446 if (ret)
3447 return ret;
3448
3449 ret = btrfs_free_extent(trans, root, bytenr,
3450 btrfs_level_size(root, 0),
3451 path->nodes[1]->start,
3452 btrfs_header_owner(path->nodes[1]),
3453 root_gen, 0, 1);
3454 return ret;
3455}
3456/*
3457 * delete the item at the leaf level in path. If that empties
3458 * the leaf, remove it from the tree
3459 */
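/*
 * this also rebalances: a leaf left using less than a quarter of its
 * data area has its remaining items pushed into the neighboring
 * leaves, and is freed if that empties it
 */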
3460int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3461 struct btrfs_path *path, int slot, int nr)
3462{
3463 struct extent_buffer *leaf;
3464 struct btrfs_item *item;
3465 int last_off;
3466 int dsize = 0;
3467 int ret = 0;
3468 int wret;
3469 int i;
3470 u32 nritems;
3471
3472 leaf = path->nodes[0];
3473 last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
3474
3475 for (i = 0; i < nr; i++)
3476 dsize += btrfs_item_size_nr(leaf, slot + i);
3477
3478 nritems = btrfs_header_nritems(leaf);
3479
3480 if (slot + nr != nritems) {
3481 int data_end = leaf_data_end(root, leaf);
3482
3483 memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3484 data_end + dsize,
3485 btrfs_leaf_data(leaf) + data_end,
3486 last_off - data_end);
3487
3488 for (i = slot + nr; i < nritems; i++) {
3489 u32 ioff;
3490
3491 item = btrfs_item_nr(leaf, i);
3492 if (!leaf->map_token) {
3493 map_extent_buffer(leaf, (unsigned long)item,
3494 sizeof(struct btrfs_item),
3495 &leaf->map_token, &leaf->kaddr,
3496 &leaf->map_start, &leaf->map_len,
3497 KM_USER1);
3498 }
3499 ioff = btrfs_item_offset(leaf, item);
3500 btrfs_set_item_offset(leaf, item, ioff + dsize);
3501 }
3502
3503 if (leaf->map_token) {
3504 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
3505 leaf->map_token = NULL;
3506 }
3507
3508 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
3509 btrfs_item_nr_offset(slot + nr),
3510 sizeof(struct btrfs_item) *
3511 (nritems - slot - nr));
3512 }
3513 btrfs_set_header_nritems(leaf, nritems - nr);
3514 nritems -= nr;
3515
3516 /* delete the leaf if we've emptied it */
3517 if (nritems == 0) {
3518 if (leaf == root->node) {
3519 btrfs_set_header_level(leaf, 0);
3520 } else {
3521 ret = btrfs_del_leaf(trans, root, path, leaf->start);
3522 BUG_ON(ret);
3523 }
3524 } else {
3525 int used = leaf_space_used(leaf, 0, nritems);
3526 if (slot == 0) {
3527 struct btrfs_disk_key disk_key;
3528
3529 btrfs_item_key(leaf, &disk_key, 0);
3530 wret = fixup_low_keys(trans, root, path,
3531 &disk_key, 1);
3532 if (wret)
3533 ret = wret;
3534 }
3535
3536 /* delete the leaf if it is mostly empty */
3537 if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
3538 /* push_leaf_left fixes the path.
3539 * make sure the path still points to our leaf
3540 * for possible call to del_ptr below
3541 */
3542 slot = path->slots[1];
3543 extent_buffer_get(leaf);
3544
3545 wret = push_leaf_left(trans, root, path, 1, 1);
3546 if (wret < 0 && wret != -ENOSPC)
3547 ret = wret;
3548
3549 if (path->nodes[0] == leaf &&
3550 btrfs_header_nritems(leaf)) {
3551 wret = push_leaf_right(trans, root, path, 1, 1);
3552 if (wret < 0 && wret != -ENOSPC)
3553 ret = wret;
3554 }
3555
3556 if (btrfs_header_nritems(leaf) == 0) {
3557 path->slots[1] = slot;
3558 ret = btrfs_del_leaf(trans, root, path,
3559 leaf->start);
3560 BUG_ON(ret);
3561 free_extent_buffer(leaf);
3562 } else {
3563 /* if we're still in the path, make sure
3564 * we're dirty. Otherwise, one of the
3565 * push_leaf functions must have already
3566 * dirtied this buffer
3567 */
3568 if (path->nodes[0] == leaf)
3569 btrfs_mark_buffer_dirty(leaf);
3570 free_extent_buffer(leaf);
3571 }
3572 } else {
3573 btrfs_mark_buffer_dirty(leaf);
3574 }
3575 }
3576 return ret;
3577}
3578
3579/*
3580 * search the tree again to find a leaf with lesser keys
3581 * returns 0 if it found something or 1 if there are no lesser leaves.
3582 * returns < 0 on io errors.
3583 *
3584 * This may release the path, and so you may lose any locks held at the
3585 * time you call it.
3586 */
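/*
 * works by cooking up a key strictly smaller than the first key in
 * the current leaf and searching for it again from the root
 */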
3587int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
3588{
3589 struct btrfs_key key;
3590 struct btrfs_disk_key found_key;
3591 int ret;
3592
3593 btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
3594
3595 if (key.offset > 0)
3596 key.offset--;
3597 else if (key.type > 0)
3598 key.type--;
3599 else if (key.objectid > 0)
3600 key.objectid--;
3601 else
3602 return 1;
3603
3604 btrfs_release_path(root, path);
3605 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3606 if (ret < 0)
3607 return ret;
3608 btrfs_item_key(path->nodes[0], &found_key, 0);
3609 ret = comp_keys(&found_key, &key);
3610 if (ret < 0)
3611 return 0;
3612 return 1;
3613}
3614
3615/*
3616 * A helper function to walk down the tree starting at min_key, and looking
3617 * for nodes or leaves that are either in cache or have a minimum
3618 * transaction id. This is used by the btree defrag code, and tree logging
3619 *
3620 * This does not cow, but it does stuff the starting key it finds back
3621 * into min_key, so you can call btrfs_search_slot with cow=1 on the
3622 * key and get a writable path.
3623 *
3624 * This does lock as it descends, and path->keep_locks should be set
3625 * to 1 by the caller.
3626 *
3627 * This honors path->lowest_level to prevent descent past a given level
3628 * of the tree.
3629 *
3630 * min_trans indicates the oldest transaction that you are interested
3631 * in walking through. Any nodes or leaves older than min_trans are
3632 * skipped over (without reading them).
3633 *
3634 * returns zero if something useful was found, < 0 on error and 1 if there
3635 * was nothing in the tree that matched the search criteria.
3636 */
3637int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
3638 struct btrfs_key *max_key,
3639 struct btrfs_path *path, int cache_only,
3640 u64 min_trans)
3641{
3642 struct extent_buffer *cur;
3643 struct btrfs_key found_key;
3644 int slot;
3645 int sret;
3646 u32 nritems;
3647 int level;
3648 int ret = 1;
3649
3650 WARN_ON(!path->keep_locks);
3651again:
3652 cur = btrfs_lock_root_node(root);
3653 level = btrfs_header_level(cur);
3654 WARN_ON(path->nodes[level]);
3655 path->nodes[level] = cur;
3656 path->locks[level] = 1;
3657
3658 if (btrfs_header_generation(cur) < min_trans) {
3659 ret = 1;
3660 goto out;
3661 }
3662 while (1) {
3663 nritems = btrfs_header_nritems(cur);
3664 level = btrfs_header_level(cur);
3665 sret = bin_search(cur, min_key, level, &slot);
3666
3667 /* at the lowest level, we're done, setup the path and exit */
3668 if (level == path->lowest_level) {
3669 if (slot >= nritems)
3670 goto find_next_key;
3671 ret = 0;
3672 path->slots[level] = slot;
3673 btrfs_item_key_to_cpu(cur, &found_key, slot);
3674 goto out;
3675 }
3676 if (sret && slot > 0)
3677 slot--;
3678 /*
3679 * check this node pointer against the cache_only and
3680 * min_trans parameters. If it isn't in cache or is too
3681 * old, skip to the next one.
3682 */
3683 while (slot < nritems) {
3684 u64 blockptr;
3685 u64 gen;
3686 struct extent_buffer *tmp;
3687 struct btrfs_disk_key disk_key;
3688
3689 blockptr = btrfs_node_blockptr(cur, slot);
3690 gen = btrfs_node_ptr_generation(cur, slot);
3691 if (gen < min_trans) {
3692 slot++;
3693 continue;
3694 }
3695 if (!cache_only)
3696 break;
3697
3698 if (max_key) {
3699 btrfs_node_key(cur, &disk_key, slot);
3700 if (comp_keys(&disk_key, max_key) >= 0) {
3701 ret = 1;
3702 goto out;
3703 }
3704 }
3705
3706 tmp = btrfs_find_tree_block(root, blockptr,
3707 btrfs_level_size(root, level - 1));
3708
3709 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
3710 free_extent_buffer(tmp);
3711 break;
3712 }
3713 if (tmp)
3714 free_extent_buffer(tmp);
3715 slot++;
3716 }
3717find_next_key:
3718 /*
3719 * we didn't find a candidate key in this node, walk forward
3720 * and find another one
3721 */
3722 if (slot >= nritems) {
3723 path->slots[level] = slot;
3724 sret = btrfs_find_next_key(root, path, min_key, level,
3725 cache_only, min_trans);
3726 if (sret == 0) {
3727 btrfs_release_path(root, path);
3728 goto again;
3729 } else {
3730 goto out;
3731 }
3732 }
3733 /* save our key for returning back */
3734 btrfs_node_key_to_cpu(cur, &found_key, slot);
3735 path->slots[level] = slot;
3736 if (level == path->lowest_level) {
3737 ret = 0;
3738 unlock_up(path, level, 1);
3739 goto out;
3740 }
3741 cur = read_node_slot(root, cur, slot);
3742
3743 btrfs_tree_lock(cur);
3744 path->locks[level - 1] = 1;
3745 path->nodes[level - 1] = cur;
3746 unlock_up(path, level, 1);
3747 }
3748out:
3749 if (ret == 0)
3750 memcpy(min_key, &found_key, sizeof(found_key));
3751 return ret;
3752}
3753
3754/*
3755 * this is similar to btrfs_next_leaf, but does not try to preserve
3756 * and fixup the path. It looks for and returns the next key in the
3757 * tree based on the current path and the cache_only and min_trans
3758 * parameters.
3759 *
3760 * 0 is returned if another key is found, < 0 if there are any errors
3761 * and 1 is returned if there are no higher keys in the tree
3762 *
3763 * path->keep_locks should be set to 1 on the search made before
3764 * calling this function.
3765 */
3766int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
3767 struct btrfs_key *key, int lowest_level,
3768 int cache_only, u64 min_trans)
3769{
3770 int level = lowest_level;
3771 int slot;
3772 struct extent_buffer *c;
3773
3774 WARN_ON(!path->keep_locks);
3775 while (level < BTRFS_MAX_LEVEL) {
3776 if (!path->nodes[level])
3777 return 1;
3778
3779 slot = path->slots[level] + 1;
3780 c = path->nodes[level];
3781next:
3782 if (slot >= btrfs_header_nritems(c)) {
3783 level++;
3784 if (level == BTRFS_MAX_LEVEL)
3785 return 1;
3786 continue;
3787 }
3788 if (level == 0)
3789 btrfs_item_key_to_cpu(c, key, slot);
3790 else {
3791 u64 blockptr = btrfs_node_blockptr(c, slot);
3792 u64 gen = btrfs_node_ptr_generation(c, slot);
3793
3794 if (cache_only) {
3795 struct extent_buffer *cur;
3796 cur = btrfs_find_tree_block(root, blockptr,
3797 btrfs_level_size(root, level - 1));
3798 if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
3799 slot++;
3800 if (cur)
3801 free_extent_buffer(cur);
3802 goto next;
3803 }
3804 free_extent_buffer(cur);
3805 }
3806 if (gen < min_trans) {
3807 slot++;
3808 goto next;
3809 }
3810 btrfs_node_key_to_cpu(c, key, slot);
3811 }
3812 return 0;
3813 }
3814 return 1;
3815}
3816
3817/*
3818 * search the tree again to find a leaf with greater keys
3819 * returns 0 if it found something or 1 if there are no greater leaves.
3820 * returns < 0 on io errors.
3821 */
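/*
 * works by re-searching for the last key in the current leaf, then
 * walking up until a node with a right sibling pointer is found and
 * descending along the leftmost edge of that sibling
 */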
3822int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3823{
3824 int slot;
3825 int level = 1;
3826 struct extent_buffer *c;
3827 struct extent_buffer *next = NULL;
3828 struct btrfs_key key;
3829 u32 nritems;
3830 int ret;
3831
3832 nritems = btrfs_header_nritems(path->nodes[0]);
3833 if (nritems == 0)
3834 return 1;
3835
3836 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
3837
3838 btrfs_release_path(root, path);
3839 path->keep_locks = 1;
3840 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3841 path->keep_locks = 0;
3842
3843 if (ret < 0)
3844 return ret;
3845
3846 nritems = btrfs_header_nritems(path->nodes[0]);
3847 /*
3848 * by releasing the path above we dropped all our locks. A balance
3849 * could have added more items next to the key that used to be
3850 * at the very end of the block. So, check again here and
3851 * advance the path if there are now more items available.
3852 */
3853 if (nritems > 0 && path->slots[0] < nritems - 1) {
3854 path->slots[0]++;
3855 goto done;
3856 }
3857
3858 while (level < BTRFS_MAX_LEVEL) {
3859 if (!path->nodes[level])
3860 return 1;
3861
3862 slot = path->slots[level] + 1;
3863 c = path->nodes[level];
3864 if (slot >= btrfs_header_nritems(c)) {
3865 level++;
3866 if (level == BTRFS_MAX_LEVEL)
3867 return 1;
3868 continue;
3869 }
3870
3871 if (next) {
3872 btrfs_tree_unlock(next);
3873 free_extent_buffer(next);
3874 }
3875
3876 if (level == 1 && (path->locks[1] || path->skip_locking) &&
3877 path->reada)
3878 reada_for_search(root, path, level, slot, 0);
3879
3880 next = read_node_slot(root, c, slot);
3881 if (!path->skip_locking) {
3882 WARN_ON(!btrfs_tree_locked(c));
3883 btrfs_tree_lock(next);
3884 }
3885 break;
3886 }
3887 path->slots[level] = slot;
3888 while (1) {
3889 level--;
3890 c = path->nodes[level];
3891 if (path->locks[level])
3892 btrfs_tree_unlock(c);
3893 free_extent_buffer(c);
3894 path->nodes[level] = next;
3895 path->slots[level] = 0;
3896 if (!path->skip_locking)
3897 path->locks[level] = 1;
3898 if (!level)
3899 break;
3900 if (level == 1 && path->locks[1] && path->reada)
3901 reada_for_search(root, path, level, slot, 0);
3902 next = read_node_slot(root, next, 0);
3903 if (!path->skip_locking) {
3904 WARN_ON(!btrfs_tree_locked(path->nodes[level]));
3905 btrfs_tree_lock(next);
3906 }
3907 }
3908done:
3909 unlock_up(path, 0, 1);
3910 return 0;
3911}
3912
3913/*
3914 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
3915 * searching until it gets past min_objectid or finds an item of 'type'
3916 *
3917 * returns 0 if something is found, 1 if nothing was found and < 0 on error
3918 */
3919int btrfs_previous_item(struct btrfs_root *root,
3920 struct btrfs_path *path, u64 min_objectid,
3921 int type)
3922{
3923 struct btrfs_key found_key;
3924 struct extent_buffer *leaf;
3925 u32 nritems;
3926 int ret;
3927
3928 while (1) {
3929 if (path->slots[0] == 0) {
3930 ret = btrfs_prev_leaf(root, path);
3931 if (ret != 0)
3932 return ret;
3933 } else {
3934 path->slots[0]--;
3935 }
3936 leaf = path->nodes[0];
3937 nritems = btrfs_header_nritems(leaf);
3938 if (nritems == 0)
3939 return 1;
3940 if (path->slots[0] == nritems)
3941 path->slots[0]--;
3942
3943 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3944 if (found_key.type == type)
3945 return 0;
3946 if (found_key.objectid < min_objectid)
3947 break;
3948 if (found_key.objectid == min_objectid &&
3949 found_key.type < type)
3950 break;
3951 }
3952 return 1;
3953}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
new file mode 100644
index 000000000000..eee060f88113
--- /dev/null
+++ b/fs/btrfs/ctree.h
@@ -0,0 +1,2129 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_CTREE__
20#define __BTRFS_CTREE__
21
22#include <linux/version.h>
23#include <linux/mm.h>
24#include <linux/highmem.h>
25#include <linux/fs.h>
26#include <linux/completion.h>
27#include <linux/backing-dev.h>
28#include <linux/wait.h>
29#include <asm/kmap_types.h>
30#include "extent_io.h"
31#include "extent_map.h"
32#include "async-thread.h"
33
34struct btrfs_trans_handle;
35struct btrfs_transaction;
36extern struct kmem_cache *btrfs_trans_handle_cachep;
37extern struct kmem_cache *btrfs_transaction_cachep;
38extern struct kmem_cache *btrfs_bit_radix_cachep;
39extern struct kmem_cache *btrfs_path_cachep;
40struct btrfs_ordered_sum;
41
42#define BTRFS_MAGIC "_BHRfS_M"
43
44#define BTRFS_ACL_NOT_CACHED ((void *)-1)
45
46#ifdef CONFIG_LOCKDEP
47# define BTRFS_MAX_LEVEL 7
48#else
49# define BTRFS_MAX_LEVEL 8
50#endif
51
52/* holds pointers to all of the tree roots */
53#define BTRFS_ROOT_TREE_OBJECTID 1ULL
54
55/* stores information about which extents are in use, and reference counts */
56#define BTRFS_EXTENT_TREE_OBJECTID 2ULL
57
58/*
59 * chunk tree stores translations from logical -> physical block numbering
60 * the super block points to the chunk tree
61 */
62#define BTRFS_CHUNK_TREE_OBJECTID 3ULL
63
64/*
65 * stores information about which areas of a given device are in use.
66 * one per device. The tree of tree roots points to the device tree
67 */
68#define BTRFS_DEV_TREE_OBJECTID 4ULL
69
70/* one per subvolume, storing files and directories */
71#define BTRFS_FS_TREE_OBJECTID 5ULL
72
73/* directory objectid inside the root tree */
74#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
75
76/* holds checksums of all the data extents */
77#define BTRFS_CSUM_TREE_OBJECTID 7ULL
78
79/* orphan objectid for tracking unlinked/truncated files */
80#define BTRFS_ORPHAN_OBJECTID -5ULL
81
82/* does write ahead logging to speed up fsyncs */
83#define BTRFS_TREE_LOG_OBJECTID -6ULL
84#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL
85
86/* for space balancing */
87#define BTRFS_TREE_RELOC_OBJECTID -8ULL
88#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL
89
90/*
91 * extent checksums all have this objectid
92 * this allows them to share the logging tree
93 * for fsyncs
94 */
95#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
96
97/* dummy objectid represents multiple objectids */
98#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
99
100/*
101 * All files have objectids in this range.
102 */
103#define BTRFS_FIRST_FREE_OBJECTID 256ULL
104#define BTRFS_LAST_FREE_OBJECTID -256ULL
105#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL
106
107
108/*
109 * the device items go into the chunk tree. The key is in the form
110 * [ 1 BTRFS_DEV_ITEM_KEY device_id ]
111 */
112#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
113
114/*
115 * we can actually store much bigger names, but let's not confuse the rest
116 * of linux
117 */
118#define BTRFS_NAME_LEN 255
119
120/* 32 bytes in various csum fields */
121#define BTRFS_CSUM_SIZE 32
122
123/* csum types */
124#define BTRFS_CSUM_TYPE_CRC32 0
125
126static int btrfs_csum_sizes[] = { 4, 0 };
127
128/* four bytes for CRC32 */
129#define BTRFS_EMPTY_DIR_SIZE 0
130
131#define BTRFS_FT_UNKNOWN 0
132#define BTRFS_FT_REG_FILE 1
133#define BTRFS_FT_DIR 2
134#define BTRFS_FT_CHRDEV 3
135#define BTRFS_FT_BLKDEV 4
136#define BTRFS_FT_FIFO 5
137#define BTRFS_FT_SOCK 6
138#define BTRFS_FT_SYMLINK 7
139#define BTRFS_FT_XATTR 8
140#define BTRFS_FT_MAX 9
141
142/*
143 * the key defines the order in the tree, and so it also defines (optimal)
144 * block layout. objectid corresponds to the inode number. The flags
145 * tell us things about the object, and act as a kind of stream selector.
146 * so for a given inode, keys with flags of 1 might refer to the inode
147 * data, flags of 2 may point to file data in the btree and flags == 3
148 * may point to extents.
149 *
150 * offset is the starting byte offset for this key in the stream.
151 *
152 * btrfs_disk_key is in disk byte order. struct btrfs_key is always
153 * in cpu native order. Otherwise they are identical and their sizes
154 * should be the same (ie both packed)
155 */
156struct btrfs_disk_key {
157 __le64 objectid;
158 u8 type;
159 __le64 offset;
160} __attribute__ ((__packed__));
161
162struct btrfs_key {
163 u64 objectid;
164 u8 type;
165 u64 offset;
166} __attribute__ ((__packed__));
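/*
 * keys compare by objectid first, then type, then offset, so all the
 * items for one object sort together, grouped by stream: e.g.
 * (256 INODE_ITEM 0) < (256 EXTENT_DATA 0) < (256 EXTENT_DATA 4096)
 * < (257 INODE_ITEM 0)
 */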
167
168struct btrfs_mapping_tree {
169 struct extent_map_tree map_tree;
170};
171
172#define BTRFS_UUID_SIZE 16
173struct btrfs_dev_item {
174 /* the internal btrfs device id */
175 __le64 devid;
176
177 /* size of the device */
178 __le64 total_bytes;
179
180 /* bytes used */
181 __le64 bytes_used;
182
183 /* optimal io alignment for this device */
184 __le32 io_align;
185
186 /* optimal io width for this device */
187 __le32 io_width;
188
189 /* minimal io size for this device */
190 __le32 sector_size;
191
192 /* type and info about this device */
193 __le64 type;
194
195 /* expected generation for this device */
196 __le64 generation;
197
198 /*
199 * starting byte of this partition on the device,
200 * to allow for stripe alignment in the future
201 */
202 __le64 start_offset;
203
204 /* grouping information for allocation decisions */
205 __le32 dev_group;
206
207 /* seek speed 0-100 where 100 is fastest */
208 u8 seek_speed;
209
210 /* bandwidth 0-100 where 100 is fastest */
211 u8 bandwidth;
212
213 /* btrfs generated uuid for this device */
214 u8 uuid[BTRFS_UUID_SIZE];
215
216 /* uuid of FS who owns this device */
217 u8 fsid[BTRFS_UUID_SIZE];
218} __attribute__ ((__packed__));
219
220struct btrfs_stripe {
221 __le64 devid;
222 __le64 offset;
223 u8 dev_uuid[BTRFS_UUID_SIZE];
224} __attribute__ ((__packed__));
225
226struct btrfs_chunk {
227 /* size of this chunk in bytes */
228 __le64 length;
229
230 /* objectid of the root referencing this chunk */
231 __le64 owner;
232
233 __le64 stripe_len;
234 __le64 type;
235
236 /* optimal io alignment for this chunk */
237 __le32 io_align;
238
239 /* optimal io width for this chunk */
240 __le32 io_width;
241
242 /* minimal io size for this chunk */
243 __le32 sector_size;
244
245 /* 2^16 stripes is quite a lot; a second limit is the size of a single
246 * item in the btree
247 */
248 __le16 num_stripes;
249
250 /* sub stripes only matter for raid10 */
251 __le16 sub_stripes;
252 struct btrfs_stripe stripe;
253 /* additional stripes go here */
254} __attribute__ ((__packed__));
255
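/*
 * one struct btrfs_stripe is embedded in struct btrfs_chunk itself,
 * hence the num_stripes - 1 in the size calculation below
 */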
256static inline unsigned long btrfs_chunk_item_size(int num_stripes)
257{
258 BUG_ON(num_stripes == 0);
259 return sizeof(struct btrfs_chunk) +
260 sizeof(struct btrfs_stripe) * (num_stripes - 1);
261}
262
263#define BTRFS_FSID_SIZE 16
264#define BTRFS_HEADER_FLAG_WRITTEN (1 << 0)
265
266/*
267 * every tree block (leaf or node) starts with this header.
268 */
269struct btrfs_header {
270 /* these first four must match the super block */
271 u8 csum[BTRFS_CSUM_SIZE];
272 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
273 __le64 bytenr; /* which block this node is supposed to live in */
274 __le64 flags;
275
276 /* allowed to be different from the super from here on down */
277 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
278 __le64 generation;
279 __le64 owner;
280 __le32 nritems;
281 u8 level;
282} __attribute__ ((__packed__));
283
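/*
 * sizing helpers: a node holds as many key/blockptr pairs as fit
 * after the header, a leaf's data area is everything after the
 * header, and an inline file extent must fit in that area together
 * with its item header and btrfs_file_extent_item metadata
 */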
284#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \
285 sizeof(struct btrfs_header)) / \
286 sizeof(struct btrfs_key_ptr))
287#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
288#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize))
289#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
290 sizeof(struct btrfs_item) - \
291 sizeof(struct btrfs_file_extent_item))
292
293#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
294
295/*
296 * this is a very generous portion of the super block, giving us
297 * room to translate 14 chunks with 3 stripes each.
298 */
299#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
300#define BTRFS_LABEL_SIZE 256
301
302/*
303 * the super block basically lists the main trees of the FS
304 * it currently lacks any block count etc etc
305 */
306struct btrfs_super_block {
307 u8 csum[BTRFS_CSUM_SIZE];
308 /* the first 4 fields must match struct btrfs_header */
309 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
310 __le64 bytenr; /* this block number */
311 __le64 flags;
312
313 /* allowed to be different from the btrfs_header from here on down */
314 __le64 magic;
315 __le64 generation;
316 __le64 root;
317 __le64 chunk_root;
318 __le64 log_root;
319
320 /* this will help find the new super based on the log root */
321 __le64 log_root_transid;
322 __le64 total_bytes;
323 __le64 bytes_used;
324 __le64 root_dir_objectid;
325 __le64 num_devices;
326 __le32 sectorsize;
327 __le32 nodesize;
328 __le32 leafsize;
329 __le32 stripesize;
330 __le32 sys_chunk_array_size;
331 __le64 chunk_root_generation;
332 __le64 compat_flags;
333 __le64 compat_ro_flags;
334 __le64 incompat_flags;
335 __le16 csum_type;
336 u8 root_level;
337 u8 chunk_root_level;
338 u8 log_root_level;
339 struct btrfs_dev_item dev_item;
340
341 char label[BTRFS_LABEL_SIZE];
342
343 /* future expansion */
344 __le64 reserved[32];
345 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
346} __attribute__ ((__packed__));
347
348/*
349 * Compat flags that we support. If any incompat flags are set other than the
350 * ones specified below then we will fail to mount
351 */
352#define BTRFS_FEATURE_COMPAT_SUPP 0x0
353#define BTRFS_FEATURE_COMPAT_RO_SUPP 0x0
354#define BTRFS_FEATURE_INCOMPAT_SUPP 0x0
355
356/*
357 * A leaf is full of items. offset and size tell us where to find
358 * the item in the leaf (relative to the start of the data area)
359 */
360struct btrfs_item {
361 struct btrfs_disk_key key;
362 __le32 offset;
363 __le32 size;
364} __attribute__ ((__packed__));
365
366/*
367 * leaves have an item area and a data area:
368 * [item0, item1....itemN] [free space] [dataN...data1, data0]
369 *
370 * The data is separate from the items to get the keys closer together
371 * during searches.
372 */
373struct btrfs_leaf {
374 struct btrfs_header header;
375 struct btrfs_item items[];
376} __attribute__ ((__packed__));
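/*
 * example: in a leaf with two items, data0 sits at the very end of
 * the data area and data1 is packed just below it, so the free space
 * is the gap between the last struct btrfs_item and the lowest data
 * offset
 */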
377
378/*
379 * all non-leaf blocks are nodes, they hold only keys and pointers to
380 * other blocks
381 */
382struct btrfs_key_ptr {
383 struct btrfs_disk_key key;
384 __le64 blockptr;
385 __le64 generation;
386} __attribute__ ((__packed__));
387
388struct btrfs_node {
389 struct btrfs_header header;
390 struct btrfs_key_ptr ptrs[];
391} __attribute__ ((__packed__));
392
393/*
394 * btrfs_paths remember the path taken from the root down to the leaf.
395 * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point
396 * to any other levels that are present.
397 *
398 * The slots array records the index of the item or block pointer
399 * used while walking the tree.
400 */
401struct btrfs_path {
402 struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
403 int slots[BTRFS_MAX_LEVEL];
404 /* if there is real range locking, this locks field will change */
405 int locks[BTRFS_MAX_LEVEL];
406 int reada;
407 /* keep some upper locks as we walk down */
408 int keep_locks;
409 int skip_locking;
410 int lowest_level;
411
412 /*
413 * set by btrfs_split_item, tells search_slot to keep all locks
414 * and to force calls to keep space in the nodes
415 */
416 int search_for_split;
417};
418
419/*
420 * items in the extent btree are used to record the objectid of the
421 * owner of the block and the number of references
422 */
423struct btrfs_extent_item {
424 __le32 refs;
425} __attribute__ ((__packed__));
426
427struct btrfs_extent_ref {
428 __le64 root;
429 __le64 generation;
430 __le64 objectid;
431 __le32 num_refs;
432} __attribute__ ((__packed__));
433
434/* dev extents record space in use on individual devices. The owner
435 * field points back to the chunk allocation mapping tree that allocated
436 * the extent. The chunk tree uuid field is a way to double check the owner
437 */
438struct btrfs_dev_extent {
439 __le64 chunk_tree;
440 __le64 chunk_objectid;
441 __le64 chunk_offset;
442 __le64 length;
443 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
444} __attribute__ ((__packed__));
445
446struct btrfs_inode_ref {
447 __le64 index;
448 __le16 name_len;
449 /* name goes here */
450} __attribute__ ((__packed__));
451
452struct btrfs_timespec {
453 __le64 sec;
454 __le32 nsec;
455} __attribute__ ((__packed__));
456
457typedef enum {
458 BTRFS_COMPRESS_NONE = 0,
459 BTRFS_COMPRESS_ZLIB = 1,
460 BTRFS_COMPRESS_LAST = 2,
461} btrfs_compression_type;
462
463/* we don't understand any encryption methods right now */
464typedef enum {
465 BTRFS_ENCRYPTION_NONE = 0,
466 BTRFS_ENCRYPTION_LAST = 1,
467} btrfs_encryption_type;
468
469struct btrfs_inode_item {
470 /* nfs style generation number */
471 __le64 generation;
472 /* transid that last touched this inode */
473 __le64 transid;
474 __le64 size;
475 __le64 nbytes;
476 __le64 block_group;
477 __le32 nlink;
478 __le32 uid;
479 __le32 gid;
480 __le32 mode;
481 __le64 rdev;
482 __le64 flags;
483
484 /* modification sequence number for NFS */
485 __le64 sequence;
486
487 /*
488 * a little future expansion, for more than this we can
489 * just grow the inode item and version it
490 */
491 __le64 reserved[4];
492 struct btrfs_timespec atime;
493 struct btrfs_timespec ctime;
494 struct btrfs_timespec mtime;
495 struct btrfs_timespec otime;
496} __attribute__ ((__packed__));
497
498struct btrfs_dir_log_item {
499 __le64 end;
500} __attribute__ ((__packed__));
501
502struct btrfs_dir_item {
503 struct btrfs_disk_key location;
504 __le64 transid;
505 __le16 data_len;
506 __le16 name_len;
507 u8 type;
508} __attribute__ ((__packed__));
509
510struct btrfs_root_item {
511 struct btrfs_inode_item inode;
512 __le64 generation;
513 __le64 root_dirid;
514 __le64 bytenr;
515 __le64 byte_limit;
516 __le64 bytes_used;
517 __le64 last_snapshot;
518 __le64 flags;
519 __le32 refs;
520 struct btrfs_disk_key drop_progress;
521 u8 drop_level;
522 u8 level;
523} __attribute__ ((__packed__));
524
525/*
526 * this is used for both forward and backward root refs
527 */
528struct btrfs_root_ref {
529 __le64 dirid;
530 __le64 sequence;
531 __le16 name_len;
532} __attribute__ ((__packed__));
533
534#define BTRFS_FILE_EXTENT_INLINE 0
535#define BTRFS_FILE_EXTENT_REG 1
536#define BTRFS_FILE_EXTENT_PREALLOC 2
537
538struct btrfs_file_extent_item {
539 /*
540 * transaction id that created this extent
541 */
542 __le64 generation;
543 /*
544 * max number of bytes to hold this extent in ram
545 * when we split a compressed extent we can't know how big
546 * each of the resulting pieces will be. So, this is
547 * an upper limit on the size of the extent in ram instead of
548 * an exact limit.
549 */
550 __le64 ram_bytes;
551
552 /*
553 * 32 bits for the various ways we might encode the data,
554 * including compression and encryption. If any of these
555 * are set to something a given disk format doesn't understand
556 * it is treated like an incompat flag for reading and writing,
557 * but not for stat.
558 */
559 u8 compression;
560 u8 encryption;
561 __le16 other_encoding; /* spare for later use */
562
563 /* are we inline data or a real extent? */
564 u8 type;
565
566 /*
 567	 * disk space consumed by the extent; checksum blocks are included
 568	 * in these numbers.
569 */
570 __le64 disk_bytenr;
571 __le64 disk_num_bytes;
572 /*
 573	 * the logical offset, in file blocks (no csums), that this extent
 574	 * record is for. This allows a file extent to point into the
 575	 * middle of an existing extent on disk, sharing it between two
 576	 * snapshots (useful if some bytes in the middle of the extent
 577	 * have changed).
578 */
579 __le64 offset;
580 /*
581 * the logical number of file blocks (no csums included). This
582 * always reflects the size uncompressed and without encoding.
583 */
584 __le64 num_bytes;
585
586} __attribute__ ((__packed__));
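
/*
 * Illustrative sketch of reading one of these out of a leaf
 * (btrfs_item_ptr() and the accessors used here are defined later in
 * this file):
 *
 *	struct btrfs_file_extent_item *fi;
 *	u64 len;
 *
 *	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
 *	if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE)
 *		len = btrfs_file_extent_inline_len(leaf, fi);
 *	else
 *		len = btrfs_file_extent_num_bytes(leaf, fi);
 */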
587
588struct btrfs_csum_item {
589 u8 csum;
590} __attribute__ ((__packed__));
591
592/* different types of block groups (and chunks) */
593#define BTRFS_BLOCK_GROUP_DATA (1 << 0)
594#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1)
595#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
596#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3)
597#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
598#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
599#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
600
601struct btrfs_block_group_item {
602 __le64 used;
603 __le64 chunk_objectid;
604 __le64 flags;
605} __attribute__ ((__packed__));
606
607struct btrfs_space_info {
608 u64 flags;
609 u64 total_bytes;
610 u64 bytes_used;
611 u64 bytes_pinned;
612 u64 bytes_reserved;
613 u64 bytes_readonly;
614 int full;
615 int force_alloc;
616 struct list_head list;
617
618 /* for block groups in our same type */
619 struct list_head block_groups;
620 spinlock_t lock;
621 struct rw_semaphore groups_sem;
622};
623
624struct btrfs_free_space {
625 struct rb_node bytes_index;
626 struct rb_node offset_index;
627 u64 offset;
628 u64 bytes;
629};
630
631struct btrfs_block_group_cache {
632 struct btrfs_key key;
633 struct btrfs_block_group_item item;
634 spinlock_t lock;
635 struct mutex alloc_mutex;
636 struct mutex cache_mutex;
637 u64 pinned;
638 u64 reserved;
639 u64 flags;
640 int cached;
641 int ro;
642 int dirty;
643
644 struct btrfs_space_info *space_info;
645
646 /* free space cache stuff */
647 struct rb_root free_space_bytes;
648 struct rb_root free_space_offset;
649
650 /* block group cache stuff */
651 struct rb_node cache_node;
652
653 /* for block groups in the same raid type */
654 struct list_head list;
655
656 /* usage count */
657 atomic_t count;
658};
659
660struct btrfs_leaf_ref_tree {
661 struct rb_root root;
662 struct list_head list;
663 spinlock_t lock;
664};
665
666struct btrfs_device;
667struct btrfs_fs_devices;
668struct btrfs_fs_info {
669 u8 fsid[BTRFS_FSID_SIZE];
670 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
671 struct btrfs_root *extent_root;
672 struct btrfs_root *tree_root;
673 struct btrfs_root *chunk_root;
674 struct btrfs_root *dev_root;
675 struct btrfs_root *fs_root;
676 struct btrfs_root *csum_root;
677
678 /* the log root tree is a directory of all the other log roots */
679 struct btrfs_root *log_root_tree;
680 struct radix_tree_root fs_roots_radix;
681
682 /* block group cache stuff */
683 spinlock_t block_group_cache_lock;
684 struct rb_root block_group_cache_tree;
685
686 struct extent_io_tree pinned_extents;
687 struct extent_io_tree pending_del;
688 struct extent_io_tree extent_ins;
689
690 /* logical->physical extent mapping */
691 struct btrfs_mapping_tree mapping_tree;
692
693 u64 generation;
694 u64 last_trans_committed;
695 u64 last_trans_new_blockgroup;
696 u64 open_ioctl_trans;
697 unsigned long mount_opt;
698 u64 max_extent;
699 u64 max_inline;
700 u64 alloc_start;
701 struct btrfs_transaction *running_transaction;
702 wait_queue_head_t transaction_throttle;
703 wait_queue_head_t transaction_wait;
704
705 wait_queue_head_t async_submit_wait;
706 wait_queue_head_t tree_log_wait;
707
708 struct btrfs_super_block super_copy;
709 struct btrfs_super_block super_for_commit;
710 struct block_device *__bdev;
711 struct super_block *sb;
712 struct inode *btree_inode;
713 struct backing_dev_info bdi;
714 spinlock_t hash_lock;
715 struct mutex trans_mutex;
716 struct mutex tree_log_mutex;
717 struct mutex transaction_kthread_mutex;
718 struct mutex cleaner_mutex;
719 struct mutex extent_ins_mutex;
720 struct mutex pinned_mutex;
721 struct mutex chunk_mutex;
722 struct mutex drop_mutex;
723 struct mutex volume_mutex;
724 struct mutex tree_reloc_mutex;
725 struct list_head trans_list;
726 struct list_head hashers;
727 struct list_head dead_roots;
728
729 atomic_t nr_async_submits;
730 atomic_t async_submit_draining;
731 atomic_t nr_async_bios;
732 atomic_t async_delalloc_pages;
733 atomic_t tree_log_writers;
734 atomic_t tree_log_commit;
735 unsigned long tree_log_batch;
736 u64 tree_log_transid;
737
738 /*
739 * this is used by the balancing code to wait for all the pending
740 * ordered extents
741 */
742 spinlock_t ordered_extent_lock;
743 struct list_head ordered_extents;
744 struct list_head delalloc_inodes;
745
746 /*
747 * there is a pool of worker threads for checksumming during writes
748 * and a pool for checksumming after reads. This is because readers
749 * can run with FS locks held, and the writers may be waiting for
750 * those locks. We don't want ordering in the pending list to cause
751 * deadlocks, and so the two are serviced separately.
752 *
753 * A third pool does submit_bio to avoid deadlocking with the other
 754	 * two.
755 */
756 struct btrfs_workers workers;
757 struct btrfs_workers delalloc_workers;
758 struct btrfs_workers endio_workers;
759 struct btrfs_workers endio_meta_workers;
760 struct btrfs_workers endio_meta_write_workers;
761 struct btrfs_workers endio_write_workers;
762 struct btrfs_workers submit_workers;
763 /*
764 * fixup workers take dirty pages that didn't properly go through
 765	 * the cow mechanism and make them safe to write. This happens
 766	 * on the sys_munmap call path.
767 */
768 struct btrfs_workers fixup_workers;
769 struct task_struct *transaction_kthread;
770 struct task_struct *cleaner_kthread;
771 int thread_pool_size;
772
 773	/* tree relocation fields */
774 struct list_head dead_reloc_roots;
775 struct btrfs_leaf_ref_tree reloc_ref_tree;
776 struct btrfs_leaf_ref_tree shared_ref_tree;
777
778 struct kobject super_kobj;
779 struct completion kobj_unregister;
780 int do_barriers;
781 int closing;
782 int log_root_recovering;
783 atomic_t throttles;
784 atomic_t throttle_gen;
785
786 u64 total_pinned;
787 struct list_head dirty_cowonly_roots;
788
789 struct btrfs_fs_devices *fs_devices;
790 struct list_head space_info;
791 spinlock_t delalloc_lock;
792 spinlock_t new_trans_lock;
793 u64 delalloc_bytes;
794 u64 last_alloc;
795 u64 last_data_alloc;
796
797 spinlock_t ref_cache_lock;
798 u64 total_ref_cache_size;
799
800 u64 avail_data_alloc_bits;
801 u64 avail_metadata_alloc_bits;
802 u64 avail_system_alloc_bits;
803 u64 data_alloc_profile;
804 u64 metadata_alloc_profile;
805 u64 system_alloc_profile;
806
807 void *bdev_holder;
808};
809
810/*
 811 * in ram representation of a tree. extent_root is used for all block
 812 * allocations and holds the extent tree itself.
813 */
814struct btrfs_dirty_root;
815struct btrfs_root {
816 struct extent_buffer *node;
817
818 /* the node lock is held while changing the node pointer */
819 spinlock_t node_lock;
820
821 struct extent_buffer *commit_root;
822 struct btrfs_leaf_ref_tree *ref_tree;
823 struct btrfs_leaf_ref_tree ref_tree_struct;
824 struct btrfs_dirty_root *dirty_root;
825 struct btrfs_root *log_root;
826 struct btrfs_root *reloc_root;
827
828 struct btrfs_root_item root_item;
829 struct btrfs_key root_key;
830 struct btrfs_fs_info *fs_info;
831 struct extent_io_tree dirty_log_pages;
832
833 struct kobject root_kobj;
834 struct completion kobj_unregister;
835 struct mutex objectid_mutex;
836 struct mutex log_mutex;
837
838 u64 objectid;
839 u64 last_trans;
840
841 /* data allocations are done in sectorsize units */
842 u32 sectorsize;
843
844 /* node allocations are done in nodesize units */
845 u32 nodesize;
846
847 /* leaf allocations are done in leafsize units */
848 u32 leafsize;
849
850 u32 stripesize;
851
852 u32 type;
853 u64 highest_inode;
854 u64 last_inode_alloc;
855 int ref_cows;
856 int track_dirty;
857 u64 defrag_trans_start;
858 struct btrfs_key defrag_progress;
859 struct btrfs_key defrag_max;
860 int defrag_running;
861 int defrag_level;
862 char *name;
863 int in_sysfs;
864
865 /* the dirty list is only used by non-reference counted roots */
866 struct list_head dirty_list;
867
868 spinlock_t list_lock;
869 struct list_head dead_list;
870 struct list_head orphan_list;
871
872 /*
873 * right now this just gets used so that a root has its own devid
874 * for stat. It may be used for more later
875 */
876 struct super_block anon_super;
877};
878
879/*
881 * inode items have the data typically returned from stat and store other
882 * info about object characteristics. There is one for every file and dir in
883 * the FS
884 */
885#define BTRFS_INODE_ITEM_KEY 1
886#define BTRFS_INODE_REF_KEY 12
887#define BTRFS_XATTR_ITEM_KEY 24
888#define BTRFS_ORPHAN_ITEM_KEY 48
889/* reserve 2-15 close to the inode for later flexibility */
890
891/*
892 * dir items are the name -> inode pointers in a directory. There is one
893 * for every name in a directory.
894 */
895#define BTRFS_DIR_LOG_ITEM_KEY 60
896#define BTRFS_DIR_LOG_INDEX_KEY 72
897#define BTRFS_DIR_ITEM_KEY 84
898#define BTRFS_DIR_INDEX_KEY 96
899/*
900 * extent data is for file data
901 */
902#define BTRFS_EXTENT_DATA_KEY 108
903
904/*
905 * extent csums are stored in a separate tree and hold csums for
906 * an entire extent on disk.
907 */
908#define BTRFS_EXTENT_CSUM_KEY 128
909
910/*
 911 * root items point to tree roots. They are typically in the root
912 * tree used by the super block to find all the other trees
913 */
914#define BTRFS_ROOT_ITEM_KEY 132
915
916/*
917 * root backrefs tie subvols and snapshots to the directory entries that
918 * reference them
919 */
920#define BTRFS_ROOT_BACKREF_KEY 144
921
922/*
923 * root refs make a fast index for listing all of the snapshots and
924 * subvolumes referenced by a given root. They point directly to the
925 * directory item in the root that references the subvol
926 */
927#define BTRFS_ROOT_REF_KEY 156
928
929/*
930 * extent items are in the extent map tree. These record which blocks
931 * are used, and how many references there are to each block
932 */
933#define BTRFS_EXTENT_ITEM_KEY 168
934#define BTRFS_EXTENT_REF_KEY 180
935
936/*
 937 * block groups give us hints into the extent allocation trees: which
 938 * blocks are free, and so on.
939 */
940#define BTRFS_BLOCK_GROUP_ITEM_KEY 192
941
942#define BTRFS_DEV_EXTENT_KEY 204
943#define BTRFS_DEV_ITEM_KEY 216
944#define BTRFS_CHUNK_ITEM_KEY 228
945
946/*
947 * string items are for debugging. They just store a short string of
948 * data in the FS
949 */
950#define BTRFS_STRING_ITEM_KEY 253
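
/*
 * Example (illustrative only; example_inode_item_key is not a btrfs
 * helper): all items belonging to an inode share the inode number as
 * their objectid, so the inode item itself is looked up with a key
 * built like this:
 */
static inline void example_inode_item_key(struct btrfs_key *key, u64 ino)
{
	key->objectid = ino;
	key->type = BTRFS_INODE_ITEM_KEY;
	key->offset = 0;
}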
951
952#define BTRFS_MOUNT_NODATASUM (1 << 0)
953#define BTRFS_MOUNT_NODATACOW (1 << 1)
954#define BTRFS_MOUNT_NOBARRIER (1 << 2)
955#define BTRFS_MOUNT_SSD (1 << 3)
956#define BTRFS_MOUNT_DEGRADED (1 << 4)
957#define BTRFS_MOUNT_COMPRESS (1 << 5)
958
959#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
960#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
961#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \
962 BTRFS_MOUNT_##opt)
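
/*
 * Example (illustrative only; example_ssd_enabled is not a btrfs
 * helper): the token pasting above means callers name just the flag,
 * so btrfs_test_opt(root, SSD) tests BTRFS_MOUNT_SSD in
 * root->fs_info->mount_opt:
 */
static inline int example_ssd_enabled(struct btrfs_root *root)
{
	return btrfs_test_opt(root, SSD) != 0;
}
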
963/*
964 * Inode flags
965 */
966#define BTRFS_INODE_NODATASUM (1 << 0)
967#define BTRFS_INODE_NODATACOW (1 << 1)
968#define BTRFS_INODE_READONLY (1 << 2)
969#define BTRFS_INODE_NOCOMPRESS (1 << 3)
970#define BTRFS_INODE_PREALLOC (1 << 4)
971#define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \
972 ~BTRFS_INODE_##flag)
973#define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \
974 BTRFS_INODE_##flag)
975#define btrfs_test_flag(inode, flag) (BTRFS_I(inode)->flags & \
976 BTRFS_INODE_##flag)
977/* some macros to generate set/get funcs for the struct fields. This
 978 * assumes there is a lefoo_to_cpu for every type, so let's make a simple
979 * one for u8:
980 */
981#define le8_to_cpu(v) (v)
982#define cpu_to_le8(v) (v)
983#define __le8 u8
984
985#define read_eb_member(eb, ptr, type, member, result) ( \
986 read_extent_buffer(eb, (char *)(result), \
987 ((unsigned long)(ptr)) + \
988 offsetof(type, member), \
989 sizeof(((type *)0)->member)))
990
991#define write_eb_member(eb, ptr, type, member, result) ( \
992 write_extent_buffer(eb, (char *)(result), \
993 ((unsigned long)(ptr)) + \
994 offsetof(type, member), \
995 sizeof(((type *)0)->member)))
996
997#ifndef BTRFS_SETGET_FUNCS
998#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
999u##bits btrfs_##name(struct extent_buffer *eb, type *s); \
1000void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
1001#endif
1002
1003#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
1004static inline u##bits btrfs_##name(struct extent_buffer *eb) \
1005{ \
1006 type *p = kmap_atomic(eb->first_page, KM_USER0); \
1007 u##bits res = le##bits##_to_cpu(p->member); \
1008 kunmap_atomic(p, KM_USER0); \
1009 return res; \
1010} \
1011static inline void btrfs_set_##name(struct extent_buffer *eb, \
1012 u##bits val) \
1013{ \
1014 type *p = kmap_atomic(eb->first_page, KM_USER0); \
1015 p->member = cpu_to_le##bits(val); \
1016 kunmap_atomic(p, KM_USER0); \
1017}
1018
1019#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
1020static inline u##bits btrfs_##name(type *s) \
1021{ \
1022 return le##bits##_to_cpu(s->member); \
1023} \
1024static inline void btrfs_set_##name(type *s, u##bits val) \
1025{ \
1026 s->member = cpu_to_le##bits(val); \
1027}
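
/*
 * For reference, a single invocation such as
 *
 *	BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
 *
 * (used further down in this file) expands to the pair
 *
 *	static inline u8 btrfs_root_level(struct btrfs_root_item *s);
 *	static inline void btrfs_set_root_level(struct btrfs_root_item *s,
 *						u8 val);
 *
 * which read and write s->level through le8_to_cpu()/cpu_to_le8().  The
 * non-stack BTRFS_SETGET_FUNCS variants instead operate on data that
 * still lives inside an extent_buffer.
 */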
1028
1029BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64);
1030BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64);
1031BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
1032BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
1033BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
1034BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item,
1035 start_offset, 64);
1036BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
1037BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
1038BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32);
1039BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8);
1040BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8);
1041BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64);
1042
1043BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64);
1044BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item,
1045 total_bytes, 64);
1046BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item,
1047 bytes_used, 64);
1048BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item,
1049 io_align, 32);
1050BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item,
1051 io_width, 32);
1052BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item,
1053 sector_size, 32);
1054BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64);
1055BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item,
1056 dev_group, 32);
1057BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item,
1058 seek_speed, 8);
1059BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item,
1060 bandwidth, 8);
1061BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item,
1062 generation, 64);
1063
1064static inline char *btrfs_device_uuid(struct btrfs_dev_item *d)
1065{
1066 return (char *)d + offsetof(struct btrfs_dev_item, uuid);
1067}
1068
1069static inline char *btrfs_device_fsid(struct btrfs_dev_item *d)
1070{
1071 return (char *)d + offsetof(struct btrfs_dev_item, fsid);
1072}
1073
1074BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64);
1075BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
1076BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
1077BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32);
1078BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32);
1079BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32);
1080BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64);
1081BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16);
1082BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16);
1083BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64);
1084BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64);
1085
1086static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s)
1087{
1088 return (char *)s + offsetof(struct btrfs_stripe, dev_uuid);
1089}
1090
1091BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64);
1092BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64);
1093BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk,
1094 stripe_len, 64);
1095BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk,
1096 io_align, 32);
1097BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk,
1098 io_width, 32);
1099BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk,
1100 sector_size, 32);
1101BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64);
1102BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk,
1103 num_stripes, 16);
1104BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk,
1105 sub_stripes, 16);
1106BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64);
1107BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64);
1108
1109static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c,
1110 int nr)
1111{
1112 unsigned long offset = (unsigned long)c;
1113 offset += offsetof(struct btrfs_chunk, stripe);
1114 offset += nr * sizeof(struct btrfs_stripe);
1115 return (struct btrfs_stripe *)offset;
1116}
1117
1118static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr)
1119{
1120 return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr));
1121}
1122
1123static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb,
1124 struct btrfs_chunk *c, int nr)
1125{
1126 return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
1127}
1128
1129static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb,
1130 struct btrfs_chunk *c, int nr,
1131 u64 val)
1132{
1133 btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val);
1134}
1135
1136static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb,
1137 struct btrfs_chunk *c, int nr)
1138{
1139 return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
1140}
1141
1142static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb,
1143 struct btrfs_chunk *c, int nr,
1144 u64 val)
1145{
1146 btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val);
1147}
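
/*
 * Example (illustrative only; example_stripes_on_dev is not a btrfs
 * helper): chunk items are variable length, so stripes are reached
 * through btrfs_stripe_nr() rather than a fixed-size array:
 */
static inline int example_stripes_on_dev(struct extent_buffer *eb,
					 struct btrfs_chunk *chunk,
					 u64 devid)
{
	int num = btrfs_chunk_num_stripes(eb, chunk);
	int i, found = 0;

	for (i = 0; i < num; i++)
		if (btrfs_stripe_devid_nr(eb, chunk, i) == devid)
			found++;
	return found;
}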
1148
1149/* struct btrfs_block_group_item */
1150BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
1151 used, 64);
1152BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item,
1153 used, 64);
1154BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid,
1155 struct btrfs_block_group_item, chunk_objectid, 64);
1156
1157BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid,
1158 struct btrfs_block_group_item, chunk_objectid, 64);
1159BTRFS_SETGET_FUNCS(disk_block_group_flags,
1160 struct btrfs_block_group_item, flags, 64);
1161BTRFS_SETGET_STACK_FUNCS(block_group_flags,
1162 struct btrfs_block_group_item, flags, 64);
1163
1164/* struct btrfs_inode_ref */
1165BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
1166BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
1167
1168/* struct btrfs_inode_item */
1169BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
1170BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
1171BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64);
1172BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
1173BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64);
1174BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64);
1175BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32);
1176BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32);
1177BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32);
1178BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);
1179BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64);
1180BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64);
1181
1182static inline struct btrfs_timespec *
1183btrfs_inode_atime(struct btrfs_inode_item *inode_item)
1184{
1185 unsigned long ptr = (unsigned long)inode_item;
1186 ptr += offsetof(struct btrfs_inode_item, atime);
1187 return (struct btrfs_timespec *)ptr;
1188}
1189
1190static inline struct btrfs_timespec *
1191btrfs_inode_mtime(struct btrfs_inode_item *inode_item)
1192{
1193 unsigned long ptr = (unsigned long)inode_item;
1194 ptr += offsetof(struct btrfs_inode_item, mtime);
1195 return (struct btrfs_timespec *)ptr;
1196}
1197
1198static inline struct btrfs_timespec *
1199btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
1200{
1201 unsigned long ptr = (unsigned long)inode_item;
1202 ptr += offsetof(struct btrfs_inode_item, ctime);
1203 return (struct btrfs_timespec *)ptr;
1204}
1205
1206static inline struct btrfs_timespec *
1207btrfs_inode_otime(struct btrfs_inode_item *inode_item)
1208{
1209 unsigned long ptr = (unsigned long)inode_item;
1210 ptr += offsetof(struct btrfs_inode_item, otime);
1211 return (struct btrfs_timespec *)ptr;
1212}
1213
1214BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
1215BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
1216
1217/* struct btrfs_dev_extent */
1218BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent,
1219 chunk_tree, 64);
1220BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
1221 chunk_objectid, 64);
1222BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent,
1223 chunk_offset, 64);
1224BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
1225
1226static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
1227{
1228 unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid);
1229 return (u8 *)((unsigned long)dev + ptr);
1230}
1231
1232/* struct btrfs_extent_ref */
1233BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64);
1234BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64);
1235BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64);
1236BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32);
1237
1238BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64);
1239BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref,
1240 generation, 64);
1241BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref,
1242 objectid, 64);
1243BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref,
1244 num_refs, 32);
1245
1246/* struct btrfs_extent_item */
1247BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32);
1248BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item,
1249 refs, 32);
1250
1251/* struct btrfs_node */
1252BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
1253BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64);
1254
1255static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr)
1256{
1257 unsigned long ptr;
1258 ptr = offsetof(struct btrfs_node, ptrs) +
1259 sizeof(struct btrfs_key_ptr) * nr;
1260 return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr);
1261}
1262
1263static inline void btrfs_set_node_blockptr(struct extent_buffer *eb,
1264 int nr, u64 val)
1265{
1266 unsigned long ptr;
1267 ptr = offsetof(struct btrfs_node, ptrs) +
1268 sizeof(struct btrfs_key_ptr) * nr;
1269 btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val);
1270}
1271
1272static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr)
1273{
1274 unsigned long ptr;
1275 ptr = offsetof(struct btrfs_node, ptrs) +
1276 sizeof(struct btrfs_key_ptr) * nr;
1277 return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr);
1278}
1279
1280static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb,
1281 int nr, u64 val)
1282{
1283 unsigned long ptr;
1284 ptr = offsetof(struct btrfs_node, ptrs) +
1285 sizeof(struct btrfs_key_ptr) * nr;
1286 btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val);
1287}
1288
1289static inline unsigned long btrfs_node_key_ptr_offset(int nr)
1290{
1291 return offsetof(struct btrfs_node, ptrs) +
1292 sizeof(struct btrfs_key_ptr) * nr;
1293}
1294
1295void btrfs_node_key(struct extent_buffer *eb,
1296 struct btrfs_disk_key *disk_key, int nr);
1297
1298static inline void btrfs_set_node_key(struct extent_buffer *eb,
1299 struct btrfs_disk_key *disk_key, int nr)
1300{
1301 unsigned long ptr;
1302 ptr = btrfs_node_key_ptr_offset(nr);
1303 write_eb_member(eb, (struct btrfs_key_ptr *)ptr,
1304 struct btrfs_key_ptr, key, disk_key);
1305}
1306
1307/* struct btrfs_item */
1308BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32);
1309BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32);
1310
1311static inline unsigned long btrfs_item_nr_offset(int nr)
1312{
1313 return offsetof(struct btrfs_leaf, items) +
1314 sizeof(struct btrfs_item) * nr;
1315}
1316
1317static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb,
1318 int nr)
1319{
1320 return (struct btrfs_item *)btrfs_item_nr_offset(nr);
1321}
1322
1323static inline u32 btrfs_item_end(struct extent_buffer *eb,
1324 struct btrfs_item *item)
1325{
1326 return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item);
1327}
1328
1329static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr)
1330{
1331 return btrfs_item_end(eb, btrfs_item_nr(eb, nr));
1332}
1333
1334static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr)
1335{
1336 return btrfs_item_offset(eb, btrfs_item_nr(eb, nr));
1337}
1338
1339static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr)
1340{
1341 return btrfs_item_size(eb, btrfs_item_nr(eb, nr));
1342}
1343
1344static inline void btrfs_item_key(struct extent_buffer *eb,
1345 struct btrfs_disk_key *disk_key, int nr)
1346{
1347 struct btrfs_item *item = btrfs_item_nr(eb, nr);
1348 read_eb_member(eb, item, struct btrfs_item, key, disk_key);
1349}
1350
1351static inline void btrfs_set_item_key(struct extent_buffer *eb,
1352 struct btrfs_disk_key *disk_key, int nr)
1353{
1354 struct btrfs_item *item = btrfs_item_nr(eb, nr);
1355 write_eb_member(eb, item, struct btrfs_item, key, disk_key);
1356}
1357
1358BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64);
1359
1360/* struct btrfs_root_ref */
1363BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64);
1364BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64);
1365BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16);
1366
1367/* struct btrfs_dir_item */
1368BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16);
1369BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8);
1370BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16);
1371BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64);
1372
1373static inline void btrfs_dir_item_key(struct extent_buffer *eb,
1374 struct btrfs_dir_item *item,
1375 struct btrfs_disk_key *key)
1376{
1377 read_eb_member(eb, item, struct btrfs_dir_item, location, key);
1378}
1379
1380static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
1381 struct btrfs_dir_item *item,
1382 struct btrfs_disk_key *key)
1383{
1384 write_eb_member(eb, item, struct btrfs_dir_item, location, key);
1385}
1386
1387/* struct btrfs_disk_key */
1388BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
1389 objectid, 64);
1390BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64);
1391BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8);
1392
1393static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
1394 struct btrfs_disk_key *disk)
1395{
1396 cpu->offset = le64_to_cpu(disk->offset);
1397 cpu->type = disk->type;
1398 cpu->objectid = le64_to_cpu(disk->objectid);
1399}
1400
1401static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk,
1402 struct btrfs_key *cpu)
1403{
1404 disk->offset = cpu_to_le64(cpu->offset);
1405 disk->type = cpu->type;
1406 disk->objectid = cpu_to_le64(cpu->objectid);
1407}
1408
1409static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb,
1410 struct btrfs_key *key, int nr)
1411{
1412 struct btrfs_disk_key disk_key;
1413 btrfs_node_key(eb, &disk_key, nr);
1414 btrfs_disk_key_to_cpu(key, &disk_key);
1415}
1416
1417static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb,
1418 struct btrfs_key *key, int nr)
1419{
1420 struct btrfs_disk_key disk_key;
1421 btrfs_item_key(eb, &disk_key, nr);
1422 btrfs_disk_key_to_cpu(key, &disk_key);
1423}
1424
1425static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb,
1426 struct btrfs_dir_item *item,
1427 struct btrfs_key *key)
1428{
1429 struct btrfs_disk_key disk_key;
1430 btrfs_dir_item_key(eb, item, &disk_key);
1431 btrfs_disk_key_to_cpu(key, &disk_key);
1432}
1433
1435static inline u8 btrfs_key_type(struct btrfs_key *key)
1436{
1437 return key->type;
1438}
1439
1440static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val)
1441{
1442 key->type = val;
1443}
1444
1445/* struct btrfs_header */
1446BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64);
1447BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header,
1448 generation, 64);
1449BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64);
1450BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32);
1451BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64);
1452BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8);
1453
1454static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag)
1455{
1456 return (btrfs_header_flags(eb) & flag) == flag;
1457}
1458
1459static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag)
1460{
1461 u64 flags = btrfs_header_flags(eb);
1462 btrfs_set_header_flags(eb, flags | flag);
1463 return (flags & flag) == flag;
1464}
1465
1466static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
1467{
1468 u64 flags = btrfs_header_flags(eb);
1469 btrfs_set_header_flags(eb, flags & ~flag);
1470 return (flags & flag) == flag;
1471}
1472
1473static inline u8 *btrfs_header_fsid(struct extent_buffer *eb)
1474{
1475 unsigned long ptr = offsetof(struct btrfs_header, fsid);
1476 return (u8 *)ptr;
1477}
1478
1479static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
1480{
1481 unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid);
1482 return (u8 *)ptr;
1483}
1484
1485static inline u8 *btrfs_super_fsid(struct extent_buffer *eb)
1486{
1487 unsigned long ptr = offsetof(struct btrfs_super_block, fsid);
1488 return (u8 *)ptr;
1489}
1490
1491static inline u8 *btrfs_header_csum(struct extent_buffer *eb)
1492{
1493 unsigned long ptr = offsetof(struct btrfs_header, csum);
1494 return (u8 *)ptr;
1495}
1496
1497static inline struct btrfs_node *btrfs_buffer_node(struct extent_buffer *eb)
1498{
1499 return NULL;
1500}
1501
1502static inline struct btrfs_leaf *btrfs_buffer_leaf(struct extent_buffer *eb)
1503{
1504 return NULL;
1505}
1506
1507static inline struct btrfs_header *btrfs_buffer_header(struct extent_buffer *eb)
1508{
1509 return NULL;
1510}
1511
1512static inline int btrfs_is_leaf(struct extent_buffer *eb)
1513{
1514 return btrfs_header_level(eb) == 0;
1515}
1516
1517/* struct btrfs_root_item */
1518BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item,
1519 generation, 64);
1520BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32);
1521BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64);
1522BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8);
1523
1524BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item,
1525 generation, 64);
1526BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64);
1527BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
1528BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64);
1529BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32);
1530BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64);
1531BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
1532BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
1533BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
1534 last_snapshot, 64);
1535
1536/* struct btrfs_super_block */
1537
1538BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
1539BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
1540BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
1541 generation, 64);
1542BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
1543BTRFS_SETGET_STACK_FUNCS(super_sys_array_size,
1544 struct btrfs_super_block, sys_chunk_array_size, 32);
1545BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation,
1546 struct btrfs_super_block, chunk_root_generation, 64);
1547BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,
1548 root_level, 8);
1549BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block,
1550 chunk_root, 64);
1551BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
1552 chunk_root_level, 8);
1553BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block,
1554 log_root, 64);
1555BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block,
1556 log_root_transid, 64);
1557BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
1558 log_root_level, 8);
1559BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
1560 total_bytes, 64);
1561BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block,
1562 bytes_used, 64);
1563BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
1564 sectorsize, 32);
1565BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
1566 nodesize, 32);
1567BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block,
1568 leafsize, 32);
1569BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
1570 stripesize, 32);
1571BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
1572 root_dir_objectid, 64);
1573BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
1574 num_devices, 64);
1575BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
1576 compat_flags, 64);
1577BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
1578 compat_flags, 64);
1579BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
1580 incompat_flags, 64);
1581BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
1582 csum_type, 16);
1583
1584static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
1585{
1586 int t = btrfs_super_csum_type(s);
1587 BUG_ON(t >= ARRAY_SIZE(btrfs_csum_sizes));
1588 return btrfs_csum_sizes[t];
1589}
1590
1591static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
1592{
1593 return offsetof(struct btrfs_leaf, items);
1594}
1595
1596/* struct btrfs_file_extent_item */
1597BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
1598
1599static inline unsigned long
1600btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
1601{
1602 unsigned long offset = (unsigned long)e;
1603 offset += offsetof(struct btrfs_file_extent_item, disk_bytenr);
1604 return offset;
1605}
1606
1607static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
1608{
1609 return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize;
1610}
1611
1612BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
1613 disk_bytenr, 64);
1614BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
1615 generation, 64);
1616BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item,
1617 disk_num_bytes, 64);
1618BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
1619 offset, 64);
1620BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
1621 num_bytes, 64);
1622BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
1623 ram_bytes, 64);
1624BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
1625 compression, 8);
1626BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
1627 encryption, 8);
1628BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
1629 other_encoding, 16);
1630
1631/* this returns the number of file bytes represented by the inline item.
1632 * If an item is compressed, this is the uncompressed size
1633 */
1634static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
1635 struct btrfs_file_extent_item *e)
1636{
1637 return btrfs_file_extent_ram_bytes(eb, e);
1638}
1639
1640/*
1641 * this returns the number of bytes used by the item on disk, minus the
1642 * size of any extent headers. If a file is compressed on disk, this is
1643 * the compressed size
1644 */
1645static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
1646 struct btrfs_item *e)
1647{
1648 unsigned long offset;
1649 offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
1650 return btrfs_item_size(eb, e) - offset;
1651}
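
/*
 * Example (illustrative only; example_inline_savings is not a btrfs
 * helper): for an inline extent the two helpers above can disagree,
 * and the difference is the space saved by compression:
 */
static inline u32 example_inline_savings(struct extent_buffer *eb,
					 struct btrfs_file_extent_item *fi,
					 int slot)
{
	u32 ram = btrfs_file_extent_inline_len(eb, fi);
	u32 disk = btrfs_file_extent_inline_item_len(eb,
						     btrfs_item_nr(eb, slot));

	return ram - disk;	/* 0 unless the inline data is compressed */
}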
1652
1653static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
1654{
1655 return sb->s_fs_info;
1656}
1657
1658static inline int btrfs_set_root_name(struct btrfs_root *root,
1659 const char *name, int len)
1660{
1661 /* if we already have a name just free it */
1662 kfree(root->name);
1663
1664 root->name = kmalloc(len+1, GFP_KERNEL);
1665 if (!root->name)
1666 return -ENOMEM;
1667
1668 memcpy(root->name, name, len);
1669 root->name[len] = '\0';
1670
1671 return 0;
1672}
1673
1674static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
1675{
1676 if (level == 0)
1677 return root->leafsize;
1678 return root->nodesize;
1679}
1680
1681/* helper macro to cast into the data area of the leaf. */
1682#define btrfs_item_ptr(leaf, slot, type) \
1683 ((type *)(btrfs_leaf_data(leaf) + \
1684 btrfs_item_offset_nr(leaf, slot)))
1685
1686#define btrfs_item_ptr_offset(leaf, slot) \
1687 ((unsigned long)(btrfs_leaf_data(leaf) + \
1688 btrfs_item_offset_nr(leaf, slot)))
1689
1690static inline struct dentry *fdentry(struct file *file)
1691{
1692 return file->f_path.dentry;
1693}
1694
1695/* extent-tree.c */
1696int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
1697int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
1698 struct btrfs_root *root, u64 bytenr,
1699 u64 num_bytes, u32 *refs);
1700int btrfs_update_pinned_extents(struct btrfs_root *root,
1701 u64 bytenr, u64 num, int pin);
1702int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
1703 struct btrfs_root *root, struct extent_buffer *leaf);
1704int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1705 struct btrfs_root *root, u64 objectid, u64 bytenr);
1706int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1707 struct btrfs_root *root);
1708int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
1709struct btrfs_block_group_cache *btrfs_lookup_block_group(
1710 struct btrfs_fs_info *info,
1711 u64 bytenr);
1712u64 btrfs_find_block_group(struct btrfs_root *root,
1713 u64 search_start, u64 search_hint, int owner);
1714struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1715 struct btrfs_root *root,
1716 u32 blocksize, u64 parent,
1717 u64 root_objectid,
1718 u64 ref_generation,
1719 int level,
1720 u64 hint,
1721 u64 empty_size);
1722struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
1723 struct btrfs_root *root,
1724 u64 bytenr, u32 blocksize);
1725int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
1726 struct btrfs_root *root,
1727 u64 num_bytes, u64 parent, u64 min_bytes,
1728 u64 root_objectid, u64 ref_generation,
1729 u64 owner, u64 empty_size, u64 hint_byte,
1730 u64 search_end, struct btrfs_key *ins, u64 data);
1731int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
1732 struct btrfs_root *root, u64 parent,
1733 u64 root_objectid, u64 ref_generation,
1734 u64 owner, struct btrfs_key *ins);
1735int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
1736 struct btrfs_root *root, u64 parent,
1737 u64 root_objectid, u64 ref_generation,
1738 u64 owner, struct btrfs_key *ins);
1739int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
1740 struct btrfs_root *root,
1741 u64 num_bytes, u64 min_alloc_size,
1742 u64 empty_size, u64 hint_byte,
1743 u64 search_end, struct btrfs_key *ins,
1744 u64 data);
1745int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1746 struct extent_buffer *orig_buf, struct extent_buffer *buf,
1747 u32 *nr_extents);
1748int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1749 struct extent_buffer *buf, u32 nr_extents);
1750int btrfs_update_ref(struct btrfs_trans_handle *trans,
1751 struct btrfs_root *root, struct extent_buffer *orig_buf,
1752 struct extent_buffer *buf, int start_slot, int nr);
1753int btrfs_free_extent(struct btrfs_trans_handle *trans,
1754 struct btrfs_root *root,
1755 u64 bytenr, u64 num_bytes, u64 parent,
1756 u64 root_objectid, u64 ref_generation,
1757 u64 owner_objectid, int pin);
1758int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
1759int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
1760 struct btrfs_root *root,
1761 struct extent_io_tree *unpin);
1762int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1763 struct btrfs_root *root,
1764 u64 bytenr, u64 num_bytes, u64 parent,
1765 u64 root_objectid, u64 ref_generation,
1766 u64 owner_objectid);
1767int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1768 struct btrfs_root *root, u64 bytenr,
1769 u64 orig_parent, u64 parent,
1770 u64 root_objectid, u64 ref_generation,
1771 u64 owner_objectid);
1772int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1773 struct btrfs_root *root);
1774int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
1775int btrfs_free_block_groups(struct btrfs_fs_info *info);
1776int btrfs_read_block_groups(struct btrfs_root *root);
1777int btrfs_make_block_group(struct btrfs_trans_handle *trans,
1778 struct btrfs_root *root, u64 bytes_used,
1779 u64 type, u64 chunk_objectid, u64 chunk_offset,
1780 u64 size);
1781int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
1782 struct btrfs_root *root, u64 group_start);
1783int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
1784int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
1785 struct btrfs_root *root);
1786int btrfs_drop_dead_reloc_roots(struct btrfs_root *root);
1787int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
1788 struct btrfs_root *root,
1789 struct extent_buffer *buf, u64 orig_start);
1790int btrfs_add_dead_reloc_root(struct btrfs_root *root);
1791int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
1792int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
1793u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
1794/* ctree.c */
1795int btrfs_previous_item(struct btrfs_root *root,
1796 struct btrfs_path *path, u64 min_objectid,
1797 int type);
1798int btrfs_merge_path(struct btrfs_trans_handle *trans,
1799 struct btrfs_root *root,
1800 struct btrfs_key *node_keys,
1801 u64 *nodes, int lowest_level);
1802int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
1803 struct btrfs_root *root, struct btrfs_path *path,
1804 struct btrfs_key *new_key);
1805struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
1806struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
1807int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
1808 struct btrfs_key *key, int lowest_level,
1809 int cache_only, u64 min_trans);
1810int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
1811 struct btrfs_key *max_key,
1812 struct btrfs_path *path, int cache_only,
1813 u64 min_trans);
1814int btrfs_cow_block(struct btrfs_trans_handle *trans,
1815 struct btrfs_root *root, struct extent_buffer *buf,
1816 struct extent_buffer *parent, int parent_slot,
1817 struct extent_buffer **cow_ret, u64 prealloc_dest);
1818int btrfs_copy_root(struct btrfs_trans_handle *trans,
1819 struct btrfs_root *root,
1820 struct extent_buffer *buf,
1821 struct extent_buffer **cow_ret, u64 new_root_objectid);
1822int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
1823 *root, struct btrfs_path *path, u32 data_size);
1824int btrfs_truncate_item(struct btrfs_trans_handle *trans,
1825 struct btrfs_root *root,
1826 struct btrfs_path *path,
1827 u32 new_size, int from_end);
1828int btrfs_split_item(struct btrfs_trans_handle *trans,
1829 struct btrfs_root *root,
1830 struct btrfs_path *path,
1831 struct btrfs_key *new_key,
1832 unsigned long split_offset);
1833int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
1834 *root, struct btrfs_key *key, struct btrfs_path *p, int
1835 ins_len, int cow);
1836int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1837 struct btrfs_root *root, struct extent_buffer *parent,
1838 int start_slot, int cache_only, u64 *last_ret,
1839 struct btrfs_key *progress);
1840void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
1841struct btrfs_path *btrfs_alloc_path(void);
1842void btrfs_free_path(struct btrfs_path *p);
1843void btrfs_init_path(struct btrfs_path *p);
1844int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1845 struct btrfs_path *path, int slot, int nr);
1846int btrfs_del_leaf(struct btrfs_trans_handle *trans,
1847 struct btrfs_root *root,
1848 struct btrfs_path *path, u64 bytenr);
1849static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
1850 struct btrfs_root *root,
1851 struct btrfs_path *path)
1852{
1853 return btrfs_del_items(trans, root, path, path->slots[0], 1);
1854}
1855
1856int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
1857 *root, struct btrfs_key *key, void *data, u32 data_size);
1858int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
1859 struct btrfs_root *root,
1860 struct btrfs_path *path,
1861 struct btrfs_key *cpu_key, u32 *data_size,
1862 int nr);
1863int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
1864 struct btrfs_root *root,
1865 struct btrfs_path *path,
1866 struct btrfs_key *cpu_key, u32 *data_size, int nr);
1867
1868static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
1869 struct btrfs_root *root,
1870 struct btrfs_path *path,
1871 struct btrfs_key *key,
1872 u32 data_size)
1873{
1874 return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1);
1875}
1876
1877int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
1878int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
1879int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
1880int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
1881 *root);
1882int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
1883 struct btrfs_root *root,
1884 struct extent_buffer *node,
1885 struct extent_buffer *parent);
1886/* root-item.c */
1887int btrfs_find_root_ref(struct btrfs_root *tree_root,
1888 struct btrfs_path *path,
1889 u64 root_id, u64 ref_id);
1890int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
1891 struct btrfs_root *tree_root,
1892 u64 root_id, u8 type, u64 ref_id,
1893 u64 dirid, u64 sequence,
1894 const char *name, int name_len);
1895int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1896 struct btrfs_key *key);
1897int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
1898 *root, struct btrfs_key *key, struct btrfs_root_item
1899 *item);
1900int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
1901 *root, struct btrfs_key *key, struct btrfs_root_item
1902 *item);
1903int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
1904 btrfs_root_item *item, struct btrfs_key *key);
1905int btrfs_search_root(struct btrfs_root *root, u64 search_start,
1906 u64 *found_objectid);
1907int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
1908 struct btrfs_root *latest_root);
1909/* dir-item.c */
1910int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
1911 struct btrfs_root *root, const char *name,
1912 int name_len, u64 dir,
1913 struct btrfs_key *location, u8 type, u64 index);
1914struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
1915 struct btrfs_root *root,
1916 struct btrfs_path *path, u64 dir,
1917 const char *name, int name_len,
1918 int mod);
1919struct btrfs_dir_item *
1920btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
1921 struct btrfs_root *root,
1922 struct btrfs_path *path, u64 dir,
1923 u64 objectid, const char *name, int name_len,
1924 int mod);
1925struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
1926 struct btrfs_path *path,
1927 const char *name, int name_len);
1928int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
1929 struct btrfs_root *root,
1930 struct btrfs_path *path,
1931 struct btrfs_dir_item *di);
1932int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
1933 struct btrfs_root *root, const char *name,
1934 u16 name_len, const void *data, u16 data_len,
1935 u64 dir);
1936struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
1937 struct btrfs_root *root,
1938 struct btrfs_path *path, u64 dir,
1939 const char *name, u16 name_len,
1940 int mod);
1941
1942/* orphan.c */
1943int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
1944 struct btrfs_root *root, u64 offset);
1945int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
1946 struct btrfs_root *root, u64 offset);
1947
1948/* inode-map.c */
1949int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
1950 struct btrfs_root *fs_root,
1951 u64 dirid, u64 *objectid);
1952int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid);
1953
1954/* inode-item.c */
1955int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
1956 struct btrfs_root *root,
1957 const char *name, int name_len,
1958 u64 inode_objectid, u64 ref_objectid, u64 index);
1959int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
1960 struct btrfs_root *root,
1961 const char *name, int name_len,
1962 u64 inode_objectid, u64 ref_objectid, u64 *index);
1963int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
1964 struct btrfs_root *root,
1965 struct btrfs_path *path, u64 objectid);
1966int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
1967 *root, struct btrfs_path *path,
1968 struct btrfs_key *location, int mod);
1969
1970/* file-item.c */
1971int btrfs_del_csums(struct btrfs_trans_handle *trans,
1972 struct btrfs_root *root, u64 bytenr, u64 len);
1973int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
1974 struct bio *bio, u32 *dst);
1975int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
1976 struct btrfs_root *root,
1977 u64 objectid, u64 pos,
1978 u64 disk_offset, u64 disk_num_bytes,
1979 u64 num_bytes, u64 offset, u64 ram_bytes,
1980 u8 compression, u8 encryption, u16 other_encoding);
1981int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
1982 struct btrfs_root *root,
1983 struct btrfs_path *path, u64 objectid,
1984 u64 bytenr, int mod);
1985int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
1986 struct btrfs_root *root,
1987 struct btrfs_ordered_sum *sums);
1988int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
1989 struct bio *bio, u64 file_start, int contig);
1990int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
1991 u64 start, unsigned long len);
1992struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
1993 struct btrfs_root *root,
1994 struct btrfs_path *path,
1995 u64 bytenr, int cow);
1996int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
1997 struct btrfs_root *root, struct btrfs_path *path,
1998 u64 isize);
1999int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start,
2000 u64 end, struct list_head *list);
2001/* inode.c */
2002
2003/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
2004#if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
2005#define ClearPageChecked ClearPageFsMisc
2006#define SetPageChecked SetPageFsMisc
2007#define PageChecked PageFsMisc
2008#endif
2009
2010struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
2011int btrfs_set_inode_index(struct inode *dir, u64 *index);
2012int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2013 struct btrfs_root *root,
2014 struct inode *dir, struct inode *inode,
2015 const char *name, int name_len);
2016int btrfs_add_link(struct btrfs_trans_handle *trans,
2017 struct inode *parent_inode, struct inode *inode,
2018 const char *name, int name_len, int add_backref, u64 index);
2019int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2020 struct btrfs_root *root,
2021 struct inode *inode, u64 new_size,
2022 u32 min_type);
2023
2024int btrfs_start_delalloc_inodes(struct btrfs_root *root);
2025int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
2026int btrfs_writepages(struct address_space *mapping,
2027 struct writeback_control *wbc);
2028int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
2029 struct btrfs_root *new_root, struct dentry *dentry,
2030 u64 new_dirid, u64 alloc_hint);
2031int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
2032 size_t size, struct bio *bio, unsigned long bio_flags);
2033
2034unsigned long btrfs_force_ra(struct address_space *mapping,
2035 struct file_ra_state *ra, struct file *file,
2036 pgoff_t offset, pgoff_t last_index);
2037int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
2038 int for_del);
2039int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
2040int btrfs_readpage(struct file *file, struct page *page);
2041void btrfs_delete_inode(struct inode *inode);
2042void btrfs_put_inode(struct inode *inode);
2043void btrfs_read_locked_inode(struct inode *inode);
2044int btrfs_write_inode(struct inode *inode, int wait);
2045void btrfs_dirty_inode(struct inode *inode);
2046struct inode *btrfs_alloc_inode(struct super_block *sb);
2047void btrfs_destroy_inode(struct inode *inode);
2048int btrfs_init_cachep(void);
2049void btrfs_destroy_cachep(void);
2050long btrfs_ioctl_trans_end(struct file *file);
2051struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
2052 struct btrfs_root *root, int wait);
2053struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
2054 struct btrfs_root *root);
2055struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
2056 struct btrfs_root *root, int *is_new);
2057int btrfs_commit_write(struct file *file, struct page *page,
2058 unsigned from, unsigned to);
2059struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
2060 size_t page_offset, u64 start, u64 end,
2061 int create);
2062int btrfs_update_inode(struct btrfs_trans_handle *trans,
2063 struct btrfs_root *root,
2064 struct inode *inode);
2065int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
2066int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2067void btrfs_orphan_cleanup(struct btrfs_root *root);
2068int btrfs_cont_expand(struct inode *inode, loff_t size);
2069
2070/* ioctl.c */
2071long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
2072
2073/* file.c */
2074int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
2075int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
2076 int skip_pinned);
2077int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
2078extern struct file_operations btrfs_file_operations;
2079int btrfs_drop_extents(struct btrfs_trans_handle *trans,
2080 struct btrfs_root *root, struct inode *inode,
2081 u64 start, u64 end, u64 inline_limit, u64 *hint_block);
2082int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
2083 struct btrfs_root *root,
2084 struct inode *inode, u64 start, u64 end);
2085int btrfs_release_file(struct inode *inode, struct file *file);
2086
2087/* tree-defrag.c */
2088int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
2089 struct btrfs_root *root, int cache_only);
2090
2091/* sysfs.c */
2092int btrfs_init_sysfs(void);
2093void btrfs_exit_sysfs(void);
2094int btrfs_sysfs_add_super(struct btrfs_fs_info *fs);
2095int btrfs_sysfs_add_root(struct btrfs_root *root);
2096void btrfs_sysfs_del_root(struct btrfs_root *root);
2097void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
2098
2099/* xattr.c */
2100ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
2101
2102/* super.c */
2103u64 btrfs_parse_size(char *str);
2104int btrfs_parse_options(struct btrfs_root *root, char *options);
2105int btrfs_sync_fs(struct super_block *sb, int wait);
2106
2107/* acl.c */
2108int btrfs_check_acl(struct inode *inode, int mask);
2109int btrfs_init_acl(struct inode *inode, struct inode *dir);
2110int btrfs_acl_chmod(struct inode *inode);
2111
2112/* free-space-cache.c */
2113int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
2114 u64 bytenr, u64 size);
2115int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
2116 u64 offset, u64 bytes);
2117int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
2118 u64 bytenr, u64 size);
2119int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
2120 u64 offset, u64 bytes);
2121void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
2122 *block_group);
2123struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
2124 *block_group, u64 offset,
2125 u64 bytes);
2126void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
2127 u64 bytes);
2128u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
2129#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
new file mode 100644
index 000000000000..926a0b287a7d
--- /dev/null
+++ b/fs/btrfs/dir-item.c
@@ -0,0 +1,386 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "hash.h"
22#include "transaction.h"
23
24/*
25 * insert a name into a directory, handling overflow properly if there is a
26 * hash collision. data_size indicates how big the inserted item should be.
27 * On success a struct btrfs_dir_item pointer is returned, otherwise it is
28 * an ERR_PTR.
29 *
30 * The name is not copied into the dir item; the caller has to do that.
31 */
32static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
33 *trans,
34 struct btrfs_root *root,
35 struct btrfs_path *path,
36 struct btrfs_key *cpu_key,
37 u32 data_size,
38 const char *name,
39 int name_len)
40{
41 int ret;
42 char *ptr;
43 struct btrfs_item *item;
44 struct extent_buffer *leaf;
45
46 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
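	/*
	 * -EEXIST means another entry hashed to this same key.  If the
	 * existing item already holds this exact name we bail out;
	 * otherwise we grow the item and append the new entry at its end.
	 */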
47 if (ret == -EEXIST) {
48 struct btrfs_dir_item *di;
49 di = btrfs_match_dir_item_name(root, path, name, name_len);
50 if (di)
51 return ERR_PTR(-EEXIST);
52 ret = btrfs_extend_item(trans, root, path, data_size);
53 WARN_ON(ret > 0);
54 }
55 if (ret < 0)
56 return ERR_PTR(ret);
57 WARN_ON(ret > 0);
58 leaf = path->nodes[0];
59 item = btrfs_item_nr(leaf, path->slots[0]);
60 ptr = btrfs_item_ptr(leaf, path->slots[0], char);
61 BUG_ON(data_size > btrfs_item_size(leaf, item));
62 ptr += btrfs_item_size(leaf, item) - data_size;
63 return (struct btrfs_dir_item *)ptr;
64}
65
66/*
67 * xattrs work a lot like directories; this inserts an xattr item
68 * into the tree
69 */
70int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
71 struct btrfs_root *root, const char *name,
72 u16 name_len, const void *data, u16 data_len,
73 u64 dir)
74{
75 int ret = 0;
76 struct btrfs_path *path;
77 struct btrfs_dir_item *dir_item;
78 unsigned long name_ptr, data_ptr;
79 struct btrfs_key key, location;
80 struct btrfs_disk_key disk_key;
81 struct extent_buffer *leaf;
82 u32 data_size;
83
84 key.objectid = dir;
85 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
86 key.offset = btrfs_name_hash(name, name_len);
87	if (name_len + data_len + sizeof(struct btrfs_dir_item) >
88	    BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item))
89		return -ENOSPC;
90	path = btrfs_alloc_path();
91	if (!path)
92		return -ENOMEM;
93
94 data_size = sizeof(*dir_item) + name_len + data_len;
95 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
96 name, name_len);
97 /*
98	 * FIXME: at some point we should handle xattrs that are larger than
99	 * what we can fit in our leaf. We zero out 'location' because we
100	 * aren't pointing at anything else; that will change if we store the
101	 * xattr data in a separate inode.
102 */
103 BUG_ON(IS_ERR(dir_item));
104 memset(&location, 0, sizeof(location));
105
106 leaf = path->nodes[0];
107 btrfs_cpu_key_to_disk(&disk_key, &location);
108 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
109 btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR);
110 btrfs_set_dir_name_len(leaf, dir_item, name_len);
111 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
112 btrfs_set_dir_data_len(leaf, dir_item, data_len);
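	/* the name sits right after the dir_item header, the value after the name */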
113 name_ptr = (unsigned long)(dir_item + 1);
114 data_ptr = (unsigned long)((char *)name_ptr + name_len);
115
116 write_extent_buffer(leaf, name, name_ptr, name_len);
117 write_extent_buffer(leaf, data, data_ptr, data_len);
118 btrfs_mark_buffer_dirty(path->nodes[0]);
119
120 btrfs_free_path(path);
121 return ret;
122}
123
124/*
125 * insert a directory item in the tree, doing all the magic for
126 * both indexes. 'dir' indicates which objectid to insert it into,
127 * 'location' is the key to stuff into the directory item, 'type' is the
128 * type of the inode we're pointing to, and 'index' is the sequence number
129 * to use for the second index (if one is created).
130 */
131int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
132 *root, const char *name, int name_len, u64 dir,
133 struct btrfs_key *location, u8 type, u64 index)
134{
135 int ret = 0;
136 int ret2 = 0;
137 struct btrfs_path *path;
138 struct btrfs_dir_item *dir_item;
139 struct extent_buffer *leaf;
140 unsigned long name_ptr;
141 struct btrfs_key key;
142 struct btrfs_disk_key disk_key;
143 u32 data_size;
144
145 key.objectid = dir;
146 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
147 key.offset = btrfs_name_hash(name, name_len);
148	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
149 data_size = sizeof(*dir_item) + name_len;
150 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
151 name, name_len);
152 if (IS_ERR(dir_item)) {
153 ret = PTR_ERR(dir_item);
154 if (ret == -EEXIST)
155 goto second_insert;
156 goto out;
157 }
158
159 leaf = path->nodes[0];
160 btrfs_cpu_key_to_disk(&disk_key, location);
161 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
162 btrfs_set_dir_type(leaf, dir_item, type);
163 btrfs_set_dir_data_len(leaf, dir_item, 0);
164 btrfs_set_dir_name_len(leaf, dir_item, name_len);
165 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
166 name_ptr = (unsigned long)(dir_item + 1);
167
168 write_extent_buffer(leaf, name, name_ptr, name_len);
169 btrfs_mark_buffer_dirty(leaf);
170
171second_insert:
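	/*
	 * insert a second copy of the entry, keyed by the directory index
	 * number, so readdir can return entries in insertion order instead
	 * of hash order
	 */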
172 /* FIXME, use some real flag for selecting the extra index */
173 if (root == root->fs_info->tree_root) {
174 ret = 0;
175 goto out;
176 }
177 btrfs_release_path(root, path);
178
179 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
180 key.offset = index;
181 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
182 name, name_len);
183 if (IS_ERR(dir_item)) {
184 ret2 = PTR_ERR(dir_item);
185 goto out;
186 }
187 leaf = path->nodes[0];
188 btrfs_cpu_key_to_disk(&disk_key, location);
189 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
190 btrfs_set_dir_type(leaf, dir_item, type);
191 btrfs_set_dir_data_len(leaf, dir_item, 0);
192 btrfs_set_dir_name_len(leaf, dir_item, name_len);
193 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
194 name_ptr = (unsigned long)(dir_item + 1);
195 write_extent_buffer(leaf, name, name_ptr, name_len);
196 btrfs_mark_buffer_dirty(leaf);
197out:
198 btrfs_free_path(path);
199 if (ret)
200 return ret;
201 if (ret2)
202 return ret2;
203 return 0;
204}
205
206/*
207 * lookup a directory item based on name. 'dir' is the objectid
208 * we're searching in, and 'mod' tells us if the caller plans on deleting
209 * the item (use mod < 0) or changing its options (use mod > 0)
210 */
211struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
212 struct btrfs_root *root,
213 struct btrfs_path *path, u64 dir,
214 const char *name, int name_len,
215 int mod)
216{
217 int ret;
218 struct btrfs_key key;
219 int ins_len = mod < 0 ? -1 : 0;
220 int cow = mod != 0;
221 struct btrfs_key found_key;
222 struct extent_buffer *leaf;
223
224 key.objectid = dir;
225 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
226
227 key.offset = btrfs_name_hash(name, name_len);
228
229 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
230 if (ret < 0)
231 return ERR_PTR(ret);
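	/*
	 * ret > 0 means the exact key wasn't found and the path points just
	 * past where it would be; step back and check the previous item
	 * before scanning it for our name.
	 */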
232 if (ret > 0) {
233 if (path->slots[0] == 0)
234 return NULL;
235 path->slots[0]--;
236 }
237
238 leaf = path->nodes[0];
239 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
240
241 if (found_key.objectid != dir ||
242 btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY ||
243 found_key.offset != key.offset)
244 return NULL;
245
246 return btrfs_match_dir_item_name(root, path, name, name_len);
247}
248
249/*
250 * lookup a directory item based on index. 'dir' is the objectid
251 * we're searching in, and 'mod' tells us if the caller plans on deleting
252 * the item (use mod < 0) or changing its options (use mod > 0)
253 *
254 * The name is used to make sure the index really points to the name you were
255 * looking for.
256 */
257struct btrfs_dir_item *
258btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
259 struct btrfs_root *root,
260 struct btrfs_path *path, u64 dir,
261 u64 objectid, const char *name, int name_len,
262 int mod)
263{
264 int ret;
265 struct btrfs_key key;
266 int ins_len = mod < 0 ? -1 : 0;
267 int cow = mod != 0;
268
269 key.objectid = dir;
270 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
271 key.offset = objectid;
272
273 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
274 if (ret < 0)
275 return ERR_PTR(ret);
276 if (ret > 0)
277 return ERR_PTR(-ENOENT);
278 return btrfs_match_dir_item_name(root, path, name, name_len);
279}
280
281struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
282 struct btrfs_root *root,
283 struct btrfs_path *path, u64 dir,
284 const char *name, u16 name_len,
285 int mod)
286{
287 int ret;
288 struct btrfs_key key;
289 int ins_len = mod < 0 ? -1 : 0;
290 int cow = mod != 0;
291 struct btrfs_key found_key;
292 struct extent_buffer *leaf;
293
294 key.objectid = dir;
295 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
296 key.offset = btrfs_name_hash(name, name_len);
297 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
298 if (ret < 0)
299 return ERR_PTR(ret);
300 if (ret > 0) {
301 if (path->slots[0] == 0)
302 return NULL;
303 path->slots[0]--;
304 }
305
306 leaf = path->nodes[0];
307 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
308
309 if (found_key.objectid != dir ||
310 btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY ||
311 found_key.offset != key.offset)
312 return NULL;
313
314 return btrfs_match_dir_item_name(root, path, name, name_len);
315}
316
317/*
318 * helper function to look at the directory item pointed to by 'path'.
319 * this walks through all the entries in a dir item and finds the one
320 * for a specific name.
321 */
322struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
323 struct btrfs_path *path,
324 const char *name, int name_len)
325{
326 struct btrfs_dir_item *dir_item;
327 unsigned long name_ptr;
328 u32 total_len;
329 u32 cur = 0;
330 u32 this_len;
331 struct extent_buffer *leaf;
332
333 leaf = path->nodes[0];
334 dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
335 total_len = btrfs_item_size_nr(leaf, path->slots[0]);
336 while (cur < total_len) {
337 this_len = sizeof(*dir_item) +
338 btrfs_dir_name_len(leaf, dir_item) +
339 btrfs_dir_data_len(leaf, dir_item);
340 name_ptr = (unsigned long)(dir_item + 1);
341
342 if (btrfs_dir_name_len(leaf, dir_item) == name_len &&
343 memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
344 return dir_item;
345
346 cur += this_len;
347 dir_item = (struct btrfs_dir_item *)((char *)dir_item +
348 this_len);
349 }
350 return NULL;
351}
352
353/*
354 * given a pointer into a directory item, delete it. This
355 * handles items that have more than one entry in them.
356 */
357int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
358 struct btrfs_root *root,
359 struct btrfs_path *path,
360 struct btrfs_dir_item *di)
361{
362
363 struct extent_buffer *leaf;
364 u32 sub_item_len;
365 u32 item_len;
366 int ret = 0;
367
368 leaf = path->nodes[0];
369 sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) +
370 btrfs_dir_data_len(leaf, di);
371 item_len = btrfs_item_size_nr(leaf, path->slots[0]);
372 if (sub_item_len == item_len) {
373 ret = btrfs_del_item(trans, root, path);
374 } else {
375		/* shift the following entries down over the deleted one */
376 unsigned long ptr = (unsigned long)di;
377 unsigned long start;
378
379 start = btrfs_item_ptr_offset(leaf, path->slots[0]);
380 memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
381 item_len - (ptr + sub_item_len - start));
382 ret = btrfs_truncate_item(trans, root, path,
383 item_len - sub_item_len, 1);
384 }
385	return ret;
386}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
new file mode 100644
index 000000000000..81a313874ae5
--- /dev/null
+++ b/fs/btrfs/disk-io.c
@@ -0,0 +1,2343 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/version.h>
20#include <linux/fs.h>
21#include <linux/blkdev.h>
22#include <linux/scatterlist.h>
23#include <linux/swap.h>
24#include <linux/radix-tree.h>
25#include <linux/writeback.h>
26#include <linux/buffer_head.h>
27#include <linux/workqueue.h>
28#include <linux/kthread.h>
29#include <linux/freezer.h>
30#include "compat.h"
31#include "crc32c.h"
32#include "ctree.h"
33#include "disk-io.h"
34#include "transaction.h"
35#include "btrfs_inode.h"
36#include "volumes.h"
37#include "print-tree.h"
38#include "async-thread.h"
39#include "locking.h"
40#include "ref-cache.h"
41#include "tree-log.h"
42
43static struct extent_io_ops btree_extent_io_ops;
44static void end_workqueue_fn(struct btrfs_work *work);
45
46/*
47 * end_io_wq structs are used to do processing in task context when an IO is
48 * complete. This is used during reads to verify checksums, and it is used
49 * by writes to insert metadata for new file extents after IO is complete.
50 */
51struct end_io_wq {
52 struct bio *bio;
53 bio_end_io_t *end_io;
54 void *private;
55 struct btrfs_fs_info *info;
56 int error;
57 int metadata;
58 struct list_head list;
59 struct btrfs_work work;
60};
61
62/*
63 * async submit bios are used to offload expensive checksumming
64 * onto the worker threads. They checksum file and metadata bios
65 * just before they are sent down the IO stack.
66 */
67struct async_submit_bio {
68 struct inode *inode;
69 struct bio *bio;
70 struct list_head list;
71 extent_submit_bio_hook_t *submit_bio_start;
72 extent_submit_bio_hook_t *submit_bio_done;
73 int rw;
74 int mirror_num;
75 unsigned long bio_flags;
76 struct btrfs_work work;
77};
78
79/*
80 * extents on the btree inode are pretty simple; there's one extent
81 * that covers the entire device
82 */
83static struct extent_map *btree_get_extent(struct inode *inode,
84 struct page *page, size_t page_offset, u64 start, u64 len,
85 int create)
86{
87 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
88 struct extent_map *em;
89 int ret;
90
91 spin_lock(&em_tree->lock);
92 em = lookup_extent_mapping(em_tree, start, len);
93 if (em) {
94 em->bdev =
95 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
96 spin_unlock(&em_tree->lock);
97 goto out;
98 }
99 spin_unlock(&em_tree->lock);
100
101 em = alloc_extent_map(GFP_NOFS);
102 if (!em) {
103 em = ERR_PTR(-ENOMEM);
104 goto out;
105 }
106 em->start = 0;
107 em->len = (u64)-1;
108 em->block_len = (u64)-1;
109 em->block_start = 0;
110 em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
111
112 spin_lock(&em_tree->lock);
113 ret = add_extent_mapping(em_tree, em);
114 if (ret == -EEXIST) {
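		/*
		 * someone else raced in and added the mapping; use the
		 * existing one if we can still look it up
		 */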
115 u64 failed_start = em->start;
116 u64 failed_len = em->len;
117
118 free_extent_map(em);
119 em = lookup_extent_mapping(em_tree, start, len);
120 if (em) {
121 ret = 0;
122 } else {
123 em = lookup_extent_mapping(em_tree, failed_start,
124 failed_len);
125 ret = -EIO;
126 }
127 } else if (ret) {
128 free_extent_map(em);
129 em = NULL;
130 }
131 spin_unlock(&em_tree->lock);
132
133 if (ret)
134 em = ERR_PTR(ret);
135out:
136 return em;
137}
138
139u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
140{
141 return btrfs_crc32c(seed, data, len);
142}
143
144void btrfs_csum_final(u32 crc, char *result)
145{
146 *(__le32 *)result = ~cpu_to_le32(crc);
147}
148
149/*
150 * compute the csum for a btree block, and either verify it or write it
151 * into the csum field of the block.
152 */
153static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
154 int verify)
155{
156 u16 csum_size =
157 btrfs_super_csum_size(&root->fs_info->super_copy);
158 char *result = NULL;
159 unsigned long len;
160 unsigned long cur_len;
161 unsigned long offset = BTRFS_CSUM_SIZE;
162 char *map_token = NULL;
163 char *kaddr;
164 unsigned long map_start;
165 unsigned long map_len;
166 int err;
167 u32 crc = ~(u32)0;
168 unsigned long inline_result;
169
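	/*
	 * the csum is stored in the first BTRFS_CSUM_SIZE bytes of the
	 * block; the data to be summed starts just past it
	 */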
170 len = buf->len - offset;
171 while (len > 0) {
172 err = map_private_extent_buffer(buf, offset, 32,
173 &map_token, &kaddr,
174 &map_start, &map_len, KM_USER0);
175 if (err)
176 return 1;
177 cur_len = min(len, map_len - (offset - map_start));
178 crc = btrfs_csum_data(root, kaddr + offset - map_start,
179 crc, cur_len);
180 len -= cur_len;
181 offset += cur_len;
182 unmap_extent_buffer(buf, map_token, KM_USER0);
183 }
184 if (csum_size > sizeof(inline_result)) {
185		result = kzalloc(csum_size, GFP_NOFS);
186 if (!result)
187 return 1;
188 } else {
189 result = (char *)&inline_result;
190 }
191
192 btrfs_csum_final(crc, result);
193
194 if (verify) {
195 if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
196 u32 val;
197 u32 found = 0;
198 memcpy(&found, result, csum_size);
199
200 read_extent_buffer(buf, &val, 0, csum_size);
201 printk(KERN_INFO "btrfs: %s checksum verify failed "
202 "on %llu wanted %X found %X level %d\n",
203 root->fs_info->sb->s_id,
204 buf->start, val, found, btrfs_header_level(buf));
205 if (result != (char *)&inline_result)
206 kfree(result);
207 return 1;
208 }
209 } else {
210 write_extent_buffer(buf, result, 0, csum_size);
211 }
212 if (result != (char *)&inline_result)
213 kfree(result);
214 return 0;
215}
216
217/*
218 * we can't consider a given block up to date unless the transid of the
219 * block matches the transid in the parent node's pointer. This is how we
220 * detect blocks that either didn't get written at all or got written
221 * in the wrong place.
222 */
223static int verify_parent_transid(struct extent_io_tree *io_tree,
224 struct extent_buffer *eb, u64 parent_transid)
225{
226 int ret;
227
228 if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
229 return 0;
230
231 lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS);
232 if (extent_buffer_uptodate(io_tree, eb) &&
233 btrfs_header_generation(eb) == parent_transid) {
234 ret = 0;
235 goto out;
236 }
237	printk(KERN_INFO "parent transid verify failed on %llu wanted %llu found %llu\n",
238 (unsigned long long)eb->start,
239 (unsigned long long)parent_transid,
240 (unsigned long long)btrfs_header_generation(eb));
241 ret = 1;
242 clear_extent_buffer_uptodate(io_tree, eb);
243out:
244 unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
245 GFP_NOFS);
246 return ret;
247}
248
249/*
250 * helper to read a given tree block, doing retries as required when
251 * the checksums don't match and we have alternate mirrors to try.
252 */
253static int btree_read_extent_buffer_pages(struct btrfs_root *root,
254 struct extent_buffer *eb,
255 u64 start, u64 parent_transid)
256{
257 struct extent_io_tree *io_tree;
258 int ret;
259 int num_copies = 0;
260 int mirror_num = 0;
261
262 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
263 while (1) {
264 ret = read_extent_buffer_pages(io_tree, eb, start, 1,
265 btree_get_extent, mirror_num);
266 if (!ret &&
267 !verify_parent_transid(io_tree, eb, parent_transid))
268 return ret;
269
270 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
271 eb->start, eb->len);
272 if (num_copies == 1)
273 return ret;
274
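		/* this copy failed to verify, try the next mirror */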
275 mirror_num++;
276 if (mirror_num > num_copies)
277 return ret;
278 }
279 return -EIO;
280}
281
282/*
283 * checksum a dirty tree block before IO. This has extra checks to make sure
284 * we only fill in the checksum field in the first page of a multi-page block
285 */
286
287static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
288{
289 struct extent_io_tree *tree;
290 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
291 u64 found_start;
292 int found_level;
293 unsigned long len;
294 struct extent_buffer *eb;
295 int ret;
296
297 tree = &BTRFS_I(page->mapping->host)->io_tree;
298
299 if (page->private == EXTENT_PAGE_PRIVATE)
300 goto out;
301 if (!page->private)
302 goto out;
303 len = page->private >> 2;
304 WARN_ON(len == 0);
305
306 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
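	/*
	 * read in any pages of this block beyond the first so the whole
	 * block is in ram before the csum is computed
	 */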
307 ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
308 btrfs_header_generation(eb));
309 BUG_ON(ret);
310 found_start = btrfs_header_bytenr(eb);
311 if (found_start != start) {
312 WARN_ON(1);
313 goto err;
314 }
315 if (eb->first_page != page) {
316 WARN_ON(1);
317 goto err;
318 }
319 if (!PageUptodate(page)) {
320 WARN_ON(1);
321 goto err;
322 }
323 found_level = btrfs_header_level(eb);
324
325 csum_tree_block(root, eb, 0);
326err:
327 free_extent_buffer(eb);
328out:
329 return 0;
330}
331
332static int check_tree_block_fsid(struct btrfs_root *root,
333 struct extent_buffer *eb)
334{
335 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
336 u8 fsid[BTRFS_UUID_SIZE];
337 int ret = 1;
338
339 read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb),
340 BTRFS_FSID_SIZE);
341 while (fs_devices) {
342 if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
343 ret = 0;
344 break;
345 }
346 fs_devices = fs_devices->seed;
347 }
348 return ret;
349}
350
351static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
352 struct extent_state *state)
353{
354 struct extent_io_tree *tree;
355 u64 found_start;
356 int found_level;
357 unsigned long len;
358 struct extent_buffer *eb;
359 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
360 int ret = 0;
361
362 tree = &BTRFS_I(page->mapping->host)->io_tree;
363 if (page->private == EXTENT_PAGE_PRIVATE)
364 goto out;
365 if (!page->private)
366 goto out;
367
368 len = page->private >> 2;
369 WARN_ON(len == 0);
370
371 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
372
373 found_start = btrfs_header_bytenr(eb);
374 if (found_start != start) {
375 printk(KERN_INFO "btrfs bad tree block start %llu %llu\n",
376 (unsigned long long)found_start,
377 (unsigned long long)eb->start);
378 ret = -EIO;
379 goto err;
380 }
381 if (eb->first_page != page) {
382 printk(KERN_INFO "btrfs bad first page %lu %lu\n",
383 eb->first_page->index, page->index);
384 WARN_ON(1);
385 ret = -EIO;
386 goto err;
387 }
388 if (check_tree_block_fsid(root, eb)) {
389 printk(KERN_INFO "btrfs bad fsid on block %llu\n",
390 (unsigned long long)eb->start);
391 ret = -EIO;
392 goto err;
393 }
394 found_level = btrfs_header_level(eb);
395
396 ret = csum_tree_block(root, eb, 1);
397 if (ret)
398 ret = -EIO;
399
400 end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
401 end = eb->start + end - 1;
402err:
403 free_extent_buffer(eb);
404out:
405 return ret;
406}
407
408static void end_workqueue_bio(struct bio *bio, int err)
409{
410 struct end_io_wq *end_io_wq = bio->bi_private;
411 struct btrfs_fs_info *fs_info;
412
413 fs_info = end_io_wq->info;
414 end_io_wq->error = err;
415 end_io_wq->work.func = end_workqueue_fn;
416 end_io_wq->work.flags = 0;
417
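	/*
	 * hand the completion off to the right helper queue: writes and
	 * reads use separate pools, as do metadata and data bios
	 */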
418 if (bio->bi_rw & (1 << BIO_RW)) {
419 if (end_io_wq->metadata)
420 btrfs_queue_worker(&fs_info->endio_meta_write_workers,
421 &end_io_wq->work);
422 else
423 btrfs_queue_worker(&fs_info->endio_write_workers,
424 &end_io_wq->work);
425 } else {
426 if (end_io_wq->metadata)
427 btrfs_queue_worker(&fs_info->endio_meta_workers,
428 &end_io_wq->work);
429 else
430 btrfs_queue_worker(&fs_info->endio_workers,
431 &end_io_wq->work);
432 }
433}
434
435int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
436 int metadata)
437{
438 struct end_io_wq *end_io_wq;
439 end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
440 if (!end_io_wq)
441 return -ENOMEM;
442
443 end_io_wq->private = bio->bi_private;
444 end_io_wq->end_io = bio->bi_end_io;
445 end_io_wq->info = info;
446 end_io_wq->error = 0;
447 end_io_wq->bio = bio;
448 end_io_wq->metadata = metadata;
449
450 bio->bi_private = end_io_wq;
451 bio->bi_end_io = end_workqueue_bio;
452 return 0;
453}
454
455unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
456{
457 unsigned long limit = min_t(unsigned long,
458 info->workers.max_workers,
459 info->fs_devices->open_devices);
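	/* allow about 256 async bios in flight for each worker we can keep busy */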
460 return 256 * limit;
461}
462
463int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
464{
465 return atomic_read(&info->nr_async_bios) >
466 btrfs_async_submit_limit(info);
467}
468
469static void run_one_async_start(struct btrfs_work *work)
470{
471 struct btrfs_fs_info *fs_info;
472 struct async_submit_bio *async;
473
474 async = container_of(work, struct async_submit_bio, work);
475 fs_info = BTRFS_I(async->inode)->root->fs_info;
476 async->submit_bio_start(async->inode, async->rw, async->bio,
477 async->mirror_num, async->bio_flags);
478}
479
480static void run_one_async_done(struct btrfs_work *work)
481{
482 struct btrfs_fs_info *fs_info;
483 struct async_submit_bio *async;
484 int limit;
485
486 async = container_of(work, struct async_submit_bio, work);
487 fs_info = BTRFS_I(async->inode)->root->fs_info;
488
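	/*
	 * wake up anyone waiting for the async queue to drain once we've
	 * dropped below about 2/3 of the limit
	 */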
489 limit = btrfs_async_submit_limit(fs_info);
490 limit = limit * 2 / 3;
491
492 atomic_dec(&fs_info->nr_async_submits);
493
494 if (atomic_read(&fs_info->nr_async_submits) < limit &&
495 waitqueue_active(&fs_info->async_submit_wait))
496 wake_up(&fs_info->async_submit_wait);
497
498 async->submit_bio_done(async->inode, async->rw, async->bio,
499 async->mirror_num, async->bio_flags);
500}
501
502static void run_one_async_free(struct btrfs_work *work)
503{
504 struct async_submit_bio *async;
505
506 async = container_of(work, struct async_submit_bio, work);
507 kfree(async);
508}
509
510int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
511 int rw, struct bio *bio, int mirror_num,
512 unsigned long bio_flags,
513 extent_submit_bio_hook_t *submit_bio_start,
514 extent_submit_bio_hook_t *submit_bio_done)
515{
516 struct async_submit_bio *async;
517
518 async = kmalloc(sizeof(*async), GFP_NOFS);
519 if (!async)
520 return -ENOMEM;
521
522 async->inode = inode;
523 async->rw = rw;
524 async->bio = bio;
525 async->mirror_num = mirror_num;
526 async->submit_bio_start = submit_bio_start;
527 async->submit_bio_done = submit_bio_done;
528
529 async->work.func = run_one_async_start;
530 async->work.ordered_func = run_one_async_done;
531 async->work.ordered_free = run_one_async_free;
532
533 async->work.flags = 0;
534 async->bio_flags = bio_flags;
535
536 atomic_inc(&fs_info->nr_async_submits);
537 btrfs_queue_worker(&fs_info->workers, &async->work);
538#if 0
539 int limit = btrfs_async_submit_limit(fs_info);
540 if (atomic_read(&fs_info->nr_async_submits) > limit) {
541 wait_event_timeout(fs_info->async_submit_wait,
542 (atomic_read(&fs_info->nr_async_submits) < limit),
543 HZ/10);
544
545 wait_event_timeout(fs_info->async_submit_wait,
546 (atomic_read(&fs_info->nr_async_bios) < limit),
547 HZ/10);
548 }
549#endif
550 while (atomic_read(&fs_info->async_submit_draining) &&
551 atomic_read(&fs_info->nr_async_submits)) {
552 wait_event(fs_info->async_submit_wait,
553 (atomic_read(&fs_info->nr_async_submits) == 0));
554 }
555
556 return 0;
557}
558
559static int btree_csum_one_bio(struct bio *bio)
560{
561 struct bio_vec *bvec = bio->bi_io_vec;
562 int bio_index = 0;
563 struct btrfs_root *root;
564
565 WARN_ON(bio->bi_vcnt <= 0);
566 while (bio_index < bio->bi_vcnt) {
567 root = BTRFS_I(bvec->bv_page->mapping->host)->root;
568 csum_dirty_buffer(root, bvec->bv_page);
569 bio_index++;
570 bvec++;
571 }
572 return 0;
573}
574
575static int __btree_submit_bio_start(struct inode *inode, int rw,
576 struct bio *bio, int mirror_num,
577 unsigned long bio_flags)
578{
579 /*
580	 * when we're called for a write, we're already in the async
581	 * submission context.  csum the bio now; it gets mapped and
	 * submitted later by __btree_submit_bio_done
582 */
583 btree_csum_one_bio(bio);
584 return 0;
585}
586
587static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
588 int mirror_num, unsigned long bio_flags)
589{
590 /*
591 * when we're called for a write, we're already in the async
592 * submission context. Just jump into btrfs_map_bio
593 */
594 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
595}
596
597static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
598 int mirror_num, unsigned long bio_flags)
599{
600 int ret;
601
602 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
603 bio, 1);
604 BUG_ON(ret);
605
606 if (!(rw & (1 << BIO_RW))) {
607 /*
608 * called for a read, do the setup so that checksum validation
609 * can happen in the async kernel threads
610 */
611 return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
612 mirror_num, 0);
613 }
614 /*
615 * kthread helpers are used to submit writes so that checksumming
616 * can happen in parallel across all CPUs
617 */
618 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
619 inode, rw, bio, mirror_num, 0,
620 __btree_submit_bio_start,
621 __btree_submit_bio_done);
622}
623
624static int btree_writepage(struct page *page, struct writeback_control *wbc)
625{
626 struct extent_io_tree *tree;
627 tree = &BTRFS_I(page->mapping->host)->io_tree;
628
629 if (current->flags & PF_MEMALLOC) {
630 redirty_page_for_writepage(wbc, page);
631 unlock_page(page);
632 return 0;
633 }
634 return extent_write_full_page(tree, page, btree_get_extent, wbc);
635}
636
637static int btree_writepages(struct address_space *mapping,
638 struct writeback_control *wbc)
639{
640 struct extent_io_tree *tree;
641 tree = &BTRFS_I(mapping->host)->io_tree;
642 if (wbc->sync_mode == WB_SYNC_NONE) {
643 u64 num_dirty;
644 u64 start = 0;
645 unsigned long thresh = 32 * 1024 * 1024;
646
647 if (wbc->for_kupdate)
648 return 0;
649
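		/* don't bother writing back until enough btree pages are dirty */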
650 num_dirty = count_range_bits(tree, &start, (u64)-1,
651 thresh, EXTENT_DIRTY);
652 if (num_dirty < thresh)
653 return 0;
654 }
655 return extent_writepages(tree, mapping, btree_get_extent, wbc);
656}
657
658static int btree_readpage(struct file *file, struct page *page)
659{
660 struct extent_io_tree *tree;
661 tree = &BTRFS_I(page->mapping->host)->io_tree;
662 return extent_read_full_page(tree, page, btree_get_extent);
663}
664
665static int btree_releasepage(struct page *page, gfp_t gfp_flags)
666{
667 struct extent_io_tree *tree;
668 struct extent_map_tree *map;
669 int ret;
670
671 if (PageWriteback(page) || PageDirty(page))
672 return 0;
673
674 tree = &BTRFS_I(page->mapping->host)->io_tree;
675 map = &BTRFS_I(page->mapping->host)->extent_tree;
676
677 ret = try_release_extent_state(map, tree, page, gfp_flags);
678 if (!ret)
679 return 0;
680
681 ret = try_release_extent_buffer(tree, page);
682 if (ret == 1) {
683 ClearPagePrivate(page);
684 set_page_private(page, 0);
685 page_cache_release(page);
686 }
687
688 return ret;
689}
690
691static void btree_invalidatepage(struct page *page, unsigned long offset)
692{
693 struct extent_io_tree *tree;
694 tree = &BTRFS_I(page->mapping->host)->io_tree;
695 extent_invalidatepage(tree, page, offset);
696 btree_releasepage(page, GFP_NOFS);
697 if (PagePrivate(page)) {
698 printk(KERN_WARNING "btrfs warning page private not zero "
699 "on page %llu\n", (unsigned long long)page_offset(page));
700 ClearPagePrivate(page);
701 set_page_private(page, 0);
702 page_cache_release(page);
703 }
704}
705
706#if 0
707static int btree_writepage(struct page *page, struct writeback_control *wbc)
708{
709 struct buffer_head *bh;
710 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
711 struct buffer_head *head;
712 if (!page_has_buffers(page)) {
713 create_empty_buffers(page, root->fs_info->sb->s_blocksize,
714 (1 << BH_Dirty)|(1 << BH_Uptodate));
715 }
716 head = page_buffers(page);
717 bh = head;
718 do {
719 if (buffer_dirty(bh))
720 csum_tree_block(root, bh, 0);
721 bh = bh->b_this_page;
722 } while (bh != head);
723 return block_write_full_page(page, btree_get_block, wbc);
724}
725#endif
726
727static struct address_space_operations btree_aops = {
728 .readpage = btree_readpage,
729 .writepage = btree_writepage,
730 .writepages = btree_writepages,
731 .releasepage = btree_releasepage,
732 .invalidatepage = btree_invalidatepage,
733 .sync_page = block_sync_page,
734};
735
736int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
737 u64 parent_transid)
738{
739 struct extent_buffer *buf = NULL;
740 struct inode *btree_inode = root->fs_info->btree_inode;
741 int ret = 0;
742
743 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
744 if (!buf)
745 return 0;
746 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
747 buf, 0, 0, btree_get_extent, 0);
748 free_extent_buffer(buf);
749 return ret;
750}
751
752struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
753 u64 bytenr, u32 blocksize)
754{
755 struct inode *btree_inode = root->fs_info->btree_inode;
756 struct extent_buffer *eb;
757 eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
758 bytenr, blocksize, GFP_NOFS);
759 return eb;
760}
761
762struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
763 u64 bytenr, u32 blocksize)
764{
765 struct inode *btree_inode = root->fs_info->btree_inode;
766 struct extent_buffer *eb;
767
768 eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
769 bytenr, blocksize, NULL, GFP_NOFS);
770 return eb;
771}
772
773
774int btrfs_write_tree_block(struct extent_buffer *buf)
775{
776 return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start,
777 buf->start + buf->len - 1, WB_SYNC_ALL);
778}
779
780int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
781{
782 return btrfs_wait_on_page_writeback_range(buf->first_page->mapping,
783 buf->start, buf->start + buf->len - 1);
784}
785
786struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
787 u32 blocksize, u64 parent_transid)
788{
789 struct extent_buffer *buf = NULL;
790 struct inode *btree_inode = root->fs_info->btree_inode;
791 struct extent_io_tree *io_tree;
792 int ret;
793
794 io_tree = &BTRFS_I(btree_inode)->io_tree;
795
796 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
797 if (!buf)
798 return NULL;
799
800 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
801
802 if (ret == 0)
803 buf->flags |= EXTENT_UPTODATE;
804 else
805 WARN_ON(1);
806 return buf;
807
808}
809
810int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
811 struct extent_buffer *buf)
812{
813 struct inode *btree_inode = root->fs_info->btree_inode;
814 if (btrfs_header_generation(buf) ==
815 root->fs_info->running_transaction->transid) {
816 WARN_ON(!btrfs_tree_locked(buf));
817 clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
818 buf);
819 }
820 return 0;
821}
822
823static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
824 u32 stripesize, struct btrfs_root *root,
825 struct btrfs_fs_info *fs_info,
826 u64 objectid)
827{
828 root->node = NULL;
829 root->commit_root = NULL;
830 root->ref_tree = NULL;
831 root->sectorsize = sectorsize;
832 root->nodesize = nodesize;
833 root->leafsize = leafsize;
834 root->stripesize = stripesize;
835 root->ref_cows = 0;
836 root->track_dirty = 0;
837
838 root->fs_info = fs_info;
839 root->objectid = objectid;
840 root->last_trans = 0;
841 root->highest_inode = 0;
842 root->last_inode_alloc = 0;
843 root->name = NULL;
844 root->in_sysfs = 0;
845
846 INIT_LIST_HEAD(&root->dirty_list);
847 INIT_LIST_HEAD(&root->orphan_list);
848 INIT_LIST_HEAD(&root->dead_list);
849 spin_lock_init(&root->node_lock);
850 spin_lock_init(&root->list_lock);
851 mutex_init(&root->objectid_mutex);
852 mutex_init(&root->log_mutex);
853 extent_io_tree_init(&root->dirty_log_pages,
854 fs_info->btree_inode->i_mapping, GFP_NOFS);
855
856 btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
857 root->ref_tree = &root->ref_tree_struct;
858
859 memset(&root->root_key, 0, sizeof(root->root_key));
860 memset(&root->root_item, 0, sizeof(root->root_item));
861 memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
862 memset(&root->root_kobj, 0, sizeof(root->root_kobj));
863 root->defrag_trans_start = fs_info->generation;
864 init_completion(&root->kobj_unregister);
865 root->defrag_running = 0;
866 root->defrag_level = 0;
867 root->root_key.objectid = objectid;
868 root->anon_super.s_root = NULL;
869 root->anon_super.s_dev = 0;
870 INIT_LIST_HEAD(&root->anon_super.s_list);
871 INIT_LIST_HEAD(&root->anon_super.s_instances);
872 init_rwsem(&root->anon_super.s_umount);
873
874 return 0;
875}
876
877static int find_and_setup_root(struct btrfs_root *tree_root,
878 struct btrfs_fs_info *fs_info,
879 u64 objectid,
880 struct btrfs_root *root)
881{
882 int ret;
883 u32 blocksize;
884 u64 generation;
885
886 __setup_root(tree_root->nodesize, tree_root->leafsize,
887 tree_root->sectorsize, tree_root->stripesize,
888 root, fs_info, objectid);
889 ret = btrfs_find_last_root(tree_root, objectid,
890 &root->root_item, &root->root_key);
891 BUG_ON(ret);
892
893 generation = btrfs_root_generation(&root->root_item);
894 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
895 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
896 blocksize, generation);
897 BUG_ON(!root->node);
898 return 0;
899}
900
901int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
902 struct btrfs_fs_info *fs_info)
903{
904 struct extent_buffer *eb;
905 struct btrfs_root *log_root_tree = fs_info->log_root_tree;
906 u64 start = 0;
907 u64 end = 0;
908 int ret;
909
910 if (!log_root_tree)
911 return 0;
912
913 while (1) {
914 ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
915 0, &start, &end, EXTENT_DIRTY);
916 if (ret)
917 break;
918
919 clear_extent_dirty(&log_root_tree->dirty_log_pages,
920 start, end, GFP_NOFS);
921 }
922 eb = fs_info->log_root_tree->node;
923
924 WARN_ON(btrfs_header_level(eb) != 0);
925 WARN_ON(btrfs_header_nritems(eb) != 0);
926
927 ret = btrfs_free_reserved_extent(fs_info->tree_root,
928 eb->start, eb->len);
929 BUG_ON(ret);
930
931 free_extent_buffer(eb);
932 kfree(fs_info->log_root_tree);
933 fs_info->log_root_tree = NULL;
934 return 0;
935}
936
937int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
938 struct btrfs_fs_info *fs_info)
939{
940 struct btrfs_root *root;
941 struct btrfs_root *tree_root = fs_info->tree_root;
942
943 root = kzalloc(sizeof(*root), GFP_NOFS);
944 if (!root)
945 return -ENOMEM;
946
947 __setup_root(tree_root->nodesize, tree_root->leafsize,
948 tree_root->sectorsize, tree_root->stripesize,
949 root, fs_info, BTRFS_TREE_LOG_OBJECTID);
950
951 root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
952 root->root_key.type = BTRFS_ROOT_ITEM_KEY;
953 root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
954 root->ref_cows = 0;
955
956 root->node = btrfs_alloc_free_block(trans, root, root->leafsize,
957 0, BTRFS_TREE_LOG_OBJECTID,
958 trans->transid, 0, 0, 0);
959
960 btrfs_set_header_nritems(root->node, 0);
961 btrfs_set_header_level(root->node, 0);
962 btrfs_set_header_bytenr(root->node, root->node->start);
963 btrfs_set_header_generation(root->node, trans->transid);
964 btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID);
965
966 write_extent_buffer(root->node, root->fs_info->fsid,
967 (unsigned long)btrfs_header_fsid(root->node),
968 BTRFS_FSID_SIZE);
969 btrfs_mark_buffer_dirty(root->node);
970 btrfs_tree_unlock(root->node);
971 fs_info->log_root_tree = root;
972 return 0;
973}
974
975struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
976 struct btrfs_key *location)
977{
978 struct btrfs_root *root;
979 struct btrfs_fs_info *fs_info = tree_root->fs_info;
980 struct btrfs_path *path;
981 struct extent_buffer *l;
982 u64 highest_inode;
983 u64 generation;
984 u32 blocksize;
985 int ret = 0;
986
987 root = kzalloc(sizeof(*root), GFP_NOFS);
988 if (!root)
989 return ERR_PTR(-ENOMEM);
990 if (location->offset == (u64)-1) {
991 ret = find_and_setup_root(tree_root, fs_info,
992 location->objectid, root);
993 if (ret) {
994 kfree(root);
995 return ERR_PTR(ret);
996 }
997 goto insert;
998 }
999
1000 __setup_root(tree_root->nodesize, tree_root->leafsize,
1001 tree_root->sectorsize, tree_root->stripesize,
1002 root, fs_info, location->objectid);
1003
1004 path = btrfs_alloc_path();
1005 BUG_ON(!path);
1006 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
1007 if (ret != 0) {
1008 if (ret > 0)
1009 ret = -ENOENT;
1010 goto out;
1011 }
1012 l = path->nodes[0];
1013 read_extent_buffer(l, &root->root_item,
1014 btrfs_item_ptr_offset(l, path->slots[0]),
1015 sizeof(root->root_item));
1016 memcpy(&root->root_key, location, sizeof(*location));
1017 ret = 0;
1018out:
1019 btrfs_release_path(root, path);
1020 btrfs_free_path(path);
1021 if (ret) {
1022 kfree(root);
1023 return ERR_PTR(ret);
1024 }
1025 generation = btrfs_root_generation(&root->root_item);
1026 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1027 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1028 blocksize, generation);
1029 BUG_ON(!root->node);
1030insert:
1031 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
1032 root->ref_cows = 1;
1033 ret = btrfs_find_highest_inode(root, &highest_inode);
1034 if (ret == 0) {
1035 root->highest_inode = highest_inode;
1036 root->last_inode_alloc = highest_inode;
1037 }
1038 }
1039 return root;
1040}
1041
1042struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1043 u64 root_objectid)
1044{
1045 struct btrfs_root *root;
1046
1047 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID)
1048 return fs_info->tree_root;
1049 if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID)
1050 return fs_info->extent_root;
1051
1052 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1053 (unsigned long)root_objectid);
1054 return root;
1055}
1056
1057struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1058 struct btrfs_key *location)
1059{
1060 struct btrfs_root *root;
1061 int ret;
1062
1063 if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
1064 return fs_info->tree_root;
1065 if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
1066 return fs_info->extent_root;
1067 if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
1068 return fs_info->chunk_root;
1069 if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
1070 return fs_info->dev_root;
1071 if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
1072 return fs_info->csum_root;
1073
1074 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1075 (unsigned long)location->objectid);
1076 if (root)
1077 return root;
1078
1079 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
1080 if (IS_ERR(root))
1081 return root;
1082
1083 set_anon_super(&root->anon_super, NULL);
1084
1085 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1086 (unsigned long)root->root_key.objectid,
1087 root);
1088 if (ret) {
1089 free_extent_buffer(root->node);
1090 kfree(root);
1091 return ERR_PTR(ret);
1092 }
1093 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
1094 ret = btrfs_find_dead_roots(fs_info->tree_root,
1095 root->root_key.objectid, root);
1096 BUG_ON(ret);
1097 btrfs_orphan_cleanup(root);
1098 }
1099 return root;
1100}
1101
1102struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
1103 struct btrfs_key *location,
1104 const char *name, int namelen)
1105{
1106 struct btrfs_root *root;
1107 int ret;
1108
1109 root = btrfs_read_fs_root_no_name(fs_info, location);
1110 if (!root)
1111 return NULL;
1112
1113 if (root->in_sysfs)
1114 return root;
1115
1116 ret = btrfs_set_root_name(root, name, namelen);
1117 if (ret) {
1118 free_extent_buffer(root->node);
1119 kfree(root);
1120 return ERR_PTR(ret);
1121 }
1122#if 0
1123 ret = btrfs_sysfs_add_root(root);
1124 if (ret) {
1125 free_extent_buffer(root->node);
1126 kfree(root->name);
1127 kfree(root);
1128 return ERR_PTR(ret);
1129 }
1130#endif
1131 root->in_sysfs = 1;
1132 return root;
1133}
1134
1135static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1136{
1137 struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
1138 int ret = 0;
1139 struct list_head *cur;
1140 struct btrfs_device *device;
1141 struct backing_dev_info *bdi;
1142#if 0
1143 if ((bdi_bits & (1 << BDI_write_congested)) &&
1144 btrfs_congested_async(info, 0))
1145 return 1;
1146#endif
1147 list_for_each(cur, &info->fs_devices->devices) {
1148 device = list_entry(cur, struct btrfs_device, dev_list);
1149 if (!device->bdev)
1150 continue;
1151 bdi = blk_get_backing_dev_info(device->bdev);
1152 if (bdi && bdi_congested(bdi, bdi_bits)) {
1153 ret = 1;
1154 break;
1155 }
1156 }
1157 return ret;
1158}
1159
1160/*
1161 * this unplugs every device on the box; it is meant for the case
1162 * where page is null
1163 */
1164static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1165{
1166 struct list_head *cur;
1167 struct btrfs_device *device;
1168 struct btrfs_fs_info *info;
1169
1170 info = (struct btrfs_fs_info *)bdi->unplug_io_data;
1171 list_for_each(cur, &info->fs_devices->devices) {
1172 device = list_entry(cur, struct btrfs_device, dev_list);
1173 if (!device->bdev)
1174 continue;
1175
1176 bdi = blk_get_backing_dev_info(device->bdev);
1177 if (bdi->unplug_io_fn)
1178 bdi->unplug_io_fn(bdi, page);
1179 }
1180}
1181
1182static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1183{
1184 struct inode *inode;
1185 struct extent_map_tree *em_tree;
1186 struct extent_map *em;
1187 struct address_space *mapping;
1188 u64 offset;
1189
1190 /* the generic O_DIRECT read code does this */
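	/* the per-page path below is disabled for now; always unplug everything */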
1191 if (1 || !page) {
1192 __unplug_io_fn(bdi, page);
1193 return;
1194 }
1195
1196 /*
1197 * page->mapping may change at any time. Get a consistent copy
1198 * and use that for everything below
1199 */
1200 smp_mb();
1201 mapping = page->mapping;
1202 if (!mapping)
1203 return;
1204
1205 inode = mapping->host;
1206
1207 /*
1208 * don't do the expensive searching for a small number of
1209 * devices
1210 */
1211 if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
1212 __unplug_io_fn(bdi, page);
1213 return;
1214 }
1215
1216 offset = page_offset(page);
1217
1218 em_tree = &BTRFS_I(inode)->extent_tree;
1219 spin_lock(&em_tree->lock);
1220 em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
1221 spin_unlock(&em_tree->lock);
1222 if (!em) {
1223 __unplug_io_fn(bdi, page);
1224 return;
1225 }
1226
1227 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
1228 free_extent_map(em);
1229 __unplug_io_fn(bdi, page);
1230 return;
1231 }
1232 offset = offset - em->start;
1233 btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
1234 em->block_start + offset, page);
1235 free_extent_map(em);
1236}
1237
1238static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1239{
1240 bdi_init(bdi);
1241 bdi->ra_pages = default_backing_dev_info.ra_pages;
1242 bdi->state = 0;
1243 bdi->capabilities = default_backing_dev_info.capabilities;
1244 bdi->unplug_io_fn = btrfs_unplug_io_fn;
1245 bdi->unplug_io_data = info;
1246 bdi->congested_fn = btrfs_congested_fn;
1247 bdi->congested_data = info;
1248 return 0;
1249}
1250
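/*
 * a tree block can span multiple pages, and so multiple bios.  Decide if
 * this bio contains (or completes) everything needed to checksum the
 * block it touches.
 */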
1251static int bio_ready_for_csum(struct bio *bio)
1252{
1253 u64 length = 0;
1254 u64 buf_len = 0;
1255 u64 start = 0;
1256 struct page *page;
1257 struct extent_io_tree *io_tree = NULL;
1258 struct btrfs_fs_info *info = NULL;
1259 struct bio_vec *bvec;
1260 int i;
1261 int ret;
1262
1263 bio_for_each_segment(bvec, bio, i) {
1264 page = bvec->bv_page;
1265 if (page->private == EXTENT_PAGE_PRIVATE) {
1266 length += bvec->bv_len;
1267 continue;
1268 }
1269 if (!page->private) {
1270 length += bvec->bv_len;
1271 continue;
1272 }
1273 length = bvec->bv_len;
1274 buf_len = page->private >> 2;
1275 start = page_offset(page) + bvec->bv_offset;
1276 io_tree = &BTRFS_I(page->mapping->host)->io_tree;
1277 info = BTRFS_I(page->mapping->host)->root->fs_info;
1278 }
1279 /* are we fully contained in this bio? */
1280 if (buf_len <= length)
1281 return 1;
1282
1283 ret = extent_range_uptodate(io_tree, start + length,
1284 start + buf_len - 1);
1285	return ret;
1288}
1289
1290/*
1291 * called by the kthread helper functions to finally call the bio end_io
1292 * functions. This is where read checksum verification actually happens
1293 */
1294static void end_workqueue_fn(struct btrfs_work *work)
1295{
1296 struct bio *bio;
1297 struct end_io_wq *end_io_wq;
1298 struct btrfs_fs_info *fs_info;
1299 int error;
1300
1301 end_io_wq = container_of(work, struct end_io_wq, work);
1302 bio = end_io_wq->bio;
1303 fs_info = end_io_wq->info;
1304
1305 /* metadata bio reads are special because the whole tree block must
1306 * be checksummed at once. This makes sure the entire block is in
1307 * ram and up to date before trying to verify things. For
1308 * blocksize <= pagesize, it is basically a noop
1309 */
1310 if (!(bio->bi_rw & (1 << BIO_RW)) && end_io_wq->metadata &&
1311 !bio_ready_for_csum(bio)) {
1312 btrfs_queue_worker(&fs_info->endio_meta_workers,
1313 &end_io_wq->work);
1314 return;
1315 }
1316 error = end_io_wq->error;
1317 bio->bi_private = end_io_wq->private;
1318 bio->bi_end_io = end_io_wq->end_io;
1319 kfree(end_io_wq);
1320 bio_endio(bio, error);
1321}
1322
1323static int cleaner_kthread(void *arg)
1324{
1325 struct btrfs_root *root = arg;
1326
1327 do {
1328 smp_mb();
1329 if (root->fs_info->closing)
1330 break;
1331
1332 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1333 mutex_lock(&root->fs_info->cleaner_mutex);
1334 btrfs_clean_old_snapshots(root);
1335 mutex_unlock(&root->fs_info->cleaner_mutex);
1336
1337 if (freezing(current)) {
1338 refrigerator();
1339 } else {
1340 smp_mb();
1341 if (root->fs_info->closing)
1342 break;
1343 set_current_state(TASK_INTERRUPTIBLE);
1344 schedule();
1345 __set_current_state(TASK_RUNNING);
1346 }
1347 } while (!kthread_should_stop());
1348 return 0;
1349}
1350
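/*
 * kick off a commit of the running transaction roughly every 30 seconds
 */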
1351static int transaction_kthread(void *arg)
1352{
1353 struct btrfs_root *root = arg;
1354 struct btrfs_trans_handle *trans;
1355 struct btrfs_transaction *cur;
1356 unsigned long now;
1357 unsigned long delay;
1358 int ret;
1359
1360 do {
1361 smp_mb();
1362 if (root->fs_info->closing)
1363 break;
1364
1365 delay = HZ * 30;
1366 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1367 mutex_lock(&root->fs_info->transaction_kthread_mutex);
1368
1369 if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
1370 printk(KERN_INFO "btrfs: total reference cache "
1371 "size %llu\n",
1372 root->fs_info->total_ref_cache_size);
1373 }
1374
1375 mutex_lock(&root->fs_info->trans_mutex);
1376 cur = root->fs_info->running_transaction;
1377 if (!cur) {
1378 mutex_unlock(&root->fs_info->trans_mutex);
1379 goto sleep;
1380 }
1381
1382 now = get_seconds();
1383 if (now < cur->start_time || now - cur->start_time < 30) {
1384 mutex_unlock(&root->fs_info->trans_mutex);
1385 delay = HZ * 5;
1386 goto sleep;
1387 }
1388 mutex_unlock(&root->fs_info->trans_mutex);
1389 trans = btrfs_start_transaction(root, 1);
1390 ret = btrfs_commit_transaction(trans, root);
1391sleep:
1392 wake_up_process(root->fs_info->cleaner_kthread);
1393 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
1394
1395 if (freezing(current)) {
1396 refrigerator();
1397 } else {
1398 if (root->fs_info->closing)
1399 break;
1400 set_current_state(TASK_INTERRUPTIBLE);
1401 schedule_timeout(delay);
1402 __set_current_state(TASK_RUNNING);
1403 }
1404 } while (!kthread_should_stop());
1405 return 0;
1406}
1407
1408struct btrfs_root *open_ctree(struct super_block *sb,
1409 struct btrfs_fs_devices *fs_devices,
1410 char *options)
1411{
1412 u32 sectorsize;
1413 u32 nodesize;
1414 u32 leafsize;
1415 u32 blocksize;
1416 u32 stripesize;
1417 u64 generation;
1418 u64 features;
1419 struct btrfs_key location;
1420 struct buffer_head *bh;
1421 struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
1422 GFP_NOFS);
1423 struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
1424 GFP_NOFS);
1425 struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root),
1426 GFP_NOFS);
1427 struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
1428 GFP_NOFS);
1429 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
1430 GFP_NOFS);
1431 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
1432 GFP_NOFS);
1433 struct btrfs_root *log_tree_root;
1434
1435 int ret;
1436 int err = -EINVAL;
1437
1438 struct btrfs_super_block *disk_super;
1439
1440 if (!extent_root || !tree_root || !fs_info ||
1441 !chunk_root || !dev_root || !csum_root) {
1442 err = -ENOMEM;
1443 goto fail;
1444 }
1445 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
1446 INIT_LIST_HEAD(&fs_info->trans_list);
1447 INIT_LIST_HEAD(&fs_info->dead_roots);
1448 INIT_LIST_HEAD(&fs_info->hashers);
1449 INIT_LIST_HEAD(&fs_info->delalloc_inodes);
1450 spin_lock_init(&fs_info->hash_lock);
1451 spin_lock_init(&fs_info->delalloc_lock);
1452 spin_lock_init(&fs_info->new_trans_lock);
1453 spin_lock_init(&fs_info->ref_cache_lock);
1454
1455 init_completion(&fs_info->kobj_unregister);
1456 fs_info->tree_root = tree_root;
1457 fs_info->extent_root = extent_root;
1458 fs_info->csum_root = csum_root;
1459 fs_info->chunk_root = chunk_root;
1460 fs_info->dev_root = dev_root;
1461 fs_info->fs_devices = fs_devices;
1462 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
1463 INIT_LIST_HEAD(&fs_info->space_info);
1464 btrfs_mapping_init(&fs_info->mapping_tree);
1465 atomic_set(&fs_info->nr_async_submits, 0);
1466 atomic_set(&fs_info->async_delalloc_pages, 0);
1467 atomic_set(&fs_info->async_submit_draining, 0);
1468 atomic_set(&fs_info->nr_async_bios, 0);
1469 atomic_set(&fs_info->throttles, 0);
1470 atomic_set(&fs_info->throttle_gen, 0);
1471 fs_info->sb = sb;
1472 fs_info->max_extent = (u64)-1;
1473 fs_info->max_inline = 8192 * 1024;
1474 setup_bdi(fs_info, &fs_info->bdi);
1475 fs_info->btree_inode = new_inode(sb);
1476 fs_info->btree_inode->i_ino = 1;
1477 fs_info->btree_inode->i_nlink = 1;
1478
1479 fs_info->thread_pool_size = min_t(unsigned long,
1480 num_online_cpus() + 2, 8);
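	/* e.g. a 4-cpu box gets 6 pool threads; 6 or more cpus cap out at 8 */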
1481
1482 INIT_LIST_HEAD(&fs_info->ordered_extents);
1483 spin_lock_init(&fs_info->ordered_extent_lock);
1484
1485 sb->s_blocksize = 4096;
1486 sb->s_blocksize_bits = blksize_bits(4096);
1487
1488 /*
1489	 * we set the i_size on the btree inode to the max possible offset.
1490 * the real end of the address space is determined by all of
1491 * the devices in the system
1492 */
1493 fs_info->btree_inode->i_size = OFFSET_MAX;
1494 fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
1495 fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
1496
1497 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
1498 fs_info->btree_inode->i_mapping,
1499 GFP_NOFS);
1500 extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
1501 GFP_NOFS);
1502
1503 BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
1504
1505 spin_lock_init(&fs_info->block_group_cache_lock);
1506 fs_info->block_group_cache_tree.rb_node = NULL;
1507
1508 extent_io_tree_init(&fs_info->pinned_extents,
1509 fs_info->btree_inode->i_mapping, GFP_NOFS);
1510 extent_io_tree_init(&fs_info->pending_del,
1511 fs_info->btree_inode->i_mapping, GFP_NOFS);
1512 extent_io_tree_init(&fs_info->extent_ins,
1513 fs_info->btree_inode->i_mapping, GFP_NOFS);
1514 fs_info->do_barriers = 1;
1515
1516 INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
1517 btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree);
1518 btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree);
1519
1520 BTRFS_I(fs_info->btree_inode)->root = tree_root;
1521 memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
1522 sizeof(struct btrfs_key));
1523 insert_inode_hash(fs_info->btree_inode);
1524
1525 mutex_init(&fs_info->trans_mutex);
1526 mutex_init(&fs_info->tree_log_mutex);
1527 mutex_init(&fs_info->drop_mutex);
1528 mutex_init(&fs_info->extent_ins_mutex);
1529 mutex_init(&fs_info->pinned_mutex);
1530 mutex_init(&fs_info->chunk_mutex);
1531 mutex_init(&fs_info->transaction_kthread_mutex);
1532 mutex_init(&fs_info->cleaner_mutex);
1533 mutex_init(&fs_info->volume_mutex);
1534 mutex_init(&fs_info->tree_reloc_mutex);
1535 init_waitqueue_head(&fs_info->transaction_throttle);
1536 init_waitqueue_head(&fs_info->transaction_wait);
1537 init_waitqueue_head(&fs_info->async_submit_wait);
1538 init_waitqueue_head(&fs_info->tree_log_wait);
1539 atomic_set(&fs_info->tree_log_commit, 0);
1540 atomic_set(&fs_info->tree_log_writers, 0);
1541 fs_info->tree_log_transid = 0;
1542
1543 __setup_root(4096, 4096, 4096, 4096, tree_root,
1544 fs_info, BTRFS_ROOT_TREE_OBJECTID);
1545
1546
1547 bh = btrfs_read_dev_super(fs_devices->latest_bdev);
1548 if (!bh)
1549 goto fail_iput;
1550
1551 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
1552 memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
1553 sizeof(fs_info->super_for_commit));
1554 brelse(bh);
1555
1556 memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
1557
1558 disk_super = &fs_info->super_copy;
1559 if (!btrfs_super_root(disk_super))
1560 goto fail_iput;
1561
1562 ret = btrfs_parse_options(tree_root, options);
1563 if (ret) {
1564 err = ret;
1565 goto fail_iput;
1566 }
1567
1568 features = btrfs_super_incompat_flags(disk_super) &
1569 ~BTRFS_FEATURE_INCOMPAT_SUPP;
1570 if (features) {
1571 printk(KERN_ERR "BTRFS: couldn't mount because of "
1572 "unsupported optional features (%Lx).\n",
1573 features);
1574 err = -EINVAL;
1575 goto fail_iput;
1576 }
1577
1578 features = btrfs_super_compat_ro_flags(disk_super) &
1579 ~BTRFS_FEATURE_COMPAT_RO_SUPP;
1580 if (!(sb->s_flags & MS_RDONLY) && features) {
1581 printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
1582 "unsupported option features (%Lx).\n",
1583 features);
1584 err = -EINVAL;
1585 goto fail_iput;
1586 }
1587
1588 /*
1589 * we need to start all the end_io workers up front because the
1590	 * queue work function gets called at interrupt time, and so the
1591	 * worker pools cannot be grown dynamically.
1592 */
1593 btrfs_init_workers(&fs_info->workers, "worker",
1594 fs_info->thread_pool_size);
1595
1596 btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
1597 fs_info->thread_pool_size);
1598
1599 btrfs_init_workers(&fs_info->submit_workers, "submit",
1600 min_t(u64, fs_devices->num_devices,
1601 fs_info->thread_pool_size));
1602
1603 /* a higher idle thresh on the submit workers makes it much more
1604	 * likely that bios will be sent down in a sane order to the
1605 * devices
1606 */
1607 fs_info->submit_workers.idle_thresh = 64;
1608
1609 fs_info->workers.idle_thresh = 16;
1610 fs_info->workers.ordered = 1;
1611
1612 fs_info->delalloc_workers.idle_thresh = 2;
1613 fs_info->delalloc_workers.ordered = 1;
1614
1615 btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
1616 btrfs_init_workers(&fs_info->endio_workers, "endio",
1617 fs_info->thread_pool_size);
1618 btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
1619 fs_info->thread_pool_size);
1620 btrfs_init_workers(&fs_info->endio_meta_write_workers,
1621 "endio-meta-write", fs_info->thread_pool_size);
1622 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
1623 fs_info->thread_pool_size);
1624
1625 /*
1626 * endios are largely parallel and should have a very
1627 * low idle thresh
1628 */
1629 fs_info->endio_workers.idle_thresh = 4;
1630 fs_info->endio_write_workers.idle_thresh = 64;
1631 fs_info->endio_meta_write_workers.idle_thresh = 64;
1632
1633 btrfs_start_workers(&fs_info->workers, 1);
1634 btrfs_start_workers(&fs_info->submit_workers, 1);
1635 btrfs_start_workers(&fs_info->delalloc_workers, 1);
1636 btrfs_start_workers(&fs_info->fixup_workers, 1);
1637 btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
1638 btrfs_start_workers(&fs_info->endio_meta_workers,
1639 fs_info->thread_pool_size);
1640 btrfs_start_workers(&fs_info->endio_meta_write_workers,
1641 fs_info->thread_pool_size);
1642 btrfs_start_workers(&fs_info->endio_write_workers,
1643 fs_info->thread_pool_size);
1644
1645 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1646 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
1647 4 * 1024 * 1024 / PAGE_CACHE_SIZE);
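	/* with 4K pages that floor works out to 1024 pages (4MiB) of readahead */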
1648
1649 nodesize = btrfs_super_nodesize(disk_super);
1650 leafsize = btrfs_super_leafsize(disk_super);
1651 sectorsize = btrfs_super_sectorsize(disk_super);
1652 stripesize = btrfs_super_stripesize(disk_super);
1653 tree_root->nodesize = nodesize;
1654 tree_root->leafsize = leafsize;
1655 tree_root->sectorsize = sectorsize;
1656 tree_root->stripesize = stripesize;
1657
1658 sb->s_blocksize = sectorsize;
1659 sb->s_blocksize_bits = blksize_bits(sectorsize);
1660
1661 if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
1662 sizeof(disk_super->magic))) {
1663 printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
1664 goto fail_sb_buffer;
1665 }
1666
1667 mutex_lock(&fs_info->chunk_mutex);
1668 ret = btrfs_read_sys_array(tree_root);
1669 mutex_unlock(&fs_info->chunk_mutex);
1670 if (ret) {
1671 printk(KERN_WARNING "btrfs: failed to read the system "
1672 "array on %s\n", sb->s_id);
1673 goto fail_sys_array;
1674 }
1675
1676 blocksize = btrfs_level_size(tree_root,
1677 btrfs_super_chunk_root_level(disk_super));
1678 generation = btrfs_super_chunk_root_generation(disk_super);
1679
1680 __setup_root(nodesize, leafsize, sectorsize, stripesize,
1681 chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
1682
1683 chunk_root->node = read_tree_block(chunk_root,
1684 btrfs_super_chunk_root(disk_super),
1685 blocksize, generation);
1686 BUG_ON(!chunk_root->node);
1687
1688 read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
1689 (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
1690 BTRFS_UUID_SIZE);
1691
1692 mutex_lock(&fs_info->chunk_mutex);
1693 ret = btrfs_read_chunk_tree(chunk_root);
1694 mutex_unlock(&fs_info->chunk_mutex);
1695 if (ret) {
1696 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
1697 sb->s_id);
1698 goto fail_chunk_root;
1699 }
1700
1701 btrfs_close_extra_devices(fs_devices);
1702
1703 blocksize = btrfs_level_size(tree_root,
1704 btrfs_super_root_level(disk_super));
1705 generation = btrfs_super_generation(disk_super);
1706
1707 tree_root->node = read_tree_block(tree_root,
1708 btrfs_super_root(disk_super),
1709 blocksize, generation);
1710 if (!tree_root->node)
1711 goto fail_chunk_root;
1712
1713
1714 ret = find_and_setup_root(tree_root, fs_info,
1715 BTRFS_EXTENT_TREE_OBJECTID, extent_root);
1716 if (ret)
1717 goto fail_tree_root;
1718 extent_root->track_dirty = 1;
1719
1720 ret = find_and_setup_root(tree_root, fs_info,
1721 BTRFS_DEV_TREE_OBJECTID, dev_root);
1722 dev_root->track_dirty = 1;
1723
1724 if (ret)
1725 goto fail_extent_root;
1726
1727 ret = find_and_setup_root(tree_root, fs_info,
1728 BTRFS_CSUM_TREE_OBJECTID, csum_root);
1729 if (ret)
1730 goto fail_extent_root;
1731
1732 csum_root->track_dirty = 1;
1733
1734 btrfs_read_block_groups(extent_root);
1735
1736 fs_info->generation = generation;
1737 fs_info->last_trans_committed = generation;
1738 fs_info->data_alloc_profile = (u64)-1;
1739 fs_info->metadata_alloc_profile = (u64)-1;
1740 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1741 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
1742 "btrfs-cleaner");
1743 if (!fs_info->cleaner_kthread)
1744 goto fail_csum_root;
1745
1746 fs_info->transaction_kthread = kthread_run(transaction_kthread,
1747 tree_root,
1748 "btrfs-transaction");
1749 if (!fs_info->transaction_kthread)
1750 goto fail_cleaner;
1751
1752 if (btrfs_super_log_root(disk_super) != 0) {
1753 u64 bytenr = btrfs_super_log_root(disk_super);
1754
1755 if (fs_devices->rw_devices == 0) {
1756 printk(KERN_WARNING "Btrfs log replay required "
1757 "on RO media\n");
1758 err = -EIO;
1759 goto fail_trans_kthread;
1760 }
1761 blocksize =
1762 btrfs_level_size(tree_root,
1763 btrfs_super_log_root_level(disk_super));
1764
1765 log_tree_root = kzalloc(sizeof(struct btrfs_root),
1766 GFP_NOFS);
1767
1768 __setup_root(nodesize, leafsize, sectorsize, stripesize,
1769 log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
1770
1771 log_tree_root->node = read_tree_block(tree_root, bytenr,
1772 blocksize,
1773 generation + 1);
1774 ret = btrfs_recover_log_trees(log_tree_root);
1775 BUG_ON(ret);
1776
1777 if (sb->s_flags & MS_RDONLY) {
1778 ret = btrfs_commit_super(tree_root);
1779 BUG_ON(ret);
1780 }
1781 }
1782
1783 if (!(sb->s_flags & MS_RDONLY)) {
1784 ret = btrfs_cleanup_reloc_trees(tree_root);
1785 BUG_ON(ret);
1786 }
1787
1788 location.objectid = BTRFS_FS_TREE_OBJECTID;
1789 location.type = BTRFS_ROOT_ITEM_KEY;
1790 location.offset = (u64)-1;
1791
1792 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
1793 if (!fs_info->fs_root)
1794 goto fail_trans_kthread;
1795 return tree_root;
1796
1797fail_trans_kthread:
1798 kthread_stop(fs_info->transaction_kthread);
1799fail_cleaner:
1800 kthread_stop(fs_info->cleaner_kthread);
1801
1802 /*
1803 * make sure we're done with the btree inode before we stop our
1804 * kthreads
1805 */
1806 filemap_write_and_wait(fs_info->btree_inode->i_mapping);
1807 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
1808
1809fail_csum_root:
1810 free_extent_buffer(csum_root->node);
1811fail_extent_root:
1812 free_extent_buffer(extent_root->node);
1813fail_tree_root:
1814 free_extent_buffer(tree_root->node);
1815fail_chunk_root:
1816 free_extent_buffer(chunk_root->node);
1817fail_sys_array:
1818 free_extent_buffer(dev_root->node);
1819fail_sb_buffer:
1820 btrfs_stop_workers(&fs_info->fixup_workers);
1821 btrfs_stop_workers(&fs_info->delalloc_workers);
1822 btrfs_stop_workers(&fs_info->workers);
1823 btrfs_stop_workers(&fs_info->endio_workers);
1824 btrfs_stop_workers(&fs_info->endio_meta_workers);
1825 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
1826 btrfs_stop_workers(&fs_info->endio_write_workers);
1827 btrfs_stop_workers(&fs_info->submit_workers);
1828fail_iput:
1829 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
1830 iput(fs_info->btree_inode);
1831fail:
1832 btrfs_close_devices(fs_info->fs_devices);
1833 btrfs_mapping_tree_free(&fs_info->mapping_tree);
1834
1835 kfree(extent_root);
1836 kfree(tree_root);
1837 bdi_destroy(&fs_info->bdi);
1838 kfree(fs_info);
1839 kfree(chunk_root);
1840 kfree(dev_root);
1841 kfree(csum_root);
1842 return ERR_PTR(err);
1843}
1844
1845static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
1846{
1847 char b[BDEVNAME_SIZE];
1848
1849 if (uptodate) {
1850 set_buffer_uptodate(bh);
1851 } else {
1852 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
1853 printk(KERN_WARNING "lost page write due to "
1854 "I/O error on %s\n",
1855 bdevname(bh->b_bdev, b));
1856 }
1857		/* note, we don't set_buffer_write_io_error because we have
1858 * our own ways of dealing with the IO errors
1859 */
1860 clear_buffer_uptodate(bh);
1861 }
1862 unlock_buffer(bh);
1863 put_bh(bh);
1864}
1865
1866struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
1867{
1868 struct buffer_head *bh;
1869 struct buffer_head *latest = NULL;
1870 struct btrfs_super_block *super;
1871 int i;
1872 u64 transid = 0;
1873 u64 bytenr;
1874
1875 /* we would like to check all the supers, but that would make
1876 * a btrfs mount succeed after a mkfs from a different FS.
1877	 * So, until a special mount option is added to scan all
1878	 * BTRFS_SUPER_MIRROR_MAX supers, only the first one is checked
1879 */
1880 for (i = 0; i < 1; i++) {
1881 bytenr = btrfs_sb_offset(i);
1882 if (bytenr + 4096 >= i_size_read(bdev->bd_inode))
1883 break;
1884 bh = __bread(bdev, bytenr / 4096, 4096);
1885 if (!bh)
1886 continue;
1887
1888 super = (struct btrfs_super_block *)bh->b_data;
1889 if (btrfs_super_bytenr(super) != bytenr ||
1890 strncmp((char *)(&super->magic), BTRFS_MAGIC,
1891 sizeof(super->magic))) {
1892 brelse(bh);
1893 continue;
1894 }
1895
1896 if (!latest || btrfs_super_generation(super) > transid) {
1897 brelse(latest);
1898 latest = bh;
1899 transid = btrfs_super_generation(super);
1900 } else {
1901 brelse(bh);
1902 }
1903 }
1904 return latest;
1905}
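
/*
 * A minimal standalone sketch of the selection rule used above, with the
 * candidate supers already read into an illustrative array: among the
 * mirrors whose bytenr and magic checks passed, keep the one carrying
 * the highest generation number.
 */
struct super_candidate {
	unsigned long long generation;
	int valid;			/* bytenr and magic checks passed */
};

static int pick_latest_super(const struct super_candidate *s, int n)
{
	int i, best = -1;
	unsigned long long best_gen = 0;

	for (i = 0; i < n; i++) {
		if (!s[i].valid)
			continue;
		if (best < 0 || s[i].generation > best_gen) {
			best = i;
			best_gen = s[i].generation;
		}
	}
	return best;			/* -1 when no mirror was valid */
}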
1906
1907static int write_dev_supers(struct btrfs_device *device,
1908 struct btrfs_super_block *sb,
1909 int do_barriers, int wait, int max_mirrors)
1910{
1911 struct buffer_head *bh;
1912 int i;
1913 int ret;
1914 int errors = 0;
1915 u32 crc;
1916 u64 bytenr;
1917 int last_barrier = 0;
1918
1919 if (max_mirrors == 0)
1920 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
1921
1922 /* make sure only the last submit_bh does a barrier */
1923 if (do_barriers) {
1924 for (i = 0; i < max_mirrors; i++) {
1925 bytenr = btrfs_sb_offset(i);
1926 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
1927 device->total_bytes)
1928 break;
1929 last_barrier = i;
1930 }
1931 }
1932
1933 for (i = 0; i < max_mirrors; i++) {
1934 bytenr = btrfs_sb_offset(i);
1935 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
1936 break;
1937
1938 if (wait) {
1939 bh = __find_get_block(device->bdev, bytenr / 4096,
1940 BTRFS_SUPER_INFO_SIZE);
1941 BUG_ON(!bh);
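			/*
			 * the wait == 0 pass left its __getblk reference
			 * held on this buffer, so dropping the lookup
			 * reference early still leaves the bh pinned for
			 * the wait below
			 */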
1942 brelse(bh);
1943 wait_on_buffer(bh);
1944 if (buffer_uptodate(bh)) {
1945 brelse(bh);
1946 continue;
1947 }
1948 } else {
1949 btrfs_set_super_bytenr(sb, bytenr);
1950
1951 crc = ~(u32)0;
1952 crc = btrfs_csum_data(NULL, (char *)sb +
1953 BTRFS_CSUM_SIZE, crc,
1954 BTRFS_SUPER_INFO_SIZE -
1955 BTRFS_CSUM_SIZE);
1956 btrfs_csum_final(crc, sb->csum);
1957
1958 bh = __getblk(device->bdev, bytenr / 4096,
1959 BTRFS_SUPER_INFO_SIZE);
1960 memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
1961
1962 set_buffer_uptodate(bh);
1963 get_bh(bh);
1964 lock_buffer(bh);
1965 bh->b_end_io = btrfs_end_buffer_write_sync;
1966 }
1967
1968 if (i == last_barrier && do_barriers && device->barriers) {
1969 ret = submit_bh(WRITE_BARRIER, bh);
1970 if (ret == -EOPNOTSUPP) {
1971 printk("btrfs: disabling barriers on dev %s\n",
1972 device->name);
1973 set_buffer_uptodate(bh);
1974 device->barriers = 0;
1975 get_bh(bh);
1976 lock_buffer(bh);
1977 ret = submit_bh(WRITE, bh);
1978 }
1979 } else {
1980 ret = submit_bh(WRITE, bh);
1981 }
1982
1983 if (!ret && wait) {
1984 wait_on_buffer(bh);
1985 if (!buffer_uptodate(bh))
1986 errors++;
1987 } else if (ret) {
1988 errors++;
1989 }
1990 if (wait)
1991 brelse(bh);
1992 }
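	/* succeed if at least one attempted mirror made it; fail only
	 * when every copy we tried to write hit an error
	 */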
1993 return errors < i ? 0 : -1;
1994}
1995
1996int write_all_supers(struct btrfs_root *root, int max_mirrors)
1997{
1998 struct list_head *cur;
1999 struct list_head *head = &root->fs_info->fs_devices->devices;
2000 struct btrfs_device *dev;
2001 struct btrfs_super_block *sb;
2002 struct btrfs_dev_item *dev_item;
2003 int ret;
2004 int do_barriers;
2005 int max_errors;
2006 int total_errors = 0;
2007 u64 flags;
2008
2009 max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
2010 do_barriers = !btrfs_test_opt(root, NOBARRIER);
2011
2012 sb = &root->fs_info->super_for_commit;
2013 dev_item = &sb->dev_item;
2014 list_for_each(cur, head) {
2015 dev = list_entry(cur, struct btrfs_device, dev_list);
2016 if (!dev->bdev) {
2017 total_errors++;
2018 continue;
2019 }
2020 if (!dev->in_fs_metadata || !dev->writeable)
2021 continue;
2022
2023 btrfs_set_stack_device_generation(dev_item, 0);
2024 btrfs_set_stack_device_type(dev_item, dev->type);
2025 btrfs_set_stack_device_id(dev_item, dev->devid);
2026 btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
2027 btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
2028 btrfs_set_stack_device_io_align(dev_item, dev->io_align);
2029 btrfs_set_stack_device_io_width(dev_item, dev->io_width);
2030 btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
2031 memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
2032 memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
2033
2034 flags = btrfs_super_flags(sb);
2035 btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
2036
2037 ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors);
2038 if (ret)
2039 total_errors++;
2040 }
2041 if (total_errors > max_errors) {
2042 printk(KERN_ERR "btrfs: %d errors while writing supers\n",
2043 total_errors);
2044 BUG();
2045 }
2046
2047 total_errors = 0;
2048 list_for_each(cur, head) {
2049 dev = list_entry(cur, struct btrfs_device, dev_list);
2050 if (!dev->bdev)
2051 continue;
2052 if (!dev->in_fs_metadata || !dev->writeable)
2053 continue;
2054
2055 ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors);
2056 if (ret)
2057 total_errors++;
2058 }
2059 if (total_errors > max_errors) {
2060 printk(KERN_ERR "btrfs: %d errors while writing supers\n",
2061 total_errors);
2062 BUG();
2063 }
2064 return 0;
2065}
2066
2067int write_ctree_super(struct btrfs_trans_handle *trans,
2068 struct btrfs_root *root, int max_mirrors)
2069{
2070 int ret;
2071
2072 ret = write_all_supers(root, max_mirrors);
2073 return ret;
2074}
2075
2076int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
2077{
2078 radix_tree_delete(&fs_info->fs_roots_radix,
2079 (unsigned long)root->root_key.objectid);
2080 if (root->anon_super.s_dev) {
2081 down_write(&root->anon_super.s_umount);
2082 kill_anon_super(&root->anon_super);
2083 }
2084 if (root->node)
2085 free_extent_buffer(root->node);
2086 if (root->commit_root)
2087 free_extent_buffer(root->commit_root);
2088 kfree(root->name);
2089 kfree(root);
2090 return 0;
2091}
2092
2093static int del_fs_roots(struct btrfs_fs_info *fs_info)
2094{
2095 int ret;
2096 struct btrfs_root *gang[8];
2097 int i;
2098
2099 while (1) {
2100 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
2101 (void **)gang, 0,
2102 ARRAY_SIZE(gang));
2103 if (!ret)
2104 break;
2105 for (i = 0; i < ret; i++)
2106 btrfs_free_fs_root(fs_info, gang[i]);
2107 }
2108 return 0;
2109}
2110
2111int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
2112{
2113 u64 root_objectid = 0;
2114 struct btrfs_root *gang[8];
2115 int i;
2116 int ret;
2117
2118 while (1) {
2119 ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
2120 (void **)gang, root_objectid,
2121 ARRAY_SIZE(gang));
2122 if (!ret)
2123 break;
2124 for (i = 0; i < ret; i++) {
2125 root_objectid = gang[i]->root_key.objectid;
2126 ret = btrfs_find_dead_roots(fs_info->tree_root,
2127 root_objectid, gang[i]);
2128 BUG_ON(ret);
2129 btrfs_orphan_cleanup(gang[i]);
2130 }
2131 root_objectid++;
2132 }
2133 return 0;
2134}
2135
2136int btrfs_commit_super(struct btrfs_root *root)
2137{
2138 struct btrfs_trans_handle *trans;
2139 int ret;
2140
2141 mutex_lock(&root->fs_info->cleaner_mutex);
2142 btrfs_clean_old_snapshots(root);
2143 mutex_unlock(&root->fs_info->cleaner_mutex);
2144 trans = btrfs_start_transaction(root, 1);
2145 ret = btrfs_commit_transaction(trans, root);
2146 BUG_ON(ret);
2147 /* run commit again to drop the original snapshot */
2148 trans = btrfs_start_transaction(root, 1);
2149 btrfs_commit_transaction(trans, root);
2150 ret = btrfs_write_and_wait_transaction(NULL, root);
2151 BUG_ON(ret);
2152
2153 ret = write_ctree_super(NULL, root, 0);
2154 return ret;
2155}
2156
2157int close_ctree(struct btrfs_root *root)
2158{
2159 struct btrfs_fs_info *fs_info = root->fs_info;
2160 int ret;
2161
2162 fs_info->closing = 1;
2163 smp_mb();
2164
2165 kthread_stop(root->fs_info->transaction_kthread);
2166 kthread_stop(root->fs_info->cleaner_kthread);
2167
2168 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
2169 ret = btrfs_commit_super(root);
2170 if (ret)
2171 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2172 }
2173
2174 if (fs_info->delalloc_bytes) {
2175 printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
2176 fs_info->delalloc_bytes);
2177 }
2178 if (fs_info->total_ref_cache_size) {
2179 printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
2180 (unsigned long long)fs_info->total_ref_cache_size);
2181 }
2182
2183 if (fs_info->extent_root->node)
2184 free_extent_buffer(fs_info->extent_root->node);
2185
2186 if (fs_info->tree_root->node)
2187 free_extent_buffer(fs_info->tree_root->node);
2188
2189 if (root->fs_info->chunk_root->node)
2190 free_extent_buffer(root->fs_info->chunk_root->node);
2191
2192 if (root->fs_info->dev_root->node)
2193 free_extent_buffer(root->fs_info->dev_root->node);
2194
2195 if (root->fs_info->csum_root->node)
2196 free_extent_buffer(root->fs_info->csum_root->node);
2197
2198 btrfs_free_block_groups(root->fs_info);
2199
2200 del_fs_roots(fs_info);
2201
2202 iput(fs_info->btree_inode);
2203
2204 btrfs_stop_workers(&fs_info->fixup_workers);
2205 btrfs_stop_workers(&fs_info->delalloc_workers);
2206 btrfs_stop_workers(&fs_info->workers);
2207 btrfs_stop_workers(&fs_info->endio_workers);
2208 btrfs_stop_workers(&fs_info->endio_meta_workers);
2209 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2210 btrfs_stop_workers(&fs_info->endio_write_workers);
2211 btrfs_stop_workers(&fs_info->submit_workers);
2212
2213#if 0
2214 while (!list_empty(&fs_info->hashers)) {
2215 struct btrfs_hasher *hasher;
2216 hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher,
2217 hashers);
2218 list_del(&hasher->hashers);
2219 crypto_free_hash(&fs_info->hash_tfm);
2220 kfree(hasher);
2221 }
2222#endif
2223 btrfs_close_devices(fs_info->fs_devices);
2224 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2225
2226 bdi_destroy(&fs_info->bdi);
2227
2228 kfree(fs_info->extent_root);
2229 kfree(fs_info->tree_root);
2230 kfree(fs_info->chunk_root);
2231 kfree(fs_info->dev_root);
2232 kfree(fs_info->csum_root);
2233 return 0;
2234}
2235
2236int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
2237{
2238 int ret;
2239 struct inode *btree_inode = buf->first_page->mapping->host;
2240
2241 ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf);
2242 if (!ret)
2243 return ret;
2244
2245 ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
2246 parent_transid);
2247 return !ret;
2248}
2249
2250int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
2251{
2252 struct inode *btree_inode = buf->first_page->mapping->host;
2253 return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree,
2254 buf);
2255}
2256
2257void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
2258{
2259 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
2260 u64 transid = btrfs_header_generation(buf);
2261 struct inode *btree_inode = root->fs_info->btree_inode;
2262
2263 WARN_ON(!btrfs_tree_locked(buf));
2264 if (transid != root->fs_info->generation) {
2265 printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
2266 "found %llu running %llu\n",
2267 (unsigned long long)buf->start,
2268 (unsigned long long)transid,
2269 (unsigned long long)root->fs_info->generation);
2270 WARN_ON(1);
2271 }
2272 set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf);
2273}
2274
2275void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
2276{
2277 /*
2278 * looks as though older kernels can get into trouble with
2279	 * this code; they end up stuck in balance_dirty_pages forever
2280 */
2281 struct extent_io_tree *tree;
2282 u64 num_dirty;
2283 u64 start = 0;
2284 unsigned long thresh = 32 * 1024 * 1024;
2285 tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
2286
2287 if (current_is_pdflush() || current->flags & PF_MEMALLOC)
2288 return;
2289
2290 num_dirty = count_range_bits(tree, &start, (u64)-1,
2291 thresh, EXTENT_DIRTY);
2292 if (num_dirty > thresh) {
2293 balance_dirty_pages_ratelimited_nr(
2294 root->fs_info->btree_inode->i_mapping, 1);
2295 }
2296 return;
2297}
2298
2299int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2300{
2301 struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
2302 int ret;
2303 ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
2304 if (ret == 0)
2305 buf->flags |= EXTENT_UPTODATE;
2306 return ret;
2307}
2308
2309int btree_lock_page_hook(struct page *page)
2310{
2311 struct inode *inode = page->mapping->host;
2312 struct btrfs_root *root = BTRFS_I(inode)->root;
2313 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2314 struct extent_buffer *eb;
2315 unsigned long len;
2316 u64 bytenr = page_offset(page);
2317
2318 if (page->private == EXTENT_PAGE_PRIVATE)
2319 goto out;
2320
2321 len = page->private >> 2;
2322 eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS);
2323 if (!eb)
2324 goto out;
2325
2326 btrfs_tree_lock(eb);
2327 spin_lock(&root->fs_info->hash_lock);
2328 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2329 spin_unlock(&root->fs_info->hash_lock);
2330 btrfs_tree_unlock(eb);
2331 free_extent_buffer(eb);
2332out:
2333 lock_page(page);
2334 return 0;
2335}
2336
2337static struct extent_io_ops btree_extent_io_ops = {
2338 .write_cache_pages_lock_hook = btree_lock_page_hook,
2339 .readpage_end_io_hook = btree_readpage_end_io_hook,
2340 .submit_bio_hook = btree_submit_bio_hook,
2341 /* note we're sharing with inode.c for the merge bio hook */
2342 .merge_bio_hook = btrfs_merge_bio_hook,
2343};
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
new file mode 100644
index 000000000000..c0ff404c31b7
--- /dev/null
+++ b/fs/btrfs/disk-io.h
@@ -0,0 +1,102 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __DISKIO__
20#define __DISKIO__
21
22#define BTRFS_SUPER_INFO_OFFSET (64 * 1024)
23#define BTRFS_SUPER_INFO_SIZE 4096
24
25#define BTRFS_SUPER_MIRROR_MAX 3
26#define BTRFS_SUPER_MIRROR_SHIFT 12
27
28static inline u64 btrfs_sb_offset(int mirror)
29{
30 u64 start = 16 * 1024;
31 if (mirror)
32 return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
33 return BTRFS_SUPER_INFO_OFFSET;
34}
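
/*
 * Worked example of the offsets this yields (16K == 2^14):
 *   mirror 0: BTRFS_SUPER_INFO_OFFSET          =  64KiB
 *   mirror 1: 2^14 << (12 * 1) = 2^26          =  64MiB
 *   mirror 2: 2^14 << (12 * 2) = 2^38          = 256GiB
 * so each successive copy lands far away from the previous one.
 */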
35
36struct btrfs_device;
37struct btrfs_fs_devices;
38
39struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
40 u32 blocksize, u64 parent_transid);
41int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
42 u64 parent_transid);
43struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
44 u64 bytenr, u32 blocksize);
45int clean_tree_block(struct btrfs_trans_handle *trans,
46 struct btrfs_root *root, struct extent_buffer *buf);
47struct btrfs_root *open_ctree(struct super_block *sb,
48 struct btrfs_fs_devices *fs_devices,
49 char *options);
50int close_ctree(struct btrfs_root *root);
51int write_ctree_super(struct btrfs_trans_handle *trans,
52 struct btrfs_root *root, int max_mirrors);
53struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
54int btrfs_commit_super(struct btrfs_root *root);
55struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
56 u64 bytenr, u32 blocksize);
57struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
58 u64 root_objectid);
59struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
60 struct btrfs_key *location,
61 const char *name, int namelen);
62struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
63 struct btrfs_key *location);
64struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
65 struct btrfs_key *location);
66int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
67int btrfs_insert_dev_radix(struct btrfs_root *root,
68 struct block_device *bdev,
69 u64 device_id,
70 u64 block_start,
71 u64 num_blocks);
72void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
73int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
74void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
75int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
76int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
77int wait_on_tree_block_writeback(struct btrfs_root *root,
78 struct extent_buffer *buf);
79int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
80u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
81void btrfs_csum_final(u32 crc, char *result);
82int btrfs_open_device(struct btrfs_device *dev);
83int btrfs_verify_block_csum(struct btrfs_root *root,
84 struct extent_buffer *buf);
85int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
86 int metadata);
87int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
88 int rw, struct bio *bio, int mirror_num,
89 unsigned long bio_flags,
90 extent_submit_bio_hook_t *submit_bio_start,
91 extent_submit_bio_hook_t *submit_bio_done);
92
93int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
94unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
95int btrfs_write_tree_block(struct extent_buffer *buf);
96int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
97int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
98 struct btrfs_fs_info *fs_info);
99int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
100 struct btrfs_fs_info *fs_info);
101int btree_lock_page_hook(struct page *page);
102#endif
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
new file mode 100644
index 000000000000..85315d2c90de
--- /dev/null
+++ b/fs/btrfs/export.c
@@ -0,0 +1,203 @@
1#include <linux/fs.h>
2#include <linux/types.h>
3#include "ctree.h"
4#include "disk-io.h"
5#include "btrfs_inode.h"
6#include "print-tree.h"
7#include "export.h"
8#include "compat.h"
9
10#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \
11 parent_objectid) / 4)
12#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, \
13 parent_root_objectid) / 4)
14#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4)
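
/*
 * With the packed struct btrfs_fid from export.h these work out to file
 * handle lengths in 32-bit words: offsetof(parent_objectid) is 20 bytes
 * (5 words), offsetof(parent_root_objectid) is 32 bytes (8 words), and
 * sizeof(struct btrfs_fid) is 40 bytes (10 words).
 */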
15
16static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
17 int connectable)
18{
19 struct btrfs_fid *fid = (struct btrfs_fid *)fh;
20 struct inode *inode = dentry->d_inode;
21 int len = *max_len;
22 int type;
23
24 if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) ||
25 (connectable && len < BTRFS_FID_SIZE_CONNECTABLE))
26 return 255;
27
28 len = BTRFS_FID_SIZE_NON_CONNECTABLE;
29 type = FILEID_BTRFS_WITHOUT_PARENT;
30
31 fid->objectid = BTRFS_I(inode)->location.objectid;
32 fid->root_objectid = BTRFS_I(inode)->root->objectid;
33 fid->gen = inode->i_generation;
34
35 if (connectable && !S_ISDIR(inode->i_mode)) {
36 struct inode *parent;
37 u64 parent_root_id;
38
39 spin_lock(&dentry->d_lock);
40
41 parent = dentry->d_parent->d_inode;
42 fid->parent_objectid = BTRFS_I(parent)->location.objectid;
43 fid->parent_gen = parent->i_generation;
44 parent_root_id = BTRFS_I(parent)->root->objectid;
45
46 spin_unlock(&dentry->d_lock);
47
48 if (parent_root_id != fid->root_objectid) {
49 fid->parent_root_objectid = parent_root_id;
50 len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
51 type = FILEID_BTRFS_WITH_PARENT_ROOT;
52 } else {
53 len = BTRFS_FID_SIZE_CONNECTABLE;
54 type = FILEID_BTRFS_WITH_PARENT;
55 }
56 }
57
58 *max_len = len;
59 return type;
60}
61
62static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
63 u64 root_objectid, u32 generation)
64{
65 struct btrfs_root *root;
66 struct inode *inode;
67 struct btrfs_key key;
68
69 key.objectid = root_objectid;
70 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
71 key.offset = (u64)-1;
72
73 root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key);
74 if (IS_ERR(root))
75 return ERR_CAST(root);
76
77 key.objectid = objectid;
78 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
79 key.offset = 0;
80
81 inode = btrfs_iget(sb, &key, root, NULL);
82 if (IS_ERR(inode))
83 return (void *)inode;
84
85 if (generation != inode->i_generation) {
86 iput(inode);
87 return ERR_PTR(-ESTALE);
88 }
89
90 return d_obtain_alias(inode);
91}
92
93static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
94 int fh_len, int fh_type)
95{
96 struct btrfs_fid *fid = (struct btrfs_fid *) fh;
97 u64 objectid, root_objectid;
98 u32 generation;
99
100 if (fh_type == FILEID_BTRFS_WITH_PARENT) {
101 if (fh_len != BTRFS_FID_SIZE_CONNECTABLE)
102 return NULL;
103 root_objectid = fid->root_objectid;
104 } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) {
105 if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT)
106 return NULL;
107 root_objectid = fid->parent_root_objectid;
108 } else
109 return NULL;
110
111 objectid = fid->parent_objectid;
112 generation = fid->parent_gen;
113
114 return btrfs_get_dentry(sb, objectid, root_objectid, generation);
115}
116
117static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
118 int fh_len, int fh_type)
119{
120 struct btrfs_fid *fid = (struct btrfs_fid *) fh;
121 u64 objectid, root_objectid;
122 u32 generation;
123
124 if ((fh_type != FILEID_BTRFS_WITH_PARENT ||
125 fh_len != BTRFS_FID_SIZE_CONNECTABLE) &&
126 (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT ||
127 fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) &&
128 (fh_type != FILEID_BTRFS_WITHOUT_PARENT ||
129 fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE))
130 return NULL;
131
132 objectid = fid->objectid;
133 root_objectid = fid->root_objectid;
134 generation = fid->gen;
135
136 return btrfs_get_dentry(sb, objectid, root_objectid, generation);
137}
138
139static struct dentry *btrfs_get_parent(struct dentry *child)
140{
141 struct inode *dir = child->d_inode;
142 struct btrfs_root *root = BTRFS_I(dir)->root;
143 struct btrfs_key key;
144 struct btrfs_path *path;
145 struct extent_buffer *leaf;
146 int slot;
147 u64 objectid;
148 int ret;
149
150 path = btrfs_alloc_path();
151
152 key.objectid = dir->i_ino;
153 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
154 key.offset = (u64)-1;
155
156 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
157 if (ret < 0) {
158 /* Error */
159 btrfs_free_path(path);
160 return ERR_PTR(ret);
161 }
162 leaf = path->nodes[0];
163 slot = path->slots[0];
164 if (ret) {
165 /* btrfs_search_slot() returns the slot where we'd want to
166 insert a backref for parent inode #0xFFFFFFFFFFFFFFFF.
167 The _real_ backref, telling us what the parent inode
168 _actually_ is, will be in the slot _before_ the one
169 that btrfs_search_slot() returns. */
170 if (!slot) {
171 /* Unless there is _no_ key in the tree before... */
172 btrfs_free_path(path);
173 return ERR_PTR(-EIO);
174 }
175 slot--;
176 }
177
178 btrfs_item_key_to_cpu(leaf, &key, slot);
179 btrfs_free_path(path);
180
181 if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY)
182 return ERR_PTR(-EINVAL);
183
184 objectid = key.offset;
185
186 /* If we are already at the root of a subvol, return the real root */
187 if (objectid == dir->i_ino)
188 return dget(dir->i_sb->s_root);
189
190 /* Build a new key for the inode item */
191 key.objectid = objectid;
192 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
193 key.offset = 0;
194
195 return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
196}
197
198const struct export_operations btrfs_export_ops = {
199 .encode_fh = btrfs_encode_fh,
200 .fh_to_dentry = btrfs_fh_to_dentry,
201 .fh_to_parent = btrfs_fh_to_parent,
202 .get_parent = btrfs_get_parent,
203};
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
new file mode 100644
index 000000000000..074348a95841
--- /dev/null
+++ b/fs/btrfs/export.h
@@ -0,0 +1,19 @@
1#ifndef BTRFS_EXPORT_H
2#define BTRFS_EXPORT_H
3
4#include <linux/exportfs.h>
5
6extern const struct export_operations btrfs_export_ops;
7
8struct btrfs_fid {
9 u64 objectid;
10 u64 root_objectid;
11 u32 gen;
12
13 u64 parent_objectid;
14 u32 parent_gen;
15
16 u64 parent_root_objectid;
17} __attribute__ ((packed));
18
19#endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
new file mode 100644
index 000000000000..293da650873f
--- /dev/null
+++ b/fs/btrfs/extent-tree.c
@@ -0,0 +1,5986 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/pagemap.h>
20#include <linux/writeback.h>
21#include <linux/blkdev.h>
22#include <linux/version.h>
23#include "compat.h"
24#include "hash.h"
25#include "crc32c.h"
26#include "ctree.h"
27#include "disk-io.h"
28#include "print-tree.h"
29#include "transaction.h"
30#include "volumes.h"
31#include "locking.h"
32#include "ref-cache.h"
34
35#define PENDING_EXTENT_INSERT 0
36#define PENDING_EXTENT_DELETE 1
37#define PENDING_BACKREF_UPDATE 2
38
39struct pending_extent_op {
40 int type;
41 u64 bytenr;
42 u64 num_bytes;
43 u64 parent;
44 u64 orig_parent;
45 u64 generation;
46 u64 orig_generation;
47 int level;
48 struct list_head list;
49 int del;
50};
51
52static int finish_current_insert(struct btrfs_trans_handle *trans,
53 struct btrfs_root *extent_root, int all);
54static int del_pending_extents(struct btrfs_trans_handle *trans,
55 struct btrfs_root *extent_root, int all);
56static int pin_down_bytes(struct btrfs_trans_handle *trans,
57 struct btrfs_root *root,
58 u64 bytenr, u64 num_bytes, int is_data);
59static int update_block_group(struct btrfs_trans_handle *trans,
60 struct btrfs_root *root,
61 u64 bytenr, u64 num_bytes, int alloc,
62 int mark_free);
63
64static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
65{
66 return (cache->flags & bits) == bits;
67}
68
69/*
70 * this adds the block group to the fs_info rb tree for the block group
71 * cache
72 */
73static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
74 struct btrfs_block_group_cache *block_group)
75{
76 struct rb_node **p;
77 struct rb_node *parent = NULL;
78 struct btrfs_block_group_cache *cache;
79
80 spin_lock(&info->block_group_cache_lock);
81 p = &info->block_group_cache_tree.rb_node;
82
83 while (*p) {
84 parent = *p;
85 cache = rb_entry(parent, struct btrfs_block_group_cache,
86 cache_node);
87 if (block_group->key.objectid < cache->key.objectid) {
88 p = &(*p)->rb_left;
89 } else if (block_group->key.objectid > cache->key.objectid) {
90 p = &(*p)->rb_right;
91 } else {
92 spin_unlock(&info->block_group_cache_lock);
93 return -EEXIST;
94 }
95 }
96
97 rb_link_node(&block_group->cache_node, parent, p);
98 rb_insert_color(&block_group->cache_node,
99 &info->block_group_cache_tree);
100 spin_unlock(&info->block_group_cache_lock);
101
102 return 0;
103}
104
105/*
106 * This will return the block group at or after bytenr if contains is 0, else
107 * it will return the block group that contains the bytenr
108 */
109static struct btrfs_block_group_cache *
110block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
111 int contains)
112{
113 struct btrfs_block_group_cache *cache, *ret = NULL;
114 struct rb_node *n;
115 u64 end, start;
116
117 spin_lock(&info->block_group_cache_lock);
118 n = info->block_group_cache_tree.rb_node;
119
120 while (n) {
121 cache = rb_entry(n, struct btrfs_block_group_cache,
122 cache_node);
123 end = cache->key.objectid + cache->key.offset - 1;
124 start = cache->key.objectid;
125
126 if (bytenr < start) {
127 if (!contains && (!ret || start < ret->key.objectid))
128 ret = cache;
129 n = n->rb_left;
130 } else if (bytenr > start) {
131 if (contains && bytenr <= end) {
132 ret = cache;
133 break;
134 }
135 n = n->rb_right;
136 } else {
137 ret = cache;
138 break;
139 }
140 }
141 if (ret)
142 atomic_inc(&ret->count);
143 spin_unlock(&info->block_group_cache_lock);
144
145 return ret;
146}
147
148/*
149 * this is only called by cache_block_group; since we could have freed
150 * extents, we need to check the pinned_extents for any extents that can't
151 * be used yet, because their free space is only released when the transaction commits.
152 */
153static int add_new_free_space(struct btrfs_block_group_cache *block_group,
154 struct btrfs_fs_info *info, u64 start, u64 end)
155{
156 u64 extent_start, extent_end, size;
157 int ret;
158
159 mutex_lock(&info->pinned_mutex);
160 while (start < end) {
161 ret = find_first_extent_bit(&info->pinned_extents, start,
162 &extent_start, &extent_end,
163 EXTENT_DIRTY);
164 if (ret)
165 break;
166
167 if (extent_start == start) {
168 start = extent_end + 1;
169 } else if (extent_start > start && extent_start < end) {
170 size = extent_start - start;
171 ret = btrfs_add_free_space(block_group, start,
172 size);
173 BUG_ON(ret);
174 start = extent_end + 1;
175 } else {
176 break;
177 }
178 }
179
180 if (start < end) {
181 size = end - start;
182 ret = btrfs_add_free_space(block_group, start, size);
183 BUG_ON(ret);
184 }
185 mutex_unlock(&info->pinned_mutex);
186
187 return 0;
188}
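
/*
 * A minimal standalone sketch of the hole-walking idea above, assuming the
 * pinned ranges are handed in as a sorted, non-overlapping array (inclusive
 * ends) instead of an extent tree: report every sub-range of [start, end)
 * that no pinned range covers.
 */
struct pinned_range {
	unsigned long long start;
	unsigned long long end;		/* inclusive */
};

static void walk_free_holes(unsigned long long start, unsigned long long end,
			    const struct pinned_range *p, int n,
			    void (*add_free)(unsigned long long start,
					     unsigned long long len))
{
	int i;

	for (i = 0; i < n && start < end; i++) {
		if (p[i].end < start)
			continue;	/* pinned range entirely behind us */
		if (p[i].start >= end)
			break;		/* pinned range beyond the window */
		if (p[i].start > start)
			add_free(start, p[i].start - start);
		start = p[i].end + 1;	/* skip over the pinned range */
	}
	if (start < end)
		add_free(start, end - start);
}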
189
190static int remove_sb_from_cache(struct btrfs_root *root,
191 struct btrfs_block_group_cache *cache)
192{
193 u64 bytenr;
194 u64 *logical;
195 int stripe_len;
196 int i, nr, ret;
197
198 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
199 bytenr = btrfs_sb_offset(i);
200 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
201 cache->key.objectid, bytenr, 0,
202 &logical, &nr, &stripe_len);
203 BUG_ON(ret);
204 while (nr--) {
205 btrfs_remove_free_space(cache, logical[nr],
206 stripe_len);
207 }
208 kfree(logical);
209 }
210 return 0;
211}
212
213static int cache_block_group(struct btrfs_root *root,
214 struct btrfs_block_group_cache *block_group)
215{
216 struct btrfs_path *path;
217 int ret = 0;
218 struct btrfs_key key;
219 struct extent_buffer *leaf;
220 int slot;
221 u64 last;
222
223 if (!block_group)
224 return 0;
225
226 root = root->fs_info->extent_root;
227
228 if (block_group->cached)
229 return 0;
230
231 path = btrfs_alloc_path();
232 if (!path)
233 return -ENOMEM;
234
235 path->reada = 2;
236 /*
237 * we get into deadlocks with paths held by callers of this function.
238 * since the alloc_mutex is protecting things right now, just
239 * skip the locking here
240 */
241 path->skip_locking = 1;
242 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
243 key.objectid = last;
244 key.offset = 0;
245 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
246 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
247 if (ret < 0)
248 goto err;
249
250 while (1) {
251 leaf = path->nodes[0];
252 slot = path->slots[0];
253 if (slot >= btrfs_header_nritems(leaf)) {
254 ret = btrfs_next_leaf(root, path);
255 if (ret < 0)
256 goto err;
257 if (ret == 0)
258 continue;
259 else
260 break;
261 }
262 btrfs_item_key_to_cpu(leaf, &key, slot);
263 if (key.objectid < block_group->key.objectid)
264 goto next;
265
266 if (key.objectid >= block_group->key.objectid +
267 block_group->key.offset)
268 break;
269
270 if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
271 add_new_free_space(block_group, root->fs_info, last,
272 key.objectid);
273
274 last = key.objectid + key.offset;
275 }
276next:
277 path->slots[0]++;
278 }
279
280 add_new_free_space(block_group, root->fs_info, last,
281 block_group->key.objectid +
282 block_group->key.offset);
283
284 remove_sb_from_cache(root, block_group);
285 block_group->cached = 1;
286 ret = 0;
287err:
288 btrfs_free_path(path);
289 return ret;
290}
291
292/*
293 * return the block group that starts at or after bytenr
294 */
295static struct btrfs_block_group_cache *
296btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
297{
298 struct btrfs_block_group_cache *cache;
299
300 cache = block_group_cache_tree_search(info, bytenr, 0);
301
302 return cache;
303}
304
305/*
306 * return the block group that contains the given bytenr
307 */
308struct btrfs_block_group_cache *btrfs_lookup_block_group(
309 struct btrfs_fs_info *info,
310 u64 bytenr)
311{
312 struct btrfs_block_group_cache *cache;
313
314 cache = block_group_cache_tree_search(info, bytenr, 1);
315
316 return cache;
317}
318
319static inline void put_block_group(struct btrfs_block_group_cache *cache)
320{
321 if (atomic_dec_and_test(&cache->count))
322 kfree(cache);
323}
324
325static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
326 u64 flags)
327{
328 struct list_head *head = &info->space_info;
329 struct list_head *cur;
330 struct btrfs_space_info *found;
331 list_for_each(cur, head) {
332 found = list_entry(cur, struct btrfs_space_info, list);
333 if (found->flags == flags)
334 return found;
335 }
336 return NULL;
337}
338
339static u64 div_factor(u64 num, int factor)
340{
341 if (factor == 10)
342 return num;
343 num *= factor;
344 do_div(num, 10);
345 return num;
346}
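
/* e.g. div_factor(1000, 9) == 900: "factor" selects tenths, so the caller
 * below treats a block group as reusable only while it is under 90% full
 */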
347
348u64 btrfs_find_block_group(struct btrfs_root *root,
349 u64 search_start, u64 search_hint, int owner)
350{
351 struct btrfs_block_group_cache *cache;
352 u64 used;
353 u64 last = max(search_hint, search_start);
354 u64 group_start = 0;
355 int full_search = 0;
356 int factor = 9;
357 int wrapped = 0;
358again:
359 while (1) {
360 cache = btrfs_lookup_first_block_group(root->fs_info, last);
361 if (!cache)
362 break;
363
364 spin_lock(&cache->lock);
365 last = cache->key.objectid + cache->key.offset;
366 used = btrfs_block_group_used(&cache->item);
367
368 if ((full_search || !cache->ro) &&
369 block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
370 if (used + cache->pinned + cache->reserved <
371 div_factor(cache->key.offset, factor)) {
372 group_start = cache->key.objectid;
373 spin_unlock(&cache->lock);
374 put_block_group(cache);
375 goto found;
376 }
377 }
378 spin_unlock(&cache->lock);
379 put_block_group(cache);
380 cond_resched();
381 }
382 if (!wrapped) {
383 last = search_start;
384 wrapped = 1;
385 goto again;
386 }
387 if (!full_search && factor < 10) {
388 last = search_start;
389 full_search = 1;
390 factor = 10;
391 goto again;
392 }
393found:
394 return group_start;
395}
396
397/* simple helper to search for an existing extent at a given offset */
398int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
399{
400 int ret;
401 struct btrfs_key key;
402 struct btrfs_path *path;
403
404 path = btrfs_alloc_path();
405 BUG_ON(!path);
406 key.objectid = start;
407 key.offset = len;
408 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
409 ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
410 0, 0);
411 btrfs_free_path(path);
412 return ret;
413}
414
415/*
416 * Back reference rules. Back refs have three main goals:
417 *
418 * 1) differentiate between all holders of references to an extent so that
419 * when a reference is dropped we can make sure it was a valid reference
420 * before freeing the extent.
421 *
422 * 2) Provide enough information to quickly find the holders of an extent
423 * if we notice a given block is corrupted or bad.
424 *
425 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
426 * maintenance. This is actually the same as #2, but with a slightly
427 * different use case.
428 *
429 * File extents can be referenced by:
430 *
431 * - multiple snapshots, subvolumes, or different generations in one subvol
432 * - different files inside a single subvolume
433 * - different offsets inside a file (bookend extents in file.c)
434 *
435 * The extent ref structure has fields for:
436 *
437 * - Objectid of the subvolume root
438 * - Generation number of the tree holding the reference
439 * - objectid of the file holding the reference
440 * - number of references held by the parent node (always 1 for tree blocks)
441 *
442 * A btree leaf may hold multiple references to a file extent. In most cases,
443 * these references are from the same file and the corresponding offsets inside
444 * the file are close together.
445 *
446 * When a file extent is allocated the fields are filled in:
447 * (root_key.objectid, trans->transid, inode objectid, 1)
448 *
449 * When a leaf is cow'd new references are added for every file extent found
450 * in the leaf. It looks similar to the create case, but trans->transid will
451 * be different when the block is cow'd.
452 *
453 * (root_key.objectid, trans->transid, inode objectid,
454 * number of references in the leaf)
455 *
456 * When a file extent is removed either during snapshot deletion or
457 * file truncation, we find the corresponding back reference and check
458 * the following fields:
459 *
460 * (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
461 * inode objectid)
462 *
463 * Btree extents can be referenced by:
464 *
465 * - Different subvolumes
466 * - Different generations of the same subvolume
467 *
468 * When a tree block is created, back references are inserted:
469 *
470 * (root->root_key.objectid, trans->transid, level, 1)
471 *
472 * When a tree block is cow'd, new back references are added for all the
473 * blocks it points to. If the tree block isn't in a reference counted root,
474 * the old back references are removed. These new back references are of
475 * the form (trans->transid will have increased since creation):
476 *
477 * (root->root_key.objectid, trans->transid, level, 1)
478 *
479 * When a backref is being deleted, the following fields are checked:
480 *
481 * if backref was for a tree root:
482 * (btrfs_header_owner(itself), btrfs_header_generation(itself), level)
483 * else
484 * (btrfs_header_owner(parent), btrfs_header_generation(parent), level)
485 *
486 * Back Reference Key composing:
487 *
488 * The key objectid corresponds to the first byte in the extent, the key
489 * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first
490 * byte of the parent extent. If an extent is a tree root, the key offset is set
491 * to the key objectid.
492 */
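
/*
 * A minimal sketch of the key composition described above, using a
 * simplified stand-in for struct btrfs_key (the real one lives in
 * ctree.h): objectid is the first byte of the extent, and offset is the
 * first byte of the parent, or the extent itself for a tree root.
 */
struct simple_backref_key {
	unsigned long long objectid;
	unsigned char type;		/* stands in for BTRFS_EXTENT_REF_KEY */
	unsigned long long offset;
};

static void compose_backref_key(struct simple_backref_key *key,
				unsigned char ref_key_type,
				unsigned long long extent_start,
				unsigned long long parent_start,
				int is_tree_root)
{
	key->objectid = extent_start;
	key->type = ref_key_type;
	key->offset = is_tree_root ? extent_start : parent_start;
}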
493
494static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans,
495 struct btrfs_root *root,
496 struct btrfs_path *path,
497 u64 bytenr, u64 parent,
498 u64 ref_root, u64 ref_generation,
499 u64 owner_objectid, int del)
500{
501 struct btrfs_key key;
502 struct btrfs_extent_ref *ref;
503 struct extent_buffer *leaf;
504 u64 ref_objectid;
505 int ret;
506
507 key.objectid = bytenr;
508 key.type = BTRFS_EXTENT_REF_KEY;
509 key.offset = parent;
510
511 ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1);
512 if (ret < 0)
513 goto out;
514 if (ret > 0) {
515 ret = -ENOENT;
516 goto out;
517 }
518
519 leaf = path->nodes[0];
520 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
521 ref_objectid = btrfs_ref_objectid(leaf, ref);
522 if (btrfs_ref_root(leaf, ref) != ref_root ||
523 btrfs_ref_generation(leaf, ref) != ref_generation ||
524 (ref_objectid != owner_objectid &&
525 ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
526 ret = -EIO;
527 WARN_ON(1);
528 goto out;
529 }
530 ret = 0;
531out:
532 return ret;
533}
534
535/*
536 * updates all the backrefs that are pending on update_list for the
537 * extent_root
538 */
539static noinline int update_backrefs(struct btrfs_trans_handle *trans,
540 struct btrfs_root *extent_root,
541 struct btrfs_path *path,
542 struct list_head *update_list)
543{
544 struct btrfs_key key;
545 struct btrfs_extent_ref *ref;
546 struct btrfs_fs_info *info = extent_root->fs_info;
547 struct pending_extent_op *op;
548 struct extent_buffer *leaf;
549 int ret = 0;
550 struct list_head *cur = update_list->next;
551 u64 ref_objectid;
552 u64 ref_root = extent_root->root_key.objectid;
553
554 op = list_entry(cur, struct pending_extent_op, list);
555
556search:
557 key.objectid = op->bytenr;
558 key.type = BTRFS_EXTENT_REF_KEY;
559 key.offset = op->orig_parent;
560
561 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
562 BUG_ON(ret);
563
564 leaf = path->nodes[0];
565
566loop:
567 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
568
569 ref_objectid = btrfs_ref_objectid(leaf, ref);
570
571 if (btrfs_ref_root(leaf, ref) != ref_root ||
572 btrfs_ref_generation(leaf, ref) != op->orig_generation ||
573 (ref_objectid != op->level &&
574 ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
575 printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, "
576 "root %llu, owner %u\n",
577 (unsigned long long)op->bytenr,
578 (unsigned long long)op->orig_parent,
579 (unsigned long long)ref_root, op->level);
580 btrfs_print_leaf(extent_root, leaf);
581 BUG();
582 }
583
584 key.objectid = op->bytenr;
585 key.offset = op->parent;
586 key.type = BTRFS_EXTENT_REF_KEY;
587 ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
588 BUG_ON(ret);
589 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
590 btrfs_set_ref_generation(leaf, ref, op->generation);
591
592 cur = cur->next;
593
594 list_del_init(&op->list);
595 unlock_extent(&info->extent_ins, op->bytenr,
596 op->bytenr + op->num_bytes - 1, GFP_NOFS);
597 kfree(op);
598
599 if (cur == update_list) {
600 btrfs_mark_buffer_dirty(path->nodes[0]);
601 btrfs_release_path(extent_root, path);
602 goto out;
603 }
604
605 op = list_entry(cur, struct pending_extent_op, list);
606
607 path->slots[0]++;
608 while (path->slots[0] < btrfs_header_nritems(leaf)) {
609 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
610 if (key.objectid == op->bytenr &&
611 key.type == BTRFS_EXTENT_REF_KEY)
612 goto loop;
613 path->slots[0]++;
614 }
615
616 btrfs_mark_buffer_dirty(path->nodes[0]);
617 btrfs_release_path(extent_root, path);
618 goto search;
619
620out:
621 return 0;
622}
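
/*
 * For reference, the struct pending_extent_op fields used throughout
 * this file (the full definition lives earlier in extent-tree.c):
 *
 *	type                         PENDING_EXTENT_INSERT or
 *	                             PENDING_BACKREF_UPDATE
 *	bytenr, num_bytes            the extent the op applies to
 *	parent, orig_parent          new and old parent block start
 *	generation, orig_generation  new and old generation
 *	level                        tree level (doubles as the owner for
 *	                             tree blocks)
 *	list                         links the op into insert/update/delete
 *	                             lists
 *	del                          scratch flag; its meaning depends on
 *	                             the code path using it
 */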
623
624static noinline int insert_extents(struct btrfs_trans_handle *trans,
625 struct btrfs_root *extent_root,
626 struct btrfs_path *path,
627 struct list_head *insert_list, int nr)
628{
629 struct btrfs_key *keys;
630 u32 *data_size;
631 struct pending_extent_op *op;
632 struct extent_buffer *leaf;
633 struct list_head *cur = insert_list->next;
634 struct btrfs_fs_info *info = extent_root->fs_info;
635 u64 ref_root = extent_root->root_key.objectid;
636 int i = 0, last = 0, ret;
637 int total = nr * 2;
638
639 if (!nr)
640 return 0;
641
642 keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
643 if (!keys)
644 return -ENOMEM;
645
646 data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
647 if (!data_size) {
648 kfree(keys);
649 return -ENOMEM;
650 }
651
652 list_for_each_entry(op, insert_list, list) {
653 keys[i].objectid = op->bytenr;
654 keys[i].offset = op->num_bytes;
655 keys[i].type = BTRFS_EXTENT_ITEM_KEY;
656 data_size[i] = sizeof(struct btrfs_extent_item);
657 i++;
658
659 keys[i].objectid = op->bytenr;
660 keys[i].offset = op->parent;
661 keys[i].type = BTRFS_EXTENT_REF_KEY;
662 data_size[i] = sizeof(struct btrfs_extent_ref);
663 i++;
664 }
665
666 op = list_entry(cur, struct pending_extent_op, list);
667 i = 0;
668 while (i < total) {
669 int c;
670 ret = btrfs_insert_some_items(trans, extent_root, path,
671 keys+i, data_size+i, total-i);
672 BUG_ON(ret < 0);
673
674 if (last && ret > 1)
675 BUG();
676
677 leaf = path->nodes[0];
678 for (c = 0; c < ret; c++) {
679 int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
680
681 /*
682			 * if the first item we inserted was a backref, then
683			 * the EXTENT_ITEM will be at the odd values of c,
684			 * otherwise at the even values of c
685 */
686 if ((ref_first && (c % 2)) ||
687 (!ref_first && !(c % 2))) {
688 struct btrfs_extent_item *itm;
689
690 itm = btrfs_item_ptr(leaf, path->slots[0] + c,
691 struct btrfs_extent_item);
692 btrfs_set_extent_refs(path->nodes[0], itm, 1);
693 op->del++;
694 } else {
695 struct btrfs_extent_ref *ref;
696
697 ref = btrfs_item_ptr(leaf, path->slots[0] + c,
698 struct btrfs_extent_ref);
699 btrfs_set_ref_root(leaf, ref, ref_root);
700 btrfs_set_ref_generation(leaf, ref,
701 op->generation);
702 btrfs_set_ref_objectid(leaf, ref, op->level);
703 btrfs_set_ref_num_refs(leaf, ref, 1);
704 op->del++;
705 }
706
707 /*
708			 * we use del to see when it's ok to free up the
709			 * pending_extent_op.  In the case where we insert the
710			 * last item on the list in order to help do batching,
711			 * we must not free the extent op until we actually
712			 * insert the extent_item
713 */
714 if (op->del == 2) {
715 unlock_extent(&info->extent_ins, op->bytenr,
716 op->bytenr + op->num_bytes - 1,
717 GFP_NOFS);
718 cur = cur->next;
719 list_del_init(&op->list);
720 kfree(op);
721 if (cur != insert_list)
722 op = list_entry(cur,
723 struct pending_extent_op,
724 list);
725 }
726 }
727 btrfs_mark_buffer_dirty(leaf);
728 btrfs_release_path(extent_root, path);
729
730 /*
731		 * Ok, backrefs and items usually go right next to each other,
732		 * but if we could only insert 1 item that means that we
733		 * inserted at the end of a leaf, and we have no idea what may
734		 * be on the next leaf, so we just play it safe. In order to
735		 * try and help this case we insert the last thing on our
736		 * insert list so hopefully it will end up being the last
737		 * thing on the leaf and everything else will be before it,
738		 * which will let us insert a whole bunch of items at the same
739		 * time.
740 */
741 if (ret == 1 && !last && (i + ret < total)) {
742 /*
743 * last: where we will pick up the next time around
744 * i: our current key to insert, will be total - 1
745 * cur: the current op we are screwing with
746 * op: duh
747 */
748 last = i + ret;
749 i = total - 1;
750 cur = insert_list->prev;
751 op = list_entry(cur, struct pending_extent_op, list);
752 } else if (last) {
753 /*
754 * ok we successfully inserted the last item on the
755 * list, lets reset everything
756 *
757 * i: our current key to insert, so where we left off
758 * last time
759 * last: done with this
760 * cur: the op we are messing with
761 * op: duh
762 * total: since we inserted the last key, we need to
763			 *        decrement total so we don't overflow
764 */
765 i = last;
766 last = 0;
767 total--;
768 if (i < total) {
769 cur = insert_list->next;
770 op = list_entry(cur, struct pending_extent_op,
771 list);
772 }
773 } else {
774 i += ret;
775 }
776
777 cond_resched();
778 }
779 ret = 0;
780 kfree(keys);
781 kfree(data_size);
782 return ret;
783}
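
/*
 * Illustrative item layout produced by insert_extents() above: for each
 * pending op two items are inserted back to back in the extent tree,
 *
 *	(bytenr, BTRFS_EXTENT_ITEM_KEY, num_bytes)  -> struct btrfs_extent_item
 *	(bytenr, BTRFS_EXTENT_REF_KEY,  parent)     -> struct btrfs_extent_ref
 *
 * which is why the loop above has to check whether the ref or the
 * extent item landed first whenever a batch straddles a leaf boundary.
 */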
784
785static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
786 struct btrfs_root *root,
787 struct btrfs_path *path,
788 u64 bytenr, u64 parent,
789 u64 ref_root, u64 ref_generation,
790 u64 owner_objectid)
791{
792 struct btrfs_key key;
793 struct extent_buffer *leaf;
794 struct btrfs_extent_ref *ref;
795 u32 num_refs;
796 int ret;
797
798 key.objectid = bytenr;
799 key.type = BTRFS_EXTENT_REF_KEY;
800 key.offset = parent;
801
802 ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref));
803 if (ret == 0) {
804 leaf = path->nodes[0];
805 ref = btrfs_item_ptr(leaf, path->slots[0],
806 struct btrfs_extent_ref);
807 btrfs_set_ref_root(leaf, ref, ref_root);
808 btrfs_set_ref_generation(leaf, ref, ref_generation);
809 btrfs_set_ref_objectid(leaf, ref, owner_objectid);
810 btrfs_set_ref_num_refs(leaf, ref, 1);
811 } else if (ret == -EEXIST) {
812 u64 existing_owner;
813 BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
814 leaf = path->nodes[0];
815 ref = btrfs_item_ptr(leaf, path->slots[0],
816 struct btrfs_extent_ref);
817 if (btrfs_ref_root(leaf, ref) != ref_root ||
818 btrfs_ref_generation(leaf, ref) != ref_generation) {
819 ret = -EIO;
820 WARN_ON(1);
821 goto out;
822 }
823
824 num_refs = btrfs_ref_num_refs(leaf, ref);
825 BUG_ON(num_refs == 0);
826 btrfs_set_ref_num_refs(leaf, ref, num_refs + 1);
827
828 existing_owner = btrfs_ref_objectid(leaf, ref);
829 if (existing_owner != owner_objectid &&
830 existing_owner != BTRFS_MULTIPLE_OBJECTIDS) {
831 btrfs_set_ref_objectid(leaf, ref,
832 BTRFS_MULTIPLE_OBJECTIDS);
833 }
834 ret = 0;
835 } else {
836 goto out;
837 }
838 btrfs_mark_buffer_dirty(path->nodes[0]);
839out:
840 btrfs_release_path(root, path);
841 return ret;
842}
843
844static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
845 struct btrfs_root *root,
846 struct btrfs_path *path)
847{
848 struct extent_buffer *leaf;
849 struct btrfs_extent_ref *ref;
850 u32 num_refs;
851 int ret = 0;
852
853 leaf = path->nodes[0];
854 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
855 num_refs = btrfs_ref_num_refs(leaf, ref);
856 BUG_ON(num_refs == 0);
857 num_refs -= 1;
858 if (num_refs == 0) {
859 ret = btrfs_del_item(trans, root, path);
860 } else {
861 btrfs_set_ref_num_refs(leaf, ref, num_refs);
862 btrfs_mark_buffer_dirty(leaf);
863 }
864 btrfs_release_path(root, path);
865 return ret;
866}
867
868#ifdef BIO_RW_DISCARD
869static void btrfs_issue_discard(struct block_device *bdev,
870 u64 start, u64 len)
871{
872 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
873}
874#endif
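
/*
 * Note: blkdev_issue_discard() takes 512-byte sectors, hence the ">> 9"
 * conversions above. For example, a 16KiB extent starting at byte
 * offset 1MiB becomes sector 2048 with a length of 32 sectors.
 */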
875
876static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
877 u64 num_bytes)
878{
879#ifdef BIO_RW_DISCARD
880 int ret;
881 u64 map_length = num_bytes;
882 struct btrfs_multi_bio *multi = NULL;
883
884 /* Tell the block device(s) that the sectors can be discarded */
885 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
886 bytenr, &map_length, &multi, 0);
887 if (!ret) {
888 struct btrfs_bio_stripe *stripe = multi->stripes;
889 int i;
890
891 if (map_length > num_bytes)
892 map_length = num_bytes;
893
894 for (i = 0; i < multi->num_stripes; i++, stripe++) {
895 btrfs_issue_discard(stripe->dev->bdev,
896 stripe->physical,
897 map_length);
898 }
899 kfree(multi);
900 }
901
902 return ret;
903#else
904 return 0;
905#endif
906}
907
908static noinline int free_extents(struct btrfs_trans_handle *trans,
909 struct btrfs_root *extent_root,
910 struct list_head *del_list)
911{
912 struct btrfs_fs_info *info = extent_root->fs_info;
913 struct btrfs_path *path;
914 struct btrfs_key key, found_key;
915 struct extent_buffer *leaf;
916 struct list_head *cur;
917 struct pending_extent_op *op;
918 struct btrfs_extent_item *ei;
919 int ret, num_to_del, extent_slot = 0, found_extent = 0;
920 u32 refs;
921 u64 bytes_freed = 0;
922
923 path = btrfs_alloc_path();
924 if (!path)
925 return -ENOMEM;
926 path->reada = 1;
927
928search:
929 /* search for the backref for the current ref we want to delete */
930 cur = del_list->next;
931 op = list_entry(cur, struct pending_extent_op, list);
932 ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
933 op->orig_parent,
934 extent_root->root_key.objectid,
935 op->orig_generation, op->level, 1);
936 if (ret) {
937 printk(KERN_ERR "btrfs unable to find backref byte nr %llu "
938 "root %llu gen %llu owner %u\n",
939 (unsigned long long)op->bytenr,
940 (unsigned long long)extent_root->root_key.objectid,
941 (unsigned long long)op->orig_generation, op->level);
942 btrfs_print_leaf(extent_root, path->nodes[0]);
943 WARN_ON(1);
944 goto out;
945 }
946
947 extent_slot = path->slots[0];
948 num_to_del = 1;
949 found_extent = 0;
950
951 /*
952 * if we aren't the first item on the leaf we can move back one and see
953 * if our ref is right next to our extent item
954 */
955 if (likely(extent_slot)) {
956 extent_slot--;
957 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
958 extent_slot);
959 if (found_key.objectid == op->bytenr &&
960 found_key.type == BTRFS_EXTENT_ITEM_KEY &&
961 found_key.offset == op->num_bytes) {
962 num_to_del++;
963 found_extent = 1;
964 }
965 }
966
967 /*
968 * if we didn't find the extent we need to delete the backref and then
969 * search for the extent item key so we can update its ref count
970 */
971 if (!found_extent) {
972 key.objectid = op->bytenr;
973 key.type = BTRFS_EXTENT_ITEM_KEY;
974 key.offset = op->num_bytes;
975
976 ret = remove_extent_backref(trans, extent_root, path);
977 BUG_ON(ret);
978 btrfs_release_path(extent_root, path);
979 ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
980 BUG_ON(ret);
981 extent_slot = path->slots[0];
982 }
983
984 /* this is where we update the ref count for the extent */
985 leaf = path->nodes[0];
986 ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
987 refs = btrfs_extent_refs(leaf, ei);
988 BUG_ON(refs == 0);
989 refs--;
990 btrfs_set_extent_refs(leaf, ei, refs);
991
992 btrfs_mark_buffer_dirty(leaf);
993
994 /*
995 * This extent needs deleting. The reason cur_slot is extent_slot +
996 * num_to_del is because extent_slot points to the slot where the extent
997 * is, and if the backref was not right next to the extent we will be
998 * deleting at least 1 item, and will want to start searching at the
999 * slot directly next to extent_slot. However if we did find the
1000	 * backref next to the extent item then we will be deleting at least 2
1001 * items and will want to start searching directly after the ref slot
1002 */
1003 if (!refs) {
1004 struct list_head *pos, *n, *end;
1005 int cur_slot = extent_slot+num_to_del;
1006 u64 super_used;
1007 u64 root_used;
1008
1009 path->slots[0] = extent_slot;
1010 bytes_freed = op->num_bytes;
1011
1012 mutex_lock(&info->pinned_mutex);
1013 ret = pin_down_bytes(trans, extent_root, op->bytenr,
1014 op->num_bytes, op->level >=
1015 BTRFS_FIRST_FREE_OBJECTID);
1016 mutex_unlock(&info->pinned_mutex);
1017 BUG_ON(ret < 0);
1018 op->del = ret;
1019
1020 /*
1021 * we need to see if we can delete multiple things at once, so
1022		 * start looping through the list of extents we want to
1023		 * delete and see if their extent/backref pairs are right next
1024		 * to each other and the extents only have 1 ref
1025 */
1026 for (pos = cur->next; pos != del_list; pos = pos->next) {
1027 struct pending_extent_op *tmp;
1028
1029 tmp = list_entry(pos, struct pending_extent_op, list);
1030
1031 /* we only want to delete extent+ref at this stage */
1032 if (cur_slot >= btrfs_header_nritems(leaf) - 1)
1033 break;
1034
1035 btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
1036 if (found_key.objectid != tmp->bytenr ||
1037 found_key.type != BTRFS_EXTENT_ITEM_KEY ||
1038 found_key.offset != tmp->num_bytes)
1039 break;
1040
1041 /* check to make sure this extent only has one ref */
1042 ei = btrfs_item_ptr(leaf, cur_slot,
1043 struct btrfs_extent_item);
1044 if (btrfs_extent_refs(leaf, ei) != 1)
1045 break;
1046
1047 btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
1048 if (found_key.objectid != tmp->bytenr ||
1049 found_key.type != BTRFS_EXTENT_REF_KEY ||
1050 found_key.offset != tmp->orig_parent)
1051 break;
1052
1053 /*
1054 * the ref is right next to the extent, we can set the
1055 * ref count to 0 since we will delete them both now
1056 */
1057 btrfs_set_extent_refs(leaf, ei, 0);
1058
1059 /* pin down the bytes for this extent */
1060 mutex_lock(&info->pinned_mutex);
1061 ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
1062 tmp->num_bytes, tmp->level >=
1063 BTRFS_FIRST_FREE_OBJECTID);
1064 mutex_unlock(&info->pinned_mutex);
1065 BUG_ON(ret < 0);
1066
1067 /*
1068			 * use the del field to record whether we need to go
1069			 * ahead and free up the extent when we delete the item.
1070 */
1071 tmp->del = ret;
1072 bytes_freed += tmp->num_bytes;
1073
1074 num_to_del += 2;
1075 cur_slot += 2;
1076 }
1077 end = pos;
1078
1079 /* update the free space counters */
1080 spin_lock(&info->delalloc_lock);
1081 super_used = btrfs_super_bytes_used(&info->super_copy);
1082 btrfs_set_super_bytes_used(&info->super_copy,
1083 super_used - bytes_freed);
1084
1085 root_used = btrfs_root_used(&extent_root->root_item);
1086 btrfs_set_root_used(&extent_root->root_item,
1087 root_used - bytes_freed);
1088 spin_unlock(&info->delalloc_lock);
1089
1090 /* delete the items */
1091 ret = btrfs_del_items(trans, extent_root, path,
1092 path->slots[0], num_to_del);
1093 BUG_ON(ret);
1094
1095 /*
1096 * loop through the extents we deleted and do the cleanup work
1097 * on them
1098 */
1099 for (pos = cur, n = pos->next; pos != end;
1100 pos = n, n = pos->next) {
1101 struct pending_extent_op *tmp;
1102 tmp = list_entry(pos, struct pending_extent_op, list);
1103
1104 /*
1105			 * remember tmp->del tells us whether or not we pinned
1106 * down the extent
1107 */
1108 ret = update_block_group(trans, extent_root,
1109 tmp->bytenr, tmp->num_bytes, 0,
1110 tmp->del);
1111 BUG_ON(ret);
1112
1113 list_del_init(&tmp->list);
1114 unlock_extent(&info->extent_ins, tmp->bytenr,
1115 tmp->bytenr + tmp->num_bytes - 1,
1116 GFP_NOFS);
1117 kfree(tmp);
1118 }
1119 } else if (refs && found_extent) {
1120 /*
1121		 * the ref and extent were right next to each other, but the
1122 * extent still has a ref, so just free the backref and keep
1123 * going
1124 */
1125 ret = remove_extent_backref(trans, extent_root, path);
1126 BUG_ON(ret);
1127
1128 list_del_init(&op->list);
1129 unlock_extent(&info->extent_ins, op->bytenr,
1130 op->bytenr + op->num_bytes - 1, GFP_NOFS);
1131 kfree(op);
1132 } else {
1133 /*
1134 * the extent has multiple refs and the backref we were looking
1135		 * for was not right next to it, so just unlock it and move on;
1136		 * we're good to go
1137 */
1138 list_del_init(&op->list);
1139 unlock_extent(&info->extent_ins, op->bytenr,
1140 op->bytenr + op->num_bytes - 1, GFP_NOFS);
1141 kfree(op);
1142 }
1143
1144 btrfs_release_path(extent_root, path);
1145 if (!list_empty(del_list))
1146 goto search;
1147
1148out:
1149 btrfs_free_path(path);
1150 return ret;
1151}
1152
1153static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1154 struct btrfs_root *root, u64 bytenr,
1155 u64 orig_parent, u64 parent,
1156 u64 orig_root, u64 ref_root,
1157 u64 orig_generation, u64 ref_generation,
1158 u64 owner_objectid)
1159{
1160 int ret;
1161 struct btrfs_root *extent_root = root->fs_info->extent_root;
1162 struct btrfs_path *path;
1163
1164 if (root == root->fs_info->extent_root) {
1165 struct pending_extent_op *extent_op;
1166 u64 num_bytes;
1167
1168 BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
1169 num_bytes = btrfs_level_size(root, (int)owner_objectid);
1170 mutex_lock(&root->fs_info->extent_ins_mutex);
1171 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
1172 bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
1173 u64 priv;
1174 ret = get_state_private(&root->fs_info->extent_ins,
1175 bytenr, &priv);
1176 BUG_ON(ret);
1177 extent_op = (struct pending_extent_op *)
1178 (unsigned long)priv;
1179 BUG_ON(extent_op->parent != orig_parent);
1180 BUG_ON(extent_op->generation != orig_generation);
1181
1182 extent_op->parent = parent;
1183 extent_op->generation = ref_generation;
1184 } else {
1185 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
1186 BUG_ON(!extent_op);
1187
1188 extent_op->type = PENDING_BACKREF_UPDATE;
1189 extent_op->bytenr = bytenr;
1190 extent_op->num_bytes = num_bytes;
1191 extent_op->parent = parent;
1192 extent_op->orig_parent = orig_parent;
1193 extent_op->generation = ref_generation;
1194 extent_op->orig_generation = orig_generation;
1195 extent_op->level = (int)owner_objectid;
1196 INIT_LIST_HEAD(&extent_op->list);
1197 extent_op->del = 0;
1198
1199 set_extent_bits(&root->fs_info->extent_ins,
1200 bytenr, bytenr + num_bytes - 1,
1201 EXTENT_WRITEBACK, GFP_NOFS);
1202 set_state_private(&root->fs_info->extent_ins,
1203 bytenr, (unsigned long)extent_op);
1204 }
1205 mutex_unlock(&root->fs_info->extent_ins_mutex);
1206 return 0;
1207 }
1208
1209 path = btrfs_alloc_path();
1210 if (!path)
1211 return -ENOMEM;
1212 ret = lookup_extent_backref(trans, extent_root, path,
1213 bytenr, orig_parent, orig_root,
1214 orig_generation, owner_objectid, 1);
1215 if (ret)
1216 goto out;
1217 ret = remove_extent_backref(trans, extent_root, path);
1218 if (ret)
1219 goto out;
1220 ret = insert_extent_backref(trans, extent_root, path, bytenr,
1221 parent, ref_root, ref_generation,
1222 owner_objectid);
1223 BUG_ON(ret);
1224 finish_current_insert(trans, extent_root, 0);
1225 del_pending_extents(trans, extent_root, 0);
1226out:
1227 btrfs_free_path(path);
1228 return ret;
1229}
1230
1231int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1232 struct btrfs_root *root, u64 bytenr,
1233 u64 orig_parent, u64 parent,
1234 u64 ref_root, u64 ref_generation,
1235 u64 owner_objectid)
1236{
1237 int ret;
1238 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1239 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1240 return 0;
1241 ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
1242 parent, ref_root, ref_root,
1243 ref_generation, ref_generation,
1244 owner_objectid);
1245 return ret;
1246}
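
/*
 * The early return above (and the matching one in btrfs_inc_extent_ref()
 * below) skips tree log metadata blocks: the log tree is not reference
 * counted like ordinary subvolume trees, so only file data extents
 * (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) referenced from the log
 * need real backref updates.
 */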
1247
1248static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1249 struct btrfs_root *root, u64 bytenr,
1250 u64 orig_parent, u64 parent,
1251 u64 orig_root, u64 ref_root,
1252 u64 orig_generation, u64 ref_generation,
1253 u64 owner_objectid)
1254{
1255 struct btrfs_path *path;
1256 int ret;
1257 struct btrfs_key key;
1258 struct extent_buffer *l;
1259 struct btrfs_extent_item *item;
1260 u32 refs;
1261
1262 path = btrfs_alloc_path();
1263 if (!path)
1264 return -ENOMEM;
1265
1266 path->reada = 1;
1267 key.objectid = bytenr;
1268 key.type = BTRFS_EXTENT_ITEM_KEY;
1269 key.offset = (u64)-1;
1270
1271 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
1272 0, 1);
1273 if (ret < 0)
1274 return ret;
1275 BUG_ON(ret == 0 || path->slots[0] == 0);
1276
1277 path->slots[0]--;
1278 l = path->nodes[0];
1279
1280 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
1281 if (key.objectid != bytenr) {
1282 btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]);
1283 printk(KERN_ERR "btrfs wanted %llu found %llu\n",
1284 (unsigned long long)bytenr,
1285 (unsigned long long)key.objectid);
1286 BUG();
1287 }
1288 BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
1289
1290 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
1291 refs = btrfs_extent_refs(l, item);
1292 btrfs_set_extent_refs(l, item, refs + 1);
1293 btrfs_mark_buffer_dirty(path->nodes[0]);
1294
1295 btrfs_release_path(root->fs_info->extent_root, path);
1296
1297 path->reada = 1;
1298 ret = insert_extent_backref(trans, root->fs_info->extent_root,
1299 path, bytenr, parent,
1300 ref_root, ref_generation,
1301 owner_objectid);
1302 BUG_ON(ret);
1303 finish_current_insert(trans, root->fs_info->extent_root, 0);
1304 del_pending_extents(trans, root->fs_info->extent_root, 0);
1305
1306 btrfs_free_path(path);
1307 return 0;
1308}
1309
1310int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1311 struct btrfs_root *root,
1312 u64 bytenr, u64 num_bytes, u64 parent,
1313 u64 ref_root, u64 ref_generation,
1314 u64 owner_objectid)
1315{
1316 int ret;
1317 if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1318 owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1319 return 0;
1320 ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
1321 0, ref_root, 0, ref_generation,
1322 owner_objectid);
1323 return ret;
1324}
1325
1326int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1327 struct btrfs_root *root)
1328{
1329 finish_current_insert(trans, root->fs_info->extent_root, 1);
1330 del_pending_extents(trans, root->fs_info->extent_root, 1);
1331 return 0;
1332}
1333
1334int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
1335 struct btrfs_root *root, u64 bytenr,
1336 u64 num_bytes, u32 *refs)
1337{
1338 struct btrfs_path *path;
1339 int ret;
1340 struct btrfs_key key;
1341 struct extent_buffer *l;
1342 struct btrfs_extent_item *item;
1343
1344 WARN_ON(num_bytes < root->sectorsize);
1345	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
1346	path->reada = 1;
1347 key.objectid = bytenr;
1348 key.offset = num_bytes;
1349 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
1350 ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
1351 0, 0);
1352 if (ret < 0)
1353 goto out;
1354 if (ret != 0) {
1355 btrfs_print_leaf(root, path->nodes[0]);
1356 printk(KERN_INFO "btrfs failed to find block number %llu\n",
1357 (unsigned long long)bytenr);
1358 BUG();
1359 }
1360 l = path->nodes[0];
1361 item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
1362 *refs = btrfs_extent_refs(l, item);
1363out:
1364 btrfs_free_path(path);
1365	return ret;
1366}
1367
1368int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1369 struct btrfs_root *root, u64 objectid, u64 bytenr)
1370{
1371 struct btrfs_root *extent_root = root->fs_info->extent_root;
1372 struct btrfs_path *path;
1373 struct extent_buffer *leaf;
1374 struct btrfs_extent_ref *ref_item;
1375 struct btrfs_key key;
1376 struct btrfs_key found_key;
1377 u64 ref_root;
1378 u64 last_snapshot;
1379 u32 nritems;
1380 int ret;
1381
1382 key.objectid = bytenr;
1383 key.offset = (u64)-1;
1384 key.type = BTRFS_EXTENT_ITEM_KEY;
1385
1386	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
1387 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
1388 if (ret < 0)
1389 goto out;
1390 BUG_ON(ret == 0);
1391
1392 ret = -ENOENT;
1393 if (path->slots[0] == 0)
1394 goto out;
1395
1396 path->slots[0]--;
1397 leaf = path->nodes[0];
1398 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1399
1400 if (found_key.objectid != bytenr ||
1401 found_key.type != BTRFS_EXTENT_ITEM_KEY)
1402 goto out;
1403
1404 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1405 while (1) {
1406 leaf = path->nodes[0];
1407 nritems = btrfs_header_nritems(leaf);
1408 if (path->slots[0] >= nritems) {
1409 ret = btrfs_next_leaf(extent_root, path);
1410 if (ret < 0)
1411 goto out;
1412 if (ret == 0)
1413 continue;
1414 break;
1415 }
1416 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1417 if (found_key.objectid != bytenr)
1418 break;
1419
1420 if (found_key.type != BTRFS_EXTENT_REF_KEY) {
1421 path->slots[0]++;
1422 continue;
1423 }
1424
1425 ref_item = btrfs_item_ptr(leaf, path->slots[0],
1426 struct btrfs_extent_ref);
1427 ref_root = btrfs_ref_root(leaf, ref_item);
1428 if ((ref_root != root->root_key.objectid &&
1429 ref_root != BTRFS_TREE_LOG_OBJECTID) ||
1430 objectid != btrfs_ref_objectid(leaf, ref_item)) {
1431 ret = 1;
1432 goto out;
1433 }
1434 if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) {
1435 ret = 1;
1436 goto out;
1437 }
1438
1439 path->slots[0]++;
1440 }
1441 ret = 0;
1442out:
1443 btrfs_free_path(path);
1444 return ret;
1445}
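
/*
 * Return semantics of btrfs_cross_ref_exist() above: 1 means a cross
 * reference may exist (a ref from another root, another objectid, or a
 * generation old enough to be visible to a snapshot), 0 means all refs
 * belong to this root and are newer than the last snapshot, and a
 * negative value is an error (including -ENOENT when no extent item was
 * found at bytenr).
 */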
1446
1447int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1448 struct extent_buffer *buf, u32 nr_extents)
1449{
1450 struct btrfs_key key;
1451 struct btrfs_file_extent_item *fi;
1452 u64 root_gen;
1453 u32 nritems;
1454 int i;
1455 int level;
1456 int ret = 0;
1457 int shared = 0;
1458
1459 if (!root->ref_cows)
1460 return 0;
1461
1462 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1463 shared = 0;
1464 root_gen = root->root_key.offset;
1465 } else {
1466 shared = 1;
1467 root_gen = trans->transid - 1;
1468 }
1469
1470 level = btrfs_header_level(buf);
1471 nritems = btrfs_header_nritems(buf);
1472
1473 if (level == 0) {
1474 struct btrfs_leaf_ref *ref;
1475 struct btrfs_extent_info *info;
1476
1477 ref = btrfs_alloc_leaf_ref(root, nr_extents);
1478 if (!ref) {
1479 ret = -ENOMEM;
1480 goto out;
1481 }
1482
1483 ref->root_gen = root_gen;
1484 ref->bytenr = buf->start;
1485 ref->owner = btrfs_header_owner(buf);
1486 ref->generation = btrfs_header_generation(buf);
1487 ref->nritems = nr_extents;
1488 info = ref->extents;
1489
1490 for (i = 0; nr_extents > 0 && i < nritems; i++) {
1491 u64 disk_bytenr;
1492 btrfs_item_key_to_cpu(buf, &key, i);
1493 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1494 continue;
1495 fi = btrfs_item_ptr(buf, i,
1496 struct btrfs_file_extent_item);
1497 if (btrfs_file_extent_type(buf, fi) ==
1498 BTRFS_FILE_EXTENT_INLINE)
1499 continue;
1500 disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1501 if (disk_bytenr == 0)
1502 continue;
1503
1504 info->bytenr = disk_bytenr;
1505 info->num_bytes =
1506 btrfs_file_extent_disk_num_bytes(buf, fi);
1507 info->objectid = key.objectid;
1508 info->offset = key.offset;
1509 info++;
1510 }
1511
1512 ret = btrfs_add_leaf_ref(root, ref, shared);
1513 if (ret == -EEXIST && shared) {
1514 struct btrfs_leaf_ref *old;
1515 old = btrfs_lookup_leaf_ref(root, ref->bytenr);
1516 BUG_ON(!old);
1517 btrfs_remove_leaf_ref(root, old);
1518 btrfs_free_leaf_ref(root, old);
1519 ret = btrfs_add_leaf_ref(root, ref, shared);
1520 }
1521 WARN_ON(ret);
1522 btrfs_free_leaf_ref(root, ref);
1523 }
1524out:
1525 return ret;
1526}
1527
1528int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1529 struct extent_buffer *orig_buf, struct extent_buffer *buf,
1530 u32 *nr_extents)
1531{
1532 u64 bytenr;
1533 u64 ref_root;
1534 u64 orig_root;
1535 u64 ref_generation;
1536 u64 orig_generation;
1537 u32 nritems;
1538 u32 nr_file_extents = 0;
1539 struct btrfs_key key;
1540 struct btrfs_file_extent_item *fi;
1541 int i;
1542 int level;
1543 int ret = 0;
1544 int faili = 0;
1545 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1546 u64, u64, u64, u64, u64, u64, u64, u64);
1547
1548 ref_root = btrfs_header_owner(buf);
1549 ref_generation = btrfs_header_generation(buf);
1550 orig_root = btrfs_header_owner(orig_buf);
1551 orig_generation = btrfs_header_generation(orig_buf);
1552
1553 nritems = btrfs_header_nritems(buf);
1554 level = btrfs_header_level(buf);
1555
1556 if (root->ref_cows) {
1557 process_func = __btrfs_inc_extent_ref;
1558 } else {
1559 if (level == 0 &&
1560 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1561 goto out;
1562 if (level != 0 &&
1563 root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1564 goto out;
1565 process_func = __btrfs_update_extent_ref;
1566 }
1567
1568 for (i = 0; i < nritems; i++) {
1569 cond_resched();
1570 if (level == 0) {
1571 btrfs_item_key_to_cpu(buf, &key, i);
1572 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1573 continue;
1574 fi = btrfs_item_ptr(buf, i,
1575 struct btrfs_file_extent_item);
1576 if (btrfs_file_extent_type(buf, fi) ==
1577 BTRFS_FILE_EXTENT_INLINE)
1578 continue;
1579 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1580 if (bytenr == 0)
1581 continue;
1582
1583 nr_file_extents++;
1584
1585 ret = process_func(trans, root, bytenr,
1586 orig_buf->start, buf->start,
1587 orig_root, ref_root,
1588 orig_generation, ref_generation,
1589 key.objectid);
1590
1591 if (ret) {
1592 faili = i;
1593 WARN_ON(1);
1594 goto fail;
1595 }
1596 } else {
1597 bytenr = btrfs_node_blockptr(buf, i);
1598 ret = process_func(trans, root, bytenr,
1599 orig_buf->start, buf->start,
1600 orig_root, ref_root,
1601 orig_generation, ref_generation,
1602 level - 1);
1603 if (ret) {
1604 faili = i;
1605 WARN_ON(1);
1606 goto fail;
1607 }
1608 }
1609 }
1610out:
1611 if (nr_extents) {
1612 if (level == 0)
1613 *nr_extents = nr_file_extents;
1614 else
1615 *nr_extents = nritems;
1616 }
1617 return 0;
1618fail:
1619 WARN_ON(1);
1620 return ret;
1621}
1622
1623int btrfs_update_ref(struct btrfs_trans_handle *trans,
1624 struct btrfs_root *root, struct extent_buffer *orig_buf,
1625 struct extent_buffer *buf, int start_slot, int nr)
1626
1627{
1628 u64 bytenr;
1629 u64 ref_root;
1630 u64 orig_root;
1631 u64 ref_generation;
1632 u64 orig_generation;
1633 struct btrfs_key key;
1634 struct btrfs_file_extent_item *fi;
1635 int i;
1636 int ret;
1637 int slot;
1638 int level;
1639
1640 BUG_ON(start_slot < 0);
1641 BUG_ON(start_slot + nr > btrfs_header_nritems(buf));
1642
1643 ref_root = btrfs_header_owner(buf);
1644 ref_generation = btrfs_header_generation(buf);
1645 orig_root = btrfs_header_owner(orig_buf);
1646 orig_generation = btrfs_header_generation(orig_buf);
1647 level = btrfs_header_level(buf);
1648
1649 if (!root->ref_cows) {
1650 if (level == 0 &&
1651 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1652 return 0;
1653 if (level != 0 &&
1654 root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1655 return 0;
1656 }
1657
1658 for (i = 0, slot = start_slot; i < nr; i++, slot++) {
1659 cond_resched();
1660 if (level == 0) {
1661 btrfs_item_key_to_cpu(buf, &key, slot);
1662 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1663 continue;
1664 fi = btrfs_item_ptr(buf, slot,
1665 struct btrfs_file_extent_item);
1666 if (btrfs_file_extent_type(buf, fi) ==
1667 BTRFS_FILE_EXTENT_INLINE)
1668 continue;
1669 bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1670 if (bytenr == 0)
1671 continue;
1672 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1673 orig_buf->start, buf->start,
1674 orig_root, ref_root,
1675 orig_generation, ref_generation,
1676 key.objectid);
1677 if (ret)
1678 goto fail;
1679 } else {
1680 bytenr = btrfs_node_blockptr(buf, slot);
1681 ret = __btrfs_update_extent_ref(trans, root, bytenr,
1682 orig_buf->start, buf->start,
1683 orig_root, ref_root,
1684 orig_generation, ref_generation,
1685 level - 1);
1686 if (ret)
1687 goto fail;
1688 }
1689 }
1690 return 0;
1691fail:
1692 WARN_ON(1);
1693 return -1;
1694}
1695
1696static int write_one_cache_group(struct btrfs_trans_handle *trans,
1697 struct btrfs_root *root,
1698 struct btrfs_path *path,
1699 struct btrfs_block_group_cache *cache)
1700{
1701 int ret;
1702 int pending_ret;
1703 struct btrfs_root *extent_root = root->fs_info->extent_root;
1704 unsigned long bi;
1705 struct extent_buffer *leaf;
1706
1707 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
1708 if (ret < 0)
1709 goto fail;
1710 BUG_ON(ret);
1711
1712 leaf = path->nodes[0];
1713 bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
1714 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
1715 btrfs_mark_buffer_dirty(leaf);
1716 btrfs_release_path(extent_root, path);
1717fail:
1718 finish_current_insert(trans, extent_root, 0);
1719 pending_ret = del_pending_extents(trans, extent_root, 0);
1720 if (ret)
1721 return ret;
1722 if (pending_ret)
1723 return pending_ret;
1724 return 0;
1725
1726}
1727
1728int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1729 struct btrfs_root *root)
1730{
1731 struct btrfs_block_group_cache *cache, *entry;
1732 struct rb_node *n;
1733 int err = 0;
1734 int werr = 0;
1735 struct btrfs_path *path;
1736 u64 last = 0;
1737
1738 path = btrfs_alloc_path();
1739 if (!path)
1740 return -ENOMEM;
1741
1742 while (1) {
1743 cache = NULL;
1744 spin_lock(&root->fs_info->block_group_cache_lock);
1745 for (n = rb_first(&root->fs_info->block_group_cache_tree);
1746 n; n = rb_next(n)) {
1747 entry = rb_entry(n, struct btrfs_block_group_cache,
1748 cache_node);
1749 if (entry->dirty) {
1750 cache = entry;
1751 break;
1752 }
1753 }
1754 spin_unlock(&root->fs_info->block_group_cache_lock);
1755
1756 if (!cache)
1757 break;
1758
1759 cache->dirty = 0;
1760 last += cache->key.offset;
1761
1762 err = write_one_cache_group(trans, root,
1763 path, cache);
1764 /*
1765 * if we fail to write the cache group, we want
1766 * to keep it marked dirty in hopes that a later
1767 * write will work
1768 */
1769 if (err) {
1770 werr = err;
1771 continue;
1772 }
1773 }
1774 btrfs_free_path(path);
1775 return werr;
1776}
1777
1778int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
1779{
1780 struct btrfs_block_group_cache *block_group;
1781 int readonly = 0;
1782
1783 block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
1784 if (!block_group || block_group->ro)
1785 readonly = 1;
1786 if (block_group)
1787 put_block_group(block_group);
1788 return readonly;
1789}
1790
1791static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1792 u64 total_bytes, u64 bytes_used,
1793 struct btrfs_space_info **space_info)
1794{
1795 struct btrfs_space_info *found;
1796
1797 found = __find_space_info(info, flags);
1798 if (found) {
1799 spin_lock(&found->lock);
1800 found->total_bytes += total_bytes;
1801 found->bytes_used += bytes_used;
1802 found->full = 0;
1803 spin_unlock(&found->lock);
1804 *space_info = found;
1805 return 0;
1806 }
1807 found = kzalloc(sizeof(*found), GFP_NOFS);
1808 if (!found)
1809 return -ENOMEM;
1810
1811 list_add(&found->list, &info->space_info);
1812 INIT_LIST_HEAD(&found->block_groups);
1813 init_rwsem(&found->groups_sem);
1814 spin_lock_init(&found->lock);
1815 found->flags = flags;
1816 found->total_bytes = total_bytes;
1817 found->bytes_used = bytes_used;
1818 found->bytes_pinned = 0;
1819 found->bytes_reserved = 0;
1820 found->bytes_readonly = 0;
1821 found->full = 0;
1822 found->force_alloc = 0;
1823 *space_info = found;
1824 return 0;
1825}
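
/*
 * A hedged usage sketch (mirroring do_chunk_alloc() below): callers
 * either find the existing space_info for a set of flags or create a
 * fresh one on demand; total_bytes and bytes_used here are illustrative:
 *
 *	struct btrfs_space_info *sinfo;
 *	ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA,
 *				total_bytes, bytes_used, &sinfo);
 *	if (ret)
 *		return ret;
 */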
1826
1827static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1828{
1829 u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
1830 BTRFS_BLOCK_GROUP_RAID1 |
1831 BTRFS_BLOCK_GROUP_RAID10 |
1832 BTRFS_BLOCK_GROUP_DUP);
1833 if (extra_flags) {
1834 if (flags & BTRFS_BLOCK_GROUP_DATA)
1835 fs_info->avail_data_alloc_bits |= extra_flags;
1836 if (flags & BTRFS_BLOCK_GROUP_METADATA)
1837 fs_info->avail_metadata_alloc_bits |= extra_flags;
1838 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1839 fs_info->avail_system_alloc_bits |= extra_flags;
1840 }
1841}
1842
1843static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
1844{
1845 spin_lock(&cache->space_info->lock);
1846 spin_lock(&cache->lock);
1847 if (!cache->ro) {
1848 cache->space_info->bytes_readonly += cache->key.offset -
1849 btrfs_block_group_used(&cache->item);
1850 cache->ro = 1;
1851 }
1852 spin_unlock(&cache->lock);
1853 spin_unlock(&cache->space_info->lock);
1854}
1855
1856u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
1857{
1858 u64 num_devices = root->fs_info->fs_devices->rw_devices;
1859
1860 if (num_devices == 1)
1861 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
1862 if (num_devices < 4)
1863 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
1864
1865 if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
1866 (flags & (BTRFS_BLOCK_GROUP_RAID1 |
1867 BTRFS_BLOCK_GROUP_RAID10))) {
1868 flags &= ~BTRFS_BLOCK_GROUP_DUP;
1869 }
1870
1871 if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
1872 (flags & BTRFS_BLOCK_GROUP_RAID10)) {
1873 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
1874 }
1875
1876 if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
1877 ((flags & BTRFS_BLOCK_GROUP_RAID1) |
1878 (flags & BTRFS_BLOCK_GROUP_RAID10) |
1879 (flags & BTRFS_BLOCK_GROUP_DUP)))
1880 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
1881 return flags;
1882}
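
/*
 * Worked examples for btrfs_reduce_alloc_profile() above: with a single
 * rw device, (DATA | RAID1 | RAID10) reduces to plain DATA, since RAID1
 * and RAID0 are stripped for one device and RAID10 needs at least four.
 * With two devices, (METADATA | DUP | RAID1) reduces to
 * (METADATA | RAID1), because DUP gives way to RAID1.
 */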
1883
1884static int do_chunk_alloc(struct btrfs_trans_handle *trans,
1885 struct btrfs_root *extent_root, u64 alloc_bytes,
1886 u64 flags, int force)
1887{
1888 struct btrfs_space_info *space_info;
1889 u64 thresh;
1890 int ret = 0;
1891
1892 mutex_lock(&extent_root->fs_info->chunk_mutex);
1893
1894 flags = btrfs_reduce_alloc_profile(extent_root, flags);
1895
1896 space_info = __find_space_info(extent_root->fs_info, flags);
1897 if (!space_info) {
1898 ret = update_space_info(extent_root->fs_info, flags,
1899 0, 0, &space_info);
1900 BUG_ON(ret);
1901 }
1902 BUG_ON(!space_info);
1903
1904 spin_lock(&space_info->lock);
1905 if (space_info->force_alloc) {
1906 force = 1;
1907 space_info->force_alloc = 0;
1908 }
1909 if (space_info->full) {
1910 spin_unlock(&space_info->lock);
1911 goto out;
1912 }
1913
1914 thresh = space_info->total_bytes - space_info->bytes_readonly;
1915 thresh = div_factor(thresh, 6);
1916 if (!force &&
1917 (space_info->bytes_used + space_info->bytes_pinned +
1918 space_info->bytes_reserved + alloc_bytes) < thresh) {
1919 spin_unlock(&space_info->lock);
1920 goto out;
1921 }
1922 spin_unlock(&space_info->lock);
1923
1924 ret = btrfs_alloc_chunk(trans, extent_root, flags);
1925 if (ret)
1926 space_info->full = 1;
1927out:
1928 mutex_unlock(&extent_root->fs_info->chunk_mutex);
1929 return ret;
1930}
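
/*
 * On the threshold above: assuming div_factor(n, f) computes n * f / 10
 * as defined earlier in this file, a new chunk is only allocated once
 * used + pinned + reserved + alloc_bytes crosses 60% of the writable
 * space for this profile (or when the allocation is forced).
 */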
1931
1932static int update_block_group(struct btrfs_trans_handle *trans,
1933 struct btrfs_root *root,
1934 u64 bytenr, u64 num_bytes, int alloc,
1935 int mark_free)
1936{
1937 struct btrfs_block_group_cache *cache;
1938 struct btrfs_fs_info *info = root->fs_info;
1939 u64 total = num_bytes;
1940 u64 old_val;
1941 u64 byte_in_group;
1942
1943 while (total) {
1944 cache = btrfs_lookup_block_group(info, bytenr);
1945 if (!cache)
1946 return -1;
1947 byte_in_group = bytenr - cache->key.objectid;
1948 WARN_ON(byte_in_group > cache->key.offset);
1949
1950 spin_lock(&cache->space_info->lock);
1951 spin_lock(&cache->lock);
1952 cache->dirty = 1;
1953 old_val = btrfs_block_group_used(&cache->item);
1954 num_bytes = min(total, cache->key.offset - byte_in_group);
1955 if (alloc) {
1956 old_val += num_bytes;
1957 cache->space_info->bytes_used += num_bytes;
1958 if (cache->ro)
1959 cache->space_info->bytes_readonly -= num_bytes;
1960 btrfs_set_block_group_used(&cache->item, old_val);
1961 spin_unlock(&cache->lock);
1962 spin_unlock(&cache->space_info->lock);
1963 } else {
1964 old_val -= num_bytes;
1965 cache->space_info->bytes_used -= num_bytes;
1966 if (cache->ro)
1967 cache->space_info->bytes_readonly += num_bytes;
1968 btrfs_set_block_group_used(&cache->item, old_val);
1969 spin_unlock(&cache->lock);
1970 spin_unlock(&cache->space_info->lock);
1971 if (mark_free) {
1972 int ret;
1973
1974 ret = btrfs_discard_extent(root, bytenr,
1975 num_bytes);
1976 WARN_ON(ret);
1977
1978 ret = btrfs_add_free_space(cache, bytenr,
1979 num_bytes);
1980 WARN_ON(ret);
1981 }
1982 }
1983 put_block_group(cache);
1984 total -= num_bytes;
1985 bytenr += num_bytes;
1986 }
1987 return 0;
1988}
1989
1990static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
1991{
1992 struct btrfs_block_group_cache *cache;
1993 u64 bytenr;
1994
1995 cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
1996 if (!cache)
1997 return 0;
1998
1999 bytenr = cache->key.objectid;
2000 put_block_group(cache);
2001
2002 return bytenr;
2003}
2004
2005int btrfs_update_pinned_extents(struct btrfs_root *root,
2006 u64 bytenr, u64 num, int pin)
2007{
2008 u64 len;
2009 struct btrfs_block_group_cache *cache;
2010 struct btrfs_fs_info *fs_info = root->fs_info;
2011
2012 WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
2013 if (pin) {
2014 set_extent_dirty(&fs_info->pinned_extents,
2015 bytenr, bytenr + num - 1, GFP_NOFS);
2016 } else {
2017 clear_extent_dirty(&fs_info->pinned_extents,
2018 bytenr, bytenr + num - 1, GFP_NOFS);
2019 }
2020 while (num > 0) {
2021 cache = btrfs_lookup_block_group(fs_info, bytenr);
2022 BUG_ON(!cache);
2023 len = min(num, cache->key.offset -
2024 (bytenr - cache->key.objectid));
2025 if (pin) {
2026 spin_lock(&cache->space_info->lock);
2027 spin_lock(&cache->lock);
2028 cache->pinned += len;
2029 cache->space_info->bytes_pinned += len;
2030 spin_unlock(&cache->lock);
2031 spin_unlock(&cache->space_info->lock);
2032 fs_info->total_pinned += len;
2033 } else {
2034 spin_lock(&cache->space_info->lock);
2035 spin_lock(&cache->lock);
2036 cache->pinned -= len;
2037 cache->space_info->bytes_pinned -= len;
2038 spin_unlock(&cache->lock);
2039 spin_unlock(&cache->space_info->lock);
2040 fs_info->total_pinned -= len;
2041 if (cache->cached)
2042 btrfs_add_free_space(cache, bytenr, len);
2043 }
2044 put_block_group(cache);
2045 bytenr += len;
2046 num -= len;
2047 }
2048 return 0;
2049}
2050
2051static int update_reserved_extents(struct btrfs_root *root,
2052 u64 bytenr, u64 num, int reserve)
2053{
2054 u64 len;
2055 struct btrfs_block_group_cache *cache;
2056 struct btrfs_fs_info *fs_info = root->fs_info;
2057
2058 while (num > 0) {
2059 cache = btrfs_lookup_block_group(fs_info, bytenr);
2060 BUG_ON(!cache);
2061 len = min(num, cache->key.offset -
2062 (bytenr - cache->key.objectid));
2063
2064 spin_lock(&cache->space_info->lock);
2065 spin_lock(&cache->lock);
2066 if (reserve) {
2067 cache->reserved += len;
2068 cache->space_info->bytes_reserved += len;
2069 } else {
2070 cache->reserved -= len;
2071 cache->space_info->bytes_reserved -= len;
2072 }
2073 spin_unlock(&cache->lock);
2074 spin_unlock(&cache->space_info->lock);
2075 put_block_group(cache);
2076 bytenr += len;
2077 num -= len;
2078 }
2079 return 0;
2080}
2081
2082int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
2083{
2084 u64 last = 0;
2085 u64 start;
2086 u64 end;
2087 struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
2088 int ret;
2089
2090 mutex_lock(&root->fs_info->pinned_mutex);
2091 while (1) {
2092 ret = find_first_extent_bit(pinned_extents, last,
2093 &start, &end, EXTENT_DIRTY);
2094 if (ret)
2095 break;
2096 set_extent_dirty(copy, start, end, GFP_NOFS);
2097 last = end + 1;
2098 }
2099 mutex_unlock(&root->fs_info->pinned_mutex);
2100 return 0;
2101}
2102
2103int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2104 struct btrfs_root *root,
2105 struct extent_io_tree *unpin)
2106{
2107 u64 start;
2108 u64 end;
2109 int ret;
2110
2111 mutex_lock(&root->fs_info->pinned_mutex);
2112 while (1) {
2113 ret = find_first_extent_bit(unpin, 0, &start, &end,
2114 EXTENT_DIRTY);
2115 if (ret)
2116 break;
2117
2118 ret = btrfs_discard_extent(root, start, end + 1 - start);
2119
2120 btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
2121 clear_extent_dirty(unpin, start, end, GFP_NOFS);
2122
2123 if (need_resched()) {
2124 mutex_unlock(&root->fs_info->pinned_mutex);
2125 cond_resched();
2126 mutex_lock(&root->fs_info->pinned_mutex);
2127 }
2128 }
2129 mutex_unlock(&root->fs_info->pinned_mutex);
2130 return ret;
2131}
2132
2133static int finish_current_insert(struct btrfs_trans_handle *trans,
2134 struct btrfs_root *extent_root, int all)
2135{
2136 u64 start;
2137 u64 end;
2138 u64 priv;
2139 u64 search = 0;
2140 u64 skipped = 0;
2141 struct btrfs_fs_info *info = extent_root->fs_info;
2142 struct btrfs_path *path;
2143 struct pending_extent_op *extent_op, *tmp;
2144 struct list_head insert_list, update_list;
2145 int ret;
2146 int num_inserts = 0, max_inserts;
2147
2148	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
2149 INIT_LIST_HEAD(&insert_list);
2150 INIT_LIST_HEAD(&update_list);
2151
2152 max_inserts = extent_root->leafsize /
2153 (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
2154 sizeof(struct btrfs_extent_ref) +
2155 sizeof(struct btrfs_extent_item));
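	/*
	 * illustrative arithmetic: assuming 4KiB leaves and the packed
	 * on-disk sizes (17-byte keys, 25-byte item headers, a 28-byte
	 * btrfs_extent_ref and a 4-byte btrfs_extent_item), this works
	 * out to 4096 / 116 = 35 pending inserts per batch
	 */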
2156again:
2157 mutex_lock(&info->extent_ins_mutex);
2158 while (1) {
2159 ret = find_first_extent_bit(&info->extent_ins, search, &start,
2160 &end, EXTENT_WRITEBACK);
2161 if (ret) {
2162 if (skipped && all && !num_inserts) {
2163 skipped = 0;
2164 search = 0;
2165 continue;
2166 }
2167 mutex_unlock(&info->extent_ins_mutex);
2168 break;
2169 }
2170
2171 ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
2172 if (!ret) {
2173 skipped = 1;
2174 search = end + 1;
2175 if (need_resched()) {
2176 mutex_unlock(&info->extent_ins_mutex);
2177 cond_resched();
2178 mutex_lock(&info->extent_ins_mutex);
2179 }
2180 continue;
2181 }
2182
2183 ret = get_state_private(&info->extent_ins, start, &priv);
2184 BUG_ON(ret);
2185 extent_op = (struct pending_extent_op *)(unsigned long) priv;
2186
2187 if (extent_op->type == PENDING_EXTENT_INSERT) {
2188 num_inserts++;
2189 list_add_tail(&extent_op->list, &insert_list);
2190 search = end + 1;
2191 if (num_inserts == max_inserts) {
2192 mutex_unlock(&info->extent_ins_mutex);
2193 break;
2194 }
2195 } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
2196 list_add_tail(&extent_op->list, &update_list);
2197 search = end + 1;
2198 } else {
2199 BUG();
2200 }
2201 }
2202
2203 /*
2204	 * process the update list, clear the writeback bit for it, and if
2205	 * somebody marked this thing for deletion then just unlock it and be
2206	 * done; free_extents() will handle it
2207 */
2208 mutex_lock(&info->extent_ins_mutex);
2209 list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
2210 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2211 extent_op->bytenr + extent_op->num_bytes - 1,
2212 EXTENT_WRITEBACK, GFP_NOFS);
2213 if (extent_op->del) {
2214 list_del_init(&extent_op->list);
2215 unlock_extent(&info->extent_ins, extent_op->bytenr,
2216 extent_op->bytenr + extent_op->num_bytes
2217 - 1, GFP_NOFS);
2218 kfree(extent_op);
2219 }
2220 }
2221 mutex_unlock(&info->extent_ins_mutex);
2222
2223 /*
2224	 * if we still have things left on the update list, go ahead and
2225	 * update everything
2226 */
2227 if (!list_empty(&update_list)) {
2228 ret = update_backrefs(trans, extent_root, path, &update_list);
2229 BUG_ON(ret);
2230 }
2231
2232 /*
2233	 * if no inserts need to be done, but we skipped some extents and we
2234	 * need to make sure everything is cleaned up, then reset everything
2235	 * and go back to the beginning
2236 */
2237 if (!num_inserts && all && skipped) {
2238 search = 0;
2239 skipped = 0;
2240 INIT_LIST_HEAD(&update_list);
2241 INIT_LIST_HEAD(&insert_list);
2242 goto again;
2243 } else if (!num_inserts) {
2244 goto out;
2245 }
2246
2247 /*
2248 * process the insert extents list. Again if we are deleting this
2249 * extent, then just unlock it, pin down the bytes if need be, and be
2250	 * done with it.  This saves us from having to actually insert the
2251	 * extent into the tree and then subsequently come along and delete it
2252 */
2253 mutex_lock(&info->extent_ins_mutex);
2254 list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
2255 clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2256 extent_op->bytenr + extent_op->num_bytes - 1,
2257 EXTENT_WRITEBACK, GFP_NOFS);
2258 if (extent_op->del) {
2259 u64 used;
2260 list_del_init(&extent_op->list);
2261 unlock_extent(&info->extent_ins, extent_op->bytenr,
2262 extent_op->bytenr + extent_op->num_bytes
2263 - 1, GFP_NOFS);
2264
2265 mutex_lock(&extent_root->fs_info->pinned_mutex);
2266 ret = pin_down_bytes(trans, extent_root,
2267 extent_op->bytenr,
2268 extent_op->num_bytes, 0);
2269 mutex_unlock(&extent_root->fs_info->pinned_mutex);
2270
2271 spin_lock(&info->delalloc_lock);
2272 used = btrfs_super_bytes_used(&info->super_copy);
2273 btrfs_set_super_bytes_used(&info->super_copy,
2274 used - extent_op->num_bytes);
2275 used = btrfs_root_used(&extent_root->root_item);
2276 btrfs_set_root_used(&extent_root->root_item,
2277 used - extent_op->num_bytes);
2278 spin_unlock(&info->delalloc_lock);
2279
2280 ret = update_block_group(trans, extent_root,
2281 extent_op->bytenr,
2282 extent_op->num_bytes,
2283 0, ret > 0);
2284 BUG_ON(ret);
2285 kfree(extent_op);
2286 num_inserts--;
2287 }
2288 }
2289 mutex_unlock(&info->extent_ins_mutex);
2290
2291 ret = insert_extents(trans, extent_root, path, &insert_list,
2292 num_inserts);
2293 BUG_ON(ret);
2294
2295 /*
2296 * if we broke out of the loop in order to insert stuff because we hit
2297 * the maximum number of inserts at a time we can handle, then loop
2298 * back and pick up where we left off
2299 */
2300 if (num_inserts == max_inserts) {
2301 INIT_LIST_HEAD(&insert_list);
2302 INIT_LIST_HEAD(&update_list);
2303 num_inserts = 0;
2304 goto again;
2305 }
2306
2307 /*
2308 * again, if we need to make absolutely sure there are no more pending
2309 * extent operations left and we know that we skipped some, go back to
2310 * the beginning and do it all again
2311 */
2312 if (all && skipped) {
2313 INIT_LIST_HEAD(&insert_list);
2314 INIT_LIST_HEAD(&update_list);
2315 search = 0;
2316 skipped = 0;
2317 num_inserts = 0;
2318 goto again;
2319 }
2320out:
2321 btrfs_free_path(path);
2322 return 0;
2323}
2324
2325static int pin_down_bytes(struct btrfs_trans_handle *trans,
2326 struct btrfs_root *root,
2327 u64 bytenr, u64 num_bytes, int is_data)
2328{
2329 int err = 0;
2330 struct extent_buffer *buf;
2331
2332 if (is_data)
2333 goto pinit;
2334
2335 buf = btrfs_find_tree_block(root, bytenr, num_bytes);
2336 if (!buf)
2337 goto pinit;
2338
2339 /* we can reuse a block if it hasn't been written
2340 * and it is from this transaction. We can't
2341 * reuse anything from the tree log root because
2342 * it has tiny sub-transactions.
2343 */
2344 if (btrfs_buffer_uptodate(buf, 0) &&
2345 btrfs_try_tree_lock(buf)) {
2346 u64 header_owner = btrfs_header_owner(buf);
2347 u64 header_transid = btrfs_header_generation(buf);
2348 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
2349 header_owner != BTRFS_TREE_RELOC_OBJECTID &&
2350 header_transid == trans->transid &&
2351 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
2352 clean_tree_block(NULL, root, buf);
2353 btrfs_tree_unlock(buf);
2354 free_extent_buffer(buf);
2355 return 1;
2356 }
2357 btrfs_tree_unlock(buf);
2358 }
2359 free_extent_buffer(buf);
2360pinit:
2361 btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2362
2363 BUG_ON(err < 0);
2364 return 0;
2365}
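
/*
 * Return semantics of pin_down_bytes() above: 1 means the tree block
 * was clean, from this transaction, and could be dropped immediately
 * without pinning; 0 means the bytes were pinned until the transaction
 * commits. Callers stash this in op->del and feed it to
 * update_block_group() as the mark_free argument.
 */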
2366
2367/*
2368 * remove an extent from the root, returns 0 on success
2369 */
2370static int __free_extent(struct btrfs_trans_handle *trans,
2371 struct btrfs_root *root,
2372 u64 bytenr, u64 num_bytes, u64 parent,
2373 u64 root_objectid, u64 ref_generation,
2374 u64 owner_objectid, int pin, int mark_free)
2375{
2376 struct btrfs_path *path;
2377 struct btrfs_key key;
2378 struct btrfs_fs_info *info = root->fs_info;
2379 struct btrfs_root *extent_root = info->extent_root;
2380 struct extent_buffer *leaf;
2381 int ret;
2382 int extent_slot = 0;
2383 int found_extent = 0;
2384 int num_to_del = 1;
2385 struct btrfs_extent_item *ei;
2386 u32 refs;
2387
2388 key.objectid = bytenr;
2389 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
2390 key.offset = num_bytes;
2391 path = btrfs_alloc_path();
2392 if (!path)
2393 return -ENOMEM;
2394
2395 path->reada = 1;
2396 ret = lookup_extent_backref(trans, extent_root, path,
2397 bytenr, parent, root_objectid,
2398 ref_generation, owner_objectid, 1);
2399 if (ret == 0) {
2400 struct btrfs_key found_key;
2401 extent_slot = path->slots[0];
2402 while (extent_slot > 0) {
2403 extent_slot--;
2404 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2405 extent_slot);
2406 if (found_key.objectid != bytenr)
2407 break;
2408 if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
2409 found_key.offset == num_bytes) {
2410 found_extent = 1;
2411 break;
2412 }
2413 if (path->slots[0] - extent_slot > 5)
2414 break;
2415 }
2416 if (!found_extent) {
2417 ret = remove_extent_backref(trans, extent_root, path);
2418 BUG_ON(ret);
2419 btrfs_release_path(extent_root, path);
2420 ret = btrfs_search_slot(trans, extent_root,
2421 &key, path, -1, 1);
2422 if (ret) {
2423 printk(KERN_ERR "umm, got %d back from search"
2424 ", was looking for %llu\n", ret,
2425 (unsigned long long)bytenr);
2426 btrfs_print_leaf(extent_root, path->nodes[0]);
2427 }
2428 BUG_ON(ret);
2429 extent_slot = path->slots[0];
2430 }
2431 } else {
2432 btrfs_print_leaf(extent_root, path->nodes[0]);
2433 WARN_ON(1);
2434 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
2435 "root %llu gen %llu owner %llu\n",
2436 (unsigned long long)bytenr,
2437 (unsigned long long)root_objectid,
2438 (unsigned long long)ref_generation,
2439 (unsigned long long)owner_objectid);
2440 }
2441
2442 leaf = path->nodes[0];
2443 ei = btrfs_item_ptr(leaf, extent_slot,
2444 struct btrfs_extent_item);
2445 refs = btrfs_extent_refs(leaf, ei);
2446 BUG_ON(refs == 0);
2447 refs -= 1;
2448 btrfs_set_extent_refs(leaf, ei, refs);
2449
2450 btrfs_mark_buffer_dirty(leaf);
2451
2452 if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
2453 struct btrfs_extent_ref *ref;
2454 ref = btrfs_item_ptr(leaf, path->slots[0],
2455 struct btrfs_extent_ref);
2456 BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1);
2457 /* if the back ref and the extent are next to each other
2458 * they get deleted below in one shot
2459 */
2460 path->slots[0] = extent_slot;
2461 num_to_del = 2;
2462 } else if (found_extent) {
2463 /* otherwise delete the extent back ref */
2464 ret = remove_extent_backref(trans, extent_root, path);
2465 BUG_ON(ret);
2466 /* if refs are 0, we need to setup the path for deletion */
2467 if (refs == 0) {
2468 btrfs_release_path(extent_root, path);
2469 ret = btrfs_search_slot(trans, extent_root, &key, path,
2470 -1, 1);
2471 BUG_ON(ret);
2472 }
2473 }
2474
2475 if (refs == 0) {
2476 u64 super_used;
2477 u64 root_used;
2478
2479 if (pin) {
2480 mutex_lock(&root->fs_info->pinned_mutex);
2481 ret = pin_down_bytes(trans, root, bytenr, num_bytes,
2482 owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
2483 mutex_unlock(&root->fs_info->pinned_mutex);
2484 if (ret > 0)
2485 mark_free = 1;
2486 BUG_ON(ret < 0);
2487 }
2488 /* block accounting for super block */
2489 spin_lock(&info->delalloc_lock);
2490 super_used = btrfs_super_bytes_used(&info->super_copy);
2491 btrfs_set_super_bytes_used(&info->super_copy,
2492 super_used - num_bytes);
2493
2494 /* block accounting for root item */
2495 root_used = btrfs_root_used(&root->root_item);
2496 btrfs_set_root_used(&root->root_item,
2497 root_used - num_bytes);
2498 spin_unlock(&info->delalloc_lock);
2499 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
2500 num_to_del);
2501 BUG_ON(ret);
2502 btrfs_release_path(extent_root, path);
2503
2504 if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
2505 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
2506 BUG_ON(ret);
2507 }
2508
2509 ret = update_block_group(trans, root, bytenr, num_bytes, 0,
2510 mark_free);
2511 BUG_ON(ret);
2512 }
2513 btrfs_free_path(path);
2514 finish_current_insert(trans, extent_root, 0);
2515 return ret;
2516}
2517
2518/*
2519 * find all the blocks marked as pending in the radix tree and remove
2520 * them from the extent map
2521 */
2522static int del_pending_extents(struct btrfs_trans_handle *trans,
2523 struct btrfs_root *extent_root, int all)
2524{
2525 int ret;
2526 int err = 0;
2527 u64 start;
2528 u64 end;
2529 u64 priv;
2530 u64 search = 0;
2531 int nr = 0, skipped = 0;
2532 struct extent_io_tree *pending_del;
2533 struct extent_io_tree *extent_ins;
2534 struct pending_extent_op *extent_op;
2535 struct btrfs_fs_info *info = extent_root->fs_info;
2536 struct list_head delete_list;
2537
2538 INIT_LIST_HEAD(&delete_list);
2539 extent_ins = &extent_root->fs_info->extent_ins;
2540 pending_del = &extent_root->fs_info->pending_del;
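 /*
  * both trees reuse the EXTENT_WRITEBACK bit to flag ranges that have
  * a struct pending_extent_op attached via set_state_private(); the
  * records are produced by __btrfs_free_extent() and
  * __btrfs_alloc_reserved_extent() below
  */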
2541
2542again:
2543 mutex_lock(&info->extent_ins_mutex);
2544 while (1) {
2545 ret = find_first_extent_bit(pending_del, search, &start, &end,
2546 EXTENT_WRITEBACK);
2547 if (ret) {
2548 if (all && skipped && !nr) {
2549 search = 0;
2550 continue;
2551 }
2552 mutex_unlock(&info->extent_ins_mutex);
2553 break;
2554 }
2555
2556 ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
2557 if (!ret) {
2558 search = end+1;
2559 skipped = 1;
2560
2561 if (need_resched()) {
2562 mutex_unlock(&info->extent_ins_mutex);
2563 cond_resched();
2564 mutex_lock(&info->extent_ins_mutex);
2565 }
2566
2567 continue;
2568 }
2569 BUG_ON(ret < 0);
2570
2571 ret = get_state_private(pending_del, start, &priv);
2572 BUG_ON(ret);
2573 extent_op = (struct pending_extent_op *)(unsigned long)priv;
2574
2575 clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
2576 GFP_NOFS);
2577 if (!test_range_bit(extent_ins, start, end,
2578 EXTENT_WRITEBACK, 0)) {
2579 list_add_tail(&extent_op->list, &delete_list);
2580 nr++;
2581 } else {
2582 kfree(extent_op);
2583
2584 ret = get_state_private(&info->extent_ins, start,
2585 &priv);
2586 BUG_ON(ret);
2587 extent_op = (struct pending_extent_op *)
2588 (unsigned long)priv;
2589
2590 clear_extent_bits(&info->extent_ins, start, end,
2591 EXTENT_WRITEBACK, GFP_NOFS);
2592
2593 if (extent_op->type == PENDING_BACKREF_UPDATE) {
2594 list_add_tail(&extent_op->list, &delete_list);
2595 search = end + 1;
2596 nr++;
2597 continue;
2598 }
2599
2600 mutex_lock(&extent_root->fs_info->pinned_mutex);
2601 ret = pin_down_bytes(trans, extent_root, start,
2602 end + 1 - start, 0);
2603 mutex_unlock(&extent_root->fs_info->pinned_mutex);
2604
2605 ret = update_block_group(trans, extent_root, start,
2606 end + 1 - start, 0, ret > 0);
2607
2608 unlock_extent(extent_ins, start, end, GFP_NOFS);
2609 BUG_ON(ret);
2610 kfree(extent_op);
2611 }
2612 if (ret)
2613 err = ret;
2614
2615 search = end + 1;
2616
2617 if (need_resched()) {
2618 mutex_unlock(&info->extent_ins_mutex);
2619 cond_resched();
2620 mutex_lock(&info->extent_ins_mutex);
2621 }
2622 }
2623
2624 if (nr) {
2625 ret = free_extents(trans, extent_root, &delete_list);
2626 BUG_ON(ret);
2627 }
2628
2629 if (all && skipped) {
2630 INIT_LIST_HEAD(&delete_list);
2631 search = 0;
2632 nr = 0;
2633 goto again;
2634 }
2635
2636 return err;
2637}
2638
2639/*
2640 * remove an extent from the root, returns 0 on success
2641 */
2642static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2643 struct btrfs_root *root,
2644 u64 bytenr, u64 num_bytes, u64 parent,
2645 u64 root_objectid, u64 ref_generation,
2646 u64 owner_objectid, int pin)
2647{
2648 struct btrfs_root *extent_root = root->fs_info->extent_root;
2649 int pending_ret;
2650 int ret;
2651
2652 WARN_ON(num_bytes < root->sectorsize);
2653 if (root == extent_root) {
2654 struct pending_extent_op *extent_op = NULL;
2655
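 /*
  * freeing a block that belongs to the extent tree itself would mean
  * modifying the extent tree in the middle of an extent tree update,
  * so the delete is only recorded here as a pending operation and is
  * carried out later by del_pending_extents()
  */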
2656 mutex_lock(&root->fs_info->extent_ins_mutex);
2657 if (test_range_bit(&root->fs_info->extent_ins, bytenr,
2658 bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
2659 u64 priv;
2660 ret = get_state_private(&root->fs_info->extent_ins,
2661 bytenr, &priv);
2662 BUG_ON(ret);
2663 extent_op = (struct pending_extent_op *)
2664 (unsigned long)priv;
2665
2666 extent_op->del = 1;
2667 if (extent_op->type == PENDING_EXTENT_INSERT) {
2668 mutex_unlock(&root->fs_info->extent_ins_mutex);
2669 return 0;
2670 }
2671 }
2672
2673 if (extent_op) {
2674 ref_generation = extent_op->orig_generation;
2675 parent = extent_op->orig_parent;
2676 }
2677
2678 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2679 BUG_ON(!extent_op);
2680
2681 extent_op->type = PENDING_EXTENT_DELETE;
2682 extent_op->bytenr = bytenr;
2683 extent_op->num_bytes = num_bytes;
2684 extent_op->parent = parent;
2685 extent_op->orig_parent = parent;
2686 extent_op->generation = ref_generation;
2687 extent_op->orig_generation = ref_generation;
2688 extent_op->level = (int)owner_objectid;
2689 INIT_LIST_HEAD(&extent_op->list);
2690 extent_op->del = 0;
2691
2692 set_extent_bits(&root->fs_info->pending_del,
2693 bytenr, bytenr + num_bytes - 1,
2694 EXTENT_WRITEBACK, GFP_NOFS);
2695 set_state_private(&root->fs_info->pending_del,
2696 bytenr, (unsigned long)extent_op);
2697 mutex_unlock(&root->fs_info->extent_ins_mutex);
2698 return 0;
2699 }
2700 /* if metadata always pin */
2701 if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
2702 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
2703 struct btrfs_block_group_cache *cache;
2704
2705 /* btrfs_free_reserved_extent */
2706 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
2707 BUG_ON(!cache);
2708 btrfs_add_free_space(cache, bytenr, num_bytes);
2709 put_block_group(cache);
2710 update_reserved_extents(root, bytenr, num_bytes, 0);
2711 return 0;
2712 }
2713 pin = 1;
2714 }
2715
2716 /* if data pin when any transaction has committed this */
2717 if (ref_generation != trans->transid)
2718 pin = 1;
2719
2720 ret = __free_extent(trans, root, bytenr, num_bytes, parent,
2721 root_objectid, ref_generation,
2722 owner_objectid, pin, pin == 0);
2723
2724 finish_current_insert(trans, root->fs_info->extent_root, 0);
2725 pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0);
2726 return ret ? ret : pending_ret;
2727}
2728
2729int btrfs_free_extent(struct btrfs_trans_handle *trans,
2730 struct btrfs_root *root,
2731 u64 bytenr, u64 num_bytes, u64 parent,
2732 u64 root_objectid, u64 ref_generation,
2733 u64 owner_objectid, int pin)
2734{
2735 int ret;
2736
2737 ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
2738 root_objectid, ref_generation,
2739 owner_objectid, pin);
2740 return ret;
2741}
2742
2743static u64 stripe_align(struct btrfs_root *root, u64 val)
2744{
2745 u64 mask = ((u64)root->stripesize - 1);
2746 u64 ret = (val + mask) & ~mask;
2747 return ret;
2748}
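
/*
 * worked example (hypothetical values, assuming stripesize is a power
 * of two): with stripesize == 65536, mask == 0xffff, so
 *	stripe_align(root, 65536) == 65536
 *	stripe_align(root, 65537) == 131072
 * i.e. val is rounded up to the next stripe boundary
 */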
2749
2750/*
2751 * walks the btree of allocated extents and finds a hole of a given size.
2752 * The key ins is changed to record the hole:
2753 * ins->objectid == start of the hole (bytenr)
2754 * ins->flags == BTRFS_EXTENT_ITEM_KEY
2755 * ins->offset == size of the hole in bytes
2756 * Any available blocks before search_start are skipped.
2757 */
2758static noinline int find_free_extent(struct btrfs_trans_handle *trans,
2759 struct btrfs_root *orig_root,
2760 u64 num_bytes, u64 empty_size,
2761 u64 search_start, u64 search_end,
2762 u64 hint_byte, struct btrfs_key *ins,
2763 u64 exclude_start, u64 exclude_nr,
2764 int data)
2765{
2766 int ret = 0;
2767 struct btrfs_root *root = orig_root->fs_info->extent_root;
2768 u64 total_needed = num_bytes;
2769 u64 *last_ptr = NULL;
2770 u64 last_wanted = 0;
2771 struct btrfs_block_group_cache *block_group = NULL;
2772 int chunk_alloc_done = 0;
2773 int empty_cluster = 2 * 1024 * 1024;
2774 int allowed_chunk_alloc = 0;
2775 struct list_head *head = NULL, *cur = NULL;
2776 int loop = 0;
2777 int extra_loop = 0;
2778 struct btrfs_space_info *space_info;
2779
2780 WARN_ON(num_bytes < root->sectorsize);
2781 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
2782 ins->objectid = 0;
2783 ins->offset = 0;
2784
2785 if (orig_root->ref_cows || empty_size)
2786 allowed_chunk_alloc = 1;
2787
2788 if (data & BTRFS_BLOCK_GROUP_METADATA) {
2789 last_ptr = &root->fs_info->last_alloc;
2790 empty_cluster = 64 * 1024;
2791 }
2792
2793 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
2794 last_ptr = &root->fs_info->last_data_alloc;
2795
2796 if (last_ptr) {
2797 if (*last_ptr) {
2798 hint_byte = *last_ptr;
2799 last_wanted = *last_ptr;
2800 } else
2801 empty_size += empty_cluster;
2802 } else {
2803 empty_cluster = 0;
2804 }
2805 search_start = max(search_start, first_logical_byte(root, 0));
2806 search_start = max(search_start, hint_byte);
2807
2808 if (last_wanted && search_start != last_wanted) {
2809 last_wanted = 0;
2810 empty_size += empty_cluster;
2811 }
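 /*
  * at this point empty_size includes the empty_cluster padding whenever
  * there was no usable last-allocation hint, so small allocations still
  * end up clustered together
  */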
2812
2813 total_needed += empty_size;
2814 block_group = btrfs_lookup_block_group(root->fs_info, search_start);
2815 if (!block_group)
2816 block_group = btrfs_lookup_first_block_group(root->fs_info,
2817 search_start);
2818 space_info = __find_space_info(root->fs_info, data);
2819
2820 down_read(&space_info->groups_sem);
2821 while (1) {
2822 struct btrfs_free_space *free_space;
2823 /*
2824 * the only way this happens is if our hint points to a block
2825 * group that's not of the proper type; while looping this
2826 * should never happen
2827 */
2828 if (empty_size)
2829 extra_loop = 1;
2830
2831 if (!block_group)
2832 goto new_group_no_lock;
2833
2834 if (unlikely(!block_group->cached)) {
2835 mutex_lock(&block_group->cache_mutex);
2836 ret = cache_block_group(root, block_group);
2837 mutex_unlock(&block_group->cache_mutex);
2838 if (ret)
2839 break;
2840 }
2841
2842 mutex_lock(&block_group->alloc_mutex);
2843 if (unlikely(!block_group_bits(block_group, data)))
2844 goto new_group;
2845
2846 if (unlikely(block_group->ro))
2847 goto new_group;
2848
2849 free_space = btrfs_find_free_space(block_group, search_start,
2850 total_needed);
2851 if (free_space) {
2852 u64 start = block_group->key.objectid;
2853 u64 end = block_group->key.objectid +
2854 block_group->key.offset;
2855
2856 search_start = stripe_align(root, free_space->offset);
2857
2858 /* move on to the next group */
2859 if (search_start + num_bytes >= search_end)
2860 goto new_group;
2861
2862 /* move on to the next group */
2863 if (search_start + num_bytes > end)
2864 goto new_group;
2865
2866 if (last_wanted && search_start != last_wanted) {
2867 total_needed += empty_cluster;
2868 empty_size += empty_cluster;
2869 last_wanted = 0;
2870 /*
2871 * if search_start is still in this block group
2872 * then we just re-search this block group
2873 */
2874 if (search_start >= start &&
2875 search_start < end) {
2876 mutex_unlock(&block_group->alloc_mutex);
2877 continue;
2878 }
2879
2880 /* else we go to the next block group */
2881 goto new_group;
2882 }
2883
2884 if (exclude_nr > 0 &&
2885 (search_start + num_bytes > exclude_start &&
2886 search_start < exclude_start + exclude_nr)) {
2887 search_start = exclude_start + exclude_nr;
2888 /*
2889 * if search_start is still in this block group
2890 * then we just re-search this block group
2891 */
2892 if (search_start >= start &&
2893 search_start < end) {
2894 mutex_unlock(&block_group->alloc_mutex);
2895 last_wanted = 0;
2896 continue;
2897 }
2898
2899 /* else we go to the next block group */
2900 goto new_group;
2901 }
2902
2903 ins->objectid = search_start;
2904 ins->offset = num_bytes;
2905
2906 btrfs_remove_free_space_lock(block_group, search_start,
2907 num_bytes);
2908 /* we are all good, let's return */
2909 mutex_unlock(&block_group->alloc_mutex);
2910 break;
2911 }
2912new_group:
2913 mutex_unlock(&block_group->alloc_mutex);
2914 put_block_group(block_group);
2915 block_group = NULL;
2916new_group_no_lock:
2917 /* don't try to compare new allocations against the
2918 * last allocation any more
2919 */
2920 last_wanted = 0;
2921
2922 /*
2923 * Here's how this works.
2924 * loop == 0: we were searching a block group via a hint
2925 * and didn't find anything, so we start at
2926 * the head of the block groups and keep searching
2927 * loop == 1: we're searching through all of the block groups
2928 * if we hit the head again we have searched
2929 * all of the block groups for this space and we
2930 * need to try and allocate; if we can't, error out.
2931 * loop == 2: we allocated more space and are looping through
2932 * all of the block groups again.
2933 */
2934 if (loop == 0) {
2935 head = &space_info->block_groups;
2936 cur = head->next;
2937 loop++;
2938 } else if (loop == 1 && cur == head) {
2939 int keep_going;
2940
2941 /* at this point we give up on the empty_size
2942 * allocations and just try to allocate the min
2943 * space.
2944 *
2945 * The extra_loop field was set if an empty_size
2946 * allocation was attempted above, and if it is
2947 * set we need to try the loop again without
2948 * the additional empty_size.
2949 */
2950 total_needed -= empty_size;
2951 empty_size = 0;
2952 keep_going = extra_loop;
2953 loop++;
2954
2955 if (allowed_chunk_alloc && !chunk_alloc_done) {
2956 up_read(&space_info->groups_sem);
2957 ret = do_chunk_alloc(trans, root, num_bytes +
2958 2 * 1024 * 1024, data, 1);
2959 down_read(&space_info->groups_sem);
2960 if (ret < 0)
2961 goto loop_check;
2962 head = &space_info->block_groups;
2963 /*
2964 * we've allocated a new chunk, keep
2965 * trying
2966 */
2967 keep_going = 1;
2968 chunk_alloc_done = 1;
2969 } else if (!allowed_chunk_alloc) {
2970 space_info->force_alloc = 1;
2971 }
2972loop_check:
2973 if (keep_going) {
2974 cur = head->next;
2975 extra_loop = 0;
2976 } else {
2977 break;
2978 }
2979 } else if (cur == head) {
2980 break;
2981 }
2982
2983 block_group = list_entry(cur, struct btrfs_block_group_cache,
2984 list);
2985 atomic_inc(&block_group->count);
2986
2987 search_start = block_group->key.objectid;
2988 cur = cur->next;
2989 }
2990
2991 /* we found what we needed */
2992 if (ins->objectid) {
2993 if (!(data & BTRFS_BLOCK_GROUP_DATA))
2994 trans->block_group = block_group->key.objectid;
2995
2996 if (last_ptr)
2997 *last_ptr = ins->objectid + ins->offset;
2998 ret = 0;
2999 } else if (!ret) {
3000 printk(KERN_ERR "btrfs searching for %llu bytes, "
3001 "num_bytes %llu, loop %d, allowed_alloc %d\n",
3002 (unsigned long long)total_needed,
3003 (unsigned long long)num_bytes,
3004 loop, allowed_chunk_alloc);
3005 ret = -ENOSPC;
3006 }
3007 if (block_group)
3008 put_block_group(block_group);
3009
3010 up_read(&space_info->groups_sem);
3011 return ret;
3012}
3013
3014static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3015{
3016 struct btrfs_block_group_cache *cache;
3017 struct list_head *l;
3018
3019 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
3020 (unsigned long long)(info->total_bytes - info->bytes_used -
3021 info->bytes_pinned - info->bytes_reserved),
3022 (info->full) ? "" : "not ");
3023
3024 down_read(&info->groups_sem);
3025 list_for_each(l, &info->block_groups) {
3026 cache = list_entry(l, struct btrfs_block_group_cache, list);
3027 spin_lock(&cache->lock);
3028 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
3029 "%llu pinned %llu reserved\n",
3030 (unsigned long long)cache->key.objectid,
3031 (unsigned long long)cache->key.offset,
3032 (unsigned long long)btrfs_block_group_used(&cache->item),
3033 (unsigned long long)cache->pinned,
3034 (unsigned long long)cache->reserved);
3035 btrfs_dump_free_space(cache, bytes);
3036 spin_unlock(&cache->lock);
3037 }
3038 up_read(&info->groups_sem);
3039}
3040
3041static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3042 struct btrfs_root *root,
3043 u64 num_bytes, u64 min_alloc_size,
3044 u64 empty_size, u64 hint_byte,
3045 u64 search_end, struct btrfs_key *ins,
3046 u64 data)
3047{
3048 int ret;
3049 u64 search_start = 0;
3050 u64 alloc_profile;
3051 struct btrfs_fs_info *info = root->fs_info;
3052
3053 if (data) {
3054 alloc_profile = info->avail_data_alloc_bits &
3055 info->data_alloc_profile;
3056 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
3057 } else if (root == root->fs_info->chunk_root) {
3058 alloc_profile = info->avail_system_alloc_bits &
3059 info->system_alloc_profile;
3060 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
3061 } else {
3062 alloc_profile = info->avail_metadata_alloc_bits &
3063 info->metadata_alloc_profile;
3064 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
3065 }
3066again:
3067 data = btrfs_reduce_alloc_profile(root, data);
3068 /*
3069 * the only place that sets empty_size is btrfs_realloc_node, which
3070 * is not called recursively on allocations
3071 */
3072 if (empty_size || root->ref_cows) {
3073 if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
3074 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3075 2 * 1024 * 1024,
3076 BTRFS_BLOCK_GROUP_METADATA |
3077 (info->metadata_alloc_profile &
3078 info->avail_metadata_alloc_bits), 0);
3079 }
3080 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3081 num_bytes + 2 * 1024 * 1024, data, 0);
3082 }
3083
3084 WARN_ON(num_bytes < root->sectorsize);
3085 ret = find_free_extent(trans, root, num_bytes, empty_size,
3086 search_start, search_end, hint_byte, ins,
3087 trans->alloc_exclude_start,
3088 trans->alloc_exclude_nr, data);
3089
3090 if (ret == -ENOSPC && num_bytes > min_alloc_size) {
3091 num_bytes = num_bytes >> 1;
3092 num_bytes = num_bytes & ~(root->sectorsize - 1);
3093 num_bytes = max(num_bytes, min_alloc_size);
3094 do_chunk_alloc(trans, root->fs_info->extent_root,
3095 num_bytes, data, 1);
3096 goto again;
3097 }
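 /*
  * example of the fallback above (hypothetical sizes): a 1MB request
  * with min_alloc_size == 256K and sectorsize == 4K that keeps hitting
  * -ENOSPC is retried at 512K and then 256K; a failure at 256K is not
  * retried again and instead trips the BUG() below
  */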
3098 if (ret) {
3099 struct btrfs_space_info *sinfo;
3100
3101 sinfo = __find_space_info(root->fs_info, data);
3102 printk(KERN_ERR "btrfs allocation failed flags %llu, "
3103 "wanted %llu\n", (unsigned long long)data,
3104 (unsigned long long)num_bytes);
3105 dump_space_info(sinfo, num_bytes);
3106 BUG();
3107 }
3108
3109 return ret;
3110}
3111
3112int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
3113{
3114 struct btrfs_block_group_cache *cache;
3115 int ret = 0;
3116
3117 cache = btrfs_lookup_block_group(root->fs_info, start);
3118 if (!cache) {
3119 printk(KERN_ERR "Unable to find block group for %llu\n",
3120 (unsigned long long)start);
3121 return -ENOSPC;
3122 }
3123
3124 ret = btrfs_discard_extent(root, start, len);
3125
3126 btrfs_add_free_space(cache, start, len);
3127 put_block_group(cache);
3128 update_reserved_extents(root, start, len, 0);
3129
3130 return ret;
3131}
3132
3133int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3134 struct btrfs_root *root,
3135 u64 num_bytes, u64 min_alloc_size,
3136 u64 empty_size, u64 hint_byte,
3137 u64 search_end, struct btrfs_key *ins,
3138 u64 data)
3139{
3140 int ret;
3141 ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
3142 empty_size, hint_byte, search_end, ins,
3143 data);
3144 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3145 return ret;
3146}
3147
3148static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3149 struct btrfs_root *root, u64 parent,
3150 u64 root_objectid, u64 ref_generation,
3151 u64 owner, struct btrfs_key *ins)
3152{
3153 int ret;
3154 int pending_ret;
3155 u64 super_used;
3156 u64 root_used;
3157 u64 num_bytes = ins->offset;
3158 u32 sizes[2];
3159 struct btrfs_fs_info *info = root->fs_info;
3160 struct btrfs_root *extent_root = info->extent_root;
3161 struct btrfs_extent_item *extent_item;
3162 struct btrfs_extent_ref *ref;
3163 struct btrfs_path *path;
3164 struct btrfs_key keys[2];
3165
3166 if (parent == 0)
3167 parent = ins->objectid;
3168
3169 /* block accounting for super block */
3170 spin_lock(&info->delalloc_lock);
3171 super_used = btrfs_super_bytes_used(&info->super_copy);
3172 btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
3173
3174 /* block accounting for root item */
3175 root_used = btrfs_root_used(&root->root_item);
3176 btrfs_set_root_used(&root->root_item, root_used + num_bytes);
3177 spin_unlock(&info->delalloc_lock);
3178
3179 if (root == extent_root) {
3180 struct pending_extent_op *extent_op;
3181
3182 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
3183 BUG_ON(!extent_op);
3184
3185 extent_op->type = PENDING_EXTENT_INSERT;
3186 extent_op->bytenr = ins->objectid;
3187 extent_op->num_bytes = ins->offset;
3188 extent_op->parent = parent;
3189 extent_op->orig_parent = 0;
3190 extent_op->generation = ref_generation;
3191 extent_op->orig_generation = 0;
3192 extent_op->level = (int)owner;
3193 INIT_LIST_HEAD(&extent_op->list);
3194 extent_op->del = 0;
3195
3196 mutex_lock(&root->fs_info->extent_ins_mutex);
3197 set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
3198 ins->objectid + ins->offset - 1,
3199 EXTENT_WRITEBACK, GFP_NOFS);
3200 set_state_private(&root->fs_info->extent_ins,
3201 ins->objectid, (unsigned long)extent_op);
3202 mutex_unlock(&root->fs_info->extent_ins_mutex);
3203 goto update_block;
3204 }
3205
3206 memcpy(&keys[0], ins, sizeof(*ins));
3207 keys[1].objectid = ins->objectid;
3208 keys[1].type = BTRFS_EXTENT_REF_KEY;
3209 keys[1].offset = parent;
3210 sizes[0] = sizeof(*extent_item);
3211 sizes[1] = sizeof(*ref);
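 /*
  * the extent item and its first backref are inserted as two adjacent
  * items in a single btrfs_insert_empty_items() call; this adjacency
  * is what allows __free_extent() to delete both items in one shot
  */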
3212
3213 path = btrfs_alloc_path();
3214 BUG_ON(!path);
3215
3216 ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
3217 sizes, 2);
3218 BUG_ON(ret);
3219
3220 extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3221 struct btrfs_extent_item);
3222 btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
3223 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
3224 struct btrfs_extent_ref);
3225
3226 btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
3227 btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
3228 btrfs_set_ref_objectid(path->nodes[0], ref, owner);
3229 btrfs_set_ref_num_refs(path->nodes[0], ref, 1);
3230
3231 btrfs_mark_buffer_dirty(path->nodes[0]);
3232
3233 trans->alloc_exclude_start = 0;
3234 trans->alloc_exclude_nr = 0;
3235 btrfs_free_path(path);
3236 finish_current_insert(trans, extent_root, 0);
3237 pending_ret = del_pending_extents(trans, extent_root, 0);
3238
3239 if (ret)
3240 goto out;
3241 if (pending_ret) {
3242 ret = pending_ret;
3243 goto out;
3244 }
3245
3246update_block:
3247 ret = update_block_group(trans, root, ins->objectid,
3248 ins->offset, 1, 0);
3249 if (ret) {
3250 printk(KERN_ERR "btrfs update block group failed for %llu "
3251 "%llu\n", (unsigned long long)ins->objectid,
3252 (unsigned long long)ins->offset);
3253 BUG();
3254 }
3255out:
3256 return ret;
3257}
3258
3259int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3260 struct btrfs_root *root, u64 parent,
3261 u64 root_objectid, u64 ref_generation,
3262 u64 owner, struct btrfs_key *ins)
3263{
3264 int ret;
3265
3266 if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
3267 return 0;
3268 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3269 ref_generation, owner, ins);
3270 update_reserved_extents(root, ins->objectid, ins->offset, 0);
3271 return ret;
3272}
3273
3274/*
3275 * this is used by the tree logging recovery code. It records that
3276 * an extent has been allocated and makes sure to clear the free
3277 * space cache bits as well
3278 */
3279int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
3280 struct btrfs_root *root, u64 parent,
3281 u64 root_objectid, u64 ref_generation,
3282 u64 owner, struct btrfs_key *ins)
3283{
3284 int ret;
3285 struct btrfs_block_group_cache *block_group;
3286
3287 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
3288 mutex_lock(&block_group->cache_mutex);
3289 cache_block_group(root, block_group);
3290 mutex_unlock(&block_group->cache_mutex);
3291
3292 ret = btrfs_remove_free_space(block_group, ins->objectid,
3293 ins->offset);
3294 BUG_ON(ret);
3295 put_block_group(block_group);
3296 ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3297 ref_generation, owner, ins);
3298 return ret;
3299}
3300
3301/*
3302 * finds a free extent and does all the dirty work required for allocation
3303 * returns the key for the extent through ins.
3305 *
3306 * returns 0 if everything worked, non-zero otherwise.
3307 */
3308int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
3309 struct btrfs_root *root,
3310 u64 num_bytes, u64 parent, u64 min_alloc_size,
3311 u64 root_objectid, u64 ref_generation,
3312 u64 owner_objectid, u64 empty_size, u64 hint_byte,
3313 u64 search_end, struct btrfs_key *ins, u64 data)
3314{
3315 int ret;
3316
3317 ret = __btrfs_reserve_extent(trans, root, num_bytes,
3318 min_alloc_size, empty_size, hint_byte,
3319 search_end, ins, data);
3320 BUG_ON(ret);
3321 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
3322 ret = __btrfs_alloc_reserved_extent(trans, root, parent,
3323 root_objectid, ref_generation,
3324 owner_objectid, ins);
3325 BUG_ON(ret);
3326
3327 } else {
3328 update_reserved_extents(root, ins->objectid, ins->offset, 1);
3329 }
3330 return ret;
3331}
3332
3333struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3334 struct btrfs_root *root,
3335 u64 bytenr, u32 blocksize)
3336{
3337 struct extent_buffer *buf;
3338
3339 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
3340 if (!buf)
3341 return ERR_PTR(-ENOMEM);
3342 btrfs_set_header_generation(buf, trans->transid);
3343 btrfs_tree_lock(buf);
3344 clean_tree_block(trans, root, buf);
3345 btrfs_set_buffer_uptodate(buf);
3346 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3347 set_extent_dirty(&root->dirty_log_pages, buf->start,
3348 buf->start + buf->len - 1, GFP_NOFS);
3349 } else {
3350 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
3351 buf->start + buf->len - 1, GFP_NOFS);
3352 }
3353 trans->blocks_used++;
3354 return buf;
3355}
3356
3357/*
3358 * helper function to allocate a block for a given tree
3359 * returns the tree buffer or NULL.
3360 */
3361struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3362 struct btrfs_root *root,
3363 u32 blocksize, u64 parent,
3364 u64 root_objectid,
3365 u64 ref_generation,
3366 int level,
3367 u64 hint,
3368 u64 empty_size)
3369{
3370 struct btrfs_key ins;
3371 int ret;
3372 struct extent_buffer *buf;
3373
3374 ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize,
3375 root_objectid, ref_generation, level,
3376 empty_size, hint, (u64)-1, &ins, 0);
3377 if (ret) {
3378 BUG_ON(ret > 0);
3379 return ERR_PTR(ret);
3380 }
3381
3382 buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize);
3383 return buf;
3384}
3385
3386int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3387 struct btrfs_root *root, struct extent_buffer *leaf)
3388{
3389 u64 leaf_owner;
3390 u64 leaf_generation;
3391 struct btrfs_key key;
3392 struct btrfs_file_extent_item *fi;
3393 int i;
3394 int nritems;
3395 int ret;
3396
3397 BUG_ON(!btrfs_is_leaf(leaf));
3398 nritems = btrfs_header_nritems(leaf);
3399 leaf_owner = btrfs_header_owner(leaf);
3400 leaf_generation = btrfs_header_generation(leaf);
3401
3402 for (i = 0; i < nritems; i++) {
3403 u64 disk_bytenr;
3404 cond_resched();
3405
3406 btrfs_item_key_to_cpu(leaf, &key, i);
3407 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3408 continue;
3409 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3410 if (btrfs_file_extent_type(leaf, fi) ==
3411 BTRFS_FILE_EXTENT_INLINE)
3412 continue;
3413 /*
3414 * FIXME make sure to insert a trans record that
3415 * repeats the snapshot del on crash
3416 */
3417 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
3418 if (disk_bytenr == 0)
3419 continue;
3420
3421 ret = __btrfs_free_extent(trans, root, disk_bytenr,
3422 btrfs_file_extent_disk_num_bytes(leaf, fi),
3423 leaf->start, leaf_owner, leaf_generation,
3424 key.objectid, 0);
3425 BUG_ON(ret);
3426
3427 atomic_inc(&root->fs_info->throttle_gen);
3428 wake_up(&root->fs_info->transaction_throttle);
3429 cond_resched();
3430 }
3431 return 0;
3432}
3433
3434static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3435 struct btrfs_root *root,
3436 struct btrfs_leaf_ref *ref)
3437{
3438 int i;
3439 int ret;
3440 struct btrfs_extent_info *info = ref->extents;
3441
3442 for (i = 0; i < ref->nritems; i++) {
3443 ret = __btrfs_free_extent(trans, root, info->bytenr,
3444 info->num_bytes, ref->bytenr,
3445 ref->owner, ref->generation,
3446 info->objectid, 0);
3447
3448 atomic_inc(&root->fs_info->throttle_gen);
3449 wake_up(&root->fs_info->transaction_throttle);
3450 cond_resched();
3451
3452 BUG_ON(ret);
3453 info++;
3454 }
3455
3456 return 0;
3457}
3458
3459static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
3460 u64 len, u32 *refs)
3461{
3462 int ret;
3463
3464 ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
3465 BUG_ON(ret);
3466
3467#if 0 /* some debugging code in case we see problems here */
3468 /* if the refs count is one, it won't get increased again. But
3469 * if the ref count is > 1, someone may be decreasing it at
3470 * the same time we are.
3471 */
3472 if (*refs != 1) {
3473 struct extent_buffer *eb = NULL;
3474 eb = btrfs_find_create_tree_block(root, start, len);
3475 if (eb)
3476 btrfs_tree_lock(eb);
3477
3478 mutex_lock(&root->fs_info->alloc_mutex);
3479 ret = lookup_extent_ref(NULL, root, start, len, refs);
3480 BUG_ON(ret);
3481 mutex_unlock(&root->fs_info->alloc_mutex);
3482
3483 if (eb) {
3484 btrfs_tree_unlock(eb);
3485 free_extent_buffer(eb);
3486 }
3487 if (*refs == 1) {
3488 printk(KERN_ERR "btrfs block %llu went down to one "
3489 "during drop_snap\n", (unsigned long long)start);
3490 }
3491
3492 }
3493#endif
3494
3495 cond_resched();
3496 return ret;
3497}
3498
3499/*
3500 * helper function for drop_snapshot, this walks down the tree dropping ref
3501 * counts as it goes.
3502 */
3503static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
3504 struct btrfs_root *root,
3505 struct btrfs_path *path, int *level)
3506{
3507 u64 root_owner;
3508 u64 root_gen;
3509 u64 bytenr;
3510 u64 ptr_gen;
3511 struct extent_buffer *next;
3512 struct extent_buffer *cur;
3513 struct extent_buffer *parent;
3514 struct btrfs_leaf_ref *ref;
3515 u32 blocksize;
3516 int ret;
3517 u32 refs;
3518
3519 WARN_ON(*level < 0);
3520 WARN_ON(*level >= BTRFS_MAX_LEVEL);
3521 ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
3522 path->nodes[*level]->len, &refs);
3523 BUG_ON(ret);
3524 if (refs > 1)
3525 goto out;
3526
3527 /*
3528 * walk down to the last node level and free all the leaves
3529 */
3530 while (*level >= 0) {
3531 WARN_ON(*level < 0);
3532 WARN_ON(*level >= BTRFS_MAX_LEVEL);
3533 cur = path->nodes[*level];
3534
3535 if (btrfs_header_level(cur) != *level)
3536 WARN_ON(1);
3537
3538 if (path->slots[*level] >=
3539 btrfs_header_nritems(cur))
3540 break;
3541 if (*level == 0) {
3542 ret = btrfs_drop_leaf_ref(trans, root, cur);
3543 BUG_ON(ret);
3544 break;
3545 }
3546 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3547 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3548 blocksize = btrfs_level_size(root, *level - 1);
3549
3550 ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
3551 BUG_ON(ret);
3552 if (refs != 1) {
3553 parent = path->nodes[*level];
3554 root_owner = btrfs_header_owner(parent);
3555 root_gen = btrfs_header_generation(parent);
3556 path->slots[*level]++;
3557
3558 ret = __btrfs_free_extent(trans, root, bytenr,
3559 blocksize, parent->start,
3560 root_owner, root_gen,
3561 *level - 1, 1);
3562 BUG_ON(ret);
3563
3564 atomic_inc(&root->fs_info->throttle_gen);
3565 wake_up(&root->fs_info->transaction_throttle);
3566 cond_resched();
3567
3568 continue;
3569 }
3570 /*
3571 * at this point, we have a single ref, and since the
3572 * only place referencing this extent is a dead root
3573 * the reference count should never go higher.
3574 * So, we don't need to check it again
3575 */
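 /*
  * level 1 nodes get a shortcut: if the leaf's extent pointers
  * were cached in a btrfs_leaf_ref, the file extents can be
  * dropped via cache_drop_leaf_ref() without reading the leaf
  * block itself
  */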
3576 if (*level == 1) {
3577 ref = btrfs_lookup_leaf_ref(root, bytenr);
3578 if (ref && ref->generation != ptr_gen) {
3579 btrfs_free_leaf_ref(root, ref);
3580 ref = NULL;
3581 }
3582 if (ref) {
3583 ret = cache_drop_leaf_ref(trans, root, ref);
3584 BUG_ON(ret);
3585 btrfs_remove_leaf_ref(root, ref);
3586 btrfs_free_leaf_ref(root, ref);
3587 *level = 0;
3588 break;
3589 }
3590 }
3591 next = btrfs_find_tree_block(root, bytenr, blocksize);
3592 if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
3593 free_extent_buffer(next);
3594
3595 next = read_tree_block(root, bytenr, blocksize,
3596 ptr_gen);
3597 cond_resched();
3598#if 0
3599 /*
3600 * this is a debugging check and can go away;
3601 * the ref should never go all the way down to 1
3602 * at this point
3603 */
3604 ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
3605 &refs);
3606 BUG_ON(ret);
3607 WARN_ON(refs != 1);
3608#endif
3609 }
3610 WARN_ON(*level <= 0);
3611 if (path->nodes[*level-1])
3612 free_extent_buffer(path->nodes[*level-1]);
3613 path->nodes[*level-1] = next;
3614 *level = btrfs_header_level(next);
3615 path->slots[*level] = 0;
3616 cond_resched();
3617 }
3618out:
3619 WARN_ON(*level < 0);
3620 WARN_ON(*level >= BTRFS_MAX_LEVEL);
3621
3622 if (path->nodes[*level] == root->node) {
3623 parent = path->nodes[*level];
3624 bytenr = path->nodes[*level]->start;
3625 } else {
3626 parent = path->nodes[*level + 1];
3627 bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
3628 }
3629
3630 blocksize = btrfs_level_size(root, *level);
3631 root_owner = btrfs_header_owner(parent);
3632 root_gen = btrfs_header_generation(parent);
3633
3634 ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
3635 parent->start, root_owner, root_gen,
3636 *level, 1);
3637 free_extent_buffer(path->nodes[*level]);
3638 path->nodes[*level] = NULL;
3639 *level += 1;
3640 BUG_ON(ret);
3641
3642 cond_resched();
3643 return 0;
3644}
3645
3646/*
3647 * helper function for drop_subtree, this function is similar to
3648 * walk_down_tree. The main difference is that it checks reference
3649 * counts while tree blocks are locked.
3650 */
3651static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
3652 struct btrfs_root *root,
3653 struct btrfs_path *path, int *level)
3654{
3655 struct extent_buffer *next;
3656 struct extent_buffer *cur;
3657 struct extent_buffer *parent;
3658 u64 bytenr;
3659 u64 ptr_gen;
3660 u32 blocksize;
3661 u32 refs;
3662 int ret;
3663
3664 cur = path->nodes[*level];
3665 ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len,
3666 &refs);
3667 BUG_ON(ret);
3668 if (refs > 1)
3669 goto out;
3670
3671 while (*level >= 0) {
3672 cur = path->nodes[*level];
3673 if (*level == 0) {
3674 ret = btrfs_drop_leaf_ref(trans, root, cur);
3675 BUG_ON(ret);
3676 clean_tree_block(trans, root, cur);
3677 break;
3678 }
3679 if (path->slots[*level] >= btrfs_header_nritems(cur)) {
3680 clean_tree_block(trans, root, cur);
3681 break;
3682 }
3683
3684 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
3685 blocksize = btrfs_level_size(root, *level - 1);
3686 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
3687
3688 next = read_tree_block(root, bytenr, blocksize, ptr_gen);
3689 btrfs_tree_lock(next);
3690
3691 ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
3692 &refs);
3693 BUG_ON(ret);
3694 if (refs > 1) {
3695 parent = path->nodes[*level];
3696 ret = btrfs_free_extent(trans, root, bytenr,
3697 blocksize, parent->start,
3698 btrfs_header_owner(parent),
3699 btrfs_header_generation(parent),
3700 *level - 1, 1);
3701 BUG_ON(ret);
3702 path->slots[*level]++;
3703 btrfs_tree_unlock(next);
3704 free_extent_buffer(next);
3705 continue;
3706 }
3707
3708 *level = btrfs_header_level(next);
3709 path->nodes[*level] = next;
3710 path->slots[*level] = 0;
3711 path->locks[*level] = 1;
3712 cond_resched();
3713 }
3714out:
3715 parent = path->nodes[*level + 1];
3716 bytenr = path->nodes[*level]->start;
3717 blocksize = path->nodes[*level]->len;
3718
3719 ret = btrfs_free_extent(trans, root, bytenr, blocksize,
3720 parent->start, btrfs_header_owner(parent),
3721 btrfs_header_generation(parent), *level, 1);
3722 BUG_ON(ret);
3723
3724 if (path->locks[*level]) {
3725 btrfs_tree_unlock(path->nodes[*level]);
3726 path->locks[*level] = 0;
3727 }
3728 free_extent_buffer(path->nodes[*level]);
3729 path->nodes[*level] = NULL;
3730 *level += 1;
3731 cond_resched();
3732 return 0;
3733}
3734
3735/*
3736 * helper for dropping snapshots. This walks back up the tree in the path
3737 * to find the first node higher up where we haven't yet gone through
3738 * all the slots
3739 */
3740static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
3741 struct btrfs_root *root,
3742 struct btrfs_path *path,
3743 int *level, int max_level)
3744{
3745 u64 root_owner;
3746 u64 root_gen;
3747 struct btrfs_root_item *root_item = &root->root_item;
3748 int i;
3749 int slot;
3750 int ret;
3751
3752 for (i = *level; i < max_level && path->nodes[i]; i++) {
3753 slot = path->slots[i];
3754 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
3755 struct extent_buffer *node;
3756 struct btrfs_disk_key disk_key;
3757 node = path->nodes[i];
3758 path->slots[i]++;
3759 *level = i;
3760 WARN_ON(*level == 0);
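 /*
  * record the key we are about to descend into, so that a
  * later btrfs_drop_snapshot() call can resume the walk from
  * drop_progress/drop_level
  */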
3761 btrfs_node_key(node, &disk_key, path->slots[i]);
3762 memcpy(&root_item->drop_progress,
3763 &disk_key, sizeof(disk_key));
3764 root_item->drop_level = i;
3765 return 0;
3766 } else {
3767 struct extent_buffer *parent;
3768 if (path->nodes[*level] == root->node)
3769 parent = path->nodes[*level];
3770 else
3771 parent = path->nodes[*level + 1];
3772
3773 root_owner = btrfs_header_owner(parent);
3774 root_gen = btrfs_header_generation(parent);
3775
3776 clean_tree_block(trans, root, path->nodes[*level]);
3777 ret = btrfs_free_extent(trans, root,
3778 path->nodes[*level]->start,
3779 path->nodes[*level]->len,
3780 parent->start, root_owner,
3781 root_gen, *level, 1);
3782 BUG_ON(ret);
3783 if (path->locks[*level]) {
3784 btrfs_tree_unlock(path->nodes[*level]);
3785 path->locks[*level] = 0;
3786 }
3787 free_extent_buffer(path->nodes[*level]);
3788 path->nodes[*level] = NULL;
3789 *level = i + 1;
3790 }
3791 }
3792 return 1;
3793}
3794
3795/*
3796 * drop the reference count on the tree rooted at 'root'. This traverses
3797 * the tree freeing any blocks that have a ref count of zero after being
3798 * decremented.
3799 */
3800int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
3801 *root)
3802{
3803 int ret = 0;
3804 int wret;
3805 int level;
3806 struct btrfs_path *path;
3807 int i;
3808 int orig_level;
3809 struct btrfs_root_item *root_item = &root->root_item;
3810
3811 WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
3812 path = btrfs_alloc_path();
3813 BUG_ON(!path);
3814
3815 level = btrfs_header_level(root->node);
3816 orig_level = level;
3817 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
3818 path->nodes[level] = root->node;
3819 extent_buffer_get(root->node);
3820 path->slots[level] = 0;
3821 } else {
3822 struct btrfs_key key;
3823 struct btrfs_disk_key found_key;
3824 struct extent_buffer *node;
3825
3826 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
3827 level = root_item->drop_level;
3828 path->lowest_level = level;
3829 wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3830 if (wret < 0) {
3831 ret = wret;
3832 goto out;
3833 }
3834 node = path->nodes[level];
3835 btrfs_node_key(node, &found_key, path->slots[level]);
3836 WARN_ON(memcmp(&found_key, &root_item->drop_progress,
3837 sizeof(found_key)));
3838 /*
3839 * unlock our path, this is safe because only this
3840 * function is allowed to delete this snapshot
3841 */
3842 for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
3843 if (path->nodes[i] && path->locks[i]) {
3844 path->locks[i] = 0;
3845 btrfs_tree_unlock(path->nodes[i]);
3846 }
3847 }
3848 }
3849 while (1) {
3850 wret = walk_down_tree(trans, root, path, &level);
3851 if (wret > 0)
3852 break;
3853 if (wret < 0)
3854 ret = wret;
3855
3856 wret = walk_up_tree(trans, root, path, &level,
3857 BTRFS_MAX_LEVEL);
3858 if (wret > 0)
3859 break;
3860 if (wret < 0)
3861 ret = wret;
3862 if (trans->transaction->in_commit) {
3863 ret = -EAGAIN;
3864 break;
3865 }
3866 atomic_inc(&root->fs_info->throttle_gen);
3867 wake_up(&root->fs_info->transaction_throttle);
3868 }
3869 for (i = 0; i <= orig_level; i++) {
3870 if (path->nodes[i]) {
3871 free_extent_buffer(path->nodes[i]);
3872 path->nodes[i] = NULL;
3873 }
3874 }
3875out:
3876 btrfs_free_path(path);
3877 return ret;
3878}
3879
3880int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
3881 struct btrfs_root *root,
3882 struct extent_buffer *node,
3883 struct extent_buffer *parent)
3884{
3885 struct btrfs_path *path;
3886 int level;
3887 int parent_level;
3888 int ret = 0;
3889 int wret;
3890
3891 path = btrfs_alloc_path();
3892 BUG_ON(!path);
3893
3894 BUG_ON(!btrfs_tree_locked(parent));
3895 parent_level = btrfs_header_level(parent);
3896 extent_buffer_get(parent);
3897 path->nodes[parent_level] = parent;
3898 path->slots[parent_level] = btrfs_header_nritems(parent);
3899
3900 BUG_ON(!btrfs_tree_locked(node));
3901 level = btrfs_header_level(node);
3902 extent_buffer_get(node);
3903 path->nodes[level] = node;
3904 path->slots[level] = 0;
3905
3906 while (1) {
3907 wret = walk_down_subtree(trans, root, path, &level);
3908 if (wret < 0)
3909 ret = wret;
3910 if (wret != 0)
3911 break;
3912
3913 wret = walk_up_tree(trans, root, path, &level, parent_level);
3914 if (wret < 0)
3915 ret = wret;
3916 if (wret != 0)
3917 break;
3918 }
3919
3920 btrfs_free_path(path);
3921 return ret;
3922}
3923
3924static unsigned long calc_ra(unsigned long start, unsigned long last,
3925 unsigned long nr)
3926{
3927 return min(last, start + nr - 1);
3928}
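
/*
 * e.g. (hypothetical values) start == 100, nr == 32, last == 120:
 * min(120, 100 + 32 - 1) == 120, so the readahead window is clipped
 * to the last page index we actually care about
 */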
3929
3930static noinline int relocate_inode_pages(struct inode *inode, u64 start,
3931 u64 len)
3932{
3933 u64 page_start;
3934 u64 page_end;
3935 unsigned long first_index;
3936 unsigned long last_index;
3937 unsigned long i;
3938 struct page *page;
3939 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3940 struct file_ra_state *ra;
3941 struct btrfs_ordered_extent *ordered;
3942 unsigned int total_read = 0;
3943 unsigned int total_dirty = 0;
3944 int ret = 0;
3945
3946 ra = kzalloc(sizeof(*ra), GFP_NOFS);
 if (!ra)
 	return -ENOMEM;
3947
3948 mutex_lock(&inode->i_mutex);
3949 first_index = start >> PAGE_CACHE_SHIFT;
3950 last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
3951
3952 /* make sure the dirty trick played by the caller works */
3953 ret = invalidate_inode_pages2_range(inode->i_mapping,
3954 first_index, last_index);
3955 if (ret)
3956 goto out_unlock;
3957
3958 file_ra_state_init(ra, inode->i_mapping);
3959
3960 for (i = first_index ; i <= last_index; i++) {
3961 if (total_read % ra->ra_pages == 0) {
3962 btrfs_force_ra(inode->i_mapping, ra, NULL, i,
3963 calc_ra(i, last_index, ra->ra_pages));
3964 }
3965 total_read++;
3966again:
3967 if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
3968 BUG_ON(1);
3969 page = grab_cache_page(inode->i_mapping, i);
3970 if (!page) {
3971 ret = -ENOMEM;
3972 goto out_unlock;
3973 }
3974 if (!PageUptodate(page)) {
3975 btrfs_readpage(NULL, page);
3976 lock_page(page);
3977 if (!PageUptodate(page)) {
3978 unlock_page(page);
3979 page_cache_release(page);
3980 ret = -EIO;
3981 goto out_unlock;
3982 }
3983 }
3984 wait_on_page_writeback(page);
3985
3986 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
3987 page_end = page_start + PAGE_CACHE_SIZE - 1;
3988 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
3989
3990 ordered = btrfs_lookup_ordered_extent(inode, page_start);
3991 if (ordered) {
3992 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3993 unlock_page(page);
3994 page_cache_release(page);
3995 btrfs_start_ordered_extent(inode, ordered, 1);
3996 btrfs_put_ordered_extent(ordered);
3997 goto again;
3998 }
3999 set_page_extent_mapped(page);
4000
4001 if (i == first_index)
4002 set_extent_bits(io_tree, page_start, page_end,
4003 EXTENT_BOUNDARY, GFP_NOFS);
4004 btrfs_set_extent_delalloc(inode, page_start, page_end);
4005
4006 set_page_dirty(page);
4007 total_dirty++;
4008
4009 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4010 unlock_page(page);
4011 page_cache_release(page);
4012 }
4013
4014out_unlock:
4015 kfree(ra);
4016 mutex_unlock(&inode->i_mutex);
4017 balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
4018 return ret;
4019}
4020
4021static noinline int relocate_data_extent(struct inode *reloc_inode,
4022 struct btrfs_key *extent_key,
4023 u64 offset)
4024{
4025 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
4026 struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
4027 struct extent_map *em;
4028 u64 start = extent_key->objectid - offset;
4029 u64 end = start + extent_key->offset - 1;
4030
4031 em = alloc_extent_map(GFP_NOFS);
4032 BUG_ON(!em || IS_ERR(em));
4033
4034 em->start = start;
4035 em->len = extent_key->offset;
4036 em->block_len = extent_key->offset;
4037 em->block_start = extent_key->objectid;
4038 em->bdev = root->fs_info->fs_devices->latest_bdev;
4039 set_bit(EXTENT_FLAG_PINNED, &em->flags);
4040
4041 /* setup extent map to cheat btrfs_readpage */
4042 lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
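 /*
  * loop until the mapping sticks: add_extent_mapping() returns
  * -EEXIST while a cached extent overlaps the range, so the
  * overlap is dropped and the insert retried
  */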
4043 while (1) {
4044 int ret;
4045 spin_lock(&em_tree->lock);
4046 ret = add_extent_mapping(em_tree, em);
4047 spin_unlock(&em_tree->lock);
4048 if (ret != -EEXIST) {
4049 free_extent_map(em);
4050 break;
4051 }
4052 btrfs_drop_extent_cache(reloc_inode, start, end, 0);
4053 }
4054 unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
4055
4056 return relocate_inode_pages(reloc_inode, start, extent_key->offset);
4057}
4058
4059struct btrfs_ref_path {
4060 u64 extent_start;
4061 u64 nodes[BTRFS_MAX_LEVEL];
4062 u64 root_objectid;
4063 u64 root_generation;
4064 u64 owner_objectid;
4065 u32 num_refs;
4066 int lowest_level;
4067 int current_level;
4068 int shared_level;
4069
4070 struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
4071 u64 new_nodes[BTRFS_MAX_LEVEL];
4072};
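
/*
 * a rough field map, as used by __next_ref_path() below: extent_start
 * is the bytenr of the extent being traced, nodes[n] holds the bytenr
 * of the referencing tree block at level n on the current path toward
 * a root, and lowest_level/current_level/shared_level track the state
 * of the walk; node_keys[] and new_nodes[] are presumably filled in by
 * the relocation code once a complete path has been found
 */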
4073
4074struct disk_extent {
4075 u64 ram_bytes;
4076 u64 disk_bytenr;
4077 u64 disk_num_bytes;
4078 u64 offset;
4079 u64 num_bytes;
4080 u8 compression;
4081 u8 encryption;
4082 u16 other_encoding;
4083};
4084
4085static int is_cowonly_root(u64 root_objectid)
4086{
4087 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
4088 root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
4089 root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
4090 root_objectid == BTRFS_DEV_TREE_OBJECTID ||
4091 root_objectid == BTRFS_TREE_LOG_OBJECTID ||
4092 root_objectid == BTRFS_CSUM_TREE_OBJECTID)
4093 return 1;
4094 return 0;
4095}
4096
4097static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
4098 struct btrfs_root *extent_root,
4099 struct btrfs_ref_path *ref_path,
4100 int first_time)
4101{
4102 struct extent_buffer *leaf;
4103 struct btrfs_path *path;
4104 struct btrfs_extent_ref *ref;
4105 struct btrfs_key key;
4106 struct btrfs_key found_key;
4107 u64 bytenr;
4108 u32 nritems;
4109 int level;
4110 int ret = 1;
4111
4112 path = btrfs_alloc_path();
4113 if (!path)
4114 return -ENOMEM;
4115
4116 if (first_time) {
4117 ref_path->lowest_level = -1;
4118 ref_path->current_level = -1;
4119 ref_path->shared_level = -1;
4120 goto walk_up;
4121 }
4122walk_down:
4123 level = ref_path->current_level - 1;
4124 while (level >= -1) {
4125 u64 parent;
4126 if (level < ref_path->lowest_level)
4127 break;
4128
4129 if (level >= 0)
4130 bytenr = ref_path->nodes[level];
4131 else
4132 bytenr = ref_path->extent_start;
4133 BUG_ON(bytenr == 0);
4134
4135 parent = ref_path->nodes[level + 1];
4136 ref_path->nodes[level + 1] = 0;
4137 ref_path->current_level = level;
4138 BUG_ON(parent == 0);
4139
4140 key.objectid = bytenr;
4141 key.offset = parent + 1;
4142 key.type = BTRFS_EXTENT_REF_KEY;
4143
4144 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
4145 if (ret < 0)
4146 goto out;
4147 BUG_ON(ret == 0);
4148
4149 leaf = path->nodes[0];
4150 nritems = btrfs_header_nritems(leaf);
4151 if (path->slots[0] >= nritems) {
4152 ret = btrfs_next_leaf(extent_root, path);
4153 if (ret < 0)
4154 goto out;
4155 if (ret > 0)
4156 goto next;
4157 leaf = path->nodes[0];
4158 }
4159
4160 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4161 if (found_key.objectid == bytenr &&
4162 found_key.type == BTRFS_EXTENT_REF_KEY) {
4163 if (level < ref_path->shared_level)
4164 ref_path->shared_level = level;
4165 goto found;
4166 }
4167next:
4168 level--;
4169 btrfs_release_path(extent_root, path);
4170 cond_resched();
4171 }
4172 /* reached lowest level */
4173 ret = 1;
4174 goto out;
4175walk_up:
4176 level = ref_path->current_level;
4177 while (level < BTRFS_MAX_LEVEL - 1) {
4178 u64 ref_objectid;
4179
4180 if (level >= 0)
4181 bytenr = ref_path->nodes[level];
4182 else
4183 bytenr = ref_path->extent_start;
4184
4185 BUG_ON(bytenr == 0);
4186
4187 key.objectid = bytenr;
4188 key.offset = 0;
4189 key.type = BTRFS_EXTENT_REF_KEY;
4190
4191 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
4192 if (ret < 0)
4193 goto out;
4194
4195 leaf = path->nodes[0];
4196 nritems = btrfs_header_nritems(leaf);
4197 if (path->slots[0] >= nritems) {
4198 ret = btrfs_next_leaf(extent_root, path);
4199 if (ret < 0)
4200 goto out;
4201 if (ret > 0) {
4202 /* the extent was freed by someone */
4203 if (ref_path->lowest_level == level)
4204 goto out;
4205 btrfs_release_path(extent_root, path);
4206 goto walk_down;
4207 }
4208 leaf = path->nodes[0];
4209 }
4210
4211 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4212 if (found_key.objectid != bytenr ||
4213 found_key.type != BTRFS_EXTENT_REF_KEY) {
4214 /* the extent was freed by someone */
4215 if (ref_path->lowest_level == level) {
4216 ret = 1;
4217 goto out;
4218 }
4219 btrfs_release_path(extent_root, path);
4220 goto walk_down;
4221 }
4222found:
4223 ref = btrfs_item_ptr(leaf, path->slots[0],
4224 struct btrfs_extent_ref);
4225 ref_objectid = btrfs_ref_objectid(leaf, ref);
4226 if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
4227 if (first_time) {
4228 level = (int)ref_objectid;
4229 BUG_ON(level >= BTRFS_MAX_LEVEL);
4230 ref_path->lowest_level = level;
4231 ref_path->current_level = level;
4232 ref_path->nodes[level] = bytenr;
4233 } else {
4234 WARN_ON(ref_objectid != level);
4235 }
4236 } else {
4237 WARN_ON(level != -1);
4238 }
4239 first_time = 0;
4240
4241 if (ref_path->lowest_level == level) {
4242 ref_path->owner_objectid = ref_objectid;
4243 ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
4244 }
4245
4246 /*
4247 * the block is a tree root or the block isn't in a
4248 * reference counted tree.
4249 */
4250 if (found_key.objectid == found_key.offset ||
4251 is_cowonly_root(btrfs_ref_root(leaf, ref))) {
4252 ref_path->root_objectid = btrfs_ref_root(leaf, ref);
4253 ref_path->root_generation =
4254 btrfs_ref_generation(leaf, ref);
4255 if (level < 0) {
4256 /* special reference from the tree log */
4257 ref_path->nodes[0] = found_key.offset;
4258 ref_path->current_level = 0;
4259 }
4260 ret = 0;
4261 goto out;
4262 }
4263
4264 level++;
4265 BUG_ON(ref_path->nodes[level] != 0);
4266 ref_path->nodes[level] = found_key.offset;
4267 ref_path->current_level = level;
4268
4269 /*
4270 * the reference was created in the running transaction,
4271 * no need to continue walking up.
4272 */
4273 if (btrfs_ref_generation(leaf, ref) == trans->transid) {
4274 ref_path->root_objectid = btrfs_ref_root(leaf, ref);
4275 ref_path->root_generation =
4276 btrfs_ref_generation(leaf, ref);
4277 ret = 0;
4278 goto out;
4279 }
4280
4281 btrfs_release_path(extent_root, path);
4282 cond_resched();
4283 }
4284 /* reached max tree level, but no tree root found. */
4285 BUG();
4286out:
4287 btrfs_free_path(path);
4288 return ret;
4289}
4290
4291static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
4292 struct btrfs_root *extent_root,
4293 struct btrfs_ref_path *ref_path,
4294 u64 extent_start)
4295{
4296 memset(ref_path, 0, sizeof(*ref_path));
4297 ref_path->extent_start = extent_start;
4298
4299 return __next_ref_path(trans, extent_root, ref_path, 1);
4300}
4301
4302static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
4303 struct btrfs_root *extent_root,
4304 struct btrfs_ref_path *ref_path)
4305{
4306 return __next_ref_path(trans, extent_root, ref_path, 0);
4307}
4308
4309static noinline int get_new_locations(struct inode *reloc_inode,
4310 struct btrfs_key *extent_key,
4311 u64 offset, int no_fragment,
4312 struct disk_extent **extents,
4313 int *nr_extents)
4314{
4315 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
4316 struct btrfs_path *path;
4317 struct btrfs_file_extent_item *fi;
4318 struct extent_buffer *leaf;
4319 struct disk_extent *exts = *extents;
4320 struct btrfs_key found_key;
4321 u64 cur_pos;
4322 u64 last_byte;
4323 u32 nritems;
4324 int nr = 0;
4325 int max = *nr_extents;
4326 int ret;
4327
4328 WARN_ON(!no_fragment && *extents);
4329 if (!exts) {
4330 max = 1;
4331 exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
4332 if (!exts)
4333 return -ENOMEM;
4334 }
4335
4336 path = btrfs_alloc_path();
4337 BUG_ON(!path);
4338
4339 cur_pos = extent_key->objectid - offset;
4340 last_byte = extent_key->objectid + extent_key->offset;
4341 ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
4342 cur_pos, 0);
4343 if (ret < 0)
4344 goto out;
4345 if (ret > 0) {
4346 ret = -ENOENT;
4347 goto out;
4348 }
4349
4350 while (1) {
4351 leaf = path->nodes[0];
4352 nritems = btrfs_header_nritems(leaf);
4353 if (path->slots[0] >= nritems) {
4354 ret = btrfs_next_leaf(root, path);
4355 if (ret < 0)
4356 goto out;
4357 if (ret > 0)
4358 break;
4359 leaf = path->nodes[0];
4360 }
4361
4362 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4363 if (found_key.offset != cur_pos ||
4364 found_key.type != BTRFS_EXTENT_DATA_KEY ||
4365 found_key.objectid != reloc_inode->i_ino)
4366 break;
4367
4368 fi = btrfs_item_ptr(leaf, path->slots[0],
4369 struct btrfs_file_extent_item);
4370 if (btrfs_file_extent_type(leaf, fi) !=
4371 BTRFS_FILE_EXTENT_REG ||
4372 btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
4373 break;
4374
4375 if (nr == max) {
4376 struct disk_extent *old = exts;
4377 max *= 2;
4378 exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
 if (!exts) {
 	/* restore old so the out: path can free it */
 	exts = old;
 	ret = -ENOMEM;
 	goto out;
 }
4379 memcpy(exts, old, sizeof(*exts) * nr);
4380 if (old != *extents)
4381 kfree(old);
4382 }
4383
4384 exts[nr].disk_bytenr =
4385 btrfs_file_extent_disk_bytenr(leaf, fi);
4386 exts[nr].disk_num_bytes =
4387 btrfs_file_extent_disk_num_bytes(leaf, fi);
4388 exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
4389 exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4390 exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
4391 exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
4392 exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
4393 exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
4394 fi);
4395 BUG_ON(exts[nr].offset > 0);
4396 BUG_ON(exts[nr].compression || exts[nr].encryption);
4397 BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
4398
4399 cur_pos += exts[nr].num_bytes;
4400 nr++;
4401
4402 if (cur_pos + offset >= last_byte)
4403 break;
4404
4405 if (no_fragment) {
4406 ret = 1;
4407 goto out;
4408 }
4409 path->slots[0]++;
4410 }
4411
4412 BUG_ON(cur_pos + offset > last_byte);
4413 if (cur_pos + offset < last_byte) {
4414 ret = -ENOENT;
4415 goto out;
4416 }
4417 ret = 0;
4418out:
4419 btrfs_free_path(path);
4420 if (ret) {
4421 if (exts != *extents)
4422 kfree(exts);
4423 } else {
4424 *extents = exts;
4425 *nr_extents = nr;
4426 }
4427 return ret;
4428}
4429
4430static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
4431 struct btrfs_root *root,
4432 struct btrfs_path *path,
4433 struct btrfs_key *extent_key,
4434 struct btrfs_key *leaf_key,
4435 struct btrfs_ref_path *ref_path,
4436 struct disk_extent *new_extents,
4437 int nr_extents)
4438{
4439 struct extent_buffer *leaf;
4440 struct btrfs_file_extent_item *fi;
4441 struct inode *inode = NULL;
4442 struct btrfs_key key;
4443 u64 lock_start = 0;
4444 u64 lock_end = 0;
4445 u64 num_bytes;
4446 u64 ext_offset;
4447 u64 first_pos;
4448 u32 nritems;
4449 int nr_scaned = 0;
4450 int extent_locked = 0;
4451 int extent_type;
4452 int ret;
4453
4454 memcpy(&key, leaf_key, sizeof(key));
4455 first_pos = INT_LIMIT(loff_t) - extent_key->offset;
4456 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4457 if (key.objectid < ref_path->owner_objectid ||
4458 (key.objectid == ref_path->owner_objectid &&
4459 key.type < BTRFS_EXTENT_DATA_KEY)) {
4460 key.objectid = ref_path->owner_objectid;
4461 key.type = BTRFS_EXTENT_DATA_KEY;
4462 key.offset = 0;
4463 }
4464 }
4465
4466 while (1) {
4467 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4468 if (ret < 0)
4469 goto out;
4470
4471 leaf = path->nodes[0];
4472 nritems = btrfs_header_nritems(leaf);
4473next:
4474 if (extent_locked && ret > 0) {
4475 /*
4476 * the file extent item was modified by someone
4477 * before the extent got locked.
4478 */
4479 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4480 lock_end, GFP_NOFS);
4481 extent_locked = 0;
4482 }
4483
4484 if (path->slots[0] >= nritems) {
4485 if (++nr_scaned > 2)
4486 break;
4487
4488 BUG_ON(extent_locked);
4489 ret = btrfs_next_leaf(root, path);
4490 if (ret < 0)
4491 goto out;
4492 if (ret > 0)
4493 break;
4494 leaf = path->nodes[0];
4495 nritems = btrfs_header_nritems(leaf);
4496 }
4497
4498 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4499
4500 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
4501 if ((key.objectid > ref_path->owner_objectid) ||
4502 (key.objectid == ref_path->owner_objectid &&
4503 key.type > BTRFS_EXTENT_DATA_KEY) ||
4504 (key.offset >= first_pos + extent_key->offset))
4505 break;
4506 }
4507
4508 if (inode && key.objectid != inode->i_ino) {
4509 BUG_ON(extent_locked);
4510 btrfs_release_path(root, path);
4511 mutex_unlock(&inode->i_mutex);
4512 iput(inode);
4513 inode = NULL;
4514 continue;
4515 }
4516
4517 if (key.type != BTRFS_EXTENT_DATA_KEY) {
4518 path->slots[0]++;
4519 ret = 1;
4520 goto next;
4521 }
4522 fi = btrfs_item_ptr(leaf, path->slots[0],
4523 struct btrfs_file_extent_item);
4524 extent_type = btrfs_file_extent_type(leaf, fi);
4525 if ((extent_type != BTRFS_FILE_EXTENT_REG &&
4526 extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
4527 (btrfs_file_extent_disk_bytenr(leaf, fi) !=
4528 extent_key->objectid)) {
4529 path->slots[0]++;
4530 ret = 1;
4531 goto next;
4532 }
4533
4534 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4535 ext_offset = btrfs_file_extent_offset(leaf, fi);
4536
4537 if (first_pos > key.offset - ext_offset)
4538 first_pos = key.offset - ext_offset;
4539
4540 if (!extent_locked) {
4541 lock_start = key.offset;
4542 lock_end = lock_start + num_bytes - 1;
4543 } else {
4544 if (lock_start > key.offset ||
4545 lock_end + 1 < key.offset + num_bytes) {
4546 unlock_extent(&BTRFS_I(inode)->io_tree,
4547 lock_start, lock_end, GFP_NOFS);
4548 extent_locked = 0;
4549 }
4550 }
4551
4552 if (!inode) {
4553 btrfs_release_path(root, path);
4554
4555 inode = btrfs_iget_locked(root->fs_info->sb,
4556 key.objectid, root);
4557 if (inode->i_state & I_NEW) {
4558 BTRFS_I(inode)->root = root;
4559 BTRFS_I(inode)->location.objectid =
4560 key.objectid;
4561 BTRFS_I(inode)->location.type =
4562 BTRFS_INODE_ITEM_KEY;
4563 BTRFS_I(inode)->location.offset = 0;
4564 btrfs_read_locked_inode(inode);
4565 unlock_new_inode(inode);
4566 }
4567 /*
4568			 * some code calls btrfs_commit_transaction while
4569 * holding the i_mutex, so we can't use mutex_lock
4570 * here.
4571 */
4572 if (is_bad_inode(inode) ||
4573 !mutex_trylock(&inode->i_mutex)) {
4574 iput(inode);
4575 inode = NULL;
4576 key.offset = (u64)-1;
4577 goto skip;
4578 }
4579 }
4580
4581 if (!extent_locked) {
4582 struct btrfs_ordered_extent *ordered;
4583
4584 btrfs_release_path(root, path);
4585
4586 lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4587 lock_end, GFP_NOFS);
4588 ordered = btrfs_lookup_first_ordered_extent(inode,
4589 lock_end);
4590 if (ordered &&
4591 ordered->file_offset <= lock_end &&
4592 ordered->file_offset + ordered->len > lock_start) {
4593 unlock_extent(&BTRFS_I(inode)->io_tree,
4594 lock_start, lock_end, GFP_NOFS);
4595 btrfs_start_ordered_extent(inode, ordered, 1);
4596 btrfs_put_ordered_extent(ordered);
4597 key.offset += num_bytes;
4598 goto skip;
4599 }
4600 if (ordered)
4601 btrfs_put_ordered_extent(ordered);
4602
4603 extent_locked = 1;
4604 continue;
4605 }
4606
4607 if (nr_extents == 1) {
4608 /* update extent pointer in place */
4609 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4610 new_extents[0].disk_bytenr);
4611 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4612 new_extents[0].disk_num_bytes);
4613 btrfs_mark_buffer_dirty(leaf);
4614
4615 btrfs_drop_extent_cache(inode, key.offset,
4616 key.offset + num_bytes - 1, 0);
4617
4618 ret = btrfs_inc_extent_ref(trans, root,
4619 new_extents[0].disk_bytenr,
4620 new_extents[0].disk_num_bytes,
4621 leaf->start,
4622 root->root_key.objectid,
4623 trans->transid,
4624 key.objectid);
4625 BUG_ON(ret);
4626
4627 ret = btrfs_free_extent(trans, root,
4628 extent_key->objectid,
4629 extent_key->offset,
4630 leaf->start,
4631 btrfs_header_owner(leaf),
4632 btrfs_header_generation(leaf),
4633 key.objectid, 0);
4634 BUG_ON(ret);
4635
4636 btrfs_release_path(root, path);
4637 key.offset += num_bytes;
4638 } else {
4639 BUG_ON(1);
4640#if 0
4641 u64 alloc_hint;
4642 u64 extent_len;
4643 int i;
4644 /*
4645			 * drop the old extent pointer first, then insert the
4646			 * new pointers one by one
4647 */
4648 btrfs_release_path(root, path);
4649 ret = btrfs_drop_extents(trans, root, inode, key.offset,
4650 key.offset + num_bytes,
4651 key.offset, &alloc_hint);
4652 BUG_ON(ret);
4653
4654 for (i = 0; i < nr_extents; i++) {
4655 if (ext_offset >= new_extents[i].num_bytes) {
4656 ext_offset -= new_extents[i].num_bytes;
4657 continue;
4658 }
4659 extent_len = min(new_extents[i].num_bytes -
4660 ext_offset, num_bytes);
4661
4662 ret = btrfs_insert_empty_item(trans, root,
4663 path, &key,
4664 sizeof(*fi));
4665 BUG_ON(ret);
4666
4667 leaf = path->nodes[0];
4668 fi = btrfs_item_ptr(leaf, path->slots[0],
4669 struct btrfs_file_extent_item);
4670 btrfs_set_file_extent_generation(leaf, fi,
4671 trans->transid);
4672 btrfs_set_file_extent_type(leaf, fi,
4673 BTRFS_FILE_EXTENT_REG);
4674 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4675 new_extents[i].disk_bytenr);
4676 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4677 new_extents[i].disk_num_bytes);
4678 btrfs_set_file_extent_ram_bytes(leaf, fi,
4679 new_extents[i].ram_bytes);
4680
4681 btrfs_set_file_extent_compression(leaf, fi,
4682 new_extents[i].compression);
4683 btrfs_set_file_extent_encryption(leaf, fi,
4684 new_extents[i].encryption);
4685 btrfs_set_file_extent_other_encoding(leaf, fi,
4686 new_extents[i].other_encoding);
4687
4688 btrfs_set_file_extent_num_bytes(leaf, fi,
4689 extent_len);
4690 ext_offset += new_extents[i].offset;
4691 btrfs_set_file_extent_offset(leaf, fi,
4692 ext_offset);
4693 btrfs_mark_buffer_dirty(leaf);
4694
4695 btrfs_drop_extent_cache(inode, key.offset,
4696 key.offset + extent_len - 1, 0);
4697
4698 ret = btrfs_inc_extent_ref(trans, root,
4699 new_extents[i].disk_bytenr,
4700 new_extents[i].disk_num_bytes,
4701 leaf->start,
4702 root->root_key.objectid,
4703 trans->transid, key.objectid);
4704 BUG_ON(ret);
4705 btrfs_release_path(root, path);
4706
4707 inode_add_bytes(inode, extent_len);
4708
4709 ext_offset = 0;
4710 num_bytes -= extent_len;
4711 key.offset += extent_len;
4712
4713 if (num_bytes == 0)
4714 break;
4715 }
4716 BUG_ON(i >= nr_extents);
4717#endif
4718 }
4719
4720 if (extent_locked) {
4721 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4722 lock_end, GFP_NOFS);
4723 extent_locked = 0;
4724 }
4725skip:
4726 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
4727 key.offset >= first_pos + extent_key->offset)
4728 break;
4729
4730 cond_resched();
4731 }
4732 ret = 0;
4733out:
4734 btrfs_release_path(root, path);
4735 if (inode) {
4736 mutex_unlock(&inode->i_mutex);
4737 if (extent_locked) {
4738 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
4739 lock_end, GFP_NOFS);
4740 }
4741 iput(inode);
4742 }
4743 return ret;
4744}
4745
4746int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
4747 struct btrfs_root *root,
4748 struct extent_buffer *buf, u64 orig_start)
4749{
4750 int level;
4751 int ret;
4752
4753 BUG_ON(btrfs_header_generation(buf) != trans->transid);
4754 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
4755
4756 level = btrfs_header_level(buf);
4757 if (level == 0) {
4758 struct btrfs_leaf_ref *ref;
4759 struct btrfs_leaf_ref *orig_ref;
4760
4761 orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
4762 if (!orig_ref)
4763 return -ENOENT;
4764
4765 ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
4766 if (!ref) {
4767 btrfs_free_leaf_ref(root, orig_ref);
4768 return -ENOMEM;
4769 }
4770
4771 ref->nritems = orig_ref->nritems;
4772 memcpy(ref->extents, orig_ref->extents,
4773 sizeof(ref->extents[0]) * ref->nritems);
4774
4775 btrfs_free_leaf_ref(root, orig_ref);
4776
4777 ref->root_gen = trans->transid;
4778 ref->bytenr = buf->start;
4779 ref->owner = btrfs_header_owner(buf);
4780 ref->generation = btrfs_header_generation(buf);
4781 ret = btrfs_add_leaf_ref(root, ref, 0);
4782 WARN_ON(ret);
4783 btrfs_free_leaf_ref(root, ref);
4784 }
4785 return 0;
4786}
4787
4788static noinline int invalidate_extent_cache(struct btrfs_root *root,
4789 struct extent_buffer *leaf,
4790 struct btrfs_block_group_cache *group,
4791 struct btrfs_root *target_root)
4792{
4793 struct btrfs_key key;
4794 struct inode *inode = NULL;
4795 struct btrfs_file_extent_item *fi;
4796 u64 num_bytes;
4797 u64 skip_objectid = 0;
4798 u32 nritems;
4799 u32 i;
4800
4801 nritems = btrfs_header_nritems(leaf);
4802 for (i = 0; i < nritems; i++) {
4803 btrfs_item_key_to_cpu(leaf, &key, i);
4804 if (key.objectid == skip_objectid ||
4805 key.type != BTRFS_EXTENT_DATA_KEY)
4806 continue;
4807 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4808 if (btrfs_file_extent_type(leaf, fi) ==
4809 BTRFS_FILE_EXTENT_INLINE)
4810 continue;
4811 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
4812 continue;
4813 if (!inode || inode->i_ino != key.objectid) {
4814 iput(inode);
4815 inode = btrfs_ilookup(target_root->fs_info->sb,
4816 key.objectid, target_root, 1);
4817 }
4818 if (!inode) {
4819 skip_objectid = key.objectid;
4820 continue;
4821 }
4822 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4823
4824 lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
4825 key.offset + num_bytes - 1, GFP_NOFS);
4826 btrfs_drop_extent_cache(inode, key.offset,
4827 key.offset + num_bytes - 1, 1);
4828 unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
4829 key.offset + num_bytes - 1, GFP_NOFS);
4830 cond_resched();
4831 }
4832 iput(inode);
4833 return 0;
4834}
4835
4836static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
4837 struct btrfs_root *root,
4838 struct extent_buffer *leaf,
4839 struct btrfs_block_group_cache *group,
4840 struct inode *reloc_inode)
4841{
4842 struct btrfs_key key;
4843 struct btrfs_key extent_key;
4844 struct btrfs_file_extent_item *fi;
4845 struct btrfs_leaf_ref *ref;
4846 struct disk_extent *new_extent;
4847 u64 bytenr;
4848 u64 num_bytes;
4849 u32 nritems;
4850 u32 i;
4851 int ext_index;
4852 int nr_extent;
4853 int ret;
4854
4855 new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
4856 BUG_ON(!new_extent);
4857
4858 ref = btrfs_lookup_leaf_ref(root, leaf->start);
4859 BUG_ON(!ref);
4860
4861 ext_index = -1;
4862 nritems = btrfs_header_nritems(leaf);
4863 for (i = 0; i < nritems; i++) {
4864 btrfs_item_key_to_cpu(leaf, &key, i);
4865 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
4866 continue;
4867 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
4868 if (btrfs_file_extent_type(leaf, fi) ==
4869 BTRFS_FILE_EXTENT_INLINE)
4870 continue;
4871 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
4872 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
4873 if (bytenr == 0)
4874 continue;
4875
4876 ext_index++;
4877 if (bytenr >= group->key.objectid + group->key.offset ||
4878 bytenr + num_bytes <= group->key.objectid)
4879 continue;
4880
4881 extent_key.objectid = bytenr;
4882 extent_key.offset = num_bytes;
4883 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
4884 nr_extent = 1;
4885 ret = get_new_locations(reloc_inode, &extent_key,
4886 group->key.objectid, 1,
4887 &new_extent, &nr_extent);
4888 if (ret > 0)
4889 continue;
4890 BUG_ON(ret < 0);
4891
4892 BUG_ON(ref->extents[ext_index].bytenr != bytenr);
4893 BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
4894 ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
4895 ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
4896
4897 btrfs_set_file_extent_disk_bytenr(leaf, fi,
4898 new_extent->disk_bytenr);
4899 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
4900 new_extent->disk_num_bytes);
4901 btrfs_mark_buffer_dirty(leaf);
4902
4903 ret = btrfs_inc_extent_ref(trans, root,
4904 new_extent->disk_bytenr,
4905 new_extent->disk_num_bytes,
4906 leaf->start,
4907 root->root_key.objectid,
4908 trans->transid, key.objectid);
4909 BUG_ON(ret);
4910 ret = btrfs_free_extent(trans, root,
4911 bytenr, num_bytes, leaf->start,
4912 btrfs_header_owner(leaf),
4913 btrfs_header_generation(leaf),
4914 key.objectid, 0);
4915 BUG_ON(ret);
4916 cond_resched();
4917 }
4918 kfree(new_extent);
4919 BUG_ON(ext_index + 1 != ref->nritems);
4920 btrfs_free_leaf_ref(root, ref);
4921 return 0;
4922}
4923
4924int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
4925 struct btrfs_root *root)
4926{
4927 struct btrfs_root *reloc_root;
4928 int ret;
4929
4930 if (root->reloc_root) {
4931 reloc_root = root->reloc_root;
4932 root->reloc_root = NULL;
4933 list_add(&reloc_root->dead_list,
4934 &root->fs_info->dead_reloc_roots);
4935
4936 btrfs_set_root_bytenr(&reloc_root->root_item,
4937 reloc_root->node->start);
4938		btrfs_set_root_level(&reloc_root->root_item,
4939 btrfs_header_level(reloc_root->node));
4940 memset(&reloc_root->root_item.drop_progress, 0,
4941 sizeof(struct btrfs_disk_key));
4942 reloc_root->root_item.drop_level = 0;
4943
4944 ret = btrfs_update_root(trans, root->fs_info->tree_root,
4945 &reloc_root->root_key,
4946 &reloc_root->root_item);
4947 BUG_ON(ret);
4948 }
4949 return 0;
4950}
4951
4952int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
4953{
4954 struct btrfs_trans_handle *trans;
4955 struct btrfs_root *reloc_root;
4956 struct btrfs_root *prev_root = NULL;
4957 struct list_head dead_roots;
4958 int ret;
4959 unsigned long nr;
4960
4961 INIT_LIST_HEAD(&dead_roots);
4962 list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
4963
4964 while (!list_empty(&dead_roots)) {
4965 reloc_root = list_entry(dead_roots.prev,
4966 struct btrfs_root, dead_list);
4967 list_del_init(&reloc_root->dead_list);
4968
4969 BUG_ON(reloc_root->commit_root != NULL);
4970 while (1) {
4971 trans = btrfs_join_transaction(root, 1);
4972 BUG_ON(!trans);
4973
4974 mutex_lock(&root->fs_info->drop_mutex);
4975 ret = btrfs_drop_snapshot(trans, reloc_root);
4976 if (ret != -EAGAIN)
4977 break;
4978 mutex_unlock(&root->fs_info->drop_mutex);
4979
4980 nr = trans->blocks_used;
4981 ret = btrfs_end_transaction(trans, root);
4982 BUG_ON(ret);
4983 btrfs_btree_balance_dirty(root, nr);
4984 }
4985
4986 free_extent_buffer(reloc_root->node);
4987
4988 ret = btrfs_del_root(trans, root->fs_info->tree_root,
4989 &reloc_root->root_key);
4990 BUG_ON(ret);
4991 mutex_unlock(&root->fs_info->drop_mutex);
4992
4993 nr = trans->blocks_used;
4994 ret = btrfs_end_transaction(trans, root);
4995 BUG_ON(ret);
4996 btrfs_btree_balance_dirty(root, nr);
4997
4998 kfree(prev_root);
4999 prev_root = reloc_root;
5000 }
5001 if (prev_root) {
5002 btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
5003 kfree(prev_root);
5004 }
5005 return 0;
5006}
5007
5008int btrfs_add_dead_reloc_root(struct btrfs_root *root)
5009{
5010 list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
5011 return 0;
5012}
5013
5014int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
5015{
5016 struct btrfs_root *reloc_root;
5017 struct btrfs_trans_handle *trans;
5018 struct btrfs_key location;
5019 int found;
5020 int ret;
5021
5022 mutex_lock(&root->fs_info->tree_reloc_mutex);
5023 ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
5024 BUG_ON(ret);
5025 found = !list_empty(&root->fs_info->dead_reloc_roots);
5026 mutex_unlock(&root->fs_info->tree_reloc_mutex);
5027
5028 if (found) {
5029 trans = btrfs_start_transaction(root, 1);
5030 BUG_ON(!trans);
5031 ret = btrfs_commit_transaction(trans, root);
5032 BUG_ON(ret);
5033 }
5034
5035 location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
5036 location.offset = (u64)-1;
5037 location.type = BTRFS_ROOT_ITEM_KEY;
5038
5039 reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
5040 BUG_ON(!reloc_root);
5041 btrfs_orphan_cleanup(reloc_root);
5042 return 0;
5043}
5044
5045static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
5046 struct btrfs_root *root)
5047{
5048 struct btrfs_root *reloc_root;
5049 struct extent_buffer *eb;
5050 struct btrfs_root_item *root_item;
5051 struct btrfs_key root_key;
5052 int ret;
5053
5054 BUG_ON(!root->ref_cows);
5055 if (root->reloc_root)
5056 return 0;
5057
5058 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
5059 BUG_ON(!root_item);
5060
5061 ret = btrfs_copy_root(trans, root, root->commit_root,
5062 &eb, BTRFS_TREE_RELOC_OBJECTID);
5063 BUG_ON(ret);
5064
5065 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
5066 root_key.offset = root->root_key.objectid;
5067 root_key.type = BTRFS_ROOT_ITEM_KEY;
5068
5069	memcpy(root_item, &root->root_item, sizeof(*root_item));
5070 btrfs_set_root_refs(root_item, 0);
5071 btrfs_set_root_bytenr(root_item, eb->start);
5072 btrfs_set_root_level(root_item, btrfs_header_level(eb));
5073 btrfs_set_root_generation(root_item, trans->transid);
5074
5075 btrfs_tree_unlock(eb);
5076 free_extent_buffer(eb);
5077
5078 ret = btrfs_insert_root(trans, root->fs_info->tree_root,
5079 &root_key, root_item);
5080 BUG_ON(ret);
5081 kfree(root_item);
5082
5083 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
5084 &root_key);
5085 BUG_ON(!reloc_root);
5086 reloc_root->last_trans = trans->transid;
5087 reloc_root->commit_root = NULL;
5088 reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
5089
5090 root->reloc_root = reloc_root;
5091 return 0;
5092}
5093
5094/*
5095 * Core function of space balance.
5096 *
5097 * The idea is to use reloc trees to relocate tree blocks in reference
5098 * counted roots. There is one reloc tree for each subvol, and all
5099 * reloc trees share the same root key objectid. Reloc trees are
5100 * snapshots of the latest committed roots of subvols (root->commit_root).
5101 *
5102 * Relocating a tree block referenced by a subvol takes two steps:
5103 * COW the block through the subvol's reloc tree, then update the block
5104 * pointer in the subvol to point to the new block. Since all reloc trees
5105 * share the same root key objectid, special handling for tree blocks
5106 * owned by them is easy. Once a tree block has been COWed in one reloc
5107 * tree, we can use the resulting new block directly when the same block
5108 * must be COWed again through another reloc tree. In this way, relocated
5109 * tree blocks are shared between reloc trees, and so are also shared
5110 * between subvols.
5111 */
5112static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
5113 struct btrfs_root *root,
5114 struct btrfs_path *path,
5115 struct btrfs_key *first_key,
5116 struct btrfs_ref_path *ref_path,
5117 struct btrfs_block_group_cache *group,
5118 struct inode *reloc_inode)
5119{
5120 struct btrfs_root *reloc_root;
5121 struct extent_buffer *eb = NULL;
5122 struct btrfs_key *keys;
5123 u64 *nodes;
5124 int level;
5125 int shared_level;
5126 int lowest_level = 0;
5127 int ret;
5128
5129 if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
5130 lowest_level = ref_path->owner_objectid;
5131
5132 if (!root->ref_cows) {
5133 path->lowest_level = lowest_level;
5134 ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
5135 BUG_ON(ret < 0);
5136 path->lowest_level = 0;
5137 btrfs_release_path(root, path);
5138 return 0;
5139 }
5140
5141 mutex_lock(&root->fs_info->tree_reloc_mutex);
5142 ret = init_reloc_tree(trans, root);
5143 BUG_ON(ret);
5144 reloc_root = root->reloc_root;
5145
5146 shared_level = ref_path->shared_level;
5147 ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
5148
5149 keys = ref_path->node_keys;
5150 nodes = ref_path->new_nodes;
5151 memset(&keys[shared_level + 1], 0,
5152 sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
5153 memset(&nodes[shared_level + 1], 0,
5154 sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
5155
5156 if (nodes[lowest_level] == 0) {
5157 path->lowest_level = lowest_level;
5158 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
5159 0, 1);
5160 BUG_ON(ret);
5161 for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
5162 eb = path->nodes[level];
5163 if (!eb || eb == reloc_root->node)
5164 break;
5165 nodes[level] = eb->start;
5166 if (level == 0)
5167 btrfs_item_key_to_cpu(eb, &keys[level], 0);
5168 else
5169 btrfs_node_key_to_cpu(eb, &keys[level], 0);
5170 }
5171 if (nodes[0] &&
5172 ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5173 eb = path->nodes[0];
5174 ret = replace_extents_in_leaf(trans, reloc_root, eb,
5175 group, reloc_inode);
5176 BUG_ON(ret);
5177 }
5178 btrfs_release_path(reloc_root, path);
5179 } else {
5180 ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
5181 lowest_level);
5182 BUG_ON(ret);
5183 }
5184
5185 /*
5186 * replace tree blocks in the fs tree with tree blocks in
5187 * the reloc tree.
5188 */
5189 ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
5190 BUG_ON(ret < 0);
5191
5192 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5193 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
5194 0, 0);
5195 BUG_ON(ret);
5196 extent_buffer_get(path->nodes[0]);
5197 eb = path->nodes[0];
5198 btrfs_release_path(reloc_root, path);
5199 ret = invalidate_extent_cache(reloc_root, eb, group, root);
5200 BUG_ON(ret);
5201 free_extent_buffer(eb);
5202 }
5203
5204 mutex_unlock(&root->fs_info->tree_reloc_mutex);
5205 path->lowest_level = 0;
5206 return 0;
5207}
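
/*
 * A condensed sketch of the flow relocate_one_path() implements, assuming
 * a caller that already holds 'trans', a reference counted 'root', and
 * node_keys/new_nodes arrays describing the shared path (locking, the
 * leaf-replacement pass and error handling are omitted):
 */
static int sketch_relocate_path(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_key *keys, u64 *nodes,
				int lowest_level)
{
	int ret;

	/* make sure the subvol has a reloc tree snapshot */
	ret = init_reloc_tree(trans, root);
	if (ret)
		return ret;

	/* step 1: COW the blocks through the subvol's reloc tree */
	ret = btrfs_merge_path(trans, root->reloc_root, keys, nodes,
			       lowest_level);
	if (ret)
		return ret;

	/* step 2: point the subvol itself at the relocated blocks */
	return btrfs_merge_path(trans, root, keys, nodes, lowest_level);
}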
5208
5209static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
5210 struct btrfs_root *root,
5211 struct btrfs_path *path,
5212 struct btrfs_key *first_key,
5213 struct btrfs_ref_path *ref_path)
5214{
5215 int ret;
5216
5217 ret = relocate_one_path(trans, root, path, first_key,
5218 ref_path, NULL, NULL);
5219 BUG_ON(ret);
5220
5221 if (root == root->fs_info->extent_root)
5222 btrfs_extent_post_op(trans, root);
5223
5224 return 0;
5225}
5226
5227static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
5228 struct btrfs_root *extent_root,
5229 struct btrfs_path *path,
5230 struct btrfs_key *extent_key)
5231{
5232 int ret;
5233
5234 ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
5235 if (ret)
5236 goto out;
5237 ret = btrfs_del_item(trans, extent_root, path);
5238out:
5239 btrfs_release_path(extent_root, path);
5240 return ret;
5241}
5242
5243static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
5244 struct btrfs_ref_path *ref_path)
5245{
5246 struct btrfs_key root_key;
5247
5248 root_key.objectid = ref_path->root_objectid;
5249 root_key.type = BTRFS_ROOT_ITEM_KEY;
5250 if (is_cowonly_root(ref_path->root_objectid))
5251 root_key.offset = 0;
5252 else
5253 root_key.offset = (u64)-1;
5254
5255 return btrfs_read_fs_root_no_name(fs_info, &root_key);
5256}
5257
5258static noinline int relocate_one_extent(struct btrfs_root *extent_root,
5259 struct btrfs_path *path,
5260 struct btrfs_key *extent_key,
5261 struct btrfs_block_group_cache *group,
5262 struct inode *reloc_inode, int pass)
5263{
5264 struct btrfs_trans_handle *trans;
5265 struct btrfs_root *found_root;
5266 struct btrfs_ref_path *ref_path = NULL;
5267 struct disk_extent *new_extents = NULL;
5268 int nr_extents = 0;
5269 int loops;
5270 int ret;
5271 int level;
5272 struct btrfs_key first_key;
5273 u64 prev_block = 0;
5274
5275
5276 trans = btrfs_start_transaction(extent_root, 1);
5277 BUG_ON(!trans);
5278
5279 if (extent_key->objectid == 0) {
5280 ret = del_extent_zero(trans, extent_root, path, extent_key);
5281 goto out;
5282 }
5283
5284 ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
5285 if (!ref_path) {
5286 ret = -ENOMEM;
5287 goto out;
5288 }
5289
5290 for (loops = 0; ; loops++) {
5291 if (loops == 0) {
5292 ret = btrfs_first_ref_path(trans, extent_root, ref_path,
5293 extent_key->objectid);
5294 } else {
5295 ret = btrfs_next_ref_path(trans, extent_root, ref_path);
5296 }
5297 if (ret < 0)
5298 goto out;
5299 if (ret > 0)
5300 break;
5301
5302 if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
5303 ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
5304 continue;
5305
5306 found_root = read_ref_root(extent_root->fs_info, ref_path);
5307 BUG_ON(!found_root);
5308 /*
5309		 * for reference counted trees, only process reference paths
5310 * rooted at the latest committed root.
5311 */
5312 if (found_root->ref_cows &&
5313 ref_path->root_generation != found_root->root_key.offset)
5314 continue;
5315
5316 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5317 if (pass == 0) {
5318 /*
5319 * copy data extents to new locations
5320 */
5321 u64 group_start = group->key.objectid;
5322 ret = relocate_data_extent(reloc_inode,
5323 extent_key,
5324 group_start);
5325 if (ret < 0)
5326 goto out;
5327 break;
5328 }
5329 level = 0;
5330 } else {
5331 level = ref_path->owner_objectid;
5332 }
5333
5334 if (prev_block != ref_path->nodes[level]) {
5335 struct extent_buffer *eb;
5336 u64 block_start = ref_path->nodes[level];
5337 u64 block_size = btrfs_level_size(found_root, level);
5338
5339 eb = read_tree_block(found_root, block_start,
5340 block_size, 0);
5341 btrfs_tree_lock(eb);
5342 BUG_ON(level != btrfs_header_level(eb));
5343
5344 if (level == 0)
5345 btrfs_item_key_to_cpu(eb, &first_key, 0);
5346 else
5347 btrfs_node_key_to_cpu(eb, &first_key, 0);
5348
5349 btrfs_tree_unlock(eb);
5350 free_extent_buffer(eb);
5351 prev_block = block_start;
5352 }
5353
5354 btrfs_record_root_in_trans(found_root);
5355 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5356 /*
5357 * try to update data extent references while
5358 * keeping metadata shared between snapshots.
5359 */
5360 if (pass == 1) {
5361 ret = relocate_one_path(trans, found_root,
5362 path, &first_key, ref_path,
5363 group, reloc_inode);
5364 if (ret < 0)
5365 goto out;
5366 continue;
5367 }
5368 /*
5369			 * use the fallback method to process the remaining
5370 * references.
5371 */
5372 if (!new_extents) {
5373 u64 group_start = group->key.objectid;
5374 new_extents = kmalloc(sizeof(*new_extents),
5375 GFP_NOFS);
5376 nr_extents = 1;
5377 ret = get_new_locations(reloc_inode,
5378 extent_key,
5379 group_start, 1,
5380 &new_extents,
5381 &nr_extents);
5382 if (ret)
5383 goto out;
5384 }
5385 ret = replace_one_extent(trans, found_root,
5386 path, extent_key,
5387 &first_key, ref_path,
5388 new_extents, nr_extents);
5389 } else {
5390 ret = relocate_tree_block(trans, found_root, path,
5391 &first_key, ref_path);
5392 }
5393 if (ret < 0)
5394 goto out;
5395 }
5396 ret = 0;
5397out:
5398 btrfs_end_transaction(trans, extent_root);
5399 kfree(new_extents);
5400 kfree(ref_path);
5401 return ret;
5402}
5403
5404static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
5405{
5406 u64 num_devices;
5407 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
5408 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
5409
5410 num_devices = root->fs_info->fs_devices->rw_devices;
5411 if (num_devices == 1) {
5412 stripped |= BTRFS_BLOCK_GROUP_DUP;
5413 stripped = flags & ~stripped;
5414
5415 /* turn raid0 into single device chunks */
5416 if (flags & BTRFS_BLOCK_GROUP_RAID0)
5417 return stripped;
5418
5419 /* turn mirroring into duplication */
5420 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
5421 BTRFS_BLOCK_GROUP_RAID10))
5422 return stripped | BTRFS_BLOCK_GROUP_DUP;
5423 return flags;
5424 } else {
5425 /* they already had raid on here, just return */
5426 if (flags & stripped)
5427 return flags;
5428
5429 stripped |= BTRFS_BLOCK_GROUP_DUP;
5430 stripped = flags & ~stripped;
5431
5432 /* switch duplicated blocks with raid1 */
5433 if (flags & BTRFS_BLOCK_GROUP_DUP)
5434 return stripped | BTRFS_BLOCK_GROUP_RAID1;
5435
5436 /* turn single device chunks into raid0 */
5437 return stripped | BTRFS_BLOCK_GROUP_RAID0;
5438 }
5439 return flags;
5440}
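
/*
 * A short usage sketch of the conversions above (hypothetical caller; the
 * result depends on fs_devices->rw_devices at call time):
 */
static void sketch_flag_conversion(struct btrfs_root *root)
{
	u64 f1, f2;

	/* with a single rw device, mirroring degrades to duplication:
	 * f1 == BTRFS_BLOCK_GROUP_DUP */
	f1 = update_block_group_flags(root, BTRFS_BLOCK_GROUP_RAID1);

	/* with several rw devices, duplication is promoted to raid1:
	 * f2 == BTRFS_BLOCK_GROUP_RAID1 */
	f2 = update_block_group_flags(root, BTRFS_BLOCK_GROUP_DUP);
}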
5441
5442static int __alloc_chunk_for_shrink(struct btrfs_root *root,
5443 struct btrfs_block_group_cache *shrink_block_group,
5444 int force)
5445{
5446 struct btrfs_trans_handle *trans;
5447 u64 new_alloc_flags;
5448 u64 calc;
5449
5450 spin_lock(&shrink_block_group->lock);
5451 if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
5452 spin_unlock(&shrink_block_group->lock);
5453
5454 trans = btrfs_start_transaction(root, 1);
5455 spin_lock(&shrink_block_group->lock);
5456
5457 new_alloc_flags = update_block_group_flags(root,
5458 shrink_block_group->flags);
5459 if (new_alloc_flags != shrink_block_group->flags) {
5460 calc =
5461 btrfs_block_group_used(&shrink_block_group->item);
5462 } else {
5463 calc = shrink_block_group->key.offset;
5464 }
5465 spin_unlock(&shrink_block_group->lock);
5466
5467 do_chunk_alloc(trans, root->fs_info->extent_root,
5468 calc + 2 * 1024 * 1024, new_alloc_flags, force);
5469
5470 btrfs_end_transaction(trans, root);
5471 } else
5472 spin_unlock(&shrink_block_group->lock);
5473 return 0;
5474}
5475
5476static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
5477 struct btrfs_root *root,
5478 u64 objectid, u64 size)
5479{
5480 struct btrfs_path *path;
5481 struct btrfs_inode_item *item;
5482 struct extent_buffer *leaf;
5483 int ret;
5484
5485 path = btrfs_alloc_path();
5486 if (!path)
5487 return -ENOMEM;
5488
5489 ret = btrfs_insert_empty_inode(trans, root, path, objectid);
5490 if (ret)
5491 goto out;
5492
5493 leaf = path->nodes[0];
5494 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
5495 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
5496 btrfs_set_inode_generation(leaf, item, 1);
5497 btrfs_set_inode_size(leaf, item, size);
5498 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
5499 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
5500 btrfs_mark_buffer_dirty(leaf);
5501 btrfs_release_path(root, path);
5502out:
5503 btrfs_free_path(path);
5504 return ret;
5505}
5506
5507static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
5508 struct btrfs_block_group_cache *group)
5509{
5510 struct inode *inode = NULL;
5511 struct btrfs_trans_handle *trans;
5512 struct btrfs_root *root;
5513 struct btrfs_key root_key;
5514 u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
5515 int err = 0;
5516
5517 root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
5518 root_key.type = BTRFS_ROOT_ITEM_KEY;
5519 root_key.offset = (u64)-1;
5520 root = btrfs_read_fs_root_no_name(fs_info, &root_key);
5521 if (IS_ERR(root))
5522 return ERR_CAST(root);
5523
5524 trans = btrfs_start_transaction(root, 1);
5525 BUG_ON(!trans);
5526
5527 err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
5528 if (err)
5529 goto out;
5530
5531 err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
5532 BUG_ON(err);
5533
5534 err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
5535 group->key.offset, 0, group->key.offset,
5536 0, 0, 0);
5537 BUG_ON(err);
5538
5539 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
5540 if (inode->i_state & I_NEW) {
5541 BTRFS_I(inode)->root = root;
5542 BTRFS_I(inode)->location.objectid = objectid;
5543 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
5544 BTRFS_I(inode)->location.offset = 0;
5545 btrfs_read_locked_inode(inode);
5546 unlock_new_inode(inode);
5547 BUG_ON(is_bad_inode(inode));
5548 } else {
5549 BUG_ON(1);
5550 }
5551 BTRFS_I(inode)->index_cnt = group->key.objectid;
5552
5553 err = btrfs_orphan_add(trans, inode);
5554out:
5555 btrfs_end_transaction(trans, root);
5556 if (err) {
5557 if (inode)
5558 iput(inode);
5559 inode = ERR_PTR(err);
5560 }
5561 return inode;
5562}
5563
5564int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
5565{
5566
5567 struct btrfs_ordered_sum *sums;
5568 struct btrfs_sector_sum *sector_sum;
5569 struct btrfs_ordered_extent *ordered;
5570 struct btrfs_root *root = BTRFS_I(inode)->root;
5571 struct list_head list;
5572 size_t offset;
5573 int ret;
5574 u64 disk_bytenr;
5575
5576 INIT_LIST_HEAD(&list);
5577
5578 ordered = btrfs_lookup_ordered_extent(inode, file_pos);
5579 BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
5580
5581 disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
5582 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
5583 disk_bytenr + len - 1, &list);
5584
5585 while (!list_empty(&list)) {
5586 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
5587 list_del_init(&sums->list);
5588
5589 sector_sum = sums->sums;
5590 sums->bytenr = ordered->start;
5591
5592 offset = 0;
5593 while (offset < sums->len) {
5594 sector_sum->bytenr += ordered->start - disk_bytenr;
5595 sector_sum++;
5596 offset += root->sectorsize;
5597 }
5598
5599 btrfs_add_ordered_sum(inode, ordered, sums);
5600 }
5601 btrfs_put_ordered_extent(ordered);
5602 return 0;
5603}
5604
5605int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
5606{
5607 struct btrfs_trans_handle *trans;
5608 struct btrfs_path *path;
5609 struct btrfs_fs_info *info = root->fs_info;
5610 struct extent_buffer *leaf;
5611 struct inode *reloc_inode;
5612 struct btrfs_block_group_cache *block_group;
5613 struct btrfs_key key;
5614 u64 skipped;
5615 u64 cur_byte;
5616 u64 total_found;
5617 u32 nritems;
5618 int ret;
5619 int progress;
5620 int pass = 0;
5621
5622 root = root->fs_info->extent_root;
5623
5624 block_group = btrfs_lookup_block_group(info, group_start);
5625 BUG_ON(!block_group);
5626
5627 printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
5628 (unsigned long long)block_group->key.objectid,
5629 (unsigned long long)block_group->flags);
5630
5631 path = btrfs_alloc_path();
5632 BUG_ON(!path);
5633
5634 reloc_inode = create_reloc_inode(info, block_group);
5635 BUG_ON(IS_ERR(reloc_inode));
5636
5637 __alloc_chunk_for_shrink(root, block_group, 1);
5638 set_block_group_readonly(block_group);
5639
5640 btrfs_start_delalloc_inodes(info->tree_root);
5641 btrfs_wait_ordered_extents(info->tree_root, 0);
5642again:
5643 skipped = 0;
5644 total_found = 0;
5645 progress = 0;
5646 key.objectid = block_group->key.objectid;
5647 key.offset = 0;
5648 key.type = 0;
5649 cur_byte = key.objectid;
5650
5651 trans = btrfs_start_transaction(info->tree_root, 1);
5652 btrfs_commit_transaction(trans, info->tree_root);
5653
5654 mutex_lock(&root->fs_info->cleaner_mutex);
5655 btrfs_clean_old_snapshots(info->tree_root);
5656 btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
5657 mutex_unlock(&root->fs_info->cleaner_mutex);
5658
5659 while (1) {
5660 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5661 if (ret < 0)
5662 goto out;
5663next:
5664 leaf = path->nodes[0];
5665 nritems = btrfs_header_nritems(leaf);
5666 if (path->slots[0] >= nritems) {
5667 ret = btrfs_next_leaf(root, path);
5668 if (ret < 0)
5669 goto out;
5670 if (ret == 1) {
5671 ret = 0;
5672 break;
5673 }
5674 leaf = path->nodes[0];
5675 nritems = btrfs_header_nritems(leaf);
5676 }
5677
5678 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5679
5680 if (key.objectid >= block_group->key.objectid +
5681 block_group->key.offset)
5682 break;
5683
5684 if (progress && need_resched()) {
5685 btrfs_release_path(root, path);
5686 cond_resched();
5687 progress = 0;
5688 continue;
5689 }
5690 progress = 1;
5691
5692 if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
5693 key.objectid + key.offset <= cur_byte) {
5694 path->slots[0]++;
5695 goto next;
5696 }
5697
5698 total_found++;
5699 cur_byte = key.objectid + key.offset;
5700 btrfs_release_path(root, path);
5701
5702 __alloc_chunk_for_shrink(root, block_group, 0);
5703 ret = relocate_one_extent(root, path, &key, block_group,
5704 reloc_inode, pass);
5705 BUG_ON(ret < 0);
5706 if (ret > 0)
5707 skipped++;
5708
5709 key.objectid = cur_byte;
5710 key.type = 0;
5711 key.offset = 0;
5712 }
5713
5714 btrfs_release_path(root, path);
5715
5716 if (pass == 0) {
5717 btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
5718 invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
5719 }
5720
5721 if (total_found > 0) {
5722 printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
5723 (unsigned long long)total_found, pass);
5724 pass++;
5725 if (total_found == skipped && pass > 2) {
5726 iput(reloc_inode);
5727 reloc_inode = create_reloc_inode(info, block_group);
5728 pass = 0;
5729 }
5730 goto again;
5731 }
5732
5733 /* delete reloc_inode */
5734 iput(reloc_inode);
5735
5736 /* unpin extents in this range */
5737 trans = btrfs_start_transaction(info->tree_root, 1);
5738 btrfs_commit_transaction(trans, info->tree_root);
5739
5740 spin_lock(&block_group->lock);
5741 WARN_ON(block_group->pinned > 0);
5742 WARN_ON(block_group->reserved > 0);
5743 WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
5744 spin_unlock(&block_group->lock);
5745 put_block_group(block_group);
5746 ret = 0;
5747out:
5748 btrfs_free_path(path);
5749 return ret;
5750}
5751
5752static int find_first_block_group(struct btrfs_root *root,
5753 struct btrfs_path *path, struct btrfs_key *key)
5754{
5755 int ret = 0;
5756 struct btrfs_key found_key;
5757 struct extent_buffer *leaf;
5758 int slot;
5759
5760 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
5761 if (ret < 0)
5762 goto out;
5763
5764 while (1) {
5765 slot = path->slots[0];
5766 leaf = path->nodes[0];
5767 if (slot >= btrfs_header_nritems(leaf)) {
5768 ret = btrfs_next_leaf(root, path);
5769 if (ret == 0)
5770 continue;
5771 if (ret < 0)
5772 goto out;
5773 break;
5774 }
5775 btrfs_item_key_to_cpu(leaf, &found_key, slot);
5776
5777 if (found_key.objectid >= key->objectid &&
5778 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
5779 ret = 0;
5780 goto out;
5781 }
5782 path->slots[0]++;
5783 }
5784 ret = -ENOENT;
5785out:
5786 return ret;
5787}
5788
5789int btrfs_free_block_groups(struct btrfs_fs_info *info)
5790{
5791 struct btrfs_block_group_cache *block_group;
5792 struct rb_node *n;
5793
5794 spin_lock(&info->block_group_cache_lock);
5795 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
5796 block_group = rb_entry(n, struct btrfs_block_group_cache,
5797 cache_node);
5798 rb_erase(&block_group->cache_node,
5799 &info->block_group_cache_tree);
5800 spin_unlock(&info->block_group_cache_lock);
5801
5802 btrfs_remove_free_space_cache(block_group);
5803 down_write(&block_group->space_info->groups_sem);
5804 list_del(&block_group->list);
5805 up_write(&block_group->space_info->groups_sem);
5806
5807 WARN_ON(atomic_read(&block_group->count) != 1);
5808 kfree(block_group);
5809
5810 spin_lock(&info->block_group_cache_lock);
5811 }
5812 spin_unlock(&info->block_group_cache_lock);
5813 return 0;
5814}
5815
5816int btrfs_read_block_groups(struct btrfs_root *root)
5817{
5818 struct btrfs_path *path;
5819 int ret;
5820 struct btrfs_block_group_cache *cache;
5821 struct btrfs_fs_info *info = root->fs_info;
5822 struct btrfs_space_info *space_info;
5823 struct btrfs_key key;
5824 struct btrfs_key found_key;
5825 struct extent_buffer *leaf;
5826
5827 root = info->extent_root;
5828 key.objectid = 0;
5829 key.offset = 0;
5830 btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
5831 path = btrfs_alloc_path();
5832 if (!path)
5833 return -ENOMEM;
5834
5835 while (1) {
5836 ret = find_first_block_group(root, path, &key);
5837 if (ret > 0) {
5838 ret = 0;
5839 goto error;
5840 }
5841 if (ret != 0)
5842 goto error;
5843
5844 leaf = path->nodes[0];
5845 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5846 cache = kzalloc(sizeof(*cache), GFP_NOFS);
5847 if (!cache) {
5848 ret = -ENOMEM;
5849 break;
5850 }
5851
5852 atomic_set(&cache->count, 1);
5853 spin_lock_init(&cache->lock);
5854 mutex_init(&cache->alloc_mutex);
5855 mutex_init(&cache->cache_mutex);
5856 INIT_LIST_HEAD(&cache->list);
5857 read_extent_buffer(leaf, &cache->item,
5858 btrfs_item_ptr_offset(leaf, path->slots[0]),
5859 sizeof(cache->item));
5860 memcpy(&cache->key, &found_key, sizeof(found_key));
5861
5862 key.objectid = found_key.objectid + found_key.offset;
5863 btrfs_release_path(root, path);
5864 cache->flags = btrfs_block_group_flags(&cache->item);
5865
5866 ret = update_space_info(info, cache->flags, found_key.offset,
5867 btrfs_block_group_used(&cache->item),
5868 &space_info);
5869 BUG_ON(ret);
5870 cache->space_info = space_info;
5871 down_write(&space_info->groups_sem);
5872 list_add_tail(&cache->list, &space_info->block_groups);
5873 up_write(&space_info->groups_sem);
5874
5875 ret = btrfs_add_block_group_cache(root->fs_info, cache);
5876 BUG_ON(ret);
5877
5878 set_avail_alloc_bits(root->fs_info, cache->flags);
5879 if (btrfs_chunk_readonly(root, cache->key.objectid))
5880 set_block_group_readonly(cache);
5881 }
5882 ret = 0;
5883error:
5884 btrfs_free_path(path);
5885 return ret;
5886}
5887
5888int btrfs_make_block_group(struct btrfs_trans_handle *trans,
5889 struct btrfs_root *root, u64 bytes_used,
5890 u64 type, u64 chunk_objectid, u64 chunk_offset,
5891 u64 size)
5892{
5893 int ret;
5894 struct btrfs_root *extent_root;
5895 struct btrfs_block_group_cache *cache;
5896
5897 extent_root = root->fs_info->extent_root;
5898
5899 root->fs_info->last_trans_new_blockgroup = trans->transid;
5900
5901 cache = kzalloc(sizeof(*cache), GFP_NOFS);
5902 if (!cache)
5903 return -ENOMEM;
5904
5905 cache->key.objectid = chunk_offset;
5906 cache->key.offset = size;
5907 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
5908 atomic_set(&cache->count, 1);
5909 spin_lock_init(&cache->lock);
5910 mutex_init(&cache->alloc_mutex);
5911 mutex_init(&cache->cache_mutex);
5912 INIT_LIST_HEAD(&cache->list);
5913
5914 btrfs_set_block_group_used(&cache->item, bytes_used);
5915 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
5916 cache->flags = type;
5917 btrfs_set_block_group_flags(&cache->item, type);
5918
5919 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
5920 &cache->space_info);
5921 BUG_ON(ret);
5922 down_write(&cache->space_info->groups_sem);
5923 list_add_tail(&cache->list, &cache->space_info->block_groups);
5924 up_write(&cache->space_info->groups_sem);
5925
5926 ret = btrfs_add_block_group_cache(root->fs_info, cache);
5927 BUG_ON(ret);
5928
5929 ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
5930 sizeof(cache->item));
5931 BUG_ON(ret);
5932
5933 finish_current_insert(trans, extent_root, 0);
5934 ret = del_pending_extents(trans, extent_root, 0);
5935 BUG_ON(ret);
5936 set_avail_alloc_bits(extent_root->fs_info, type);
5937
5938 return 0;
5939}
5940
5941int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
5942 struct btrfs_root *root, u64 group_start)
5943{
5944 struct btrfs_path *path;
5945 struct btrfs_block_group_cache *block_group;
5946 struct btrfs_key key;
5947 int ret;
5948
5949 root = root->fs_info->extent_root;
5950
5951 block_group = btrfs_lookup_block_group(root->fs_info, group_start);
5952 BUG_ON(!block_group);
5953 BUG_ON(!block_group->ro);
5954
5955 memcpy(&key, &block_group->key, sizeof(key));
5956
5957 path = btrfs_alloc_path();
5958 BUG_ON(!path);
5959
5960 btrfs_remove_free_space_cache(block_group);
5961 rb_erase(&block_group->cache_node,
5962 &root->fs_info->block_group_cache_tree);
5963 down_write(&block_group->space_info->groups_sem);
5964 list_del(&block_group->list);
5965 up_write(&block_group->space_info->groups_sem);
5966
5967 spin_lock(&block_group->space_info->lock);
5968 block_group->space_info->total_bytes -= block_group->key.offset;
5969 block_group->space_info->bytes_readonly -= block_group->key.offset;
5970 spin_unlock(&block_group->space_info->lock);
5971 block_group->space_info->full = 0;
5972
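	/*
	 * two puts: one drops the reference taken by
	 * btrfs_lookup_block_group() above, the other drops the reference
	 * the block group cache tree was holding (presumably why
	 * put_block_group() appears back to back).
	 */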
5973 put_block_group(block_group);
5974 put_block_group(block_group);
5975
5976 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
5977 if (ret > 0)
5978 ret = -EIO;
5979 if (ret < 0)
5980 goto out;
5981
5982 ret = btrfs_del_item(trans, root, path);
5983out:
5984 btrfs_free_path(path);
5985 return ret;
5986}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
new file mode 100644
index 000000000000..e086d407f1fa
--- /dev/null
+++ b/fs/btrfs/extent_io.c
@@ -0,0 +1,3717 @@
1#include <linux/bitops.h>
2#include <linux/slab.h>
3#include <linux/bio.h>
4#include <linux/mm.h>
5#include <linux/gfp.h>
6#include <linux/pagemap.h>
7#include <linux/page-flags.h>
8#include <linux/module.h>
9#include <linux/spinlock.h>
10#include <linux/blkdev.h>
11#include <linux/swap.h>
12#include <linux/version.h>
13#include <linux/writeback.h>
14#include <linux/pagevec.h>
15#include "extent_io.h"
16#include "extent_map.h"
17#include "compat.h"
18#include "ctree.h"
19#include "btrfs_inode.h"
20
21/* temporary define until extent_map moves out of btrfs */
22struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
23 unsigned long extra_flags,
24 void (*ctor)(void *, struct kmem_cache *,
25 unsigned long));
26
27static struct kmem_cache *extent_state_cache;
28static struct kmem_cache *extent_buffer_cache;
29
30static LIST_HEAD(buffers);
31static LIST_HEAD(states);
32
33#define LEAK_DEBUG 0
34#if LEAK_DEBUG
35static DEFINE_SPINLOCK(leak_lock);
36#endif
37
38#define BUFFER_LRU_MAX 64
39
40struct tree_entry {
41 u64 start;
42 u64 end;
43 struct rb_node rb_node;
44};
45
46struct extent_page_data {
47 struct bio *bio;
48 struct extent_io_tree *tree;
49 get_extent_t *get_extent;
50
51	/* tells writepage not to lock the state bits for this range;
52	 * it still does the unlocking
53 */
54 int extent_locked;
55};
56
57int __init extent_io_init(void)
58{
59 extent_state_cache = btrfs_cache_create("extent_state",
60 sizeof(struct extent_state), 0,
61 NULL);
62 if (!extent_state_cache)
63 return -ENOMEM;
64
65 extent_buffer_cache = btrfs_cache_create("extent_buffers",
66 sizeof(struct extent_buffer), 0,
67 NULL);
68 if (!extent_buffer_cache)
69 goto free_state_cache;
70 return 0;
71
72free_state_cache:
73 kmem_cache_destroy(extent_state_cache);
74 return -ENOMEM;
75}
76
77void extent_io_exit(void)
78{
79 struct extent_state *state;
80 struct extent_buffer *eb;
81
82 while (!list_empty(&states)) {
83 state = list_entry(states.next, struct extent_state, leak_list);
84 printk(KERN_ERR "btrfs state leak: start %llu end %llu "
85 "state %lu in tree %p refs %d\n",
86 (unsigned long long)state->start,
87 (unsigned long long)state->end,
88 state->state, state->tree, atomic_read(&state->refs));
89 list_del(&state->leak_list);
90 kmem_cache_free(extent_state_cache, state);
91
92 }
93
94 while (!list_empty(&buffers)) {
95 eb = list_entry(buffers.next, struct extent_buffer, leak_list);
96 printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
97 "refs %d\n", (unsigned long long)eb->start,
98 eb->len, atomic_read(&eb->refs));
99 list_del(&eb->leak_list);
100 kmem_cache_free(extent_buffer_cache, eb);
101 }
102 if (extent_state_cache)
103 kmem_cache_destroy(extent_state_cache);
104 if (extent_buffer_cache)
105 kmem_cache_destroy(extent_buffer_cache);
106}
107
108void extent_io_tree_init(struct extent_io_tree *tree,
109 struct address_space *mapping, gfp_t mask)
110{
111 tree->state.rb_node = NULL;
112 tree->buffer.rb_node = NULL;
113 tree->ops = NULL;
114 tree->dirty_bytes = 0;
115 spin_lock_init(&tree->lock);
116 spin_lock_init(&tree->buffer_lock);
117 tree->mapping = mapping;
118}
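
/*
 * A minimal setup sketch, assuming a caller with an inode in hand (the
 * 'inode' variable here is hypothetical):
 */
static void sketch_io_tree_setup(struct inode *inode,
				 struct extent_io_tree *io_tree)
{
	/* ties the tree to the inode's page cache mapping */
	extent_io_tree_init(io_tree, inode->i_mapping, GFP_NOFS);
}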
119
120static struct extent_state *alloc_extent_state(gfp_t mask)
121{
122 struct extent_state *state;
123#if LEAK_DEBUG
124 unsigned long flags;
125#endif
126
127 state = kmem_cache_alloc(extent_state_cache, mask);
128 if (!state)
129 return state;
130 state->state = 0;
131 state->private = 0;
132 state->tree = NULL;
133#if LEAK_DEBUG
134 spin_lock_irqsave(&leak_lock, flags);
135 list_add(&state->leak_list, &states);
136 spin_unlock_irqrestore(&leak_lock, flags);
137#endif
138 atomic_set(&state->refs, 1);
139 init_waitqueue_head(&state->wq);
140 return state;
141}
142
143static void free_extent_state(struct extent_state *state)
144{
145 if (!state)
146 return;
147 if (atomic_dec_and_test(&state->refs)) {
148#if LEAK_DEBUG
149 unsigned long flags;
150#endif
151 WARN_ON(state->tree);
152#if LEAK_DEBUG
153 spin_lock_irqsave(&leak_lock, flags);
154 list_del(&state->leak_list);
155 spin_unlock_irqrestore(&leak_lock, flags);
156#endif
157 kmem_cache_free(extent_state_cache, state);
158 }
159}
160
161static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
162 struct rb_node *node)
163{
164 struct rb_node **p = &root->rb_node;
165 struct rb_node *parent = NULL;
166 struct tree_entry *entry;
167
168 while (*p) {
169 parent = *p;
170 entry = rb_entry(parent, struct tree_entry, rb_node);
171
172 if (offset < entry->start)
173 p = &(*p)->rb_left;
174 else if (offset > entry->end)
175 p = &(*p)->rb_right;
176 else
177 return parent;
178 }
179
180 entry = rb_entry(node, struct tree_entry, rb_node);
181 rb_link_node(node, parent, p);
182 rb_insert_color(node, root);
183 return NULL;
184}
185
186static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
187 struct rb_node **prev_ret,
188 struct rb_node **next_ret)
189{
190 struct rb_root *root = &tree->state;
191 struct rb_node *n = root->rb_node;
192 struct rb_node *prev = NULL;
193 struct rb_node *orig_prev = NULL;
194 struct tree_entry *entry;
195 struct tree_entry *prev_entry = NULL;
196
197 while (n) {
198 entry = rb_entry(n, struct tree_entry, rb_node);
199 prev = n;
200 prev_entry = entry;
201
202 if (offset < entry->start)
203 n = n->rb_left;
204 else if (offset > entry->end)
205 n = n->rb_right;
206 else
207 return n;
208 }
209
210 if (prev_ret) {
211 orig_prev = prev;
212 while (prev && offset > prev_entry->end) {
213 prev = rb_next(prev);
214 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
215 }
216 *prev_ret = prev;
217 prev = orig_prev;
218 }
219
220 if (next_ret) {
221 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
222 while (prev && offset < prev_entry->start) {
223 prev = rb_prev(prev);
224 prev_entry = rb_entry(prev, struct tree_entry, rb_node);
225 }
226 *next_ret = prev;
227 }
228 return NULL;
229}
230
231static inline struct rb_node *tree_search(struct extent_io_tree *tree,
232 u64 offset)
233{
234 struct rb_node *prev = NULL;
235 struct rb_node *ret;
236
237 ret = __etree_search(tree, offset, &prev, NULL);
238 if (!ret)
239 return prev;
240 return ret;
241}
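
/*
 * A worked example of the lookup semantics above, with hypothetical
 * offsets: given entries [0,4095] and [8192,12287], tree_search(tree, 5000)
 * finds no entry containing 5000, so __etree_search() advances 'prev' past
 * every entry ending before 5000 and tree_search() returns the node for
 * [8192,12287], the first extent that ends after the search offset.
 */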
242
243static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
244 u64 offset, struct rb_node *node)
245{
246 struct rb_root *root = &tree->buffer;
247 struct rb_node **p = &root->rb_node;
248 struct rb_node *parent = NULL;
249 struct extent_buffer *eb;
250
251 while (*p) {
252 parent = *p;
253 eb = rb_entry(parent, struct extent_buffer, rb_node);
254
255 if (offset < eb->start)
256 p = &(*p)->rb_left;
257 else if (offset > eb->start)
258 p = &(*p)->rb_right;
259 else
260 return eb;
261 }
262
263 rb_link_node(node, parent, p);
264 rb_insert_color(node, root);
265 return NULL;
266}
267
268static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
269 u64 offset)
270{
271 struct rb_root *root = &tree->buffer;
272 struct rb_node *n = root->rb_node;
273 struct extent_buffer *eb;
274
275 while (n) {
276 eb = rb_entry(n, struct extent_buffer, rb_node);
277 if (offset < eb->start)
278 n = n->rb_left;
279 else if (offset > eb->start)
280 n = n->rb_right;
281 else
282 return eb;
283 }
284 return NULL;
285}
286
287/*
288 * utility function to look for merge candidates inside a given range.
289 * Any extents with matching state are merged together into a single
290 * extent in the tree. Extents with EXTENT_IOBITS or EXTENT_BOUNDARY set
291 * in their state field are not merged, since end_io handlers must do
292 * operations on them without sleeping (or doing allocations/splits).
293 *
294 * This should be called with the tree lock held.
295 */
296static int merge_state(struct extent_io_tree *tree,
297 struct extent_state *state)
298{
299 struct extent_state *other;
300 struct rb_node *other_node;
301
302 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
303 return 0;
304
305 other_node = rb_prev(&state->rb_node);
306 if (other_node) {
307 other = rb_entry(other_node, struct extent_state, rb_node);
308 if (other->end == state->start - 1 &&
309 other->state == state->state) {
310 state->start = other->start;
311 other->tree = NULL;
312 rb_erase(&other->rb_node, &tree->state);
313 free_extent_state(other);
314 }
315 }
316 other_node = rb_next(&state->rb_node);
317 if (other_node) {
318 other = rb_entry(other_node, struct extent_state, rb_node);
319 if (other->start == state->end + 1 &&
320 other->state == state->state) {
321 other->start = state->start;
322 state->tree = NULL;
323 rb_erase(&state->rb_node, &tree->state);
324 free_extent_state(state);
325 }
326 }
327 return 0;
328}
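
/*
 * A toy illustration of the merge rule above; the toy_* types are
 * illustrative only and much simpler than struct extent_state:
 */
struct toy_state_range {
	unsigned long long start;
	unsigned long long end;
	unsigned long bits;
};

static int toy_can_merge(const struct toy_state_range *a,
			 const struct toy_state_range *b)
{
	/* e.g. [0,4095] and [4096,8191] with equal bits collapse to [0,8191] */
	return a->end + 1 == b->start && a->bits == b->bits;
}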
329
330static void set_state_cb(struct extent_io_tree *tree,
331 struct extent_state *state,
332 unsigned long bits)
333{
334 if (tree->ops && tree->ops->set_bit_hook) {
335 tree->ops->set_bit_hook(tree->mapping->host, state->start,
336 state->end, state->state, bits);
337 }
338}
339
340static void clear_state_cb(struct extent_io_tree *tree,
341 struct extent_state *state,
342 unsigned long bits)
343{
344 if (tree->ops && tree->ops->clear_bit_hook) {
345 tree->ops->clear_bit_hook(tree->mapping->host, state->start,
346 state->end, state->state, bits);
347 }
348}
349
350/*
351 * insert an extent_state struct into the tree. 'bits' are set on the
352 * struct before it is inserted.
353 *
354 * This may return -EEXIST if the extent is already there, in which case the
355 * state struct is freed.
356 *
357 * The tree lock is not taken internally. This is a utility function and
358 * probably isn't what you want to call (see set/clear_extent_bit).
359 */
360static int insert_state(struct extent_io_tree *tree,
361 struct extent_state *state, u64 start, u64 end,
362 int bits)
363{
364 struct rb_node *node;
365
366 if (end < start) {
367 printk(KERN_ERR "btrfs end < start %llu %llu\n",
368 (unsigned long long)end,
369 (unsigned long long)start);
370 WARN_ON(1);
371 }
372 if (bits & EXTENT_DIRTY)
373 tree->dirty_bytes += end - start + 1;
374 set_state_cb(tree, state, bits);
375 state->state |= bits;
376 state->start = start;
377 state->end = end;
378 node = tree_insert(&tree->state, end, &state->rb_node);
379 if (node) {
380 struct extent_state *found;
381 found = rb_entry(node, struct extent_state, rb_node);
382 printk(KERN_ERR "btrfs found node %llu %llu on insert of "
383 "%llu %llu\n", (unsigned long long)found->start,
384 (unsigned long long)found->end,
385 (unsigned long long)start, (unsigned long long)end);
386 free_extent_state(state);
387 return -EEXIST;
388 }
389 state->tree = tree;
390 merge_state(tree, state);
391 return 0;
392}
393
394/*
395 * split a given extent state struct in two, inserting the preallocated
396 * struct 'prealloc' as the newly created second half. 'split' indicates an
397 * offset inside 'orig' where it should be split.
398 *
399 * Before calling,
400 * the tree has 'orig' at [orig->start, orig->end]. After calling, there
401 * are two extent state structs in the tree:
402 * prealloc: [orig->start, split - 1]
403 * orig: [ split, orig->end ]
404 *
405 * The tree locks are not taken by this function. They need to be held
406 * by the caller.
407 */
408static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
409 struct extent_state *prealloc, u64 split)
410{
411 struct rb_node *node;
412 prealloc->start = orig->start;
413 prealloc->end = split - 1;
414 prealloc->state = orig->state;
415 orig->start = split;
416
417 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
418 if (node) {
419 struct extent_state *found;
420 found = rb_entry(node, struct extent_state, rb_node);
421 free_extent_state(prealloc);
422 return -EEXIST;
423 }
424 prealloc->tree = tree;
425 return 0;
426}
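
/*
 * Worked example with hypothetical values: splitting orig = [100, 199]
 * at split = 150 leaves
 *	prealloc: [100, 149]	(inherits orig->state)
 *	orig:     [150, 199]
 * matching the diagram in the comment above split_state().
 */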
427
428/*
429 * utility function to clear some bits in an extent state struct.
430 * it will optionally wake up any one waiting on this state (wake == 1), or
431 * forcibly remove the state from the tree (delete == 1).
432 *
433 * If no bits are set on the state struct after clearing things, the
434 * struct is freed and removed from the tree
435 */
436static int clear_state_bit(struct extent_io_tree *tree,
437 struct extent_state *state, int bits, int wake,
438 int delete)
439{
440 int ret = state->state & bits;
441
442 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
443 u64 range = state->end - state->start + 1;
444 WARN_ON(range > tree->dirty_bytes);
445 tree->dirty_bytes -= range;
446 }
447 clear_state_cb(tree, state, bits);
448 state->state &= ~bits;
449 if (wake)
450 wake_up(&state->wq);
451 if (delete || state->state == 0) {
452 if (state->tree) {
453 clear_state_cb(tree, state, state->state);
454 rb_erase(&state->rb_node, &tree->state);
455 state->tree = NULL;
456 free_extent_state(state);
457 } else {
458 WARN_ON(1);
459 }
460 } else {
461 merge_state(tree, state);
462 }
463 return ret;
464}
465
466/*
467 * clear some bits on a range in the tree. This may require splitting
468 * or inserting elements in the tree, so the gfp mask is used to
469 * indicate which allocations or sleeping are allowed.
470 *
471 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
472 * the given range from the tree regardless of state (i.e. for truncate).
473 *
474 * the range [start, end] is inclusive.
475 *
476 * This takes the tree lock, and returns < 0 on error, > 0 if any of the
477 * bits were already set, or zero if none of the bits were already set.
478 */
479int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
480 int bits, int wake, int delete, gfp_t mask)
481{
482 struct extent_state *state;
483 struct extent_state *prealloc = NULL;
484 struct rb_node *node;
485 int err;
486 int set = 0;
487
488again:
489 if (!prealloc && (mask & __GFP_WAIT)) {
490 prealloc = alloc_extent_state(mask);
491 if (!prealloc)
492 return -ENOMEM;
493 }
494
495 spin_lock(&tree->lock);
496 /*
497 * this search will find the extents that end after
498 * our range starts
499 */
500 node = tree_search(tree, start);
501 if (!node)
502 goto out;
503 state = rb_entry(node, struct extent_state, rb_node);
504 if (state->start > end)
505 goto out;
506 WARN_ON(state->end < start);
507
508 /*
509 * | ---- desired range ---- |
510 * | state | or
511 * | ------------- state -------------- |
512 *
513 * We need to split the extent we found, and may flip
514 * bits on second half.
515 *
516 * If the extent we found extends past our range, we
517 * just split and search again. It'll get split again
518 * the next time though.
519 *
520 * If the extent we found is inside our range, we clear
521 * the desired bit on it.
522 */
523
524 if (state->start < start) {
525 if (!prealloc)
526 prealloc = alloc_extent_state(GFP_ATOMIC);
527 err = split_state(tree, state, prealloc, start);
528 BUG_ON(err == -EEXIST);
529 prealloc = NULL;
530 if (err)
531 goto out;
532 if (state->end <= end) {
533 start = state->end + 1;
534 set |= clear_state_bit(tree, state, bits,
535 wake, delete);
536 } else {
537 start = state->start;
538 }
539 goto search_again;
540 }
541 /*
542 * | ---- desired range ---- |
543 * | state |
544 * We need to split the extent, and clear the bit
545 * on the first half
546 */
547 if (state->start <= end && state->end > end) {
548 if (!prealloc)
549 prealloc = alloc_extent_state(GFP_ATOMIC);
550 err = split_state(tree, state, prealloc, end + 1);
551 BUG_ON(err == -EEXIST);
552
553 if (wake)
554 wake_up(&state->wq);
555 set |= clear_state_bit(tree, prealloc, bits,
556 wake, delete);
557 prealloc = NULL;
558 goto out;
559 }
560
561 start = state->end + 1;
562 set |= clear_state_bit(tree, state, bits, wake, delete);
563 goto search_again;
564
565out:
566 spin_unlock(&tree->lock);
567 if (prealloc)
568 free_extent_state(prealloc);
569
570 return set;
571
572search_again:
573 if (start > end)
574 goto out;
575 spin_unlock(&tree->lock);
576 if (mask & __GFP_WAIT)
577 cond_resched();
578 goto again;
579}
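
/*
 * Illustrative call (hypothetical range, assuming 4k pages): to drop the
 * dirty bit on the second page of a file and wake any waiters:
 *
 *	clear_extent_bit(tree, 4096, 8191, EXTENT_DIRTY, 1, 0, GFP_NOFS);
 *
 * If [4096, 8191] sits inside a larger state record, the record is split
 * so the untouched head and tail keep their bits.
 */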
580
581static int wait_on_state(struct extent_io_tree *tree,
582 struct extent_state *state)
583 __releases(tree->lock)
584 __acquires(tree->lock)
585{
586 DEFINE_WAIT(wait);
587 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
588 spin_unlock(&tree->lock);
589 schedule();
590 spin_lock(&tree->lock);
591 finish_wait(&state->wq, &wait);
592 return 0;
593}
594
595/*
596 * waits for one or more bits to clear on a range in the state tree.
597 * The range [start, end] is inclusive.
598 * The tree lock is taken by this function
599 */
600int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
601{
602 struct extent_state *state;
603 struct rb_node *node;
604
605 spin_lock(&tree->lock);
606again:
607 while (1) {
608 /*
609 * this search will find all the extents that end after
610 * our range starts
611 */
612 node = tree_search(tree, start);
613 if (!node)
614 break;
615
616 state = rb_entry(node, struct extent_state, rb_node);
617
618 if (state->start > end)
619 goto out;
620
621 if (state->state & bits) {
622 start = state->start;
623 atomic_inc(&state->refs);
624 wait_on_state(tree, state);
625 free_extent_state(state);
626 goto again;
627 }
628 start = state->end + 1;
629
630 if (start > end)
631 break;
632
633 if (need_resched()) {
634 spin_unlock(&tree->lock);
635 cond_resched();
636 spin_lock(&tree->lock);
637 }
638 }
639out:
640 spin_unlock(&tree->lock);
641 return 0;
642}
643
644static void set_state_bits(struct extent_io_tree *tree,
645 struct extent_state *state,
646 int bits)
647{
648 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
649 u64 range = state->end - state->start + 1;
650 tree->dirty_bytes += range;
651 }
652 set_state_cb(tree, state, bits);
653 state->state |= bits;
654}
655
656/*
657 * set some bits on a range in the tree. This may require allocations
658 * or sleeping, so the gfp mask is used to indicate what is allowed.
659 *
660 * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
661 * range already has the desired bits set. The start of the existing
662 * range is returned in failed_start in this case.
663 *
664 * [start, end] is inclusive
665 * This takes the tree lock.
666 */
667static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
668 int bits, int exclusive, u64 *failed_start,
669 gfp_t mask)
670{
671 struct extent_state *state;
672 struct extent_state *prealloc = NULL;
673 struct rb_node *node;
674 int err = 0;
675 int set;
676 u64 last_start;
677 u64 last_end;
678again:
679 if (!prealloc && (mask & __GFP_WAIT)) {
680 prealloc = alloc_extent_state(mask);
681 if (!prealloc)
682 return -ENOMEM;
683 }
684
685 spin_lock(&tree->lock);
686 /*
687 * this search will find all the extents that end after
688 * our range starts.
689 */
690 node = tree_search(tree, start);
691 if (!node) {
692 err = insert_state(tree, prealloc, start, end, bits);
693 prealloc = NULL;
694 BUG_ON(err == -EEXIST);
695 goto out;
696 }
697
698 state = rb_entry(node, struct extent_state, rb_node);
699 last_start = state->start;
700 last_end = state->end;
701
702 /*
703 * | ---- desired range ---- |
704 * | state |
705 *
706 * Just lock what we found and keep going
707 */
708 if (state->start == start && state->end <= end) {
709 set = state->state & bits;
710 if (set && exclusive) {
711 *failed_start = state->start;
712 err = -EEXIST;
713 goto out;
714 }
715 set_state_bits(tree, state, bits);
716 start = state->end + 1;
717 merge_state(tree, state);
718 goto search_again;
719 }
720
721 /*
722 * | ---- desired range ---- |
723 * | state |
724 * or
725 * | ------------- state -------------- |
726 *
727 * We need to split the extent we found, and may flip bits on
728 * second half.
729 *
730 * If the extent we found extends past our
731 * range, we just split and search again. It'll get split
732 * again the next time though.
733 *
734 * If the extent we found is inside our range, we set the
735 * desired bit on it.
736 */
737 if (state->start < start) {
738 set = state->state & bits;
739 if (exclusive && set) {
740 *failed_start = start;
741 err = -EEXIST;
742 goto out;
743 }
744 err = split_state(tree, state, prealloc, start);
745 BUG_ON(err == -EEXIST);
746 prealloc = NULL;
747 if (err)
748 goto out;
749 if (state->end <= end) {
750 set_state_bits(tree, state, bits);
751 start = state->end + 1;
752 merge_state(tree, state);
753 } else {
754 start = state->start;
755 }
756 goto search_again;
757 }
758 /*
759 * | ---- desired range ---- |
760 * | state | or | state |
761 *
762 * There's a hole, we need to insert something in it and
763 * ignore the extent we found.
764 */
765 if (state->start > start) {
766 u64 this_end;
767 if (end < last_start)
768 this_end = end;
769 else
770 this_end = last_start - 1;
771 err = insert_state(tree, prealloc, start, this_end,
772 bits);
773 prealloc = NULL;
774 BUG_ON(err == -EEXIST);
775 if (err)
776 goto out;
777 start = this_end + 1;
778 goto search_again;
779 }
780 /*
781 * | ---- desired range ---- |
782 * | state |
783 * We need to split the extent, and set the bit
784 * on the first half
785 */
786 if (state->start <= end && state->end > end) {
787 set = state->state & bits;
788 if (exclusive && set) {
789 *failed_start = start;
790 err = -EEXIST;
791 goto out;
792 }
793 err = split_state(tree, state, prealloc, end + 1);
794 BUG_ON(err == -EEXIST);
795
796 set_state_bits(tree, prealloc, bits);
797 merge_state(tree, prealloc);
798 prealloc = NULL;
799 goto out;
800 }
801
802 goto search_again;
803
804out:
805 spin_unlock(&tree->lock);
806 if (prealloc)
807 free_extent_state(prealloc);
808
809 return err;
810
811search_again:
812 if (start > end)
813 goto out;
814 spin_unlock(&tree->lock);
815 if (mask & __GFP_WAIT)
816 cond_resched();
817 goto again;
818}
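
/*
 * Sketch of the exclusive case (hypothetical values): with exclusive == 1,
 * a call such as
 *
 *	err = set_extent_bit(tree, 0, 4095, EXTENT_LOCKED, 1,
 *			     &failed_start, GFP_NOFS);
 *
 * fails with -EEXIST if any byte in [0, 4095] already has EXTENT_LOCKED
 * set, and failed_start reports where the already-set range begins so
 * the caller can wait on it and retry.
 */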
819
820/* wrappers around set/clear extent bit */
821int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
822 gfp_t mask)
823{
824 return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
825 mask);
826}
827
828int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
829 gfp_t mask)
830{
831 return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
832}
833
834int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
835 int bits, gfp_t mask)
836{
837 return set_extent_bit(tree, start, end, bits, 0, NULL,
838 mask);
839}
840
841int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
842 int bits, gfp_t mask)
843{
844 return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
845}
846
847int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
848 gfp_t mask)
849{
850 return set_extent_bit(tree, start, end,
851 EXTENT_DELALLOC | EXTENT_DIRTY,
852 0, NULL, mask);
853}
854
855int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
856 gfp_t mask)
857{
858 return clear_extent_bit(tree, start, end,
859 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
860}
861
862int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
863 gfp_t mask)
864{
865 return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
866}
867
868int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
869 gfp_t mask)
870{
871 return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
872 mask);
873}
874
875static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
876 gfp_t mask)
877{
878 return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
879}
880
881int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
882 gfp_t mask)
883{
884 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
885 mask);
886}
887
888static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
889 u64 end, gfp_t mask)
890{
891 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
892}
893
894static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
895 gfp_t mask)
896{
897 return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
898 0, NULL, mask);
899}
900
901static int clear_extent_writeback(struct extent_io_tree *tree, u64 start,
902 u64 end, gfp_t mask)
903{
904 return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
905}
906
907int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
908{
909 return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
910}
911
912/*
913 * either insert or lock the state struct between start and end; use mask
914 * to tell us if waiting is desired.
915 */
916int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
917{
918 int err;
919 u64 failed_start;
920 while (1) {
921 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
922 &failed_start, mask);
923 if (err == -EEXIST && (mask & __GFP_WAIT)) {
924 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
925 start = failed_start;
926 } else {
927 break;
928 }
929 WARN_ON(start > end);
930 }
931 return err;
932}
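
/*
 * Typical pairing (illustrative): lock_extent() and unlock_extent()
 * bracket IO against a byte range:
 *
 *	lock_extent(tree, start, end, GFP_NOFS);
 *	... do the IO against [start, end] ...
 *	unlock_extent(tree, start, end, GFP_NOFS);
 *
 * With a blocking mask, lock_extent() loops on -EEXIST, waiting for
 * EXTENT_LOCKED to clear on the contended part before retrying.
 */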
933
934int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
935 gfp_t mask)
936{
937 int err;
938 u64 failed_start;
939
940 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
941 &failed_start, mask);
942 if (err == -EEXIST) {
943 if (failed_start > start)
944 clear_extent_bit(tree, start, failed_start - 1,
945 EXTENT_LOCKED, 1, 0, mask);
946 return 0;
947 }
948 return 1;
949}
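
/*
 * Note the inverted return convention above: try_lock_extent() returns 1
 * when the whole range was locked and 0 when some part was already
 * locked, undoing any partial lock it managed to take before returning.
 */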
950
951int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
952 gfp_t mask)
953{
954 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
955}
956
957/*
958 * helper function to set pages and extents in the tree dirty
959 */
960int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
961{
962 unsigned long index = start >> PAGE_CACHE_SHIFT;
963 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
964 struct page *page;
965
966 while (index <= end_index) {
967 page = find_get_page(tree->mapping, index);
968 BUG_ON(!page);
969 __set_page_dirty_nobuffers(page);
970 page_cache_release(page);
971 index++;
972 }
973 set_extent_dirty(tree, start, end, GFP_NOFS);
974 return 0;
975}
976
977/*
978 * helper function to set both pages and extents in the tree writeback
979 */
980static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
981{
982 unsigned long index = start >> PAGE_CACHE_SHIFT;
983 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
984 struct page *page;
985
986 while (index <= end_index) {
987 page = find_get_page(tree->mapping, index);
988 BUG_ON(!page);
989 set_page_writeback(page);
990 page_cache_release(page);
991 index++;
992 }
993 set_extent_writeback(tree, start, end, GFP_NOFS);
994 return 0;
995}
996
997/*
998 * find the first offset in the io tree with 'bits' set. zero is
999 * returned if we find something, and *start_ret and *end_ret are
1000 * set to reflect the state struct that was found.
1001 *
1002 * If nothing was found, 1 is returned; < 0 is returned on error.
1003 */
1004int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1005 u64 *start_ret, u64 *end_ret, int bits)
1006{
1007 struct rb_node *node;
1008 struct extent_state *state;
1009 int ret = 1;
1010
1011 spin_lock(&tree->lock);
1012 /*
1013 * this search will find all the extents that end after
1014 * our range starts.
1015 */
1016 node = tree_search(tree, start);
1017 if (!node)
1018 goto out;
1019
1020 while (1) {
1021 state = rb_entry(node, struct extent_state, rb_node);
1022 if (state->end >= start && (state->state & bits)) {
1023 *start_ret = state->start;
1024 *end_ret = state->end;
1025 ret = 0;
1026 break;
1027 }
1028 node = rb_next(node);
1029 if (!node)
1030 break;
1031 }
1032out:
1033 spin_unlock(&tree->lock);
1034 return ret;
1035}
1036
1037/* find the first state struct with 'bits' set after 'start', and
1038 * return it. tree->lock must be held. NULL will be returned if
1039 * nothing was found after 'start'.
1040 */
1041struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1042 u64 start, int bits)
1043{
1044 struct rb_node *node;
1045 struct extent_state *state;
1046
1047 /*
1048 * this search will find all the extents that end after
1049 * our range starts.
1050 */
1051 node = tree_search(tree, start);
1052 if (!node)
1053 goto out;
1054
1055 while (1) {
1056 state = rb_entry(node, struct extent_state, rb_node);
1057 if (state->end >= start && (state->state & bits))
1058 return state;
1059
1060 node = rb_next(node);
1061 if (!node)
1062 break;
1063 }
1064out:
1065 return NULL;
1066}
1067
1068/*
1069 * find a contiguous range of bytes in the file marked as delalloc, not
1070 * more than 'max_bytes'. 'start' and 'end' are used to return the range.
1071 *
1072 * a nonzero count is returned if we find something, 0 if nothing was in the tree
1073 */
1074static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1075 u64 *start, u64 *end, u64 max_bytes)
1076{
1077 struct rb_node *node;
1078 struct extent_state *state;
1079 u64 cur_start = *start;
1080 u64 found = 0;
1081 u64 total_bytes = 0;
1082
1083 spin_lock(&tree->lock);
1084
1085 /*
1086 * this search will find all the extents that end after
1087 * our range starts.
1088 */
1089 node = tree_search(tree, cur_start);
1090 if (!node) {
1091 if (!found)
1092 *end = (u64)-1;
1093 goto out;
1094 }
1095
1096 while (1) {
1097 state = rb_entry(node, struct extent_state, rb_node);
1098 if (found && (state->start != cur_start ||
1099 (state->state & EXTENT_BOUNDARY))) {
1100 goto out;
1101 }
1102 if (!(state->state & EXTENT_DELALLOC)) {
1103 if (!found)
1104 *end = state->end;
1105 goto out;
1106 }
1107 if (!found)
1108 *start = state->start;
1109 found++;
1110 *end = state->end;
1111 cur_start = state->end + 1;
1112 node = rb_next(node);
1113 if (!node)
1114 break;
1115 total_bytes += state->end - state->start + 1;
1116 if (total_bytes >= max_bytes)
1117 break;
1118 }
1119out:
1120 spin_unlock(&tree->lock);
1121 return found;
1122}
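
/*
 * Example of the search above (hypothetical layout): with delalloc state
 * records at [0, 4095] and [4096, 12287] and *start == 0, this returns a
 * nonzero count with *start = 0 and *end = 12287. The walk stops early
 * once the accumulated bytes reach max_bytes, or when it hits a gap or
 * an EXTENT_BOUNDARY record.
 */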
1123
1124static noinline int __unlock_for_delalloc(struct inode *inode,
1125 struct page *locked_page,
1126 u64 start, u64 end)
1127{
1128 int ret;
1129 struct page *pages[16];
1130 unsigned long index = start >> PAGE_CACHE_SHIFT;
1131 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1132 unsigned long nr_pages = end_index - index + 1;
1133 int i;
1134
1135 if (index == locked_page->index && end_index == index)
1136 return 0;
1137
1138 while (nr_pages > 0) {
1139 ret = find_get_pages_contig(inode->i_mapping, index,
1140 min_t(unsigned long, nr_pages,
1141 ARRAY_SIZE(pages)), pages);
1142 for (i = 0; i < ret; i++) {
1143 if (pages[i] != locked_page)
1144 unlock_page(pages[i]);
1145 page_cache_release(pages[i]);
1146 }
1147 nr_pages -= ret;
1148 index += ret;
1149 cond_resched();
1150 }
1151 return 0;
1152}
1153
1154static noinline int lock_delalloc_pages(struct inode *inode,
1155 struct page *locked_page,
1156 u64 delalloc_start,
1157 u64 delalloc_end)
1158{
1159 unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1160 unsigned long start_index = index;
1161 unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1162 unsigned long pages_locked = 0;
1163 struct page *pages[16];
1164 unsigned long nrpages;
1165 int ret;
1166 int i;
1167
1168 /* the caller is responsible for locking the start index */
1169 if (index == locked_page->index && index == end_index)
1170 return 0;
1171
1172 /* skip the page at the start index */
1173 nrpages = end_index - index + 1;
1174 while (nrpages > 0) {
1175 ret = find_get_pages_contig(inode->i_mapping, index,
1176 min_t(unsigned long,
1177 nrpages, ARRAY_SIZE(pages)), pages);
1178 if (ret == 0) {
1179 ret = -EAGAIN;
1180 goto done;
1181 }
1182 /* now we have an array of pages, lock them all */
1183 for (i = 0; i < ret; i++) {
1184 /*
1185 * the caller is taking responsibility for
1186 * locked_page
1187 */
1188 if (pages[i] != locked_page) {
1189 lock_page(pages[i]);
1190 if (!PageDirty(pages[i]) ||
1191 pages[i]->mapping != inode->i_mapping) {
1192 ret = -EAGAIN;
1193 unlock_page(pages[i]);
1194 page_cache_release(pages[i]);
1195 goto done;
1196 }
1197 }
1198 page_cache_release(pages[i]);
1199 pages_locked++;
1200 }
1201 nrpages -= ret;
1202 index += ret;
1203 cond_resched();
1204 }
1205 ret = 0;
1206done:
1207 if (ret && pages_locked) {
1208 __unlock_for_delalloc(inode, locked_page,
1209 delalloc_start,
1210 ((u64)(start_index + pages_locked - 1)) <<
1211 PAGE_CACHE_SHIFT);
1212 }
1213 return ret;
1214}
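
/*
 * Illustrative failure path: if any page in the range has gone away or
 * been cleaned (mapping changed, or no longer dirty) by the time we get
 * to it, lock_delalloc_pages() unlocks everything it had locked and
 * returns -EAGAIN so the caller can shrink the range and retry.
 */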
1215
1216/*
1217 * find a contiguous range of bytes in the file marked as delalloc, not
1218 * more than 'max_bytes'. 'start' and 'end' are used to return the range.
1219 *
1220 * a nonzero count is returned if we find something, 0 if nothing was in the tree
1221 */
1222static noinline u64 find_lock_delalloc_range(struct inode *inode,
1223 struct extent_io_tree *tree,
1224 struct page *locked_page,
1225 u64 *start, u64 *end,
1226 u64 max_bytes)
1227{
1228 u64 delalloc_start;
1229 u64 delalloc_end;
1230 u64 found;
1231 int ret;
1232 int loops = 0;
1233
1234again:
1235 /* step one, find a bunch of delalloc bytes starting at start */
1236 delalloc_start = *start;
1237 delalloc_end = 0;
1238 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1239 max_bytes);
1240 if (!found || delalloc_end <= *start) {
1241 *start = delalloc_start;
1242 *end = delalloc_end;
1243 return found;
1244 }
1245
1246 /*
1247 * start comes from the offset of locked_page. We have to lock
1248 * pages in order, so we can't process delalloc bytes before
1249 * locked_page
1250 */
1251 if (delalloc_start < *start)
1252 delalloc_start = *start;
1253
1254 /*
1255 * make sure to limit the number of pages we try to lock down
1256 * if we're looping.
1257 */
1258 if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1259 delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1260
1261 /* step two, lock all the pages after the page that has start */
1262 ret = lock_delalloc_pages(inode, locked_page,
1263 delalloc_start, delalloc_end);
1264 if (ret == -EAGAIN) {
1265 /* some of the pages are gone, let's avoid looping by
1266 * shortening the size of the delalloc range we're searching
1267 */
1268 if (!loops) {
1269 unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1270 max_bytes = PAGE_CACHE_SIZE - offset;
1271 loops = 1;
1272 goto again;
1273 } else {
1274 found = 0;
1275 goto out_failed;
1276 }
1277 }
1278 BUG_ON(ret);
1279
1280 /* step three, lock the state bits for the whole range */
1281 lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
1282
1283 /* then test to make sure it is all still delalloc */
1284 ret = test_range_bit(tree, delalloc_start, delalloc_end,
1285 EXTENT_DELALLOC, 1);
1286 if (!ret) {
1287 unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
1288 __unlock_for_delalloc(inode, locked_page,
1289 delalloc_start, delalloc_end);
1290 cond_resched();
1291 goto again;
1292 }
1293 *start = delalloc_start;
1294 *end = delalloc_end;
1295out_failed:
1296 return found;
1297}
1298
1299int extent_clear_unlock_delalloc(struct inode *inode,
1300 struct extent_io_tree *tree,
1301 u64 start, u64 end, struct page *locked_page,
1302 int unlock_pages,
1303 int clear_unlock,
1304 int clear_delalloc, int clear_dirty,
1305 int set_writeback,
1306 int end_writeback)
1307{
1308 int ret;
1309 struct page *pages[16];
1310 unsigned long index = start >> PAGE_CACHE_SHIFT;
1311 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1312 unsigned long nr_pages = end_index - index + 1;
1313 int i;
1314 int clear_bits = 0;
1315
1316 if (clear_unlock)
1317 clear_bits |= EXTENT_LOCKED;
1318 if (clear_dirty)
1319 clear_bits |= EXTENT_DIRTY;
1320
1321 if (clear_delalloc)
1322 clear_bits |= EXTENT_DELALLOC;
1323
1324 clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
1325 if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
1326 return 0;
1327
1328 while (nr_pages > 0) {
1329 ret = find_get_pages_contig(inode->i_mapping, index,
1330 min_t(unsigned long,
1331 nr_pages, ARRAY_SIZE(pages)), pages);
1332 for (i = 0; i < ret; i++) {
1333 if (pages[i] == locked_page) {
1334 page_cache_release(pages[i]);
1335 continue;
1336 }
1337 if (clear_dirty)
1338 clear_page_dirty_for_io(pages[i]);
1339 if (set_writeback)
1340 set_page_writeback(pages[i]);
1341 if (end_writeback)
1342 end_page_writeback(pages[i]);
1343 if (unlock_pages)
1344 unlock_page(pages[i]);
1345 page_cache_release(pages[i]);
1346 }
1347 nr_pages -= ret;
1348 index += ret;
1349 cond_resched();
1350 }
1351 return 0;
1352}
1353
1354/*
1355 * count the number of bytes in the tree that have the given bit(s)
1356 * set. This can be fairly slow, except for EXTENT_DIRTY which is
1357 * cached. The total number of bytes found is returned.
1358 */
1359u64 count_range_bits(struct extent_io_tree *tree,
1360 u64 *start, u64 search_end, u64 max_bytes,
1361 unsigned long bits)
1362{
1363 struct rb_node *node;
1364 struct extent_state *state;
1365 u64 cur_start = *start;
1366 u64 total_bytes = 0;
1367 int found = 0;
1368
1369 if (search_end <= cur_start) {
1370 WARN_ON(1);
1371 return 0;
1372 }
1373
1374 spin_lock(&tree->lock);
1375 if (cur_start == 0 && bits == EXTENT_DIRTY) {
1376 total_bytes = tree->dirty_bytes;
1377 goto out;
1378 }
1379 /*
1380 * this search will find all the extents that end after
1381 * our range starts.
1382 */
1383 node = tree_search(tree, cur_start);
1384 if (!node)
1385 goto out;
1386
1387 while (1) {
1388 state = rb_entry(node, struct extent_state, rb_node);
1389 if (state->start > search_end)
1390 break;
1391 if (state->end >= cur_start && (state->state & bits)) {
1392 total_bytes += min(search_end, state->end) + 1 -
1393 max(cur_start, state->start);
1394 if (total_bytes >= max_bytes)
1395 break;
1396 if (!found) {
1397 *start = state->start;
1398 found = 1;
1399 }
1400 }
1401 node = rb_next(node);
1402 if (!node)
1403 break;
1404 }
1405out:
1406 spin_unlock(&tree->lock);
1407 return total_bytes;
1408}
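
/*
 * Fast path illustration (hypothetical call): counting dirty bytes over
 * the whole file,
 *
 *	u64 start = 0;
 *	u64 n = count_range_bits(tree, &start, (u64)-1, (u64)-1,
 *				 EXTENT_DIRTY);
 *
 * short-circuits to the cached tree->dirty_bytes instead of walking the
 * tree.
 */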
1409
1410#if 0
1411/*
1412 * helper function to lock both pages and extents in the tree.
1413 * pages must be locked first.
1414 */
1415static int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
1416{
1417 unsigned long index = start >> PAGE_CACHE_SHIFT;
1418 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1419 struct page *page;
1420 int err;
1421
1422 while (index <= end_index) {
1423 page = grab_cache_page(tree->mapping, index);
1424 if (!page) {
1425 err = -ENOMEM;
1426 goto failed;
1427 }
1428 if (IS_ERR(page)) {
1429 err = PTR_ERR(page);
1430 goto failed;
1431 }
1432 index++;
1433 }
1434 lock_extent(tree, start, end, GFP_NOFS);
1435 return 0;
1436
1437failed:
1438 /*
1439 * we failed above in getting the page at 'index', so we undo here
1440 * up to but not including the page at 'index'
1441 */
1442 end_index = index;
1443 index = start >> PAGE_CACHE_SHIFT;
1444 while (index < end_index) {
1445 page = find_get_page(tree->mapping, index);
1446 unlock_page(page);
1447 page_cache_release(page);
1448 index++;
1449 }
1450 return err;
1451}
1452
1453/*
1454 * helper function to unlock both pages and extents in the tree.
1455 */
1456static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
1457{
1458 unsigned long index = start >> PAGE_CACHE_SHIFT;
1459 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1460 struct page *page;
1461
1462 while (index <= end_index) {
1463 page = find_get_page(tree->mapping, index);
1464 unlock_page(page);
1465 page_cache_release(page);
1466 index++;
1467 }
1468 unlock_extent(tree, start, end, GFP_NOFS);
1469 return 0;
1470}
1471#endif
1472
1473/*
1474 * set the private field for a given byte offset in the tree. If there isn't
1475 * an extent_state starting at that exact offset, -ENOENT is returned.
1476 */
1477int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1478{
1479 struct rb_node *node;
1480 struct extent_state *state;
1481 int ret = 0;
1482
1483 spin_lock(&tree->lock);
1484 /*
1485 * this search will find all the extents that end after
1486 * our range starts.
1487 */
1488 node = tree_search(tree, start);
1489 if (!node) {
1490 ret = -ENOENT;
1491 goto out;
1492 }
1493 state = rb_entry(node, struct extent_state, rb_node);
1494 if (state->start != start) {
1495 ret = -ENOENT;
1496 goto out;
1497 }
1498 state->private = private;
1499out:
1500 spin_unlock(&tree->lock);
1501 return ret;
1502}
1503
1504int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1505{
1506 struct rb_node *node;
1507 struct extent_state *state;
1508 int ret = 0;
1509
1510 spin_lock(&tree->lock);
1511 /*
1512 * this search will find all the extents that end after
1513 * our range starts.
1514 */
1515 node = tree_search(tree, start);
1516 if (!node) {
1517 ret = -ENOENT;
1518 goto out;
1519 }
1520 state = rb_entry(node, struct extent_state, rb_node);
1521 if (state->start != start) {
1522 ret = -ENOENT;
1523 goto out;
1524 }
1525 *private = state->private;
1526out:
1527 spin_unlock(&tree->lock);
1528 return ret;
1529}
1530
1531/*
1532 * searches a range in the state tree for a given mask.
1533 * If 'filled' == 1, this returns 1 only if every extent in the range
1534 * has the bits set. Otherwise, 1 is returned if any bit in the
1535 * range is found set.
1536 */
1537int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1538 int bits, int filled)
1539{
1540 struct extent_state *state = NULL;
1541 struct rb_node *node;
1542 int bitset = 0;
1543
1544 spin_lock(&tree->lock);
1545 node = tree_search(tree, start);
1546 while (node && start <= end) {
1547 state = rb_entry(node, struct extent_state, rb_node);
1548
1549 if (filled && state->start > start) {
1550 bitset = 0;
1551 break;
1552 }
1553
1554 if (state->start > end)
1555 break;
1556
1557 if (state->state & bits) {
1558 bitset = 1;
1559 if (!filled)
1560 break;
1561 } else if (filled) {
1562 bitset = 0;
1563 break;
1564 }
1565 start = state->end + 1;
1566 if (start > end)
1567 break;
1568 node = rb_next(node);
1569 if (!node) {
1570 if (filled)
1571 bitset = 0;
1572 break;
1573 }
1574 }
1575 spin_unlock(&tree->lock);
1576 return bitset;
1577}
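
/*
 * Example of the 'filled' semantics (hypothetical states): with
 * EXTENT_DIRTY set on [0, 4095] but not on [4096, 8191], testing
 * [0, 8191] returns 1 with filled == 0 (some byte is dirty) but 0 with
 * filled == 1 (a gap or clean state breaks full coverage).
 */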
1578
1579/*
1580 * helper function to set a given page up to date if all the
1581 * extents in the tree for that page are up to date
1582 */
1583static int check_page_uptodate(struct extent_io_tree *tree,
1584 struct page *page)
1585{
1586 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1587 u64 end = start + PAGE_CACHE_SIZE - 1;
1588 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
1589 SetPageUptodate(page);
1590 return 0;
1591}
1592
1593/*
1594 * helper function to unlock a page if all the extents in the tree
1595 * for that page are unlocked
1596 */
1597static int check_page_locked(struct extent_io_tree *tree,
1598 struct page *page)
1599{
1600 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1601 u64 end = start + PAGE_CACHE_SIZE - 1;
1602 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
1603 unlock_page(page);
1604 return 0;
1605}
1606
1607/*
1608 * helper function to end page writeback if all the extents
1609 * in the tree for that page are done with writeback
1610 */
1611static int check_page_writeback(struct extent_io_tree *tree,
1612 struct page *page)
1613{
1614 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1615 u64 end = start + PAGE_CACHE_SIZE - 1;
1616 if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
1617 end_page_writeback(page);
1618 return 0;
1619}
1620
1621/* lots and lots of room for performance fixes in the end_bio funcs */
1622
1623/*
1624 * after a writepage IO is done, we need to:
1625 * clear the uptodate bits on error
1626 * clear the writeback bits in the extent tree for this IO
1627 * end_page_writeback if the page has no more pending IO
1628 *
1629 * Scheduling is not allowed, so the extent state tree is expected
1630 * to have one and only one object corresponding to this IO.
1631 */
1632static void end_bio_extent_writepage(struct bio *bio, int err)
1633{
1634 int uptodate = err == 0;
1635 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1636 struct extent_io_tree *tree;
1637 u64 start;
1638 u64 end;
1639 int whole_page;
1640 int ret;
1641
1642 do {
1643 struct page *page = bvec->bv_page;
1644 tree = &BTRFS_I(page->mapping->host)->io_tree;
1645
1646 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1647 bvec->bv_offset;
1648 end = start + bvec->bv_len - 1;
1649
1650 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1651 whole_page = 1;
1652 else
1653 whole_page = 0;
1654
1655 if (--bvec >= bio->bi_io_vec)
1656 prefetchw(&bvec->bv_page->flags);
1657 if (tree->ops && tree->ops->writepage_end_io_hook) {
1658 ret = tree->ops->writepage_end_io_hook(page, start,
1659 end, NULL, uptodate);
1660 if (ret)
1661 uptodate = 0;
1662 }
1663
1664 if (!uptodate && tree->ops &&
1665 tree->ops->writepage_io_failed_hook) {
1666 ret = tree->ops->writepage_io_failed_hook(bio, page,
1667 start, end, NULL);
1668 if (ret == 0) {
1669 uptodate = (err == 0);
1670 continue;
1671 }
1672 }
1673
1674 if (!uptodate) {
1675 clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
1676 ClearPageUptodate(page);
1677 SetPageError(page);
1678 }
1679
1680 clear_extent_writeback(tree, start, end, GFP_ATOMIC);
1681
1682 if (whole_page)
1683 end_page_writeback(page);
1684 else
1685 check_page_writeback(tree, page);
1686 } while (bvec >= bio->bi_io_vec);
1687
1688 bio_put(bio);
1689}
1690
1691/*
1692 * after a readpage IO is done, we need to:
1693 * clear the uptodate bits on error
1694 * set the uptodate bits if things worked
1695 * set the page up to date if all extents in the tree are uptodate
1696 * clear the lock bit in the extent tree
1697 * unlock the page if there are no other extents locked for it
1698 *
1699 * Scheduling is not allowed, so the extent state tree is expected
1700 * to have one and only one object corresponding to this IO.
1701 */
1702static void end_bio_extent_readpage(struct bio *bio, int err)
1703{
1704 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1705 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1706 struct extent_io_tree *tree;
1707 u64 start;
1708 u64 end;
1709 int whole_page;
1710 int ret;
1711
1712 if (err)
1713 uptodate = 0;
1714
1715 do {
1716 struct page *page = bvec->bv_page;
1717 tree = &BTRFS_I(page->mapping->host)->io_tree;
1718
1719 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1720 bvec->bv_offset;
1721 end = start + bvec->bv_len - 1;
1722
1723 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1724 whole_page = 1;
1725 else
1726 whole_page = 0;
1727
1728 if (--bvec >= bio->bi_io_vec)
1729 prefetchw(&bvec->bv_page->flags);
1730
1731 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1732 ret = tree->ops->readpage_end_io_hook(page, start, end,
1733 NULL);
1734 if (ret)
1735 uptodate = 0;
1736 }
1737 if (!uptodate && tree->ops &&
1738 tree->ops->readpage_io_failed_hook) {
1739 ret = tree->ops->readpage_io_failed_hook(bio, page,
1740 start, end, NULL);
1741 if (ret == 0) {
1742 uptodate =
1743 test_bit(BIO_UPTODATE, &bio->bi_flags);
1744 if (err)
1745 uptodate = 0;
1746 continue;
1747 }
1748 }
1749
1750 if (uptodate) {
1751 set_extent_uptodate(tree, start, end,
1752 GFP_ATOMIC);
1753 }
1754 unlock_extent(tree, start, end, GFP_ATOMIC);
1755
1756 if (whole_page) {
1757 if (uptodate) {
1758 SetPageUptodate(page);
1759 } else {
1760 ClearPageUptodate(page);
1761 SetPageError(page);
1762 }
1763 unlock_page(page);
1764 } else {
1765 if (uptodate) {
1766 check_page_uptodate(tree, page);
1767 } else {
1768 ClearPageUptodate(page);
1769 SetPageError(page);
1770 }
1771 check_page_locked(tree, page);
1772 }
1773 } while (bvec >= bio->bi_io_vec);
1774
1775 bio_put(bio);
1776}
1777
1778/*
1779 * IO done from prepare_write is pretty simple: we just unlock
1780 * the structs in the extent tree when done, and set the uptodate bits
1781 * as appropriate.
1782 */
1783static void end_bio_extent_preparewrite(struct bio *bio, int err)
1784{
1785 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1786 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1787 struct extent_io_tree *tree;
1788 u64 start;
1789 u64 end;
1790
1791 do {
1792 struct page *page = bvec->bv_page;
1793 tree = &BTRFS_I(page->mapping->host)->io_tree;
1794
1795 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1796 bvec->bv_offset;
1797 end = start + bvec->bv_len - 1;
1798
1799 if (--bvec >= bio->bi_io_vec)
1800 prefetchw(&bvec->bv_page->flags);
1801
1802 if (uptodate) {
1803 set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1804 } else {
1805 ClearPageUptodate(page);
1806 SetPageError(page);
1807 }
1808
1809 unlock_extent(tree, start, end, GFP_ATOMIC);
1810
1811 } while (bvec >= bio->bi_io_vec);
1812
1813 bio_put(bio);
1814}
1815
1816static struct bio *
1817extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1818 gfp_t gfp_flags)
1819{
1820 struct bio *bio;
1821
1822 bio = bio_alloc(gfp_flags, nr_vecs);
1823
1824 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1825 while (!bio && (nr_vecs /= 2))
1826 bio = bio_alloc(gfp_flags, nr_vecs);
1827 }
1828
1829 if (bio) {
1830 bio->bi_size = 0;
1831 bio->bi_bdev = bdev;
1832 bio->bi_sector = first_sector;
1833 }
1834 return bio;
1835}
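
/*
 * The halving loop above is a best-effort fallback (behavior sketch):
 * when the allocation fails for a task in page reclaim (PF_MEMALLOC),
 * the request shrinks from nr_vecs to nr_vecs/2, /4, ... until a bio is
 * found or the vector count reaches zero, so writeback can still make
 * forward progress with smaller IOs.
 */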
1836
1837static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1838 unsigned long bio_flags)
1839{
1840 int ret = 0;
1841 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1842 struct page *page = bvec->bv_page;
1843 struct extent_io_tree *tree = bio->bi_private;
1844 u64 start;
1845 u64 end;
1846
1847 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1848 end = start + bvec->bv_len - 1;
1849
1850 bio->bi_private = NULL;
1851
1852 bio_get(bio);
1853
1854 if (tree->ops && tree->ops->submit_bio_hook)
1855 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1856 mirror_num, bio_flags);
1857 else
1858 submit_bio(rw, bio);
1859 if (bio_flagged(bio, BIO_EOPNOTSUPP))
1860 ret = -EOPNOTSUPP;
1861 bio_put(bio);
1862 return ret;
1863}
1864
1865static int submit_extent_page(int rw, struct extent_io_tree *tree,
1866 struct page *page, sector_t sector,
1867 size_t size, unsigned long offset,
1868 struct block_device *bdev,
1869 struct bio **bio_ret,
1870 unsigned long max_pages,
1871 bio_end_io_t end_io_func,
1872 int mirror_num,
1873 unsigned long prev_bio_flags,
1874 unsigned long bio_flags)
1875{
1876 int ret = 0;
1877 struct bio *bio;
1878 int nr;
1879 int contig = 0;
1880 int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
1881 int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
1882 size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
1883
1884 if (bio_ret && *bio_ret) {
1885 bio = *bio_ret;
1886 if (old_compressed)
1887 contig = bio->bi_sector == sector;
1888 else
1889 contig = bio->bi_sector + (bio->bi_size >> 9) ==
1890 sector;
1891
1892 if (prev_bio_flags != bio_flags || !contig ||
1893 (tree->ops && tree->ops->merge_bio_hook &&
1894 tree->ops->merge_bio_hook(page, offset, page_size, bio,
1895 bio_flags)) ||
1896 bio_add_page(bio, page, page_size, offset) < page_size) {
1897 ret = submit_one_bio(rw, bio, mirror_num,
1898 prev_bio_flags);
1899 bio = NULL;
1900 } else {
1901 return 0;
1902 }
1903 }
1904 if (this_compressed)
1905 nr = BIO_MAX_PAGES;
1906 else
1907 nr = bio_get_nr_vecs(bdev);
1908
1909 bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1910
1911 bio_add_page(bio, page, page_size, offset);
1912 bio->bi_end_io = end_io_func;
1913 bio->bi_private = tree;
1914
1915 if (bio_ret)
1916 *bio_ret = bio;
1917 else
1918 ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
1919
1920 return ret;
1921}
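
/*
 * Merging sketch: consecutive calls that pass the same *bio_ret keep
 * appending pages to one bio as long as the sector is contiguous, the
 * bio flags match, the merge hook (if any) allows it and bio_add_page()
 * accepts the page; otherwise the accumulated bio is submitted and a
 * fresh one is started for the new page.
 */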
1922
1923void set_page_extent_mapped(struct page *page)
1924{
1925 if (!PagePrivate(page)) {
1926 SetPagePrivate(page);
1927 page_cache_get(page);
1928 set_page_private(page, EXTENT_PAGE_PRIVATE);
1929 }
1930}
1931
1932static void set_page_extent_head(struct page *page, unsigned long len)
1933{
1934 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1935}
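
/*
 * Layout note: page->private packs the EXTENT_PAGE_PRIVATE_FIRST_PAGE
 * marker into the low bits together with the buffer length shifted left
 * by 2, so a single word records both "this is the first page of an
 * extent buffer" and its size.
 */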
1936
1937/*
1938 * basic readpage implementation. Locked extent state structs are inserted
1939 * into the tree; they are removed when the IO is done (by the end_io
1940 * handlers)
1941 */
1942static int __extent_read_full_page(struct extent_io_tree *tree,
1943 struct page *page,
1944 get_extent_t *get_extent,
1945 struct bio **bio, int mirror_num,
1946 unsigned long *bio_flags)
1947{
1948 struct inode *inode = page->mapping->host;
1949 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1950 u64 page_end = start + PAGE_CACHE_SIZE - 1;
1951 u64 end;
1952 u64 cur = start;
1953 u64 extent_offset;
1954 u64 last_byte = i_size_read(inode);
1955 u64 block_start;
1956 u64 cur_end;
1957 sector_t sector;
1958 struct extent_map *em;
1959 struct block_device *bdev;
1960 int ret;
1961 int nr = 0;
1962 size_t page_offset = 0;
1963 size_t iosize;
1964 size_t disk_io_size;
1965 size_t blocksize = inode->i_sb->s_blocksize;
1966 unsigned long this_bio_flag = 0;
1967
1968 set_page_extent_mapped(page);
1969
1970 end = page_end;
1971 lock_extent(tree, start, end, GFP_NOFS);
1972
1973 if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
1974 char *userpage;
1975 size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
1976
1977 if (zero_offset) {
1978 iosize = PAGE_CACHE_SIZE - zero_offset;
1979 userpage = kmap_atomic(page, KM_USER0);
1980 memset(userpage + zero_offset, 0, iosize);
1981 flush_dcache_page(page);
1982 kunmap_atomic(userpage, KM_USER0);
1983 }
1984 }
1985 while (cur <= end) {
1986 if (cur >= last_byte) {
1987 char *userpage;
1988 iosize = PAGE_CACHE_SIZE - page_offset;
1989 userpage = kmap_atomic(page, KM_USER0);
1990 memset(userpage + page_offset, 0, iosize);
1991 flush_dcache_page(page);
1992 kunmap_atomic(userpage, KM_USER0);
1993 set_extent_uptodate(tree, cur, cur + iosize - 1,
1994 GFP_NOFS);
1995 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1996 break;
1997 }
1998 em = get_extent(inode, page, page_offset, cur,
1999 end - cur + 1, 0);
2000 if (IS_ERR(em) || !em) {
2001 SetPageError(page);
2002 unlock_extent(tree, cur, end, GFP_NOFS);
2003 break;
2004 }
2005 extent_offset = cur - em->start;
2006 BUG_ON(extent_map_end(em) <= cur);
2007 BUG_ON(end < cur);
2008
2009 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2010 this_bio_flag = EXTENT_BIO_COMPRESSED;
2011
2012 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2013 cur_end = min(extent_map_end(em) - 1, end);
2014 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2015 if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2016 disk_io_size = em->block_len;
2017 sector = em->block_start >> 9;
2018 } else {
2019 sector = (em->block_start + extent_offset) >> 9;
2020 disk_io_size = iosize;
2021 }
2022 bdev = em->bdev;
2023 block_start = em->block_start;
2024 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2025 block_start = EXTENT_MAP_HOLE;
2026 free_extent_map(em);
2027 em = NULL;
2028
2029 /* we've found a hole, just zero and go on */
2030 if (block_start == EXTENT_MAP_HOLE) {
2031 char *userpage;
2032 userpage = kmap_atomic(page, KM_USER0);
2033 memset(userpage + page_offset, 0, iosize);
2034 flush_dcache_page(page);
2035 kunmap_atomic(userpage, KM_USER0);
2036
2037 set_extent_uptodate(tree, cur, cur + iosize - 1,
2038 GFP_NOFS);
2039 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2040 cur = cur + iosize;
2041 page_offset += iosize;
2042 continue;
2043 }
2044 /* the get_extent function already copied into the page */
2045 if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
2046 check_page_uptodate(tree, page);
2047 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2048 cur = cur + iosize;
2049 page_offset += iosize;
2050 continue;
2051 }
2052 /* we have an inline extent but it didn't get marked up
2053 * to date. Error out
2054 */
2055 if (block_start == EXTENT_MAP_INLINE) {
2056 SetPageError(page);
2057 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2058 cur = cur + iosize;
2059 page_offset += iosize;
2060 continue;
2061 }
2062
2063 ret = 0;
2064 if (tree->ops && tree->ops->readpage_io_hook) {
2065 ret = tree->ops->readpage_io_hook(page, cur,
2066 cur + iosize - 1);
2067 }
2068 if (!ret) {
2069 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2070 pnr -= page->index;
2071 ret = submit_extent_page(READ, tree, page,
2072 sector, disk_io_size, page_offset,
2073 bdev, bio, pnr,
2074 end_bio_extent_readpage, mirror_num,
2075 *bio_flags,
2076 this_bio_flag);
2077 nr++;
2078 *bio_flags = this_bio_flag;
2079 }
2080 if (ret)
2081 SetPageError(page);
2082 cur = cur + iosize;
2083 page_offset += iosize;
2084 }
2085 if (!nr) {
2086 if (!PageError(page))
2087 SetPageUptodate(page);
2088 unlock_page(page);
2089 }
2090 return 0;
2091}
2092
2093int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2094 get_extent_t *get_extent)
2095{
2096 struct bio *bio = NULL;
2097 unsigned long bio_flags = 0;
2098 int ret;
2099
2100 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2101 &bio_flags);
2102 if (bio)
2103 submit_one_bio(READ, bio, 0, bio_flags);
2104 return ret;
2105}
2106
2107/*
2108 * the writepage semantics are similar to regular writepage. extent
2109 * records are inserted to lock ranges in the tree, and as dirty areas
2110 * are found, they are marked writeback. Then the lock bits are removed
2111 * and the end_io handler clears the writeback ranges
2112 */
2113static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2114 void *data)
2115{
2116 struct inode *inode = page->mapping->host;
2117 struct extent_page_data *epd = data;
2118 struct extent_io_tree *tree = epd->tree;
2119 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2120 u64 delalloc_start;
2121 u64 page_end = start + PAGE_CACHE_SIZE - 1;
2122 u64 end;
2123 u64 cur = start;
2124 u64 extent_offset;
2125 u64 last_byte = i_size_read(inode);
2126 u64 block_start;
2127 u64 iosize;
2128 u64 unlock_start;
2129 sector_t sector;
2130 struct extent_map *em;
2131 struct block_device *bdev;
2132 int ret;
2133 int nr = 0;
2134 size_t pg_offset = 0;
2135 size_t blocksize;
2136 loff_t i_size = i_size_read(inode);
2137 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2138 u64 nr_delalloc;
2139 u64 delalloc_end;
2140 int page_started;
2141 int compressed;
2142 unsigned long nr_written = 0;
2143
2144 WARN_ON(!PageLocked(page));
2145 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2146 if (page->index > end_index ||
2147 (page->index == end_index && !pg_offset)) {
2148 page->mapping->a_ops->invalidatepage(page, 0);
2149 unlock_page(page);
2150 return 0;
2151 }
2152
2153 if (page->index == end_index) {
2154 char *userpage;
2155
2156 userpage = kmap_atomic(page, KM_USER0);
2157 memset(userpage + pg_offset, 0,
2158 PAGE_CACHE_SIZE - pg_offset);
2159 kunmap_atomic(userpage, KM_USER0);
2160 flush_dcache_page(page);
2161 }
2162 pg_offset = 0;
2163
2164 set_page_extent_mapped(page);
2165
2166 delalloc_start = start;
2167 delalloc_end = 0;
2168 page_started = 0;
2169 if (!epd->extent_locked) {
2170 while (delalloc_end < page_end) {
2171 nr_delalloc = find_lock_delalloc_range(inode, tree,
2172 page,
2173 &delalloc_start,
2174 &delalloc_end,
2175 128 * 1024 * 1024);
2176 if (nr_delalloc == 0) {
2177 delalloc_start = delalloc_end + 1;
2178 continue;
2179 }
2180 tree->ops->fill_delalloc(inode, page, delalloc_start,
2181 delalloc_end, &page_started,
2182 &nr_written);
2183 delalloc_start = delalloc_end + 1;
2184 }
2185
2186 /* did the fill delalloc function already unlock and start
2187 * the IO?
2188 */
2189 if (page_started) {
2190 ret = 0;
2191 goto update_nr_written;
2192 }
2193 }
2194 lock_extent(tree, start, page_end, GFP_NOFS);
2195
2196 unlock_start = start;
2197
2198 if (tree->ops && tree->ops->writepage_start_hook) {
2199 ret = tree->ops->writepage_start_hook(page, start,
2200 page_end);
2201 if (ret == -EAGAIN) {
2202 unlock_extent(tree, start, page_end, GFP_NOFS);
2203 redirty_page_for_writepage(wbc, page);
2204 unlock_page(page);
2205 ret = 0;
2206 goto update_nr_written;
2207 }
2208 }
2209
2210 nr_written++;
2211
2212 end = page_end;
2213 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
2214 printk(KERN_ERR "btrfs delalloc bits after lock_extent\n");
2215
2216 if (last_byte <= start) {
2217 clear_extent_dirty(tree, start, page_end, GFP_NOFS);
2218 unlock_extent(tree, start, page_end, GFP_NOFS);
2219 if (tree->ops && tree->ops->writepage_end_io_hook)
2220 tree->ops->writepage_end_io_hook(page, start,
2221 page_end, NULL, 1);
2222 unlock_start = page_end + 1;
2223 goto done;
2224 }
2225
2226 set_extent_uptodate(tree, start, page_end, GFP_NOFS);
2227 blocksize = inode->i_sb->s_blocksize;
2228
2229 while (cur <= end) {
2230 if (cur >= last_byte) {
2231 clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
2232 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2233 if (tree->ops && tree->ops->writepage_end_io_hook)
2234 tree->ops->writepage_end_io_hook(page, cur,
2235 page_end, NULL, 1);
2236 unlock_start = page_end + 1;
2237 break;
2238 }
2239 em = epd->get_extent(inode, page, pg_offset, cur,
2240 end - cur + 1, 1);
2241 if (IS_ERR(em) || !em) {
2242 SetPageError(page);
2243 break;
2244 }
2245
2246 extent_offset = cur - em->start;
2247 BUG_ON(extent_map_end(em) <= cur);
2248 BUG_ON(end < cur);
2249 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2250 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2251 sector = (em->block_start + extent_offset) >> 9;
2252 bdev = em->bdev;
2253 block_start = em->block_start;
2254 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2255 free_extent_map(em);
2256 em = NULL;
2257
2258 /*
2259 * compressed and inline extents are written through other
2260 * paths in the FS
2261 */
2262 if (compressed || block_start == EXTENT_MAP_HOLE ||
2263 block_start == EXTENT_MAP_INLINE) {
2264 clear_extent_dirty(tree, cur,
2265 cur + iosize - 1, GFP_NOFS);
2266
2267 unlock_extent(tree, unlock_start, cur + iosize - 1,
2268 GFP_NOFS);
2269
2270 /*
2271 * end_io notification does not happen here for
2272 * compressed extents
2273 */
2274 if (!compressed && tree->ops &&
2275 tree->ops->writepage_end_io_hook)
2276 tree->ops->writepage_end_io_hook(page, cur,
2277 cur + iosize - 1,
2278 NULL, 1);
2279 else if (compressed) {
2280 /* we don't want to end_page_writeback on
2281 * a compressed extent. this happens
2282 * elsewhere
2283 */
2284 nr++;
2285 }
2286
2287 cur += iosize;
2288 pg_offset += iosize;
2289 unlock_start = cur;
2290 continue;
2291 }
2292 /* leave this out until we have a page_mkwrite call */
2293 if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2294 EXTENT_DIRTY, 0)) {
2295 cur = cur + iosize;
2296 pg_offset += iosize;
2297 continue;
2298 }
2299
2300 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
2301 if (tree->ops && tree->ops->writepage_io_hook) {
2302 ret = tree->ops->writepage_io_hook(page, cur,
2303 cur + iosize - 1);
2304 } else {
2305 ret = 0;
2306 }
2307 if (ret) {
2308 SetPageError(page);
2309 } else {
2310 unsigned long max_nr = end_index + 1;
2311
2312 set_range_writeback(tree, cur, cur + iosize - 1);
2313 if (!PageWriteback(page)) {
2314 printk(KERN_ERR "btrfs warning page %lu not "
2315 "writeback, cur %llu end %llu\n",
2316 page->index, (unsigned long long)cur,
2317 (unsigned long long)end);
2318 }
2319
2320 ret = submit_extent_page(WRITE, tree, page, sector,
2321 iosize, pg_offset, bdev,
2322 &epd->bio, max_nr,
2323 end_bio_extent_writepage,
2324 0, 0, 0);
2325 if (ret)
2326 SetPageError(page);
2327 }
2328 cur = cur + iosize;
2329 pg_offset += iosize;
2330 nr++;
2331 }
2332done:
2333 if (nr == 0) {
2334 /* make sure the mapping tag for page dirty gets cleared */
2335 set_page_writeback(page);
2336 end_page_writeback(page);
2337 }
2338 if (unlock_start <= page_end)
2339 unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2340 unlock_page(page);
2341
2342update_nr_written:
2343 wbc->nr_to_write -= nr_written;
2344 if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2345 wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2346 page->mapping->writeback_index = page->index + nr_written;
2347 return 0;
2348}
2349
2350/**
2351 * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2352 * @mapping: address space structure to write
2353 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2354 * @writepage: function called for each page
2355 * @data: data passed to writepage function
2356 *
2357 * If a page is already under I/O, write_cache_pages() skips it, even
2358 * if it's dirty. This is desirable behaviour for memory-cleaning writeback,
2359 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
2360 * and msync() need to guarantee that all the data which was dirty at the time
2361 * the call was made get new I/O started against them. If wbc->sync_mode is
2362 * WB_SYNC_ALL then we were called for data integrity and we must wait for
2363 * existing IO to complete.
2364 */
2365static int extent_write_cache_pages(struct extent_io_tree *tree,
2366 struct address_space *mapping,
2367 struct writeback_control *wbc,
2368 writepage_t writepage, void *data,
2369 void (*flush_fn)(void *))
2370{
2371 struct backing_dev_info *bdi = mapping->backing_dev_info;
2372 int ret = 0;
2373 int done = 0;
2374 struct pagevec pvec;
2375 int nr_pages;
2376 pgoff_t index;
2377 pgoff_t end; /* Inclusive */
2378 int scanned = 0;
2379 int range_whole = 0;
2380
2381 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2382 wbc->encountered_congestion = 1;
2383 return 0;
2384 }
2385
2386 pagevec_init(&pvec, 0);
2387 if (wbc->range_cyclic) {
2388 index = mapping->writeback_index; /* Start from prev offset */
2389 end = -1;
2390 } else {
2391 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2392 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2393 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2394 range_whole = 1;
2395 scanned = 1;
2396 }
2397retry:
2398 while (!done && (index <= end) &&
2399 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2400 PAGECACHE_TAG_DIRTY, min(end - index,
2401 (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2402 unsigned i;
2403
2404 scanned = 1;
2405 for (i = 0; i < nr_pages; i++) {
2406 struct page *page = pvec.pages[i];
2407
2408 /*
2409 * At this point we hold neither mapping->tree_lock nor
2410 * lock on the page itself: the page may be truncated or
2411 * invalidated (changing page->mapping to NULL), or even
2412 * swizzled back from swapper_space to tmpfs file
2413 * mapping
2414 */
2415 if (tree->ops && tree->ops->write_cache_pages_lock_hook)
2416 tree->ops->write_cache_pages_lock_hook(page);
2417 else
2418 lock_page(page);
2419
2420 if (unlikely(page->mapping != mapping)) {
2421 unlock_page(page);
2422 continue;
2423 }
2424
2425 if (!wbc->range_cyclic && page->index > end) {
2426 done = 1;
2427 unlock_page(page);
2428 continue;
2429 }
2430
2431 if (wbc->sync_mode != WB_SYNC_NONE) {
2432 if (PageWriteback(page))
2433 flush_fn(data);
2434 wait_on_page_writeback(page);
2435 }
2436
2437 if (PageWriteback(page) ||
2438 !clear_page_dirty_for_io(page)) {
2439 unlock_page(page);
2440 continue;
2441 }
2442
2443 ret = (*writepage)(page, wbc, data);
2444
2445 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2446 unlock_page(page);
2447 ret = 0;
2448 }
2449 if (ret || wbc->nr_to_write <= 0)
2450 done = 1;
2451 if (wbc->nonblocking && bdi_write_congested(bdi)) {
2452 wbc->encountered_congestion = 1;
2453 done = 1;
2454 }
2455 }
2456 pagevec_release(&pvec);
2457 cond_resched();
2458 }
2459 if (!scanned && !done) {
2460 /*
2461 * We hit the last page and there is more work to be done: wrap
2462 * back to the start of the file
2463 */
2464 scanned = 1;
2465 index = 0;
2466 goto retry;
2467 }
2468 return ret;
2469}
2470
2471static noinline void flush_write_bio(void *data)
2472{
2473 struct extent_page_data *epd = data;
2474 if (epd->bio) {
2475 submit_one_bio(WRITE, epd->bio, 0, 0);
2476 epd->bio = NULL;
2477 }
2478}
2479
2480int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2481 get_extent_t *get_extent,
2482 struct writeback_control *wbc)
2483{
2484 int ret;
2485 struct address_space *mapping = page->mapping;
2486 struct extent_page_data epd = {
2487 .bio = NULL,
2488 .tree = tree,
2489 .get_extent = get_extent,
2490 .extent_locked = 0,
2491 };
2492 struct writeback_control wbc_writepages = {
2493 .bdi = wbc->bdi,
2494 .sync_mode = WB_SYNC_NONE,
2495 .older_than_this = NULL,
2496 .nr_to_write = 64,
2497 .range_start = page_offset(page) + PAGE_CACHE_SIZE,
2498 .range_end = (loff_t)-1,
2499 };
2500
2501
2502 ret = __extent_writepage(page, wbc, &epd);
2503
2504 extent_write_cache_pages(tree, mapping, &wbc_writepages,
2505 __extent_writepage, &epd, flush_write_bio);
2506 if (epd.bio)
2507 submit_one_bio(WRITE, epd.bio, 0, 0);
2508 return ret;
2509}
2510
2511int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2512 u64 start, u64 end, get_extent_t *get_extent,
2513 int mode)
2514{
2515 int ret = 0;
2516 struct address_space *mapping = inode->i_mapping;
2517 struct page *page;
2518 unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
2519 PAGE_CACHE_SHIFT;
2520
2521 struct extent_page_data epd = {
2522 .bio = NULL,
2523 .tree = tree,
2524 .get_extent = get_extent,
2525 .extent_locked = 1,
2526 };
2527 struct writeback_control wbc_writepages = {
2528 .bdi = inode->i_mapping->backing_dev_info,
2529 .sync_mode = mode,
2530 .older_than_this = NULL,
2531 .nr_to_write = nr_pages * 2,
2532 .range_start = start,
2533 .range_end = end + 1,
2534 };
2535
2536 while (start <= end) {
2537 page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
2538 if (clear_page_dirty_for_io(page))
2539 ret = __extent_writepage(page, &wbc_writepages, &epd);
2540 else {
2541 if (tree->ops && tree->ops->writepage_end_io_hook)
2542 tree->ops->writepage_end_io_hook(page, start,
2543 start + PAGE_CACHE_SIZE - 1,
2544 NULL, 1);
2545 unlock_page(page);
2546 }
2547 page_cache_release(page);
2548 start += PAGE_CACHE_SIZE;
2549 }
2550
2551 if (epd.bio)
2552 submit_one_bio(WRITE, epd.bio, 0, 0);
2553 return ret;
2554}
2555
2556int extent_writepages(struct extent_io_tree *tree,
2557 struct address_space *mapping,
2558 get_extent_t *get_extent,
2559 struct writeback_control *wbc)
2560{
2561 int ret = 0;
2562 struct extent_page_data epd = {
2563 .bio = NULL,
2564 .tree = tree,
2565 .get_extent = get_extent,
2566 .extent_locked = 0,
2567 };
2568
2569 ret = extent_write_cache_pages(tree, mapping, wbc,
2570 __extent_writepage, &epd,
2571 flush_write_bio);
2572 if (epd.bio)
2573 submit_one_bio(WRITE, epd.bio, 0, 0);
2574 return ret;
2575}
2576
2577int extent_readpages(struct extent_io_tree *tree,
2578 struct address_space *mapping,
2579 struct list_head *pages, unsigned nr_pages,
2580 get_extent_t get_extent)
2581{
2582 struct bio *bio = NULL;
2583 unsigned page_idx;
2584 struct pagevec pvec;
2585 unsigned long bio_flags = 0;
2586
2587 pagevec_init(&pvec, 0);
2588 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2589 struct page *page = list_entry(pages->prev, struct page, lru);
2590
2591 prefetchw(&page->flags);
2592 list_del(&page->lru);
2593 /*
2594 * what we want to do here is call add_to_page_cache_lru,
2595 * but that isn't exported, so we reproduce it here
2596 */
2597 if (!add_to_page_cache(page, mapping,
2598 page->index, GFP_KERNEL)) {
2599
2600 /* open coding of lru_cache_add, also not exported */
2601 page_cache_get(page);
2602 if (!pagevec_add(&pvec, page))
2603 __pagevec_lru_add_file(&pvec);
2604 __extent_read_full_page(tree, page, get_extent,
2605 &bio, 0, &bio_flags);
2606 }
2607 page_cache_release(page);
2608 }
2609 if (pagevec_count(&pvec))
2610 __pagevec_lru_add_file(&pvec);
2611 BUG_ON(!list_empty(pages));
2612 if (bio)
2613 submit_one_bio(READ, bio, 0, bio_flags);
2614 return 0;
2615}
2616
2617/*
2618 * basic invalidatepage code, this waits on any locked or writeback
2619 * ranges corresponding to the page, and then deletes any extent state
2620 * records from the tree
2621 */
2622int extent_invalidatepage(struct extent_io_tree *tree,
2623 struct page *page, unsigned long offset)
2624{
2625 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2626 u64 end = start + PAGE_CACHE_SIZE - 1;
2627 size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2628
2629 start += (offset + blocksize - 1) & ~(blocksize - 1);
2630 if (start > end)
2631 return 0;
2632
2633 lock_extent(tree, start, end, GFP_NOFS);
2634 wait_on_extent_writeback(tree, start, end);
2635 clear_extent_bit(tree, start, end,
2636 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
2637 1, 1, GFP_NOFS);
2638 return 0;
2639}
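/*
 * A worked example of the rounding above, assuming 4K pages and a
 * 1K blocksize: invalidating from offset 100 on page index 0 gives
 * start += (100 + 1023) & ~1023 == 1024, so state is cleared for
 * bytes 1024..4095 only and the partially valid first block keeps
 * its extent state records.
 */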
2640
2641/*
2642 * simple commit_write call, set_range_dirty is used to mark both
2643 * the pages and the extent records as dirty
2644 */
2645int extent_commit_write(struct extent_io_tree *tree,
2646 struct inode *inode, struct page *page,
2647 unsigned from, unsigned to)
2648{
2649 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2650
2651 set_page_extent_mapped(page);
2652 set_page_dirty(page);
2653
2654 if (pos > inode->i_size) {
2655 i_size_write(inode, pos);
2656 mark_inode_dirty(inode);
2657 }
2658 return 0;
2659}
2660
2661int extent_prepare_write(struct extent_io_tree *tree,
2662 struct inode *inode, struct page *page,
2663 unsigned from, unsigned to, get_extent_t *get_extent)
2664{
2665 u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2666 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2667 u64 block_start;
2668 u64 orig_block_start;
2669 u64 block_end;
2670 u64 cur_end;
2671 struct extent_map *em;
2672 unsigned blocksize = 1 << inode->i_blkbits;
2673 size_t page_offset = 0;
2674 size_t block_off_start;
2675 size_t block_off_end;
2676 int err = 0;
2677 int iocount = 0;
2678 int ret = 0;
2679 int isnew;
2680
2681 set_page_extent_mapped(page);
2682
2683 block_start = (page_start + from) & ~((u64)blocksize - 1);
2684 block_end = (page_start + to - 1) | (blocksize - 1);
2685 orig_block_start = block_start;
2686
2687 lock_extent(tree, page_start, page_end, GFP_NOFS);
2688 while (block_start <= block_end) {
2689 em = get_extent(inode, page, page_offset, block_start,
2690 block_end - block_start + 1, 1);
2691 if (IS_ERR(em) || !em) {
		err = IS_ERR(em) ? PTR_ERR(em) : -EIO;
2692 goto err;
	}
2693
2694 cur_end = min(block_end, extent_map_end(em) - 1);
2695 block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2696 block_off_end = block_off_start + blocksize;
2697 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2698
2699 if (!PageUptodate(page) && isnew &&
2700 (block_off_end > to || block_off_start < from)) {
2701 void *kaddr;
2702
2703 kaddr = kmap_atomic(page, KM_USER0);
2704 if (block_off_end > to)
2705 memset(kaddr + to, 0, block_off_end - to);
2706 if (block_off_start < from)
2707 memset(kaddr + block_off_start, 0,
2708 from - block_off_start);
2709 flush_dcache_page(page);
2710 kunmap_atomic(kaddr, KM_USER0);
2711 }
2712 if ((em->block_start != EXTENT_MAP_HOLE &&
2713 em->block_start != EXTENT_MAP_INLINE) &&
2714 !isnew && !PageUptodate(page) &&
2715 (block_off_end > to || block_off_start < from) &&
2716 !test_range_bit(tree, block_start, cur_end,
2717 EXTENT_UPTODATE, 1)) {
2718 u64 sector;
2719 u64 extent_offset = block_start - em->start;
2720 size_t iosize;
2721 sector = (em->block_start + extent_offset) >> 9;
2722 iosize = (cur_end - block_start + blocksize) &
2723 ~((u64)blocksize - 1);
2724 /*
2725 * we've already got the extent locked, but we
2726 * need to split the state such that our end_bio
2727 * handler can clear the lock.
2728 */
2729 set_extent_bit(tree, block_start,
2730 block_start + iosize - 1,
2731 EXTENT_LOCKED, 0, NULL, GFP_NOFS);
2732 ret = submit_extent_page(READ, tree, page,
2733 sector, iosize, page_offset, em->bdev,
2734 NULL, 1,
2735 end_bio_extent_preparewrite, 0,
2736 0, 0);
2737 iocount++;
2738 block_start = block_start + iosize;
2739 } else {
2740 set_extent_uptodate(tree, block_start, cur_end,
2741 GFP_NOFS);
2742 unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2743 block_start = cur_end + 1;
2744 }
2745 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2746 free_extent_map(em);
2747 }
2748 if (iocount) {
2749 wait_extent_bit(tree, orig_block_start,
2750 block_end, EXTENT_LOCKED);
2751 }
2752 check_page_uptodate(tree, page);
2753err:
2754 /* FIXME, zero out newly allocated blocks on error */
2755 return err;
2756}
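/*
 * A worked example of the partial-block zeroing above, assuming 4K
 * pages with a 4K blocksize: preparing bytes 1000..2999 of a fresh
 * block (from == 1000, to == 3000) gives block_off_start == 0 and
 * block_off_end == 4096, so [3000, 4096) and [0, 1000) are zeroed
 * while the caller's range is left untouched for the copy-in.
 */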
2757
2758/*
2759 * a helper for releasepage, this tests for areas of the page that
2760 * are locked or under IO and drops the related state bits if it is safe
2761 * to drop the page.
2762 */
2763int try_release_extent_state(struct extent_map_tree *map,
2764 struct extent_io_tree *tree, struct page *page,
2765 gfp_t mask)
2766{
2767 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2768 u64 end = start + PAGE_CACHE_SIZE - 1;
2769 int ret = 1;
2770
2771 if (test_range_bit(tree, start, end,
2772 EXTENT_IOBITS | EXTENT_ORDERED, 0))
2773 ret = 0;
2774 else {
2775 if ((mask & GFP_NOFS) == GFP_NOFS)
2776 mask = GFP_NOFS;
2777 clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
2778 1, 1, mask);
2779 }
2780 return ret;
2781}
2782
2783/*
2784 * a helper for releasepage. As long as there are no locked extents
2785 * in the range corresponding to the page, both state records and extent
2786 * map records are removed
2787 */
2788int try_release_extent_mapping(struct extent_map_tree *map,
2789 struct extent_io_tree *tree, struct page *page,
2790 gfp_t mask)
2791{
2792 struct extent_map *em;
2793 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2794 u64 end = start + PAGE_CACHE_SIZE - 1;
2795
2796 if ((mask & __GFP_WAIT) &&
2797 page->mapping->host->i_size > 16 * 1024 * 1024) {
2798 u64 len;
2799 while (start <= end) {
2800 len = end - start + 1;
2801 spin_lock(&map->lock);
2802 em = lookup_extent_mapping(map, start, len);
2803 if (!em || IS_ERR(em)) {
2804 spin_unlock(&map->lock);
2805 break;
2806 }
2807 if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2808 em->start != start) {
2809 spin_unlock(&map->lock);
2810 free_extent_map(em);
2811 break;
2812 }
2813 if (!test_range_bit(tree, em->start,
2814 extent_map_end(em) - 1,
2815 EXTENT_LOCKED | EXTENT_WRITEBACK |
2816 EXTENT_ORDERED,
2817 0)) {
2818 remove_extent_mapping(map, em);
2819 /* once for the rb tree */
2820 free_extent_map(em);
2821 }
2822 start = extent_map_end(em);
2823 spin_unlock(&map->lock);
2824
2825 /* once for us */
2826 free_extent_map(em);
2827 }
2828 }
2829 return try_release_extent_state(map, tree, page, mask);
2830}
2831
2832sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2833 get_extent_t *get_extent)
2834{
2835 struct inode *inode = mapping->host;
2836 u64 start = iblock << inode->i_blkbits;
2837 sector_t sector = 0;
2838 size_t blksize = (1 << inode->i_blkbits);
2839 struct extent_map *em;
2840
2841 lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2842 GFP_NOFS);
2843 em = get_extent(inode, NULL, 0, start, blksize, 0);
2844 unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2845 GFP_NOFS);
2846 if (!em || IS_ERR(em))
2847 return 0;
2848
2849 if (em->block_start > EXTENT_MAP_LAST_BYTE)
2850 goto out;
2851
2852 sector = (em->block_start + start - em->start) >> inode->i_blkbits;
2853out:
2854 free_extent_map(em);
2855 return sector;
2856}
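/*
 * Sector math sketch for the bmap above, assuming 4K blocks
 * (i_blkbits == 12): with em->start == 8192 and
 * em->block_start == 1048576, iblock 3 (start == 12288) yields
 * (1048576 + 12288 - 8192) >> 12 == 257 -- a block number in
 * filesystem blocksize units, not 512-byte sectors.
 */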
2857
2858static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2859 unsigned long i)
2860{
2861 struct page *p;
2862 struct address_space *mapping;
2863
2864 if (i == 0)
2865 return eb->first_page;
2866 i += eb->start >> PAGE_CACHE_SHIFT;
2867 mapping = eb->first_page->mapping;
2868 if (!mapping)
2869 return NULL;
2870
2871 /*
2872 * extent_buffer_page is only called after pinning the page
2873 * by increasing the reference count. So we know the page must
2874 * be in the radix tree.
2875 */
2876 rcu_read_lock();
2877 p = radix_tree_lookup(&mapping->page_tree, i);
2878 rcu_read_unlock();
2879
2880 return p;
2881}
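/*
 * Index math sketch, assuming 4K pages: for eb->start == 0x5000,
 * extent_buffer_page(eb, 1) looks up radix slot
 * 1 + (0x5000 >> 12) == 6, i.e. the page covering bytes
 * 0x6000..0x6fff of the btree inode's mapping.
 */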
2882
2883static inline unsigned long num_extent_pages(u64 start, u64 len)
2884{
2885 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2886 (start >> PAGE_CACHE_SHIFT);
2887}
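/*
 * e.g. with 4K pages, num_extent_pages(12288, 8192) is
 * ((12288 + 8192 + 4095) >> 12) - (12288 >> 12) == 5 - 3 == 2,
 * while the unaligned num_extent_pages(12288, 8193) spills into a
 * third page and returns 3.
 */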
2888
2889static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2890 u64 start,
2891 unsigned long len,
2892 gfp_t mask)
2893{
2894 struct extent_buffer *eb = NULL;
2895#ifdef LEAK_DEBUG
2896 unsigned long flags;
2897#endif
2898
2899 eb = kmem_cache_zalloc(extent_buffer_cache, mask);
	if (!eb)
		return NULL;
2900 eb->start = start;
2901 eb->len = len;
2902 mutex_init(&eb->mutex);
2903#ifdef LEAK_DEBUG
2904 spin_lock_irqsave(&leak_lock, flags);
2905 list_add(&eb->leak_list, &buffers);
2906 spin_unlock_irqrestore(&leak_lock, flags);
2907#endif
2908 atomic_set(&eb->refs, 1);
2909
2910 return eb;
2911}
2912
2913static void __free_extent_buffer(struct extent_buffer *eb)
2914{
2915#ifdef LEAK_DEBUG
2916 unsigned long flags;
2917 spin_lock_irqsave(&leak_lock, flags);
2918 list_del(&eb->leak_list);
2919 spin_unlock_irqrestore(&leak_lock, flags);
2920#endif
2921 kmem_cache_free(extent_buffer_cache, eb);
2922}
2923
2924struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
2925 u64 start, unsigned long len,
2926 struct page *page0,
2927 gfp_t mask)
2928{
2929 unsigned long num_pages = num_extent_pages(start, len);
2930 unsigned long i;
2931 unsigned long index = start >> PAGE_CACHE_SHIFT;
2932 struct extent_buffer *eb;
2933 struct extent_buffer *exists = NULL;
2934 struct page *p;
2935 struct address_space *mapping = tree->mapping;
2936 int uptodate = 1;
2937
2938 spin_lock(&tree->buffer_lock);
2939 eb = buffer_search(tree, start);
2940 if (eb) {
2941 atomic_inc(&eb->refs);
2942 spin_unlock(&tree->buffer_lock);
2943 mark_page_accessed(eb->first_page);
2944 return eb;
2945 }
2946 spin_unlock(&tree->buffer_lock);
2947
2948 eb = __alloc_extent_buffer(tree, start, len, mask);
2949 if (!eb)
2950 return NULL;
2951
2952 if (page0) {
2953 eb->first_page = page0;
2954 i = 1;
2955 index++;
2956 page_cache_get(page0);
2957 mark_page_accessed(page0);
2958 set_page_extent_mapped(page0);
2959 set_page_extent_head(page0, len);
2960 uptodate = PageUptodate(page0);
2961 } else {
2962 i = 0;
2963 }
2964 for (; i < num_pages; i++, index++) {
2965 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
2966 if (!p) {
2967 WARN_ON(1);
2968 goto free_eb;
2969 }
2970 set_page_extent_mapped(p);
2971 mark_page_accessed(p);
2972 if (i == 0) {
2973 eb->first_page = p;
2974 set_page_extent_head(p, len);
2975 } else {
2976 set_page_private(p, EXTENT_PAGE_PRIVATE);
2977 }
2978 if (!PageUptodate(p))
2979 uptodate = 0;
2980 unlock_page(p);
2981 }
2982 if (uptodate)
2983 eb->flags |= EXTENT_UPTODATE;
2984 eb->flags |= EXTENT_BUFFER_FILLED;
2985
2986 spin_lock(&tree->buffer_lock);
2987 exists = buffer_tree_insert(tree, start, &eb->rb_node);
2988 if (exists) {
2989 /* add one reference for the caller */
2990 atomic_inc(&exists->refs);
2991 spin_unlock(&tree->buffer_lock);
2992 goto free_eb;
2993 }
2994 spin_unlock(&tree->buffer_lock);
2995
2996 /* add one reference for the tree */
2997 atomic_inc(&eb->refs);
2998 return eb;
2999
3000free_eb:
3001 if (!atomic_dec_and_test(&eb->refs))
3002 return exists;
3003 for (index = 1; index < i; index++)
3004 page_cache_release(extent_buffer_page(eb, index));
3005 page_cache_release(extent_buffer_page(eb, 0));
3006 __free_extent_buffer(eb);
3007 return exists;
3008}
3009
3010struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3011 u64 start, unsigned long len,
3012 gfp_t mask)
3013{
3014 struct extent_buffer *eb;
3015
3016 spin_lock(&tree->buffer_lock);
3017 eb = buffer_search(tree, start);
3018 if (eb)
3019 atomic_inc(&eb->refs);
3020 spin_unlock(&tree->buffer_lock);
3021
3022 if (eb)
3023 mark_page_accessed(eb->first_page);
3024
3025 return eb;
3026}
3027
3028void free_extent_buffer(struct extent_buffer *eb)
3029{
3030 if (!eb)
3031 return;
3032
3033 if (!atomic_dec_and_test(&eb->refs))
3034 return;
3035
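	/*
	 * the tree itself holds a reference, so the count reaching zero
	 * here (instead of in try_release_extent_buffer) means the ref
	 * accounting is broken -- warn loudly.
	 */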
3036 WARN_ON(1);
3037}
3038
3039int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3040 struct extent_buffer *eb)
3041{
3042 int set;
3043 unsigned long i;
3044 unsigned long num_pages;
3045 struct page *page;
3046
3047 u64 start = eb->start;
3048 u64 end = start + eb->len - 1;
3049
3050 set = clear_extent_dirty(tree, start, end, GFP_NOFS);
3051 num_pages = num_extent_pages(eb->start, eb->len);
3052
3053 for (i = 0; i < num_pages; i++) {
3054 page = extent_buffer_page(eb, i);
3055 if (!set && !PageDirty(page))
3056 continue;
3057
3058 lock_page(page);
3059 if (i == 0)
3060 set_page_extent_head(page, eb->len);
3061 else
3062 set_page_private(page, EXTENT_PAGE_PRIVATE);
3063
3064 /*
3065 * if we're on the last page or the first page and the
3066 * block isn't aligned on a page boundary, do extra checks
3067 * to make sure we don't clean page that is partially dirty
3068 */
3069 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3070 ((i == num_pages - 1) &&
3071 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3072 start = (u64)page->index << PAGE_CACHE_SHIFT;
3073 end = start + PAGE_CACHE_SIZE - 1;
3074 if (test_range_bit(tree, start, end,
3075 EXTENT_DIRTY, 0)) {
3076 unlock_page(page);
3077 continue;
3078 }
3079 }
3080 clear_page_dirty_for_io(page);
3081 spin_lock_irq(&page->mapping->tree_lock);
3082 if (!PageDirty(page)) {
3083 radix_tree_tag_clear(&page->mapping->page_tree,
3084 page_index(page),
3085 PAGECACHE_TAG_DIRTY);
3086 }
3087 spin_unlock_irq(&page->mapping->tree_lock);
3088 unlock_page(page);
3089 }
3090 return 0;
3091}
3092
3093int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
3094 struct extent_buffer *eb)
3095{
3096 return wait_on_extent_writeback(tree, eb->start,
3097 eb->start + eb->len - 1);
3098}
3099
3100int set_extent_buffer_dirty(struct extent_io_tree *tree,
3101 struct extent_buffer *eb)
3102{
3103 unsigned long i;
3104 unsigned long num_pages;
3105
3106 num_pages = num_extent_pages(eb->start, eb->len);
3107 for (i = 0; i < num_pages; i++) {
3108 struct page *page = extent_buffer_page(eb, i);
3109 /* writepage may need to do something special for the
3110 * first page, we have to make sure page->private is
3111 * properly set. releasepage may drop page->private
3112 * on us if the page isn't already dirty.
3113 */
3114 lock_page(page);
3115 if (i == 0) {
3116 set_page_extent_head(page, eb->len);
3117 } else if (PagePrivate(page) &&
3118 page->private != EXTENT_PAGE_PRIVATE) {
3119 set_page_extent_mapped(page);
3120 }
3121 __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3122 set_extent_dirty(tree, page_offset(page),
3123 page_offset(page) + PAGE_CACHE_SIZE - 1,
3124 GFP_NOFS);
3125 unlock_page(page);
3126 }
3127 return 0;
3128}
3129
3130int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3131 struct extent_buffer *eb)
3132{
3133 unsigned long i;
3134 struct page *page;
3135 unsigned long num_pages;
3136
3137 num_pages = num_extent_pages(eb->start, eb->len);
3138 eb->flags &= ~EXTENT_UPTODATE;
3139
3140 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3141 GFP_NOFS);
3142 for (i = 0; i < num_pages; i++) {
3143 page = extent_buffer_page(eb, i);
3144 if (page)
3145 ClearPageUptodate(page);
3146 }
3147 return 0;
3148}
3149
3150int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3151 struct extent_buffer *eb)
3152{
3153 unsigned long i;
3154 struct page *page;
3155 unsigned long num_pages;
3156
3157 num_pages = num_extent_pages(eb->start, eb->len);
3158
3159 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3160 GFP_NOFS);
3161 for (i = 0; i < num_pages; i++) {
3162 page = extent_buffer_page(eb, i);
3163 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3164 ((i == num_pages - 1) &&
3165 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3166 check_page_uptodate(tree, page);
3167 continue;
3168 }
3169 SetPageUptodate(page);
3170 }
3171 return 0;
3172}
3173
3174int extent_range_uptodate(struct extent_io_tree *tree,
3175 u64 start, u64 end)
3176{
3177 struct page *page;
3178 int ret;
3179 int pg_uptodate = 1;
3180 int uptodate;
3181 unsigned long index;
3182
3183 ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
3184 if (ret)
3185 return 1;
3186 while (start <= end) {
3187 index = start >> PAGE_CACHE_SHIFT;
3188 page = find_get_page(tree->mapping, index);
		if (!page) {
			pg_uptodate = 0;
			break;
		}
3189 uptodate = PageUptodate(page);
3190 page_cache_release(page);
3191 if (!uptodate) {
3192 pg_uptodate = 0;
3193 break;
3194 }
3195 start += PAGE_CACHE_SIZE;
3196 }
3197 return pg_uptodate;
3198}
3199
3200int extent_buffer_uptodate(struct extent_io_tree *tree,
3201 struct extent_buffer *eb)
3202{
3203 int ret = 0;
3204 unsigned long num_pages;
3205 unsigned long i;
3206 struct page *page;
3207 int pg_uptodate = 1;
3208
3209 if (eb->flags & EXTENT_UPTODATE)
3210 return 1;
3211
3212 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3213 EXTENT_UPTODATE, 1);
3214 if (ret)
3215 return ret;
3216
3217 num_pages = num_extent_pages(eb->start, eb->len);
3218 for (i = 0; i < num_pages; i++) {
3219 page = extent_buffer_page(eb, i);
3220 if (!PageUptodate(page)) {
3221 pg_uptodate = 0;
3222 break;
3223 }
3224 }
3225 return pg_uptodate;
3226}
3227
3228int read_extent_buffer_pages(struct extent_io_tree *tree,
3229 struct extent_buffer *eb,
3230 u64 start, int wait,
3231 get_extent_t *get_extent, int mirror_num)
3232{
3233 unsigned long i;
3234 unsigned long start_i;
3235 struct page *page;
3236 int err;
3237 int ret = 0;
3238 int locked_pages = 0;
3239 int all_uptodate = 1;
3240 int inc_all_pages = 0;
3241 unsigned long num_pages;
3242 struct bio *bio = NULL;
3243 unsigned long bio_flags = 0;
3244
3245 if (eb->flags & EXTENT_UPTODATE)
3246 return 0;
3247
3248 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3249 EXTENT_UPTODATE, 1)) {
3250 return 0;
3251 }
3252
3253 if (start) {
3254 WARN_ON(start < eb->start);
3255 start_i = (start >> PAGE_CACHE_SHIFT) -
3256 (eb->start >> PAGE_CACHE_SHIFT);
3257 } else {
3258 start_i = 0;
3259 }
3260
3261 num_pages = num_extent_pages(eb->start, eb->len);
3262 for (i = start_i; i < num_pages; i++) {
3263 page = extent_buffer_page(eb, i);
3264 if (!wait) {
3265 if (!trylock_page(page))
3266 goto unlock_exit;
3267 } else {
3268 lock_page(page);
3269 }
3270 locked_pages++;
3271 if (!PageUptodate(page))
3272 all_uptodate = 0;
3273 }
3274 if (all_uptodate) {
3275 if (start_i == 0)
3276 eb->flags |= EXTENT_UPTODATE;
3277 goto unlock_exit;
3278 }
3279
3280 for (i = start_i; i < num_pages; i++) {
3281 page = extent_buffer_page(eb, i);
3282 if (inc_all_pages)
3283 page_cache_get(page);
3284 if (!PageUptodate(page)) {
3285 if (start_i == 0)
3286 inc_all_pages = 1;
3287 ClearPageError(page);
3288 err = __extent_read_full_page(tree, page,
3289 get_extent, &bio,
3290 mirror_num, &bio_flags);
3291 if (err)
3292 ret = err;
3293 } else {
3294 unlock_page(page);
3295 }
3296 }
3297
3298 if (bio)
3299 submit_one_bio(READ, bio, mirror_num, bio_flags);
3300
3301 if (ret || !wait)
3302 return ret;
3303
3304 for (i = start_i; i < num_pages; i++) {
3305 page = extent_buffer_page(eb, i);
3306 wait_on_page_locked(page);
3307 if (!PageUptodate(page))
3308 ret = -EIO;
3309 }
3310
3311 if (!ret)
3312 eb->flags |= EXTENT_UPTODATE;
3313 return ret;
3314
3315unlock_exit:
3316 i = start_i;
3317 while (locked_pages > 0) {
3318 page = extent_buffer_page(eb, i);
3319 i++;
3320 unlock_page(page);
3321 locked_pages--;
3322 }
3323 return ret;
3324}
3325
3326void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3327 unsigned long start,
3328 unsigned long len)
3329{
3330 size_t cur;
3331 size_t offset;
3332 struct page *page;
3333 char *kaddr;
3334 char *dst = (char *)dstv;
3335 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3336 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3337
3338 WARN_ON(start > eb->len);
3339 WARN_ON(start + len > eb->start + eb->len);
3340
3341 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3342
3343 while (len > 0) {
3344 page = extent_buffer_page(eb, i);
3345
3346 cur = min(len, (PAGE_CACHE_SIZE - offset));
3347 kaddr = kmap_atomic(page, KM_USER1);
3348 memcpy(dst, kaddr + offset, cur);
3349 kunmap_atomic(kaddr, KM_USER1);
3350
3351 dst += cur;
3352 len -= cur;
3353 offset = 0;
3354 i++;
3355 }
3356}
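/*
 * Usage sketch with a hypothetical caller: copying a 64-byte header
 * out of a tree block crosses page boundaries transparently,
 *
 *	char hdr[64];
 *	read_extent_buffer(eb, hdr, 0, sizeof(hdr));
 *
 * each loop iteration kmaps one page and copies at most
 * PAGE_CACHE_SIZE - offset bytes from it.
 */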
3357
3358int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3359 unsigned long min_len, char **token, char **map,
3360 unsigned long *map_start,
3361 unsigned long *map_len, int km)
3362{
3363 size_t offset = start & (PAGE_CACHE_SIZE - 1);
3364 char *kaddr;
3365 struct page *p;
3366 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3367 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3368 unsigned long end_i = (start_offset + start + min_len - 1) >>
3369 PAGE_CACHE_SHIFT;
3370
3371 if (i != end_i)
3372 return -EINVAL;
3373
3374 if (i == 0) {
3375 offset = start_offset;
3376 *map_start = 0;
3377 } else {
3378 offset = 0;
3379 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
3380 }
3381
3382 if (start + min_len > eb->len) {
3383 printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
3384 "wanted %lu %lu\n", (unsigned long long)eb->start,
3385 eb->len, start, min_len);
3386 WARN_ON(1);
3387 }
3388
3389 p = extent_buffer_page(eb, i);
3390 kaddr = kmap_atomic(p, km);
3391 *token = kaddr;
3392 *map = kaddr + offset;
3393 *map_len = PAGE_CACHE_SIZE - offset;
3394 return 0;
3395}
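/*
 * Note the single-page limit enforced above: with 4K pages a request
 * such as map_private_extent_buffer(eb, 4090, 16, ...) straddles two
 * pages (i == 0, end_i == 1) and returns -EINVAL, in which case
 * callers are expected to fall back to read/write_extent_buffer.
 */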
3396
3397int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3398 unsigned long min_len,
3399 char **token, char **map,
3400 unsigned long *map_start,
3401 unsigned long *map_len, int km)
3402{
3403 int err;
3404 int save = 0;
3405 if (eb->map_token) {
3406 unmap_extent_buffer(eb, eb->map_token, km);
3407 eb->map_token = NULL;
3408 save = 1;
3409 WARN_ON(!mutex_is_locked(&eb->mutex));
3410 }
3411 err = map_private_extent_buffer(eb, start, min_len, token, map,
3412 map_start, map_len, km);
3413 if (!err && save) {
3414 eb->map_token = *token;
3415 eb->kaddr = *map;
3416 eb->map_start = *map_start;
3417 eb->map_len = *map_len;
3418 }
3419 return err;
3420}
3421
3422void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
3423{
3424 kunmap_atomic(token, km);
3425}
3426
3427int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3428 unsigned long start,
3429 unsigned long len)
3430{
3431 size_t cur;
3432 size_t offset;
3433 struct page *page;
3434 char *kaddr;
3435 char *ptr = (char *)ptrv;
3436 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3437 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3438 int ret = 0;
3439
3440 WARN_ON(start > eb->len);
3441 WARN_ON(start + len > eb->start + eb->len);
3442
3443 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3444
3445 while (len > 0) {
3446 page = extent_buffer_page(eb, i);
3447
3448 cur = min(len, (PAGE_CACHE_SIZE - offset));
3449
3450 kaddr = kmap_atomic(page, KM_USER0);
3451 ret = memcmp(ptr, kaddr + offset, cur);
3452 kunmap_atomic(kaddr, KM_USER0);
3453 if (ret)
3454 break;
3455
3456 ptr += cur;
3457 len -= cur;
3458 offset = 0;
3459 i++;
3460 }
3461 return ret;
3462}
3463
3464void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3465 unsigned long start, unsigned long len)
3466{
3467 size_t cur;
3468 size_t offset;
3469 struct page *page;
3470 char *kaddr;
3471 char *src = (char *)srcv;
3472 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3473 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3474
3475 WARN_ON(start > eb->len);
3476 WARN_ON(start + len > eb->start + eb->len);
3477
3478 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3479
3480 while (len > 0) {
3481 page = extent_buffer_page(eb, i);
3482 WARN_ON(!PageUptodate(page));
3483
3484 cur = min(len, PAGE_CACHE_SIZE - offset);
3485 kaddr = kmap_atomic(page, KM_USER1);
3486 memcpy(kaddr + offset, src, cur);
3487 kunmap_atomic(kaddr, KM_USER1);
3488
3489 src += cur;
3490 len -= cur;
3491 offset = 0;
3492 i++;
3493 }
3494}
3495
3496void memset_extent_buffer(struct extent_buffer *eb, char c,
3497 unsigned long start, unsigned long len)
3498{
3499 size_t cur;
3500 size_t offset;
3501 struct page *page;
3502 char *kaddr;
3503 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3504 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3505
3506 WARN_ON(start > eb->len);
3507 WARN_ON(start + len > eb->start + eb->len);
3508
3509 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3510
3511 while (len > 0) {
3512 page = extent_buffer_page(eb, i);
3513 WARN_ON(!PageUptodate(page));
3514
3515 cur = min(len, PAGE_CACHE_SIZE - offset);
3516 kaddr = kmap_atomic(page, KM_USER0);
3517 memset(kaddr + offset, c, cur);
3518 kunmap_atomic(kaddr, KM_USER0);
3519
3520 len -= cur;
3521 offset = 0;
3522 i++;
3523 }
3524}
3525
3526void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3527 unsigned long dst_offset, unsigned long src_offset,
3528 unsigned long len)
3529{
3530 u64 dst_len = dst->len;
3531 size_t cur;
3532 size_t offset;
3533 struct page *page;
3534 char *kaddr;
3535 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3536 unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3537
3538 WARN_ON(src->len != dst_len);
3539
3540 offset = (start_offset + dst_offset) &
3541 ((unsigned long)PAGE_CACHE_SIZE - 1);
3542
3543 while (len > 0) {
3544 page = extent_buffer_page(dst, i);
3545 WARN_ON(!PageUptodate(page));
3546
3547 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3548
3549 kaddr = kmap_atomic(page, KM_USER0);
3550 read_extent_buffer(src, kaddr + offset, src_offset, cur);
3551 kunmap_atomic(kaddr, KM_USER0);
3552
3553 src_offset += cur;
3554 len -= cur;
3555 offset = 0;
3556 i++;
3557 }
3558}
3559
3560static void move_pages(struct page *dst_page, struct page *src_page,
3561 unsigned long dst_off, unsigned long src_off,
3562 unsigned long len)
3563{
3564 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3565 if (dst_page == src_page) {
3566 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3567 } else {
3568 char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3569 char *p = dst_kaddr + dst_off + len;
3570 char *s = src_kaddr + src_off + len;
3571
3572 while (len--)
3573 *--p = *--s;
3574
3575 kunmap_atomic(src_kaddr, KM_USER1);
3576 }
3577 kunmap_atomic(dst_kaddr, KM_USER0);
3578}
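/*
 * The explicit high-to-low byte loop above mirrors memmove: the
 * same-page case defers to memmove directly, and the cross-page case
 * copies backwards so the descending walk in memmove_extent_buffer
 * below stays safe for overlapping source and destination ranges.
 */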
3579
3580static void copy_pages(struct page *dst_page, struct page *src_page,
3581 unsigned long dst_off, unsigned long src_off,
3582 unsigned long len)
3583{
3584 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3585 char *src_kaddr;
3586
3587 if (dst_page != src_page)
3588 src_kaddr = kmap_atomic(src_page, KM_USER1);
3589 else
3590 src_kaddr = dst_kaddr;
3591
3592 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3593 kunmap_atomic(dst_kaddr, KM_USER0);
3594 if (dst_page != src_page)
3595 kunmap_atomic(src_kaddr, KM_USER1);
3596}
3597
3598void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3599 unsigned long src_offset, unsigned long len)
3600{
3601 size_t cur;
3602 size_t dst_off_in_page;
3603 size_t src_off_in_page;
3604 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3605 unsigned long dst_i;
3606 unsigned long src_i;
3607
3608 if (src_offset + len > dst->len) {
3609 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3610 "len %lu dst len %lu\n", src_offset, len, dst->len);
3611 BUG_ON(1);
3612 }
3613 if (dst_offset + len > dst->len) {
3614 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3615 "len %lu dst len %lu\n", dst_offset, len, dst->len);
3616 BUG_ON(1);
3617 }
3618
3619 while (len > 0) {
3620 dst_off_in_page = (start_offset + dst_offset) &
3621 ((unsigned long)PAGE_CACHE_SIZE - 1);
3622 src_off_in_page = (start_offset + src_offset) &
3623 ((unsigned long)PAGE_CACHE_SIZE - 1);
3624
3625 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3626 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3627
3628 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3629 src_off_in_page));
3630 cur = min_t(unsigned long, cur,
3631 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3632
3633 copy_pages(extent_buffer_page(dst, dst_i),
3634 extent_buffer_page(dst, src_i),
3635 dst_off_in_page, src_off_in_page, cur);
3636
3637 src_offset += cur;
3638 dst_offset += cur;
3639 len -= cur;
3640 }
3641}
3642
3643void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3644 unsigned long src_offset, unsigned long len)
3645{
3646 size_t cur;
3647 size_t dst_off_in_page;
3648 size_t src_off_in_page;
3649 unsigned long dst_end = dst_offset + len - 1;
3650 unsigned long src_end = src_offset + len - 1;
3651 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3652 unsigned long dst_i;
3653 unsigned long src_i;
3654
3655 if (src_offset + len > dst->len) {
3656 printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3657 "len %lu len %lu\n", src_offset, len, dst->len);
3658 BUG_ON(1);
3659 }
3660 if (dst_offset + len > dst->len) {
3661 printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3662 "len %lu len %lu\n", dst_offset, len, dst->len);
3663 BUG_ON(1);
3664 }
3665 if (dst_offset < src_offset) {
3666 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3667 return;
3668 }
3669 while (len > 0) {
3670 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3671 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3672
3673 dst_off_in_page = (start_offset + dst_end) &
3674 ((unsigned long)PAGE_CACHE_SIZE - 1);
3675 src_off_in_page = (start_offset + src_end) &
3676 ((unsigned long)PAGE_CACHE_SIZE - 1);
3677
3678 cur = min_t(unsigned long, len, src_off_in_page + 1);
3679 cur = min(cur, dst_off_in_page + 1);
3680 move_pages(extent_buffer_page(dst, dst_i),
3681 extent_buffer_page(dst, src_i),
3682 dst_off_in_page - cur + 1,
3683 src_off_in_page - cur + 1, cur);
3684
3685 dst_end -= cur;
3686 src_end -= cur;
3687 len -= cur;
3688 }
3689}
3690
3691int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3692{
3693 u64 start = page_offset(page);
3694 struct extent_buffer *eb;
3695 int ret = 1;
3696 unsigned long i;
3697 unsigned long num_pages;
3698
3699 spin_lock(&tree->buffer_lock);
3700 eb = buffer_search(tree, start);
3701 if (!eb)
3702 goto out;
3703
3704 if (atomic_read(&eb->refs) > 1) {
3705 ret = 0;
3706 goto out;
3707 }
3708 /* at this point we can safely release the extent buffer */
3709 num_pages = num_extent_pages(eb->start, eb->len);
3710 for (i = 0; i < num_pages; i++)
3711 page_cache_release(extent_buffer_page(eb, i));
3712 rb_erase(&eb->rb_node, &tree->buffer);
3713 __free_extent_buffer(eb);
3714out:
3715 spin_unlock(&tree->buffer_lock);
3716 return ret;
3717}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
new file mode 100644
index 000000000000..c5b483a79137
--- /dev/null
+++ b/fs/btrfs/extent_io.h
@@ -0,0 +1,269 @@
1#ifndef __EXTENTIO__
2#define __EXTENTIO__
3
4#include <linux/rbtree.h>
5
6/* bits for the extent state */
7#define EXTENT_DIRTY 1
8#define EXTENT_WRITEBACK (1 << 1)
9#define EXTENT_UPTODATE (1 << 2)
10#define EXTENT_LOCKED (1 << 3)
11#define EXTENT_NEW (1 << 4)
12#define EXTENT_DELALLOC (1 << 5)
13#define EXTENT_DEFRAG (1 << 6)
14#define EXTENT_DEFRAG_DONE (1 << 7)
15#define EXTENT_BUFFER_FILLED (1 << 8)
16#define EXTENT_ORDERED (1 << 9)
17#define EXTENT_ORDERED_METADATA (1 << 10)
18#define EXTENT_BOUNDARY (1 << 11)
19#define EXTENT_NODATASUM (1 << 12)
20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21
22/* flags for bio submission */
23#define EXTENT_BIO_COMPRESSED 1
24
25/*
26 * page->private values. Every page that is controlled by the extent
27 * map has page->private set to one.
28 */
29#define EXTENT_PAGE_PRIVATE 1
30#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
31
32struct extent_state;
33
34typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
35 struct bio *bio, int mirror_num,
36 unsigned long bio_flags);
37struct extent_io_ops {
38 int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
39 u64 start, u64 end, int *page_started,
40 unsigned long *nr_written);
41 int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
42 int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
43 extent_submit_bio_hook_t *submit_bio_hook;
44 int (*merge_bio_hook)(struct page *page, unsigned long offset,
45 size_t size, struct bio *bio,
46 unsigned long bio_flags);
47 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
48 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
49 u64 start, u64 end,
50 struct extent_state *state);
51 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
52 u64 start, u64 end,
53 struct extent_state *state);
54 int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
55 struct extent_state *state);
56 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
57 struct extent_state *state, int uptodate);
58 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
59 unsigned long old, unsigned long bits);
60 int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
61 unsigned long old, unsigned long bits);
62 int (*write_cache_pages_lock_hook)(struct page *page);
63};
64
65struct extent_io_tree {
66 struct rb_root state;
67 struct rb_root buffer;
68 struct address_space *mapping;
69 u64 dirty_bytes;
70 spinlock_t lock;
71 spinlock_t buffer_lock;
72 struct extent_io_ops *ops;
73};
74
75struct extent_state {
76 u64 start;
77 u64 end; /* inclusive */
78 struct rb_node rb_node;
79 struct extent_io_tree *tree;
80 wait_queue_head_t wq;
81 atomic_t refs;
82 unsigned long state;
83
84 /* for use by the FS */
85 u64 private;
86
87 struct list_head leak_list;
88};
89
90struct extent_buffer {
91 u64 start;
92 unsigned long len;
93 char *map_token;
94 char *kaddr;
95 unsigned long map_start;
96 unsigned long map_len;
97 struct page *first_page;
98 atomic_t refs;
99 int flags;
100 struct list_head leak_list;
101 struct rb_node rb_node;
102 struct mutex mutex;
103};
104
105struct extent_map_tree;
106
107static inline struct extent_state *extent_state_next(struct extent_state *state)
108{
109 struct rb_node *node;
110 node = rb_next(&state->rb_node);
111 if (!node)
112 return NULL;
113 return rb_entry(node, struct extent_state, rb_node);
114}
115
116typedef struct extent_map *(get_extent_t)(struct inode *inode,
117 struct page *page,
118 size_t page_offset,
119 u64 start, u64 len,
120 int create);
121
122void extent_io_tree_init(struct extent_io_tree *tree,
123 struct address_space *mapping, gfp_t mask);
124int try_release_extent_mapping(struct extent_map_tree *map,
125 struct extent_io_tree *tree, struct page *page,
126 gfp_t mask);
127int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page);
128int try_release_extent_state(struct extent_map_tree *map,
129 struct extent_io_tree *tree, struct page *page,
130 gfp_t mask);
131int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
132int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
133int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
134 gfp_t mask);
135int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
136 get_extent_t *get_extent);
137int __init extent_io_init(void);
138void extent_io_exit(void);
139
140u64 count_range_bits(struct extent_io_tree *tree,
141 u64 *start, u64 search_end,
142 u64 max_bytes, unsigned long bits);
143
144int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
145 int bits, int filled);
146int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
147 int bits, gfp_t mask);
148int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
149 int bits, int wake, int delete, gfp_t mask);
150int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
151 int bits, gfp_t mask);
152int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
153 gfp_t mask);
154int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
155 gfp_t mask);
156int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
157 gfp_t mask);
158int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
159 gfp_t mask);
160int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
161 gfp_t mask);
162int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
163 u64 end, gfp_t mask);
164int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
165 gfp_t mask);
166int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
167 gfp_t mask);
168int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
169 u64 *start_ret, u64 *end_ret, int bits);
170struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
171 u64 start, int bits);
172int extent_invalidatepage(struct extent_io_tree *tree,
173 struct page *page, unsigned long offset);
174int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
175 get_extent_t *get_extent,
176 struct writeback_control *wbc);
177int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
178 u64 start, u64 end, get_extent_t *get_extent,
179 int mode);
180int extent_writepages(struct extent_io_tree *tree,
181 struct address_space *mapping,
182 get_extent_t *get_extent,
183 struct writeback_control *wbc);
184int extent_readpages(struct extent_io_tree *tree,
185 struct address_space *mapping,
186 struct list_head *pages, unsigned nr_pages,
187 get_extent_t get_extent);
188int extent_prepare_write(struct extent_io_tree *tree,
189 struct inode *inode, struct page *page,
190 unsigned from, unsigned to, get_extent_t *get_extent);
191int extent_commit_write(struct extent_io_tree *tree,
192 struct inode *inode, struct page *page,
193 unsigned from, unsigned to);
194sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
195 get_extent_t *get_extent);
196int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end);
197int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
198int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
199void set_page_extent_mapped(struct page *page);
200
201struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
202 u64 start, unsigned long len,
203 struct page *page0,
204 gfp_t mask);
205struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
206 u64 start, unsigned long len,
207 gfp_t mask);
208void free_extent_buffer(struct extent_buffer *eb);
209int read_extent_buffer_pages(struct extent_io_tree *tree,
210 struct extent_buffer *eb, u64 start, int wait,
211 get_extent_t *get_extent, int mirror_num);
212
213static inline void extent_buffer_get(struct extent_buffer *eb)
214{
215 atomic_inc(&eb->refs);
216}
217
218int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
219 unsigned long start,
220 unsigned long len);
221void read_extent_buffer(struct extent_buffer *eb, void *dst,
222 unsigned long start,
223 unsigned long len);
224void write_extent_buffer(struct extent_buffer *eb, const void *src,
225 unsigned long start, unsigned long len);
226void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
227 unsigned long dst_offset, unsigned long src_offset,
228 unsigned long len);
229void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
230 unsigned long src_offset, unsigned long len);
231void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
232 unsigned long src_offset, unsigned long len);
233void memset_extent_buffer(struct extent_buffer *eb, char c,
234 unsigned long start, unsigned long len);
235int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
236 struct extent_buffer *eb);
237int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end);
238int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
239int clear_extent_buffer_dirty(struct extent_io_tree *tree,
240 struct extent_buffer *eb);
241int set_extent_buffer_dirty(struct extent_io_tree *tree,
242 struct extent_buffer *eb);
243int set_extent_buffer_uptodate(struct extent_io_tree *tree,
244 struct extent_buffer *eb);
245int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
246 struct extent_buffer *eb);
247int extent_buffer_uptodate(struct extent_io_tree *tree,
248 struct extent_buffer *eb);
249int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
250 unsigned long min_len, char **token, char **map,
251 unsigned long *map_start,
252 unsigned long *map_len, int km);
253int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
254 unsigned long min_len, char **token, char **map,
255 unsigned long *map_start,
256 unsigned long *map_len, int km);
257void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
258int release_extent_buffer_tail_pages(struct extent_buffer *eb);
259int extent_range_uptodate(struct extent_io_tree *tree,
260 u64 start, u64 end);
261int extent_clear_unlock_delalloc(struct inode *inode,
262 struct extent_io_tree *tree,
263 u64 start, u64 end, struct page *locked_page,
264 int unlock_page,
265 int clear_unlock,
266 int clear_delalloc, int clear_dirty,
267 int set_writeback,
268 int end_writeback);
269#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
new file mode 100644
index 000000000000..4a83e33ada32
--- /dev/null
+++ b/fs/btrfs/extent_map.c
@@ -0,0 +1,351 @@
1#include <linux/err.h>
2#include <linux/gfp.h>
3#include <linux/slab.h>
4#include <linux/module.h>
5#include <linux/spinlock.h>
6#include <linux/version.h>
7#include <linux/hardirq.h>
8#include "extent_map.h"
9
10/* temporary define until extent_map moves out of btrfs */
11struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
12 unsigned long extra_flags,
13 void (*ctor)(void *, struct kmem_cache *,
14 unsigned long));
15
16static struct kmem_cache *extent_map_cache;
17
18int __init extent_map_init(void)
19{
20 extent_map_cache = btrfs_cache_create("extent_map",
21 sizeof(struct extent_map), 0,
22 NULL);
23 if (!extent_map_cache)
24 return -ENOMEM;
25 return 0;
26}
27
28void extent_map_exit(void)
29{
30 if (extent_map_cache)
31 kmem_cache_destroy(extent_map_cache);
32}
33
34/**
35 * extent_map_tree_init - initialize extent map tree
36 * @tree: tree to initialize
37 * @mask: flags for memory allocations during tree operations
38 *
39 * Initialize the extent map tree @tree. Should be called for each new inode
40 * or other user of the extent_map interface.
41 */
42void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
43{
44 tree->map.rb_node = NULL;
45 spin_lock_init(&tree->lock);
46}
47EXPORT_SYMBOL(extent_map_tree_init);
48
49/**
50 * alloc_extent_map - allocate new extent map structure
51 * @mask: memory allocation flags
52 *
53 * Allocate a new extent_map structure. The new structure is
54 * returned with a reference count of one and needs to be
55 * freed using free_extent_map()
56 */
57struct extent_map *alloc_extent_map(gfp_t mask)
58{
59 struct extent_map *em;
60 em = kmem_cache_alloc(extent_map_cache, mask);
61 if (!em || IS_ERR(em))
62 return em;
63 em->in_tree = 0;
64 em->flags = 0;
65 atomic_set(&em->refs, 1);
66 return em;
67}
68EXPORT_SYMBOL(alloc_extent_map);
69
70/**
71 * free_extent_map - drop reference count of an extent_map
72 * @em: extent map being released
73 *
74 * Drops the reference count on @em by one and frees the structure
75 * if the reference count hits zero.
76 */
77void free_extent_map(struct extent_map *em)
78{
79 if (!em)
80 return;
81 WARN_ON(atomic_read(&em->refs) == 0);
82 if (atomic_dec_and_test(&em->refs)) {
83 WARN_ON(em->in_tree);
84 kmem_cache_free(extent_map_cache, em);
85 }
86}
87EXPORT_SYMBOL(free_extent_map);
88
89static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
90 struct rb_node *node)
91{
92 struct rb_node **p = &root->rb_node;
93 struct rb_node *parent = NULL;
94 struct extent_map *entry;
95
96 while (*p) {
97 parent = *p;
98 entry = rb_entry(parent, struct extent_map, rb_node);
99
100 WARN_ON(!entry->in_tree);
101
102 if (offset < entry->start)
103 p = &(*p)->rb_left;
104 else if (offset >= extent_map_end(entry))
105 p = &(*p)->rb_right;
106 else
107 return parent;
108 }
109
110 entry = rb_entry(node, struct extent_map, rb_node);
111 entry->in_tree = 1;
112 rb_link_node(node, parent, p);
113 rb_insert_color(node, root);
114 return NULL;
115}
116
117/*
118 * search through the tree for an extent_map with a given offset. If
119 * it can't be found, try to find some neighboring extents
120 */
121static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
122 struct rb_node **prev_ret,
123 struct rb_node **next_ret)
124{
125 struct rb_node *n = root->rb_node;
126 struct rb_node *prev = NULL;
127 struct rb_node *orig_prev = NULL;
128 struct extent_map *entry;
129 struct extent_map *prev_entry = NULL;
130
131 while (n) {
132 entry = rb_entry(n, struct extent_map, rb_node);
133 prev = n;
134 prev_entry = entry;
135
136 WARN_ON(!entry->in_tree);
137
138 if (offset < entry->start)
139 n = n->rb_left;
140 else if (offset >= extent_map_end(entry))
141 n = n->rb_right;
142 else
143 return n;
144 }
145
146 if (prev_ret) {
147 orig_prev = prev;
148 while (prev && offset >= extent_map_end(prev_entry)) {
149 prev = rb_next(prev);
150 prev_entry = rb_entry(prev, struct extent_map, rb_node);
151 }
152 *prev_ret = prev;
153 prev = orig_prev;
154 }
155
156 if (next_ret) {
157 prev_entry = rb_entry(prev, struct extent_map, rb_node);
158 while (prev && offset < prev_entry->start) {
159 prev = rb_prev(prev);
160 prev_entry = rb_entry(prev, struct extent_map, rb_node);
161 }
162 *next_ret = prev;
163 }
164 return NULL;
165}
166
167/*
168 * look for an offset in the tree, and if it can't be found, return
169 * the first offset we can find smaller than 'offset'.
170 */
171static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
172{
173 struct rb_node *prev;
174 struct rb_node *ret;
175 ret = __tree_search(root, offset, &prev, NULL);
176 if (!ret)
177 return prev;
178 return ret;
179}
180
181/* check to see if two extent_map structs are adjacent and safe to merge */
182static int mergable_maps(struct extent_map *prev, struct extent_map *next)
183{
184 if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
185 return 0;
186
187 /*
188 * don't merge compressed extents, we need to know their
189 * actual size
190 */
191 if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
192 return 0;
193
194 if (extent_map_end(prev) == next->start &&
195 prev->flags == next->flags &&
196 prev->bdev == next->bdev &&
197 ((next->block_start == EXTENT_MAP_HOLE &&
198 prev->block_start == EXTENT_MAP_HOLE) ||
199 (next->block_start == EXTENT_MAP_INLINE &&
200 prev->block_start == EXTENT_MAP_INLINE) ||
201 (next->block_start == EXTENT_MAP_DELALLOC &&
202 prev->block_start == EXTENT_MAP_DELALLOC) ||
203 (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
204 next->block_start == extent_map_block_end(prev)))) {
205 return 1;
206 }
207 return 0;
208}
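/*
 * e.g. (assuming equal flags and bdev) two mappings [0, 4096) at
 * block_start 8192 and [4096, 8192) at block_start 12288 are
 * mergable: extent_map_end(prev) meets next->start and
 * next->block_start equals extent_map_block_end(prev), so the pair
 * collapses into a single [0, 8192) mapping.
 */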
209
210/**
211 * add_extent_mapping - add new extent map to the extent tree
212 * @tree: tree to insert new map in
213 * @em: map to insert
214 *
215 * Insert @em into @tree or perform a simple forward/backward merge with
216 * existing mappings. The extent_map struct passed in will be inserted
217 * into the tree directly, with an additional reference taken, or a
218 * reference dropped if the merge attempt was successful.
219 */
220int add_extent_mapping(struct extent_map_tree *tree,
221 struct extent_map *em)
222{
223 int ret = 0;
224 struct extent_map *merge = NULL;
225 struct rb_node *rb;
226 struct extent_map *exist;
227
228 exist = lookup_extent_mapping(tree, em->start, em->len);
229 if (exist) {
230 free_extent_map(exist);
231 ret = -EEXIST;
232 goto out;
233 }
234 assert_spin_locked(&tree->lock);
235 rb = tree_insert(&tree->map, em->start, &em->rb_node);
236 if (rb) {
237 ret = -EEXIST;
238 free_extent_map(merge);
239 goto out;
240 }
241 atomic_inc(&em->refs);
242 if (em->start != 0) {
243 rb = rb_prev(&em->rb_node);
244 if (rb)
245 merge = rb_entry(rb, struct extent_map, rb_node);
246 if (rb && mergable_maps(merge, em)) {
247 em->start = merge->start;
248 em->len += merge->len;
249 em->block_len += merge->block_len;
250 em->block_start = merge->block_start;
251 merge->in_tree = 0;
252 rb_erase(&merge->rb_node, &tree->map);
253 free_extent_map(merge);
254 }
255 }
256 rb = rb_next(&em->rb_node);
257 if (rb)
258 merge = rb_entry(rb, struct extent_map, rb_node);
259 if (rb && mergable_maps(em, merge)) {
260 em->len += merge->len;
261 em->block_len += merge->len;
262 rb_erase(&merge->rb_node, &tree->map);
263 merge->in_tree = 0;
264 free_extent_map(merge);
265 }
266out:
267 return ret;
268}
269EXPORT_SYMBOL(add_extent_mapping);
270
271/* simple helper to do math around the end of an extent, handling wrap */
272static u64 range_end(u64 start, u64 len)
273{
274 if (start + len < start)
275 return (u64)-1;
276 return start + len;
277}
278
279/**
280 * lookup_extent_mapping - lookup extent_map
281 * @tree: tree to lookup in
282 * @start: byte offset to start the search
283 * @len: length of the lookup range
284 *
285 * Find and return the first extent_map struct in @tree that intersects the
286 * [start, start + len) range. There may be additional objects in the tree that
287 * intersect, so check the object returned carefully to make sure that no
288 * additional lookups are needed.
289 */
290struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
291 u64 start, u64 len)
292{
293 struct extent_map *em;
294 struct rb_node *rb_node;
295 struct rb_node *prev = NULL;
296 struct rb_node *next = NULL;
297 u64 end = range_end(start, len);
298
299 assert_spin_locked(&tree->lock);
300 rb_node = __tree_search(&tree->map, start, &prev, &next);
301 if (!rb_node && prev) {
302 em = rb_entry(prev, struct extent_map, rb_node);
303 if (end > em->start && start < extent_map_end(em))
304 goto found;
305 }
306 if (!rb_node && next) {
307 em = rb_entry(next, struct extent_map, rb_node);
308 if (end > em->start && start < extent_map_end(em))
309 goto found;
310 }
311 if (!rb_node) {
312 em = NULL;
313 goto out;
314 }
315 if (IS_ERR(rb_node)) {
316 em = ERR_PTR(PTR_ERR(rb_node));
317 goto out;
318 }
319 em = rb_entry(rb_node, struct extent_map, rb_node);
320 if (end > em->start && start < extent_map_end(em))
321 goto found;
322
323 em = NULL;
324 goto out;
325
326found:
327 atomic_inc(&em->refs);
328out:
329 return em;
330}
331EXPORT_SYMBOL(lookup_extent_mapping);
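/*
 * Usage sketch for a hypothetical caller -- note the locking contract
 * checked by assert_spin_locked above:
 *
 *	spin_lock(&tree->lock);
 *	em = lookup_extent_mapping(tree, start, len);
 *	spin_unlock(&tree->lock);
 *	if (em && !IS_ERR(em)) {
 *		use(em);
 *		free_extent_map(em);	-- drop the ref taken for us
 *	}
 */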
332
333/**
334 * remove_extent_mapping - removes an extent_map from the extent tree
335 * @tree: extent tree to remove from
336 * @em: extent map being removed
337 *
338 * Removes @em from @tree. No reference counts are dropped, and no checks
339 * are done to see if the range is in use
340 */
341int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
342{
343 int ret = 0;
344
345 WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
346 assert_spin_locked(&tree->lock);
347 rb_erase(&em->rb_node, &tree->map);
348 em->in_tree = 0;
349 return ret;
350}
351EXPORT_SYMBOL(remove_extent_mapping);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
new file mode 100644
index 000000000000..fb6eeef06bb0
--- /dev/null
+++ b/fs/btrfs/extent_map.h
@@ -0,0 +1,62 @@
1#ifndef __EXTENTMAP__
2#define __EXTENTMAP__
3
4#include <linux/rbtree.h>
5
6#define EXTENT_MAP_LAST_BYTE (u64)-4
7#define EXTENT_MAP_HOLE (u64)-3
8#define EXTENT_MAP_INLINE (u64)-2
9#define EXTENT_MAP_DELALLOC (u64)-1
10
11/* bits for the flags field */
12#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
13#define EXTENT_FLAG_COMPRESSED 1
14#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
15#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
16
17struct extent_map {
18 struct rb_node rb_node;
19
20 /* all of these are in bytes */
21 u64 start;
22 u64 len;
23 u64 orig_start;
24 u64 block_start;
25 u64 block_len;
26 unsigned long flags;
27 struct block_device *bdev;
28 atomic_t refs;
29 int in_tree;
30};
31
32struct extent_map_tree {
33 struct rb_root map;
34 spinlock_t lock;
35};
36
37static inline u64 extent_map_end(struct extent_map *em)
38{
39 if (em->start + em->len < em->start)
40 return (u64)-1;
41 return em->start + em->len;
42}
43
44static inline u64 extent_map_block_end(struct extent_map *em)
45{
46 if (em->block_start + em->block_len < em->block_start)
47 return (u64)-1;
48 return em->block_start + em->block_len;
49}
50
51void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask);
52struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
53 u64 start, u64 len);
54int add_extent_mapping(struct extent_map_tree *tree,
55 struct extent_map *em);
56int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
57
58struct extent_map *alloc_extent_map(gfp_t mask);
59void free_extent_map(struct extent_map *em);
60int __init extent_map_init(void);
61void extent_map_exit(void);
62#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
new file mode 100644
index 000000000000..964652435fd1
--- /dev/null
+++ b/fs/btrfs/file-item.c
@@ -0,0 +1,831 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/bio.h>
20#include <linux/pagemap.h>
21#include <linux/highmem.h>
22#include "ctree.h"
23#include "disk-io.h"
24#include "transaction.h"
25#include "print-tree.h"
26
27#define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
28 sizeof(struct btrfs_item) * 2) / \
29 size) - 1))
30
31#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
32 sizeof(struct btrfs_ordered_sum)) / \
33 sizeof(struct btrfs_sector_sum) * \
34 (r)->sectorsize - (r)->sectorsize)
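/*
 * Rough sizing, assuming 4K leaves and 4-byte crc32c sums:
 * MAX_CSUM_ITEMS reserves space for two item headers and then packs
 * about (BTRFS_LEAF_DATA_SIZE - 2 * sizeof(struct btrfs_item)) / 4 - 1
 * checksums per item, i.e. on the order of a thousand 4K blocks
 * covered by a single csum item.
 */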
35
36int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
37 struct btrfs_root *root,
38 u64 objectid, u64 pos,
39 u64 disk_offset, u64 disk_num_bytes,
40 u64 num_bytes, u64 offset, u64 ram_bytes,
41 u8 compression, u8 encryption, u16 other_encoding)
42{
43 int ret = 0;
44 struct btrfs_file_extent_item *item;
45 struct btrfs_key file_key;
46 struct btrfs_path *path;
47 struct extent_buffer *leaf;
48
49 path = btrfs_alloc_path();
50 BUG_ON(!path);
51 file_key.objectid = objectid;
52 file_key.offset = pos;
53 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
54
55 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
56 sizeof(*item));
57 if (ret < 0)
58 goto out;
59 BUG_ON(ret);
60 leaf = path->nodes[0];
61 item = btrfs_item_ptr(leaf, path->slots[0],
62 struct btrfs_file_extent_item);
63 btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset);
64 btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
65 btrfs_set_file_extent_offset(leaf, item, offset);
66 btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
67 btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
68 btrfs_set_file_extent_generation(leaf, item, trans->transid);
69 btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
70 btrfs_set_file_extent_compression(leaf, item, compression);
71 btrfs_set_file_extent_encryption(leaf, item, encryption);
72 btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
73
74 btrfs_mark_buffer_dirty(leaf);
75out:
76 btrfs_free_path(path);
77 return ret;
78}
79
80struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
81 struct btrfs_root *root,
82 struct btrfs_path *path,
83 u64 bytenr, int cow)
84{
85 int ret;
86 struct btrfs_key file_key;
87 struct btrfs_key found_key;
88 struct btrfs_csum_item *item;
89 struct extent_buffer *leaf;
90 u64 csum_offset = 0;
91 u16 csum_size =
92 btrfs_super_csum_size(&root->fs_info->super_copy);
93 int csums_in_item;
94
95 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
96 file_key.offset = bytenr;
97 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
98 ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
99 if (ret < 0)
100 goto fail;
101 leaf = path->nodes[0];
102 if (ret > 0) {
103 ret = 1;
104 if (path->slots[0] == 0)
105 goto fail;
106 path->slots[0]--;
107 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
108 if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY)
109 goto fail;
110
111 csum_offset = (bytenr - found_key.offset) >>
112 root->fs_info->sb->s_blocksize_bits;
113 csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
114 csums_in_item /= csum_size;
115
116 if (csum_offset >= csums_in_item) {
117 ret = -EFBIG;
118 goto fail;
119 }
120 }
121 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
122 item = (struct btrfs_csum_item *)((unsigned char *)item +
123 csum_offset * csum_size);
124 return item;
125fail:
126 if (ret > 0)
127 ret = -ENOENT;
128 return ERR_PTR(ret);
129}
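/*
 * Editor's sketch, not part of the original patch: the lookup above turns
 * a disk bytenr into a byte offset inside a csum item with block-index
 * arithmetic. A minimal stand-alone version, assuming 4KiB blocks
 * (blocksize_bits == 12) and 4-byte crc32c checksums:
 */
#include <stdint.h>

static uint64_t csum_offset_in_item(uint64_t bytenr, uint64_t item_start,
				    int blocksize_bits, uint16_t csum_size)
{
	/* one checksum per block between the item's first block and bytenr */
	return ((bytenr - item_start) >> blocksize_bits) * csum_size;
}
/* e.g. csum_offset_in_item(0x21000, 0x20000, 12, 4) == 4 */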
130
131
132int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
133 struct btrfs_root *root,
134 struct btrfs_path *path, u64 objectid,
135 u64 offset, int mod)
136{
137 int ret;
138 struct btrfs_key file_key;
139 int ins_len = mod < 0 ? -1 : 0;
140 int cow = mod != 0;
141
142 file_key.objectid = objectid;
143 file_key.offset = offset;
144 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
145 ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
146 return ret;
147}
148
149
150int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
151 struct bio *bio, u32 *dst)
152{
153 u32 sum;
154 struct bio_vec *bvec = bio->bi_io_vec;
155 int bio_index = 0;
156 u64 offset;
157 u64 item_start_offset = 0;
158 u64 item_last_offset = 0;
159 u64 disk_bytenr;
160 u32 diff;
161 u16 csum_size =
162 btrfs_super_csum_size(&root->fs_info->super_copy);
163 int ret;
164 struct btrfs_path *path;
165 struct btrfs_csum_item *item = NULL;
166 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
167
 168 path = btrfs_alloc_path();
 BUG_ON(!path);
169 if (bio->bi_size > PAGE_CACHE_SIZE * 8)
170 path->reada = 2;
171
172 WARN_ON(bio->bi_vcnt <= 0);
173
174 disk_bytenr = (u64)bio->bi_sector << 9;
175 while (bio_index < bio->bi_vcnt) {
176 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
177 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
178 if (ret == 0)
179 goto found;
180
181 if (!item || disk_bytenr < item_start_offset ||
182 disk_bytenr >= item_last_offset) {
183 struct btrfs_key found_key;
184 u32 item_size;
185
186 if (item)
187 btrfs_release_path(root, path);
188 item = btrfs_lookup_csum(NULL, root->fs_info->csum_root,
189 path, disk_bytenr, 0);
190 if (IS_ERR(item)) {
191 ret = PTR_ERR(item);
192 if (ret == -ENOENT || ret == -EFBIG)
193 ret = 0;
194 sum = 0;
195 if (BTRFS_I(inode)->root->root_key.objectid ==
196 BTRFS_DATA_RELOC_TREE_OBJECTID) {
197 set_extent_bits(io_tree, offset,
198 offset + bvec->bv_len - 1,
199 EXTENT_NODATASUM, GFP_NOFS);
200 } else {
201 printk(KERN_INFO "btrfs no csum found "
202 "for inode %lu start %llu\n",
203 inode->i_ino,
204 (unsigned long long)offset);
205 }
206 item = NULL;
207 btrfs_release_path(root, path);
208 goto found;
209 }
210 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
211 path->slots[0]);
212
213 item_start_offset = found_key.offset;
214 item_size = btrfs_item_size_nr(path->nodes[0],
215 path->slots[0]);
216 item_last_offset = item_start_offset +
217 (item_size / csum_size) *
218 root->sectorsize;
219 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
220 struct btrfs_csum_item);
221 }
222 /*
223 * this byte range must be able to fit inside
224 * a single leaf so it will also fit inside a u32
225 */
226 diff = disk_bytenr - item_start_offset;
227 diff = diff / root->sectorsize;
228 diff = diff * csum_size;
229
230 read_extent_buffer(path->nodes[0], &sum,
231 ((unsigned long)item) + diff,
232 csum_size);
233found:
234 if (dst)
235 *dst++ = sum;
236 else
237 set_state_private(io_tree, offset, sum);
238 disk_bytenr += bvec->bv_len;
239 bio_index++;
240 bvec++;
241 }
242 btrfs_free_path(path);
243 return 0;
244}
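/*
 * Editor's sketch, not part of the original patch: the loop above walks
 * the bio one bio_vec at a time, deriving the on-disk byte position from
 * the 512-byte starting sector and advancing it by each vector's length.
 * A stand-alone model of that walk, with hypothetical names:
 */
#include <stdint.h>
#include <stddef.h>

static void walk_bio_bytenrs(uint64_t bi_sector, const uint32_t *bv_lens,
			     size_t bi_vcnt, uint64_t *bytenrs)
{
	uint64_t disk_bytenr = bi_sector << 9;	/* sectors are 512 bytes */
	size_t i;

	for (i = 0; i < bi_vcnt; i++) {
		bytenrs[i] = disk_bytenr;	/* csum is looked up here */
		disk_bytenr += bv_lens[i];
	}
}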
245
246int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
247 struct list_head *list)
248{
249 struct btrfs_key key;
250 struct btrfs_path *path;
251 struct extent_buffer *leaf;
252 struct btrfs_ordered_sum *sums;
253 struct btrfs_sector_sum *sector_sum;
254 struct btrfs_csum_item *item;
255 unsigned long offset;
256 int ret;
257 size_t size;
258 u64 csum_end;
259 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
260
261 path = btrfs_alloc_path();
262 BUG_ON(!path);
263
264 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
265 key.offset = start;
266 key.type = BTRFS_EXTENT_CSUM_KEY;
267
268 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
269 if (ret < 0)
270 goto fail;
271 if (ret > 0 && path->slots[0] > 0) {
272 leaf = path->nodes[0];
273 btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
274 if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
275 key.type == BTRFS_EXTENT_CSUM_KEY) {
276 offset = (start - key.offset) >>
277 root->fs_info->sb->s_blocksize_bits;
278 if (offset * csum_size <
279 btrfs_item_size_nr(leaf, path->slots[0] - 1))
280 path->slots[0]--;
281 }
282 }
283
284 while (start <= end) {
285 leaf = path->nodes[0];
286 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
287 ret = btrfs_next_leaf(root, path);
288 if (ret < 0)
289 goto fail;
290 if (ret > 0)
291 break;
292 leaf = path->nodes[0];
293 }
294
295 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
296 if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
297 key.type != BTRFS_EXTENT_CSUM_KEY)
298 break;
299
300 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
301 if (key.offset > end)
302 break;
303
304 if (key.offset > start)
305 start = key.offset;
306
307 size = btrfs_item_size_nr(leaf, path->slots[0]);
308 csum_end = key.offset + (size / csum_size) * root->sectorsize;
309 if (csum_end <= start) {
310 path->slots[0]++;
311 continue;
312 }
313
314 csum_end = min(csum_end, end + 1);
315 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
316 struct btrfs_csum_item);
317 while (start < csum_end) {
318 size = min_t(size_t, csum_end - start,
319 MAX_ORDERED_SUM_BYTES(root));
320 sums = kzalloc(btrfs_ordered_sum_size(root, size),
321 GFP_NOFS);
322 BUG_ON(!sums);
323
324 sector_sum = sums->sums;
325 sums->bytenr = start;
326 sums->len = size;
327
328 offset = (start - key.offset) >>
329 root->fs_info->sb->s_blocksize_bits;
330 offset *= csum_size;
331
332 while (size > 0) {
333 read_extent_buffer(path->nodes[0],
334 &sector_sum->sum,
335 ((unsigned long)item) +
336 offset, csum_size);
337 sector_sum->bytenr = start;
338
339 size -= root->sectorsize;
340 start += root->sectorsize;
341 offset += csum_size;
342 sector_sum++;
343 }
344 list_add_tail(&sums->list, list);
345 }
346 path->slots[0]++;
347 }
348 ret = 0;
349fail:
350 btrfs_free_path(path);
351 return ret;
352}
353
354int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
355 struct bio *bio, u64 file_start, int contig)
356{
357 struct btrfs_ordered_sum *sums;
358 struct btrfs_sector_sum *sector_sum;
359 struct btrfs_ordered_extent *ordered;
360 char *data;
361 struct bio_vec *bvec = bio->bi_io_vec;
362 int bio_index = 0;
363 unsigned long total_bytes = 0;
364 unsigned long this_sum_bytes = 0;
365 u64 offset;
366 u64 disk_bytenr;
367
368 WARN_ON(bio->bi_vcnt <= 0);
369 sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
370 if (!sums)
371 return -ENOMEM;
372
373 sector_sum = sums->sums;
374 disk_bytenr = (u64)bio->bi_sector << 9;
375 sums->len = bio->bi_size;
376 INIT_LIST_HEAD(&sums->list);
377
378 if (contig)
379 offset = file_start;
380 else
381 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
382
383 ordered = btrfs_lookup_ordered_extent(inode, offset);
384 BUG_ON(!ordered);
385 sums->bytenr = ordered->start;
386
387 while (bio_index < bio->bi_vcnt) {
388 if (!contig)
389 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
390
391 if (!contig && (offset >= ordered->file_offset + ordered->len ||
392 offset < ordered->file_offset)) {
393 unsigned long bytes_left;
394 sums->len = this_sum_bytes;
395 this_sum_bytes = 0;
396 btrfs_add_ordered_sum(inode, ordered, sums);
397 btrfs_put_ordered_extent(ordered);
398
399 bytes_left = bio->bi_size - total_bytes;
400
401 sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
402 GFP_NOFS);
403 BUG_ON(!sums);
404 sector_sum = sums->sums;
405 sums->len = bytes_left;
406 ordered = btrfs_lookup_ordered_extent(inode, offset);
407 BUG_ON(!ordered);
408 sums->bytenr = ordered->start;
409 }
410
411 data = kmap_atomic(bvec->bv_page, KM_USER0);
412 sector_sum->sum = ~(u32)0;
413 sector_sum->sum = btrfs_csum_data(root,
414 data + bvec->bv_offset,
415 sector_sum->sum,
416 bvec->bv_len);
417 kunmap_atomic(data, KM_USER0);
418 btrfs_csum_final(sector_sum->sum,
419 (char *)&sector_sum->sum);
420 sector_sum->bytenr = disk_bytenr;
421
422 sector_sum++;
423 bio_index++;
424 total_bytes += bvec->bv_len;
425 this_sum_bytes += bvec->bv_len;
426 disk_bytenr += bvec->bv_len;
427 offset += bvec->bv_len;
428 bvec++;
429 }
430 this_sum_bytes = 0;
431 btrfs_add_ordered_sum(inode, ordered, sums);
432 btrfs_put_ordered_extent(ordered);
433 return 0;
434}
435
436/*
 437 * helper function for csum removal; this expects the
 438 * key to describe the csum pointed to by the path, and it expects
 439 * the csum to overlap the range [bytenr, bytenr + len)
440 *
441 * The csum should not be entirely contained in the range and the
442 * range should not be entirely contained in the csum.
443 *
444 * This calls btrfs_truncate_item with the correct args based on the
445 * overlap, and fixes up the key as required.
446 */
447static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
448 struct btrfs_root *root,
449 struct btrfs_path *path,
450 struct btrfs_key *key,
451 u64 bytenr, u64 len)
452{
453 struct extent_buffer *leaf;
454 u16 csum_size =
455 btrfs_super_csum_size(&root->fs_info->super_copy);
456 u64 csum_end;
457 u64 end_byte = bytenr + len;
458 u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
459 int ret;
460
461 leaf = path->nodes[0];
462 csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
463 csum_end <<= root->fs_info->sb->s_blocksize_bits;
464 csum_end += key->offset;
465
466 if (key->offset < bytenr && csum_end <= end_byte) {
467 /*
468 * [ bytenr - len ]
469 * [ ]
470 * [csum ]
471 * A simple truncate off the end of the item
472 */
473 u32 new_size = (bytenr - key->offset) >> blocksize_bits;
474 new_size *= csum_size;
475 ret = btrfs_truncate_item(trans, root, path, new_size, 1);
476 BUG_ON(ret);
477 } else if (key->offset >= bytenr && csum_end > end_byte &&
478 end_byte > key->offset) {
479 /*
480 * [ bytenr - len ]
481 * [ ]
482 * [csum ]
483 * we need to truncate from the beginning of the csum
484 */
485 u32 new_size = (csum_end - end_byte) >> blocksize_bits;
486 new_size *= csum_size;
487
488 ret = btrfs_truncate_item(trans, root, path, new_size, 0);
489 BUG_ON(ret);
490
491 key->offset = end_byte;
492 ret = btrfs_set_item_key_safe(trans, root, path, key);
493 BUG_ON(ret);
494 } else {
495 BUG();
496 }
497 return 0;
498}
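/*
 * Editor's worked example, not part of the original patch, using
 * hypothetical numbers: assume 4KiB blocks, 4-byte csums and an item
 * covering [0x10000, 0x20000), i.e. 16 csums and 64 bytes of item data.
 *
 *   tail overlap, bytenr == 0x1c000:
 *     new_size = ((0x1c000 - 0x10000) >> 12) * 4 = 12 csums = 48 bytes,
 *     truncated from the end; the key is unchanged.
 *
 *   head overlap, end_byte == 0x14000:
 *     new_size = ((0x20000 - 0x14000) >> 12) * 4 = 12 csums = 48 bytes,
 *     truncated from the front; key->offset moves up to 0x14000.
 */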
499
500/*
501 * deletes the csum items from the csum tree for a given
502 * range of bytes.
503 */
504int btrfs_del_csums(struct btrfs_trans_handle *trans,
505 struct btrfs_root *root, u64 bytenr, u64 len)
506{
507 struct btrfs_path *path;
508 struct btrfs_key key;
509 u64 end_byte = bytenr + len;
510 u64 csum_end;
511 struct extent_buffer *leaf;
512 int ret;
513 u16 csum_size =
514 btrfs_super_csum_size(&root->fs_info->super_copy);
515 int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
516
517 root = root->fs_info->csum_root;
518
 519 path = btrfs_alloc_path();
 BUG_ON(!path);
520
521 while (1) {
522 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
523 key.offset = end_byte - 1;
524 key.type = BTRFS_EXTENT_CSUM_KEY;
525
526 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
527 if (ret > 0) {
528 if (path->slots[0] == 0)
529 goto out;
530 path->slots[0]--;
531 }
532 leaf = path->nodes[0];
533 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
534
535 if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
536 key.type != BTRFS_EXTENT_CSUM_KEY) {
537 break;
538 }
539
540 if (key.offset >= end_byte)
541 break;
542
543 csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
544 csum_end <<= blocksize_bits;
545 csum_end += key.offset;
546
547 /* this csum ends before we start, we're done */
548 if (csum_end <= bytenr)
549 break;
550
551 /* delete the entire item, it is inside our range */
552 if (key.offset >= bytenr && csum_end <= end_byte) {
553 ret = btrfs_del_item(trans, root, path);
554 BUG_ON(ret);
555 if (key.offset == bytenr)
556 break;
557 } else if (key.offset < bytenr && csum_end > end_byte) {
558 unsigned long offset;
559 unsigned long shift_len;
560 unsigned long item_offset;
561 /*
562 * [ bytenr - len ]
563 * [csum ]
564 *
565 * Our bytes are in the middle of the csum,
566 * we need to split this item and insert a new one.
567 *
568 * But we can't drop the path because the
569 * csum could change, get removed, extended etc.
570 *
571 * The trick here is the max size of a csum item leaves
572 * enough room in the tree block for a single
573 * item header. So, we split the item in place,
574 * adding a new header pointing to the existing
575 * bytes. Then we loop around again and we have
576 * a nicely formed csum item that we can neatly
577 * truncate.
578 */
579 offset = (bytenr - key.offset) >> blocksize_bits;
580 offset *= csum_size;
581
582 shift_len = (len >> blocksize_bits) * csum_size;
583
584 item_offset = btrfs_item_ptr_offset(leaf,
585 path->slots[0]);
586
587 memset_extent_buffer(leaf, 0, item_offset + offset,
588 shift_len);
589 key.offset = bytenr;
590
591 /*
592 * btrfs_split_item returns -EAGAIN when the
593 * item changed size or key
594 */
595 ret = btrfs_split_item(trans, root, path, &key, offset);
596 BUG_ON(ret && ret != -EAGAIN);
597
598 key.offset = end_byte - 1;
599 } else {
600 ret = truncate_one_csum(trans, root, path,
601 &key, bytenr, len);
602 BUG_ON(ret);
603 if (key.offset < bytenr)
604 break;
605 }
606 btrfs_release_path(root, path);
607 }
608out:
609 btrfs_free_path(path);
610 return 0;
611}
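/*
 * Editor's sketch, not part of the original patch: the loop above reduces
 * each csum item to one of four overlap cases against [bytenr, end_byte).
 * A stand-alone classifier mirroring those branches; the names are
 * hypothetical:
 */
#include <stdint.h>

enum csum_overlap { CSUM_DISJOINT, CSUM_COVERED, CSUM_SPLIT, CSUM_TRUNCATE };

static enum csum_overlap classify_csum(uint64_t item_start, uint64_t item_end,
				       uint64_t bytenr, uint64_t end_byte)
{
	if (item_end <= bytenr || item_start >= end_byte)
		return CSUM_DISJOINT;	/* nothing to delete */
	if (item_start >= bytenr && item_end <= end_byte)
		return CSUM_COVERED;	/* delete the whole item */
	if (item_start < bytenr && item_end > end_byte)
		return CSUM_SPLIT;	/* split in place, then truncate */
	return CSUM_TRUNCATE;		/* truncate one end of the item */
}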
612
613int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
614 struct btrfs_root *root,
615 struct btrfs_ordered_sum *sums)
616{
617 u64 bytenr;
618 int ret;
619 struct btrfs_key file_key;
620 struct btrfs_key found_key;
621 u64 next_offset;
622 u64 total_bytes = 0;
623 int found_next;
624 struct btrfs_path *path;
625 struct btrfs_csum_item *item;
626 struct btrfs_csum_item *item_end;
627 struct extent_buffer *leaf = NULL;
628 u64 csum_offset;
629 struct btrfs_sector_sum *sector_sum;
630 u32 nritems;
631 u32 ins_size;
632 char *eb_map;
633 char *eb_token;
634 unsigned long map_len;
635 unsigned long map_start;
636 u16 csum_size =
637 btrfs_super_csum_size(&root->fs_info->super_copy);
638
639 path = btrfs_alloc_path();
640 BUG_ON(!path);
641 sector_sum = sums->sums;
642again:
643 next_offset = (u64)-1;
644 found_next = 0;
645 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
646 file_key.offset = sector_sum->bytenr;
647 bytenr = sector_sum->bytenr;
648 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
649
650 item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1);
651 if (!IS_ERR(item)) {
652 leaf = path->nodes[0];
653 ret = 0;
654 goto found;
655 }
656 ret = PTR_ERR(item);
657 if (ret == -EFBIG) {
658 u32 item_size;
659 /* we found one, but it isn't big enough yet */
660 leaf = path->nodes[0];
661 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
662 if ((item_size / csum_size) >=
663 MAX_CSUM_ITEMS(root, csum_size)) {
664 /* already at max size, make a new one */
665 goto insert;
666 }
667 } else {
668 int slot = path->slots[0] + 1;
669 /* we didn't find a csum item, insert one */
670 nritems = btrfs_header_nritems(path->nodes[0]);
671 if (path->slots[0] >= nritems - 1) {
672 ret = btrfs_next_leaf(root, path);
673 if (ret == 1)
674 found_next = 1;
675 if (ret != 0)
676 goto insert;
677 slot = 0;
678 }
679 btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
680 if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
681 found_key.type != BTRFS_EXTENT_CSUM_KEY) {
682 found_next = 1;
683 goto insert;
684 }
685 next_offset = found_key.offset;
686 found_next = 1;
687 goto insert;
688 }
689
690 /*
691 * at this point, we know the tree has an item, but it isn't big
692 * enough yet to put our csum in. Grow it
693 */
694 btrfs_release_path(root, path);
695 ret = btrfs_search_slot(trans, root, &file_key, path,
696 csum_size, 1);
697 if (ret < 0)
698 goto fail_unlock;
699
700 if (ret > 0) {
701 if (path->slots[0] == 0)
702 goto insert;
703 path->slots[0]--;
704 }
705
706 leaf = path->nodes[0];
707 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
708 csum_offset = (bytenr - found_key.offset) >>
709 root->fs_info->sb->s_blocksize_bits;
710
711 if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY ||
712 found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
713 csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) {
714 goto insert;
715 }
716
717 if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) /
718 csum_size) {
719 u32 diff = (csum_offset + 1) * csum_size;
720
721 /*
722 * is the item big enough already? we dropped our lock
723 * before and need to recheck
724 */
725 if (diff < btrfs_item_size_nr(leaf, path->slots[0]))
726 goto csum;
727
728 diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
729 if (diff != csum_size)
730 goto insert;
731
732 ret = btrfs_extend_item(trans, root, path, diff);
733 BUG_ON(ret);
734 goto csum;
735 }
736
737insert:
738 btrfs_release_path(root, path);
739 csum_offset = 0;
740 if (found_next) {
741 u64 tmp = total_bytes + root->sectorsize;
742 u64 next_sector = sector_sum->bytenr;
743 struct btrfs_sector_sum *next = sector_sum + 1;
744
745 while (tmp < sums->len) {
746 if (next_sector + root->sectorsize != next->bytenr)
747 break;
748 tmp += root->sectorsize;
749 next_sector = next->bytenr;
750 next++;
751 }
752 tmp = min(tmp, next_offset - file_key.offset);
753 tmp >>= root->fs_info->sb->s_blocksize_bits;
754 tmp = max((u64)1, tmp);
755 tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size));
756 ins_size = csum_size * tmp;
757 } else {
758 ins_size = csum_size;
759 }
760 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
761 ins_size);
762 if (ret < 0)
763 goto fail_unlock;
764 if (ret != 0) {
765 WARN_ON(1);
766 goto fail_unlock;
767 }
768csum:
769 leaf = path->nodes[0];
770 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
771 ret = 0;
772 item = (struct btrfs_csum_item *)((unsigned char *)item +
773 csum_offset * csum_size);
774found:
775 item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
776 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
777 btrfs_item_size_nr(leaf, path->slots[0]));
778 eb_token = NULL;
779 cond_resched();
780next_sector:
781
782 if (!eb_token ||
783 (unsigned long)item + csum_size >= map_start + map_len) {
784 int err;
785
786 if (eb_token)
787 unmap_extent_buffer(leaf, eb_token, KM_USER1);
788 eb_token = NULL;
789 err = map_private_extent_buffer(leaf, (unsigned long)item,
790 csum_size,
791 &eb_token, &eb_map,
792 &map_start, &map_len, KM_USER1);
793 if (err)
794 eb_token = NULL;
795 }
796 if (eb_token) {
797 memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)),
798 &sector_sum->sum, csum_size);
799 } else {
800 write_extent_buffer(leaf, &sector_sum->sum,
801 (unsigned long)item, csum_size);
802 }
803
804 total_bytes += root->sectorsize;
805 sector_sum++;
806 if (total_bytes < sums->len) {
807 item = (struct btrfs_csum_item *)((char *)item +
808 csum_size);
809 if (item < item_end && bytenr + PAGE_CACHE_SIZE ==
810 sector_sum->bytenr) {
811 bytenr = sector_sum->bytenr;
812 goto next_sector;
813 }
814 }
815 if (eb_token) {
816 unmap_extent_buffer(leaf, eb_token, KM_USER1);
817 eb_token = NULL;
818 }
819 btrfs_mark_buffer_dirty(path->nodes[0]);
820 cond_resched();
821 if (total_bytes < sums->len) {
822 btrfs_release_path(root, path);
823 goto again;
824 }
825out:
826 btrfs_free_path(path);
827 return ret;
828
829fail_unlock:
830 goto out;
831}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
new file mode 100644
index 000000000000..90268334145e
--- /dev/null
+++ b/fs/btrfs/file.c
@@ -0,0 +1,1288 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/pagemap.h>
21#include <linux/highmem.h>
22#include <linux/time.h>
23#include <linux/init.h>
24#include <linux/string.h>
25#include <linux/smp_lock.h>
26#include <linux/backing-dev.h>
27#include <linux/mpage.h>
28#include <linux/swap.h>
29#include <linux/writeback.h>
30#include <linux/statfs.h>
31#include <linux/compat.h>
32#include <linux/version.h>
33#include "ctree.h"
34#include "disk-io.h"
35#include "transaction.h"
36#include "btrfs_inode.h"
37#include "ioctl.h"
38#include "print-tree.h"
39#include "tree-log.h"
40#include "locking.h"
41#include "compat.h"
42
43
44/* simple helper to fault in pages and copy. This should go away
45 * and be replaced with calls into generic code.
46 */
47static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
48 int write_bytes,
49 struct page **prepared_pages,
50 const char __user *buf)
51{
52 long page_fault = 0;
53 int i;
54 int offset = pos & (PAGE_CACHE_SIZE - 1);
55
56 for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
57 size_t count = min_t(size_t,
58 PAGE_CACHE_SIZE - offset, write_bytes);
59 struct page *page = prepared_pages[i];
60 fault_in_pages_readable(buf, count);
61
62 /* Copy data from userspace to the current page */
63 kmap(page);
64 page_fault = __copy_from_user(page_address(page) + offset,
65 buf, count);
66 /* Flush processor's dcache for this page */
67 flush_dcache_page(page);
68 kunmap(page);
69 buf += count;
70 write_bytes -= count;
71
72 if (page_fault)
73 break;
74 }
75 return page_fault ? -EFAULT : 0;
76}
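/*
 * Editor's sketch, not part of the original patch: the helper above splits
 * a write at page boundaries; only the first page uses a non-zero offset.
 * A stand-alone model of the per-page byte counts, assuming 4KiB pages:
 */
#include <stdint.h>
#include <stddef.h>

#define PAGE_SZ 4096u

static size_t bytes_for_page(size_t page_index, uint64_t pos, size_t remaining)
{
	size_t offset = page_index == 0 ? (size_t)(pos & (PAGE_SZ - 1)) : 0;
	size_t space = PAGE_SZ - offset;

	return remaining < space ? remaining : space;
}
/* e.g. pos = 100, 5000 bytes total: page 0 takes 3996, page 1 takes 1004 */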
77
78/*
79 * unlocks pages after btrfs_file_write is done with them
80 */
81static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
82{
83 size_t i;
84 for (i = 0; i < num_pages; i++) {
85 if (!pages[i])
86 break;
 87 /* PageChecked is some magic around finding pages that
 88 * have been modified without going through btrfs_set_page_dirty;
 89 * clear it here.
90 */
91 ClearPageChecked(pages[i]);
92 unlock_page(pages[i]);
93 mark_page_accessed(pages[i]);
94 page_cache_release(pages[i]);
95 }
96}
97
98/*
99 * after copy_from_user, pages need to be dirtied and we need to make
100 * sure holes are created between the current EOF and the start of
101 * any next extents (if required).
102 *
103 * this also makes the decision about creating an inline extent vs
104 * doing real data extents, marking pages dirty and delalloc as required.
105 */
106static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
107 struct btrfs_root *root,
108 struct file *file,
109 struct page **pages,
110 size_t num_pages,
111 loff_t pos,
112 size_t write_bytes)
113{
114 int err = 0;
115 int i;
116 struct inode *inode = fdentry(file)->d_inode;
117 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
118 u64 hint_byte;
119 u64 num_bytes;
120 u64 start_pos;
121 u64 end_of_last_block;
122 u64 end_pos = pos + write_bytes;
123 loff_t isize = i_size_read(inode);
124
125 start_pos = pos & ~((u64)root->sectorsize - 1);
126 num_bytes = (write_bytes + pos - start_pos +
127 root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
128
129 end_of_last_block = start_pos + num_bytes - 1;
130
131 lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
132 trans = btrfs_join_transaction(root, 1);
133 if (!trans) {
134 err = -ENOMEM;
135 goto out_unlock;
136 }
137 btrfs_set_trans_block_group(trans, inode);
138 hint_byte = 0;
139
140 set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
141
142 /* check for reserved extents on each page, we don't want
143 * to reset the delalloc bit on things that already have
144 * extents reserved.
145 */
146 btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
147 for (i = 0; i < num_pages; i++) {
148 struct page *p = pages[i];
149 SetPageUptodate(p);
150 ClearPageChecked(p);
151 set_page_dirty(p);
152 }
153 if (end_pos > isize) {
154 i_size_write(inode, end_pos);
155 btrfs_update_inode(trans, root, inode);
156 }
157 err = btrfs_end_transaction(trans, root);
158out_unlock:
159 unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
160 return err;
161}
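/*
 * Editor's sketch, not part of the original patch: the masking above
 * rounds the written range out to sector boundaries. With a power-of-two
 * sectorsize this is the usual round-down / round-up pair:
 */
#include <stdint.h>

static uint64_t round_down_sector(uint64_t pos, uint64_t sectorsize)
{
	return pos & ~(sectorsize - 1);
}

static uint64_t round_up_sector(uint64_t end, uint64_t sectorsize)
{
	return (end + sectorsize - 1) & ~(sectorsize - 1);
}
/* e.g. with 4096: pos 5000 rounds down to 4096, end 9000 rounds up to 12288 */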
162
163/*
164 * this drops all the extents in the cache that intersect the range
165 * [start, end]. Existing extents are split as required.
166 */
167int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
168 int skip_pinned)
169{
170 struct extent_map *em;
171 struct extent_map *split = NULL;
172 struct extent_map *split2 = NULL;
173 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
174 u64 len = end - start + 1;
175 int ret;
176 int testend = 1;
177 unsigned long flags;
178 int compressed = 0;
179
180 WARN_ON(end < start);
181 if (end == (u64)-1) {
182 len = (u64)-1;
183 testend = 0;
184 }
185 while (1) {
186 if (!split)
187 split = alloc_extent_map(GFP_NOFS);
188 if (!split2)
189 split2 = alloc_extent_map(GFP_NOFS);
190
191 spin_lock(&em_tree->lock);
192 em = lookup_extent_mapping(em_tree, start, len);
193 if (!em) {
194 spin_unlock(&em_tree->lock);
195 break;
196 }
197 flags = em->flags;
198 if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
199 spin_unlock(&em_tree->lock);
200 if (em->start <= start &&
201 (!testend || em->start + em->len >= start + len)) {
202 free_extent_map(em);
203 break;
204 }
205 if (start < em->start) {
206 len = em->start - start;
207 } else {
208 len = start + len - (em->start + em->len);
209 start = em->start + em->len;
210 }
211 free_extent_map(em);
212 continue;
213 }
214 compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
215 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
216 remove_extent_mapping(em_tree, em);
217
218 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
219 em->start < start) {
220 split->start = em->start;
221 split->len = start - em->start;
222 split->orig_start = em->orig_start;
223 split->block_start = em->block_start;
224
225 if (compressed)
226 split->block_len = em->block_len;
227 else
228 split->block_len = split->len;
229
230 split->bdev = em->bdev;
231 split->flags = flags;
232 ret = add_extent_mapping(em_tree, split);
233 BUG_ON(ret);
234 free_extent_map(split);
235 split = split2;
236 split2 = NULL;
237 }
238 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
239 testend && em->start + em->len > start + len) {
240 u64 diff = start + len - em->start;
241
242 split->start = start + len;
243 split->len = em->start + em->len - (start + len);
244 split->bdev = em->bdev;
245 split->flags = flags;
246
247 if (compressed) {
248 split->block_len = em->block_len;
249 split->block_start = em->block_start;
250 split->orig_start = em->orig_start;
251 } else {
252 split->block_len = split->len;
253 split->block_start = em->block_start + diff;
254 split->orig_start = split->start;
255 }
256
257 ret = add_extent_mapping(em_tree, split);
258 BUG_ON(ret);
259 free_extent_map(split);
260 split = NULL;
261 }
262 spin_unlock(&em_tree->lock);
263
264 /* once for us */
265 free_extent_map(em);
 266 /* once for the tree */
267 free_extent_map(em);
268 }
269 if (split)
270 free_extent_map(split);
271 if (split2)
272 free_extent_map(split2);
273 return 0;
274}
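/*
 * Editor's sketch, not part of the original patch: dropping
 * [start, start + len) from a cached mapping can leave a front piece, a
 * tail piece, or both. A stand-alone computation of the surviving pieces
 * (uncompressed case, where block_len simply tracks len):
 */
#include <stdint.h>

struct piece { uint64_t start, len; };

static int split_pieces(uint64_t em_start, uint64_t em_len,
			uint64_t start, uint64_t len, struct piece out[2])
{
	uint64_t em_end = em_start + em_len, end = start + len;
	int n = 0;

	if (em_start < start) {			/* front survives */
		out[n].start = em_start;
		out[n++].len = start - em_start;
	}
	if (em_end > end) {			/* tail survives */
		out[n].start = end;
		out[n++].len = em_end - end;
	}
	return n;
}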
275
276int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
277{
278 return 0;
279#if 0
280 struct btrfs_path *path;
281 struct btrfs_key found_key;
282 struct extent_buffer *leaf;
283 struct btrfs_file_extent_item *extent;
284 u64 last_offset = 0;
285 int nritems;
286 int slot;
287 int found_type;
288 int ret;
289 int err = 0;
290 u64 extent_end = 0;
291
292 path = btrfs_alloc_path();
293 ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino,
294 last_offset, 0);
295 while (1) {
296 nritems = btrfs_header_nritems(path->nodes[0]);
297 if (path->slots[0] >= nritems) {
298 ret = btrfs_next_leaf(root, path);
299 if (ret)
300 goto out;
301 nritems = btrfs_header_nritems(path->nodes[0]);
302 }
303 slot = path->slots[0];
304 leaf = path->nodes[0];
305 btrfs_item_key_to_cpu(leaf, &found_key, slot);
306 if (found_key.objectid != inode->i_ino)
307 break;
308 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
309 goto out;
310
311 if (found_key.offset < last_offset) {
312 WARN_ON(1);
313 btrfs_print_leaf(root, leaf);
314 printk(KERN_ERR "inode %lu found offset %llu "
315 "expected %llu\n", inode->i_ino,
316 (unsigned long long)found_key.offset,
317 (unsigned long long)last_offset);
318 err = 1;
319 goto out;
320 }
321 extent = btrfs_item_ptr(leaf, slot,
322 struct btrfs_file_extent_item);
323 found_type = btrfs_file_extent_type(leaf, extent);
324 if (found_type == BTRFS_FILE_EXTENT_REG) {
325 extent_end = found_key.offset +
326 btrfs_file_extent_num_bytes(leaf, extent);
327 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
328 struct btrfs_item *item;
329 item = btrfs_item_nr(leaf, slot);
330 extent_end = found_key.offset +
331 btrfs_file_extent_inline_len(leaf, extent);
332 extent_end = (extent_end + root->sectorsize - 1) &
333 ~((u64)root->sectorsize - 1);
334 }
335 last_offset = extent_end;
336 path->slots[0]++;
337 }
338 if (0 && last_offset < inode->i_size) {
339 WARN_ON(1);
340 btrfs_print_leaf(root, leaf);
341 printk(KERN_ERR "inode %lu found offset %llu size %llu\n",
342 inode->i_ino, (unsigned long long)last_offset,
343 (unsigned long long)inode->i_size);
344 err = 1;
345
346 }
347out:
348 btrfs_free_path(path);
349 return err;
350#endif
351}
352
353/*
354 * this is very complex, but the basic idea is to drop all extents
 355 * in the range start - end. hint_byte is filled in with a byte number
356 * that would be a good hint to the block allocator for this file.
357 *
358 * If an extent intersects the range but is not entirely inside the range
359 * it is either truncated or split. Anything entirely inside the range
360 * is deleted from the tree.
361 *
362 * inline_limit is used to tell this code which offsets in the file to keep
363 * if they contain inline extents.
364 */
365noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
366 struct btrfs_root *root, struct inode *inode,
367 u64 start, u64 end, u64 inline_limit, u64 *hint_byte)
368{
369 u64 extent_end = 0;
370 u64 locked_end = end;
371 u64 search_start = start;
372 u64 leaf_start;
373 u64 ram_bytes = 0;
374 u64 orig_parent = 0;
375 u64 disk_bytenr = 0;
376 u8 compression;
377 u8 encryption;
378 u16 other_encoding = 0;
379 u64 root_gen;
380 u64 root_owner;
381 struct extent_buffer *leaf;
382 struct btrfs_file_extent_item *extent;
383 struct btrfs_path *path;
384 struct btrfs_key key;
385 struct btrfs_file_extent_item old;
386 int keep;
387 int slot;
388 int bookend;
389 int found_type = 0;
390 int found_extent;
391 int found_inline;
392 int recow;
393 int ret;
394
395 inline_limit = 0;
396 btrfs_drop_extent_cache(inode, start, end - 1, 0);
397
398 path = btrfs_alloc_path();
399 if (!path)
400 return -ENOMEM;
401 while (1) {
402 recow = 0;
403 btrfs_release_path(root, path);
404 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
405 search_start, -1);
406 if (ret < 0)
407 goto out;
408 if (ret > 0) {
409 if (path->slots[0] == 0) {
410 ret = 0;
411 goto out;
412 }
413 path->slots[0]--;
414 }
415next_slot:
416 keep = 0;
417 bookend = 0;
418 found_extent = 0;
419 found_inline = 0;
420 leaf_start = 0;
421 root_gen = 0;
422 root_owner = 0;
423 compression = 0;
424 encryption = 0;
425 extent = NULL;
426 leaf = path->nodes[0];
427 slot = path->slots[0];
428 ret = 0;
429 btrfs_item_key_to_cpu(leaf, &key, slot);
430 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY &&
431 key.offset >= end) {
432 goto out;
433 }
434 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
435 key.objectid != inode->i_ino) {
436 goto out;
437 }
438 if (recow) {
439 search_start = max(key.offset, start);
440 continue;
441 }
442 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
443 extent = btrfs_item_ptr(leaf, slot,
444 struct btrfs_file_extent_item);
445 found_type = btrfs_file_extent_type(leaf, extent);
446 compression = btrfs_file_extent_compression(leaf,
447 extent);
448 encryption = btrfs_file_extent_encryption(leaf,
449 extent);
450 other_encoding = btrfs_file_extent_other_encoding(leaf,
451 extent);
452 if (found_type == BTRFS_FILE_EXTENT_REG ||
453 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
454 extent_end =
455 btrfs_file_extent_disk_bytenr(leaf,
456 extent);
457 if (extent_end)
458 *hint_byte = extent_end;
459
460 extent_end = key.offset +
461 btrfs_file_extent_num_bytes(leaf, extent);
462 ram_bytes = btrfs_file_extent_ram_bytes(leaf,
463 extent);
464 found_extent = 1;
465 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
466 found_inline = 1;
467 extent_end = key.offset +
468 btrfs_file_extent_inline_len(leaf, extent);
469 }
470 } else {
471 extent_end = search_start;
472 }
473
474 /* we found nothing we can drop */
475 if ((!found_extent && !found_inline) ||
476 search_start >= extent_end) {
477 int nextret;
478 u32 nritems;
479 nritems = btrfs_header_nritems(leaf);
480 if (slot >= nritems - 1) {
481 nextret = btrfs_next_leaf(root, path);
482 if (nextret)
483 goto out;
484 recow = 1;
485 } else {
486 path->slots[0]++;
487 }
488 goto next_slot;
489 }
490
491 if (end <= extent_end && start >= key.offset && found_inline)
492 *hint_byte = EXTENT_MAP_INLINE;
493
494 if (found_extent) {
495 read_extent_buffer(leaf, &old, (unsigned long)extent,
496 sizeof(old));
497 root_gen = btrfs_header_generation(leaf);
498 root_owner = btrfs_header_owner(leaf);
499 leaf_start = leaf->start;
500 }
501
502 if (end < extent_end && end >= key.offset) {
503 bookend = 1;
504 if (found_inline && start <= key.offset)
505 keep = 1;
506 }
507
508 if (bookend && found_extent) {
509 if (locked_end < extent_end) {
510 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
511 locked_end, extent_end - 1,
512 GFP_NOFS);
513 if (!ret) {
514 btrfs_release_path(root, path);
515 lock_extent(&BTRFS_I(inode)->io_tree,
516 locked_end, extent_end - 1,
517 GFP_NOFS);
518 locked_end = extent_end;
519 continue;
520 }
521 locked_end = extent_end;
522 }
523 orig_parent = path->nodes[0]->start;
524 disk_bytenr = le64_to_cpu(old.disk_bytenr);
525 if (disk_bytenr != 0) {
526 ret = btrfs_inc_extent_ref(trans, root,
527 disk_bytenr,
528 le64_to_cpu(old.disk_num_bytes),
529 orig_parent, root->root_key.objectid,
530 trans->transid, inode->i_ino);
531 BUG_ON(ret);
532 }
533 }
534
535 if (found_inline) {
536 u64 mask = root->sectorsize - 1;
537 search_start = (extent_end + mask) & ~mask;
538 } else
539 search_start = extent_end;
540
541 /* truncate existing extent */
542 if (start > key.offset) {
543 u64 new_num;
544 u64 old_num;
545 keep = 1;
546 WARN_ON(start & (root->sectorsize - 1));
547 if (found_extent) {
548 new_num = start - key.offset;
549 old_num = btrfs_file_extent_num_bytes(leaf,
550 extent);
551 *hint_byte =
552 btrfs_file_extent_disk_bytenr(leaf,
553 extent);
554 if (btrfs_file_extent_disk_bytenr(leaf,
555 extent)) {
556 inode_sub_bytes(inode, old_num -
557 new_num);
558 }
559 btrfs_set_file_extent_num_bytes(leaf,
560 extent, new_num);
561 btrfs_mark_buffer_dirty(leaf);
562 } else if (key.offset < inline_limit &&
563 (end > extent_end) &&
564 (inline_limit < extent_end)) {
565 u32 new_size;
566 new_size = btrfs_file_extent_calc_inline_size(
567 inline_limit - key.offset);
568 inode_sub_bytes(inode, extent_end -
569 inline_limit);
570 btrfs_set_file_extent_ram_bytes(leaf, extent,
571 new_size);
572 if (!compression && !encryption) {
573 btrfs_truncate_item(trans, root, path,
574 new_size, 1);
575 }
576 }
577 }
578 /* delete the entire extent */
579 if (!keep) {
580 if (found_inline)
581 inode_sub_bytes(inode, extent_end -
582 key.offset);
583 ret = btrfs_del_item(trans, root, path);
584 /* TODO update progress marker and return */
585 BUG_ON(ret);
586 extent = NULL;
587 btrfs_release_path(root, path);
588 /* the extent will be freed later */
589 }
590 if (bookend && found_inline && start <= key.offset) {
591 u32 new_size;
592 new_size = btrfs_file_extent_calc_inline_size(
593 extent_end - end);
594 inode_sub_bytes(inode, end - key.offset);
595 btrfs_set_file_extent_ram_bytes(leaf, extent,
596 new_size);
597 if (!compression && !encryption)
598 ret = btrfs_truncate_item(trans, root, path,
599 new_size, 0);
600 BUG_ON(ret);
601 }
602 /* create bookend, splitting the extent in two */
603 if (bookend && found_extent) {
604 struct btrfs_key ins;
605 ins.objectid = inode->i_ino;
606 ins.offset = end;
607 btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
608
609 btrfs_release_path(root, path);
610 ret = btrfs_insert_empty_item(trans, root, path, &ins,
611 sizeof(*extent));
612 BUG_ON(ret);
613
614 leaf = path->nodes[0];
615 extent = btrfs_item_ptr(leaf, path->slots[0],
616 struct btrfs_file_extent_item);
617 write_extent_buffer(leaf, &old,
618 (unsigned long)extent, sizeof(old));
619
620 btrfs_set_file_extent_compression(leaf, extent,
621 compression);
622 btrfs_set_file_extent_encryption(leaf, extent,
623 encryption);
624 btrfs_set_file_extent_other_encoding(leaf, extent,
625 other_encoding);
626 btrfs_set_file_extent_offset(leaf, extent,
627 le64_to_cpu(old.offset) + end - key.offset);
628 WARN_ON(le64_to_cpu(old.num_bytes) <
629 (extent_end - end));
630 btrfs_set_file_extent_num_bytes(leaf, extent,
631 extent_end - end);
632
633 /*
634 * set the ram bytes to the size of the full extent
635 * before splitting. This is a worst case flag,
 636 * but it's the best we can do because we don't know
637 * how splitting affects compression
638 */
639 btrfs_set_file_extent_ram_bytes(leaf, extent,
640 ram_bytes);
641 btrfs_set_file_extent_type(leaf, extent, found_type);
642
643 btrfs_mark_buffer_dirty(path->nodes[0]);
644
645 if (disk_bytenr != 0) {
646 ret = btrfs_update_extent_ref(trans, root,
647 disk_bytenr, orig_parent,
648 leaf->start,
649 root->root_key.objectid,
650 trans->transid, ins.objectid);
651
652 BUG_ON(ret);
653 }
654 btrfs_release_path(root, path);
655 if (disk_bytenr != 0)
656 inode_add_bytes(inode, extent_end - end);
657 }
658
659 if (found_extent && !keep) {
660 u64 old_disk_bytenr = le64_to_cpu(old.disk_bytenr);
661
662 if (old_disk_bytenr != 0) {
663 inode_sub_bytes(inode,
664 le64_to_cpu(old.num_bytes));
665 ret = btrfs_free_extent(trans, root,
666 old_disk_bytenr,
667 le64_to_cpu(old.disk_num_bytes),
668 leaf_start, root_owner,
669 root_gen, key.objectid, 0);
670 BUG_ON(ret);
671 *hint_byte = old_disk_bytenr;
672 }
673 }
674
675 if (search_start >= end) {
676 ret = 0;
677 goto out;
678 }
679 }
680out:
681 btrfs_free_path(path);
682 if (locked_end > end) {
683 unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
684 GFP_NOFS);
685 }
686 btrfs_check_file(root, inode);
687 return ret;
688}
689
690static int extent_mergeable(struct extent_buffer *leaf, int slot,
691 u64 objectid, u64 bytenr, u64 *start, u64 *end)
692{
693 struct btrfs_file_extent_item *fi;
694 struct btrfs_key key;
695 u64 extent_end;
696
697 if (slot < 0 || slot >= btrfs_header_nritems(leaf))
698 return 0;
699
700 btrfs_item_key_to_cpu(leaf, &key, slot);
701 if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
702 return 0;
703
704 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
705 if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
706 btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
707 btrfs_file_extent_compression(leaf, fi) ||
708 btrfs_file_extent_encryption(leaf, fi) ||
709 btrfs_file_extent_other_encoding(leaf, fi))
710 return 0;
711
712 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
713 if ((*start && *start != key.offset) || (*end && *end != extent_end))
714 return 0;
715
716 *start = key.offset;
717 *end = extent_end;
718 return 1;
719}
720
721/*
722 * Mark extent in the range start - end as written.
723 *
724 * This changes extent type from 'pre-allocated' to 'regular'. If only
725 * part of extent is marked as written, the extent will be split into
726 * two or three.
727 */
728int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
729 struct btrfs_root *root,
730 struct inode *inode, u64 start, u64 end)
731{
732 struct extent_buffer *leaf;
733 struct btrfs_path *path;
734 struct btrfs_file_extent_item *fi;
735 struct btrfs_key key;
736 u64 bytenr;
737 u64 num_bytes;
738 u64 extent_end;
739 u64 extent_offset;
740 u64 other_start;
741 u64 other_end;
742 u64 split = start;
743 u64 locked_end = end;
744 u64 orig_parent;
745 int extent_type;
746 int split_end = 1;
747 int ret;
748
749 btrfs_drop_extent_cache(inode, start, end - 1, 0);
750
751 path = btrfs_alloc_path();
752 BUG_ON(!path);
753again:
754 key.objectid = inode->i_ino;
755 key.type = BTRFS_EXTENT_DATA_KEY;
756 if (split == start)
757 key.offset = split;
758 else
759 key.offset = split - 1;
760
761 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
762 if (ret > 0 && path->slots[0] > 0)
763 path->slots[0]--;
764
765 leaf = path->nodes[0];
766 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
767 BUG_ON(key.objectid != inode->i_ino ||
768 key.type != BTRFS_EXTENT_DATA_KEY);
769 fi = btrfs_item_ptr(leaf, path->slots[0],
770 struct btrfs_file_extent_item);
771 extent_type = btrfs_file_extent_type(leaf, fi);
772 BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC);
773 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
774 BUG_ON(key.offset > start || extent_end < end);
775
776 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
777 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
778 extent_offset = btrfs_file_extent_offset(leaf, fi);
779
780 if (key.offset == start)
781 split = end;
782
783 if (key.offset == start && extent_end == end) {
784 int del_nr = 0;
785 int del_slot = 0;
786 u64 leaf_owner = btrfs_header_owner(leaf);
787 u64 leaf_gen = btrfs_header_generation(leaf);
788 other_start = end;
789 other_end = 0;
790 if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
791 bytenr, &other_start, &other_end)) {
792 extent_end = other_end;
793 del_slot = path->slots[0] + 1;
794 del_nr++;
795 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
796 leaf->start, leaf_owner,
797 leaf_gen, inode->i_ino, 0);
798 BUG_ON(ret);
799 }
800 other_start = 0;
801 other_end = start;
802 if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
803 bytenr, &other_start, &other_end)) {
804 key.offset = other_start;
805 del_slot = path->slots[0];
806 del_nr++;
807 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
808 leaf->start, leaf_owner,
809 leaf_gen, inode->i_ino, 0);
810 BUG_ON(ret);
811 }
812 split_end = 0;
813 if (del_nr == 0) {
814 btrfs_set_file_extent_type(leaf, fi,
815 BTRFS_FILE_EXTENT_REG);
816 goto done;
817 }
818
819 fi = btrfs_item_ptr(leaf, del_slot - 1,
820 struct btrfs_file_extent_item);
821 btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG);
822 btrfs_set_file_extent_num_bytes(leaf, fi,
823 extent_end - key.offset);
824 btrfs_mark_buffer_dirty(leaf);
825
826 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
827 BUG_ON(ret);
828 goto done;
829 } else if (split == start) {
830 if (locked_end < extent_end) {
831 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
832 locked_end, extent_end - 1, GFP_NOFS);
833 if (!ret) {
834 btrfs_release_path(root, path);
835 lock_extent(&BTRFS_I(inode)->io_tree,
836 locked_end, extent_end - 1, GFP_NOFS);
837 locked_end = extent_end;
838 goto again;
839 }
840 locked_end = extent_end;
841 }
842 btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset);
843 extent_offset += split - key.offset;
844 } else {
845 BUG_ON(key.offset != start);
846 btrfs_set_file_extent_offset(leaf, fi, extent_offset +
847 split - key.offset);
848 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
849 key.offset = split;
850 btrfs_set_item_key_safe(trans, root, path, &key);
851 extent_end = split;
852 }
853
854 if (extent_end == end) {
855 split_end = 0;
856 extent_type = BTRFS_FILE_EXTENT_REG;
857 }
858 if (extent_end == end && split == start) {
859 other_start = end;
860 other_end = 0;
861 if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
862 bytenr, &other_start, &other_end)) {
863 path->slots[0]++;
864 fi = btrfs_item_ptr(leaf, path->slots[0],
865 struct btrfs_file_extent_item);
866 key.offset = split;
867 btrfs_set_item_key_safe(trans, root, path, &key);
868 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
869 btrfs_set_file_extent_num_bytes(leaf, fi,
870 other_end - split);
871 goto done;
872 }
873 }
874 if (extent_end == end && split == end) {
875 other_start = 0;
876 other_end = start;
877 if (extent_mergeable(leaf, path->slots[0] - 1 , inode->i_ino,
878 bytenr, &other_start, &other_end)) {
879 path->slots[0]--;
880 fi = btrfs_item_ptr(leaf, path->slots[0],
881 struct btrfs_file_extent_item);
882 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end -
883 other_start);
884 goto done;
885 }
886 }
887
888 btrfs_mark_buffer_dirty(leaf);
889
890 orig_parent = leaf->start;
891 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
892 orig_parent, root->root_key.objectid,
893 trans->transid, inode->i_ino);
894 BUG_ON(ret);
895 btrfs_release_path(root, path);
896
897 key.offset = start;
898 ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi));
899 BUG_ON(ret);
900
901 leaf = path->nodes[0];
902 fi = btrfs_item_ptr(leaf, path->slots[0],
903 struct btrfs_file_extent_item);
904 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
905 btrfs_set_file_extent_type(leaf, fi, extent_type);
906 btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr);
907 btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes);
908 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
909 btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset);
910 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
911 btrfs_set_file_extent_compression(leaf, fi, 0);
912 btrfs_set_file_extent_encryption(leaf, fi, 0);
913 btrfs_set_file_extent_other_encoding(leaf, fi, 0);
914
915 if (orig_parent != leaf->start) {
916 ret = btrfs_update_extent_ref(trans, root, bytenr,
917 orig_parent, leaf->start,
918 root->root_key.objectid,
919 trans->transid, inode->i_ino);
920 BUG_ON(ret);
921 }
922done:
923 btrfs_mark_buffer_dirty(leaf);
924 btrfs_release_path(root, path);
925 if (split_end && split == start) {
926 split = end;
927 goto again;
928 }
929 if (locked_end > end) {
930 unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
931 GFP_NOFS);
932 }
933 btrfs_free_path(path);
934 return 0;
935}
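/*
 * Editor's sketch, not part of the original patch: marking [start, end)
 * written inside a preallocated extent [key_offset, extent_end) needs
 * zero, one or two split points, depending on how the ranges line up
 * ("the extent will be split into two or three" above):
 */
#include <stdint.h>

static int num_split_points(uint64_t key_offset, uint64_t extent_end,
			    uint64_t start, uint64_t end)
{
	int n = 0;

	if (start > key_offset)
		n++;		/* keep a prealloc piece in front */
	if (end < extent_end)
		n++;		/* keep a prealloc piece behind */
	return n;		/* 0: flip the type in place; 1 or 2: split */
}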
936
937/*
 938 * this gets pages into the page cache and locks them down; it also properly
939 * waits for data=ordered extents to finish before allowing the pages to be
940 * modified.
941 */
942static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
943 struct page **pages, size_t num_pages,
944 loff_t pos, unsigned long first_index,
945 unsigned long last_index, size_t write_bytes)
946{
947 int i;
948 unsigned long index = pos >> PAGE_CACHE_SHIFT;
949 struct inode *inode = fdentry(file)->d_inode;
950 int err = 0;
951 u64 start_pos;
952 u64 last_pos;
953
954 start_pos = pos & ~((u64)root->sectorsize - 1);
955 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
956
957 if (start_pos > inode->i_size) {
958 err = btrfs_cont_expand(inode, start_pos);
959 if (err)
960 return err;
961 }
962
963 memset(pages, 0, num_pages * sizeof(struct page *));
964again:
965 for (i = 0; i < num_pages; i++) {
966 pages[i] = grab_cache_page(inode->i_mapping, index + i);
967 if (!pages[i]) {
968 err = -ENOMEM;
969 BUG_ON(1);
970 }
971 wait_on_page_writeback(pages[i]);
972 }
973 if (start_pos < inode->i_size) {
974 struct btrfs_ordered_extent *ordered;
975 lock_extent(&BTRFS_I(inode)->io_tree,
976 start_pos, last_pos - 1, GFP_NOFS);
977 ordered = btrfs_lookup_first_ordered_extent(inode,
978 last_pos - 1);
979 if (ordered &&
980 ordered->file_offset + ordered->len > start_pos &&
981 ordered->file_offset < last_pos) {
982 btrfs_put_ordered_extent(ordered);
983 unlock_extent(&BTRFS_I(inode)->io_tree,
984 start_pos, last_pos - 1, GFP_NOFS);
985 for (i = 0; i < num_pages; i++) {
986 unlock_page(pages[i]);
987 page_cache_release(pages[i]);
988 }
989 btrfs_wait_ordered_range(inode, start_pos,
990 last_pos - start_pos);
991 goto again;
992 }
993 if (ordered)
994 btrfs_put_ordered_extent(ordered);
995
996 clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
997 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
998 GFP_NOFS);
999 unlock_extent(&BTRFS_I(inode)->io_tree,
1000 start_pos, last_pos - 1, GFP_NOFS);
1001 }
1002 for (i = 0; i < num_pages; i++) {
1003 clear_page_dirty_for_io(pages[i]);
1004 set_page_extent_mapped(pages[i]);
1005 WARN_ON(!PageLocked(pages[i]));
1006 }
1007 return 0;
1008}
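/*
 * Editor's sketch, not part of the original patch: the retry above
 * triggers whenever an ordered extent intersects the locked page range.
 * The intersection test, stand-alone, over half-open ranges
 * [file_offset, file_offset + len) vs [start_pos, last_pos):
 */
#include <stdint.h>

static int ordered_overlaps(uint64_t file_offset, uint64_t len,
			    uint64_t start_pos, uint64_t last_pos)
{
	return file_offset + len > start_pos && file_offset < last_pos;
}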
1009
1010static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1011 size_t count, loff_t *ppos)
1012{
1013 loff_t pos;
1014 loff_t start_pos;
1015 ssize_t num_written = 0;
1016 ssize_t err = 0;
1017 int ret = 0;
1018 struct inode *inode = fdentry(file)->d_inode;
1019 struct btrfs_root *root = BTRFS_I(inode)->root;
1020 struct page **pages = NULL;
1021 int nrptrs;
1022 struct page *pinned[2];
1023 unsigned long first_index;
1024 unsigned long last_index;
1025 int will_write;
1026
1027 will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) ||
1028 (file->f_flags & O_DIRECT));
1029
1030 nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
1031 PAGE_CACHE_SIZE / (sizeof(struct page *)));
1032 pinned[0] = NULL;
1033 pinned[1] = NULL;
1034
1035 pos = *ppos;
1036 start_pos = pos;
1037
1038 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
1039 current->backing_dev_info = inode->i_mapping->backing_dev_info;
1040 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1041 if (err)
1042 goto out_nolock;
1043 if (count == 0)
1044 goto out_nolock;
1045
1046 err = file_remove_suid(file);
1047 if (err)
1048 goto out_nolock;
1049 file_update_time(file);
1050
 1051 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
 if (!pages) {
 err = -ENOMEM;
 goto out_nolock;
 }
1052
1053 mutex_lock(&inode->i_mutex);
1054 BTRFS_I(inode)->sequence++;
1055 first_index = pos >> PAGE_CACHE_SHIFT;
1056 last_index = (pos + count) >> PAGE_CACHE_SHIFT;
1057
1058 /*
1059 * there are lots of better ways to do this, but this code
1060 * makes sure the first and last page in the file range are
1061 * up to date and ready for cow
1062 */
1063 if ((pos & (PAGE_CACHE_SIZE - 1))) {
1064 pinned[0] = grab_cache_page(inode->i_mapping, first_index);
1065 if (!PageUptodate(pinned[0])) {
1066 ret = btrfs_readpage(NULL, pinned[0]);
1067 BUG_ON(ret);
1068 wait_on_page_locked(pinned[0]);
1069 } else {
1070 unlock_page(pinned[0]);
1071 }
1072 }
1073 if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
1074 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
1075 if (!PageUptodate(pinned[1])) {
1076 ret = btrfs_readpage(NULL, pinned[1]);
1077 BUG_ON(ret);
1078 wait_on_page_locked(pinned[1]);
1079 } else {
1080 unlock_page(pinned[1]);
1081 }
1082 }
1083
1084 while (count > 0) {
1085 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
1086 size_t write_bytes = min(count, nrptrs *
1087 (size_t)PAGE_CACHE_SIZE -
1088 offset);
1089 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
1090 PAGE_CACHE_SHIFT;
1091
1092 WARN_ON(num_pages > nrptrs);
1093 memset(pages, 0, sizeof(struct page *) * nrptrs);
1094
1095 ret = btrfs_check_free_space(root, write_bytes, 0);
1096 if (ret)
1097 goto out;
1098
1099 ret = prepare_pages(root, file, pages, num_pages,
1100 pos, first_index, last_index,
1101 write_bytes);
1102 if (ret)
1103 goto out;
1104
1105 ret = btrfs_copy_from_user(pos, num_pages,
1106 write_bytes, pages, buf);
1107 if (ret) {
1108 btrfs_drop_pages(pages, num_pages);
1109 goto out;
1110 }
1111
1112 ret = dirty_and_release_pages(NULL, root, file, pages,
1113 num_pages, pos, write_bytes);
1114 btrfs_drop_pages(pages, num_pages);
1115 if (ret)
1116 goto out;
1117
1118 if (will_write) {
1119 btrfs_fdatawrite_range(inode->i_mapping, pos,
1120 pos + write_bytes - 1,
1121 WB_SYNC_NONE);
1122 } else {
1123 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
1124 num_pages);
1125 if (num_pages <
1126 (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1127 btrfs_btree_balance_dirty(root, 1);
1128 btrfs_throttle(root);
1129 }
1130
1131 buf += write_bytes;
1132 count -= write_bytes;
1133 pos += write_bytes;
1134 num_written += write_bytes;
1135
1136 cond_resched();
1137 }
1138out:
1139 mutex_unlock(&inode->i_mutex);
1140
1141out_nolock:
1142 kfree(pages);
1143 if (pinned[0])
1144 page_cache_release(pinned[0]);
1145 if (pinned[1])
1146 page_cache_release(pinned[1]);
1147 *ppos = pos;
1148
1149 if (num_written > 0 && will_write) {
1150 struct btrfs_trans_handle *trans;
1151
1152 err = btrfs_wait_ordered_range(inode, start_pos, num_written);
1153 if (err)
1154 num_written = err;
1155
1156 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
1157 trans = btrfs_start_transaction(root, 1);
1158 ret = btrfs_log_dentry_safe(trans, root,
1159 file->f_dentry);
1160 if (ret == 0) {
1161 btrfs_sync_log(trans, root);
1162 btrfs_end_transaction(trans, root);
1163 } else {
1164 btrfs_commit_transaction(trans, root);
1165 }
1166 }
1167 if (file->f_flags & O_DIRECT) {
1168 invalidate_mapping_pages(inode->i_mapping,
1169 start_pos >> PAGE_CACHE_SHIFT,
1170 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
1171 }
1172 }
1173 current->backing_dev_info = NULL;
1174 return num_written ? num_written : err;
1175}
1176
1177int btrfs_release_file(struct inode *inode, struct file *filp)
1178{
1179 if (filp->private_data)
1180 btrfs_ioctl_trans_end(filp);
1181 return 0;
1182}
1183
1184/*
 1185 * fsync call for both files and directories. Whenever possible this
 1186 * logs the inode into the tree log instead of forcing a full commit.
1187 *
 1188 * It needs to call filemap_fdatawait so that all ordered extent updates
 1189 * in the metadata btree are up to date for copying to the log.
1190 *
1191 * It drops the inode mutex before doing the tree log commit. This is an
1192 * important optimization for directories because holding the mutex prevents
1193 * new operations on the dir while we write to disk.
1194 */
1195int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1196{
1197 struct inode *inode = dentry->d_inode;
1198 struct btrfs_root *root = BTRFS_I(inode)->root;
1199 int ret = 0;
1200 struct btrfs_trans_handle *trans;
1201
1202 /*
1203 * check the transaction that last modified this inode
1204 * and see if its already been committed
1205 */
1206 if (!BTRFS_I(inode)->last_trans)
1207 goto out;
1208
1209 mutex_lock(&root->fs_info->trans_mutex);
1210 if (BTRFS_I(inode)->last_trans <=
1211 root->fs_info->last_trans_committed) {
1212 BTRFS_I(inode)->last_trans = 0;
1213 mutex_unlock(&root->fs_info->trans_mutex);
1214 goto out;
1215 }
1216 mutex_unlock(&root->fs_info->trans_mutex);
1217
1218 root->fs_info->tree_log_batch++;
1219 filemap_fdatawrite(inode->i_mapping);
1220 btrfs_wait_ordered_range(inode, 0, (u64)-1);
1221 root->fs_info->tree_log_batch++;
1222
1223 /*
1224	 * ok we haven't committed the transaction yet, let's do a commit
1225 */
1226 if (file->private_data)
1227 btrfs_ioctl_trans_end(file);
1228
1229 trans = btrfs_start_transaction(root, 1);
1230 if (!trans) {
1231 ret = -ENOMEM;
1232 goto out;
1233 }
1234
1235 ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
1236 if (ret < 0)
1237 goto out;
1238
1239 /* we've logged all the items and now have a consistent
1240 * version of the file in the log. It is possible that
1241 * someone will come in and modify the file, but that's
1242 * fine because the log is consistent on disk, and we
1243 * have references to all of the file's extents
1244 *
1245 * It is possible that someone will come in and log the
1246 * file again, but that will end up using the synchronization
1247 * inside btrfs_sync_log to keep things safe.
1248 */
1249 mutex_unlock(&file->f_dentry->d_inode->i_mutex);
1250
1251 if (ret > 0) {
1252 ret = btrfs_commit_transaction(trans, root);
1253 } else {
1254 btrfs_sync_log(trans, root);
1255 ret = btrfs_end_transaction(trans, root);
1256 }
1257 mutex_lock(&file->f_dentry->d_inode->i_mutex);
1258out:
1259	return ret > 0 ? -EIO : ret;
1260}
1261
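[editor's note] The early exit at the top of btrfs_sync_file() is purely a generation comparison: the inode remembers the id of the transaction that last touched it, and if that id is at or below the last committed transaction there is nothing left to make durable. A minimal standalone model of that check (user-space sketch with made-up numbers, not kernel code):

#include <stdio.h>
#include <stdint.h>

/* models the early-exit in btrfs_sync_file(): if the transaction that
 * last touched the inode has already been committed, nothing new needs
 * to reach disk and fsync can return immediately */
struct fs_state { uint64_t last_trans_committed; };
struct inode_state { uint64_t last_trans; };

static int fsync_needs_work(const struct fs_state *fs, struct inode_state *ino)
{
	if (!ino->last_trans)
		return 0;			/* never modified */
	if (ino->last_trans <= fs->last_trans_committed) {
		ino->last_trans = 0;		/* cache the answer */
		return 0;			/* already durable */
	}
	return 1;				/* must log or commit */
}

int main(void)
{
	struct fs_state fs = { .last_trans_committed = 41 };
	struct inode_state clean = { .last_trans = 40 };
	struct inode_state dirty = { .last_trans = 42 };

	printf("clean needs work: %d\n", fsync_needs_work(&fs, &clean));
	printf("dirty needs work: %d\n", fsync_needs_work(&fs, &dirty));
	return 0;
}
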
1262static struct vm_operations_struct btrfs_file_vm_ops = {
1263 .fault = filemap_fault,
1264 .page_mkwrite = btrfs_page_mkwrite,
1265};
1266
1267static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1268{
1269 vma->vm_ops = &btrfs_file_vm_ops;
1270 file_accessed(filp);
1271 return 0;
1272}
1273
1274struct file_operations btrfs_file_operations = {
1275 .llseek = generic_file_llseek,
1276 .read = do_sync_read,
1277 .aio_read = generic_file_aio_read,
1278 .splice_read = generic_file_splice_read,
1279 .write = btrfs_file_write,
1280 .mmap = btrfs_file_mmap,
1281 .open = generic_file_open,
1282 .release = btrfs_release_file,
1283 .fsync = btrfs_sync_file,
1284 .unlocked_ioctl = btrfs_ioctl,
1285#ifdef CONFIG_COMPAT
1286 .compat_ioctl = btrfs_ioctl,
1287#endif
1288};
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
new file mode 100644
index 000000000000..d1e5f0e84c58
--- /dev/null
+++ b/fs/btrfs/free-space-cache.c
@@ -0,0 +1,495 @@
1/*
2 * Copyright (C) 2008 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21
22static int tree_insert_offset(struct rb_root *root, u64 offset,
23 struct rb_node *node)
24{
25 struct rb_node **p = &root->rb_node;
26 struct rb_node *parent = NULL;
27 struct btrfs_free_space *info;
28
29 while (*p) {
30 parent = *p;
31 info = rb_entry(parent, struct btrfs_free_space, offset_index);
32
33 if (offset < info->offset)
34 p = &(*p)->rb_left;
35 else if (offset > info->offset)
36 p = &(*p)->rb_right;
37 else
38 return -EEXIST;
39 }
40
41 rb_link_node(node, parent, p);
42 rb_insert_color(node, root);
43
44 return 0;
45}
46
47static int tree_insert_bytes(struct rb_root *root, u64 bytes,
48 struct rb_node *node)
49{
50 struct rb_node **p = &root->rb_node;
51 struct rb_node *parent = NULL;
52 struct btrfs_free_space *info;
53
54 while (*p) {
55 parent = *p;
56 info = rb_entry(parent, struct btrfs_free_space, bytes_index);
57
58 if (bytes < info->bytes)
59 p = &(*p)->rb_left;
60 else
61 p = &(*p)->rb_right;
62 }
63
64 rb_link_node(node, parent, p);
65 rb_insert_color(node, root);
66
67 return 0;
68}
69
70/*
71 * searches the tree for the given offset. If contains is set we will return
72 * the free space that contains the given offset. If contains is not set we
73 * will return the free space that starts at or after the given offset and is
74 * at least bytes long.
75 */
76static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
77 u64 offset, u64 bytes,
78 int contains)
79{
80 struct rb_node *n = root->rb_node;
81 struct btrfs_free_space *entry, *ret = NULL;
82
83 while (n) {
84 entry = rb_entry(n, struct btrfs_free_space, offset_index);
85
86 if (offset < entry->offset) {
87 if (!contains &&
88 (!ret || entry->offset < ret->offset) &&
89 (bytes <= entry->bytes))
90 ret = entry;
91 n = n->rb_left;
92 } else if (offset > entry->offset) {
93 if ((entry->offset + entry->bytes - 1) >= offset &&
94 bytes <= entry->bytes) {
95 ret = entry;
96 break;
97 }
98 n = n->rb_right;
99 } else {
100 if (bytes > entry->bytes) {
101 n = n->rb_right;
102 continue;
103 }
104 ret = entry;
105 break;
106 }
107 }
108
109 return ret;
110}
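
[editor's note] To make the two lookup modes concrete, here is a standalone user-space sketch (illustrative only, not kernel code) that applies the same selection rules to a plain sorted array instead of an rbtree: with contains set, return the entry whose range covers the offset; otherwise return the first entry at or after the offset with at least bytes of room.

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/* simplified stand-in for struct btrfs_free_space */
struct free_space { uint64_t offset; uint64_t bytes; };

/* linear-scan equivalent of tree_search_offset(); the kernel walks an
 * rbtree keyed by offset, but the selection rules are the same */
static struct free_space *search_offset(struct free_space *e, size_t n,
					uint64_t offset, uint64_t bytes,
					int contains)
{
	for (size_t i = 0; i < n; i++) {
		if (contains) {
			if (e[i].offset <= offset &&
			    offset < e[i].offset + e[i].bytes)
				return &e[i];
		} else {
			if (e[i].offset >= offset && e[i].bytes >= bytes)
				return &e[i];
		}
	}
	return NULL;
}

int main(void)
{
	struct free_space cache[] = { {0, 4096}, {8192, 8192}, {32768, 4096} };

	/* contains=1: which hole covers byte 12000? -> the {8192,8192} entry */
	struct free_space *hit = search_offset(cache, 3, 12000, 0, 1);
	printf("covering hole at %llu\n", (unsigned long long)hit->offset);

	/* contains=0: first hole at/after 4096 with >= 8192 free -> {8192,8192} */
	hit = search_offset(cache, 3, 4096, 8192, 0);
	printf("first fit at %llu\n", (unsigned long long)hit->offset);
	return 0;
}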
111
112/*
113 * return a chunk of at least bytes in size, as close to offset as we can get.
114 */
115static struct btrfs_free_space *tree_search_bytes(struct rb_root *root,
116 u64 offset, u64 bytes)
117{
118 struct rb_node *n = root->rb_node;
119 struct btrfs_free_space *entry, *ret = NULL;
120
121 while (n) {
122 entry = rb_entry(n, struct btrfs_free_space, bytes_index);
123
124 if (bytes < entry->bytes) {
125 /*
126			 * We prefer to get a hole as close to the size we
127			 * are asking for as we can, so we don't take small slivers out of
128 * huge holes, but we also want to get as close to the
129 * offset as possible so we don't have a whole lot of
130 * fragmentation.
131 */
132 if (offset <= entry->offset) {
133 if (!ret)
134 ret = entry;
135 else if (entry->bytes < ret->bytes)
136 ret = entry;
137 else if (entry->offset < ret->offset)
138 ret = entry;
139 }
140 n = n->rb_left;
141 } else if (bytes > entry->bytes) {
142 n = n->rb_right;
143 } else {
144 /*
145			 * Ok, we may have multiple chunks of the wanted size,
146			 * so we don't want to take the first one we find; we
147			 * want to take the one closest to our given offset, so
148			 * keep searching just in case there's a better match.
149 */
150 n = n->rb_right;
151 if (offset > entry->offset)
152 continue;
153 else if (!ret || entry->offset < ret->offset)
154 ret = entry;
155 }
156 }
157
158 return ret;
159}
160
161static void unlink_free_space(struct btrfs_block_group_cache *block_group,
162 struct btrfs_free_space *info)
163{
164 rb_erase(&info->offset_index, &block_group->free_space_offset);
165 rb_erase(&info->bytes_index, &block_group->free_space_bytes);
166}
167
168static int link_free_space(struct btrfs_block_group_cache *block_group,
169 struct btrfs_free_space *info)
170{
171 int ret = 0;
172
173
174 ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
175 &info->offset_index);
176 if (ret)
177 return ret;
178
179 ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes,
180 &info->bytes_index);
181 if (ret)
182 return ret;
183
184 return ret;
185}
186
187static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
188 u64 offset, u64 bytes)
189{
190 struct btrfs_free_space *right_info;
191 struct btrfs_free_space *left_info;
192 struct btrfs_free_space *info = NULL;
193 struct btrfs_free_space *alloc_info;
194 int ret = 0;
195
196 alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
197 if (!alloc_info)
198 return -ENOMEM;
199
200 /*
201 * first we want to see if there is free space adjacent to the range we
202	 * are adding; if there is, remove that struct and add a new one to
203 * cover the entire range
204 */
205 right_info = tree_search_offset(&block_group->free_space_offset,
206 offset+bytes, 0, 1);
207 left_info = tree_search_offset(&block_group->free_space_offset,
208 offset-1, 0, 1);
209
210 if (right_info && right_info->offset == offset+bytes) {
211 unlink_free_space(block_group, right_info);
212 info = right_info;
213 info->offset = offset;
214 info->bytes += bytes;
215 } else if (right_info && right_info->offset != offset+bytes) {
216 printk(KERN_ERR "btrfs adding space in the middle of an "
217 "existing free space area. existing: "
218 "offset=%llu, bytes=%llu. new: offset=%llu, "
219 "bytes=%llu\n", (unsigned long long)right_info->offset,
220 (unsigned long long)right_info->bytes,
221 (unsigned long long)offset,
222 (unsigned long long)bytes);
223 BUG();
224 }
225
226 if (left_info) {
227 unlink_free_space(block_group, left_info);
228
229 if (unlikely((left_info->offset + left_info->bytes) !=
230 offset)) {
231 printk(KERN_ERR "btrfs free space to the left "
232 "of new free space isn't "
233 "quite right. existing: offset=%llu, "
234 "bytes=%llu. new: offset=%llu, bytes=%llu\n",
235 (unsigned long long)left_info->offset,
236 (unsigned long long)left_info->bytes,
237 (unsigned long long)offset,
238 (unsigned long long)bytes);
239 BUG();
240 }
241
242 if (info) {
243 info->offset = left_info->offset;
244 info->bytes += left_info->bytes;
245 kfree(left_info);
246 } else {
247 info = left_info;
248 info->bytes += bytes;
249 }
250 }
251
252 if (info) {
253 ret = link_free_space(block_group, info);
254 if (!ret)
255 info = NULL;
256 goto out;
257 }
258
259 info = alloc_info;
260 alloc_info = NULL;
261 info->offset = offset;
262 info->bytes = bytes;
263
264 ret = link_free_space(block_group, info);
265 if (ret)
266 kfree(info);
267out:
268 if (ret) {
269		printk(KERN_ERR "btrfs: unable to add free space: %d\n", ret);
270 if (ret == -EEXIST)
271 BUG();
272 }
273
274 kfree(alloc_info);
275
276 return ret;
277}
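
[editor's note] The coalescing above can be pictured with concrete numbers: freeing [8192, 12288) when holes [4096, 8192) and [12288, 16384) already exist should leave a single [4096, 16384) entry. A toy user-space model of just the merge rules (not kernel code):

#include <stdio.h>
#include <stdint.h>

/* toy model of the coalescing in __btrfs_add_free_space(): a new free
 * range absorbs an adjacent hole on its right (one starting exactly at
 * offset+bytes) and on its left (one ending exactly at offset) */
struct hole { uint64_t offset; uint64_t bytes; };

static struct hole merge(struct hole left, struct hole added, struct hole right)
{
	/* right neighbour starts where the new range ends */
	if (right.offset == added.offset + added.bytes)
		added.bytes += right.bytes;
	/* left neighbour ends where the new range starts */
	if (left.offset + left.bytes == added.offset) {
		added.offset = left.offset;
		added.bytes += left.bytes;
	}
	return added;
}

int main(void)
{
	struct hole left = {4096, 4096}, right = {12288, 4096};
	struct hole freed = {8192, 4096};
	struct hole h = merge(left, freed, right);

	/* prints: merged hole: offset=4096 bytes=12288 */
	printf("merged hole: offset=%llu bytes=%llu\n",
	       (unsigned long long)h.offset, (unsigned long long)h.bytes);
	return 0;
}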
278
279static int
280__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
281 u64 offset, u64 bytes)
282{
283 struct btrfs_free_space *info;
284 int ret = 0;
285
286 info = tree_search_offset(&block_group->free_space_offset, offset, 0,
287 1);
288
289 if (info && info->offset == offset) {
290 if (info->bytes < bytes) {
291			printk(KERN_ERR "Found free space at %llu, size %llu, "
292 "trying to use %llu\n",
293 (unsigned long long)info->offset,
294 (unsigned long long)info->bytes,
295 (unsigned long long)bytes);
296 WARN_ON(1);
297 ret = -EINVAL;
298 goto out;
299 }
300 unlink_free_space(block_group, info);
301
302 if (info->bytes == bytes) {
303 kfree(info);
304 goto out;
305 }
306
307 info->offset += bytes;
308 info->bytes -= bytes;
309
310 ret = link_free_space(block_group, info);
311 BUG_ON(ret);
312 } else if (info && info->offset < offset &&
313 info->offset + info->bytes >= offset + bytes) {
314 u64 old_start = info->offset;
315 /*
316 * we're freeing space in the middle of the info,
317 * this can happen during tree log replay
318 *
319 * first unlink the old info and then
320 * insert it again after the hole we're creating
321 */
322 unlink_free_space(block_group, info);
323 if (offset + bytes < info->offset + info->bytes) {
324 u64 old_end = info->offset + info->bytes;
325
326 info->offset = offset + bytes;
327 info->bytes = old_end - info->offset;
328 ret = link_free_space(block_group, info);
329 BUG_ON(ret);
330 } else {
331 /* the hole we're creating ends at the end
332 * of the info struct, just free the info
333 */
334 kfree(info);
335 }
336
337 /* step two, insert a new info struct to cover anything
338 * before the hole
339 */
340 ret = __btrfs_add_free_space(block_group, old_start,
341 offset - old_start);
342 BUG_ON(ret);
343 } else {
344 WARN_ON(1);
345 }
346out:
347 return ret;
348}
349
350int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
351 u64 offset, u64 bytes)
352{
353 int ret;
354 struct btrfs_free_space *sp;
355
356 mutex_lock(&block_group->alloc_mutex);
357 ret = __btrfs_add_free_space(block_group, offset, bytes);
358 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
359 BUG_ON(!sp);
360 mutex_unlock(&block_group->alloc_mutex);
361
362 return ret;
363}
364
365int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
366 u64 offset, u64 bytes)
367{
368 int ret;
369 struct btrfs_free_space *sp;
370
371 ret = __btrfs_add_free_space(block_group, offset, bytes);
372 sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
373 BUG_ON(!sp);
374
375 return ret;
376}
377
378int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
379 u64 offset, u64 bytes)
380{
381 int ret = 0;
382
383 mutex_lock(&block_group->alloc_mutex);
384 ret = __btrfs_remove_free_space(block_group, offset, bytes);
385 mutex_unlock(&block_group->alloc_mutex);
386
387 return ret;
388}
389
390int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
391 u64 offset, u64 bytes)
392{
393 int ret;
394
395 ret = __btrfs_remove_free_space(block_group, offset, bytes);
396
397 return ret;
398}
399
400void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
401 u64 bytes)
402{
403 struct btrfs_free_space *info;
404 struct rb_node *n;
405 int count = 0;
406
407 for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) {
408 info = rb_entry(n, struct btrfs_free_space, offset_index);
409 if (info->bytes >= bytes)
410 count++;
411 }
412	printk(KERN_INFO "%d blocks of free space at or bigger than %llu bytes\n",
413	       count, (unsigned long long)bytes);
414}
415
416u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
417{
418 struct btrfs_free_space *info;
419 struct rb_node *n;
420 u64 ret = 0;
421
422 for (n = rb_first(&block_group->free_space_offset); n;
423 n = rb_next(n)) {
424 info = rb_entry(n, struct btrfs_free_space, offset_index);
425 ret += info->bytes;
426 }
427
428 return ret;
429}
430
431void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
432{
433 struct btrfs_free_space *info;
434 struct rb_node *node;
435
436 mutex_lock(&block_group->alloc_mutex);
437 while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
438 info = rb_entry(node, struct btrfs_free_space, bytes_index);
439 unlink_free_space(block_group, info);
440 kfree(info);
441 if (need_resched()) {
442 mutex_unlock(&block_group->alloc_mutex);
443 cond_resched();
444 mutex_lock(&block_group->alloc_mutex);
445 }
446 }
447 mutex_unlock(&block_group->alloc_mutex);
448}
449
450#if 0
451static struct btrfs_free_space *btrfs_find_free_space_offset(struct
452 btrfs_block_group_cache
453 *block_group, u64 offset,
454 u64 bytes)
455{
456 struct btrfs_free_space *ret;
457
458 mutex_lock(&block_group->alloc_mutex);
459 ret = tree_search_offset(&block_group->free_space_offset, offset,
460 bytes, 0);
461 mutex_unlock(&block_group->alloc_mutex);
462
463 return ret;
464}
465
466static struct btrfs_free_space *btrfs_find_free_space_bytes(struct
467 btrfs_block_group_cache
468 *block_group, u64 offset,
469 u64 bytes)
470{
471 struct btrfs_free_space *ret;
472
473 mutex_lock(&block_group->alloc_mutex);
474
475 ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes);
476 mutex_unlock(&block_group->alloc_mutex);
477
478 return ret;
479}
480#endif
481
482struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
483 *block_group, u64 offset,
484 u64 bytes)
485{
486 struct btrfs_free_space *ret = NULL;
487
488 ret = tree_search_offset(&block_group->free_space_offset, offset,
489 bytes, 0);
490 if (!ret)
491 ret = tree_search_bytes(&block_group->free_space_bytes,
492 offset, bytes);
493
494 return ret;
495}
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
new file mode 100644
index 000000000000..2a020b276768
--- /dev/null
+++ b/fs/btrfs/hash.h
@@ -0,0 +1,27 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __HASH__
20#define __HASH__
21
22#include "crc32c.h"
23static inline u64 btrfs_name_hash(const char *name, int len)
24{
25 return btrfs_crc32c((u32)~1, name, len);
26}
27#endif
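
[editor's note] btrfs_name_hash() is plain CRC32C over the name, seeded with (u32)~1; directory items are keyed by this value. The sketch below reproduces it in user space with a bitwise CRC32C (Castagnoli) routine; the kernel's btrfs_crc32c() wraps the shared crc32c library (possibly hardware accelerated) but computes the same function.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* bitwise CRC32C (Castagnoli, reflected polynomial 0x82F63B78) */
static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return crc;
}

/* mirrors btrfs_name_hash(): CRC32C of the name, seeded with (u32)~1 */
static uint64_t name_hash(const char *name, int len)
{
	return crc32c((uint32_t)~1, name, len);
}

int main(void)
{
	const char *name = "hello.txt";

	printf("dir item hash for %s: %llu\n", name,
	       (unsigned long long)name_hash(name, strlen(name)));
	return 0;
}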
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
new file mode 100644
index 000000000000..3d46fa1f29a4
--- /dev/null
+++ b/fs/btrfs/inode-item.c
@@ -0,0 +1,206 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "transaction.h"
22
23static int find_name_in_backref(struct btrfs_path *path, const char *name,
24 int name_len, struct btrfs_inode_ref **ref_ret)
25{
26 struct extent_buffer *leaf;
27 struct btrfs_inode_ref *ref;
28 unsigned long ptr;
29 unsigned long name_ptr;
30 u32 item_size;
31 u32 cur_offset = 0;
32 int len;
33
34 leaf = path->nodes[0];
35 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
36 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
37 while (cur_offset < item_size) {
38 ref = (struct btrfs_inode_ref *)(ptr + cur_offset);
39 len = btrfs_inode_ref_name_len(leaf, ref);
40 name_ptr = (unsigned long)(ref + 1);
41 cur_offset += len + sizeof(*ref);
42 if (len != name_len)
43 continue;
44 if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) {
45 *ref_ret = ref;
46 return 1;
47 }
48 }
49 return 0;
50}
51
52int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
53 struct btrfs_root *root,
54 const char *name, int name_len,
55 u64 inode_objectid, u64 ref_objectid, u64 *index)
56{
57 struct btrfs_path *path;
58 struct btrfs_key key;
59 struct btrfs_inode_ref *ref;
60 struct extent_buffer *leaf;
61 unsigned long ptr;
62 unsigned long item_start;
63 u32 item_size;
64 u32 sub_item_len;
65 int ret;
66 int del_len = name_len + sizeof(*ref);
67
68 key.objectid = inode_objectid;
69 key.offset = ref_objectid;
70 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
71
72 path = btrfs_alloc_path();
73 if (!path)
74 return -ENOMEM;
75
76 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
77 if (ret > 0) {
78 ret = -ENOENT;
79 goto out;
80 } else if (ret < 0) {
81 goto out;
82 }
83 if (!find_name_in_backref(path, name, name_len, &ref)) {
84 ret = -ENOENT;
85 goto out;
86 }
87 leaf = path->nodes[0];
88 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
89
90 if (index)
91 *index = btrfs_inode_ref_index(leaf, ref);
92
93 if (del_len == item_size) {
94 ret = btrfs_del_item(trans, root, path);
95 goto out;
96 }
97 ptr = (unsigned long)ref;
98 sub_item_len = name_len + sizeof(*ref);
99 item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
100 memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
101 item_size - (ptr + sub_item_len - item_start));
102 ret = btrfs_truncate_item(trans, root, path,
103 item_size - sub_item_len, 1);
104 BUG_ON(ret);
105out:
106 btrfs_free_path(path);
107 return ret;
108}
109
110int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
111 struct btrfs_root *root,
112 const char *name, int name_len,
113 u64 inode_objectid, u64 ref_objectid, u64 index)
114{
115 struct btrfs_path *path;
116 struct btrfs_key key;
117 struct btrfs_inode_ref *ref;
118 unsigned long ptr;
119 int ret;
120 int ins_len = name_len + sizeof(*ref);
121
122 key.objectid = inode_objectid;
123 key.offset = ref_objectid;
124 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
125
126 path = btrfs_alloc_path();
127 if (!path)
128 return -ENOMEM;
129
130 ret = btrfs_insert_empty_item(trans, root, path, &key,
131 ins_len);
132 if (ret == -EEXIST) {
133 u32 old_size;
134
135 if (find_name_in_backref(path, name, name_len, &ref))
136 goto out;
137
138 old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
139 ret = btrfs_extend_item(trans, root, path, ins_len);
140 BUG_ON(ret);
141 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
142 struct btrfs_inode_ref);
143 ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
144 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
145 btrfs_set_inode_ref_index(path->nodes[0], ref, index);
146 ptr = (unsigned long)(ref + 1);
147 ret = 0;
148 } else if (ret < 0) {
149 goto out;
150 } else {
151 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
152 struct btrfs_inode_ref);
153 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
154 btrfs_set_inode_ref_index(path->nodes[0], ref, index);
155 ptr = (unsigned long)(ref + 1);
156 }
157 write_extent_buffer(path->nodes[0], name, ptr, name_len);
158 btrfs_mark_buffer_dirty(path->nodes[0]);
159
160out:
161 btrfs_free_path(path);
162 return ret;
163}
164
165int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
166 struct btrfs_root *root,
167 struct btrfs_path *path, u64 objectid)
168{
169 struct btrfs_key key;
170 int ret;
171 key.objectid = objectid;
172 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
173 key.offset = 0;
174
175 ret = btrfs_insert_empty_item(trans, root, path, &key,
176 sizeof(struct btrfs_inode_item));
177 if (ret == 0 && objectid > root->highest_inode)
178 root->highest_inode = objectid;
179 return ret;
180}
181
182int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
183 *root, struct btrfs_path *path,
184 struct btrfs_key *location, int mod)
185{
186 int ins_len = mod < 0 ? -1 : 0;
187 int cow = mod != 0;
188 int ret;
189 int slot;
190 struct extent_buffer *leaf;
191 struct btrfs_key found_key;
192
193 ret = btrfs_search_slot(trans, root, location, path, ins_len, cow);
194 if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY &&
195 location->offset == (u64)-1 && path->slots[0] != 0) {
196 slot = path->slots[0] - 1;
197 leaf = path->nodes[0];
198 btrfs_item_key_to_cpu(leaf, &found_key, slot);
199 if (found_key.objectid == location->objectid &&
200 btrfs_key_type(&found_key) == btrfs_key_type(location)) {
201 path->slots[0]--;
202 return 0;
203 }
204 }
205 return ret;
206}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
new file mode 100644
index 000000000000..2aa79873eb46
--- /dev/null
+++ b/fs/btrfs/inode-map.c
@@ -0,0 +1,144 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "transaction.h"
22
23int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
24{
25 struct btrfs_path *path;
26 int ret;
27 struct extent_buffer *l;
28 struct btrfs_key search_key;
29 struct btrfs_key found_key;
30 int slot;
31
32 path = btrfs_alloc_path();
33 BUG_ON(!path);
34
35 search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
36 search_key.type = -1;
37 search_key.offset = (u64)-1;
38 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
39 if (ret < 0)
40 goto error;
41 BUG_ON(ret == 0);
42 if (path->slots[0] > 0) {
43 slot = path->slots[0] - 1;
44 l = path->nodes[0];
45 btrfs_item_key_to_cpu(l, &found_key, slot);
46 *objectid = found_key.objectid;
47 } else {
48 *objectid = BTRFS_FIRST_FREE_OBJECTID;
49 }
50 ret = 0;
51error:
52 btrfs_free_path(path);
53 return ret;
54}
55
56/*
57 * walks the btree of allocated inodes and finds a hole.
58 */
59int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
60 struct btrfs_root *root,
61 u64 dirid, u64 *objectid)
62{
63 struct btrfs_path *path;
64 struct btrfs_key key;
65 int ret;
66 int slot = 0;
67 u64 last_ino = 0;
68 int start_found;
69 struct extent_buffer *l;
70 struct btrfs_key search_key;
71 u64 search_start = dirid;
72
73 mutex_lock(&root->objectid_mutex);
74 if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID &&
75 root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) {
76 *objectid = ++root->last_inode_alloc;
77 mutex_unlock(&root->objectid_mutex);
78 return 0;
79 }
80 path = btrfs_alloc_path();
81 BUG_ON(!path);
82 search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID);
83 search_key.objectid = search_start;
84 search_key.type = 0;
85 search_key.offset = 0;
86
87 btrfs_init_path(path);
88 start_found = 0;
89 ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
90 if (ret < 0)
91 goto error;
92
93 while (1) {
94 l = path->nodes[0];
95 slot = path->slots[0];
96 if (slot >= btrfs_header_nritems(l)) {
97 ret = btrfs_next_leaf(root, path);
98 if (ret == 0)
99 continue;
100 if (ret < 0)
101 goto error;
102 if (!start_found) {
103 *objectid = search_start;
104 start_found = 1;
105 goto found;
106 }
107 *objectid = last_ino > search_start ?
108 last_ino : search_start;
109 goto found;
110 }
111 btrfs_item_key_to_cpu(l, &key, slot);
112 if (key.objectid >= search_start) {
113 if (start_found) {
114 if (last_ino < search_start)
115 last_ino = search_start;
116 if (key.objectid > last_ino) {
117 *objectid = last_ino;
118 goto found;
119 }
120 } else if (key.objectid > search_start) {
121 *objectid = search_start;
122 goto found;
123 }
124 }
125 if (key.objectid >= BTRFS_LAST_FREE_OBJECTID)
126 break;
127
128 start_found = 1;
129 last_ino = key.objectid + 1;
130 path->slots[0]++;
131 }
132 BUG_ON(1);
133found:
134 btrfs_release_path(root, path);
135 btrfs_free_path(path);
136 BUG_ON(*objectid < search_start);
137 mutex_unlock(&root->objectid_mutex);
138 return 0;
139error:
140 btrfs_release_path(root, path);
141 btrfs_free_path(path);
142 mutex_unlock(&root->objectid_mutex);
143 return ret;
144}
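
[editor's note] Stripped of the btree mechanics, the hole search above reduces to scanning the allocated objectids in ascending order and returning the first gap at or after the starting point. A standalone sketch (user-space, illustrative values):

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/* toy version of the hole search in btrfs_find_free_objectid(): given
 * the allocated objectids in sorted order (as a btree walk would yield
 * them), return the first unused id >= search_start */
static uint64_t find_free_id(const uint64_t *used, size_t n,
			     uint64_t search_start)
{
	uint64_t next = search_start;

	for (size_t i = 0; i < n; i++) {
		if (used[i] < next)
			continue;	/* before the window, skip */
		if (used[i] > next)
			return next;	/* found a hole */
		next = used[i] + 1;	/* id taken, try the following one */
	}
	return next;			/* no hole inside, append at the end */
}

int main(void)
{
	uint64_t used[] = {256, 257, 258, 260, 261};

	/* 256..258 taken, 259 free -> prints 259 */
	printf("free objectid: %llu\n",
	       (unsigned long long)find_free_id(used, 5, 256));
	return 0;
}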
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
new file mode 100644
index 000000000000..8adfe059ab41
--- /dev/null
+++ b/fs/btrfs/inode.c
@@ -0,0 +1,5035 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/buffer_head.h>
22#include <linux/file.h>
23#include <linux/fs.h>
24#include <linux/pagemap.h>
25#include <linux/highmem.h>
26#include <linux/time.h>
27#include <linux/init.h>
28#include <linux/string.h>
29#include <linux/smp_lock.h>
30#include <linux/backing-dev.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/statfs.h>
35#include <linux/compat.h>
36#include <linux/bit_spinlock.h>
37#include <linux/version.h>
38#include <linux/xattr.h>
39#include <linux/posix_acl.h>
40#include <linux/falloc.h>
41#include "compat.h"
42#include "ctree.h"
43#include "disk-io.h"
44#include "transaction.h"
45#include "btrfs_inode.h"
46#include "ioctl.h"
47#include "print-tree.h"
48#include "volumes.h"
49#include "ordered-data.h"
50#include "xattr.h"
51#include "tree-log.h"
52#include "ref-cache.h"
53#include "compression.h"
54
55struct btrfs_iget_args {
56 u64 ino;
57 struct btrfs_root *root;
58};
59
60static struct inode_operations btrfs_dir_inode_operations;
61static struct inode_operations btrfs_symlink_inode_operations;
62static struct inode_operations btrfs_dir_ro_inode_operations;
63static struct inode_operations btrfs_special_inode_operations;
64static struct inode_operations btrfs_file_inode_operations;
65static struct address_space_operations btrfs_aops;
66static struct address_space_operations btrfs_symlink_aops;
67static struct file_operations btrfs_dir_file_operations;
68static struct extent_io_ops btrfs_extent_io_ops;
69
70static struct kmem_cache *btrfs_inode_cachep;
71struct kmem_cache *btrfs_trans_handle_cachep;
72struct kmem_cache *btrfs_transaction_cachep;
73struct kmem_cache *btrfs_bit_radix_cachep;
74struct kmem_cache *btrfs_path_cachep;
75
76#define S_SHIFT 12
77static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
78 [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE,
79 [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR,
80 [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV,
81 [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV,
82 [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO,
83 [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK,
84 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
85};
86
87static void btrfs_truncate(struct inode *inode);
88static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
89static noinline int cow_file_range(struct inode *inode,
90 struct page *locked_page,
91 u64 start, u64 end, int *page_started,
92 unsigned long *nr_written, int unlock);
93
94/*
95 * a very lame attempt at stopping writes when the FS is 85% full. There
96 * are countless ways this is incorrect, but it is better than nothing.
97 */
98int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
99 int for_del)
100{
101 u64 total;
102 u64 used;
103 u64 thresh;
104 int ret = 0;
105
106 spin_lock(&root->fs_info->delalloc_lock);
107 total = btrfs_super_total_bytes(&root->fs_info->super_copy);
108 used = btrfs_super_bytes_used(&root->fs_info->super_copy);
109 if (for_del)
110 thresh = total * 90;
111 else
112 thresh = total * 85;
113
114 do_div(thresh, 100);
115
116 if (used + root->fs_info->delalloc_bytes + num_required > thresh)
117 ret = -ENOSPC;
118 spin_unlock(&root->fs_info->delalloc_lock);
119 return ret;
120}
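
[editor's note] With concrete numbers: on a 100 GiB filesystem with 84 GiB used and 2 GiB of delalloc in flight, a 1 GiB write is refused (87 GiB exceeds the 85% threshold), while the same reservation for a delete still passes the 90% threshold. A user-space restatement of the same arithmetic (illustrative only, not kernel code):

#include <stdio.h>
#include <stdint.h>

/* the same threshold arithmetic as btrfs_check_free_space(): writes are
 * refused once used + in-flight delalloc would cross 85% of the
 * filesystem (90% for deletes, which must be allowed to make space) */
static int check_free_space(uint64_t total, uint64_t used,
			    uint64_t delalloc, uint64_t required, int for_del)
{
	uint64_t thresh = total * (for_del ? 90 : 85) / 100;

	return (used + delalloc + required > thresh) ? -1 /* -ENOSPC */ : 0;
}

int main(void)
{
	uint64_t gib = 1024ULL * 1024 * 1024;

	/* 100 GiB fs, 84 GiB used, 2 GiB queued delalloc: a 1 GiB write
	 * would push usage to 87 GiB > the 85 GiB threshold -> refused */
	printf("write: %d\n",
	       check_free_space(100 * gib, 84 * gib, 2 * gib, 1 * gib, 0));
	/* the same state still passes the 90% delete threshold */
	printf("delete: %d\n",
	       check_free_space(100 * gib, 84 * gib, 2 * gib, 1 * gib, 1));
	return 0;
}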
121
122/*
123 * this does all the hard work for inserting an inline extent into
124 * the btree. The caller should have done a btrfs_drop_extents so that
125 * no overlapping inline items exist in the btree
126 */
127static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
128 struct btrfs_root *root, struct inode *inode,
129 u64 start, size_t size, size_t compressed_size,
130 struct page **compressed_pages)
131{
132 struct btrfs_key key;
133 struct btrfs_path *path;
134 struct extent_buffer *leaf;
135 struct page *page = NULL;
136 char *kaddr;
137 unsigned long ptr;
138 struct btrfs_file_extent_item *ei;
139 int err = 0;
140 int ret;
141 size_t cur_size = size;
142 size_t datasize;
143 unsigned long offset;
144 int use_compress = 0;
145
146 if (compressed_size && compressed_pages) {
147 use_compress = 1;
148 cur_size = compressed_size;
149 }
150
151 path = btrfs_alloc_path();
152 if (!path)
153 return -ENOMEM;
154
155 btrfs_set_trans_block_group(trans, inode);
156
157 key.objectid = inode->i_ino;
158 key.offset = start;
159 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
160 datasize = btrfs_file_extent_calc_inline_size(cur_size);
161
162 inode_add_bytes(inode, size);
163 ret = btrfs_insert_empty_item(trans, root, path, &key,
164 datasize);
165	if (ret) {
166		err = ret;
167		goto fail;
168	}
169
170 leaf = path->nodes[0];
171 ei = btrfs_item_ptr(leaf, path->slots[0],
172 struct btrfs_file_extent_item);
173 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
174 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
175 btrfs_set_file_extent_encryption(leaf, ei, 0);
176 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
177 btrfs_set_file_extent_ram_bytes(leaf, ei, size);
178 ptr = btrfs_file_extent_inline_start(ei);
179
180 if (use_compress) {
181 struct page *cpage;
182 int i = 0;
183 while (compressed_size > 0) {
184 cpage = compressed_pages[i];
185 cur_size = min_t(unsigned long, compressed_size,
186 PAGE_CACHE_SIZE);
187
188 kaddr = kmap(cpage);
189 write_extent_buffer(leaf, kaddr, ptr, cur_size);
190 kunmap(cpage);
191
192 i++;
193 ptr += cur_size;
194 compressed_size -= cur_size;
195 }
196 btrfs_set_file_extent_compression(leaf, ei,
197 BTRFS_COMPRESS_ZLIB);
198 } else {
199 page = find_get_page(inode->i_mapping,
200 start >> PAGE_CACHE_SHIFT);
201 btrfs_set_file_extent_compression(leaf, ei, 0);
202 kaddr = kmap_atomic(page, KM_USER0);
203 offset = start & (PAGE_CACHE_SIZE - 1);
204 write_extent_buffer(leaf, kaddr + offset, ptr, size);
205 kunmap_atomic(kaddr, KM_USER0);
206 page_cache_release(page);
207 }
208 btrfs_mark_buffer_dirty(leaf);
209 btrfs_free_path(path);
210
211 BTRFS_I(inode)->disk_i_size = inode->i_size;
212 btrfs_update_inode(trans, root, inode);
213 return 0;
214fail:
215 btrfs_free_path(path);
216 return err;
217}
218
219
220/*
221 * conditionally insert an inline extent into the file. This
222 * does the checks required to make sure the data is small enough
223 * to fit as an inline extent.
224 */
225static int cow_file_range_inline(struct btrfs_trans_handle *trans,
226 struct btrfs_root *root,
227 struct inode *inode, u64 start, u64 end,
228 size_t compressed_size,
229 struct page **compressed_pages)
230{
231 u64 isize = i_size_read(inode);
232 u64 actual_end = min(end + 1, isize);
233 u64 inline_len = actual_end - start;
234 u64 aligned_end = (end + root->sectorsize - 1) &
235 ~((u64)root->sectorsize - 1);
236 u64 hint_byte;
237 u64 data_len = inline_len;
238 int ret;
239
240 if (compressed_size)
241 data_len = compressed_size;
242
243 if (start > 0 ||
244 actual_end >= PAGE_CACHE_SIZE ||
245 data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
246 (!compressed_size &&
247 (actual_end & (root->sectorsize - 1)) == 0) ||
248 end + 1 < isize ||
249 data_len > root->fs_info->max_inline) {
250 return 1;
251 }
252
253 ret = btrfs_drop_extents(trans, root, inode, start,
254 aligned_end, start, &hint_byte);
255 BUG_ON(ret);
256
257 if (isize > actual_end)
258 inline_len = min_t(u64, isize, actual_end);
259 ret = insert_inline_extent(trans, root, inode, start,
260 inline_len, compressed_size,
261 compressed_pages);
262 BUG_ON(ret);
263 btrfs_drop_extent_cache(inode, start, aligned_end, 0);
264 return 0;
265}
266
267struct async_extent {
268 u64 start;
269 u64 ram_size;
270 u64 compressed_size;
271 struct page **pages;
272 unsigned long nr_pages;
273 struct list_head list;
274};
275
276struct async_cow {
277 struct inode *inode;
278 struct btrfs_root *root;
279 struct page *locked_page;
280 u64 start;
281 u64 end;
282 struct list_head extents;
283 struct btrfs_work work;
284};
285
286static noinline int add_async_extent(struct async_cow *cow,
287 u64 start, u64 ram_size,
288 u64 compressed_size,
289 struct page **pages,
290 unsigned long nr_pages)
291{
292 struct async_extent *async_extent;
293
294 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
295 async_extent->start = start;
296 async_extent->ram_size = ram_size;
297 async_extent->compressed_size = compressed_size;
298 async_extent->pages = pages;
299 async_extent->nr_pages = nr_pages;
300 list_add_tail(&async_extent->list, &cow->extents);
301 return 0;
302}
303
304/*
305 * we create compressed extents in two phases. The first
306 * phase compresses a range of pages that have already been
307 * locked (both pages and state bits are locked).
308 *
309 * This is done inside an ordered work queue, and the compression
310 * is spread across many cpus. The actual IO submission is step
311 * two, and the ordered work queue takes care of making sure that
312 * happens in the same order things were put onto the queue by
313 * writepages and friends.
314 *
315 * If this code finds it can't get good compression, it puts an
316 * entry onto the work queue to write the uncompressed bytes. This
317 * makes sure that both compressed inodes and uncompressed inodes
318 * are written in the same order that pdflush sent them down.
319 */
320static noinline int compress_file_range(struct inode *inode,
321 struct page *locked_page,
322 u64 start, u64 end,
323 struct async_cow *async_cow,
324 int *num_added)
325{
326 struct btrfs_root *root = BTRFS_I(inode)->root;
327 struct btrfs_trans_handle *trans;
328 u64 num_bytes;
329 u64 orig_start;
330 u64 disk_num_bytes;
331 u64 blocksize = root->sectorsize;
332 u64 actual_end;
333 u64 isize = i_size_read(inode);
334 int ret = 0;
335 struct page **pages = NULL;
336 unsigned long nr_pages;
337 unsigned long nr_pages_ret = 0;
338 unsigned long total_compressed = 0;
339 unsigned long total_in = 0;
340 unsigned long max_compressed = 128 * 1024;
341 unsigned long max_uncompressed = 128 * 1024;
342 int i;
343 int will_compress;
344
345 orig_start = start;
346
347 actual_end = min_t(u64, isize, end + 1);
348again:
349 will_compress = 0;
350 nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
351 nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
352
353 total_compressed = actual_end - start;
354
355 /* we want to make sure that amount of ram required to uncompress
356 * an extent is reasonable, so we limit the total size in ram
357 * of a compressed extent to 128k. This is a crucial number
358 * because it also controls how easily we can spread reads across
359 * cpus for decompression.
360 *
361 * We also want to make sure the amount of IO required to do
362 * a random read is reasonably small, so we limit the size of
363 * a compressed extent to 128k.
364 */
365 total_compressed = min(total_compressed, max_uncompressed);
366 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
367 num_bytes = max(blocksize, num_bytes);
368 disk_num_bytes = num_bytes;
369 total_in = 0;
370 ret = 0;
371
372 /*
373 * we do compression for mount -o compress and when the
374 * inode has not been flagged as nocompress. This flag can
375 * change at any time if we discover bad compression ratios.
376 */
377 if (!btrfs_test_flag(inode, NOCOMPRESS) &&
378 btrfs_test_opt(root, COMPRESS)) {
379 WARN_ON(pages);
380 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
381
382 ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
383 total_compressed, pages,
384 nr_pages, &nr_pages_ret,
385 &total_in,
386 &total_compressed,
387 max_compressed);
388
389 if (!ret) {
390 unsigned long offset = total_compressed &
391 (PAGE_CACHE_SIZE - 1);
392 struct page *page = pages[nr_pages_ret - 1];
393 char *kaddr;
394
395 /* zero the tail end of the last page, we might be
396 * sending it down to disk
397 */
398 if (offset) {
399 kaddr = kmap_atomic(page, KM_USER0);
400 memset(kaddr + offset, 0,
401 PAGE_CACHE_SIZE - offset);
402 kunmap_atomic(kaddr, KM_USER0);
403 }
404 will_compress = 1;
405 }
406 }
407 if (start == 0) {
408 trans = btrfs_join_transaction(root, 1);
409 BUG_ON(!trans);
410 btrfs_set_trans_block_group(trans, inode);
411
412		/* let's try to make an inline extent */
413 if (ret || total_in < (actual_end - start)) {
414 /* we didn't compress the entire range, try
415 * to make an uncompressed inline extent.
416 */
417 ret = cow_file_range_inline(trans, root, inode,
418 start, end, 0, NULL);
419 } else {
420 /* try making a compressed inline extent */
421 ret = cow_file_range_inline(trans, root, inode,
422 start, end,
423 total_compressed, pages);
424 }
425 btrfs_end_transaction(trans, root);
426 if (ret == 0) {
427 /*
428 * inline extent creation worked, we don't need
429 * to create any more async work items. Unlock
430 * and free up our temp pages.
431 */
432 extent_clear_unlock_delalloc(inode,
433 &BTRFS_I(inode)->io_tree,
434 start, end, NULL, 1, 0,
435 0, 1, 1, 1);
436 ret = 0;
437 goto free_pages_out;
438 }
439 }
440
441 if (will_compress) {
442 /*
443		 * we aren't doing an inline extent, so round the compressed size
444 * up to a block size boundary so the allocator does sane
445 * things
446 */
447 total_compressed = (total_compressed + blocksize - 1) &
448 ~(blocksize - 1);
449
450 /*
451 * one last check to make sure the compression is really a
452 * win, compare the page count read with the blocks on disk
453 */
454 total_in = (total_in + PAGE_CACHE_SIZE - 1) &
455 ~(PAGE_CACHE_SIZE - 1);
456 if (total_compressed >= total_in) {
457 will_compress = 0;
458 } else {
459 disk_num_bytes = total_compressed;
460 num_bytes = total_in;
461 }
462 }
463 if (!will_compress && pages) {
464 /*
465 * the compression code ran but failed to make things smaller,
466 * free any pages it allocated and our page pointer array
467 */
468 for (i = 0; i < nr_pages_ret; i++) {
469 WARN_ON(pages[i]->mapping);
470 page_cache_release(pages[i]);
471 }
472 kfree(pages);
473 pages = NULL;
474 total_compressed = 0;
475 nr_pages_ret = 0;
476
477 /* flag the file so we don't compress in the future */
478 btrfs_set_flag(inode, NOCOMPRESS);
479 }
480 if (will_compress) {
481 *num_added += 1;
482
483 /* the async work queues will take care of doing actual
484 * allocation on disk for these compressed pages,
485 * and will submit them to the elevator.
486 */
487 add_async_extent(async_cow, start, num_bytes,
488 total_compressed, pages, nr_pages_ret);
489
490 if (start + num_bytes < end && start + num_bytes < actual_end) {
491 start += num_bytes;
492 pages = NULL;
493 cond_resched();
494 goto again;
495 }
496 } else {
497 /*
498 * No compression, but we still need to write the pages in
499 * the file we've been given so far. redirty the locked
500 * page if it corresponds to our extent and set things up
501 * for the async work queue to run cow_file_range to do
502 * the normal delalloc dance
503 */
504 if (page_offset(locked_page) >= start &&
505 page_offset(locked_page) <= end) {
506 __set_page_dirty_nobuffers(locked_page);
507 /* unlocked later on in the async handlers */
508 }
509 add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
510 *num_added += 1;
511 }
512
513out:
514 return 0;
515
516free_pages_out:
517 for (i = 0; i < nr_pages_ret; i++) {
518 WARN_ON(pages[i]->mapping);
519 page_cache_release(pages[i]);
520 }
521 kfree(pages);
522
523 goto out;
524}
525
526/*
527 * phase two of compressed writeback. This is the ordered portion
528 * of the code, which only gets called in the order the work was
529 * queued. We walk all the async extents created by compress_file_range
530 * and send them down to the disk.
531 */
532static noinline int submit_compressed_extents(struct inode *inode,
533 struct async_cow *async_cow)
534{
535 struct async_extent *async_extent;
536 u64 alloc_hint = 0;
537 struct btrfs_trans_handle *trans;
538 struct btrfs_key ins;
539 struct extent_map *em;
540 struct btrfs_root *root = BTRFS_I(inode)->root;
541 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
542 struct extent_io_tree *io_tree;
543 int ret;
544
545 if (list_empty(&async_cow->extents))
546 return 0;
547
548 trans = btrfs_join_transaction(root, 1);
549
550 while (!list_empty(&async_cow->extents)) {
551 async_extent = list_entry(async_cow->extents.next,
552 struct async_extent, list);
553 list_del(&async_extent->list);
554
555 io_tree = &BTRFS_I(inode)->io_tree;
556
557 /* did the compression code fall back to uncompressed IO? */
558 if (!async_extent->pages) {
559 int page_started = 0;
560 unsigned long nr_written = 0;
561
562 lock_extent(io_tree, async_extent->start,
563 async_extent->start +
564 async_extent->ram_size - 1, GFP_NOFS);
565
566 /* allocate blocks */
567 cow_file_range(inode, async_cow->locked_page,
568 async_extent->start,
569 async_extent->start +
570 async_extent->ram_size - 1,
571 &page_started, &nr_written, 0);
572
573 /*
574 * if page_started, cow_file_range inserted an
575 * inline extent and took care of all the unlocking
576 * and IO for us. Otherwise, we need to submit
577 * all those pages down to the drive.
578 */
579 if (!page_started)
580 extent_write_locked_range(io_tree,
581 inode, async_extent->start,
582 async_extent->start +
583 async_extent->ram_size - 1,
584 btrfs_get_extent,
585 WB_SYNC_ALL);
586 kfree(async_extent);
587 cond_resched();
588 continue;
589 }
590
591 lock_extent(io_tree, async_extent->start,
592 async_extent->start + async_extent->ram_size - 1,
593 GFP_NOFS);
594 /*
595 * here we're doing allocation and writeback of the
596 * compressed pages
597 */
598 btrfs_drop_extent_cache(inode, async_extent->start,
599 async_extent->start +
600 async_extent->ram_size - 1, 0);
601
602 ret = btrfs_reserve_extent(trans, root,
603 async_extent->compressed_size,
604 async_extent->compressed_size,
605 0, alloc_hint,
606 (u64)-1, &ins, 1);
607 BUG_ON(ret);
608 em = alloc_extent_map(GFP_NOFS);
609 em->start = async_extent->start;
610 em->len = async_extent->ram_size;
611 em->orig_start = em->start;
612
613 em->block_start = ins.objectid;
614 em->block_len = ins.offset;
615 em->bdev = root->fs_info->fs_devices->latest_bdev;
616 set_bit(EXTENT_FLAG_PINNED, &em->flags);
617 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
618
619 while (1) {
620 spin_lock(&em_tree->lock);
621 ret = add_extent_mapping(em_tree, em);
622 spin_unlock(&em_tree->lock);
623 if (ret != -EEXIST) {
624 free_extent_map(em);
625 break;
626 }
627 btrfs_drop_extent_cache(inode, async_extent->start,
628 async_extent->start +
629 async_extent->ram_size - 1, 0);
630 }
631
632 ret = btrfs_add_ordered_extent(inode, async_extent->start,
633 ins.objectid,
634 async_extent->ram_size,
635 ins.offset,
636 BTRFS_ORDERED_COMPRESSED);
637 BUG_ON(ret);
638
639 btrfs_end_transaction(trans, root);
640
641 /*
642 * clear dirty, set writeback and unlock the pages.
643 */
644 extent_clear_unlock_delalloc(inode,
645 &BTRFS_I(inode)->io_tree,
646 async_extent->start,
647 async_extent->start +
648 async_extent->ram_size - 1,
649 NULL, 1, 1, 0, 1, 1, 0);
650
651 ret = btrfs_submit_compressed_write(inode,
652 async_extent->start,
653 async_extent->ram_size,
654 ins.objectid,
655 ins.offset, async_extent->pages,
656 async_extent->nr_pages);
657
658 BUG_ON(ret);
659 trans = btrfs_join_transaction(root, 1);
660 alloc_hint = ins.objectid + ins.offset;
661 kfree(async_extent);
662 cond_resched();
663 }
664
665 btrfs_end_transaction(trans, root);
666 return 0;
667}
668
669/*
670 * when extent_io.c finds a delayed allocation range in the file,
671 * the callbacks end up in this code. The basic idea is to
672 * allocate extents on disk for the range, and create ordered data structs
673 * in ram to track those extents.
674 *
675 * locked_page is the page that writepage had locked already. We use
676 * it to make sure we don't do extra locks or unlocks.
677 *
678 * *page_started is set to one if we unlock locked_page and do everything
679 * required to start IO on it. It may be clean and already done with
680 * IO when we return.
681 */
682static noinline int cow_file_range(struct inode *inode,
683 struct page *locked_page,
684 u64 start, u64 end, int *page_started,
685 unsigned long *nr_written,
686 int unlock)
687{
688 struct btrfs_root *root = BTRFS_I(inode)->root;
689 struct btrfs_trans_handle *trans;
690 u64 alloc_hint = 0;
691 u64 num_bytes;
692 unsigned long ram_size;
693 u64 disk_num_bytes;
694 u64 cur_alloc_size;
695 u64 blocksize = root->sectorsize;
696 u64 actual_end;
697 u64 isize = i_size_read(inode);
698 struct btrfs_key ins;
699 struct extent_map *em;
700 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
701 int ret = 0;
702
703 trans = btrfs_join_transaction(root, 1);
704 BUG_ON(!trans);
705 btrfs_set_trans_block_group(trans, inode);
706
707 actual_end = min_t(u64, isize, end + 1);
708
709 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
710 num_bytes = max(blocksize, num_bytes);
711 disk_num_bytes = num_bytes;
712 ret = 0;
713
714 if (start == 0) {
715		/* let's try to make an inline extent */
716 ret = cow_file_range_inline(trans, root, inode,
717 start, end, 0, NULL);
718 if (ret == 0) {
719 extent_clear_unlock_delalloc(inode,
720 &BTRFS_I(inode)->io_tree,
721 start, end, NULL, 1, 1,
722 1, 1, 1, 1);
723 *nr_written = *nr_written +
724 (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
725 *page_started = 1;
726 ret = 0;
727 goto out;
728 }
729 }
730
731 BUG_ON(disk_num_bytes >
732 btrfs_super_total_bytes(&root->fs_info->super_copy));
733
734 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
735
736 while (disk_num_bytes > 0) {
737 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
738 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
739 root->sectorsize, 0, alloc_hint,
740 (u64)-1, &ins, 1);
741 BUG_ON(ret);
742
743 em = alloc_extent_map(GFP_NOFS);
744 em->start = start;
745 em->orig_start = em->start;
746
747 ram_size = ins.offset;
748 em->len = ins.offset;
749
750 em->block_start = ins.objectid;
751 em->block_len = ins.offset;
752 em->bdev = root->fs_info->fs_devices->latest_bdev;
753 set_bit(EXTENT_FLAG_PINNED, &em->flags);
754
755 while (1) {
756 spin_lock(&em_tree->lock);
757 ret = add_extent_mapping(em_tree, em);
758 spin_unlock(&em_tree->lock);
759 if (ret != -EEXIST) {
760 free_extent_map(em);
761 break;
762 }
763 btrfs_drop_extent_cache(inode, start,
764 start + ram_size - 1, 0);
765 }
766
767 cur_alloc_size = ins.offset;
768 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
769 ram_size, cur_alloc_size, 0);
770 BUG_ON(ret);
771
772 if (root->root_key.objectid ==
773 BTRFS_DATA_RELOC_TREE_OBJECTID) {
774 ret = btrfs_reloc_clone_csums(inode, start,
775 cur_alloc_size);
776 BUG_ON(ret);
777 }
778
779 if (disk_num_bytes < cur_alloc_size)
780 break;
781
782 /* we're not doing compressed IO, don't unlock the first
783 * page (which the caller expects to stay locked), don't
784 * clear any dirty bits and don't set any writeback bits
785 */
786 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
787 start, start + ram_size - 1,
788 locked_page, unlock, 1,
789 1, 0, 0, 0);
790 disk_num_bytes -= cur_alloc_size;
791 num_bytes -= cur_alloc_size;
792 alloc_hint = ins.objectid + ins.offset;
793 start += cur_alloc_size;
794 }
795out:
796 ret = 0;
797 btrfs_end_transaction(trans, root);
798
799 return ret;
800}
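
[editor's note] The size rounding used in this function, num_bytes = (end - start + blocksize) & ~(blocksize - 1), is the usual power-of-two round-up of an inclusive byte range to whole blocks; a quick standalone check (illustrative):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t blocksize = 4096;	/* power of two, like sectorsize */
	uint64_t start = 0, end = 5000;	/* inclusive byte range [0, 5000] */

	/* end - start + blocksize = 9096, masked down to the 4096
	 * boundary below it -> 8192, i.e. the two blocks covering
	 * the 5001-byte range */
	uint64_t num_bytes = (end - start + blocksize) & ~(blocksize - 1);

	printf("num_bytes = %llu\n", (unsigned long long)num_bytes);
	return 0;
}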
801
802/*
803 * work queue callback to start compression on a file and its pages
804 */
805static noinline void async_cow_start(struct btrfs_work *work)
806{
807 struct async_cow *async_cow;
808 int num_added = 0;
809 async_cow = container_of(work, struct async_cow, work);
810
811 compress_file_range(async_cow->inode, async_cow->locked_page,
812 async_cow->start, async_cow->end, async_cow,
813 &num_added);
814 if (num_added == 0)
815 async_cow->inode = NULL;
816}
817
818/*
819 * work queue callback to submit previously compressed pages
820 */
821static noinline void async_cow_submit(struct btrfs_work *work)
822{
823 struct async_cow *async_cow;
824 struct btrfs_root *root;
825 unsigned long nr_pages;
826
827 async_cow = container_of(work, struct async_cow, work);
828
829 root = async_cow->root;
830 nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
831 PAGE_CACHE_SHIFT;
832
833 atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
834
835 if (atomic_read(&root->fs_info->async_delalloc_pages) <
836	    5 * 1024 * 1024 &&
837 waitqueue_active(&root->fs_info->async_submit_wait))
838 wake_up(&root->fs_info->async_submit_wait);
839
840 if (async_cow->inode)
841 submit_compressed_extents(async_cow->inode, async_cow);
842}
843
844static noinline void async_cow_free(struct btrfs_work *work)
845{
846 struct async_cow *async_cow;
847 async_cow = container_of(work, struct async_cow, work);
848 kfree(async_cow);
849}
850
851static int cow_file_range_async(struct inode *inode, struct page *locked_page,
852 u64 start, u64 end, int *page_started,
853 unsigned long *nr_written)
854{
855 struct async_cow *async_cow;
856 struct btrfs_root *root = BTRFS_I(inode)->root;
857 unsigned long nr_pages;
858 u64 cur_end;
859	int limit = 10 * 1024 * 1024;
860
861 if (!btrfs_test_opt(root, COMPRESS)) {
862 return cow_file_range(inode, locked_page, start, end,
863 page_started, nr_written, 1);
864 }
865
866 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
867 EXTENT_DELALLOC, 1, 0, GFP_NOFS);
868 while (start < end) {
869 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
870 async_cow->inode = inode;
871 async_cow->root = root;
872 async_cow->locked_page = locked_page;
873 async_cow->start = start;
874
875 if (btrfs_test_flag(inode, NOCOMPRESS))
876 cur_end = end;
877 else
878 cur_end = min(end, start + 512 * 1024 - 1);
879
880 async_cow->end = cur_end;
881 INIT_LIST_HEAD(&async_cow->extents);
882
883 async_cow->work.func = async_cow_start;
884 async_cow->work.ordered_func = async_cow_submit;
885 async_cow->work.ordered_free = async_cow_free;
886 async_cow->work.flags = 0;
887
888 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
889 PAGE_CACHE_SHIFT;
890 atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
891
892 btrfs_queue_worker(&root->fs_info->delalloc_workers,
893 &async_cow->work);
894
895 if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
896 wait_event(root->fs_info->async_submit_wait,
897 (atomic_read(&root->fs_info->async_delalloc_pages) <
898 limit));
899 }
900
901 while (atomic_read(&root->fs_info->async_submit_draining) &&
902 atomic_read(&root->fs_info->async_delalloc_pages)) {
903 wait_event(root->fs_info->async_submit_wait,
904 (atomic_read(&root->fs_info->async_delalloc_pages) ==
905 0));
906 }
907
908 *nr_written += nr_pages;
909 start = cur_end + 1;
910 }
911 *page_started = 1;
912 return 0;
913}
914
915static noinline int csum_exist_in_range(struct btrfs_root *root,
916 u64 bytenr, u64 num_bytes)
917{
918 int ret;
919 struct btrfs_ordered_sum *sums;
920 LIST_HEAD(list);
921
922 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
923 bytenr + num_bytes - 1, &list);
924 if (ret == 0 && list_empty(&list))
925 return 0;
926
927 while (!list_empty(&list)) {
928 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
929 list_del(&sums->list);
930 kfree(sums);
931 }
932 return 1;
933}
934
935/*
936 * the nocow writeback callback. This checks for snapshots or COW copies
937 * of the extents that exist in the file, and COWs the file as required.
938 *
939 * If no cow copies or snapshots exist, we write directly to the existing
940 * blocks on disk
941 */
942static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
943 u64 start, u64 end, int *page_started, int force,
944 unsigned long *nr_written)
945{
946 struct btrfs_root *root = BTRFS_I(inode)->root;
947 struct btrfs_trans_handle *trans;
948 struct extent_buffer *leaf;
949 struct btrfs_path *path;
950 struct btrfs_file_extent_item *fi;
951 struct btrfs_key found_key;
952 u64 cow_start;
953 u64 cur_offset;
954 u64 extent_end;
955 u64 disk_bytenr;
956 u64 num_bytes;
957 int extent_type;
958 int ret;
959 int type;
960 int nocow;
961 int check_prev = 1;
962
963 path = btrfs_alloc_path();
964 BUG_ON(!path);
965 trans = btrfs_join_transaction(root, 1);
966 BUG_ON(!trans);
967
968 cow_start = (u64)-1;
969 cur_offset = start;
970 while (1) {
971 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
972 cur_offset, 0);
973 BUG_ON(ret < 0);
974 if (ret > 0 && path->slots[0] > 0 && check_prev) {
975 leaf = path->nodes[0];
976 btrfs_item_key_to_cpu(leaf, &found_key,
977 path->slots[0] - 1);
978 if (found_key.objectid == inode->i_ino &&
979 found_key.type == BTRFS_EXTENT_DATA_KEY)
980 path->slots[0]--;
981 }
982 check_prev = 0;
983next_slot:
984 leaf = path->nodes[0];
985 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
986 ret = btrfs_next_leaf(root, path);
987 if (ret < 0)
988 BUG_ON(1);
989 if (ret > 0)
990 break;
991 leaf = path->nodes[0];
992 }
993
994 nocow = 0;
995 disk_bytenr = 0;
996 num_bytes = 0;
997 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
998
999 if (found_key.objectid > inode->i_ino ||
1000 found_key.type > BTRFS_EXTENT_DATA_KEY ||
1001 found_key.offset > end)
1002 break;
1003
1004 if (found_key.offset > cur_offset) {
1005 extent_end = found_key.offset;
1006 goto out_check;
1007 }
1008
1009 fi = btrfs_item_ptr(leaf, path->slots[0],
1010 struct btrfs_file_extent_item);
1011 extent_type = btrfs_file_extent_type(leaf, fi);
1012
1013 if (extent_type == BTRFS_FILE_EXTENT_REG ||
1014 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1015 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1016 extent_end = found_key.offset +
1017 btrfs_file_extent_num_bytes(leaf, fi);
1018 if (extent_end <= start) {
1019 path->slots[0]++;
1020 goto next_slot;
1021 }
1022 if (disk_bytenr == 0)
1023 goto out_check;
1024 if (btrfs_file_extent_compression(leaf, fi) ||
1025 btrfs_file_extent_encryption(leaf, fi) ||
1026 btrfs_file_extent_other_encoding(leaf, fi))
1027 goto out_check;
1028 if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1029 goto out_check;
1030 if (btrfs_extent_readonly(root, disk_bytenr))
1031 goto out_check;
1032 if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
1033 disk_bytenr))
1034 goto out_check;
1035 disk_bytenr += btrfs_file_extent_offset(leaf, fi);
1036 disk_bytenr += cur_offset - found_key.offset;
1037 num_bytes = min(end + 1, extent_end) - cur_offset;
1038 /*
1039 * force COW if csums exist in the range.
1040 * this ensures that the csums for a given extent
1041 * are either all valid or do not exist.
1042 */
1043 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
1044 goto out_check;
1045 nocow = 1;
1046 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1047 extent_end = found_key.offset +
1048 btrfs_file_extent_inline_len(leaf, fi);
1049 extent_end = ALIGN(extent_end, root->sectorsize);
1050 } else {
1051 BUG_ON(1);
1052 }
1053out_check:
1054 if (extent_end <= start) {
1055 path->slots[0]++;
1056 goto next_slot;
1057 }
1058 if (!nocow) {
1059 if (cow_start == (u64)-1)
1060 cow_start = cur_offset;
1061 cur_offset = extent_end;
1062 if (cur_offset > end)
1063 break;
1064 path->slots[0]++;
1065 goto next_slot;
1066 }
1067
1068 btrfs_release_path(root, path);
1069 if (cow_start != (u64)-1) {
1070 ret = cow_file_range(inode, locked_page, cow_start,
1071 found_key.offset - 1, page_started,
1072 nr_written, 1);
1073 BUG_ON(ret);
1074 cow_start = (u64)-1;
1075 }
1076
1077 if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1078 struct extent_map *em;
1079 struct extent_map_tree *em_tree;
1080 em_tree = &BTRFS_I(inode)->extent_tree;
1081 em = alloc_extent_map(GFP_NOFS);
1082 em->start = cur_offset;
1083 em->orig_start = em->start;
1084 em->len = num_bytes;
1085 em->block_len = num_bytes;
1086 em->block_start = disk_bytenr;
1087 em->bdev = root->fs_info->fs_devices->latest_bdev;
1088 set_bit(EXTENT_FLAG_PINNED, &em->flags);
1089 while (1) {
1090 spin_lock(&em_tree->lock);
1091 ret = add_extent_mapping(em_tree, em);
1092 spin_unlock(&em_tree->lock);
1093 if (ret != -EEXIST) {
1094 free_extent_map(em);
1095 break;
1096 }
1097 btrfs_drop_extent_cache(inode, em->start,
1098 em->start + em->len - 1, 0);
1099 }
1100 type = BTRFS_ORDERED_PREALLOC;
1101 } else {
1102 type = BTRFS_ORDERED_NOCOW;
1103 }
1104
1105 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1106 num_bytes, num_bytes, type);
1107 BUG_ON(ret);
1108
1109 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1110 cur_offset, cur_offset + num_bytes - 1,
1111 locked_page, 1, 1, 1, 0, 0, 0);
1112 cur_offset = extent_end;
1113 if (cur_offset > end)
1114 break;
1115 }
1116 btrfs_release_path(root, path);
1117
1118 if (cur_offset <= end && cow_start == (u64)-1)
1119 cow_start = cur_offset;
1120 if (cow_start != (u64)-1) {
1121 ret = cow_file_range(inode, locked_page, cow_start, end,
1122 page_started, nr_written, 1);
1123 BUG_ON(ret);
1124 }
1125
1126 ret = btrfs_end_transaction(trans, root);
1127 BUG_ON(ret);
1128 btrfs_free_path(path);
1129 return 0;
1130}
1131
1132/*
1133 * extent_io.c call back to do delayed allocation processing
1134 */
1135static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1136 u64 start, u64 end, int *page_started,
1137 unsigned long *nr_written)
1138{
1139 int ret;
1140
1141 if (btrfs_test_flag(inode, NODATACOW))
1142 ret = run_delalloc_nocow(inode, locked_page, start, end,
1143 page_started, 1, nr_written);
1144 else if (btrfs_test_flag(inode, PREALLOC))
1145 ret = run_delalloc_nocow(inode, locked_page, start, end,
1146 page_started, 0, nr_written);
1147 else
1148 ret = cow_file_range_async(inode, locked_page, start, end,
1149 page_started, nr_written);
1150
1151 return ret;
1152}
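
/*
 * Editorial summary of the dispatch above: NODATACOW inodes force the
 * nocow walk for every extent (force == 1); PREALLOC inodes only allow
 * nocow writes into preallocated extents (force == 0, so regular
 * extents still COW); everything else takes the async COW path, which
 * is also where compression happens.
 */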
1153
1154/*
1155 * extent_io.c set_bit_hook, used to track delayed allocation
1156 * bytes in this file, and to maintain the list of inodes that
1157 * have pending delalloc work to be done.
1158 */
1159static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1160 unsigned long old, unsigned long bits)
1161{
1162 /*
1163 * set_bit and clear_bit hooks normally require _irqsave/restore
1164 * but in this case, we are only testing for the DELALLOC
1165 * bit, which is only set or cleared with irqs on
1166 */
1167 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1168 struct btrfs_root *root = BTRFS_I(inode)->root;
1169 spin_lock(&root->fs_info->delalloc_lock);
1170 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
1171 root->fs_info->delalloc_bytes += end - start + 1;
1172 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1173 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1174 &root->fs_info->delalloc_inodes);
1175 }
1176 spin_unlock(&root->fs_info->delalloc_lock);
1177 }
1178 return 0;
1179}
1180
1181/*
1182 * extent_io.c clear_bit_hook, see set_bit_hook for why
1183 */
1184static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
1185 unsigned long old, unsigned long bits)
1186{
1187 /*
1188 * set_bit and clear_bit hooks normally require _irqsave/restore
1189 * but in this case, we are only testing for the DELALLOC
1190 * bit, which is only set or cleared with irqs on
1191 */
1192 if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1193 struct btrfs_root *root = BTRFS_I(inode)->root;
1194
1195 spin_lock(&root->fs_info->delalloc_lock);
1196 if (end - start + 1 > root->fs_info->delalloc_bytes) {
1197 printk(KERN_INFO "btrfs warning: delalloc account "
1198 "%llu %llu\n",
1199 (unsigned long long)(end - start + 1),
1200 (unsigned long long)
1201 root->fs_info->delalloc_bytes);
1202 root->fs_info->delalloc_bytes = 0;
1203 BTRFS_I(inode)->delalloc_bytes = 0;
1204 } else {
1205 root->fs_info->delalloc_bytes -= end - start + 1;
1206 BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
1207 }
1208 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
1209 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1210 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1211 }
1212 spin_unlock(&root->fs_info->delalloc_lock);
1213 }
1214 return 0;
1215}
1216
1217/*
1218 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1219 * we don't create bios that span stripes or chunks
1220 */
1221int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1222 size_t size, struct bio *bio,
1223 unsigned long bio_flags)
1224{
1225 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1226 struct btrfs_mapping_tree *map_tree;
1227 u64 logical = (u64)bio->bi_sector << 9;
1228 u64 length = 0;
1229 u64 map_length;
1230 int ret;
1231
1232 if (bio_flags & EXTENT_BIO_COMPRESSED)
1233 return 0;
1234
1235 length = bio->bi_size;
1236 map_tree = &root->fs_info->mapping_tree;
1237 map_length = length;
1238 ret = btrfs_map_block(map_tree, READ, logical,
1239 &map_length, NULL, 0);
1240
1241 if (map_length < length + size)
1242 return 1;
1243 return 0;
1244}
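
/*
 * Editorial note: btrfs_map_block() trims map_length down to the
 * number of bytes that stay within one stripe/chunk starting at
 * 'logical' (its return value is ignored here).  If the existing bio
 * plus the page being merged (length + size) would cross that
 * boundary, returning 1 tells extent_io.c to start a new bio instead
 * of merging.
 */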
1245
1246/*
1247 * in order to insert checksums into the metadata in large chunks,
1248 * we wait until bio submission time. All the pages in the bio are
1249 * checksummed and sums are attached onto the ordered extent record.
1250 *
1251 * At IO completion time the csums attached to the ordered extent record
1252 * are inserted into the btree
1253 */
1254static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1255 struct bio *bio, int mirror_num,
1256 unsigned long bio_flags)
1257{
1258 struct btrfs_root *root = BTRFS_I(inode)->root;
1259 int ret = 0;
1260
1261 ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1262 BUG_ON(ret);
1263 return 0;
1264}
1265
1266/*
1267 * in order to insert checksums into the metadata in large chunks,
1268 * we wait until bio submission time. __btrfs_submit_bio_start above
1269 * attaches the checksums, so by the time this done hook runs the bio
1270 * only needs to be handed to the device mapping layer.
1271 *
1272 * At IO completion time those csums are inserted into the btree.
1273 */
1274static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1275 int mirror_num, unsigned long bio_flags)
1276{
1277 struct btrfs_root *root = BTRFS_I(inode)->root;
1278 return btrfs_map_bio(root, rw, bio, mirror_num, 1);
1279}
1280
1281/*
1282 * extent_io.c submission hook. This does the right thing for csum calculation
1283 * on write, or reading the csums from the tree before a read
1284 */
1285static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1286 int mirror_num, unsigned long bio_flags)
1287{
1288 struct btrfs_root *root = BTRFS_I(inode)->root;
1289 int ret = 0;
1290 int skip_sum;
1291
1292 skip_sum = btrfs_test_flag(inode, NODATASUM);
1293
1294 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
1295 BUG_ON(ret);
1296
1297 if (!(rw & (1 << BIO_RW))) {
1298 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1299 return btrfs_submit_compressed_read(inode, bio,
1300 mirror_num, bio_flags);
1301 } else if (!skip_sum)
1302 btrfs_lookup_bio_sums(root, inode, bio, NULL);
1303 goto mapit;
1304 } else if (!skip_sum) {
1305 /* csum items have already been cloned */
1306 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1307 goto mapit;
1308 /* we're doing a write, do the async checksumming */
1309 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1310 inode, rw, bio, mirror_num,
1311 bio_flags, __btrfs_submit_bio_start,
1312 __btrfs_submit_bio_done);
1313 }
1314
1315mapit:
1316 return btrfs_map_bio(root, rw, bio, mirror_num, 0);
1317}
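
/*
 * Editorial sketch of the decision above, reading rw and the flags:
 *
 *	read,  compressed -> btrfs_submit_compressed_read()
 *	read,  csummed    -> btrfs_lookup_bio_sums(), then map the bio
 *	write, csummed    -> async: csum in __btrfs_submit_bio_start,
 *			     map in __btrfs_submit_bio_done
 *	NODATASUM (or the data reloc tree on writes) -> map directly
 */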
1318
1319/*
1320 * given a list of ordered sums, record them in the inode. This happens
1321 * at IO completion time based on sums calculated at bio submission time.
1322 */
1323static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1324 struct inode *inode, u64 file_offset,
1325 struct list_head *list)
1326{
1327 struct list_head *cur;
1328 struct btrfs_ordered_sum *sum;
1329
1330 btrfs_set_trans_block_group(trans, inode);
1331 list_for_each(cur, list) {
1332 sum = list_entry(cur, struct btrfs_ordered_sum, list);
1333 btrfs_csum_file_blocks(trans,
1334 BTRFS_I(inode)->root->fs_info->csum_root, sum);
1335 }
1336 return 0;
1337}
1338
1339int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
1340{
1341 if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
1342 WARN_ON(1);
1343 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1344 GFP_NOFS);
1345}
1346
1347/* see btrfs_writepage_start_hook for details on why this is required */
1348struct btrfs_writepage_fixup {
1349 struct page *page;
1350 struct btrfs_work work;
1351};
1352
1353static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1354{
1355 struct btrfs_writepage_fixup *fixup;
1356 struct btrfs_ordered_extent *ordered;
1357 struct page *page;
1358 struct inode *inode;
1359 u64 page_start;
1360 u64 page_end;
1361
1362 fixup = container_of(work, struct btrfs_writepage_fixup, work);
1363 page = fixup->page;
1364again:
1365 lock_page(page);
1366 if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
1367 ClearPageChecked(page);
1368 goto out_page;
1369 }
1370
1371 inode = page->mapping->host;
1372 page_start = page_offset(page);
1373 page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
1374
1375 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
1376
1377 /* already ordered? We're done */
1378 if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
1379 EXTENT_ORDERED, 0)) {
1380 goto out;
1381 }
1382
1383 ordered = btrfs_lookup_ordered_extent(inode, page_start);
1384 if (ordered) {
1385 unlock_extent(&BTRFS_I(inode)->io_tree, page_start,
1386 page_end, GFP_NOFS);
1387 unlock_page(page);
1388 btrfs_start_ordered_extent(inode, ordered, 1);
1389 goto again;
1390 }
1391
1392 btrfs_set_extent_delalloc(inode, page_start, page_end);
1393 ClearPageChecked(page);
1394out:
1395 unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
1396out_page:
1397 unlock_page(page);
1398 page_cache_release(page);
1399}
1400
1401/*
1402 * There are a few paths in the higher layers of the kernel that directly
1403 * set the page dirty bit without asking the filesystem if it is a
1404 * good idea. This causes problems because we want to make sure COW
1405 * properly happens and the data=ordered rules are followed.
1406 *
1407 * In our case any range that doesn't have the ORDERED bit set
1408 * hasn't been properly set up for IO. We kick off an async process
1409 * to fix it up. The async helper will wait for ordered extents, set
1410 * the delalloc bit and make it safe to write the page.
1411 */
1412static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1413{
1414 struct inode *inode = page->mapping->host;
1415 struct btrfs_writepage_fixup *fixup;
1416 struct btrfs_root *root = BTRFS_I(inode)->root;
1417 int ret;
1418
1419 ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1420 EXTENT_ORDERED, 0);
1421 if (ret)
1422 return 0;
1423
1424 if (PageChecked(page))
1425 return -EAGAIN;
1426
1427 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
1428 if (!fixup)
1429 return -EAGAIN;
1430
1431 SetPageChecked(page);
1432 page_cache_get(page);
1433 fixup->work.func = btrfs_writepage_fixup_worker;
1434 fixup->page = page;
1435 btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
1436 return -EAGAIN;
1437}
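
/*
 * Editorial note: returning -EAGAIN makes the extent_io writepage code
 * redirty the page and skip it for this pass; the queued fixup worker
 * then waits out any ordered extent, marks the range delalloc again
 * and makes the page safe to write later.
 */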
1438
1439static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1440 struct inode *inode, u64 file_pos,
1441 u64 disk_bytenr, u64 disk_num_bytes,
1442 u64 num_bytes, u64 ram_bytes,
1443 u8 compression, u8 encryption,
1444 u16 other_encoding, int extent_type)
1445{
1446 struct btrfs_root *root = BTRFS_I(inode)->root;
1447 struct btrfs_file_extent_item *fi;
1448 struct btrfs_path *path;
1449 struct extent_buffer *leaf;
1450 struct btrfs_key ins;
1451 u64 hint;
1452 int ret;
1453
1454 path = btrfs_alloc_path();
1455 BUG_ON(!path);
1456
1457 ret = btrfs_drop_extents(trans, root, inode, file_pos,
1458 file_pos + num_bytes, file_pos, &hint);
1459 BUG_ON(ret);
1460
1461 ins.objectid = inode->i_ino;
1462 ins.offset = file_pos;
1463 ins.type = BTRFS_EXTENT_DATA_KEY;
1464 ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
1465 BUG_ON(ret);
1466 leaf = path->nodes[0];
1467 fi = btrfs_item_ptr(leaf, path->slots[0],
1468 struct btrfs_file_extent_item);
1469 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1470 btrfs_set_file_extent_type(leaf, fi, extent_type);
1471 btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
1472 btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
1473 btrfs_set_file_extent_offset(leaf, fi, 0);
1474 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1475 btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
1476 btrfs_set_file_extent_compression(leaf, fi, compression);
1477 btrfs_set_file_extent_encryption(leaf, fi, encryption);
1478 btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1479 btrfs_mark_buffer_dirty(leaf);
1480
1481 inode_add_bytes(inode, num_bytes);
1482 btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0);
1483
1484 ins.objectid = disk_bytenr;
1485 ins.offset = disk_num_bytes;
1486 ins.type = BTRFS_EXTENT_ITEM_KEY;
1487 ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
1488 root->root_key.objectid,
1489 trans->transid, inode->i_ino, &ins);
1490 BUG_ON(ret);
1491
1492 btrfs_free_path(path);
1493 return 0;
1494}
1495
1496/* as ordered data IO finishes, this gets called so we can finish
1497 * an ordered extent if the range of bytes in the file it covers is
1498 * fully written.
1499 */
1500static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1501{
1502 struct btrfs_root *root = BTRFS_I(inode)->root;
1503 struct btrfs_trans_handle *trans;
1504 struct btrfs_ordered_extent *ordered_extent;
1505 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1506 int compressed = 0;
1507 int ret;
1508
1509 ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
1510 if (!ret)
1511 return 0;
1512
1513 trans = btrfs_join_transaction(root, 1);
1514
1515 ordered_extent = btrfs_lookup_ordered_extent(inode, start);
1516 BUG_ON(!ordered_extent);
1517 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
1518 goto nocow;
1519
1520 lock_extent(io_tree, ordered_extent->file_offset,
1521 ordered_extent->file_offset + ordered_extent->len - 1,
1522 GFP_NOFS);
1523
1524 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1525 compressed = 1;
1526 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1527 BUG_ON(compressed);
1528 ret = btrfs_mark_extent_written(trans, root, inode,
1529 ordered_extent->file_offset,
1530 ordered_extent->file_offset +
1531 ordered_extent->len);
1532 BUG_ON(ret);
1533 } else {
1534 ret = insert_reserved_file_extent(trans, inode,
1535 ordered_extent->file_offset,
1536 ordered_extent->start,
1537 ordered_extent->disk_len,
1538 ordered_extent->len,
1539 ordered_extent->len,
1540 compressed, 0, 0,
1541 BTRFS_FILE_EXTENT_REG);
1542 BUG_ON(ret);
1543 }
1544 unlock_extent(io_tree, ordered_extent->file_offset,
1545 ordered_extent->file_offset + ordered_extent->len - 1,
1546 GFP_NOFS);
1547nocow:
1548 add_pending_csums(trans, inode, ordered_extent->file_offset,
1549 &ordered_extent->list);
1550
1551 mutex_lock(&BTRFS_I(inode)->extent_mutex);
1552 btrfs_ordered_update_i_size(inode, ordered_extent);
1553 btrfs_update_inode(trans, root, inode);
1554 btrfs_remove_ordered_extent(inode, ordered_extent);
1555 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
1556
1557 /* once for us */
1558 btrfs_put_ordered_extent(ordered_extent);
1559 /* once for the tree */
1560 btrfs_put_ordered_extent(ordered_extent);
1561
1562 btrfs_end_transaction(trans, root);
1563 return 0;
1564}
1565
1566static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1567 struct extent_state *state, int uptodate)
1568{
1569 return btrfs_finish_ordered_io(page->mapping->host, start, end);
1570}
1571
1572/*
1573 * When IO fails, either with EIO or because csum verification fails, we
1574 * try other mirrors that might have a good copy of the data. This
1575 * io_failure_record is used to record state as we go through all the
1576 * mirrors. If another mirror has good data, the page is set up to date
1577 * and things continue. If a good mirror can't be found, the original
1578 * bio end_io callback is called to indicate things have failed.
1579 */
1580struct io_failure_record {
1581 struct page *page;
1582 u64 start;
1583 u64 len;
1584 u64 logical;
1585 unsigned long bio_flags;
1586 int last_mirror;
1587};
1588
1589static int btrfs_io_failed_hook(struct bio *failed_bio,
1590 struct page *page, u64 start, u64 end,
1591 struct extent_state *state)
1592{
1593 struct io_failure_record *failrec = NULL;
1594 u64 private;
1595 struct extent_map *em;
1596 struct inode *inode = page->mapping->host;
1597 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1598 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1599 struct bio *bio;
1600 int num_copies;
1601 int ret;
1602 int rw;
1603 u64 logical;
1604
1605 ret = get_state_private(failure_tree, start, &private);
1606 if (ret) {
1607 failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
1608 if (!failrec)
1609 return -ENOMEM;
1610 failrec->start = start;
1611 failrec->len = end - start + 1;
1612 failrec->last_mirror = 0;
1613 failrec->bio_flags = 0;
1614
1615 spin_lock(&em_tree->lock);
1616 em = lookup_extent_mapping(em_tree, start, failrec->len);
1617 if (em->start > start || em->start + em->len < start) {
1618 free_extent_map(em);
1619 em = NULL;
1620 }
1621 spin_unlock(&em_tree->lock);
1622
1623 if (!em || IS_ERR(em)) {
1624 kfree(failrec);
1625 return -EIO;
1626 }
1627 logical = start - em->start;
1628 logical = em->block_start + logical;
1629 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1630 logical = em->block_start;
1631 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1632 }
1633 failrec->logical = logical;
1634 free_extent_map(em);
1635 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
1636 EXTENT_DIRTY, GFP_NOFS);
1637 set_state_private(failure_tree, start,
1638 (u64)(unsigned long)failrec);
1639 } else {
1640 failrec = (struct io_failure_record *)(unsigned long)private;
1641 }
1642 num_copies = btrfs_num_copies(
1643 &BTRFS_I(inode)->root->fs_info->mapping_tree,
1644 failrec->logical, failrec->len);
1645 failrec->last_mirror++;
1646 if (!state) {
1647 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1648 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1649 failrec->start,
1650 EXTENT_LOCKED);
1651 if (state && state->start != failrec->start)
1652 state = NULL;
1653 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1654 }
1655 if (!state || failrec->last_mirror > num_copies) {
1656 set_state_private(failure_tree, failrec->start, 0);
1657 clear_extent_bits(failure_tree, failrec->start,
1658 failrec->start + failrec->len - 1,
1659 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1660 kfree(failrec);
1661 return -EIO;
1662 }
1663 bio = bio_alloc(GFP_NOFS, 1);
1664 bio->bi_private = state;
1665 bio->bi_end_io = failed_bio->bi_end_io;
1666 bio->bi_sector = failrec->logical >> 9;
1667 bio->bi_bdev = failed_bio->bi_bdev;
1668 bio->bi_size = 0;
1669
1670 bio_add_page(bio, page, failrec->len, start - page_offset(page));
1671 if (failed_bio->bi_rw & (1 << BIO_RW))
1672 rw = WRITE;
1673 else
1674 rw = READ;
1675
1676 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1677 failrec->last_mirror,
1678 failrec->bio_flags);
1679 return 0;
1680}
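
/*
 * Editorial note on the retry loop: the io_failure_record persists in
 * the io_failure_tree across attempts while last_mirror walks from 1
 * up to num_copies.  On RAID1 (num_copies == 2), for example, a bad
 * read from mirror 1 is resubmitted against mirror 2; only when every
 * copy has been tried (or no matching extent state is found) does the
 * -EIO propagate to the original bio.
 */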
1681
1682/*
1683 * each time an IO finishes, we do a fast check in the IO failure tree
1684 * to see if we need to process or clean up an io_failure_record
1685 */
1686static int btrfs_clean_io_failures(struct inode *inode, u64 start)
1687{
1688 u64 private;
1689 u64 private_failure;
1690 struct io_failure_record *failure;
1691 int ret;
1692
1693 private = 0;
1694 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1695 (u64)-1, 1, EXTENT_DIRTY)) {
1696 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1697 start, &private_failure);
1698 if (ret == 0) {
1699 failure = (struct io_failure_record *)(unsigned long)
1700 private_failure;
1701 set_state_private(&BTRFS_I(inode)->io_failure_tree,
1702 failure->start, 0);
1703 clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
1704 failure->start,
1705 failure->start + failure->len - 1,
1706 EXTENT_DIRTY | EXTENT_LOCKED,
1707 GFP_NOFS);
1708 kfree(failure);
1709 }
1710 }
1711 return 0;
1712}
1713
1714/*
1715 * when reads are done, we need to check csums to verify the data is correct
1716 * if there's a match, we allow the bio to finish. If not, we go through
1717 * the io_failure_record routines to find good copies
1718 */
1719static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1720 struct extent_state *state)
1721{
1722 size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
1723 struct inode *inode = page->mapping->host;
1724 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1725 char *kaddr;
1726 u64 private = ~(u32)0;
1727 int ret;
1728 struct btrfs_root *root = BTRFS_I(inode)->root;
1729 u32 csum = ~(u32)0;
1730
1731 if (PageChecked(page)) {
1732 ClearPageChecked(page);
1733 goto good;
1734 }
1735 if (btrfs_test_flag(inode, NODATASUM))
1736 return 0;
1737
1738 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
1739 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) {
1740 clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
1741 GFP_NOFS);
1742 return 0;
1743 }
1744
1745 if (state && state->start == start) {
1746 private = state->private;
1747 ret = 0;
1748 } else {
1749 ret = get_state_private(io_tree, start, &private);
1750 }
1751 kaddr = kmap_atomic(page, KM_USER0);
1752 if (ret)
1753 goto zeroit;
1754
1755 csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1);
1756 btrfs_csum_final(csum, (char *)&csum);
1757 if (csum != private)
1758 goto zeroit;
1759
1760 kunmap_atomic(kaddr, KM_USER0);
1761good:
1762 /* if the io failure tree for this inode is non-empty,
1763 * check to see if we've recovered from a failed IO
1764 */
1765 btrfs_clean_io_failures(inode, start);
1766 return 0;
1767
1768zeroit:
1769 printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
1770 "private %llu\n", page->mapping->host->i_ino,
1771 (unsigned long long)start, csum,
1772 (unsigned long long)private);
1773 memset(kaddr + offset, 1, end - start + 1);
1774 flush_dcache_page(page);
1775 kunmap_atomic(kaddr, KM_USER0);
1776 if (private == 0)
1777 return 0;
1778 return -EIO;
1779}
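
/*
 * Editorial note on the zeroit path: the failed range is deliberately
 * filled with 0x01 bytes so a csum mismatch can never leak stale or
 * wrong data to userspace, and the -EIO return lets
 * btrfs_io_failed_hook() above try any remaining mirrors.
 */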
1780
1781/*
1782 * This creates an orphan entry for the given inode in case something goes
1783 * wrong in the middle of an unlink/truncate.
1784 */
1785int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
1786{
1787 struct btrfs_root *root = BTRFS_I(inode)->root;
1788 int ret = 0;
1789
1790 spin_lock(&root->list_lock);
1791
1792 /* already on the orphan list, we're good */
1793 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
1794 spin_unlock(&root->list_lock);
1795 return 0;
1796 }
1797
1798 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
1799
1800 spin_unlock(&root->list_lock);
1801
1802 /*
1803 * insert an orphan item to track this unlinked/truncated file
1804 */
1805 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
1806
1807 return ret;
1808}
1809
1810/*
1811 * We have done the truncate/delete so we can go ahead and remove the orphan
1812 * item for this particular inode.
1813 */
1814int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
1815{
1816 struct btrfs_root *root = BTRFS_I(inode)->root;
1817 int ret = 0;
1818
1819 spin_lock(&root->list_lock);
1820
1821 if (list_empty(&BTRFS_I(inode)->i_orphan)) {
1822 spin_unlock(&root->list_lock);
1823 return 0;
1824 }
1825
1826 list_del_init(&BTRFS_I(inode)->i_orphan);
1827 if (!trans) {
1828 spin_unlock(&root->list_lock);
1829 return 0;
1830 }
1831
1832 spin_unlock(&root->list_lock);
1833
1834 ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
1835
1836 return ret;
1837}
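
/*
 * Editorial sketch (not a quote from this file): the two helpers above
 * are meant to bracket a risky unlink/truncate, roughly
 *
 *	trans = btrfs_start_transaction(root, 1);
 *	btrfs_orphan_add(trans, inode);       (crash-safe from here)
 *	... unlink or truncate the items ...
 *	btrfs_orphan_del(trans, inode);       (done, drop the marker)
 *	btrfs_end_transaction(trans, root);
 *
 * so btrfs_orphan_cleanup() below can finish the job after a crash.
 * In this file, btrfs_unlink() adds the orphan once the last link is
 * gone and btrfs_delete_inode() drops it when the truncate completes.
 */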
1838
1839/*
1840 * this cleans up any orphans that may be left on the list from the last use
1841 * of this root.
1842 */
1843void btrfs_orphan_cleanup(struct btrfs_root *root)
1844{
1845 struct btrfs_path *path;
1846 struct extent_buffer *leaf;
1847 struct btrfs_item *item;
1848 struct btrfs_key key, found_key;
1849 struct btrfs_trans_handle *trans;
1850 struct inode *inode;
1851 int ret = 0, nr_unlink = 0, nr_truncate = 0;
1852
1853 path = btrfs_alloc_path();
1854 if (!path)
1855 return;
1856 path->reada = -1;
1857
1858 key.objectid = BTRFS_ORPHAN_OBJECTID;
1859 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1860 key.offset = (u64)-1;
1861
1862
1863 while (1) {
1864 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1865 if (ret < 0) {
1866 printk(KERN_ERR "Error searching slot for orphan: %d"
1867 "\n", ret);
1868 break;
1869 }
1870
1871 /*
1872 * ret == 0 means we found exactly what we were searching for,
1873 * which is weird, but possible; only adjust the path if we didn't
1874 * find the key, and see if we have stuff that matches
1875 */
1876 if (ret > 0) {
1877 if (path->slots[0] == 0)
1878 break;
1879 path->slots[0]--;
1880 }
1881
1882 /* pull out the item */
1883 leaf = path->nodes[0];
1884 item = btrfs_item_nr(leaf, path->slots[0]);
1885 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1886
1887 /* make sure the item matches what we want */
1888 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
1889 break;
1890 if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
1891 break;
1892
1893 /* release the path since we're done with it */
1894 btrfs_release_path(root, path);
1895
1896 /*
1897 * this is basically btrfs_lookup, without crossing into a
1898 * different root. we store the inode number in the
1899 * offset of the orphan item.
1900 */
1901 inode = btrfs_iget_locked(root->fs_info->sb,
1902 found_key.offset, root);
1903 if (!inode)
1904 break;
1905
1906 if (inode->i_state & I_NEW) {
1907 BTRFS_I(inode)->root = root;
1908
1909 /* have to set the location manually */
1910 BTRFS_I(inode)->location.objectid = inode->i_ino;
1911 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
1912 BTRFS_I(inode)->location.offset = 0;
1913
1914 btrfs_read_locked_inode(inode);
1915 unlock_new_inode(inode);
1916 }
1917
1918 /*
1919 * add this inode to the orphan list so btrfs_orphan_del does
1920 * the proper thing when we hit it
1921 */
1922 spin_lock(&root->list_lock);
1923 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
1924 spin_unlock(&root->list_lock);
1925
1926 /*
1927 * if this is a bad inode, it means we actually succeeded in
1928 * removing the inode, but not the orphan record, which means
1929 * we need to manually delete the orphan since iput will just
1930 * do a destroy_inode
1931 */
1932 if (is_bad_inode(inode)) {
1933 trans = btrfs_start_transaction(root, 1);
1934 btrfs_orphan_del(trans, inode);
1935 btrfs_end_transaction(trans, root);
1936 iput(inode);
1937 continue;
1938 }
1939
1940 /* if we have links, this was a truncate, let's do that */
1941 if (inode->i_nlink) {
1942 nr_truncate++;
1943 btrfs_truncate(inode);
1944 } else {
1945 nr_unlink++;
1946 }
1947
1948 /* this will do delete_inode and everything for us */
1949 iput(inode);
1950 }
1951
1952 if (nr_unlink)
1953 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
1954 if (nr_truncate)
1955 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
1956
1957 btrfs_free_path(path);
1958}
1959
1960/*
1961 * read an inode from the btree into the in-memory inode
1962 */
1963void btrfs_read_locked_inode(struct inode *inode)
1964{
1965 struct btrfs_path *path;
1966 struct extent_buffer *leaf;
1967 struct btrfs_inode_item *inode_item;
1968 struct btrfs_timespec *tspec;
1969 struct btrfs_root *root = BTRFS_I(inode)->root;
1970 struct btrfs_key location;
1971 u64 alloc_group_block;
1972 u32 rdev;
1973 int ret;
1974
1975 path = btrfs_alloc_path();
1976 BUG_ON(!path);
1977 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
1978
1979 ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
1980 if (ret)
1981 goto make_bad;
1982
1983 leaf = path->nodes[0];
1984 inode_item = btrfs_item_ptr(leaf, path->slots[0],
1985 struct btrfs_inode_item);
1986
1987 inode->i_mode = btrfs_inode_mode(leaf, inode_item);
1988 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
1989 inode->i_uid = btrfs_inode_uid(leaf, inode_item);
1990 inode->i_gid = btrfs_inode_gid(leaf, inode_item);
1991 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
1992
1993 tspec = btrfs_inode_atime(inode_item);
1994 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
1995 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
1996
1997 tspec = btrfs_inode_mtime(inode_item);
1998 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
1999 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
2000
2001 tspec = btrfs_inode_ctime(inode_item);
2002 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
2003 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
2004
2005 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
2006 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
2007 BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item);
2008 inode->i_generation = BTRFS_I(inode)->generation;
2009 inode->i_rdev = 0;
2010 rdev = btrfs_inode_rdev(leaf, inode_item);
2011
2012 BTRFS_I(inode)->index_cnt = (u64)-1;
2013 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
2014
2015 alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
2016 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
2017 alloc_group_block, 0);
2018 btrfs_free_path(path);
2019 inode_item = NULL;
2020
2021 switch (inode->i_mode & S_IFMT) {
2022 case S_IFREG:
2023 inode->i_mapping->a_ops = &btrfs_aops;
2024 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2025 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
2026 inode->i_fop = &btrfs_file_operations;
2027 inode->i_op = &btrfs_file_inode_operations;
2028 break;
2029 case S_IFDIR:
2030 inode->i_fop = &btrfs_dir_file_operations;
2031 if (root == root->fs_info->tree_root)
2032 inode->i_op = &btrfs_dir_ro_inode_operations;
2033 else
2034 inode->i_op = &btrfs_dir_inode_operations;
2035 break;
2036 case S_IFLNK:
2037 inode->i_op = &btrfs_symlink_inode_operations;
2038 inode->i_mapping->a_ops = &btrfs_symlink_aops;
2039 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2040 break;
2041 default:
2042 init_special_inode(inode, inode->i_mode, rdev);
2043 break;
2044 }
2045 return;
2046
2047make_bad:
2048 btrfs_free_path(path);
2049 make_bad_inode(inode);
2050}
2051
2052/*
2053 * given a leaf and an inode, copy the inode fields into the leaf
2054 */
2055static void fill_inode_item(struct btrfs_trans_handle *trans,
2056 struct extent_buffer *leaf,
2057 struct btrfs_inode_item *item,
2058 struct inode *inode)
2059{
2060 btrfs_set_inode_uid(leaf, item, inode->i_uid);
2061 btrfs_set_inode_gid(leaf, item, inode->i_gid);
2062 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
2063 btrfs_set_inode_mode(leaf, item, inode->i_mode);
2064 btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
2065
2066 btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
2067 inode->i_atime.tv_sec);
2068 btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
2069 inode->i_atime.tv_nsec);
2070
2071 btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
2072 inode->i_mtime.tv_sec);
2073 btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
2074 inode->i_mtime.tv_nsec);
2075
2076 btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
2077 inode->i_ctime.tv_sec);
2078 btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
2079 inode->i_ctime.tv_nsec);
2080
2081 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
2082 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
2083 btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence);
2084 btrfs_set_inode_transid(leaf, item, trans->transid);
2085 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2086 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2087 btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group);
2088}
2089
2090/*
2091 * copy everything in the in-memory inode into the btree.
2092 */
2093noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2094 struct btrfs_root *root, struct inode *inode)
2095{
2096 struct btrfs_inode_item *inode_item;
2097 struct btrfs_path *path;
2098 struct extent_buffer *leaf;
2099 int ret;
2100
2101 path = btrfs_alloc_path();
2102 BUG_ON(!path);
2103 ret = btrfs_lookup_inode(trans, root, path,
2104 &BTRFS_I(inode)->location, 1);
2105 if (ret) {
2106 if (ret > 0)
2107 ret = -ENOENT;
2108 goto failed;
2109 }
2110
2111 leaf = path->nodes[0];
2112 inode_item = btrfs_item_ptr(leaf, path->slots[0],
2113 struct btrfs_inode_item);
2114
2115 fill_inode_item(trans, leaf, inode_item, inode);
2116 btrfs_mark_buffer_dirty(leaf);
2117 btrfs_set_inode_last_trans(trans, inode);
2118 ret = 0;
2119failed:
2120 btrfs_free_path(path);
2121 return ret;
2122}
2123
2124
2125/*
2126 * unlink helper that gets used here in inode.c and in the tree logging
2127 * recovery code. It removes a link in a directory with a given name, and
2128 * also drops the back refs in the inode to the directory
2129 */
2130int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2131 struct btrfs_root *root,
2132 struct inode *dir, struct inode *inode,
2133 const char *name, int name_len)
2134{
2135 struct btrfs_path *path;
2136 int ret = 0;
2137 struct extent_buffer *leaf;
2138 struct btrfs_dir_item *di;
2139 struct btrfs_key key;
2140 u64 index;
2141
2142 path = btrfs_alloc_path();
2143 if (!path) {
2144 ret = -ENOMEM;
2145 goto err;
2146 }
2147
2148 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2149 name, name_len, -1);
2150 if (IS_ERR(di)) {
2151 ret = PTR_ERR(di);
2152 goto err;
2153 }
2154 if (!di) {
2155 ret = -ENOENT;
2156 goto err;
2157 }
2158 leaf = path->nodes[0];
2159 btrfs_dir_item_key_to_cpu(leaf, di, &key);
2160 ret = btrfs_delete_one_dir_name(trans, root, path, di);
2161 if (ret)
2162 goto err;
2163 btrfs_release_path(root, path);
2164
2165 ret = btrfs_del_inode_ref(trans, root, name, name_len,
2166 inode->i_ino,
2167 dir->i_ino, &index);
2168 if (ret) {
2169 printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
2170 "inode %lu parent %lu\n", name_len, name,
2171 inode->i_ino, dir->i_ino);
2172 goto err;
2173 }
2174
2175 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
2176 index, name, name_len, -1);
2177 if (IS_ERR(di)) {
2178 ret = PTR_ERR(di);
2179 goto err;
2180 }
2181 if (!di) {
2182 ret = -ENOENT;
2183 goto err;
2184 }
2185 ret = btrfs_delete_one_dir_name(trans, root, path, di);
2186 btrfs_release_path(root, path);
2187
2188 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
2189 inode, dir->i_ino);
2190 BUG_ON(ret != 0 && ret != -ENOENT);
2191 if (ret != -ENOENT)
2192 BTRFS_I(dir)->log_dirty_trans = trans->transid;
2193
2194 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
2195 dir, index);
2196 BUG_ON(ret);
2197err:
2198 btrfs_free_path(path);
2199 if (ret)
2200 goto out;
2201
2202 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2203 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2204 btrfs_update_inode(trans, root, dir);
2205 btrfs_drop_nlink(inode);
2206 ret = btrfs_update_inode(trans, root, inode);
2207 dir->i_sb->s_dirt = 1;
2208out:
2209 return ret;
2210}
2211
2212static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2213{
2214 struct btrfs_root *root;
2215 struct btrfs_trans_handle *trans;
2216 struct inode *inode = dentry->d_inode;
2217 int ret;
2218 unsigned long nr = 0;
2219
2220 root = BTRFS_I(dir)->root;
2221
2222 ret = btrfs_check_free_space(root, 1, 1);
2223 if (ret)
2224 goto fail;
2225
2226 trans = btrfs_start_transaction(root, 1);
2227
2228 btrfs_set_trans_block_group(trans, dir);
2229 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2230 dentry->d_name.name, dentry->d_name.len);
2231
2232 if (inode->i_nlink == 0)
2233 ret = btrfs_orphan_add(trans, inode);
2234
2235 nr = trans->blocks_used;
2236
2237 btrfs_end_transaction_throttle(trans, root);
2238fail:
2239 btrfs_btree_balance_dirty(root, nr);
2240 return ret;
2241}
2242
2243static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2244{
2245 struct inode *inode = dentry->d_inode;
2246 int err = 0;
2247 int ret;
2248 struct btrfs_root *root = BTRFS_I(dir)->root;
2249 struct btrfs_trans_handle *trans;
2250 unsigned long nr = 0;
2251
2252 /*
2253 * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir
2254 * the root of a subvolume or snapshot
2255 */
2256 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
2257 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
2258 return -ENOTEMPTY;
2259 }
2260
2261 ret = btrfs_check_free_space(root, 1, 1);
2262 if (ret)
2263 goto fail;
2264
2265 trans = btrfs_start_transaction(root, 1);
2266 btrfs_set_trans_block_group(trans, dir);
2267
2268 err = btrfs_orphan_add(trans, inode);
2269 if (err)
2270 goto fail_trans;
2271
2272 /* now the directory is empty */
2273 err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2274 dentry->d_name.name, dentry->d_name.len);
2275 if (!err)
2276 btrfs_i_size_write(inode, 0);
2277
2278fail_trans:
2279 nr = trans->blocks_used;
2280 ret = btrfs_end_transaction_throttle(trans, root);
2281fail:
2282 btrfs_btree_balance_dirty(root, nr);
2283
2284 if (ret && !err)
2285 err = ret;
2286 return err;
2287}
2288
2289#if 0
2290/*
2291 * when truncating bytes in a file, it is possible to avoid reading
2292 * the leaves that contain only checksum items. This can be the
2293 * majority of the IO required to delete a large file, but it must
2294 * be done carefully.
2295 *
2296 * The keys in the level just above the leaves are checked to make sure
2297 * the lowest key in a given leaf is a csum key, and starts at an offset
2298 * after the new size.
2299 *
2300 * Then the key for the next leaf is checked to make sure it also has
2301 * a checksum item for the same file. If it does, we know our target leaf
2302 * contains only checksum items, and it can be safely freed without reading
2303 * it.
2304 *
2305 * This is just an optimization targeted at large files. It may do
2306 * nothing. It will return 0 unless things went badly.
2307 */
2308static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans,
2309 struct btrfs_root *root,
2310 struct btrfs_path *path,
2311 struct inode *inode, u64 new_size)
2312{
2313 struct btrfs_key key;
2314 int ret;
2315 int nritems;
2316 struct btrfs_key found_key;
2317 struct btrfs_key other_key;
2318 struct btrfs_leaf_ref *ref;
2319 u64 leaf_gen;
2320 u64 leaf_start;
2321
2322 path->lowest_level = 1;
2323 key.objectid = inode->i_ino;
2324 key.type = BTRFS_CSUM_ITEM_KEY;
2325 key.offset = new_size;
2326again:
2327 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2328 if (ret < 0)
2329 goto out;
2330
2331 if (path->nodes[1] == NULL) {
2332 ret = 0;
2333 goto out;
2334 }
2335 ret = 0;
2336 btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]);
2337 nritems = btrfs_header_nritems(path->nodes[1]);
2338
2339 if (!nritems)
2340 goto out;
2341
2342 if (path->slots[1] >= nritems)
2343 goto next_node;
2344
2345 /* did we find a key greater than anything we want to delete? */
2346 if (found_key.objectid > inode->i_ino ||
2347 (found_key.objectid == inode->i_ino && found_key.type > key.type))
2348 goto out;
2349
2350 /* we check the next key in the node to make sure the leaf contains
2351 * only checksum items. This comparison doesn't work if our
2352 * leaf is the last one in the node
2353 */
2354 if (path->slots[1] + 1 >= nritems) {
2355next_node:
2356 /* search forward from the last key in the node, this
2357 * will bring us into the next node in the tree
2358 */
2359 btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1);
2360
2361 /* unlikely, but we inc below, so check to be safe */
2362 if (found_key.offset == (u64)-1)
2363 goto out;
2364
2365 /* search_forward needs a path with locks held, do the
2366 * search again for the original key. It is possible
2367 * this will race with a balance and return a path that
2368 * we could modify, but this drop is just an optimization
2369 * and is allowed to miss some leaves.
2370 */
2371 btrfs_release_path(root, path);
2372 found_key.offset++;
2373
2374 /* setup a max key for search_forward */
2375 other_key.offset = (u64)-1;
2376 other_key.type = key.type;
2377 other_key.objectid = key.objectid;
2378
2379 path->keep_locks = 1;
2380 ret = btrfs_search_forward(root, &found_key, &other_key,
2381 path, 0, 0);
2382 path->keep_locks = 0;
2383 if (ret || found_key.objectid != key.objectid ||
2384 found_key.type != key.type) {
2385 ret = 0;
2386 goto out;
2387 }
2388
2389 key.offset = found_key.offset;
2390 btrfs_release_path(root, path);
2391 cond_resched();
2392 goto again;
2393 }
2394
2395 /* we know there's one more slot after us in the tree,
2396 * read that key so we can verify it is also a checksum item
2397 */
2398 btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1);
2399
2400 if (found_key.objectid < inode->i_ino)
2401 goto next_key;
2402
2403 if (found_key.type != key.type || found_key.offset < new_size)
2404 goto next_key;
2405
2406 /*
2407 * if the key for the next leaf isn't a csum key from this objectid,
2408 * we can't be sure there aren't good items inside this leaf.
2409 * Bail out
2410 */
2411 if (other_key.objectid != inode->i_ino || other_key.type != key.type)
2412 goto out;
2413
2414 leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]);
2415 leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]);
2416 /*
2417 * it is safe to delete this leaf, it contains only
2418 * csum items from this inode at an offset >= new_size
2419 */
2420 ret = btrfs_del_leaf(trans, root, path, leaf_start);
2421 BUG_ON(ret);
2422
2423 if (root->ref_cows && leaf_gen < trans->transid) {
2424 ref = btrfs_alloc_leaf_ref(root, 0);
2425 if (ref) {
2426 ref->root_gen = root->root_key.offset;
2427 ref->bytenr = leaf_start;
2428 ref->owner = 0;
2429 ref->generation = leaf_gen;
2430 ref->nritems = 0;
2431
2432 ret = btrfs_add_leaf_ref(root, ref, 0);
2433 WARN_ON(ret);
2434 btrfs_free_leaf_ref(root, ref);
2435 } else {
2436 WARN_ON(1);
2437 }
2438 }
2439next_key:
2440 btrfs_release_path(root, path);
2441
2442 if (other_key.objectid == inode->i_ino &&
2443 other_key.type == key.type && other_key.offset > key.offset) {
2444 key.offset = other_key.offset;
2445 cond_resched();
2446 goto again;
2447 }
2448 ret = 0;
2449out:
2450 /* fixup any changes we've made to the path */
2451 path->lowest_level = 0;
2452 path->keep_locks = 0;
2453 btrfs_release_path(root, path);
2454 return ret;
2455}
2456
2457#endif
2458
2459/*
2460 * this can truncate away extent items, csum items and directory items.
2461 * It starts at a high offset and removes keys until it can't find
2462 * any higher than new_size
2463 *
2464 * csum items that cross the new i_size are truncated to the new size
2465 * as well.
2466 *
2467 * min_type is the minimum key type to truncate down to. If set to 0, this
2468 * will kill all the items on this inode, including the INODE_ITEM_KEY.
2469 */
2470noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2471 struct btrfs_root *root,
2472 struct inode *inode,
2473 u64 new_size, u32 min_type)
2474{
2475 int ret;
2476 struct btrfs_path *path;
2477 struct btrfs_key key;
2478 struct btrfs_key found_key;
2479 u32 found_type;
2480 struct extent_buffer *leaf;
2481 struct btrfs_file_extent_item *fi;
2482 u64 extent_start = 0;
2483 u64 extent_num_bytes = 0;
2484 u64 item_end = 0;
2485 u64 root_gen = 0;
2486 u64 root_owner = 0;
2487 int found_extent;
2488 int del_item;
2489 int pending_del_nr = 0;
2490 int pending_del_slot = 0;
2491 int extent_type = -1;
2492 int encoding;
2493 u64 mask = root->sectorsize - 1;
2494
2495 if (root->ref_cows)
2496 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
2497 path = btrfs_alloc_path();
2498 path->reada = -1;
2499 BUG_ON(!path);
2500
2501 /* FIXME, add redo link to tree so we don't leak on crash */
2502 key.objectid = inode->i_ino;
2503 key.offset = (u64)-1;
2504 key.type = (u8)-1;
2505
2506 btrfs_init_path(path);
2507
2508search_again:
2509 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2510 if (ret < 0)
2511 goto error;
2512
2513 if (ret > 0) {
2514 /* there are no items in the tree for us to truncate, we're
2515 * done
2516 */
2517 if (path->slots[0] == 0) {
2518 ret = 0;
2519 goto error;
2520 }
2521 path->slots[0]--;
2522 }
2523
2524 while (1) {
2525 fi = NULL;
2526 leaf = path->nodes[0];
2527 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2528 found_type = btrfs_key_type(&found_key);
2529 encoding = 0;
2530
2531 if (found_key.objectid != inode->i_ino)
2532 break;
2533
2534 if (found_type < min_type)
2535 break;
2536
2537 item_end = found_key.offset;
2538 if (found_type == BTRFS_EXTENT_DATA_KEY) {
2539 fi = btrfs_item_ptr(leaf, path->slots[0],
2540 struct btrfs_file_extent_item);
2541 extent_type = btrfs_file_extent_type(leaf, fi);
2542 encoding = btrfs_file_extent_compression(leaf, fi);
2543 encoding |= btrfs_file_extent_encryption(leaf, fi);
2544 encoding |= btrfs_file_extent_other_encoding(leaf, fi);
2545
2546 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2547 item_end +=
2548 btrfs_file_extent_num_bytes(leaf, fi);
2549 } else {
2550 item_end += btrfs_file_extent_inline_len(leaf,
2551 fi);
2552 }
2553 item_end--;
2554 }
2555 if (item_end < new_size) {
2556 if (found_type == BTRFS_DIR_ITEM_KEY)
2557 found_type = BTRFS_INODE_ITEM_KEY;
2558 else if (found_type == BTRFS_EXTENT_ITEM_KEY)
2559 found_type = BTRFS_EXTENT_DATA_KEY;
2560 else if (found_type == BTRFS_EXTENT_DATA_KEY)
2561 found_type = BTRFS_XATTR_ITEM_KEY;
2562 else if (found_type == BTRFS_XATTR_ITEM_KEY)
2563 found_type = BTRFS_INODE_REF_KEY;
2564 else if (found_type)
2565 found_type--;
2566 else
2567 break;
2568 btrfs_set_key_type(&key, found_type);
2569 goto next;
2570 }
2571 if (found_key.offset >= new_size)
2572 del_item = 1;
2573 else
2574 del_item = 0;
2575 found_extent = 0;
2576
2577 /* FIXME, shrink the extent if the ref count is only 1 */
2578 if (found_type != BTRFS_EXTENT_DATA_KEY)
2579 goto delete;
2580
2581 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
2582 u64 num_dec;
2583 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
2584 if (!del_item && !encoding) {
2585 u64 orig_num_bytes =
2586 btrfs_file_extent_num_bytes(leaf, fi);
2587 extent_num_bytes = new_size -
2588 found_key.offset + root->sectorsize - 1;
2589 extent_num_bytes = extent_num_bytes &
2590 ~((u64)root->sectorsize - 1);
2591 btrfs_set_file_extent_num_bytes(leaf, fi,
2592 extent_num_bytes);
2593 num_dec = (orig_num_bytes -
2594 extent_num_bytes);
2595 if (root->ref_cows && extent_start != 0)
2596 inode_sub_bytes(inode, num_dec);
2597 btrfs_mark_buffer_dirty(leaf);
2598 } else {
2599 extent_num_bytes =
2600 btrfs_file_extent_disk_num_bytes(leaf,
2601 fi);
2602 /* FIXME blocksize != 4096 */
2603 num_dec = btrfs_file_extent_num_bytes(leaf, fi);
2604 if (extent_start != 0) {
2605 found_extent = 1;
2606 if (root->ref_cows)
2607 inode_sub_bytes(inode, num_dec);
2608 }
2609 root_gen = btrfs_header_generation(leaf);
2610 root_owner = btrfs_header_owner(leaf);
2611 }
2612 } else {
2613 /*
2614 * we can't truncate inline items that have had
2615 * special encodings
2616 */
2617 if (!del_item &&
2618 btrfs_file_extent_compression(leaf, fi) == 0 &&
2619 btrfs_file_extent_encryption(leaf, fi) == 0 &&
2620 btrfs_file_extent_other_encoding(leaf, fi) == 0) {
2621 u32 size = new_size - found_key.offset;
2622
2623 if (root->ref_cows) {
2624 inode_sub_bytes(inode, item_end + 1 -
2625 new_size);
2626 }
2627 size =
2628 btrfs_file_extent_calc_inline_size(size);
2629 ret = btrfs_truncate_item(trans, root, path,
2630 size, 1);
2631 BUG_ON(ret);
2632 } else if (root->ref_cows) {
2633 inode_sub_bytes(inode, item_end + 1 -
2634 found_key.offset);
2635 }
2636 }
2637delete:
2638 if (del_item) {
2639 if (!pending_del_nr) {
2640 /* no pending yet, add ourselves */
2641 pending_del_slot = path->slots[0];
2642 pending_del_nr = 1;
2643 } else if (pending_del_nr &&
2644 path->slots[0] + 1 == pending_del_slot) {
2645 /* hop on the pending chunk */
2646 pending_del_nr++;
2647 pending_del_slot = path->slots[0];
2648 } else {
2649 BUG();
2650 }
2651 } else {
2652 break;
2653 }
2654 if (found_extent) {
2655 ret = btrfs_free_extent(trans, root, extent_start,
2656 extent_num_bytes,
2657 leaf->start, root_owner,
2658 root_gen, inode->i_ino, 0);
2659 BUG_ON(ret);
2660 }
2661next:
2662 if (path->slots[0] == 0) {
2663 if (pending_del_nr)
2664 goto del_pending;
2665 btrfs_release_path(root, path);
2666 goto search_again;
2667 }
2668
2669 path->slots[0]--;
2670 if (pending_del_nr &&
2671 path->slots[0] + 1 != pending_del_slot) {
2672 struct btrfs_key debug;
2673del_pending:
2674 btrfs_item_key_to_cpu(path->nodes[0], &debug,
2675 pending_del_slot);
2676 ret = btrfs_del_items(trans, root, path,
2677 pending_del_slot,
2678 pending_del_nr);
2679 BUG_ON(ret);
2680 pending_del_nr = 0;
2681 btrfs_release_path(root, path);
2682 goto search_again;
2683 }
2684 }
2685 ret = 0;
2686error:
2687 if (pending_del_nr) {
2688 ret = btrfs_del_items(trans, root, path, pending_del_slot,
2689 pending_del_nr);
2690 }
2691 btrfs_free_path(path);
2692 inode->i_sb->s_dirt = 1;
2693 return ret;
2694}
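
/*
 * Editorial note on the pending_del batching above: the walk moves
 * backwards through each leaf, so doomed items end up in adjacent
 * slots.  They are accumulated only while path->slots[0] + 1 stays
 * equal to pending_del_slot and are then removed with a single
 * btrfs_del_items() call rather than one tree operation per item.
 */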
2695
2696/*
2697 * taken from block_truncate_page, but does COW as it zeros out
2698 * any bytes left in the last page in the file.
2699 */
2700static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
2701{
2702 struct inode *inode = mapping->host;
2703 struct btrfs_root *root = BTRFS_I(inode)->root;
2704 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2705 struct btrfs_ordered_extent *ordered;
2706 char *kaddr;
2707 u32 blocksize = root->sectorsize;
2708 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2709 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2710 struct page *page;
2711 int ret = 0;
2712 u64 page_start;
2713 u64 page_end;
2714
2715 if ((offset & (blocksize - 1)) == 0)
2716 goto out;
2717
2718 ret = -ENOMEM;
2719again:
2720 page = grab_cache_page(mapping, index);
2721 if (!page)
2722 goto out;
2723
2724 page_start = page_offset(page);
2725 page_end = page_start + PAGE_CACHE_SIZE - 1;
2726
2727 if (!PageUptodate(page)) {
2728 ret = btrfs_readpage(NULL, page);
2729 lock_page(page);
2730 if (page->mapping != mapping) {
2731 unlock_page(page);
2732 page_cache_release(page);
2733 goto again;
2734 }
2735 if (!PageUptodate(page)) {
2736 ret = -EIO;
2737 goto out_unlock;
2738 }
2739 }
2740 wait_on_page_writeback(page);
2741
2742 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
2743 set_page_extent_mapped(page);
2744
2745 ordered = btrfs_lookup_ordered_extent(inode, page_start);
2746 if (ordered) {
2747 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2748 unlock_page(page);
2749 page_cache_release(page);
2750 btrfs_start_ordered_extent(inode, ordered, 1);
2751 btrfs_put_ordered_extent(ordered);
2752 goto again;
2753 }
2754
2755 btrfs_set_extent_delalloc(inode, page_start, page_end);
2756 ret = 0;
2757 if (offset != PAGE_CACHE_SIZE) {
2758 kaddr = kmap(page);
2759 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2760 flush_dcache_page(page);
2761 kunmap(page);
2762 }
2763 ClearPageChecked(page);
2764 set_page_dirty(page);
2765 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2766
2767out_unlock:
2768 unlock_page(page);
2769 page_cache_release(page);
2770out:
2771 return ret;
2772}
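
/*
 * Editorial note: unlike block_truncate_page(), the zeroing above goes
 * through delalloc (btrfs_set_extent_delalloc() plus set_page_dirty())
 * so the partial block is COWed at writeback time.  For example, with
 * 4096-byte pages a truncate to 6000 bytes zeroes file bytes
 * 6000..8191 in the second page.
 */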
2773
2774int btrfs_cont_expand(struct inode *inode, loff_t size)
2775{
2776 struct btrfs_trans_handle *trans;
2777 struct btrfs_root *root = BTRFS_I(inode)->root;
2778 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2779 struct extent_map *em;
2780 u64 mask = root->sectorsize - 1;
2781 u64 hole_start = (inode->i_size + mask) & ~mask;
2782 u64 block_end = (size + mask) & ~mask;
2783 u64 last_byte;
2784 u64 cur_offset;
2785 u64 hole_size;
2786 int err;
2787
2788 if (size <= hole_start)
2789 return 0;
2790
2791 err = btrfs_check_free_space(root, 1, 0);
2792 if (err)
2793 return err;
2794
2795 btrfs_truncate_page(inode->i_mapping, inode->i_size);
2796
2797 while (1) {
2798 struct btrfs_ordered_extent *ordered;
2799 btrfs_wait_ordered_range(inode, hole_start,
2800 block_end - hole_start);
2801 lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2802 ordered = btrfs_lookup_ordered_extent(inode, hole_start);
2803 if (!ordered)
2804 break;
2805 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2806 btrfs_put_ordered_extent(ordered);
2807 }
2808
2809 trans = btrfs_start_transaction(root, 1);
2810 btrfs_set_trans_block_group(trans, inode);
2811
2812 cur_offset = hole_start;
2813 while (1) {
2814 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
2815 block_end - cur_offset, 0);
2816 BUG_ON(IS_ERR(em) || !em);
2817 last_byte = min(extent_map_end(em), block_end);
2818 last_byte = (last_byte + mask) & ~mask;
2819 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
2820 u64 hint_byte = 0;
2821 hole_size = last_byte - cur_offset;
2822 err = btrfs_drop_extents(trans, root, inode,
2823 cur_offset,
2824 cur_offset + hole_size,
2825 cur_offset, &hint_byte);
2826 if (err)
2827 break;
2828 err = btrfs_insert_file_extent(trans, root,
2829 inode->i_ino, cur_offset, 0,
2830 0, hole_size, 0, hole_size,
2831 0, 0, 0);
2832 btrfs_drop_extent_cache(inode, hole_start,
2833 last_byte - 1, 0);
2834 }
2835 free_extent_map(em);
2836 cur_offset = last_byte;
2837 if (err || cur_offset >= block_end)
2838 break;
2839 }
2840
2841 btrfs_end_transaction(trans, root);
2842 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
2843 return err;
2844}
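/*
 * Worked example of the alignment math above (a sketch, with an assumed
 * 4K sectorsize, so mask == 0xfff): expanding a file whose i_size is
 * 5000 bytes out to size 20000 gives
 *
 *	hole_start = (5000 + 4095) & ~4095 = 8192
 *	block_end  = (20000 + 4095) & ~4095 = 20480
 *
 * so btrfs_truncate_page() zeroes the tail of the block holding byte
 * 4999, and hole file extents are inserted to cover [8192, 20480)
 * wherever the extent map reports a vacancy.
 */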
2845
2846static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
2847{
2848 struct inode *inode = dentry->d_inode;
2849 int err;
2850
2851 err = inode_change_ok(inode, attr);
2852 if (err)
2853 return err;
2854
2855 if (S_ISREG(inode->i_mode) &&
2856 attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
2857 err = btrfs_cont_expand(inode, attr->ia_size);
2858 if (err)
2859 return err;
2860 }
2861
2862 err = inode_setattr(inode, attr);
2863
2864 if (!err && ((attr->ia_valid & ATTR_MODE)))
2865 err = btrfs_acl_chmod(inode);
2866 return err;
2867}
2868
2869void btrfs_delete_inode(struct inode *inode)
2870{
2871 struct btrfs_trans_handle *trans;
2872 struct btrfs_root *root = BTRFS_I(inode)->root;
2873 unsigned long nr;
2874 int ret;
2875
2876 truncate_inode_pages(&inode->i_data, 0);
2877 if (is_bad_inode(inode)) {
2878 btrfs_orphan_del(NULL, inode);
2879 goto no_delete;
2880 }
2881 btrfs_wait_ordered_range(inode, 0, (u64)-1);
2882
2883 btrfs_i_size_write(inode, 0);
2884 trans = btrfs_join_transaction(root, 1);
2885
2886 btrfs_set_trans_block_group(trans, inode);
2887 ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0);
2888 if (ret) {
2889 btrfs_orphan_del(NULL, inode);
2890 goto no_delete_lock;
2891 }
2892
2893 btrfs_orphan_del(trans, inode);
2894
2895 nr = trans->blocks_used;
2896 clear_inode(inode);
2897
2898 btrfs_end_transaction(trans, root);
2899 btrfs_btree_balance_dirty(root, nr);
2900 return;
2901
2902no_delete_lock:
2903 nr = trans->blocks_used;
2904 btrfs_end_transaction(trans, root);
2905 btrfs_btree_balance_dirty(root, nr);
2906no_delete:
2907 clear_inode(inode);
2908}
2909
2910/*
2911 * this copies the key found in the dir entry into the location pointer.
2912 * If no dir entries were found, location->objectid is 0.
2913 */
2914static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
2915 struct btrfs_key *location)
2916{
2917 const char *name = dentry->d_name.name;
2918 int namelen = dentry->d_name.len;
2919 struct btrfs_dir_item *di;
2920 struct btrfs_path *path;
2921 struct btrfs_root *root = BTRFS_I(dir)->root;
2922 int ret = 0;
2923
2924 path = btrfs_alloc_path();
2925 BUG_ON(!path);
2926
2927 di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
2928 namelen, 0);
2929 if (IS_ERR(di))
2930 ret = PTR_ERR(di);
2931
2932 if (!di || IS_ERR(di))
2933 goto out_err;
2934
2935 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
2936out:
2937 btrfs_free_path(path);
2938 return ret;
2939out_err:
2940 location->objectid = 0;
2941 goto out;
2942}
2943
2944/*
2945 * when we hit a tree root in a directory, the btrfs part of the inode
2946 * needs to be changed to reflect the root directory of the tree root. This
2947 * is kind of like crossing a mount point.
2948 */
2949static int fixup_tree_root_location(struct btrfs_root *root,
2950 struct btrfs_key *location,
2951 struct btrfs_root **sub_root,
2952 struct dentry *dentry)
2953{
2954 struct btrfs_root_item *ri;
2955
2956 if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
2957 return 0;
2958 if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
2959 return 0;
2960
2961 *sub_root = btrfs_read_fs_root(root->fs_info, location,
2962 dentry->d_name.name,
2963 dentry->d_name.len);
2964 if (IS_ERR(*sub_root))
2965 return PTR_ERR(*sub_root);
2966
2967 ri = &(*sub_root)->root_item;
2968 location->objectid = btrfs_root_dirid(ri);
2969 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
2970 location->offset = 0;
2971
2972 return 0;
2973}
2974
2975static noinline void init_btrfs_i(struct inode *inode)
2976{
2977 struct btrfs_inode *bi = BTRFS_I(inode);
2978
2979 bi->i_acl = NULL;
2980 bi->i_default_acl = NULL;
2981
2982 bi->generation = 0;
2983 bi->sequence = 0;
2984 bi->last_trans = 0;
2985 bi->logged_trans = 0;
2986 bi->delalloc_bytes = 0;
2987 bi->disk_i_size = 0;
2988 bi->flags = 0;
2989 bi->index_cnt = (u64)-1;
2990 bi->log_dirty_trans = 0;
2991 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
2992 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
2993 inode->i_mapping, GFP_NOFS);
2994 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
2995 inode->i_mapping, GFP_NOFS);
2996 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
2997 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
2998 mutex_init(&BTRFS_I(inode)->extent_mutex);
2999 mutex_init(&BTRFS_I(inode)->log_mutex);
3000}
3001
3002static int btrfs_init_locked_inode(struct inode *inode, void *p)
3003{
3004 struct btrfs_iget_args *args = p;
3005 inode->i_ino = args->ino;
3006 init_btrfs_i(inode);
3007 BTRFS_I(inode)->root = args->root;
3008 return 0;
3009}
3010
3011static int btrfs_find_actor(struct inode *inode, void *opaque)
3012{
3013 struct btrfs_iget_args *args = opaque;
3014 return args->ino == inode->i_ino &&
3015 args->root == BTRFS_I(inode)->root;
3016}
3017
3018struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
3019 struct btrfs_root *root, int wait)
3020{
3021 struct inode *inode;
3022 struct btrfs_iget_args args;
3023 args.ino = objectid;
3024 args.root = root;
3025
3026 if (wait) {
3027 inode = ilookup5(s, objectid, btrfs_find_actor,
3028 (void *)&args);
3029 } else {
3030 inode = ilookup5_nowait(s, objectid, btrfs_find_actor,
3031 (void *)&args);
3032 }
3033 return inode;
3034}
3035
3036struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
3037 struct btrfs_root *root)
3038{
3039 struct inode *inode;
3040 struct btrfs_iget_args args;
3041 args.ino = objectid;
3042 args.root = root;
3043
3044 inode = iget5_locked(s, objectid, btrfs_find_actor,
3045 btrfs_init_locked_inode,
3046 (void *)&args);
3047 return inode;
3048}
3049
3050/* Get an inode object given its location and corresponding root.
3051 * Sets *is_new to 1 if the inode was newly read from disk.
3052 */
3053struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3054 struct btrfs_root *root, int *is_new)
3055{
3056 struct inode *inode;
3057
3058 inode = btrfs_iget_locked(s, location->objectid, root);
3059 if (!inode)
3060 return ERR_PTR(-EACCES);
3061
3062 if (inode->i_state & I_NEW) {
3063 BTRFS_I(inode)->root = root;
3064 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
3065 btrfs_read_locked_inode(inode);
3066 unlock_new_inode(inode);
3067 if (is_new)
3068 *is_new = 1;
3069 } else {
3070 if (is_new)
3071 *is_new = 0;
3072 }
3073
3074 return inode;
3075}
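/*
 * Usage sketch for btrfs_iget() (hypothetical caller, not from this
 * file): look up an inode by objectid in a given root.
 *
 *	struct btrfs_key loc;
 *	int new = 0;
 *
 *	loc.objectid = ino;
 *	loc.offset = 0;
 *	btrfs_set_key_type(&loc, BTRFS_INODE_ITEM_KEY);
 *	inode = btrfs_iget(sb, &loc, root, &new);
 *	if (IS_ERR(inode))
 *		return PTR_ERR(inode);
 *
 * On return, new == 1 means the inode was just read from disk; the
 * reference must eventually be dropped with iput().
 */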
3076
3077struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3078{
3079 struct inode *inode;
3080 struct btrfs_inode *bi = BTRFS_I(dir);
3081 struct btrfs_root *root = bi->root;
3082 struct btrfs_root *sub_root = root;
3083 struct btrfs_key location;
3084 int ret, new;
3085
3086 if (dentry->d_name.len > BTRFS_NAME_LEN)
3087 return ERR_PTR(-ENAMETOOLONG);
3088
3089 ret = btrfs_inode_by_name(dir, dentry, &location);
3090
3091 if (ret < 0)
3092 return ERR_PTR(ret);
3093
3094 inode = NULL;
3095 if (location.objectid) {
3096 ret = fixup_tree_root_location(root, &location, &sub_root,
3097 dentry);
3098 if (ret < 0)
3099 return ERR_PTR(ret);
3100 if (ret > 0)
3101 return ERR_PTR(-ENOENT);
3102 inode = btrfs_iget(dir->i_sb, &location, sub_root, &new);
3103 if (IS_ERR(inode))
3104 return ERR_CAST(inode);
3105 }
3106 return inode;
3107}
3108
3109static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
3110 struct nameidata *nd)
3111{
3112 struct inode *inode;
3113
3114 if (dentry->d_name.len > BTRFS_NAME_LEN)
3115 return ERR_PTR(-ENAMETOOLONG);
3116
3117 inode = btrfs_lookup_dentry(dir, dentry);
3118 if (IS_ERR(inode))
3119 return ERR_CAST(inode);
3120
3121 return d_splice_alias(inode, dentry);
3122}
3123
3124static unsigned char btrfs_filetype_table[] = {
3125 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
3126};
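/*
 * The table above is indexed by the on-disk BTRFS_FT_* value stored in
 * each dir item, e.g. BTRFS_FT_REG_FILE (1) maps to DT_REG and
 * BTRFS_FT_DIR (2) maps to DT_DIR, so readdir can hand the VFS a d_type
 * without reading the target inode.
 */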
3127
3128static int btrfs_real_readdir(struct file *filp, void *dirent,
3129 filldir_t filldir)
3130{
3131 struct inode *inode = filp->f_dentry->d_inode;
3132 struct btrfs_root *root = BTRFS_I(inode)->root;
3133 struct btrfs_item *item;
3134 struct btrfs_dir_item *di;
3135 struct btrfs_key key;
3136 struct btrfs_key found_key;
3137 struct btrfs_path *path;
3138 int ret;
3139 u32 nritems;
3140 struct extent_buffer *leaf;
3141 int slot;
3142 int advance;
3143 unsigned char d_type;
3144 int over = 0;
3145 u32 di_cur;
3146 u32 di_total;
3147 u32 di_len;
3148 int key_type = BTRFS_DIR_INDEX_KEY;
3149 char tmp_name[32];
3150 char *name_ptr;
3151 int name_len;
3152
3153 /* FIXME, use a real flag for deciding about the key type */
3154 if (root->fs_info->tree_root == root)
3155 key_type = BTRFS_DIR_ITEM_KEY;
3156
3157 /* special case for "." */
3158 if (filp->f_pos == 0) {
3159 over = filldir(dirent, ".", 1,
3160 1, inode->i_ino,
3161 DT_DIR);
3162 if (over)
3163 return 0;
3164 filp->f_pos = 1;
3165 }
3166 /* special case for .., just use the back ref */
3167 if (filp->f_pos == 1) {
3168 u64 pino = parent_ino(filp->f_path.dentry);
3169 over = filldir(dirent, "..", 2,
3170 2, pino, DT_DIR);
3171 if (over)
3172 return 0;
3173 filp->f_pos = 2;
3174 }
3175	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
3176	path->reada = 2;
3177
3178 btrfs_set_key_type(&key, key_type);
3179 key.offset = filp->f_pos;
3180 key.objectid = inode->i_ino;
3181
3182 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3183 if (ret < 0)
3184 goto err;
3185 advance = 0;
3186
3187 while (1) {
3188 leaf = path->nodes[0];
3189 nritems = btrfs_header_nritems(leaf);
3190 slot = path->slots[0];
3191 if (advance || slot >= nritems) {
3192 if (slot >= nritems - 1) {
3193 ret = btrfs_next_leaf(root, path);
3194 if (ret)
3195 break;
3196 leaf = path->nodes[0];
3197 nritems = btrfs_header_nritems(leaf);
3198 slot = path->slots[0];
3199 } else {
3200 slot++;
3201 path->slots[0]++;
3202 }
3203 }
3204
3205 advance = 1;
3206 item = btrfs_item_nr(leaf, slot);
3207 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3208
3209 if (found_key.objectid != key.objectid)
3210 break;
3211 if (btrfs_key_type(&found_key) != key_type)
3212 break;
3213 if (found_key.offset < filp->f_pos)
3214 continue;
3215
3216 filp->f_pos = found_key.offset;
3217
3218 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
3219 di_cur = 0;
3220 di_total = btrfs_item_size(leaf, item);
3221
3222 while (di_cur < di_total) {
3223 struct btrfs_key location;
3224
3225 name_len = btrfs_dir_name_len(leaf, di);
3226 if (name_len <= sizeof(tmp_name)) {
3227 name_ptr = tmp_name;
3228 } else {
3229 name_ptr = kmalloc(name_len, GFP_NOFS);
3230 if (!name_ptr) {
3231 ret = -ENOMEM;
3232 goto err;
3233 }
3234 }
3235 read_extent_buffer(leaf, name_ptr,
3236 (unsigned long)(di + 1), name_len);
3237
3238 d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
3239 btrfs_dir_item_key_to_cpu(leaf, di, &location);
3240
3241 /* is this a reference to our own snapshot? If so
3242 * skip it
3243 */
3244 if (location.type == BTRFS_ROOT_ITEM_KEY &&
3245 location.objectid == root->root_key.objectid) {
3246 over = 0;
3247 goto skip;
3248 }
3249 over = filldir(dirent, name_ptr, name_len,
3250 found_key.offset, location.objectid,
3251 d_type);
3252
3253skip:
3254 if (name_ptr != tmp_name)
3255 kfree(name_ptr);
3256
3257 if (over)
3258 goto nopos;
3259 di_len = btrfs_dir_name_len(leaf, di) +
3260 btrfs_dir_data_len(leaf, di) + sizeof(*di);
3261 di_cur += di_len;
3262 di = (struct btrfs_dir_item *)((char *)di + di_len);
3263 }
3264 }
3265
3266 /* Reached end of directory/root. Bump pos past the last item. */
3267 if (key_type == BTRFS_DIR_INDEX_KEY)
3268 filp->f_pos = INT_LIMIT(typeof(filp->f_pos));
3269 else
3270 filp->f_pos++;
3271nopos:
3272 ret = 0;
3273err:
3274 btrfs_free_path(path);
3275 return ret;
3276}
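/*
 * Note on the f_pos convention used above: "." is synthesized at f_pos
 * 0 and ".." at f_pos 1, while real entries are keyed by their
 * DIR_INDEX offset, which always starts at 2 (see
 * btrfs_set_inode_index_count below).  A directory with three entries
 * therefore returns them at positions 2, 3 and 4, and once the index
 * keys run out f_pos is parked at INT_LIMIT so a rescan stops
 * immediately.
 */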
3277
3278int btrfs_write_inode(struct inode *inode, int wait)
3279{
3280 struct btrfs_root *root = BTRFS_I(inode)->root;
3281 struct btrfs_trans_handle *trans;
3282 int ret = 0;
3283
3284 if (root->fs_info->btree_inode == inode)
3285 return 0;
3286
3287 if (wait) {
3288 trans = btrfs_join_transaction(root, 1);
3289 btrfs_set_trans_block_group(trans, inode);
3290 ret = btrfs_commit_transaction(trans, root);
3291 }
3292 return ret;
3293}
3294
3295/*
3296 * This is somewhat expensive, updating the tree every time the
3297 * inode changes. But, it is most likely to find the inode in cache.
3298 * FIXME, needs more benchmarking...there are no reasons other than performance
3299 * to keep or drop this code.
3300 */
3301void btrfs_dirty_inode(struct inode *inode)
3302{
3303 struct btrfs_root *root = BTRFS_I(inode)->root;
3304 struct btrfs_trans_handle *trans;
3305
3306 trans = btrfs_join_transaction(root, 1);
3307 btrfs_set_trans_block_group(trans, inode);
3308 btrfs_update_inode(trans, root, inode);
3309 btrfs_end_transaction(trans, root);
3310}
3311
3312/*
3313 * find the highest existing sequence number in a directory
3314 * and then set the in-memory index_cnt variable to reflect
3315 * free sequence numbers
3316 */
3317static int btrfs_set_inode_index_count(struct inode *inode)
3318{
3319 struct btrfs_root *root = BTRFS_I(inode)->root;
3320 struct btrfs_key key, found_key;
3321 struct btrfs_path *path;
3322 struct extent_buffer *leaf;
3323 int ret;
3324
3325 key.objectid = inode->i_ino;
3326 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
3327 key.offset = (u64)-1;
3328
3329 path = btrfs_alloc_path();
3330 if (!path)
3331 return -ENOMEM;
3332
3333 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3334 if (ret < 0)
3335 goto out;
3336 /* FIXME: we should be able to handle this */
3337 if (ret == 0)
3338 goto out;
3339 ret = 0;
3340
3341 /*
3342 * MAGIC NUMBER EXPLANATION:
3343 * since we search a directory based on f_pos we have to start at 2
3344 * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
3345 * else has to start at 2
3346 */
3347 if (path->slots[0] == 0) {
3348 BTRFS_I(inode)->index_cnt = 2;
3349 goto out;
3350 }
3351
3352 path->slots[0]--;
3353
3354 leaf = path->nodes[0];
3355 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3356
3357 if (found_key.objectid != inode->i_ino ||
3358 btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
3359 BTRFS_I(inode)->index_cnt = 2;
3360 goto out;
3361 }
3362
3363 BTRFS_I(inode)->index_cnt = found_key.offset + 1;
3364out:
3365 btrfs_free_path(path);
3366 return ret;
3367}
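/*
 * Example (a sketch): in a directory whose highest existing DIR_INDEX
 * key has offset 41, the search above lands on that key and index_cnt
 * becomes 42, so the next entry created in the directory gets sequence
 * number 42.  A directory with no index items at all starts at 2,
 * leaving 0 and 1 free for "." and "..".
 */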
3368
3369/*
3370 * helper to find a free sequence number in a given directory.  The current
3371 * code is very simple; later versions will do smarter things in the btree.
3372 */
3373int btrfs_set_inode_index(struct inode *dir, u64 *index)
3374{
3375 int ret = 0;
3376
3377 if (BTRFS_I(dir)->index_cnt == (u64)-1) {
3378 ret = btrfs_set_inode_index_count(dir);
3379 if (ret)
3380 return ret;
3381 }
3382
3383 *index = BTRFS_I(dir)->index_cnt;
3384 BTRFS_I(dir)->index_cnt++;
3385
3386 return ret;
3387}
3388
3389static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
3390 struct btrfs_root *root,
3391 struct inode *dir,
3392 const char *name, int name_len,
3393 u64 ref_objectid, u64 objectid,
3394 u64 alloc_hint, int mode, u64 *index)
3395{
3396 struct inode *inode;
3397 struct btrfs_inode_item *inode_item;
3398 struct btrfs_key *location;
3399 struct btrfs_path *path;
3400 struct btrfs_inode_ref *ref;
3401 struct btrfs_key key[2];
3402 u32 sizes[2];
3403 unsigned long ptr;
3404 int ret;
3405 int owner;
3406
3407 path = btrfs_alloc_path();
3408 BUG_ON(!path);
3409
3410 inode = new_inode(root->fs_info->sb);
3411 if (!inode)
3412 return ERR_PTR(-ENOMEM);
3413
3414 if (dir) {
3415 ret = btrfs_set_inode_index(dir, index);
3416 if (ret)
3417 return ERR_PTR(ret);
3418 }
3419 /*
3420 * index_cnt is ignored for everything but a dir,
3421	 * btrfs_set_inode_index_count has an explanation for the magic
3422 * number
3423 */
3424 init_btrfs_i(inode);
3425 BTRFS_I(inode)->index_cnt = 2;
3426 BTRFS_I(inode)->root = root;
3427 BTRFS_I(inode)->generation = trans->transid;
3428
3429	if (S_ISDIR(mode))
3430 owner = 0;
3431 else
3432 owner = 1;
3433 BTRFS_I(inode)->block_group =
3434 btrfs_find_block_group(root, 0, alloc_hint, owner);
3435	if (S_ISREG(mode)) {
3436 if (btrfs_test_opt(root, NODATASUM))
3437 btrfs_set_flag(inode, NODATASUM);
3438 if (btrfs_test_opt(root, NODATACOW))
3439 btrfs_set_flag(inode, NODATACOW);
3440 }
3441
3442 key[0].objectid = objectid;
3443 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
3444 key[0].offset = 0;
3445
3446 key[1].objectid = objectid;
3447 btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
3448 key[1].offset = ref_objectid;
3449
3450 sizes[0] = sizeof(struct btrfs_inode_item);
3451 sizes[1] = name_len + sizeof(*ref);
3452
3453 ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
3454 if (ret != 0)
3455 goto fail;
3456
3457 if (objectid > root->highest_inode)
3458 root->highest_inode = objectid;
3459
3460 inode->i_uid = current_fsuid();
3461 inode->i_gid = current_fsgid();
3462 inode->i_mode = mode;
3463 inode->i_ino = objectid;
3464 inode_set_bytes(inode, 0);
3465 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
3466 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3467 struct btrfs_inode_item);
3468 fill_inode_item(trans, path->nodes[0], inode_item, inode);
3469
3470 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
3471 struct btrfs_inode_ref);
3472 btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
3473 btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
3474 ptr = (unsigned long)(ref + 1);
3475 write_extent_buffer(path->nodes[0], name, ptr, name_len);
3476
3477 btrfs_mark_buffer_dirty(path->nodes[0]);
3478 btrfs_free_path(path);
3479
3480 location = &BTRFS_I(inode)->location;
3481 location->objectid = objectid;
3482 location->offset = 0;
3483 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
3484
3485 insert_inode_hash(inode);
3486 return inode;
3487fail:
3488 if (dir)
3489 BTRFS_I(dir)->index_cnt--;
3490 btrfs_free_path(path);
3491 return ERR_PTR(ret);
3492}
3493
3494static inline u8 btrfs_inode_type(struct inode *inode)
3495{
3496 return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
3497}
3498
3499/*
3500 * utility function to add 'inode' into 'parent_inode' with
3501 * a given name and a given sequence number.
3502 * if 'add_backref' is true, also insert a backref from the
3503 * inode to the parent directory.
3504 */
3505int btrfs_add_link(struct btrfs_trans_handle *trans,
3506 struct inode *parent_inode, struct inode *inode,
3507 const char *name, int name_len, int add_backref, u64 index)
3508{
3509 int ret;
3510 struct btrfs_key key;
3511 struct btrfs_root *root = BTRFS_I(parent_inode)->root;
3512
3513 key.objectid = inode->i_ino;
3514 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
3515 key.offset = 0;
3516
3517 ret = btrfs_insert_dir_item(trans, root, name, name_len,
3518 parent_inode->i_ino,
3519 &key, btrfs_inode_type(inode),
3520 index);
3521 if (ret == 0) {
3522 if (add_backref) {
3523 ret = btrfs_insert_inode_ref(trans, root,
3524 name, name_len,
3525 inode->i_ino,
3526 parent_inode->i_ino,
3527 index);
3528 }
3529 btrfs_i_size_write(parent_inode, parent_inode->i_size +
3530 name_len * 2);
3531 parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
3532 ret = btrfs_update_inode(trans, root, parent_inode);
3533 }
3534 return ret;
3535}
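/*
 * Example of the directory size accounting above (a sketch): every link
 * costs both a dir item and a dir index item, each carrying one copy of
 * the name, which is why i_size grows by name_len * 2.  Linking "foo"
 * into a directory therefore grows the parent's i_size by 6 bytes.
 */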
3536
3537static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
3538 struct dentry *dentry, struct inode *inode,
3539 int backref, u64 index)
3540{
3541 int err = btrfs_add_link(trans, dentry->d_parent->d_inode,
3542 inode, dentry->d_name.name,
3543 dentry->d_name.len, backref, index);
3544 if (!err) {
3545 d_instantiate(dentry, inode);
3546 return 0;
3547 }
3548 if (err > 0)
3549 err = -EEXIST;
3550 return err;
3551}
3552
3553static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3554 int mode, dev_t rdev)
3555{
3556 struct btrfs_trans_handle *trans;
3557 struct btrfs_root *root = BTRFS_I(dir)->root;
3558 struct inode *inode = NULL;
3559 int err;
3560 int drop_inode = 0;
3561 u64 objectid;
3562 unsigned long nr = 0;
3563 u64 index = 0;
3564
3565 if (!new_valid_dev(rdev))
3566 return -EINVAL;
3567
3568 err = btrfs_check_free_space(root, 1, 0);
3569 if (err)
3570 goto fail;
3571
3572 trans = btrfs_start_transaction(root, 1);
3573 btrfs_set_trans_block_group(trans, dir);
3574
3575 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3576 if (err) {
3577 err = -ENOSPC;
3578 goto out_unlock;
3579 }
3580
3581 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3582 dentry->d_name.len,
3583 dentry->d_parent->d_inode->i_ino, objectid,
3584 BTRFS_I(dir)->block_group, mode, &index);
3585 err = PTR_ERR(inode);
3586 if (IS_ERR(inode))
3587 goto out_unlock;
3588
3589 err = btrfs_init_acl(inode, dir);
3590 if (err) {
3591 drop_inode = 1;
3592 goto out_unlock;
3593 }
3594
3595 btrfs_set_trans_block_group(trans, inode);
3596 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
3597 if (err)
3598 drop_inode = 1;
3599 else {
3600 inode->i_op = &btrfs_special_inode_operations;
3601 init_special_inode(inode, inode->i_mode, rdev);
3602 btrfs_update_inode(trans, root, inode);
3603 }
3604 dir->i_sb->s_dirt = 1;
3605 btrfs_update_inode_block_group(trans, inode);
3606 btrfs_update_inode_block_group(trans, dir);
3607out_unlock:
3608 nr = trans->blocks_used;
3609 btrfs_end_transaction_throttle(trans, root);
3610fail:
3611 if (drop_inode) {
3612 inode_dec_link_count(inode);
3613 iput(inode);
3614 }
3615 btrfs_btree_balance_dirty(root, nr);
3616 return err;
3617}
3618
3619static int btrfs_create(struct inode *dir, struct dentry *dentry,
3620 int mode, struct nameidata *nd)
3621{
3622 struct btrfs_trans_handle *trans;
3623 struct btrfs_root *root = BTRFS_I(dir)->root;
3624 struct inode *inode = NULL;
3625 int err;
3626 int drop_inode = 0;
3627 unsigned long nr = 0;
3628 u64 objectid;
3629 u64 index = 0;
3630
3631 err = btrfs_check_free_space(root, 1, 0);
3632 if (err)
3633 goto fail;
3634 trans = btrfs_start_transaction(root, 1);
3635 btrfs_set_trans_block_group(trans, dir);
3636
3637 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3638 if (err) {
3639 err = -ENOSPC;
3640 goto out_unlock;
3641 }
3642
3643 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3644 dentry->d_name.len,
3645 dentry->d_parent->d_inode->i_ino,
3646 objectid, BTRFS_I(dir)->block_group, mode,
3647 &index);
3648 err = PTR_ERR(inode);
3649 if (IS_ERR(inode))
3650 goto out_unlock;
3651
3652 err = btrfs_init_acl(inode, dir);
3653 if (err) {
3654 drop_inode = 1;
3655 goto out_unlock;
3656 }
3657
3658 btrfs_set_trans_block_group(trans, inode);
3659 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
3660 if (err)
3661 drop_inode = 1;
3662 else {
3663 inode->i_mapping->a_ops = &btrfs_aops;
3664 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3665 inode->i_fop = &btrfs_file_operations;
3666 inode->i_op = &btrfs_file_inode_operations;
3667 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3668 }
3669 dir->i_sb->s_dirt = 1;
3670 btrfs_update_inode_block_group(trans, inode);
3671 btrfs_update_inode_block_group(trans, dir);
3672out_unlock:
3673 nr = trans->blocks_used;
3674 btrfs_end_transaction_throttle(trans, root);
3675fail:
3676 if (drop_inode) {
3677 inode_dec_link_count(inode);
3678 iput(inode);
3679 }
3680 btrfs_btree_balance_dirty(root, nr);
3681 return err;
3682}
3683
3684static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
3685 struct dentry *dentry)
3686{
3687 struct btrfs_trans_handle *trans;
3688 struct btrfs_root *root = BTRFS_I(dir)->root;
3689 struct inode *inode = old_dentry->d_inode;
3690 u64 index;
3691 unsigned long nr = 0;
3692 int err;
3693 int drop_inode = 0;
3694
3695 if (inode->i_nlink == 0)
3696 return -ENOENT;
3697
3698 btrfs_inc_nlink(inode);
3699 err = btrfs_check_free_space(root, 1, 0);
3700 if (err)
3701 goto fail;
3702 err = btrfs_set_inode_index(dir, &index);
3703 if (err)
3704 goto fail;
3705
3706 trans = btrfs_start_transaction(root, 1);
3707
3708 btrfs_set_trans_block_group(trans, dir);
3709 atomic_inc(&inode->i_count);
3710
3711 err = btrfs_add_nondir(trans, dentry, inode, 1, index);
3712
3713 if (err)
3714 drop_inode = 1;
3715
3716 dir->i_sb->s_dirt = 1;
3717 btrfs_update_inode_block_group(trans, dir);
3718 err = btrfs_update_inode(trans, root, inode);
3719
3720 if (err)
3721 drop_inode = 1;
3722
3723 nr = trans->blocks_used;
3724 btrfs_end_transaction_throttle(trans, root);
3725fail:
3726 if (drop_inode) {
3727 inode_dec_link_count(inode);
3728 iput(inode);
3729 }
3730 btrfs_btree_balance_dirty(root, nr);
3731 return err;
3732}
3733
3734static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
3735{
3736 struct inode *inode = NULL;
3737 struct btrfs_trans_handle *trans;
3738 struct btrfs_root *root = BTRFS_I(dir)->root;
3739 int err = 0;
3740 int drop_on_err = 0;
3741 u64 objectid = 0;
3742 u64 index = 0;
3743 unsigned long nr = 1;
3744
3745 err = btrfs_check_free_space(root, 1, 0);
3746 if (err)
3747 goto out_unlock;
3748
3749	trans = btrfs_start_transaction(root, 1);
3750	if (IS_ERR(trans)) {
3751		err = PTR_ERR(trans);
3752		goto out_unlock;
3753	}
3754
3755	btrfs_set_trans_block_group(trans, dir);
3756
3757 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
3758 if (err) {
3759 err = -ENOSPC;
3760 goto out_unlock;
3761 }
3762
3763 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
3764 dentry->d_name.len,
3765 dentry->d_parent->d_inode->i_ino, objectid,
3766 BTRFS_I(dir)->block_group, S_IFDIR | mode,
3767 &index);
3768 if (IS_ERR(inode)) {
3769 err = PTR_ERR(inode);
3770 goto out_fail;
3771 }
3772
3773 drop_on_err = 1;
3774
3775 err = btrfs_init_acl(inode, dir);
3776 if (err)
3777 goto out_fail;
3778
3779 inode->i_op = &btrfs_dir_inode_operations;
3780 inode->i_fop = &btrfs_dir_file_operations;
3781 btrfs_set_trans_block_group(trans, inode);
3782
3783 btrfs_i_size_write(inode, 0);
3784 err = btrfs_update_inode(trans, root, inode);
3785 if (err)
3786 goto out_fail;
3787
3788 err = btrfs_add_link(trans, dentry->d_parent->d_inode,
3789 inode, dentry->d_name.name,
3790 dentry->d_name.len, 0, index);
3791 if (err)
3792 goto out_fail;
3793
3794 d_instantiate(dentry, inode);
3795 drop_on_err = 0;
3796 dir->i_sb->s_dirt = 1;
3797 btrfs_update_inode_block_group(trans, inode);
3798 btrfs_update_inode_block_group(trans, dir);
3799
3800out_fail:
3801 nr = trans->blocks_used;
3802 btrfs_end_transaction_throttle(trans, root);
3803
3804out_unlock:
3805 if (drop_on_err)
3806 iput(inode);
3807 btrfs_btree_balance_dirty(root, nr);
3808 return err;
3809}
3810
3811/* helper for btrfs_get_extent.  Given an existing extent in the tree,
3812 * and an extent that you want to insert, deal with overlap and insert
3813 * the new extent into the tree.
3814 */
3815static int merge_extent_mapping(struct extent_map_tree *em_tree,
3816 struct extent_map *existing,
3817 struct extent_map *em,
3818 u64 map_start, u64 map_len)
3819{
3820 u64 start_diff;
3821
3822 BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
3823 start_diff = map_start - em->start;
3824 em->start = map_start;
3825 em->len = map_len;
3826 if (em->block_start < EXTENT_MAP_LAST_BYTE &&
3827 !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
3828 em->block_start += start_diff;
3829 em->block_len -= start_diff;
3830 }
3831 return add_extent_mapping(em_tree, em);
3832}
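/*
 * Worked example for merge_extent_mapping() (a sketch with assumed
 * numbers): if the caller asks for map_start 12288 with map_len 4096
 * against an em covering [8192, 24576) whose disk block_start is B,
 * then
 *
 *	start_diff = 12288 - 8192 = 4096
 *
 * and em is clipped to [12288, 16384) with block_start B + 4096, so the
 * logical and physical offsets stay in sync.  Compressed extents keep
 * their original block_start/block_len because the on-disk bytes cover
 * the whole logical range.
 */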
3833
3834static noinline int uncompress_inline(struct btrfs_path *path,
3835 struct inode *inode, struct page *page,
3836 size_t pg_offset, u64 extent_offset,
3837 struct btrfs_file_extent_item *item)
3838{
3839 int ret;
3840 struct extent_buffer *leaf = path->nodes[0];
3841 char *tmp;
3842 size_t max_size;
3843 unsigned long inline_size;
3844 unsigned long ptr;
3845
3846 WARN_ON(pg_offset != 0);
3847 max_size = btrfs_file_extent_ram_bytes(leaf, item);
3848 inline_size = btrfs_file_extent_inline_item_len(leaf,
3849 btrfs_item_nr(leaf, path->slots[0]));
3850	tmp = kmalloc(inline_size, GFP_NOFS);
	if (!tmp)
		return -ENOMEM;
3851	ptr = btrfs_file_extent_inline_start(item);
3852
3853 read_extent_buffer(leaf, tmp, ptr, inline_size);
3854
3855 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
3856 ret = btrfs_zlib_decompress(tmp, page, extent_offset,
3857 inline_size, max_size);
3858 if (ret) {
3859 char *kaddr = kmap_atomic(page, KM_USER0);
3860 unsigned long copy_size = min_t(u64,
3861 PAGE_CACHE_SIZE - pg_offset,
3862 max_size - extent_offset);
3863 memset(kaddr + pg_offset, 0, copy_size);
3864 kunmap_atomic(kaddr, KM_USER0);
3865 }
3866 kfree(tmp);
3867 return 0;
3868}
3869
3870/*
3871 * a bit scary, this does extent mapping from logical file offset to the disk.
3872 * the ugly parts come from merging extents from the disk with the in-ram
3873 * representation. This gets more complex because of the data=ordered code,
3874 * where the in-ram extents might be locked pending data=ordered completion.
3875 *
3876 * This also copies inline extents directly into the page.
3877 */
3878
3879struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
3880 size_t pg_offset, u64 start, u64 len,
3881 int create)
3882{
3883 int ret;
3884 int err = 0;
3885 u64 bytenr;
3886 u64 extent_start = 0;
3887 u64 extent_end = 0;
3888 u64 objectid = inode->i_ino;
3889 u32 found_type;
3890 struct btrfs_path *path = NULL;
3891 struct btrfs_root *root = BTRFS_I(inode)->root;
3892 struct btrfs_file_extent_item *item;
3893 struct extent_buffer *leaf;
3894 struct btrfs_key found_key;
3895 struct extent_map *em = NULL;
3896 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3897 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3898 struct btrfs_trans_handle *trans = NULL;
3899 int compressed;
3900
3901again:
3902 spin_lock(&em_tree->lock);
3903 em = lookup_extent_mapping(em_tree, start, len);
3904 if (em)
3905 em->bdev = root->fs_info->fs_devices->latest_bdev;
3906 spin_unlock(&em_tree->lock);
3907
3908 if (em) {
3909 if (em->start > start || em->start + em->len <= start)
3910 free_extent_map(em);
3911 else if (em->block_start == EXTENT_MAP_INLINE && page)
3912 free_extent_map(em);
3913 else
3914 goto out;
3915 }
3916 em = alloc_extent_map(GFP_NOFS);
3917 if (!em) {
3918 err = -ENOMEM;
3919 goto out;
3920 }
3921 em->bdev = root->fs_info->fs_devices->latest_bdev;
3922 em->start = EXTENT_MAP_HOLE;
3923 em->orig_start = EXTENT_MAP_HOLE;
3924 em->len = (u64)-1;
3925 em->block_len = (u64)-1;
3926
3927 if (!path) {
3928 path = btrfs_alloc_path();
3929 BUG_ON(!path);
3930 }
3931
3932 ret = btrfs_lookup_file_extent(trans, root, path,
3933 objectid, start, trans != NULL);
3934 if (ret < 0) {
3935 err = ret;
3936 goto out;
3937 }
3938
3939 if (ret != 0) {
3940 if (path->slots[0] == 0)
3941 goto not_found;
3942 path->slots[0]--;
3943 }
3944
3945 leaf = path->nodes[0];
3946 item = btrfs_item_ptr(leaf, path->slots[0],
3947 struct btrfs_file_extent_item);
3948 /* are we inside the extent that was found? */
3949 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3950 found_type = btrfs_key_type(&found_key);
3951 if (found_key.objectid != objectid ||
3952 found_type != BTRFS_EXTENT_DATA_KEY) {
3953 goto not_found;
3954 }
3955
3956 found_type = btrfs_file_extent_type(leaf, item);
3957 extent_start = found_key.offset;
3958 compressed = btrfs_file_extent_compression(leaf, item);
3959 if (found_type == BTRFS_FILE_EXTENT_REG ||
3960 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
3961 extent_end = extent_start +
3962 btrfs_file_extent_num_bytes(leaf, item);
3963 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
3964 size_t size;
3965 size = btrfs_file_extent_inline_len(leaf, item);
3966 extent_end = (extent_start + size + root->sectorsize - 1) &
3967 ~((u64)root->sectorsize - 1);
3968 }
3969
3970 if (start >= extent_end) {
3971 path->slots[0]++;
3972 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3973 ret = btrfs_next_leaf(root, path);
3974 if (ret < 0) {
3975 err = ret;
3976 goto out;
3977 }
3978 if (ret > 0)
3979 goto not_found;
3980 leaf = path->nodes[0];
3981 }
3982 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3983 if (found_key.objectid != objectid ||
3984 found_key.type != BTRFS_EXTENT_DATA_KEY)
3985 goto not_found;
3986 if (start + len <= found_key.offset)
3987 goto not_found;
3988 em->start = start;
3989 em->len = found_key.offset - start;
3990 goto not_found_em;
3991 }
3992
3993 if (found_type == BTRFS_FILE_EXTENT_REG ||
3994 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
3995 em->start = extent_start;
3996 em->len = extent_end - extent_start;
3997 em->orig_start = extent_start -
3998 btrfs_file_extent_offset(leaf, item);
3999 bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
4000 if (bytenr == 0) {
4001 em->block_start = EXTENT_MAP_HOLE;
4002 goto insert;
4003 }
4004 if (compressed) {
4005 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
4006 em->block_start = bytenr;
4007 em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
4008 item);
4009 } else {
4010 bytenr += btrfs_file_extent_offset(leaf, item);
4011 em->block_start = bytenr;
4012 em->block_len = em->len;
4013 if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
4014 set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
4015 }
4016 goto insert;
4017 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
4018 unsigned long ptr;
4019 char *map;
4020 size_t size;
4021 size_t extent_offset;
4022 size_t copy_size;
4023
4024 em->block_start = EXTENT_MAP_INLINE;
4025 if (!page || create) {
4026 em->start = extent_start;
4027 em->len = extent_end - extent_start;
4028 goto out;
4029 }
4030
4031 size = btrfs_file_extent_inline_len(leaf, item);
4032 extent_offset = page_offset(page) + pg_offset - extent_start;
4033 copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
4034 size - extent_offset);
4035 em->start = extent_start + extent_offset;
4036 em->len = (copy_size + root->sectorsize - 1) &
4037 ~((u64)root->sectorsize - 1);
4038 em->orig_start = EXTENT_MAP_INLINE;
4039 if (compressed)
4040 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
4041 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
4042 if (create == 0 && !PageUptodate(page)) {
4043 if (btrfs_file_extent_compression(leaf, item) ==
4044 BTRFS_COMPRESS_ZLIB) {
4045 ret = uncompress_inline(path, inode, page,
4046 pg_offset,
4047 extent_offset, item);
4048 BUG_ON(ret);
4049 } else {
4050 map = kmap(page);
4051 read_extent_buffer(leaf, map + pg_offset, ptr,
4052 copy_size);
4053 kunmap(page);
4054 }
4055 flush_dcache_page(page);
4056 } else if (create && PageUptodate(page)) {
4057			if (!trans) {
4058				/* the page is not kmapped on this path */
4059 free_extent_map(em);
4060 em = NULL;
4061 btrfs_release_path(root, path);
4062 trans = btrfs_join_transaction(root, 1);
4063 goto again;
4064 }
4065 map = kmap(page);
4066 write_extent_buffer(leaf, map + pg_offset, ptr,
4067 copy_size);
4068 kunmap(page);
4069 btrfs_mark_buffer_dirty(leaf);
4070 }
4071 set_extent_uptodate(io_tree, em->start,
4072 extent_map_end(em) - 1, GFP_NOFS);
4073 goto insert;
4074 } else {
4075 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
4076 WARN_ON(1);
4077 }
4078not_found:
4079 em->start = start;
4080 em->len = len;
4081not_found_em:
4082 em->block_start = EXTENT_MAP_HOLE;
4083 set_bit(EXTENT_FLAG_VACANCY, &em->flags);
4084insert:
4085 btrfs_release_path(root, path);
4086 if (em->start > start || extent_map_end(em) <= start) {
4087 printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed "
4088 "[%llu %llu]\n", (unsigned long long)em->start,
4089 (unsigned long long)em->len,
4090 (unsigned long long)start,
4091 (unsigned long long)len);
4092 err = -EIO;
4093 goto out;
4094 }
4095
4096 err = 0;
4097 spin_lock(&em_tree->lock);
4098 ret = add_extent_mapping(em_tree, em);
4099 /* it is possible that someone inserted the extent into the tree
4100 * while we had the lock dropped. It is also possible that
4101 * an overlapping map exists in the tree
4102 */
4103 if (ret == -EEXIST) {
4104 struct extent_map *existing;
4105
4106 ret = 0;
4107
4108 existing = lookup_extent_mapping(em_tree, start, len);
4109 if (existing && (existing->start > start ||
4110 existing->start + existing->len <= start)) {
4111 free_extent_map(existing);
4112 existing = NULL;
4113 }
4114 if (!existing) {
4115 existing = lookup_extent_mapping(em_tree, em->start,
4116 em->len);
4117 if (existing) {
4118 err = merge_extent_mapping(em_tree, existing,
4119 em, start,
4120 root->sectorsize);
4121 free_extent_map(existing);
4122 if (err) {
4123 free_extent_map(em);
4124 em = NULL;
4125 }
4126 } else {
4127 err = -EIO;
4128 free_extent_map(em);
4129 em = NULL;
4130 }
4131 } else {
4132 free_extent_map(em);
4133 em = existing;
4134 err = 0;
4135 }
4136 }
4137 spin_unlock(&em_tree->lock);
4138out:
4139 if (path)
4140 btrfs_free_path(path);
4141 if (trans) {
4142 ret = btrfs_end_transaction(trans, root);
4143 if (!err)
4144 err = ret;
4145 }
4146 if (err) {
4147 free_extent_map(em);
4148 WARN_ON(1);
4149 return ERR_PTR(err);
4150 }
4151 return em;
4152}
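/*
 * Usage sketch for btrfs_get_extent() (hypothetical caller): map one
 * sector of a file for reading, with no page and no create intent.
 *
 *	em = btrfs_get_extent(inode, NULL, 0, start, sectorsize, 0);
 *	if (IS_ERR(em))
 *		return PTR_ERR(em);
 *	if (em->block_start == EXTENT_MAP_HOLE)
 *		... the range is unallocated ...
 *	free_extent_map(em);
 *
 * The returned em always holds a reference, so every successful call
 * must be paired with free_extent_map().
 */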
4153
4154static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
4155 const struct iovec *iov, loff_t offset,
4156 unsigned long nr_segs)
4157{
4158 return -EINVAL;
4159}
4160
4161static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
4162{
4163 return extent_bmap(mapping, iblock, btrfs_get_extent);
4164}
4165
4166int btrfs_readpage(struct file *file, struct page *page)
4167{
4168 struct extent_io_tree *tree;
4169 tree = &BTRFS_I(page->mapping->host)->io_tree;
4170 return extent_read_full_page(tree, page, btrfs_get_extent);
4171}
4172
4173static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
4174{
4175 struct extent_io_tree *tree;
4176
4178 if (current->flags & PF_MEMALLOC) {
4179 redirty_page_for_writepage(wbc, page);
4180 unlock_page(page);
4181 return 0;
4182 }
4183 tree = &BTRFS_I(page->mapping->host)->io_tree;
4184 return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
4185}
4186
4187int btrfs_writepages(struct address_space *mapping,
4188 struct writeback_control *wbc)
4189{
4190 struct extent_io_tree *tree;
4191
4192 tree = &BTRFS_I(mapping->host)->io_tree;
4193 return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
4194}
4195
4196static int
4197btrfs_readpages(struct file *file, struct address_space *mapping,
4198 struct list_head *pages, unsigned nr_pages)
4199{
4200 struct extent_io_tree *tree;
4201 tree = &BTRFS_I(mapping->host)->io_tree;
4202 return extent_readpages(tree, mapping, pages, nr_pages,
4203 btrfs_get_extent);
4204}
4205static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
4206{
4207 struct extent_io_tree *tree;
4208 struct extent_map_tree *map;
4209 int ret;
4210
4211 tree = &BTRFS_I(page->mapping->host)->io_tree;
4212 map = &BTRFS_I(page->mapping->host)->extent_tree;
4213 ret = try_release_extent_mapping(map, tree, page, gfp_flags);
4214 if (ret == 1) {
4215 ClearPagePrivate(page);
4216 set_page_private(page, 0);
4217 page_cache_release(page);
4218 }
4219 return ret;
4220}
4221
4222static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
4223{
4224 if (PageWriteback(page) || PageDirty(page))
4225 return 0;
4226 return __btrfs_releasepage(page, gfp_flags);
4227}
4228
4229static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4230{
4231 struct extent_io_tree *tree;
4232 struct btrfs_ordered_extent *ordered;
4233 u64 page_start = page_offset(page);
4234 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
4235
4236 wait_on_page_writeback(page);
4237 tree = &BTRFS_I(page->mapping->host)->io_tree;
4238 if (offset) {
4239 btrfs_releasepage(page, GFP_NOFS);
4240 return;
4241 }
4242
4243 lock_extent(tree, page_start, page_end, GFP_NOFS);
4244 ordered = btrfs_lookup_ordered_extent(page->mapping->host,
4245 page_offset(page));
4246 if (ordered) {
4247 /*
4248 * IO on this page will never be started, so we need
4249 * to account for any ordered extents now
4250 */
4251 clear_extent_bit(tree, page_start, page_end,
4252 EXTENT_DIRTY | EXTENT_DELALLOC |
4253 EXTENT_LOCKED, 1, 0, GFP_NOFS);
4254 btrfs_finish_ordered_io(page->mapping->host,
4255 page_start, page_end);
4256 btrfs_put_ordered_extent(ordered);
4257 lock_extent(tree, page_start, page_end, GFP_NOFS);
4258 }
4259 clear_extent_bit(tree, page_start, page_end,
4260 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
4261 EXTENT_ORDERED,
4262 1, 1, GFP_NOFS);
4263 __btrfs_releasepage(page, GFP_NOFS);
4264
4265 ClearPageChecked(page);
4266 if (PagePrivate(page)) {
4267 ClearPagePrivate(page);
4268 set_page_private(page, 0);
4269 page_cache_release(page);
4270 }
4271}
4272
4273/*
4274 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
4275 * called from a page fault handler when a page is first dirtied. Hence we must
4276 * be careful to check for EOF conditions here. We set the page up correctly
4277 * for a written page which means we get ENOSPC checking when writing into
4278 * holes and correct delalloc and unwritten extent mapping on filesystems that
4279 * support these features.
4280 *
4281 * We are not allowed to take the i_mutex here so we have to play games to
4282 * protect against truncate races as the page could now be beyond EOF. Because
4283 * vmtruncate() writes the inode size before removing pages, once we have the
4284 * page lock we can determine safely if the page is beyond EOF. If it is not
4285 * beyond EOF, then the page is guaranteed safe against truncation until we
4286 * unlock the page.
4287 */
4288int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4289{
4290 struct inode *inode = fdentry(vma->vm_file)->d_inode;
4291 struct btrfs_root *root = BTRFS_I(inode)->root;
4292 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4293 struct btrfs_ordered_extent *ordered;
4294 char *kaddr;
4295 unsigned long zero_start;
4296 loff_t size;
4297 int ret;
4298 u64 page_start;
4299 u64 page_end;
4300
4301 ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
4302 if (ret)
4303 goto out;
4304
4305 ret = -EINVAL;
4306again:
4307 lock_page(page);
4308 size = i_size_read(inode);
4309 page_start = page_offset(page);
4310 page_end = page_start + PAGE_CACHE_SIZE - 1;
4311
4312 if ((page->mapping != inode->i_mapping) ||
4313 (page_start >= size)) {
4314 /* page got truncated out from underneath us */
4315 goto out_unlock;
4316 }
4317 wait_on_page_writeback(page);
4318
4319 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
4320 set_page_extent_mapped(page);
4321
4322 /*
4323 * we can't set the delalloc bits if there are pending ordered
4324 * extents. Drop our locks and wait for them to finish
4325 */
4326 ordered = btrfs_lookup_ordered_extent(inode, page_start);
4327 if (ordered) {
4328 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4329 unlock_page(page);
4330 btrfs_start_ordered_extent(inode, ordered, 1);
4331 btrfs_put_ordered_extent(ordered);
4332 goto again;
4333 }
4334
4335 btrfs_set_extent_delalloc(inode, page_start, page_end);
4336 ret = 0;
4337
4338 /* page is wholly or partially inside EOF */
4339 if (page_start + PAGE_CACHE_SIZE > size)
4340 zero_start = size & ~PAGE_CACHE_MASK;
4341 else
4342 zero_start = PAGE_CACHE_SIZE;
4343
4344 if (zero_start != PAGE_CACHE_SIZE) {
4345 kaddr = kmap(page);
4346 memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
4347 flush_dcache_page(page);
4348 kunmap(page);
4349 }
4350 ClearPageChecked(page);
4351 set_page_dirty(page);
4352 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4353
4354out_unlock:
4355 unlock_page(page);
4356out:
4357 return ret;
4358}
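/*
 * Worked example of the EOF zeroing above (assuming 4K pages): with
 * i_size 5000, faulting in the page that covers [4096, 8192) gives
 * zero_start = 5000 & ~PAGE_CACHE_MASK = 904, so bytes 904..4095 of the
 * page are cleared and stale data past EOF is never exposed through the
 * mapping.
 */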
4359
4360static void btrfs_truncate(struct inode *inode)
4361{
4362 struct btrfs_root *root = BTRFS_I(inode)->root;
4363 int ret;
4364 struct btrfs_trans_handle *trans;
4365 unsigned long nr;
4366 u64 mask = root->sectorsize - 1;
4367
4368 if (!S_ISREG(inode->i_mode))
4369 return;
4370 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4371 return;
4372
4373 btrfs_truncate_page(inode->i_mapping, inode->i_size);
4374 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
4375
4376 trans = btrfs_start_transaction(root, 1);
4377 btrfs_set_trans_block_group(trans, inode);
4378 btrfs_i_size_write(inode, inode->i_size);
4379
4380 ret = btrfs_orphan_add(trans, inode);
4381 if (ret)
4382 goto out;
4383 /* FIXME, add redo link to tree so we don't leak on crash */
4384 ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size,
4385 BTRFS_EXTENT_DATA_KEY);
4386 btrfs_update_inode(trans, root, inode);
4387
4388 ret = btrfs_orphan_del(trans, inode);
4389 BUG_ON(ret);
4390
4391out:
4392 nr = trans->blocks_used;
4393 ret = btrfs_end_transaction_throttle(trans, root);
4394 BUG_ON(ret);
4395 btrfs_btree_balance_dirty(root, nr);
4396}
4397
4398/*
4399 * create a new subvolume directory/inode (helper for the ioctl).
4400 */
4401int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
4402 struct btrfs_root *new_root, struct dentry *dentry,
4403 u64 new_dirid, u64 alloc_hint)
4404{
4405 struct inode *inode;
4406 int error;
4407 u64 index = 0;
4408
4409 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
4410 new_dirid, alloc_hint, S_IFDIR | 0700, &index);
4411 if (IS_ERR(inode))
4412 return PTR_ERR(inode);
4413 inode->i_op = &btrfs_dir_inode_operations;
4414 inode->i_fop = &btrfs_dir_file_operations;
4415
4416 inode->i_nlink = 1;
4417 btrfs_i_size_write(inode, 0);
4418
4419 error = btrfs_update_inode(trans, new_root, inode);
4420 if (error)
4421 return error;
4422
4423 d_instantiate(dentry, inode);
4424 return 0;
4425}
4426
4427/* helper function for file defrag and space balancing. This
4428 * forces readahead on a given range of bytes in an inode
4429 */
4430unsigned long btrfs_force_ra(struct address_space *mapping,
4431 struct file_ra_state *ra, struct file *file,
4432 pgoff_t offset, pgoff_t last_index)
4433{
4434 pgoff_t req_size = last_index - offset + 1;
4435
4436 page_cache_sync_readahead(mapping, ra, file, offset, req_size);
4437 return offset + req_size;
4438}
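/*
 * Usage sketch for btrfs_force_ra() (hypothetical caller, e.g. a defrag
 * loop): push readahead over pages [i, last_index] and remember where
 * it ended so the next pass continues from there.
 *
 *	i = btrfs_force_ra(inode->i_mapping, &file->f_ra, file,
 *			   i, last_index);
 */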
4439
4440struct inode *btrfs_alloc_inode(struct super_block *sb)
4441{
4442 struct btrfs_inode *ei;
4443
4444 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
4445 if (!ei)
4446 return NULL;
4447 ei->last_trans = 0;
4448 ei->logged_trans = 0;
4449 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
4450 ei->i_acl = BTRFS_ACL_NOT_CACHED;
4451 ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
4452 INIT_LIST_HEAD(&ei->i_orphan);
4453 return &ei->vfs_inode;
4454}
4455
4456void btrfs_destroy_inode(struct inode *inode)
4457{
4458 struct btrfs_ordered_extent *ordered;
4459 WARN_ON(!list_empty(&inode->i_dentry));
4460 WARN_ON(inode->i_data.nrpages);
4461
4462 if (BTRFS_I(inode)->i_acl &&
4463 BTRFS_I(inode)->i_acl != BTRFS_ACL_NOT_CACHED)
4464 posix_acl_release(BTRFS_I(inode)->i_acl);
4465 if (BTRFS_I(inode)->i_default_acl &&
4466 BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
4467 posix_acl_release(BTRFS_I(inode)->i_default_acl);
4468
4469 spin_lock(&BTRFS_I(inode)->root->list_lock);
4470 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
4471		printk(KERN_ERR "BTRFS: inode %lu still on the orphan"
4472		       " list\n", inode->i_ino);
4473 dump_stack();
4474 }
4475 spin_unlock(&BTRFS_I(inode)->root->list_lock);
4476
4477 while (1) {
4478 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
4479 if (!ordered)
4480 break;
4481 else {
4482 printk(KERN_ERR "btrfs found ordered "
4483 "extent %llu %llu on inode cleanup\n",
4484 (unsigned long long)ordered->file_offset,
4485 (unsigned long long)ordered->len);
4486			btrfs_remove_ordered_extent(inode, ordered);
			/* one put for our lookup, one for the tree's ref */
4487			btrfs_put_ordered_extent(ordered);
4488			btrfs_put_ordered_extent(ordered);
4489 }
4490 }
4491 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
4492 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
4493}
4494
4495static void init_once(void *foo)
4496{
4497 struct btrfs_inode *ei = (struct btrfs_inode *) foo;
4498
4499 inode_init_once(&ei->vfs_inode);
4500}
4501
4502void btrfs_destroy_cachep(void)
4503{
4504 if (btrfs_inode_cachep)
4505 kmem_cache_destroy(btrfs_inode_cachep);
4506 if (btrfs_trans_handle_cachep)
4507 kmem_cache_destroy(btrfs_trans_handle_cachep);
4508 if (btrfs_transaction_cachep)
4509 kmem_cache_destroy(btrfs_transaction_cachep);
4510 if (btrfs_bit_radix_cachep)
4511 kmem_cache_destroy(btrfs_bit_radix_cachep);
4512 if (btrfs_path_cachep)
4513 kmem_cache_destroy(btrfs_path_cachep);
4514}
4515
4516struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
4517 unsigned long extra_flags,
4518 void (*ctor)(void *))
4519{
4520 return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT |
4521 SLAB_MEM_SPREAD | extra_flags), ctor);
4522}
4523
4524int btrfs_init_cachep(void)
4525{
4526 btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache",
4527 sizeof(struct btrfs_inode),
4528 0, init_once);
4529 if (!btrfs_inode_cachep)
4530 goto fail;
4531 btrfs_trans_handle_cachep =
4532 btrfs_cache_create("btrfs_trans_handle_cache",
4533 sizeof(struct btrfs_trans_handle),
4534 0, NULL);
4535 if (!btrfs_trans_handle_cachep)
4536 goto fail;
4537 btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache",
4538 sizeof(struct btrfs_transaction),
4539 0, NULL);
4540 if (!btrfs_transaction_cachep)
4541 goto fail;
4542 btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache",
4543 sizeof(struct btrfs_path),
4544 0, NULL);
4545 if (!btrfs_path_cachep)
4546 goto fail;
4547 btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256,
4548 SLAB_DESTROY_BY_RCU, NULL);
4549 if (!btrfs_bit_radix_cachep)
4550 goto fail;
4551 return 0;
4552fail:
4553 btrfs_destroy_cachep();
4554 return -ENOMEM;
4555}
4556
4557static int btrfs_getattr(struct vfsmount *mnt,
4558 struct dentry *dentry, struct kstat *stat)
4559{
4560 struct inode *inode = dentry->d_inode;
4561 generic_fillattr(inode, stat);
4562 stat->dev = BTRFS_I(inode)->root->anon_super.s_dev;
4563 stat->blksize = PAGE_CACHE_SIZE;
4564 stat->blocks = (inode_get_bytes(inode) +
4565 BTRFS_I(inode)->delalloc_bytes) >> 9;
4566 return 0;
4567}
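/*
 * Example of the stat->blocks math above: st_blocks is counted in
 * 512-byte units, hence the >> 9.  An inode with 8192 bytes already on
 * disk plus 4096 bytes of outstanding delalloc reports 12288 >> 9 = 24
 * blocks, so tools like du stay accurate before the dirty data hits
 * disk.
 */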
4568
4569static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4570 struct inode *new_dir, struct dentry *new_dentry)
4571{
4572 struct btrfs_trans_handle *trans;
4573 struct btrfs_root *root = BTRFS_I(old_dir)->root;
4574 struct inode *new_inode = new_dentry->d_inode;
4575 struct inode *old_inode = old_dentry->d_inode;
4576 struct timespec ctime = CURRENT_TIME;
4577 u64 index = 0;
4578 int ret;
4579
4580 /* we're not allowed to rename between subvolumes */
4581 if (BTRFS_I(old_inode)->root->root_key.objectid !=
4582 BTRFS_I(new_dir)->root->root_key.objectid)
4583 return -EXDEV;
4584
4585 if (S_ISDIR(old_inode->i_mode) && new_inode &&
4586 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
4587 return -ENOTEMPTY;
4588 }
4589
4590 /* to rename a snapshot or subvolume, we need to juggle the
4591 * backrefs. This isn't coded yet
4592 */
4593 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
4594 return -EXDEV;
4595
4596 ret = btrfs_check_free_space(root, 1, 0);
4597 if (ret)
4598 goto out_unlock;
4599
4600 trans = btrfs_start_transaction(root, 1);
4601
4602 btrfs_set_trans_block_group(trans, new_dir);
4603
4604 btrfs_inc_nlink(old_dentry->d_inode);
4605 old_dir->i_ctime = old_dir->i_mtime = ctime;
4606 new_dir->i_ctime = new_dir->i_mtime = ctime;
4607 old_inode->i_ctime = ctime;
4608
4609 ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
4610 old_dentry->d_name.name,
4611 old_dentry->d_name.len);
4612 if (ret)
4613 goto out_fail;
4614
4615 if (new_inode) {
4616 new_inode->i_ctime = CURRENT_TIME;
4617 ret = btrfs_unlink_inode(trans, root, new_dir,
4618 new_dentry->d_inode,
4619 new_dentry->d_name.name,
4620 new_dentry->d_name.len);
4621 if (ret)
4622 goto out_fail;
4623 if (new_inode->i_nlink == 0) {
4624 ret = btrfs_orphan_add(trans, new_dentry->d_inode);
4625 if (ret)
4626 goto out_fail;
4627 }
4628
4629 }
4630 ret = btrfs_set_inode_index(new_dir, &index);
4631 if (ret)
4632 goto out_fail;
4633
4634 ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode,
4635 old_inode, new_dentry->d_name.name,
4636 new_dentry->d_name.len, 1, index);
4637 if (ret)
4638 goto out_fail;
4639
4640out_fail:
4641 btrfs_end_transaction_throttle(trans, root);
4642out_unlock:
4643 return ret;
4644}
4645
4646/*
4647 * some fairly slow code that needs optimization. This walks the list
4648 * of all the inodes with pending delalloc and forces them to disk.
4649 */
4650int btrfs_start_delalloc_inodes(struct btrfs_root *root)
4651{
4652 struct list_head *head = &root->fs_info->delalloc_inodes;
4653 struct btrfs_inode *binode;
4654 struct inode *inode;
4655
4656 if (root->fs_info->sb->s_flags & MS_RDONLY)
4657 return -EROFS;
4658
4659 spin_lock(&root->fs_info->delalloc_lock);
4660 while (!list_empty(head)) {
4661 binode = list_entry(head->next, struct btrfs_inode,
4662 delalloc_inodes);
4663 inode = igrab(&binode->vfs_inode);
4664 if (!inode)
4665 list_del_init(&binode->delalloc_inodes);
4666 spin_unlock(&root->fs_info->delalloc_lock);
4667 if (inode) {
4668 filemap_flush(inode->i_mapping);
4669 iput(inode);
4670 }
4671 cond_resched();
4672 spin_lock(&root->fs_info->delalloc_lock);
4673 }
4674 spin_unlock(&root->fs_info->delalloc_lock);
4675
4676 /* the filemap_flush will queue IO into the worker threads, but
4677 * we have to make sure the IO is actually started and that
4678 * ordered extents get created before we return
4679 */
4680 atomic_inc(&root->fs_info->async_submit_draining);
4681 while (atomic_read(&root->fs_info->nr_async_submits) ||
4682 atomic_read(&root->fs_info->async_delalloc_pages)) {
4683 wait_event(root->fs_info->async_submit_wait,
4684 (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
4685 atomic_read(&root->fs_info->async_delalloc_pages) == 0));
4686 }
4687 atomic_dec(&root->fs_info->async_submit_draining);
4688 return 0;
4689}
4690
4691static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
4692 const char *symname)
4693{
4694 struct btrfs_trans_handle *trans;
4695 struct btrfs_root *root = BTRFS_I(dir)->root;
4696 struct btrfs_path *path;
4697 struct btrfs_key key;
4698 struct inode *inode = NULL;
4699 int err;
4700 int drop_inode = 0;
4701 u64 objectid;
4702	u64 index = 0;
4703 int name_len;
4704 int datasize;
4705 unsigned long ptr;
4706 struct btrfs_file_extent_item *ei;
4707 struct extent_buffer *leaf;
4708 unsigned long nr = 0;
4709
4710 name_len = strlen(symname) + 1;
4711 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
4712 return -ENAMETOOLONG;
4713
4714 err = btrfs_check_free_space(root, 1, 0);
4715 if (err)
4716 goto out_fail;
4717
4718 trans = btrfs_start_transaction(root, 1);
4719 btrfs_set_trans_block_group(trans, dir);
4720
4721 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4722 if (err) {
4723 err = -ENOSPC;
4724 goto out_unlock;
4725 }
4726
4727 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4728 dentry->d_name.len,
4729 dentry->d_parent->d_inode->i_ino, objectid,
4730 BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
4731 &index);
4732 err = PTR_ERR(inode);
4733 if (IS_ERR(inode))
4734 goto out_unlock;
4735
4736 err = btrfs_init_acl(inode, dir);
4737 if (err) {
4738 drop_inode = 1;
4739 goto out_unlock;
4740 }
4741
4742 btrfs_set_trans_block_group(trans, inode);
4743 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
4744 if (err)
4745 drop_inode = 1;
4746 else {
4747 inode->i_mapping->a_ops = &btrfs_aops;
4748 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
4749 inode->i_fop = &btrfs_file_operations;
4750 inode->i_op = &btrfs_file_inode_operations;
4751 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
4752 }
4753 dir->i_sb->s_dirt = 1;
4754 btrfs_update_inode_block_group(trans, inode);
4755 btrfs_update_inode_block_group(trans, dir);
4756 if (drop_inode)
4757 goto out_unlock;
4758
4759 path = btrfs_alloc_path();
4760 BUG_ON(!path);
4761 key.objectid = inode->i_ino;
4762 key.offset = 0;
4763 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
4764 datasize = btrfs_file_extent_calc_inline_size(name_len);
4765 err = btrfs_insert_empty_item(trans, root, path, &key,
4766 datasize);
4767 if (err) {
4768 drop_inode = 1;
4769 goto out_unlock;
4770 }
4771 leaf = path->nodes[0];
4772 ei = btrfs_item_ptr(leaf, path->slots[0],
4773 struct btrfs_file_extent_item);
4774 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
4775 btrfs_set_file_extent_type(leaf, ei,
4776 BTRFS_FILE_EXTENT_INLINE);
4777 btrfs_set_file_extent_encryption(leaf, ei, 0);
4778 btrfs_set_file_extent_compression(leaf, ei, 0);
4779 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
4780 btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
4781
4782 ptr = btrfs_file_extent_inline_start(ei);
4783 write_extent_buffer(leaf, symname, ptr, name_len);
4784 btrfs_mark_buffer_dirty(leaf);
4785 btrfs_free_path(path);
4786
4787 inode->i_op = &btrfs_symlink_inode_operations;
4788 inode->i_mapping->a_ops = &btrfs_symlink_aops;
4789 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
4790 inode_set_bytes(inode, name_len);
4791 btrfs_i_size_write(inode, name_len - 1);
4792 err = btrfs_update_inode(trans, root, inode);
4793 if (err)
4794 drop_inode = 1;
4795
4796out_unlock:
4797 nr = trans->blocks_used;
4798 btrfs_end_transaction_throttle(trans, root);
4799out_fail:
4800 if (drop_inode) {
4801 inode_dec_link_count(inode);
4802 iput(inode);
4803 }
4804 btrfs_btree_balance_dirty(root, nr);
4805 return err;
4806}
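btrfs_symlink() above stores the target string as a single inline extent in the btree, which is why targets longer than BTRFS_MAX_INLINE_DATA_SIZE(root) fail with -ENAMETOOLONG, and why i_size ends up as the target length without the trailing NUL. A minimal userspace sketch exercising this path; the mount point and target are hypothetical:

#include <limits.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[PATH_MAX];
	ssize_t n;

	if (symlink("/some/target", "/mnt/btrfs/link") != 0) {
		perror("symlink");	/* ENAMETOOLONG if target too long */
		return 1;
	}
	n = readlink("/mnt/btrfs/link", buf, sizeof(buf) - 1);
	if (n < 0) {
		perror("readlink");
		return 1;
	}
	buf[n] = '\0';
	/* n matches lstat()'s st_size, i.e. strlen("/some/target") */
	printf("link -> %s (%zd bytes)\n", buf, n);
	return 0;
}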
4807
4808static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
4809 u64 alloc_hint, int mode)
4810{
4811 struct btrfs_trans_handle *trans;
4812 struct btrfs_root *root = BTRFS_I(inode)->root;
4813 struct btrfs_key ins;
4814 u64 alloc_size;
4815 u64 cur_offset = start;
4816 u64 num_bytes = end - start;
4817 int ret = 0;
4818
4819 trans = btrfs_join_transaction(root, 1);
4820 BUG_ON(!trans);
4821 btrfs_set_trans_block_group(trans, inode);
4822
4823 while (num_bytes > 0) {
4824 alloc_size = min(num_bytes, root->fs_info->max_extent);
4825 ret = btrfs_reserve_extent(trans, root, alloc_size,
4826 root->sectorsize, 0, alloc_hint,
4827 (u64)-1, &ins, 1);
4828 if (ret) {
4829 WARN_ON(1);
4830 goto out;
4831 }
4832 ret = insert_reserved_file_extent(trans, inode,
4833 cur_offset, ins.objectid,
4834 ins.offset, ins.offset,
4835 ins.offset, 0, 0, 0,
4836 BTRFS_FILE_EXTENT_PREALLOC);
4837 BUG_ON(ret);
4838 num_bytes -= ins.offset;
4839 cur_offset += ins.offset;
4840 alloc_hint = ins.objectid + ins.offset;
4841 }
4842out:
4843 if (cur_offset > start) {
4844 inode->i_ctime = CURRENT_TIME;
4845 btrfs_set_flag(inode, PREALLOC);
4846 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4847 cur_offset > i_size_read(inode))
4848 btrfs_i_size_write(inode, cur_offset);
4849 ret = btrfs_update_inode(trans, root, inode);
4850 BUG_ON(ret);
4851 }
4852
4853 btrfs_end_transaction(trans, root);
4854 return ret;
4855}
4856
4857static long btrfs_fallocate(struct inode *inode, int mode,
4858 loff_t offset, loff_t len)
4859{
4860 u64 cur_offset;
4861 u64 last_byte;
4862 u64 alloc_start;
4863 u64 alloc_end;
4864 u64 alloc_hint = 0;
4865 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
4866 struct extent_map *em;
4867 int ret;
4868
4869 alloc_start = offset & ~mask;
4870 alloc_end = (offset + len + mask) & ~mask;
4871
4872 mutex_lock(&inode->i_mutex);
4873 if (alloc_start > inode->i_size) {
4874 ret = btrfs_cont_expand(inode, alloc_start);
4875 if (ret)
4876 goto out;
4877 }
4878
4879 while (1) {
4880 struct btrfs_ordered_extent *ordered;
4881 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start,
4882 alloc_end - 1, GFP_NOFS);
4883 ordered = btrfs_lookup_first_ordered_extent(inode,
4884 alloc_end - 1);
4885 if (ordered &&
4886 ordered->file_offset + ordered->len > alloc_start &&
4887 ordered->file_offset < alloc_end) {
4888 btrfs_put_ordered_extent(ordered);
4889 unlock_extent(&BTRFS_I(inode)->io_tree,
4890 alloc_start, alloc_end - 1, GFP_NOFS);
4891 btrfs_wait_ordered_range(inode, alloc_start,
4892 alloc_end - alloc_start);
4893 } else {
4894 if (ordered)
4895 btrfs_put_ordered_extent(ordered);
4896 break;
4897 }
4898 }
4899
4900 cur_offset = alloc_start;
4901 while (1) {
4902 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
4903 alloc_end - cur_offset, 0);
4904 BUG_ON(IS_ERR(em) || !em);
4905 last_byte = min(extent_map_end(em), alloc_end);
4906 last_byte = (last_byte + mask) & ~mask;
4907 if (em->block_start == EXTENT_MAP_HOLE) {
4908 ret = prealloc_file_range(inode, cur_offset,
4909 last_byte, alloc_hint, mode);
4910 if (ret < 0) {
4911 free_extent_map(em);
4912 break;
4913 }
4914 }
4915 if (em->block_start <= EXTENT_MAP_LAST_BYTE)
4916 alloc_hint = em->block_start;
4917 free_extent_map(em);
4918
4919 cur_offset = last_byte;
4920 if (cur_offset >= alloc_end) {
4921 ret = 0;
4922 break;
4923 }
4924 }
4925 unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1,
4926 GFP_NOFS);
4927out:
4928 mutex_unlock(&inode->i_mutex);
4929 return ret;
4930}
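btrfs_fallocate() rounds the request out to sector boundaries, waits out any ordered extents overlapping the range, and preallocates only the holes via prealloc_file_range(); i_size grows only when FALLOC_FL_KEEP_SIZE is clear. A minimal caller-side sketch, assuming a btrfs file at a hypothetical path:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/btrfs/data.bin", O_RDWR | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* reserve 16 MiB; st_size stays unchanged because of
	 * FALLOC_FL_KEEP_SIZE, matching the mode check above */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 << 20) != 0)
		perror("fallocate");
	close(fd);
	return 0;
}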
4931
4932static int btrfs_set_page_dirty(struct page *page)
4933{
4934 return __set_page_dirty_nobuffers(page);
4935}
4936
4937static int btrfs_permission(struct inode *inode, int mask)
4938{
4939 if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE))
4940 return -EACCES;
4941 return generic_permission(inode, mask, btrfs_check_acl);
4942}
4943
4944static struct inode_operations btrfs_dir_inode_operations = {
4945 .getattr = btrfs_getattr,
4946 .lookup = btrfs_lookup,
4947 .create = btrfs_create,
4948 .unlink = btrfs_unlink,
4949 .link = btrfs_link,
4950 .mkdir = btrfs_mkdir,
4951 .rmdir = btrfs_rmdir,
4952 .rename = btrfs_rename,
4953 .symlink = btrfs_symlink,
4954 .setattr = btrfs_setattr,
4955 .mknod = btrfs_mknod,
4956 .setxattr = btrfs_setxattr,
4957 .getxattr = btrfs_getxattr,
4958 .listxattr = btrfs_listxattr,
4959 .removexattr = btrfs_removexattr,
4960 .permission = btrfs_permission,
4961};
4962static struct inode_operations btrfs_dir_ro_inode_operations = {
4963 .lookup = btrfs_lookup,
4964 .permission = btrfs_permission,
4965};
4966static struct file_operations btrfs_dir_file_operations = {
4967 .llseek = generic_file_llseek,
4968 .read = generic_read_dir,
4969 .readdir = btrfs_real_readdir,
4970 .unlocked_ioctl = btrfs_ioctl,
4971#ifdef CONFIG_COMPAT
4972 .compat_ioctl = btrfs_ioctl,
4973#endif
4974 .release = btrfs_release_file,
4975 .fsync = btrfs_sync_file,
4976};
4977
4978static struct extent_io_ops btrfs_extent_io_ops = {
4979 .fill_delalloc = run_delalloc_range,
4980 .submit_bio_hook = btrfs_submit_bio_hook,
4981 .merge_bio_hook = btrfs_merge_bio_hook,
4982 .readpage_end_io_hook = btrfs_readpage_end_io_hook,
4983 .writepage_end_io_hook = btrfs_writepage_end_io_hook,
4984 .writepage_start_hook = btrfs_writepage_start_hook,
4985 .readpage_io_failed_hook = btrfs_io_failed_hook,
4986 .set_bit_hook = btrfs_set_bit_hook,
4987 .clear_bit_hook = btrfs_clear_bit_hook,
4988};
4989
4990static struct address_space_operations btrfs_aops = {
4991 .readpage = btrfs_readpage,
4992 .writepage = btrfs_writepage,
4993 .writepages = btrfs_writepages,
4994 .readpages = btrfs_readpages,
4995 .sync_page = block_sync_page,
4996 .bmap = btrfs_bmap,
4997 .direct_IO = btrfs_direct_IO,
4998 .invalidatepage = btrfs_invalidatepage,
4999 .releasepage = btrfs_releasepage,
5000 .set_page_dirty = btrfs_set_page_dirty,
5001};
5002
5003static struct address_space_operations btrfs_symlink_aops = {
5004 .readpage = btrfs_readpage,
5005 .writepage = btrfs_writepage,
5006 .invalidatepage = btrfs_invalidatepage,
5007 .releasepage = btrfs_releasepage,
5008};
5009
5010static struct inode_operations btrfs_file_inode_operations = {
5011 .truncate = btrfs_truncate,
5012 .getattr = btrfs_getattr,
5013 .setattr = btrfs_setattr,
5014 .setxattr = btrfs_setxattr,
5015 .getxattr = btrfs_getxattr,
5016 .listxattr = btrfs_listxattr,
5017 .removexattr = btrfs_removexattr,
5018 .permission = btrfs_permission,
5019 .fallocate = btrfs_fallocate,
5020};
5021static struct inode_operations btrfs_special_inode_operations = {
5022 .getattr = btrfs_getattr,
5023 .setattr = btrfs_setattr,
5024 .permission = btrfs_permission,
5025 .setxattr = btrfs_setxattr,
5026 .getxattr = btrfs_getxattr,
5027 .listxattr = btrfs_listxattr,
5028 .removexattr = btrfs_removexattr,
5029};
5030static struct inode_operations btrfs_symlink_inode_operations = {
5031 .readlink = generic_readlink,
5032 .follow_link = page_follow_link_light,
5033 .put_link = page_put_link,
5034 .permission = btrfs_permission,
5035};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
new file mode 100644
index 000000000000..c2aa33e3feb5
--- /dev/null
+++ b/fs/btrfs/ioctl.c
@@ -0,0 +1,1132 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/buffer_head.h>
22#include <linux/file.h>
23#include <linux/fs.h>
24#include <linux/fsnotify.h>
25#include <linux/pagemap.h>
26#include <linux/highmem.h>
27#include <linux/time.h>
28#include <linux/init.h>
29#include <linux/string.h>
30#include <linux/smp_lock.h>
31#include <linux/backing-dev.h>
32#include <linux/mount.h>
33#include <linux/mpage.h>
34#include <linux/namei.h>
35#include <linux/swap.h>
36#include <linux/writeback.h>
37#include <linux/statfs.h>
38#include <linux/compat.h>
39#include <linux/bit_spinlock.h>
40#include <linux/security.h>
41#include <linux/version.h>
42#include <linux/xattr.h>
43#include <linux/vmalloc.h>
44#include "compat.h"
45#include "ctree.h"
46#include "disk-io.h"
47#include "transaction.h"
48#include "btrfs_inode.h"
49#include "ioctl.h"
50#include "print-tree.h"
51#include "volumes.h"
52#include "locking.h"
53
54
55
56static noinline int create_subvol(struct btrfs_root *root,
57 struct dentry *dentry,
58 char *name, int namelen)
59{
60 struct btrfs_trans_handle *trans;
61 struct btrfs_key key;
62 struct btrfs_root_item root_item;
63 struct btrfs_inode_item *inode_item;
64 struct extent_buffer *leaf;
65 struct btrfs_root *new_root = root;
66 struct inode *dir;
67 int ret;
68 int err;
69 u64 objectid;
70 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
71 u64 index = 0;
72 unsigned long nr = 1;
73
74 ret = btrfs_check_free_space(root, 1, 0);
75 if (ret)
76 goto fail_commit;
77
78 trans = btrfs_start_transaction(root, 1);
79 BUG_ON(!trans);
80
81 ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
82 0, &objectid);
83 if (ret)
84 goto fail;
85
86 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
87 objectid, trans->transid, 0, 0, 0);
88 if (IS_ERR(leaf)) {
89 ret = PTR_ERR(leaf);
90 goto fail;
91 }
92
93 btrfs_set_header_nritems(leaf, 0);
94 btrfs_set_header_level(leaf, 0);
95 btrfs_set_header_bytenr(leaf, leaf->start);
96 btrfs_set_header_generation(leaf, trans->transid);
97 btrfs_set_header_owner(leaf, objectid);
98
99 write_extent_buffer(leaf, root->fs_info->fsid,
100 (unsigned long)btrfs_header_fsid(leaf),
101 BTRFS_FSID_SIZE);
102 btrfs_mark_buffer_dirty(leaf);
103
104 inode_item = &root_item.inode;
105 memset(inode_item, 0, sizeof(*inode_item));
106 inode_item->generation = cpu_to_le64(1);
107 inode_item->size = cpu_to_le64(3);
108 inode_item->nlink = cpu_to_le32(1);
109 inode_item->nbytes = cpu_to_le64(root->leafsize);
110 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
111
112 btrfs_set_root_bytenr(&root_item, leaf->start);
113 btrfs_set_root_generation(&root_item, trans->transid);
114 btrfs_set_root_level(&root_item, 0);
115 btrfs_set_root_refs(&root_item, 1);
116 btrfs_set_root_used(&root_item, 0);
117 btrfs_set_root_last_snapshot(&root_item, 0);
118
119 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
120 root_item.drop_level = 0;
121
122 btrfs_tree_unlock(leaf);
123 free_extent_buffer(leaf);
124 leaf = NULL;
125
126 btrfs_set_root_dirid(&root_item, new_dirid);
127
128 key.objectid = objectid;
129 key.offset = 1;
130 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
131 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
132 &root_item);
133 if (ret)
134 goto fail;
135
136 /*
137 * insert the directory item
138 */
139 key.offset = (u64)-1;
140 dir = dentry->d_parent->d_inode;
141 ret = btrfs_set_inode_index(dir, &index);
142 BUG_ON(ret);
143
144 ret = btrfs_insert_dir_item(trans, root,
145 name, namelen, dir->i_ino, &key,
146 BTRFS_FT_DIR, index);
147 if (ret)
148 goto fail;
149
150 btrfs_i_size_write(dir, dir->i_size + namelen * 2);
151 ret = btrfs_update_inode(trans, root, dir);
152 BUG_ON(ret);
153
154 /* add the backref first */
155 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
156 objectid, BTRFS_ROOT_BACKREF_KEY,
157 root->root_key.objectid,
158 dir->i_ino, index, name, namelen);
159
160 BUG_ON(ret);
161
162 /* now add the forward ref */
163 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
164 root->root_key.objectid, BTRFS_ROOT_REF_KEY,
165 objectid,
166 dir->i_ino, index, name, namelen);
167
168 BUG_ON(ret);
169
170 ret = btrfs_commit_transaction(trans, root);
171 if (ret)
172 goto fail_commit;
173
174 new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
175 BUG_ON(!new_root);
176
177 trans = btrfs_start_transaction(new_root, 1);
178 BUG_ON(!trans);
179
180 ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid,
181 BTRFS_I(dir)->block_group);
182 if (ret)
183 goto fail;
184
185fail:
186 nr = trans->blocks_used;
187 err = btrfs_commit_transaction(trans, new_root);
188 if (err && !ret)
189 ret = err;
190fail_commit:
191 btrfs_btree_balance_dirty(root, nr);
192 return ret;
193}
194
195static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
196 char *name, int namelen)
197{
198 struct btrfs_pending_snapshot *pending_snapshot;
199 struct btrfs_trans_handle *trans;
200 int ret = 0;
201 int err;
202 unsigned long nr = 0;
203
204 if (!root->ref_cows)
205 return -EINVAL;
206
207 ret = btrfs_check_free_space(root, 1, 0);
208 if (ret)
209 goto fail_unlock;
210
211 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
212 if (!pending_snapshot) {
213 ret = -ENOMEM;
214 goto fail_unlock;
215 }
216 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
217 if (!pending_snapshot->name) {
218 ret = -ENOMEM;
219 kfree(pending_snapshot);
220 goto fail_unlock;
221 }
222 memcpy(pending_snapshot->name, name, namelen);
223 pending_snapshot->name[namelen] = '\0';
224 pending_snapshot->dentry = dentry;
225 trans = btrfs_start_transaction(root, 1);
226 BUG_ON(!trans);
227 pending_snapshot->root = root;
228 list_add(&pending_snapshot->list,
229 &trans->transaction->pending_snapshots);
230 err = btrfs_commit_transaction(trans, root);
231
232fail_unlock:
233 btrfs_btree_balance_dirty(root, nr);
234 return ret;
235}
236
237/* copy of may_create() in fs/namei.c */
238static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
239{
240 if (child->d_inode)
241 return -EEXIST;
242 if (IS_DEADDIR(dir))
243 return -ENOENT;
244 return inode_permission(dir, MAY_WRITE | MAY_EXEC);
245}
246
247/*
248 * Create a new subvolume below @parent. This is largely modeled after
249 * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
250 * inside this filesystem so it's quite a bit simpler.
251 */
252static noinline int btrfs_mksubvol(struct path *parent, char *name,
253 int mode, int namelen,
254 struct btrfs_root *snap_src)
255{
256 struct dentry *dentry;
257 int error;
258
259 mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
260
261 dentry = lookup_one_len(name, parent->dentry, namelen);
262 error = PTR_ERR(dentry);
263 if (IS_ERR(dentry))
264 goto out_unlock;
265
266 error = -EEXIST;
267 if (dentry->d_inode)
268 goto out_dput;
269
270 if (!IS_POSIXACL(parent->dentry->d_inode))
271 mode &= ~current->fs->umask;
272
273 error = mnt_want_write(parent->mnt);
274 if (error)
275 goto out_dput;
276
277 error = btrfs_may_create(parent->dentry->d_inode, dentry);
278 if (error)
279 goto out_drop_write;
280
281 /*
282 * Actually perform the low-level subvolume creation after all
283	 * this VFS fuss.
284 *
285 * Eventually we want to pass in an inode under which we create this
286 * subvolume, but for now all are under the filesystem root.
287 *
288 * Also we should pass on the mode eventually to allow creating new
289 * subvolume with specific mode bits.
290 */
291 if (snap_src) {
292 struct dentry *dir = dentry->d_parent;
293 struct dentry *test = dir->d_parent;
294 struct btrfs_path *path = btrfs_alloc_path();
295 int ret;
296 u64 test_oid;
297 u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid;
298
299 test_oid = snap_src->root_key.objectid;
300
301 ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
302 path, parent_oid, test_oid);
303 if (ret == 0)
304 goto create;
305 btrfs_release_path(snap_src->fs_info->tree_root, path);
306
307 /* we need to make sure we aren't creating a directory loop
308 * by taking a snapshot of something that has our current
309 * subvol in its directory tree. So, this loops through
310 * the dentries and checks the forward refs for each subvolume
311	 * to see if it references the subvolume where we are
312 * placing this new snapshot.
313 */
314 while (1) {
315 if (!test ||
316 dir == snap_src->fs_info->sb->s_root ||
317 test == snap_src->fs_info->sb->s_root ||
318 test->d_inode->i_sb != snap_src->fs_info->sb) {
319 break;
320 }
321 if (S_ISLNK(test->d_inode->i_mode)) {
322 printk(KERN_INFO "Btrfs symlink in snapshot "
323 "path, failed\n");
324 error = -EMLINK;
325 btrfs_free_path(path);
326 goto out_drop_write;
327 }
328 test_oid =
329 BTRFS_I(test->d_inode)->root->root_key.objectid;
330 ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
331 path, test_oid, parent_oid);
332 if (ret == 0) {
333 printk(KERN_INFO "Btrfs snapshot creation "
334 "failed, looping\n");
335 error = -EMLINK;
336 btrfs_free_path(path);
337 goto out_drop_write;
338 }
339 btrfs_release_path(snap_src->fs_info->tree_root, path);
340 test = test->d_parent;
341 }
342create:
343 btrfs_free_path(path);
344 error = create_snapshot(snap_src, dentry, name, namelen);
345 } else {
346 error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root,
347 dentry, name, namelen);
348 }
349 if (error)
350 goto out_drop_write;
351
352 fsnotify_mkdir(parent->dentry->d_inode, dentry);
353out_drop_write:
354 mnt_drop_write(parent->mnt);
355out_dput:
356 dput(dentry);
357out_unlock:
358 mutex_unlock(&parent->dentry->d_inode->i_mutex);
359 return error;
360}
361
362
363static int btrfs_defrag_file(struct file *file)
364{
365 struct inode *inode = fdentry(file)->d_inode;
366 struct btrfs_root *root = BTRFS_I(inode)->root;
367 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
368 struct btrfs_ordered_extent *ordered;
369 struct page *page;
370 unsigned long last_index;
371 unsigned long ra_pages = root->fs_info->bdi.ra_pages;
372 unsigned long total_read = 0;
373 u64 page_start;
374 u64 page_end;
375 unsigned long i;
376 int ret;
377
378 ret = btrfs_check_free_space(root, inode->i_size, 0);
379 if (ret)
380 return -ENOSPC;
381
382 mutex_lock(&inode->i_mutex);
383 last_index = inode->i_size >> PAGE_CACHE_SHIFT;
384 for (i = 0; i <= last_index; i++) {
385 if (total_read % ra_pages == 0) {
386 btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
387 min(last_index, i + ra_pages - 1));
388 }
389 total_read++;
390again:
391 page = grab_cache_page(inode->i_mapping, i);
392 if (!page)
393 goto out_unlock;
394 if (!PageUptodate(page)) {
395 btrfs_readpage(NULL, page);
396 lock_page(page);
397 if (!PageUptodate(page)) {
398 unlock_page(page);
399 page_cache_release(page);
400 goto out_unlock;
401 }
402 }
403
404 wait_on_page_writeback(page);
405
406 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
407 page_end = page_start + PAGE_CACHE_SIZE - 1;
408 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
409
410 ordered = btrfs_lookup_ordered_extent(inode, page_start);
411 if (ordered) {
412 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
413 unlock_page(page);
414 page_cache_release(page);
415 btrfs_start_ordered_extent(inode, ordered, 1);
416 btrfs_put_ordered_extent(ordered);
417 goto again;
418 }
419 set_page_extent_mapped(page);
420
421 /*
422 * this makes sure page_mkwrite is called on the
423 * page if it is dirtied again later
424 */
425 clear_page_dirty_for_io(page);
426
427 btrfs_set_extent_delalloc(inode, page_start, page_end);
428
429 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
430 set_page_dirty(page);
431 unlock_page(page);
432 page_cache_release(page);
433 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
434 }
435
436out_unlock:
437 mutex_unlock(&inode->i_mutex);
438 return 0;
439}
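Note that btrfs_defrag_file() never moves data directly: it reads every page in, clears the dirty-for-writeback state, flags the range as delalloc and redirties the page, so the next writeback pass rewrites the file through the normal allocator and the extents come out contiguous. A sketch of triggering this from userspace with the BTRFS_IOC_DEFRAG ioctl handled later in this file; the header path and file name are assumptions, and in this version the regular-file case ignores the argument, hence NULL:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include "ioctl.h"	/* assumed copy of fs/btrfs/ioctl.h */

int main(void)
{
	int fd = open("/mnt/btrfs/fragmented.file", O_RDWR);

	if (fd < 0 || ioctl(fd, BTRFS_IOC_DEFRAG, NULL) != 0)
		perror("defrag");
	if (fd >= 0)
		close(fd);
	return 0;
}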
440
441/*
442 * Called inside transaction, so use GFP_NOFS
443 */
444
445static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
446{
447 u64 new_size;
448 u64 old_size;
449 u64 devid = 1;
450 struct btrfs_ioctl_vol_args *vol_args;
451 struct btrfs_trans_handle *trans;
452 struct btrfs_device *device = NULL;
453 char *sizestr;
454 char *devstr = NULL;
455 int ret = 0;
456 int namelen;
457 int mod = 0;
458
459 if (root->fs_info->sb->s_flags & MS_RDONLY)
460 return -EROFS;
461
462 if (!capable(CAP_SYS_ADMIN))
463 return -EPERM;
464
465 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
466
467 if (!vol_args)
468 return -ENOMEM;
469
470 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
471 ret = -EFAULT;
472 goto out;
473 }
474
475 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
476 namelen = strlen(vol_args->name);
477
478 mutex_lock(&root->fs_info->volume_mutex);
479 sizestr = vol_args->name;
480 devstr = strchr(sizestr, ':');
481 if (devstr) {
482 char *end;
483 sizestr = devstr + 1;
484 *devstr = '\0';
485 devstr = vol_args->name;
486 devid = simple_strtoull(devstr, &end, 10);
487 printk(KERN_INFO "resizing devid %llu\n", devid);
488 }
489 device = btrfs_find_device(root, devid, NULL, NULL);
490 if (!device) {
491 printk(KERN_INFO "resizer unable to find device %llu\n", devid);
492 ret = -EINVAL;
493 goto out_unlock;
494 }
495 if (!strcmp(sizestr, "max"))
496 new_size = device->bdev->bd_inode->i_size;
497 else {
498 if (sizestr[0] == '-') {
499 mod = -1;
500 sizestr++;
501 } else if (sizestr[0] == '+') {
502 mod = 1;
503 sizestr++;
504 }
505 new_size = btrfs_parse_size(sizestr);
506 if (new_size == 0) {
507 ret = -EINVAL;
508 goto out_unlock;
509 }
510 }
511
512 old_size = device->total_bytes;
513
514 if (mod < 0) {
515 if (new_size > old_size) {
516 ret = -EINVAL;
517 goto out_unlock;
518 }
519 new_size = old_size - new_size;
520 } else if (mod > 0) {
521 new_size = old_size + new_size;
522 }
523
524 if (new_size < 256 * 1024 * 1024) {
525 ret = -EINVAL;
526 goto out_unlock;
527 }
528 if (new_size > device->bdev->bd_inode->i_size) {
529 ret = -EFBIG;
530 goto out_unlock;
531 }
532
533 do_div(new_size, root->sectorsize);
534 new_size *= root->sectorsize;
535
536 printk(KERN_INFO "new size for %s is %llu\n",
537 device->name, (unsigned long long)new_size);
538
539 if (new_size > old_size) {
540 trans = btrfs_start_transaction(root, 1);
541 ret = btrfs_grow_device(trans, device, new_size);
542 btrfs_commit_transaction(trans, root);
543 } else {
544 ret = btrfs_shrink_device(device, new_size);
545 }
546
547out_unlock:
548 mutex_unlock(&root->fs_info->volume_mutex);
549out:
550 kfree(vol_args);
551 return ret;
552}
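The resize handler parses vol_args->name as "[devid:]size", where size may carry a '+' or '-' prefix for a relative change or be the literal "max"; the result is clamped between 256 MiB and the device size and rounded down to a sector multiple. A userspace sketch growing device 1 (the default devid) to its maximum; the mount point and header path are assumptions:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include "ioctl.h"	/* assumed copy of fs/btrfs/ioctl.h */

int main(void)
{
	struct btrfs_ioctl_vol_args args;
	int fd = open("/mnt/btrfs", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&args, 0, sizeof(args));
	strcpy(args.name, "max");	/* grow devid 1 to the whole device */
	if (ioctl(fd, BTRFS_IOC_RESIZE, &args) != 0)
		perror("resize");
	close(fd);
	return 0;
}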
553
554static noinline int btrfs_ioctl_snap_create(struct file *file,
555 void __user *arg, int subvol)
556{
557 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
558 struct btrfs_ioctl_vol_args *vol_args;
559 struct btrfs_dir_item *di;
560 struct btrfs_path *path;
561 struct file *src_file;
562 u64 root_dirid;
563 int namelen;
564 int ret = 0;
565
566 if (root->fs_info->sb->s_flags & MS_RDONLY)
567 return -EROFS;
568
569 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
570
571 if (!vol_args)
572 return -ENOMEM;
573
574 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
575 ret = -EFAULT;
576 goto out;
577 }
578
579 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
580 namelen = strlen(vol_args->name);
581 if (strchr(vol_args->name, '/')) {
582 ret = -EINVAL;
583 goto out;
584 }
585
586 path = btrfs_alloc_path();
587 if (!path) {
588 ret = -ENOMEM;
589 goto out;
590 }
591
592	root_dirid = root->fs_info->sb->s_root->d_inode->i_ino;
593 di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
594 path, root_dirid,
595 vol_args->name, namelen, 0);
596 btrfs_free_path(path);
597
598 if (di && !IS_ERR(di)) {
599 ret = -EEXIST;
600 goto out;
601 }
602
603 if (IS_ERR(di)) {
604 ret = PTR_ERR(di);
605 goto out;
606 }
607
608 if (subvol) {
609 ret = btrfs_mksubvol(&file->f_path, vol_args->name,
610 file->f_path.dentry->d_inode->i_mode,
611 namelen, NULL);
612 } else {
613 struct inode *src_inode;
614 src_file = fget(vol_args->fd);
615 if (!src_file) {
616 ret = -EINVAL;
617 goto out;
618 }
619
620 src_inode = src_file->f_path.dentry->d_inode;
621 if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) {
622 printk(KERN_INFO "btrfs: Snapshot src from "
623 "another FS\n");
624 ret = -EINVAL;
625 fput(src_file);
626 goto out;
627 }
628 ret = btrfs_mksubvol(&file->f_path, vol_args->name,
629 file->f_path.dentry->d_inode->i_mode,
630 namelen, BTRFS_I(src_inode)->root);
631 fput(src_file);
632 }
633
634out:
635 kfree(vol_args);
636 return ret;
637}
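The snapshot/subvolume ioctl is issued on a directory inside the filesystem: vol_args->name (which may not contain '/') names the new entry there, and for snapshots vol_args->fd must refer to a file on the same filesystem whose root becomes the snapshot source. A userspace sketch; paths and the header location are hypothetical:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include "ioctl.h"	/* assumed copy of fs/btrfs/ioctl.h */

int main(void)
{
	struct btrfs_ioctl_vol_args args;
	int src_fd = open("/mnt/btrfs/subvol", O_RDONLY);
	int dst_fd = open("/mnt/btrfs", O_RDONLY);	/* parent directory */

	if (src_fd < 0 || dst_fd < 0) {
		perror("open");
		return 1;
	}
	memset(&args, 0, sizeof(args));
	args.fd = src_fd;		/* subvolume to snapshot */
	strcpy(args.name, "snap1");	/* no '/' allowed, see check above */
	if (ioctl(dst_fd, BTRFS_IOC_SNAP_CREATE, &args) != 0)
		perror("snapshot");
	close(src_fd);
	close(dst_fd);
	return 0;
}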
638
639static int btrfs_ioctl_defrag(struct file *file)
640{
641 struct inode *inode = fdentry(file)->d_inode;
642 struct btrfs_root *root = BTRFS_I(inode)->root;
643 int ret;
644
645 ret = mnt_want_write(file->f_path.mnt);
646 if (ret)
647 return ret;
648
649 switch (inode->i_mode & S_IFMT) {
650 case S_IFDIR:
651 if (!capable(CAP_SYS_ADMIN)) {
652 ret = -EPERM;
653 goto out;
654 }
655 btrfs_defrag_root(root, 0);
656 btrfs_defrag_root(root->fs_info->extent_root, 0);
657 break;
658 case S_IFREG:
659 if (!(file->f_mode & FMODE_WRITE)) {
660 ret = -EINVAL;
661 goto out;
662 }
663 btrfs_defrag_file(file);
664 break;
665 }
666out:
667 mnt_drop_write(file->f_path.mnt);
668 return ret;
669}
670
671static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
672{
673 struct btrfs_ioctl_vol_args *vol_args;
674 int ret;
675
676 if (!capable(CAP_SYS_ADMIN))
677 return -EPERM;
678
679 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
680
681 if (!vol_args)
682 return -ENOMEM;
683
684 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
685 ret = -EFAULT;
686 goto out;
687 }
688 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
689 ret = btrfs_init_new_device(root, vol_args->name);
690
691out:
692 kfree(vol_args);
693 return ret;
694}
695
696static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
697{
698 struct btrfs_ioctl_vol_args *vol_args;
699 int ret;
700
701 if (!capable(CAP_SYS_ADMIN))
702 return -EPERM;
703
704 if (root->fs_info->sb->s_flags & MS_RDONLY)
705 return -EROFS;
706
707 vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
708
709 if (!vol_args)
710 return -ENOMEM;
711
712 if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
713 ret = -EFAULT;
714 goto out;
715 }
716 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
717 ret = btrfs_rm_device(root, vol_args->name);
718
719out:
720 kfree(vol_args);
721 return ret;
722}
723
724static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
725 u64 off, u64 olen, u64 destoff)
726{
727 struct inode *inode = fdentry(file)->d_inode;
728 struct btrfs_root *root = BTRFS_I(inode)->root;
729 struct file *src_file;
730 struct inode *src;
731 struct btrfs_trans_handle *trans;
732 struct btrfs_path *path;
733 struct extent_buffer *leaf;
734 char *buf;
735 struct btrfs_key key;
736 u32 nritems;
737 int slot;
738 int ret;
739 u64 len = olen;
740 u64 bs = root->fs_info->sb->s_blocksize;
741 u64 hint_byte;
742
743 /*
744 * TODO:
745 * - split compressed inline extents. annoying: we need to
746 * decompress into destination's address_space (the file offset
747 * may change, so source mapping won't do), then recompress (or
748 * otherwise reinsert) a subrange.
749 * - allow ranges within the same file to be cloned (provided
750 * they don't overlap)?
751 */
752
753 /* the destination must be opened for writing */
754 if (!(file->f_mode & FMODE_WRITE))
755 return -EINVAL;
756
757 ret = mnt_want_write(file->f_path.mnt);
758 if (ret)
759 return ret;
760
761 src_file = fget(srcfd);
762 if (!src_file) {
763 ret = -EBADF;
764 goto out_drop_write;
765 }
766 src = src_file->f_dentry->d_inode;
767
768 ret = -EINVAL;
769 if (src == inode)
770 goto out_fput;
771
772 ret = -EISDIR;
773 if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
774 goto out_fput;
775
776 ret = -EXDEV;
777 if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root)
778 goto out_fput;
779
780 ret = -ENOMEM;
781 buf = vmalloc(btrfs_level_size(root, 0));
782 if (!buf)
783 goto out_fput;
784
785 path = btrfs_alloc_path();
786 if (!path) {
787 vfree(buf);
788 goto out_fput;
789 }
790 path->reada = 2;
791
792 if (inode < src) {
793 mutex_lock(&inode->i_mutex);
794 mutex_lock(&src->i_mutex);
795 } else {
796 mutex_lock(&src->i_mutex);
797 mutex_lock(&inode->i_mutex);
798 }
799
800 /* determine range to clone */
801 ret = -EINVAL;
802 if (off >= src->i_size || off + len > src->i_size)
803 goto out_unlock;
804 if (len == 0)
805 olen = len = src->i_size - off;
806 /* if we extend to eof, continue to block boundary */
807 if (off + len == src->i_size)
808 len = ((src->i_size + bs-1) & ~(bs-1))
809 - off;
810
811 /* verify the end result is block aligned */
812 if ((off & (bs-1)) ||
813 ((off + len) & (bs-1)))
814 goto out_unlock;
815
816 /* do any pending delalloc/csum calc on src, one way or
817 another, and lock file content */
818 while (1) {
819 struct btrfs_ordered_extent *ordered;
820 lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
821 ordered = btrfs_lookup_first_ordered_extent(inode, off+len);
822 if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered)
823 break;
824 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
825 if (ordered)
826 btrfs_put_ordered_extent(ordered);
827 btrfs_wait_ordered_range(src, off, off+len);
828 }
829
830 trans = btrfs_start_transaction(root, 1);
831 BUG_ON(!trans);
832
833 /* punch hole in destination first */
834 btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte);
835
836 /* clone data */
837 key.objectid = src->i_ino;
838 key.type = BTRFS_EXTENT_DATA_KEY;
839 key.offset = 0;
840
841 while (1) {
842 /*
843 * note the key will change type as we walk through the
844 * tree.
845 */
846 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
847 if (ret < 0)
848 goto out;
849
850 nritems = btrfs_header_nritems(path->nodes[0]);
851 if (path->slots[0] >= nritems) {
852 ret = btrfs_next_leaf(root, path);
853 if (ret < 0)
854 goto out;
855 if (ret > 0)
856 break;
857 nritems = btrfs_header_nritems(path->nodes[0]);
858 }
859 leaf = path->nodes[0];
860 slot = path->slots[0];
861
862 btrfs_item_key_to_cpu(leaf, &key, slot);
863 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
864 key.objectid != src->i_ino)
865 break;
866
867 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
868 struct btrfs_file_extent_item *extent;
869 int type;
870 u32 size;
871 struct btrfs_key new_key;
872 u64 disko = 0, diskl = 0;
873 u64 datao = 0, datal = 0;
874 u8 comp;
875
876 size = btrfs_item_size_nr(leaf, slot);
877 read_extent_buffer(leaf, buf,
878 btrfs_item_ptr_offset(leaf, slot),
879 size);
880
881 extent = btrfs_item_ptr(leaf, slot,
882 struct btrfs_file_extent_item);
883 comp = btrfs_file_extent_compression(leaf, extent);
884 type = btrfs_file_extent_type(leaf, extent);
885 if (type == BTRFS_FILE_EXTENT_REG) {
886 disko = btrfs_file_extent_disk_bytenr(leaf,
887 extent);
888 diskl = btrfs_file_extent_disk_num_bytes(leaf,
889 extent);
890 datao = btrfs_file_extent_offset(leaf, extent);
891 datal = btrfs_file_extent_num_bytes(leaf,
892 extent);
893 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
894 /* take upper bound, may be compressed */
895 datal = btrfs_file_extent_ram_bytes(leaf,
896 extent);
897 }
898 btrfs_release_path(root, path);
899
900 if (key.offset + datal < off ||
901 key.offset >= off+len)
902 goto next;
903
904 memcpy(&new_key, &key, sizeof(new_key));
905 new_key.objectid = inode->i_ino;
906 new_key.offset = key.offset + destoff - off;
907
908 if (type == BTRFS_FILE_EXTENT_REG) {
909 ret = btrfs_insert_empty_item(trans, root, path,
910 &new_key, size);
911 if (ret)
912 goto out;
913
914 leaf = path->nodes[0];
915 slot = path->slots[0];
916 write_extent_buffer(leaf, buf,
917 btrfs_item_ptr_offset(leaf, slot),
918 size);
919
920 extent = btrfs_item_ptr(leaf, slot,
921 struct btrfs_file_extent_item);
922
923 if (off > key.offset) {
924 datao += off - key.offset;
925 datal -= off - key.offset;
926 }
927 if (key.offset + datao + datal + key.offset >
928 off + len)
929 datal = off + len - key.offset - datao;
930 /* disko == 0 means it's a hole */
931 if (!disko)
932 datao = 0;
933
934 btrfs_set_file_extent_offset(leaf, extent,
935 datao);
936 btrfs_set_file_extent_num_bytes(leaf, extent,
937 datal);
938 if (disko) {
939 inode_add_bytes(inode, datal);
940 ret = btrfs_inc_extent_ref(trans, root,
941 disko, diskl, leaf->start,
942 root->root_key.objectid,
943 trans->transid,
944 inode->i_ino);
945 BUG_ON(ret);
946 }
947 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
948 u64 skip = 0;
949 u64 trim = 0;
950 if (off > key.offset) {
951 skip = off - key.offset;
952 new_key.offset += skip;
953 }
954
955 if (key.offset + datal > off+len)
956 trim = key.offset + datal - (off+len);
957
958 if (comp && (skip || trim)) {
959 ret = -EINVAL;
960 goto out;
961 }
962 size -= skip + trim;
963 datal -= skip + trim;
964 ret = btrfs_insert_empty_item(trans, root, path,
965 &new_key, size);
966 if (ret)
967 goto out;
968
969 if (skip) {
970 u32 start =
971 btrfs_file_extent_calc_inline_size(0);
972 memmove(buf+start, buf+start+skip,
973 datal);
974 }
975
976 leaf = path->nodes[0];
977 slot = path->slots[0];
978 write_extent_buffer(leaf, buf,
979 btrfs_item_ptr_offset(leaf, slot),
980 size);
981 inode_add_bytes(inode, datal);
982 }
983
984 btrfs_mark_buffer_dirty(leaf);
985 }
986
987next:
988 btrfs_release_path(root, path);
989 key.offset++;
990 }
991 ret = 0;
992out:
993 btrfs_release_path(root, path);
994 if (ret == 0) {
995 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
996 if (destoff + olen > inode->i_size)
997 btrfs_i_size_write(inode, destoff + olen);
998 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
999 ret = btrfs_update_inode(trans, root, inode);
1000 }
1001 btrfs_end_transaction(trans, root);
1002 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1003 if (ret)
1004 vmtruncate(inode, 0);
1005out_unlock:
1006 mutex_unlock(&src->i_mutex);
1007 mutex_unlock(&inode->i_mutex);
1008 vfree(buf);
1009 btrfs_free_path(path);
1010out_fput:
1011 fput(src_file);
1012out_drop_write:
1013 mnt_drop_write(file->f_path.mnt);
1014 return ret;
1015}
1016
1017static long btrfs_ioctl_clone_range(struct file *file, void __user *argp)
1018{
1019 struct btrfs_ioctl_clone_range_args args;
1020
1021 if (copy_from_user(&args, argp, sizeof(args)))
1022 return -EFAULT;
1023 return btrfs_ioctl_clone(file, args.src_fd, args.src_offset,
1024 args.src_length, args.dest_offset);
1025}
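BTRFS_IOC_CLONE_RANGE reflinks extents instead of copying bytes: the destination must be open for writing, both files must live in the same root, and offsets and lengths must be block aligned unless the range runs to EOF, per the checks in btrfs_ioctl_clone() above. A caller-side sketch; the file names and header path are assumptions:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include "ioctl.h"	/* assumed copy of fs/btrfs/ioctl.h */

int main(void)
{
	struct btrfs_ioctl_clone_range_args args;
	int src = open("/mnt/btrfs/src", O_RDONLY);
	int dst = open("/mnt/btrfs/dst", O_RDWR | O_CREAT, 0644);

	if (src < 0 || dst < 0) {
		perror("open");
		return 1;
	}
	memset(&args, 0, sizeof(args));
	args.src_fd = src;
	args.src_offset = 0;
	args.src_length = 1 << 20;	/* must be block aligned */
	args.dest_offset = 0;
	if (ioctl(dst, BTRFS_IOC_CLONE_RANGE, &args) != 0)
		perror("clone");
	close(src);
	close(dst);
	return 0;
}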
1026
1027/*
1028 * there are many ways the trans_start and trans_end ioctls can lead
1029 * to deadlocks. They should only be used by applications that
1030 * basically own the machine, and have a very in-depth understanding
1031 * of all the possible deadlocks and enospc problems.
1032 */
1033static long btrfs_ioctl_trans_start(struct file *file)
1034{
1035 struct inode *inode = fdentry(file)->d_inode;
1036 struct btrfs_root *root = BTRFS_I(inode)->root;
1037 struct btrfs_trans_handle *trans;
1038 int ret = 0;
1039
1040 if (!capable(CAP_SYS_ADMIN))
1041 return -EPERM;
1042
1043 if (file->private_data) {
1044 ret = -EINPROGRESS;
1045 goto out;
1046 }
1047
1048 ret = mnt_want_write(file->f_path.mnt);
1049 if (ret)
1050 goto out;
1051
1052 mutex_lock(&root->fs_info->trans_mutex);
1053 root->fs_info->open_ioctl_trans++;
1054 mutex_unlock(&root->fs_info->trans_mutex);
1055
1056 trans = btrfs_start_ioctl_transaction(root, 0);
1057 if (trans)
1058 file->private_data = trans;
1059 else
1060 ret = -ENOMEM;
1061 /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/
1062out:
1063 return ret;
1064}
1065
1066/*
1067 * there are many ways the trans_start and trans_end ioctls can lead
1068 * to deadlocks. They should only be used by applications that
1069 * basically own the machine, and have a very in-depth understanding
1070 * of all the possible deadlocks and enospc problems.
1071 */
1072long btrfs_ioctl_trans_end(struct file *file)
1073{
1074 struct inode *inode = fdentry(file)->d_inode;
1075 struct btrfs_root *root = BTRFS_I(inode)->root;
1076 struct btrfs_trans_handle *trans;
1077 int ret = 0;
1078
1079 trans = file->private_data;
1080 if (!trans) {
1081 ret = -EINVAL;
1082 goto out;
1083 }
1084 btrfs_end_transaction(trans, root);
1085 file->private_data = NULL;
1086
1087 mutex_lock(&root->fs_info->trans_mutex);
1088 root->fs_info->open_ioctl_trans--;
1089 mutex_unlock(&root->fs_info->trans_mutex);
1090
1091 mnt_drop_write(file->f_path.mnt);
1092
1093out:
1094 return ret;
1095}
1096
1097long btrfs_ioctl(struct file *file, unsigned int
1098 cmd, unsigned long arg)
1099{
1100 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
1101 void __user *argp = (void __user *)arg;
1102
1103 switch (cmd) {
1104 case BTRFS_IOC_SNAP_CREATE:
1105 return btrfs_ioctl_snap_create(file, argp, 0);
1106 case BTRFS_IOC_SUBVOL_CREATE:
1107 return btrfs_ioctl_snap_create(file, argp, 1);
1108 case BTRFS_IOC_DEFRAG:
1109 return btrfs_ioctl_defrag(file);
1110 case BTRFS_IOC_RESIZE:
1111 return btrfs_ioctl_resize(root, argp);
1112 case BTRFS_IOC_ADD_DEV:
1113 return btrfs_ioctl_add_dev(root, argp);
1114 case BTRFS_IOC_RM_DEV:
1115 return btrfs_ioctl_rm_dev(root, argp);
1116 case BTRFS_IOC_BALANCE:
1117 return btrfs_balance(root->fs_info->dev_root);
1118 case BTRFS_IOC_CLONE:
1119 return btrfs_ioctl_clone(file, arg, 0, 0, 0);
1120 case BTRFS_IOC_CLONE_RANGE:
1121 return btrfs_ioctl_clone_range(file, argp);
1122 case BTRFS_IOC_TRANS_START:
1123 return btrfs_ioctl_trans_start(file);
1124 case BTRFS_IOC_TRANS_END:
1125 return btrfs_ioctl_trans_end(file);
1126 case BTRFS_IOC_SYNC:
1127 btrfs_sync_fs(file->f_dentry->d_sb, 1);
1128 return 0;
1129 }
1130
1131 return -ENOTTY;
1132}
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
new file mode 100644
index 000000000000..78049ea208db
--- /dev/null
+++ b/fs/btrfs/ioctl.h
@@ -0,0 +1,67 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __IOCTL_
20#define __IOCTL_
21#include <linux/ioctl.h>
22
23#define BTRFS_IOCTL_MAGIC 0x94
24#define BTRFS_VOL_NAME_MAX 255
25#define BTRFS_PATH_NAME_MAX 3072
26
27struct btrfs_ioctl_vol_args {
28 __s64 fd;
29 char name[BTRFS_PATH_NAME_MAX + 1];
30};
31
32#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
33 struct btrfs_ioctl_vol_args)
34#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
35 struct btrfs_ioctl_vol_args)
36#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \
37 struct btrfs_ioctl_vol_args)
38#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
39 struct btrfs_ioctl_vol_args)
40/* trans start and trans end are dangerous, and only for
41 * use by applications that know how to avoid the
42 * resulting deadlocks
43 */
44#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6)
45#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7)
46#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8)
47
48#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)
49#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \
50 struct btrfs_ioctl_vol_args)
51#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \
52 struct btrfs_ioctl_vol_args)
53#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
54 struct btrfs_ioctl_vol_args)
55struct btrfs_ioctl_clone_range_args {
56 __s64 src_fd;
57 __u64 src_offset, src_length;
58 __u64 dest_offset;
59};
60
61#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
62 struct btrfs_ioctl_clone_range_args)
63
64#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
65 struct btrfs_ioctl_vol_args)
66
67#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
new file mode 100644
index 000000000000..39bae7761db6
--- /dev/null
+++ b/fs/btrfs/locking.c
@@ -0,0 +1,88 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/gfp.h>
20#include <linux/pagemap.h>
21#include <linux/spinlock.h>
22#include <linux/page-flags.h>
23#include <asm/bug.h>
24#include "ctree.h"
25#include "extent_io.h"
26#include "locking.h"
27
28/*
29 * locks the per buffer mutex in an extent buffer. This uses adaptive locks
30 * and the spin is not tuned very extensively. The spinning does make a big
31 * difference in almost every workload, but spinning for the right amount of
32 * time needs some help.
33 *
34 * In general, we want to spin as long as the lock holder is doing btree
35 * searches, and we should give up if they are in more expensive code.
36 */
37
38int btrfs_tree_lock(struct extent_buffer *eb)
39{
40 int i;
41
42 if (mutex_trylock(&eb->mutex))
43 return 0;
44 for (i = 0; i < 512; i++) {
45 cpu_relax();
46 if (mutex_trylock(&eb->mutex))
47 return 0;
48 }
49 cpu_relax();
50 mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
51 return 0;
52}
53
54int btrfs_try_tree_lock(struct extent_buffer *eb)
55{
56 return mutex_trylock(&eb->mutex);
57}
58
59int btrfs_tree_unlock(struct extent_buffer *eb)
60{
61 mutex_unlock(&eb->mutex);
62 return 0;
63}
64
65int btrfs_tree_locked(struct extent_buffer *eb)
66{
67 return mutex_is_locked(&eb->mutex);
68}
69
70/*
71 * btrfs_search_slot uses this to decide if it should drop its locks
72 * before doing something expensive like allocating free blocks for cow.
73 */
74int btrfs_path_lock_waiting(struct btrfs_path *path, int level)
75{
76 int i;
77 struct extent_buffer *eb;
78 for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) {
79 eb = path->nodes[i];
80 if (!eb)
81 break;
82 smp_mb();
83 if (!list_empty(&eb->mutex.wait_list))
84 return 1;
85 }
86 return 0;
87}
88
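btrfs_tree_lock() above is adaptive: a bounded trylock spin (512 iterations separated by cpu_relax()) catches holders doing short btree work, then it gives up and takes the mutex the blocking way. A userspace analogue of the same strategy with POSIX threads; the pause instruction stands in for cpu_relax() and assumes x86:

#include <pthread.h>

static void adaptive_lock(pthread_mutex_t *m)
{
	int i;

	if (pthread_mutex_trylock(m) == 0)
		return;
	for (i = 0; i < 512; i++) {
		__asm__ __volatile__("pause");	/* x86-only cpu_relax() */
		if (pthread_mutex_trylock(m) == 0)
			return;
	}
	pthread_mutex_lock(m);	/* stop spinning, sleep until free */
}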
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
new file mode 100644
index 000000000000..bc1faef12519
--- /dev/null
+++ b/fs/btrfs/locking.h
@@ -0,0 +1,27 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_LOCKING_
20#define __BTRFS_LOCKING_
21
22int btrfs_tree_lock(struct extent_buffer *eb);
23int btrfs_tree_unlock(struct extent_buffer *eb);
24int btrfs_tree_locked(struct extent_buffer *eb);
25int btrfs_try_tree_lock(struct extent_buffer *eb);
26int btrfs_path_lock_waiting(struct btrfs_path *path, int level);
27#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
new file mode 100644
index 000000000000..a20940170274
--- /dev/null
+++ b/fs/btrfs/ordered-data.c
@@ -0,0 +1,730 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/gfp.h>
20#include <linux/slab.h>
21#include <linux/blkdev.h>
22#include <linux/writeback.h>
23#include <linux/pagevec.h>
24#include "ctree.h"
25#include "transaction.h"
26#include "btrfs_inode.h"
27#include "extent_io.h"
28
29static u64 entry_end(struct btrfs_ordered_extent *entry)
30{
31 if (entry->file_offset + entry->len < entry->file_offset)
32 return (u64)-1;
33 return entry->file_offset + entry->len;
34}
35
36/* returns NULL if the insertion worked, or the existing node that was
37 * found in the tree
38 */
39static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
40 struct rb_node *node)
41{
42 struct rb_node **p = &root->rb_node;
43 struct rb_node *parent = NULL;
44 struct btrfs_ordered_extent *entry;
45
46 while (*p) {
47 parent = *p;
48 entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
49
50 if (file_offset < entry->file_offset)
51 p = &(*p)->rb_left;
52 else if (file_offset >= entry_end(entry))
53 p = &(*p)->rb_right;
54 else
55 return parent;
56 }
57
58 rb_link_node(node, parent, p);
59 rb_insert_color(node, root);
60 return NULL;
61}
62
63/*
64 * look for a given offset in the tree, and if it can't be found return the
65 * first lesser offset
66 */
67static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
68 struct rb_node **prev_ret)
69{
70 struct rb_node *n = root->rb_node;
71 struct rb_node *prev = NULL;
72 struct rb_node *test;
73 struct btrfs_ordered_extent *entry;
74 struct btrfs_ordered_extent *prev_entry = NULL;
75
76 while (n) {
77 entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
78 prev = n;
79 prev_entry = entry;
80
81 if (file_offset < entry->file_offset)
82 n = n->rb_left;
83 else if (file_offset >= entry_end(entry))
84 n = n->rb_right;
85 else
86 return n;
87 }
88 if (!prev_ret)
89 return NULL;
90
91 while (prev && file_offset >= entry_end(prev_entry)) {
92 test = rb_next(prev);
93 if (!test)
94 break;
95 prev_entry = rb_entry(test, struct btrfs_ordered_extent,
96 rb_node);
97 if (file_offset < entry_end(prev_entry))
98 break;
99
100 prev = test;
101 }
102 if (prev)
103 prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
104 rb_node);
105 while (prev && file_offset < entry_end(prev_entry)) {
106 test = rb_prev(prev);
107 if (!test)
108 break;
109 prev_entry = rb_entry(test, struct btrfs_ordered_extent,
110 rb_node);
111 prev = test;
112 }
113 *prev_ret = prev;
114 return NULL;
115}
116
117/*
118 * helper to check if a given offset is inside a given entry
119 */
120static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
121{
122 if (file_offset < entry->file_offset ||
123 entry->file_offset + entry->len <= file_offset)
124 return 0;
125 return 1;
126}
127
128/*
129 * look for the first ordered struct that has this offset, otherwise
130 * the first one less than this offset
131 */
132static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
133 u64 file_offset)
134{
135 struct rb_root *root = &tree->tree;
136 struct rb_node *prev;
137 struct rb_node *ret;
138 struct btrfs_ordered_extent *entry;
139
140 if (tree->last) {
141 entry = rb_entry(tree->last, struct btrfs_ordered_extent,
142 rb_node);
143 if (offset_in_entry(entry, file_offset))
144 return tree->last;
145 }
146 ret = __tree_search(root, file_offset, &prev);
147 if (!ret)
148 ret = prev;
149 if (ret)
150 tree->last = ret;
151 return ret;
152}
153
154/* allocate and add a new ordered_extent into the per-inode tree.
155 * file_offset is the logical offset in the file
156 *
157 * start is the disk block number of an extent already reserved in the
158 * extent allocation tree
159 *
160 * len is the length of the extent
161 *
162 * This also sets the EXTENT_ORDERED bit on the range in the inode.
163 *
164 * The tree is given a single reference on the ordered extent that was
165 * inserted.
166 */
167int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
168 u64 start, u64 len, u64 disk_len, int type)
169{
170 struct btrfs_ordered_inode_tree *tree;
171 struct rb_node *node;
172 struct btrfs_ordered_extent *entry;
173
174 tree = &BTRFS_I(inode)->ordered_tree;
175 entry = kzalloc(sizeof(*entry), GFP_NOFS);
176 if (!entry)
177 return -ENOMEM;
178
179 mutex_lock(&tree->mutex);
180 entry->file_offset = file_offset;
181 entry->start = start;
182 entry->len = len;
183 entry->disk_len = disk_len;
184 entry->inode = inode;
185 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
186 set_bit(type, &entry->flags);
187
188 /* one ref for the tree */
189 atomic_set(&entry->refs, 1);
190 init_waitqueue_head(&entry->wait);
191 INIT_LIST_HEAD(&entry->list);
192 INIT_LIST_HEAD(&entry->root_extent_list);
193
194 node = tree_insert(&tree->tree, file_offset,
195 &entry->rb_node);
196 BUG_ON(node);
197
198 set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
199 entry_end(entry) - 1, GFP_NOFS);
200
201 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
202 list_add_tail(&entry->root_extent_list,
203 &BTRFS_I(inode)->root->fs_info->ordered_extents);
204 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
205
206 mutex_unlock(&tree->mutex);
207 BUG_ON(node);
208 return 0;
209}
210
211/*
212 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
213 * when an ordered extent is finished. If the list covers more than one
214 * ordered extent, it is split across multiples.
215 */
216int btrfs_add_ordered_sum(struct inode *inode,
217 struct btrfs_ordered_extent *entry,
218 struct btrfs_ordered_sum *sum)
219{
220 struct btrfs_ordered_inode_tree *tree;
221
222 tree = &BTRFS_I(inode)->ordered_tree;
223 mutex_lock(&tree->mutex);
224 list_add_tail(&sum->list, &entry->list);
225 mutex_unlock(&tree->mutex);
226 return 0;
227}
228
229/*
230 * this is used to account for finished IO across a given range
231 * of the file. The IO should not span ordered extents. If
232 * a given ordered_extent is completely done, 1 is returned, otherwise
233 * 0.
234 *
235 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
236 * to make sure this function only returns 1 once for a given ordered extent.
237 */
238int btrfs_dec_test_ordered_pending(struct inode *inode,
239 u64 file_offset, u64 io_size)
240{
241 struct btrfs_ordered_inode_tree *tree;
242 struct rb_node *node;
243 struct btrfs_ordered_extent *entry;
244 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
245 int ret;
246
247 tree = &BTRFS_I(inode)->ordered_tree;
248 mutex_lock(&tree->mutex);
249 clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
250 GFP_NOFS);
251 node = tree_search(tree, file_offset);
252 if (!node) {
253 ret = 1;
254 goto out;
255 }
256
257 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
258 if (!offset_in_entry(entry, file_offset)) {
259 ret = 1;
260 goto out;
261 }
262
263 ret = test_range_bit(io_tree, entry->file_offset,
264 entry->file_offset + entry->len - 1,
265 EXTENT_ORDERED, 0);
266 if (ret == 0)
267 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
268out:
269 mutex_unlock(&tree->mutex);
270 return ret == 0;
271}
272
273/*
274 * used to drop a reference on an ordered extent. This will free
275 * the extent if the last reference is dropped
276 */
277int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
278{
279 struct list_head *cur;
280 struct btrfs_ordered_sum *sum;
281
282 if (atomic_dec_and_test(&entry->refs)) {
283 while (!list_empty(&entry->list)) {
284 cur = entry->list.next;
285 sum = list_entry(cur, struct btrfs_ordered_sum, list);
286 list_del(&sum->list);
287 kfree(sum);
288 }
289 kfree(entry);
290 }
291 return 0;
292}
293
294/*
295 * remove an ordered extent from the tree. No references are dropped
296 * but anyone waiting on this extent is woken up.
297 */
298int btrfs_remove_ordered_extent(struct inode *inode,
299 struct btrfs_ordered_extent *entry)
300{
301 struct btrfs_ordered_inode_tree *tree;
302 struct rb_node *node;
303
304 tree = &BTRFS_I(inode)->ordered_tree;
305 mutex_lock(&tree->mutex);
306 node = &entry->rb_node;
307 rb_erase(node, &tree->tree);
308 tree->last = NULL;
309 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
310
311 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
312 list_del_init(&entry->root_extent_list);
313 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
314
315 mutex_unlock(&tree->mutex);
316 wake_up(&entry->wait);
317 return 0;
318}
319
320/*
321 * wait for all the ordered extents in a root. This is done when balancing
322 * space between drives.
323 */
324int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
325{
326 struct list_head splice;
327 struct list_head *cur;
328 struct btrfs_ordered_extent *ordered;
329 struct inode *inode;
330
331 INIT_LIST_HEAD(&splice);
332
333 spin_lock(&root->fs_info->ordered_extent_lock);
334 list_splice_init(&root->fs_info->ordered_extents, &splice);
335 while (!list_empty(&splice)) {
336 cur = splice.next;
337 ordered = list_entry(cur, struct btrfs_ordered_extent,
338 root_extent_list);
339 if (nocow_only &&
340 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
341 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
342 list_move(&ordered->root_extent_list,
343 &root->fs_info->ordered_extents);
344 cond_resched_lock(&root->fs_info->ordered_extent_lock);
345 continue;
346 }
347
348 list_del_init(&ordered->root_extent_list);
349 atomic_inc(&ordered->refs);
350
351 /*
352 * the inode may be getting freed (in sys_unlink path).
353 */
354 inode = igrab(ordered->inode);
355
356 spin_unlock(&root->fs_info->ordered_extent_lock);
357
358 if (inode) {
359 btrfs_start_ordered_extent(inode, ordered, 1);
360 btrfs_put_ordered_extent(ordered);
361 iput(inode);
362 } else {
363 btrfs_put_ordered_extent(ordered);
364 }
365
366 spin_lock(&root->fs_info->ordered_extent_lock);
367 }
368 spin_unlock(&root->fs_info->ordered_extent_lock);
369 return 0;
370}
371
372/*
373 * Used to start IO or wait for a given ordered extent to finish.
374 *
375 * If wait is one, this effectively waits on page writeback for all the pages
376 * in the extent, and it waits on the io completion code to insert
377 * metadata into the btree corresponding to the extent
378 */
379void btrfs_start_ordered_extent(struct inode *inode,
380 struct btrfs_ordered_extent *entry,
381 int wait)
382{
383 u64 start = entry->file_offset;
384 u64 end = start + entry->len - 1;
385
386 /*
387 * pages in the range can be dirty, clean or writeback. We
388 * start IO on any dirty ones so the wait doesn't stall waiting
389 * for pdflush to find them
390 */
391 btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL);
392 if (wait) {
393 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
394 &entry->flags));
395 }
396}
397
398/*
399 * Used to wait on ordered extents across a large range of bytes.
400 */
401int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
402{
403 u64 end;
404 u64 orig_end;
405 u64 wait_end;
406 struct btrfs_ordered_extent *ordered;
407
408 if (start + len < start) {
409 orig_end = INT_LIMIT(loff_t);
410 } else {
411 orig_end = start + len - 1;
412 if (orig_end > INT_LIMIT(loff_t))
413 orig_end = INT_LIMIT(loff_t);
414 }
415 wait_end = orig_end;
416again:
417 /* start IO across the range first to instantiate any delalloc
418 * extents
419 */
420 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
421
422 /* The compression code will leave pages locked but return from
423 * writepage without setting the page writeback. Starting again
424 * with WB_SYNC_ALL will end up waiting for the IO to actually start.
425 */
426 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
427
428 btrfs_wait_on_page_writeback_range(inode->i_mapping,
429 start >> PAGE_CACHE_SHIFT,
430 orig_end >> PAGE_CACHE_SHIFT);
431
432 end = orig_end;
433 while (1) {
434 ordered = btrfs_lookup_first_ordered_extent(inode, end);
435 if (!ordered)
436 break;
437 if (ordered->file_offset > orig_end) {
438 btrfs_put_ordered_extent(ordered);
439 break;
440 }
441 if (ordered->file_offset + ordered->len < start) {
442 btrfs_put_ordered_extent(ordered);
443 break;
444 }
445 btrfs_start_ordered_extent(inode, ordered, 1);
446 end = ordered->file_offset;
447 btrfs_put_ordered_extent(ordered);
448 if (end == 0 || end == start)
449 break;
450 end--;
451 }
452 if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
453 EXTENT_ORDERED | EXTENT_DELALLOC, 0)) {
454 schedule_timeout(1);
455 goto again;
456 }
457 return 0;
458}
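/*
 * Example of the wraparound clamp above: a caller that wants "the whole
 * file" can pass len == (u64)-1; start + len then wraps past zero, so
 * orig_end is pinned to INT_LIMIT(loff_t) and the wait covers everything
 * from 'start' to the largest possible offset.
 */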
459
460/*
461 * find an ordered extent corresponding to file_offset. return NULL if
462 * nothing is found, otherwise take a reference on the extent and return it
463 */
464struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
465 u64 file_offset)
466{
467 struct btrfs_ordered_inode_tree *tree;
468 struct rb_node *node;
469 struct btrfs_ordered_extent *entry = NULL;
470
471 tree = &BTRFS_I(inode)->ordered_tree;
472 mutex_lock(&tree->mutex);
473 node = tree_search(tree, file_offset);
474 if (!node)
475 goto out;
476
477 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
478 if (!offset_in_entry(entry, file_offset))
479 entry = NULL;
480 if (entry)
481 atomic_inc(&entry->refs);
482out:
483 mutex_unlock(&tree->mutex);
484 return entry;
485}
486
487/*
488 * lookup and return any extent before 'file_offset'. NULL is returned
489 * if none is found
490 */
491struct btrfs_ordered_extent *
492btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
493{
494 struct btrfs_ordered_inode_tree *tree;
495 struct rb_node *node;
496 struct btrfs_ordered_extent *entry = NULL;
497
498 tree = &BTRFS_I(inode)->ordered_tree;
499 mutex_lock(&tree->mutex);
500 node = tree_search(tree, file_offset);
501 if (!node)
502 goto out;
503
504 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
505 atomic_inc(&entry->refs);
506out:
507 mutex_unlock(&tree->mutex);
508 return entry;
509}
510
511/*
512 * After an extent is done, call this to conditionally update the on disk
513 * i_size. i_size is updated to cover any fully written part of the file.
514 */
515int btrfs_ordered_update_i_size(struct inode *inode,
516 struct btrfs_ordered_extent *ordered)
517{
518 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
519 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
520 u64 disk_i_size;
521 u64 new_i_size;
522 u64 i_size_test;
523 struct rb_node *node;
524 struct btrfs_ordered_extent *test;
525
526 mutex_lock(&tree->mutex);
527 disk_i_size = BTRFS_I(inode)->disk_i_size;
528
529 /*
530 * if the disk i_size is already at the inode->i_size, or
531 * this ordered extent is inside the disk i_size, we're done
532 */
533 if (disk_i_size >= inode->i_size ||
534 ordered->file_offset + ordered->len <= disk_i_size) {
535 goto out;
536 }
537
538 /*
539 * we can't update the disk_i_size if there are delalloc bytes
540 * between disk_i_size and this ordered extent
541 */
542 if (test_range_bit(io_tree, disk_i_size,
543 ordered->file_offset + ordered->len - 1,
544 EXTENT_DELALLOC, 0)) {
545 goto out;
546 }
547 /*
548 * walk backward from this ordered extent to disk_i_size.
549 * if we find an ordered extent then we can't update disk i_size
550 * yet
551 */
552 node = &ordered->rb_node;
553 while (1) {
554 node = rb_prev(node);
555 if (!node)
556 break;
557 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
558 if (test->file_offset + test->len <= disk_i_size)
559 break;
560 if (test->file_offset >= inode->i_size)
561 break;
562 if (test->file_offset >= disk_i_size)
563 goto out;
564 }
565 new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode));
566
567 /*
568 * at this point, we know we can safely update i_size to at least
569 * the offset from this ordered extent. But, we need to
570 * walk forward and see if ios from higher up in the file have
571 * finished.
572 */
573 node = rb_next(&ordered->rb_node);
574 i_size_test = 0;
575 if (node) {
576 /*
577 * do we have an area where IO might have finished
578 * between our ordered extent and the next one?
579 */
580 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
581 if (test->file_offset > entry_end(ordered))
582 i_size_test = test->file_offset;
583 } else {
584 i_size_test = i_size_read(inode);
585 }
586
587 /*
588 * i_size_test is the end of a region after this ordered
589 * extent where there are no ordered extents. As long as there
590 * are no delalloc bytes in this area, it is safe to update
591 * disk_i_size to the end of the region.
592 */
593 if (i_size_test > entry_end(ordered) &&
594 !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1,
595 EXTENT_DELALLOC, 0)) {
596 new_i_size = min_t(u64, i_size_test, i_size_read(inode));
597 }
598 BTRFS_I(inode)->disk_i_size = new_i_size;
599out:
600 mutex_unlock(&tree->mutex);
601 return 0;
602}
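/*
 * Worked example for the update above, assuming 4K units: with
 * disk_i_size == 0, i_size == 12K and this ordered extent covering
 * [0, 4K), new_i_size starts as min(4K, 12K) == 4K. If the next ordered
 * extent begins at 8K and [4K, 8K) carries no delalloc bits, i_size_test
 * becomes 8K and disk_i_size jumps straight to 8K.
 */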
603
604/*
605 * search the ordered extents for one corresponding to 'offset' and
606 * try to find a checksum. This is used because we allow pages to
607 * be reclaimed before their checksum is actually put into the btree
608 */
609int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
610 u32 *sum)
611{
612 struct btrfs_ordered_sum *ordered_sum;
613 struct btrfs_sector_sum *sector_sums;
614 struct btrfs_ordered_extent *ordered;
615 struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
616 struct list_head *cur;
617 unsigned long num_sectors;
618 unsigned long i;
619 u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
620 int ret = 1;
621
622 ordered = btrfs_lookup_ordered_extent(inode, offset);
623 if (!ordered)
624 return 1;
625
626 mutex_lock(&tree->mutex);
627 list_for_each_prev(cur, &ordered->list) {
628 ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list);
629 if (disk_bytenr >= ordered_sum->bytenr) {
630 num_sectors = ordered_sum->len / sectorsize;
631 sector_sums = ordered_sum->sums;
632 for (i = 0; i < num_sectors; i++) {
633 if (sector_sums[i].bytenr == disk_bytenr) {
634 *sum = sector_sums[i].sum;
635 ret = 0;
636 goto out;
637 }
638 }
639 }
640 }
641out:
642 mutex_unlock(&tree->mutex);
643 btrfs_put_ordered_extent(ordered);
644 return ret;
645}
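/*
 * A hypothetical read-side sketch (names here are illustrative): when a
 * checksum is not yet in the csum tree, it may still be pending on an
 * in-flight ordered extent:
 *
 *	u32 csum;
 *	if (btrfs_find_ordered_sum(inode, file_offset, bytenr, &csum) == 0)
 *		csum now holds the sector sum for 'bytenr'
 *
 * A return of 1 means no matching sum is queued here.
 */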
646
647
648/**
649 * taken from mm/filemap.c because it isn't exported
650 *
651 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
652 * @mapping: address space structure to write
653 * @start: offset in bytes where the range starts
654 * @end: offset in bytes where the range ends (inclusive)
655 * @sync_mode: enable synchronous operation
656 *
657 * Start writeback against all of a mapping's dirty pages that lie
658 * within the byte offsets <start, end> inclusive.
659 *
660 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
661 * opposed to a regular memory cleansing writeback. The difference between
662 * these two operations is that if a dirty page/buffer is encountered, it must
663 * be waited upon, and not just skipped over.
664 */
665int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
666 loff_t end, int sync_mode)
667{
668 struct writeback_control wbc = {
669 .sync_mode = sync_mode,
670 .nr_to_write = mapping->nrpages * 2,
671 .range_start = start,
672 .range_end = end,
673 .for_writepages = 1,
674 };
675 return btrfs_writepages(mapping, &wbc);
676}
677
678/**
679 * taken from mm/filemap.c because it isn't exported
680 *
681 * wait_on_page_writeback_range - wait for writeback to complete
682 * @mapping: target address_space
683 * @start: beginning page index
684 * @end: ending page index
685 *
686 * Wait for writeback to complete against pages indexed by start->end
687 * inclusive
688 */
689int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
690 pgoff_t start, pgoff_t end)
691{
692 struct pagevec pvec;
693 int nr_pages;
694 int ret = 0;
695 pgoff_t index;
696
697 if (end < start)
698 return 0;
699
700 pagevec_init(&pvec, 0);
701 index = start;
702 while ((index <= end) &&
703 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
704 PAGECACHE_TAG_WRITEBACK,
705 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
706 unsigned i;
707
708 for (i = 0; i < nr_pages; i++) {
709 struct page *page = pvec.pages[i];
710
711 /* until radix tree lookup accepts end_index */
712 if (page->index > end)
713 continue;
714
715 wait_on_page_writeback(page);
716 if (PageError(page))
717 ret = -EIO;
718 }
719 pagevec_release(&pvec);
720 cond_resched();
721 }
722
723 /* Check for outstanding write errors */
724 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
725 ret = -ENOSPC;
726 if (test_and_clear_bit(AS_EIO, &mapping->flags))
727 ret = -EIO;
728
729 return ret;
730}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
new file mode 100644
index 000000000000..ab66d5e8d6d6
--- /dev/null
+++ b/fs/btrfs/ordered-data.h
@@ -0,0 +1,158 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_ORDERED_DATA__
20#define __BTRFS_ORDERED_DATA__
21
22/* one of these per inode */
23struct btrfs_ordered_inode_tree {
24 struct mutex mutex;
25 struct rb_root tree;
26 struct rb_node *last;
27};
28
29/*
30 * these are used to collect checksums done just before bios submission.
31 * They are attached via a list to the ordered extent, and
32 * checksum items are inserted into the tree after all the blocks in
33 * the ordered extent are on disk
34 */
35struct btrfs_sector_sum {
36 /* bytenr on disk */
37 u64 bytenr;
38 u32 sum;
39};
40
41struct btrfs_ordered_sum {
42 /* bytenr is the start of this extent on disk */
43 u64 bytenr;
44
45 /*
46 * this is the length in bytes covered by the sums array below.
47 */
48 unsigned long len;
49 struct list_head list;
50 /* last field is a variable length array of btrfs_sector_sums */
51 struct btrfs_sector_sum sums[];
52};
53
54/*
55 * bits for the flags field:
56 *
57 * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written.
58 * It is used to make sure metadata is inserted into the tree only once
59 * per extent.
60 *
61 * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the
62 * rbtree, just before waking any waiters. It is used to indicate the
63 * IO is done and any metadata is inserted into the tree.
64 */
65#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */
66
67#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */
68
69#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
70
71#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
72
73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
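/*
 * All of the above are bit numbers for the atomic bitops on
 * btrfs_ordered_extent->flags; for example, waiters sleep until
 * test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags) turns true, and
 * btrfs_remove_ordered_extent sets that bit just before waking them.
 */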
74
75struct btrfs_ordered_extent {
76 /* logical offset in the file */
77 u64 file_offset;
78
79 /* disk byte number */
80 u64 start;
81
82 /* ram length of the extent in bytes */
83 u64 len;
84
85 /* extent length on disk */
86 u64 disk_len;
87
88 /* flags (described above) */
89 unsigned long flags;
90
91 /* reference count */
92 atomic_t refs;
93
94 /* the inode we belong to */
95 struct inode *inode;
96
97 /* list of checksums for insertion when the extent io is done */
98 struct list_head list;
99
100 /* used to wait for the BTRFS_ORDERED_COMPLETE bit */
101 wait_queue_head_t wait;
102
103 /* our friendly rbtree entry */
104 struct rb_node rb_node;
105
106 /* a per root list of all the pending ordered extents */
107 struct list_head root_extent_list;
108};
109
110
111/*
112 * calculates the total size you need to allocate for an ordered sum
113 * structure spanning 'bytes' in the file
114 */
115static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
116 unsigned long bytes)
117{
118 unsigned long num_sectors = (bytes + root->sectorsize - 1) /
119 root->sectorsize;
120 num_sectors++;
121 return sizeof(struct btrfs_ordered_sum) +
122 num_sectors * sizeof(struct btrfs_sector_sum);
123}
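/*
 * Example with a 4K sectorsize: bytes == 16K rounds up to 4 sectors, and
 * the helper reserves one extra slot, so the allocation is
 * sizeof(struct btrfs_ordered_sum) + 5 * sizeof(struct btrfs_sector_sum).
 */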
124
125static inline void
126btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
127{
128 mutex_init(&t->mutex);
129 t->tree.rb_node = NULL;
130 t->last = NULL;
131}
132
133int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
134int btrfs_remove_ordered_extent(struct inode *inode,
135 struct btrfs_ordered_extent *entry);
136int btrfs_dec_test_ordered_pending(struct inode *inode,
137 u64 file_offset, u64 io_size);
138int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
139 u64 start, u64 len, u64 disk_len, int type);
140int btrfs_add_ordered_sum(struct inode *inode,
141 struct btrfs_ordered_extent *entry,
142 struct btrfs_ordered_sum *sum);
143struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
144 u64 file_offset);
145void btrfs_start_ordered_extent(struct inode *inode,
146 struct btrfs_ordered_extent *entry, int wait);
147int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
148struct btrfs_ordered_extent *
149btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset);
150int btrfs_ordered_update_i_size(struct inode *inode,
151 struct btrfs_ordered_extent *ordered);
152int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
153int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
154 pgoff_t start, pgoff_t end);
155int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
156 loff_t end, int sync_mode);
157int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
158#endif
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
new file mode 100644
index 000000000000..3c0d52af4f80
--- /dev/null
+++ b/fs/btrfs/orphan.c
@@ -0,0 +1,67 @@
1/*
2 * Copyright (C) 2008 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21
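/*
 * Orphan items record objects (the object id goes in key.offset) that may
 * need cleanup after a crash, e.g. inodes whose unlink was still in
 * flight. They all share BTRFS_ORPHAN_OBJECTID as the key objectid so
 * recovery can scan them as one contiguous key range.
 */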
22int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root, u64 offset)
24{
25 struct btrfs_path *path;
26 struct btrfs_key key;
27 int ret = 0;
28
29 key.objectid = BTRFS_ORPHAN_OBJECTID;
30 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
31 key.offset = offset;
32
33 path = btrfs_alloc_path();
34 if (!path)
35 return -ENOMEM;
36
37 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
38
39 btrfs_free_path(path);
40 return ret;
41}
42
43int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
44 struct btrfs_root *root, u64 offset)
45{
46 struct btrfs_path *path;
47 struct btrfs_key key;
48 int ret = 0;
49
50 key.objectid = BTRFS_ORPHAN_OBJECTID;
51 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
52 key.offset = offset;
53
54 path = btrfs_alloc_path();
55 if (!path)
56 return -ENOMEM;
57
58 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
59 if (ret)
60 goto out;
61
62 ret = btrfs_del_item(trans, root, path);
63
64out:
65 btrfs_free_path(path);
66 return ret;
67}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
new file mode 100644
index 000000000000..5f8f218c1005
--- /dev/null
+++ b/fs/btrfs/print-tree.c
@@ -0,0 +1,216 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "print-tree.h"
22
23static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
24{
25 int num_stripes = btrfs_chunk_num_stripes(eb, chunk);
26 int i;
27 printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu "
28 "num_stripes %d\n",
29 (unsigned long long)btrfs_chunk_length(eb, chunk),
30 (unsigned long long)btrfs_chunk_owner(eb, chunk),
31 (unsigned long long)btrfs_chunk_type(eb, chunk),
32 num_stripes);
33 for (i = 0 ; i < num_stripes ; i++) {
34 printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i,
35 (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i),
36 (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i));
37 }
38}
39static void print_dev_item(struct extent_buffer *eb,
40 struct btrfs_dev_item *dev_item)
41{
42 printk(KERN_INFO "\t\tdev item devid %llu "
43 "total_bytes %llu bytes used %llu\n",
44 (unsigned long long)btrfs_device_id(eb, dev_item),
45 (unsigned long long)btrfs_device_total_bytes(eb, dev_item),
46 (unsigned long long)btrfs_device_bytes_used(eb, dev_item));
47}
48void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
49{
50 int i;
51 u32 nr = btrfs_header_nritems(l);
52 struct btrfs_item *item;
53 struct btrfs_extent_item *ei;
54 struct btrfs_root_item *ri;
55 struct btrfs_dir_item *di;
56 struct btrfs_inode_item *ii;
57 struct btrfs_block_group_item *bi;
58 struct btrfs_file_extent_item *fi;
59 struct btrfs_key key;
60 struct btrfs_key found_key;
61 struct btrfs_extent_ref *ref;
62 struct btrfs_dev_extent *dev_extent;
63 u32 type;
64
65 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
66 (unsigned long long)btrfs_header_bytenr(l), nr,
67 btrfs_leaf_free_space(root, l));
68 for (i = 0 ; i < nr ; i++) {
69 item = btrfs_item_nr(l, i);
70 btrfs_item_key_to_cpu(l, &key, i);
71 type = btrfs_key_type(&key);
72 printk(KERN_INFO "\titem %d key (%llu %x %llu) itemoff %d "
73 "itemsize %d\n",
74 i,
75 (unsigned long long)key.objectid, type,
76 (unsigned long long)key.offset,
77 btrfs_item_offset(l, item), btrfs_item_size(l, item));
78 switch (type) {
79 case BTRFS_INODE_ITEM_KEY:
80 ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
81 printk(KERN_INFO "\t\tinode generation %llu size %llu "
82 "mode %o\n",
83 (unsigned long long)
84 btrfs_inode_generation(l, ii),
85 (unsigned long long)btrfs_inode_size(l, ii),
86 btrfs_inode_mode(l, ii));
87 break;
88 case BTRFS_DIR_ITEM_KEY:
89 di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
90 btrfs_dir_item_key_to_cpu(l, di, &found_key);
91 printk(KERN_INFO "\t\tdir oid %llu type %u\n",
92 (unsigned long long)found_key.objectid,
93 btrfs_dir_type(l, di));
94 break;
95 case BTRFS_ROOT_ITEM_KEY:
96 ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
97 printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n",
98 (unsigned long long)
99 btrfs_disk_root_bytenr(l, ri),
100 btrfs_disk_root_refs(l, ri));
101 break;
102 case BTRFS_EXTENT_ITEM_KEY:
103 ei = btrfs_item_ptr(l, i, struct btrfs_extent_item);
104 printk(KERN_INFO "\t\textent data refs %u\n",
105 btrfs_extent_refs(l, ei));
106 break;
107 case BTRFS_EXTENT_REF_KEY:
108 ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref);
109 printk(KERN_INFO "\t\textent back ref root %llu "
110 "gen %llu owner %llu num_refs %lu\n",
111 (unsigned long long)btrfs_ref_root(l, ref),
112 (unsigned long long)btrfs_ref_generation(l, ref),
113 (unsigned long long)btrfs_ref_objectid(l, ref),
114 (unsigned long)btrfs_ref_num_refs(l, ref));
115 break;
116
117 case BTRFS_EXTENT_DATA_KEY:
118 fi = btrfs_item_ptr(l, i,
119 struct btrfs_file_extent_item);
120 if (btrfs_file_extent_type(l, fi) ==
121 BTRFS_FILE_EXTENT_INLINE) {
122 printk(KERN_INFO "\t\tinline extent data "
123 "size %u\n",
124 btrfs_file_extent_inline_len(l, fi));
125 break;
126 }
127 printk(KERN_INFO "\t\textent data disk bytenr %llu "
128 "nr %llu\n",
129 (unsigned long long)
130 btrfs_file_extent_disk_bytenr(l, fi),
131 (unsigned long long)
132 btrfs_file_extent_disk_num_bytes(l, fi));
133 printk(KERN_INFO "\t\textent data offset %llu "
134 "nr %llu ram %llu\n",
135 (unsigned long long)
136 btrfs_file_extent_offset(l, fi),
137 (unsigned long long)
138 btrfs_file_extent_num_bytes(l, fi),
139 (unsigned long long)
140 btrfs_file_extent_ram_bytes(l, fi));
141 break;
142 case BTRFS_BLOCK_GROUP_ITEM_KEY:
143 bi = btrfs_item_ptr(l, i,
144 struct btrfs_block_group_item);
145 printk(KERN_INFO "\t\tblock group used %llu\n",
146 (unsigned long long)
147 btrfs_disk_block_group_used(l, bi));
148 break;
149 case BTRFS_CHUNK_ITEM_KEY:
150 print_chunk(l, btrfs_item_ptr(l, i,
151 struct btrfs_chunk));
152 break;
153 case BTRFS_DEV_ITEM_KEY:
154 print_dev_item(l, btrfs_item_ptr(l, i,
155 struct btrfs_dev_item));
156 break;
157 case BTRFS_DEV_EXTENT_KEY:
158 dev_extent = btrfs_item_ptr(l, i,
159 struct btrfs_dev_extent);
160 printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n"
161 "\t\tchunk objectid %llu chunk offset %llu "
162 "length %llu\n",
163 (unsigned long long)
164 btrfs_dev_extent_chunk_tree(l, dev_extent),
165 (unsigned long long)
166 btrfs_dev_extent_chunk_objectid(l, dev_extent),
167 (unsigned long long)
168 btrfs_dev_extent_chunk_offset(l, dev_extent),
169 (unsigned long long)
170 btrfs_dev_extent_length(l, dev_extent));
171 }
172 }
173}
174
175void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
176{
177 int i; u32 nr;
178 struct btrfs_key key;
179 int level;
180
181 if (!c)
182 return;
183 nr = btrfs_header_nritems(c);
184 level = btrfs_header_level(c);
185 if (level == 0) {
186 btrfs_print_leaf(root, c);
187 return;
188 }
189 printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n",
190 (unsigned long long)btrfs_header_bytenr(c),
191 btrfs_header_level(c), nr,
192 (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
193 for (i = 0; i < nr; i++) {
194 btrfs_node_key_to_cpu(c, &key, i);
195 printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n",
196 i,
197 (unsigned long long)key.objectid,
198 key.type,
199 (unsigned long long)key.offset,
200 (unsigned long long)btrfs_node_blockptr(c, i));
201 }
202 for (i = 0; i < nr; i++) {
203 struct extent_buffer *next = read_tree_block(root,
204 btrfs_node_blockptr(c, i),
205 btrfs_level_size(root, level - 1),
206 btrfs_node_ptr_generation(c, i));
207 if (btrfs_is_leaf(next) &&
208 btrfs_header_level(c) != 1)
209 BUG();
210 if (btrfs_header_level(next) !=
211 btrfs_header_level(c) - 1)
212 BUG();
213 btrfs_print_tree(root, next);
214 free_extent_buffer(next);
215 }
216}
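/*
 * Debugging sketch: dumping an entire tree starts at its root buffer,
 * e.g. btrfs_print_tree(root, root->node), which prints one line per node
 * pointer and recurses until btrfs_print_leaf decodes the items.
 */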
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
new file mode 100644
index 000000000000..da75efe534d5
--- /dev/null
+++ b/fs/btrfs/print-tree.h
@@ -0,0 +1,23 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __PRINT_TREE_
20#define __PRINT_TREE_
21void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l);
22void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t);
23#endif
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
new file mode 100644
index 000000000000..6f0acc4c9eab
--- /dev/null
+++ b/fs/btrfs/ref-cache.c
@@ -0,0 +1,230 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "ref-cache.h"
22#include "transaction.h"
23
24/*
25 * leaf refs are used to cache the information about which extents
26 * a given leaf has references on. This allows us to process that leaf
27 * in btrfs_drop_snapshot without needing to read it back from disk.
28 */
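/*
 * Lifetime sketch: a ref is created with one usage count and freed when
 * the last count drops, e.g.
 *
 *	ref = btrfs_alloc_leaf_ref(root, nr_extents);
 *	... fill ref->bytenr, ref->nritems and ref->extents[] ...
 *	btrfs_add_leaf_ref(root, ref, 0);   takes the tree's own reference
 *	btrfs_free_leaf_ref(root, ref);     drops the caller's reference
 */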
29
30/*
31 * kmalloc a leaf reference struct and update the counters for the
32 * total ref cache size
33 */
34struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
35 int nr_extents)
36{
37 struct btrfs_leaf_ref *ref;
38 size_t size = btrfs_leaf_ref_size(nr_extents);
39
40 ref = kmalloc(size, GFP_NOFS);
41 if (ref) {
42 spin_lock(&root->fs_info->ref_cache_lock);
43 root->fs_info->total_ref_cache_size += size;
44 spin_unlock(&root->fs_info->ref_cache_lock);
45
46 memset(ref, 0, sizeof(*ref));
47 atomic_set(&ref->usage, 1);
48 INIT_LIST_HEAD(&ref->list);
49 }
50 return ref;
51}
52
53/*
54 * free a leaf reference struct and update the counters for the
55 * total ref cache size
56 */
57void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
58{
59 if (!ref)
60 return;
61 WARN_ON(atomic_read(&ref->usage) == 0);
62 if (atomic_dec_and_test(&ref->usage)) {
63 size_t size = btrfs_leaf_ref_size(ref->nritems);
64
65 BUG_ON(ref->in_tree);
66 kfree(ref);
67
68 spin_lock(&root->fs_info->ref_cache_lock);
69 root->fs_info->total_ref_cache_size -= size;
70 spin_unlock(&root->fs_info->ref_cache_lock);
71 }
72}
73
74static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
75 struct rb_node *node)
76{
77 struct rb_node **p = &root->rb_node;
78 struct rb_node *parent = NULL;
79 struct btrfs_leaf_ref *entry;
80
81 while (*p) {
82 parent = *p;
83 entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node);
84
85 if (bytenr < entry->bytenr)
86 p = &(*p)->rb_left;
87 else if (bytenr > entry->bytenr)
88 p = &(*p)->rb_right;
89 else
90 return parent;
91 }
92
93 entry = rb_entry(node, struct btrfs_leaf_ref, rb_node);
94 rb_link_node(node, parent, p);
95 rb_insert_color(node, root);
96 return NULL;
97}
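/*
 * tree_insert returns NULL on success and the colliding node when a ref
 * with the same bytenr already exists; btrfs_add_leaf_ref below turns
 * that collision into -EEXIST:
 *
 *	if (tree_insert(&tree->root, ref->bytenr, &ref->rb_node))
 *		ret = -EEXIST;
 */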
98
99static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
100{
101 struct rb_node *n = root->rb_node;
102 struct btrfs_leaf_ref *entry;
103
104 while (n) {
105 entry = rb_entry(n, struct btrfs_leaf_ref, rb_node);
106 WARN_ON(!entry->in_tree);
107
108 if (bytenr < entry->bytenr)
109 n = n->rb_left;
110 else if (bytenr > entry->bytenr)
111 n = n->rb_right;
112 else
113 return n;
114 }
115 return NULL;
116}
117
118int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
119 int shared)
120{
121 struct btrfs_leaf_ref *ref = NULL;
122 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
123
124 if (shared)
125 tree = &root->fs_info->shared_ref_tree;
126 if (!tree)
127 return 0;
128
129 spin_lock(&tree->lock);
130 while (!list_empty(&tree->list)) {
131 ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list);
132 BUG_ON(ref->tree != tree);
133 if (ref->root_gen > max_root_gen)
134 break;
135 if (!xchg(&ref->in_tree, 0)) {
136 cond_resched_lock(&tree->lock);
137 continue;
138 }
139
140 rb_erase(&ref->rb_node, &tree->root);
141 list_del_init(&ref->list);
142
143 spin_unlock(&tree->lock);
144 btrfs_free_leaf_ref(root, ref);
145 cond_resched();
146 spin_lock(&tree->lock);
147 }
148 spin_unlock(&tree->lock);
149 return 0;
150}
151
152/*
153 * find the leaf ref for a given extent. This returns the ref struct with
154 * a usage reference incremented
155 */
156struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
157 u64 bytenr)
158{
159 struct rb_node *rb;
160 struct btrfs_leaf_ref *ref = NULL;
161 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
162again:
163 if (tree) {
164 spin_lock(&tree->lock);
165 rb = tree_search(&tree->root, bytenr);
166 if (rb)
167 ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
168 if (ref)
169 atomic_inc(&ref->usage);
170 spin_unlock(&tree->lock);
171 if (ref)
172 return ref;
173 }
174 if (tree != &root->fs_info->shared_ref_tree) {
175 tree = &root->fs_info->shared_ref_tree;
176 goto again;
177 }
178 return NULL;
179}
180
181/*
182 * add a fully filled in leaf ref struct
183 * remove all the refs older than a given root generation
184 */
185int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
186 int shared)
187{
188 int ret = 0;
189 struct rb_node *rb;
190 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
191
192 if (shared)
193 tree = &root->fs_info->shared_ref_tree;
194
195 spin_lock(&tree->lock);
196 rb = tree_insert(&tree->root, ref->bytenr, &ref->rb_node);
197 if (rb) {
198 ret = -EEXIST;
199 } else {
200 atomic_inc(&ref->usage);
201 ref->tree = tree;
202 ref->in_tree = 1;
203 list_add_tail(&ref->list, &tree->list);
204 }
205 spin_unlock(&tree->lock);
206 return ret;
207}
208
209/*
210 * remove a single leaf ref from the tree. This drops the ref held by the tree
211 * only
212 */
213int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
214{
215 struct btrfs_leaf_ref_tree *tree;
216
217 if (!xchg(&ref->in_tree, 0))
218 return 0;
219
220 tree = ref->tree;
221 spin_lock(&tree->lock);
222
223 rb_erase(&ref->rb_node, &tree->root);
224 list_del_init(&ref->list);
225
226 spin_unlock(&tree->lock);
227
228 btrfs_free_leaf_ref(root, ref);
229 return 0;
230}
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
new file mode 100644
index 000000000000..16f3183d7c59
--- /dev/null
+++ b/fs/btrfs/ref-cache.h
@@ -0,0 +1,77 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#ifndef __REFCACHE__
19#define __REFCACHE__
20
21struct btrfs_extent_info {
22 /* bytenr and num_bytes find the extent in the extent allocation tree */
23 u64 bytenr;
24 u64 num_bytes;
25
26 /* objectid and offset find the back reference for the file */
27 u64 objectid;
28 u64 offset;
29};
30
31struct btrfs_leaf_ref {
32 struct rb_node rb_node;
33 struct btrfs_leaf_ref_tree *tree;
34 int in_tree;
35 atomic_t usage;
36
37 u64 root_gen;
38 u64 bytenr;
39 u64 owner;
40 u64 generation;
41 int nritems;
42
43 struct list_head list;
44 struct btrfs_extent_info extents[];
45};
46
47static inline size_t btrfs_leaf_ref_size(int nr_extents)
48{
49 return sizeof(struct btrfs_leaf_ref) +
50 sizeof(struct btrfs_extent_info) * nr_extents;
51}
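/*
 * e.g. btrfs_leaf_ref_size(8) is the fixed header plus room for eight
 * btrfs_extent_info records in the trailing flexible array, which is what
 * btrfs_alloc_leaf_ref kmallocs in one shot.
 */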
52
53static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
54{
55 tree->root.rb_node = NULL;
56 INIT_LIST_HEAD(&tree->list);
57 spin_lock_init(&tree->lock);
58}
59
60static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree)
61{
62 return RB_EMPTY_ROOT(&tree->root);
63}
64
66struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
67 int nr_extents);
68void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
69struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
70 u64 bytenr);
71int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
72 int shared);
73int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
74 int shared);
75int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
76
77#endif
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
new file mode 100644
index 000000000000..b48650de4472
--- /dev/null
+++ b/fs/btrfs/root-tree.c
@@ -0,0 +1,366 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "transaction.h"
21#include "disk-io.h"
22#include "print-tree.h"
23
24/*
25 * search forward for a root, starting with objectid 'search_start'.
26 * if a root key is found, the objectid we find is filled into 'found_objectid'
27 * and 0 is returned. < 0 is returned on error, 1 if there is nothing
28 * left in the tree.
29 */
30int btrfs_search_root(struct btrfs_root *root, u64 search_start,
31 u64 *found_objectid)
32{
33 struct btrfs_path *path;
34 struct btrfs_key search_key;
35 int ret;
36
37 root = root->fs_info->tree_root;
38 search_key.objectid = search_start;
39 search_key.type = (u8)-1;
40 search_key.offset = (u64)-1;
41
42 path = btrfs_alloc_path();
43 BUG_ON(!path);
44again:
45 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
46 if (ret < 0)
47 goto out;
48 if (ret == 0) {
49 ret = 1;
50 goto out;
51 }
52 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
53 ret = btrfs_next_leaf(root, path);
54 if (ret)
55 goto out;
56 }
57 btrfs_item_key_to_cpu(path->nodes[0], &search_key, path->slots[0]);
58 if (search_key.type != BTRFS_ROOT_ITEM_KEY) {
59 search_key.offset++;
60 btrfs_release_path(root, path);
61 goto again;
62 }
63 ret = 0;
64 *found_objectid = search_key.objectid;
65
66out:
67 btrfs_free_path(path);
68 return ret;
69}
70
71/*
72 * lookup the root with the highest offset for a given objectid. The key we do
73 * find is copied into 'key'. If we find something return 0, otherwise return
74 * 1; < 0 is returned on error.
75 */
76int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
77 struct btrfs_root_item *item, struct btrfs_key *key)
78{
79 struct btrfs_path *path;
80 struct btrfs_key search_key;
81 struct btrfs_key found_key;
82 struct extent_buffer *l;
83 int ret;
84 int slot;
85
86 search_key.objectid = objectid;
87 search_key.type = BTRFS_ROOT_ITEM_KEY;
88 search_key.offset = (u64)-1;
89
90 path = btrfs_alloc_path();
91 BUG_ON(!path);
92 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
93 if (ret < 0)
94 goto out;
95
96 BUG_ON(ret == 0);
97 l = path->nodes[0];
98 BUG_ON(path->slots[0] == 0);
99 slot = path->slots[0] - 1;
100 btrfs_item_key_to_cpu(l, &found_key, slot);
101 if (found_key.objectid != objectid) {
102 ret = 1;
103 goto out;
104 }
105 read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
106 sizeof(*item));
107 memcpy(key, &found_key, sizeof(found_key));
108 ret = 0;
109out:
110 btrfs_free_path(path);
111 return ret;
112}
113
114/*
115 * copy the data in 'item' into the btree
116 */
117int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
118 *root, struct btrfs_key *key, struct btrfs_root_item
119 *item)
120{
121 struct btrfs_path *path;
122 struct extent_buffer *l;
123 int ret;
124 int slot;
125 unsigned long ptr;
126
127 path = btrfs_alloc_path();
128 BUG_ON(!path);
129 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
130 if (ret < 0)
131 goto out;
132
133 if (ret != 0) {
134 btrfs_print_leaf(root, path->nodes[0]);
135 printk(KERN_CRIT "unable to update root key %llu %u %llu\n",
136 (unsigned long long)key->objectid, key->type,
137 (unsigned long long)key->offset);
138 BUG_ON(1);
139 }
140
141 l = path->nodes[0];
142 slot = path->slots[0];
143 ptr = btrfs_item_ptr_offset(l, slot);
144 write_extent_buffer(l, item, ptr, sizeof(*item));
145 btrfs_mark_buffer_dirty(path->nodes[0]);
146out:
147 btrfs_release_path(root, path);
148 btrfs_free_path(path);
149 return ret;
150}
151
152int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
153 *root, struct btrfs_key *key, struct btrfs_root_item
154 *item)
155{
156 int ret;
157 ret = btrfs_insert_item(trans, root, key, item, sizeof(*item));
158 return ret;
159}
160
161/*
162 * at mount time we want to find all the old transaction snapshots that were in
163 * the process of being deleted if we crashed. This is any root item with an
164 * offset lower than the latest root. They need to be queued for deletion to
165 * finish what was happening when we crashed.
166 */
167int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
168 struct btrfs_root *latest)
169{
170 struct btrfs_root *dead_root;
171 struct btrfs_item *item;
172 struct btrfs_root_item *ri;
173 struct btrfs_key key;
174 struct btrfs_key found_key;
175 struct btrfs_path *path;
176 int ret;
177 u32 nritems;
178 struct extent_buffer *leaf;
179 int slot;
180
181 key.objectid = objectid;
182 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
183 key.offset = 0;
184 path = btrfs_alloc_path();
185 if (!path)
186 return -ENOMEM;
187
188again:
189 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
190 if (ret < 0)
191 goto err;
192 while (1) {
193 leaf = path->nodes[0];
194 nritems = btrfs_header_nritems(leaf);
195 slot = path->slots[0];
196 if (slot >= nritems) {
197 ret = btrfs_next_leaf(root, path);
198 if (ret)
199 break;
200 leaf = path->nodes[0];
201 nritems = btrfs_header_nritems(leaf);
202 slot = path->slots[0];
203 }
204 item = btrfs_item_nr(leaf, slot);
205 btrfs_item_key_to_cpu(leaf, &key, slot);
206 if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
207 goto next;
208
209 if (key.objectid < objectid)
210 goto next;
211
212 if (key.objectid > objectid)
213 break;
214
215 ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
216 if (btrfs_disk_root_refs(leaf, ri) != 0)
217 goto next;
218
219 memcpy(&found_key, &key, sizeof(key));
220 key.offset++;
221 btrfs_release_path(root, path);
222 dead_root =
223 btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
224 &found_key);
225 if (IS_ERR(dead_root)) {
226 ret = PTR_ERR(dead_root);
227 goto err;
228 }
229
230 if (objectid == BTRFS_TREE_RELOC_OBJECTID)
231 ret = btrfs_add_dead_reloc_root(dead_root);
232 else
233 ret = btrfs_add_dead_root(dead_root, latest);
234 if (ret)
235 goto err;
236 goto again;
237next:
238 slot++;
239 path->slots[0]++;
240 }
241 ret = 0;
242err:
243 btrfs_free_path(path);
244 return ret;
245}
246
247/* drop the root item for 'key' from 'root' */
248int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
249 struct btrfs_key *key)
250{
251 struct btrfs_path *path;
252 int ret;
253 u32 refs;
254 struct btrfs_root_item *ri;
255 struct extent_buffer *leaf;
256
257 path = btrfs_alloc_path();
258 BUG_ON(!path);
259 ret = btrfs_search_slot(trans, root, key, path, -1, 1);
260 if (ret < 0)
261 goto out;
262
263 BUG_ON(ret != 0);
264 leaf = path->nodes[0];
265 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
266
267 refs = btrfs_disk_root_refs(leaf, ri);
268 BUG_ON(refs != 0);
269 ret = btrfs_del_item(trans, root, path);
270out:
271 btrfs_release_path(root, path);
272 btrfs_free_path(path);
273 return ret;
274}
275
276#if 0 /* this will get used when snapshot deletion is implemented */
277int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
278 struct btrfs_root *tree_root,
279 u64 root_id, u8 type, u64 ref_id)
280{
281 struct btrfs_key key;
282 int ret;
283 struct btrfs_path *path;
284
285 path = btrfs_alloc_path();
286
287 key.objectid = root_id;
288 key.type = type;
289 key.offset = ref_id;
290
291 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
292 BUG_ON(ret);
293
294 ret = btrfs_del_item(trans, tree_root, path);
295 BUG_ON(ret);
296
297 btrfs_free_path(path);
298 return ret;
299}
300#endif
301
302int btrfs_find_root_ref(struct btrfs_root *tree_root,
303 struct btrfs_path *path,
304 u64 root_id, u64 ref_id)
305{
306 struct btrfs_key key;
307 int ret;
308
309 key.objectid = root_id;
310 key.type = BTRFS_ROOT_REF_KEY;
311 key.offset = ref_id;
312
313 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
314 return ret;
315}
316
317
318/*
319 * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY
320 * or BTRFS_ROOT_BACKREF_KEY.
321 *
322 * The dirid, sequence, name and name_len refer to the directory entry
323 * that is referencing the root.
324 *
325 * For a forward ref, the root_id is the id of the tree referencing
326 * the root and ref_id is the id of the subvol or snapshot.
327 *
328 * For a back ref the root_id is the id of the subvol or snapshot and
329 * ref_id is the id of the tree referencing it.
330 */
331int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
332 struct btrfs_root *tree_root,
333 u64 root_id, u8 type, u64 ref_id,
334 u64 dirid, u64 sequence,
335 const char *name, int name_len)
336{
337 struct btrfs_key key;
338 int ret;
339 struct btrfs_path *path;
340 struct btrfs_root_ref *ref;
341 struct extent_buffer *leaf;
342 unsigned long ptr;
343
344
345 path = btrfs_alloc_path();
346
347 key.objectid = root_id;
348 key.type = type;
349 key.offset = ref_id;
350
351 ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
352 sizeof(*ref) + name_len);
353 BUG_ON(ret);
354
355 leaf = path->nodes[0];
356 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
357 btrfs_set_root_ref_dirid(leaf, ref, dirid);
358 btrfs_set_root_ref_sequence(leaf, ref, sequence);
359 btrfs_set_root_ref_name_len(leaf, ref, name_len);
360 ptr = (unsigned long)(ref + 1);
361 write_extent_buffer(leaf, name, ptr, name_len);
362 btrfs_mark_buffer_dirty(leaf);
363
364 btrfs_free_path(path);
365 return ret;
366}
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
new file mode 100644
index 000000000000..c0f7ecaf1e79
--- /dev/null
+++ b/fs/btrfs/struct-funcs.c
@@ -0,0 +1,139 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/highmem.h>
20
21/* this is some deeply nasty code. ctree.h has a different
22 * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef
23 *
24 * The end result is that anyone who #includes ctree.h gets a
25 * declaration for the btrfs_set_foo functions and btrfs_foo functions
26 *
27 * This file declares the macros and then #includes ctree.h, which results
28 * in cpp creating the function here based on the template below.
29 *
30 * These setget functions do all the extent_buffer related mapping
31 * required to efficiently read and write specific fields in the extent
32 * buffers. Every pointer to metadata items in btrfs is really just
33 * an unsigned long offset into the extent buffer which has been
34 * cast to a specific type. This gives us all the gcc type checking.
35 *
36 * The extent buffer api is used to do all the kmapping and page
37 * spanning work required to get extent buffers in highmem and have
38 * a metadata blocksize different from the page size.
39 *
40 * The macro starts with a simple function prototype declaration so that
41 * sparse won't complain about it being static.
42 */
43
44#define BTRFS_SETGET_FUNCS(name, type, member, bits) \
45u##bits btrfs_##name(struct extent_buffer *eb, type *s); \
46void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); \
47u##bits btrfs_##name(struct extent_buffer *eb, \
48 type *s) \
49{ \
50 unsigned long part_offset = (unsigned long)s; \
51 unsigned long offset = part_offset + offsetof(type, member); \
52 type *p; \
53 /* ugly, but we want the fast path here */ \
54 if (eb->map_token && offset >= eb->map_start && \
55 offset + sizeof(((type *)0)->member) <= eb->map_start + \
56 eb->map_len) { \
57 p = (type *)(eb->kaddr + part_offset - eb->map_start); \
58 return le##bits##_to_cpu(p->member); \
59 } \
60 { \
61 int err; \
62 char *map_token; \
63 char *kaddr; \
64 int unmap_on_exit = (eb->map_token == NULL); \
65 unsigned long map_start; \
66 unsigned long map_len; \
67 u##bits res; \
68 err = map_extent_buffer(eb, offset, \
69 sizeof(((type *)0)->member), \
70 &map_token, &kaddr, \
71 &map_start, &map_len, KM_USER1); \
72 if (err) { \
73 __le##bits leres; \
74 read_eb_member(eb, s, type, member, &leres); \
75 return le##bits##_to_cpu(leres); \
76 } \
77 p = (type *)(kaddr + part_offset - map_start); \
78 res = le##bits##_to_cpu(p->member); \
79 if (unmap_on_exit) \
80 unmap_extent_buffer(eb, map_token, KM_USER1); \
81 return res; \
82 } \
83} \
84void btrfs_set_##name(struct extent_buffer *eb, \
85 type *s, u##bits val) \
86{ \
87 unsigned long part_offset = (unsigned long)s; \
88 unsigned long offset = part_offset + offsetof(type, member); \
89 type *p; \
90 /* ugly, but we want the fast path here */ \
91 if (eb->map_token && offset >= eb->map_start && \
92 offset + sizeof(((type *)0)->member) <= eb->map_start + \
93 eb->map_len) { \
94 p = (type *)(eb->kaddr + part_offset - eb->map_start); \
95 p->member = cpu_to_le##bits(val); \
96 return; \
97 } \
98 { \
99 int err; \
100 char *map_token; \
101 char *kaddr; \
102 int unmap_on_exit = (eb->map_token == NULL); \
103 unsigned long map_start; \
104 unsigned long map_len; \
105 err = map_extent_buffer(eb, offset, \
106 sizeof(((type *)0)->member), \
107 &map_token, &kaddr, \
108 &map_start, &map_len, KM_USER1); \
109 if (err) { \
110 __le##bits val2; \
111 val2 = cpu_to_le##bits(val); \
112 write_eb_member(eb, s, type, member, &val2); \
113 return; \
114 } \
115 p = (type *)(kaddr + part_offset - map_start); \
116 p->member = cpu_to_le##bits(val); \
117 if (unmap_on_exit) \
118 unmap_extent_buffer(eb, map_token, KM_USER1); \
119 } \
120}
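/*
 * For example, ctree.h declares
 *
 *	BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
 *
 * which the include below expands into btrfs_device_id() and
 * btrfs_set_device_id(), the le64 accessors print-tree.c uses.
 */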
121
122#include "ctree.h"
123
124void btrfs_node_key(struct extent_buffer *eb,
125 struct btrfs_disk_key *disk_key, int nr)
126{
127 unsigned long ptr = btrfs_node_key_ptr_offset(nr);
128 if (eb->map_token && ptr >= eb->map_start &&
129 ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) {
130 memcpy(disk_key, eb->kaddr + ptr - eb->map_start,
131 sizeof(*disk_key));
132 return;
133 } else if (eb->map_token) {
134 unmap_extent_buffer(eb, eb->map_token, KM_USER1);
135 eb->map_token = NULL;
136 }
137 read_eb_member(eb, (struct btrfs_key_ptr *)ptr,
138 struct btrfs_key_ptr, key, disk_key);
139}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
new file mode 100644
index 000000000000..0a14b495532f
--- /dev/null
+++ b/fs/btrfs/super.c
@@ -0,0 +1,722 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/blkdev.h>
20#include <linux/module.h>
21#include <linux/buffer_head.h>
22#include <linux/fs.h>
23#include <linux/pagemap.h>
24#include <linux/highmem.h>
25#include <linux/time.h>
26#include <linux/init.h>
27#include <linux/string.h>
28#include <linux/smp_lock.h>
29#include <linux/backing-dev.h>
30#include <linux/mount.h>
31#include <linux/mpage.h>
32#include <linux/swap.h>
33#include <linux/writeback.h>
34#include <linux/statfs.h>
35#include <linux/compat.h>
36#include <linux/parser.h>
37#include <linux/ctype.h>
38#include <linux/namei.h>
39#include <linux/miscdevice.h>
40#include <linux/version.h>
41#include "compat.h"
42#include "ctree.h"
43#include "disk-io.h"
44#include "transaction.h"
45#include "btrfs_inode.h"
46#include "ioctl.h"
47#include "print-tree.h"
48#include "xattr.h"
49#include "volumes.h"
50#include "version.h"
51#include "export.h"
52#include "compression.h"
53
54#define BTRFS_SUPER_MAGIC 0x9123683E
55
56static struct super_operations btrfs_super_ops;
57
58static void btrfs_put_super(struct super_block *sb)
59{
60 struct btrfs_root *root = btrfs_sb(sb);
61 int ret;
62
63 ret = close_ctree(root);
64 sb->s_fs_info = NULL;
65}
66
67enum {
68 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
69 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
70 Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err,
71};
72
73static match_table_t tokens = {
74 {Opt_degraded, "degraded"},
75 {Opt_subvol, "subvol=%s"},
76 {Opt_device, "device=%s"},
77 {Opt_nodatasum, "nodatasum"},
78 {Opt_nodatacow, "nodatacow"},
79 {Opt_nobarrier, "nobarrier"},
80 {Opt_max_extent, "max_extent=%s"},
81 {Opt_max_inline, "max_inline=%s"},
82 {Opt_alloc_start, "alloc_start=%s"},
83 {Opt_thread_pool, "thread_pool=%d"},
84 {Opt_compress, "compress"},
85 {Opt_ssd, "ssd"},
86 {Opt_noacl, "noacl"},
87 {Opt_err, NULL},
88};
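/*
 * e.g. "mount -t btrfs -o degraded,compress,max_inline=8k /dev/sda1 /mnt"
 * walks this table once per comma-separated token below.
 */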
89
90u64 btrfs_parse_size(char *str)
91{
92 u64 res;
93 int mult = 1;
94 char *end;
95 char last;
96
97 res = simple_strtoull(str, &end, 10);
98
99 last = end[0];
100 if (isalpha(last)) {
101 last = tolower(last);
102 switch (last) {
103 case 'g':
104 mult *= 1024;
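 /* fall through */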
105 case 'm':
106 mult *= 1024;
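 /* fall through */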
107 case 'k':
108 mult *= 1024;
109 }
110 res = res * mult;
111 }
112 return res;
113}
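/*
 * The switch above relies on fallthrough, so each suffix multiplies by
 * 1024 once per level: "64k" parses to 64 * 1024, "64m" to 64 * 1024 * 1024
 * and "2g" to 2 * 1024 * 1024 * 1024. A bare number is taken as bytes.
 */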
114
115/*
116 * Regular mount options parser. Everything that is needed only when
117 * reading in a new superblock is parsed here.
118 */
119int btrfs_parse_options(struct btrfs_root *root, char *options)
120{
121 struct btrfs_fs_info *info = root->fs_info;
122 substring_t args[MAX_OPT_ARGS];
123 char *p, *num;
124 int intarg;
125
126 if (!options)
127 return 0;
128
129 /*
130 * strsep changes the string, duplicate it because parse_options
131 * gets called twice
132 */
133 options = kstrdup(options, GFP_NOFS);
134 if (!options)
135 return -ENOMEM;
136
137
138 while ((p = strsep(&options, ",")) != NULL) {
139 int token;
140 if (!*p)
141 continue;
142
143 token = match_token(p, tokens, args);
144 switch (token) {
145 case Opt_degraded:
146 printk(KERN_INFO "btrfs: allowing degraded mounts\n");
147 btrfs_set_opt(info->mount_opt, DEGRADED);
148 break;
149 case Opt_subvol:
150 case Opt_device:
151 /*
152 * These are parsed by btrfs_parse_early_options
153 * and can be happily ignored here.
154 */
155 break;
156 case Opt_nodatasum:
157 printk(KERN_INFO "btrfs: setting nodatasum\n");
158 btrfs_set_opt(info->mount_opt, NODATASUM);
159 break;
160 case Opt_nodatacow:
161 printk(KERN_INFO "btrfs: setting nodatacow\n");
162 btrfs_set_opt(info->mount_opt, NODATACOW);
163 btrfs_set_opt(info->mount_opt, NODATASUM);
164 break;
165 case Opt_compress:
166 printk(KERN_INFO "btrfs: use compression\n");
167 btrfs_set_opt(info->mount_opt, COMPRESS);
168 break;
169 case Opt_ssd:
170 printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
171 btrfs_set_opt(info->mount_opt, SSD);
172 break;
173 case Opt_nobarrier:
174 printk(KERN_INFO "btrfs: turning off barriers\n");
175 btrfs_set_opt(info->mount_opt, NOBARRIER);
176 break;
177 case Opt_thread_pool:
178 intarg = 0;
179 match_int(&args[0], &intarg);
180 if (intarg) {
181 info->thread_pool_size = intarg;
182 printk(KERN_INFO "btrfs: thread pool %d\n",
183 info->thread_pool_size);
184 }
185 break;
186 case Opt_max_extent:
187 num = match_strdup(&args[0]);
188 if (num) {
189 info->max_extent = btrfs_parse_size(num);
190 kfree(num);
191
192 info->max_extent = max_t(u64,
193 info->max_extent, root->sectorsize);
194 printk(KERN_INFO "btrfs: max_extent at %llu\n",
195 info->max_extent);
196 }
197 break;
198 case Opt_max_inline:
199 num = match_strdup(&args[0]);
200 if (num) {
201 info->max_inline = btrfs_parse_size(num);
202 kfree(num);
203
204 if (info->max_inline) {
205 info->max_inline = max_t(u64,
206 info->max_inline,
207 root->sectorsize);
208 }
209 printk(KERN_INFO "btrfs: max_inline at %llu\n",
210 info->max_inline);
211 }
212 break;
213 case Opt_alloc_start:
214 num = match_strdup(&args[0]);
215 if (num) {
216 info->alloc_start = btrfs_parse_size(num);
217 kfree(num);
218 printk(KERN_INFO
219 "btrfs: allocations start at %llu\n",
220 info->alloc_start);
221 }
222 break;
223 case Opt_noacl:
224 root->fs_info->sb->s_flags &= ~MS_POSIXACL;
225 break;
226 default:
227 break;
228 }
229 }
230 kfree(options);
231 return 0;
232}
233
234/*
235 * Parse mount options that are required early in the mount process.
236 *
237 * All other options will be parsed much later in the mount process and
238 * only when we need to allocate a new super block.
239 */
240static int btrfs_parse_early_options(const char *options, fmode_t flags,
241 void *holder, char **subvol_name,
242 struct btrfs_fs_devices **fs_devices)
243{
244 substring_t args[MAX_OPT_ARGS];
245 char *opts, *p;
246 int error = 0;
247
248 if (!options)
249 goto out;
250
251 /*
252 * strsep changes the string, duplicate it because parse_options
253 * gets called twice
254 */
255 opts = kstrdup(options, GFP_KERNEL);
256 if (!opts)
257 return -ENOMEM;
258
259 while ((p = strsep(&opts, ",")) != NULL) {
260 int token;
261 if (!*p)
262 continue;
263
264 token = match_token(p, tokens, args);
265 switch (token) {
266 case Opt_subvol:
267 *subvol_name = match_strdup(&args[0]);
268 break;
269 case Opt_device:
270 error = btrfs_scan_one_device(match_strdup(&args[0]),
271 flags, holder, fs_devices);
272 if (error)
273 goto out_free_opts;
274 break;
275 default:
276 break;
277 }
278 }
279
280 out_free_opts:
281 kfree(opts);
282 out:
283 /*
284 * If no subvolume name is specified we use the default one. Allocate
285 * a copy of the string "." here so that code later in the
286 * mount path doesn't care if it's the default volume or another one.
287 */
288 if (!*subvol_name) {
289 *subvol_name = kstrdup(".", GFP_KERNEL);
290 if (!*subvol_name)
291 return -ENOMEM;
292 }
293 return error;
294}
295
296static int btrfs_fill_super(struct super_block *sb,
297 struct btrfs_fs_devices *fs_devices,
298 void *data, int silent)
299{
300 struct inode *inode;
301 struct dentry *root_dentry;
302 struct btrfs_super_block *disk_super;
303 struct btrfs_root *tree_root;
304 struct btrfs_inode *bi;
305 int err;
306
307 sb->s_maxbytes = MAX_LFS_FILESIZE;
308 sb->s_magic = BTRFS_SUPER_MAGIC;
309 sb->s_op = &btrfs_super_ops;
310 sb->s_export_op = &btrfs_export_ops;
311 sb->s_xattr = btrfs_xattr_handlers;
312 sb->s_time_gran = 1;
313 sb->s_flags |= MS_POSIXACL;
314
315 tree_root = open_ctree(sb, fs_devices, (char *)data);
316
317 if (IS_ERR(tree_root)) {
 318		printk(KERN_ERR "btrfs: open_ctree failed\n");
319 return PTR_ERR(tree_root);
320 }
321 sb->s_fs_info = tree_root;
322 disk_super = &tree_root->fs_info->super_copy;
 323	inode = btrfs_iget_locked(sb, BTRFS_FIRST_FREE_OBJECTID,
 324				  tree_root->fs_info->fs_root);
 325	if (!inode) {
 326		err = -ENOMEM;
 327		goto fail_close;
 328	}
 329	bi = BTRFS_I(inode);
 330	bi->location.objectid = inode->i_ino;
 331	bi->location.offset = 0;
 332	bi->root = tree_root->fs_info->fs_root;
 333
 334	btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY);
 335
336 if (inode->i_state & I_NEW) {
337 btrfs_read_locked_inode(inode);
338 unlock_new_inode(inode);
339 }
340
341 root_dentry = d_alloc_root(inode);
342 if (!root_dentry) {
343 iput(inode);
344 err = -ENOMEM;
345 goto fail_close;
346 }
347#if 0
348 /* this does the super kobj at the same time */
349 err = btrfs_sysfs_add_super(tree_root->fs_info);
350 if (err)
351 goto fail_close;
352#endif
353
354 sb->s_root = root_dentry;
355
356 save_mount_options(sb, data);
357 return 0;
358
359fail_close:
360 close_ctree(tree_root);
361 return err;
362}
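
/*
 * Putting the two parsing passes together, a multi-device mount looks
 * roughly like this (device names are only examples):
 *
 *	# mount -t btrfs -o device=/dev/sdc,subvol=snap1 /dev/sdb /mnt
 *
 * btrfs_parse_early_options() handles device= and subvol= before any
 * super block exists; the remaining options are handled by the full
 * parser above, called from inside open_ctree().
 */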
363
364int btrfs_sync_fs(struct super_block *sb, int wait)
365{
366 struct btrfs_trans_handle *trans;
367 struct btrfs_root *root;
368 int ret;
369 root = btrfs_sb(sb);
370
371 if (sb->s_flags & MS_RDONLY)
372 return 0;
373
374 sb->s_dirt = 0;
375 if (!wait) {
376 filemap_flush(root->fs_info->btree_inode->i_mapping);
377 return 0;
378 }
379
380 btrfs_start_delalloc_inodes(root);
381 btrfs_wait_ordered_extents(root, 0);
382
383 btrfs_clean_old_snapshots(root);
384 trans = btrfs_start_transaction(root, 1);
385 ret = btrfs_commit_transaction(trans, root);
386 sb->s_dirt = 0;
387 return ret;
388}
389
390static void btrfs_write_super(struct super_block *sb)
391{
392 sb->s_dirt = 0;
393}
394
395static int btrfs_test_super(struct super_block *s, void *data)
396{
397 struct btrfs_fs_devices *test_fs_devices = data;
398 struct btrfs_root *root = btrfs_sb(s);
399
400 return root->fs_info->fs_devices == test_fs_devices;
401}
402
403/*
404 * Find a superblock for the given device / mount point.
405 *
406 * Note: This is based on get_sb_bdev from fs/super.c with a few additions
407 * for multiple device setup. Make sure to keep it in sync.
408 */
409static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
410 const char *dev_name, void *data, struct vfsmount *mnt)
411{
412 char *subvol_name = NULL;
413 struct block_device *bdev = NULL;
414 struct super_block *s;
415 struct dentry *root;
416 struct btrfs_fs_devices *fs_devices = NULL;
417 fmode_t mode = FMODE_READ;
418 int error = 0;
419
420 if (!(flags & MS_RDONLY))
421 mode |= FMODE_WRITE;
422
423 error = btrfs_parse_early_options(data, mode, fs_type,
424 &subvol_name, &fs_devices);
425 if (error)
426 return error;
427
428 error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
429 if (error)
430 goto error_free_subvol_name;
431
432 error = btrfs_open_devices(fs_devices, mode, fs_type);
433 if (error)
434 goto error_free_subvol_name;
435
436 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
437 error = -EACCES;
438 goto error_close_devices;
439 }
440
441 bdev = fs_devices->latest_bdev;
442 s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices);
443 if (IS_ERR(s))
444 goto error_s;
445
446 if (s->s_root) {
447 if ((flags ^ s->s_flags) & MS_RDONLY) {
448 up_write(&s->s_umount);
449 deactivate_super(s);
450 error = -EBUSY;
451 goto error_close_devices;
452 }
453
454 btrfs_close_devices(fs_devices);
455 } else {
456 char b[BDEVNAME_SIZE];
457
458 s->s_flags = flags;
459 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
460 error = btrfs_fill_super(s, fs_devices, data,
461 flags & MS_SILENT ? 1 : 0);
462 if (error) {
463 up_write(&s->s_umount);
464 deactivate_super(s);
465 goto error_free_subvol_name;
466 }
467
468 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
469 s->s_flags |= MS_ACTIVE;
470 }
471
472 if (!strcmp(subvol_name, "."))
473 root = dget(s->s_root);
474 else {
475 mutex_lock(&s->s_root->d_inode->i_mutex);
476 root = lookup_one_len(subvol_name, s->s_root,
477 strlen(subvol_name));
478 mutex_unlock(&s->s_root->d_inode->i_mutex);
479
480 if (IS_ERR(root)) {
481 up_write(&s->s_umount);
482 deactivate_super(s);
483 error = PTR_ERR(root);
484 goto error_free_subvol_name;
485 }
486 if (!root->d_inode) {
487 dput(root);
488 up_write(&s->s_umount);
489 deactivate_super(s);
490 error = -ENXIO;
491 goto error_free_subvol_name;
492 }
493 }
494
495 mnt->mnt_sb = s;
496 mnt->mnt_root = root;
497
498 kfree(subvol_name);
499 return 0;
500
501error_s:
502 error = PTR_ERR(s);
503error_close_devices:
504 btrfs_close_devices(fs_devices);
505error_free_subvol_name:
506 kfree(subvol_name);
507 return error;
508}
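
/*
 * Note that btrfs_test_super() compares fs_devices pointers, so
 * mounting a second time while the filesystem is live finds the
 * existing super block in sget() above; in that case the freshly
 * opened devices are closed again and only the subvolume dentry
 * lookup differs between the two mounts.
 */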
509
510static int btrfs_remount(struct super_block *sb, int *flags, char *data)
511{
512 struct btrfs_root *root = btrfs_sb(sb);
513 int ret;
514
515 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
516 return 0;
517
518 if (*flags & MS_RDONLY) {
519 sb->s_flags |= MS_RDONLY;
520
521 ret = btrfs_commit_super(root);
522 WARN_ON(ret);
523 } else {
524 if (root->fs_info->fs_devices->rw_devices == 0)
525 return -EACCES;
526
527 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
528 return -EINVAL;
529
530 ret = btrfs_cleanup_reloc_trees(root);
531 WARN_ON(ret);
532
533 ret = btrfs_cleanup_fs_roots(root->fs_info);
534 WARN_ON(ret);
535
536 sb->s_flags &= ~MS_RDONLY;
537 }
538
539 return 0;
540}
541
542static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
543{
544 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
545 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
546 int bits = dentry->d_sb->s_blocksize_bits;
547 __be32 *fsid = (__be32 *)root->fs_info->fsid;
548
549 buf->f_namelen = BTRFS_NAME_LEN;
550 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
551 buf->f_bfree = buf->f_blocks -
552 (btrfs_super_bytes_used(disk_super) >> bits);
553 buf->f_bavail = buf->f_bfree;
554 buf->f_bsize = dentry->d_sb->s_blocksize;
555 buf->f_type = BTRFS_SUPER_MAGIC;
556
557 /* We treat it as constant endianness (it doesn't matter _which_)
558 because we want the fsid to come out the same whether mounted
559 on a big-endian or little-endian host */
560 buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
561 buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
562 /* Mask in the root object ID too, to disambiguate subvols */
563 buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32;
564 buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid;
565
566 return 0;
567}
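
/*
 * A worked example of the fsid fold above: for a (hypothetical) fsid
 * of sixteen 0x11 bytes and a subvolume root objectid of 5,
 *
 *	val[0] = 0x11111111 ^ 0x11111111 ^ (5 >> 32) = 0
 *	val[1] = 0x11111111 ^ 0x11111111 ^ 5         = 5
 *
 * so two subvolumes of the same filesystem report f_fsid values that
 * differ only in the objectid bits, independent of host endianness.
 */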
568
569static struct file_system_type btrfs_fs_type = {
570 .owner = THIS_MODULE,
571 .name = "btrfs",
572 .get_sb = btrfs_get_sb,
573 .kill_sb = kill_anon_super,
574 .fs_flags = FS_REQUIRES_DEV,
575};
576
577/*
578 * used by btrfsctl to scan devices when no FS is mounted
579 */
580static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
581 unsigned long arg)
582{
583 struct btrfs_ioctl_vol_args *vol;
584 struct btrfs_fs_devices *fs_devices;
585 int ret = 0;
586 int len;
587
588 if (!capable(CAP_SYS_ADMIN))
589 return -EPERM;
590
 591	vol = kmalloc(sizeof(*vol), GFP_KERNEL);
 592	if (!vol)
 593		return -ENOMEM;
 594	if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
593 ret = -EFAULT;
594 goto out;
595 }
596 len = strnlen(vol->name, BTRFS_PATH_NAME_MAX);
597 switch (cmd) {
598 case BTRFS_IOC_SCAN_DEV:
599 ret = btrfs_scan_one_device(vol->name, FMODE_READ,
600 &btrfs_fs_type, &fs_devices);
601 break;
602 }
603out:
604 kfree(vol);
605 return ret;
606}
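
/*
 * A minimal userspace sketch of driving this ioctl, roughly what the
 * btrfsctl tool does when registering a device (the device path is
 * only an example, error handling omitted):
 *
 *	int fd = open("/dev/btrfs-control", O_RDWR);
 *	struct btrfs_ioctl_vol_args args;
 *
 *	memset(&args, 0, sizeof(args));
 *	strncpy(args.name, "/dev/sdb", BTRFS_PATH_NAME_MAX - 1);
 *	ioctl(fd, BTRFS_IOC_SCAN_DEV, &args);
 *
 * the scan adds the device to the in-kernel fs_devices list so a
 * later mount can assemble the full filesystem.
 */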
607
608static int btrfs_freeze(struct super_block *sb)
609{
610 struct btrfs_root *root = btrfs_sb(sb);
611 mutex_lock(&root->fs_info->transaction_kthread_mutex);
612 mutex_lock(&root->fs_info->cleaner_mutex);
613 return 0;
614}
615
616static int btrfs_unfreeze(struct super_block *sb)
617{
618 struct btrfs_root *root = btrfs_sb(sb);
619 mutex_unlock(&root->fs_info->cleaner_mutex);
620 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
621 return 0;
622}
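
/*
 * btrfs_freeze()/btrfs_unfreeze() quiesce the filesystem simply by
 * holding the two mutexes that the transaction kthread and the
 * cleaner take before doing work; note the unlock order above is the
 * reverse of the lock order in btrfs_freeze().
 */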
623
624static struct super_operations btrfs_super_ops = {
625 .delete_inode = btrfs_delete_inode,
626 .put_super = btrfs_put_super,
627 .write_super = btrfs_write_super,
628 .sync_fs = btrfs_sync_fs,
629 .show_options = generic_show_options,
630 .write_inode = btrfs_write_inode,
631 .dirty_inode = btrfs_dirty_inode,
632 .alloc_inode = btrfs_alloc_inode,
633 .destroy_inode = btrfs_destroy_inode,
634 .statfs = btrfs_statfs,
635 .remount_fs = btrfs_remount,
636 .freeze_fs = btrfs_freeze,
637 .unfreeze_fs = btrfs_unfreeze,
638};
639
640static const struct file_operations btrfs_ctl_fops = {
641 .unlocked_ioctl = btrfs_control_ioctl,
642 .compat_ioctl = btrfs_control_ioctl,
643 .owner = THIS_MODULE,
644};
645
646static struct miscdevice btrfs_misc = {
647 .minor = MISC_DYNAMIC_MINOR,
648 .name = "btrfs-control",
649 .fops = &btrfs_ctl_fops
650};
651
652static int btrfs_interface_init(void)
653{
654 return misc_register(&btrfs_misc);
655}
656
657static void btrfs_interface_exit(void)
658{
659 if (misc_deregister(&btrfs_misc) < 0)
 660		printk(KERN_INFO "misc_deregister failed for control device\n");
661}
662
663static int __init init_btrfs_fs(void)
664{
665 int err;
666
667 err = btrfs_init_sysfs();
668 if (err)
669 return err;
670
671 err = btrfs_init_cachep();
672 if (err)
673 goto free_sysfs;
674
675 err = extent_io_init();
676 if (err)
677 goto free_cachep;
678
679 err = extent_map_init();
680 if (err)
681 goto free_extent_io;
682
683 err = btrfs_interface_init();
684 if (err)
685 goto free_extent_map;
686
687 err = register_filesystem(&btrfs_fs_type);
688 if (err)
689 goto unregister_ioctl;
690
691 printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
692 return 0;
693
694unregister_ioctl:
695 btrfs_interface_exit();
696free_extent_map:
697 extent_map_exit();
698free_extent_io:
699 extent_io_exit();
700free_cachep:
701 btrfs_destroy_cachep();
702free_sysfs:
703 btrfs_exit_sysfs();
704 return err;
705}
706
707static void __exit exit_btrfs_fs(void)
708{
709 btrfs_destroy_cachep();
710 extent_map_exit();
711 extent_io_exit();
712 btrfs_interface_exit();
713 unregister_filesystem(&btrfs_fs_type);
714 btrfs_exit_sysfs();
715 btrfs_cleanup_fs_uuids();
716 btrfs_zlib_exit();
717}
718
719module_init(init_btrfs_fs)
720module_exit(exit_btrfs_fs)
721
722MODULE_LICENSE("GPL");
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
new file mode 100644
index 000000000000..a240b6fa81df
--- /dev/null
+++ b/fs/btrfs/sysfs.c
@@ -0,0 +1,269 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/slab.h>
21#include <linux/spinlock.h>
22#include <linux/completion.h>
23#include <linux/buffer_head.h>
24#include <linux/module.h>
25#include <linux/kobject.h>
26
27#include "ctree.h"
28#include "disk-io.h"
29#include "transaction.h"
30
31static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf)
32{
33 return snprintf(buf, PAGE_SIZE, "%llu\n",
34 (unsigned long long)btrfs_root_used(&root->root_item));
35}
36
37static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf)
38{
39 return snprintf(buf, PAGE_SIZE, "%llu\n",
40 (unsigned long long)btrfs_root_limit(&root->root_item));
41}
42
43static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf)
44{
45
46 return snprintf(buf, PAGE_SIZE, "%llu\n",
47 (unsigned long long)btrfs_super_bytes_used(&fs->super_copy));
48}
49
50static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf)
51{
52 return snprintf(buf, PAGE_SIZE, "%llu\n",
53 (unsigned long long)btrfs_super_total_bytes(&fs->super_copy));
54}
55
56static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf)
57{
58 return snprintf(buf, PAGE_SIZE, "%llu\n",
59 (unsigned long long)btrfs_super_sectorsize(&fs->super_copy));
60}
61
62/* this is for root attrs (subvols/snapshots) */
63struct btrfs_root_attr {
64 struct attribute attr;
65 ssize_t (*show)(struct btrfs_root *, char *);
66 ssize_t (*store)(struct btrfs_root *, const char *, size_t);
67};
68
69#define ROOT_ATTR(name, mode, show, store) \
70static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, \
71 show, store)
72
73ROOT_ATTR(blocks_used, 0444, root_blocks_used_show, NULL);
74ROOT_ATTR(block_limit, 0644, root_block_limit_show, NULL);
75
76static struct attribute *btrfs_root_attrs[] = {
77 &btrfs_root_attr_blocks_used.attr,
78 &btrfs_root_attr_block_limit.attr,
79 NULL,
80};
81
82/* this is for super attrs (actual full fs) */
83struct btrfs_super_attr {
84 struct attribute attr;
85 ssize_t (*show)(struct btrfs_fs_info *, char *);
86 ssize_t (*store)(struct btrfs_fs_info *, const char *, size_t);
87};
88
89#define SUPER_ATTR(name, mode, show, store) \
90static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, \
91 show, store)
92
93SUPER_ATTR(blocks_used, 0444, super_blocks_used_show, NULL);
94SUPER_ATTR(total_blocks, 0444, super_total_blocks_show, NULL);
95SUPER_ATTR(blocksize, 0444, super_blocksize_show, NULL);
96
97static struct attribute *btrfs_super_attrs[] = {
98 &btrfs_super_attr_blocks_used.attr,
99 &btrfs_super_attr_total_blocks.attr,
100 &btrfs_super_attr_blocksize.attr,
101 NULL,
102};
103
104static ssize_t btrfs_super_attr_show(struct kobject *kobj,
105 struct attribute *attr, char *buf)
106{
107 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
108 super_kobj);
109 struct btrfs_super_attr *a = container_of(attr,
110 struct btrfs_super_attr,
111 attr);
112
113 return a->show ? a->show(fs, buf) : 0;
114}
115
116static ssize_t btrfs_super_attr_store(struct kobject *kobj,
117 struct attribute *attr,
118 const char *buf, size_t len)
119{
120 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
121 super_kobj);
122 struct btrfs_super_attr *a = container_of(attr,
123 struct btrfs_super_attr,
124 attr);
125
126 return a->store ? a->store(fs, buf, len) : 0;
127}
128
129static ssize_t btrfs_root_attr_show(struct kobject *kobj,
130 struct attribute *attr, char *buf)
131{
132 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
133 root_kobj);
134 struct btrfs_root_attr *a = container_of(attr,
135 struct btrfs_root_attr,
136 attr);
137
138 return a->show ? a->show(root, buf) : 0;
139}
140
141static ssize_t btrfs_root_attr_store(struct kobject *kobj,
142 struct attribute *attr,
143 const char *buf, size_t len)
144{
145 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
146 root_kobj);
147 struct btrfs_root_attr *a = container_of(attr,
148 struct btrfs_root_attr,
149 attr);
150 return a->store ? a->store(root, buf, len) : 0;
151}
152
153static void btrfs_super_release(struct kobject *kobj)
154{
155 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
156 super_kobj);
157 complete(&fs->kobj_unregister);
158}
159
160static void btrfs_root_release(struct kobject *kobj)
161{
162 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
163 root_kobj);
164 complete(&root->kobj_unregister);
165}
166
167static struct sysfs_ops btrfs_super_attr_ops = {
168 .show = btrfs_super_attr_show,
169 .store = btrfs_super_attr_store,
170};
171
172static struct sysfs_ops btrfs_root_attr_ops = {
173 .show = btrfs_root_attr_show,
174 .store = btrfs_root_attr_store,
175};
176
177static struct kobj_type btrfs_root_ktype = {
178 .default_attrs = btrfs_root_attrs,
179 .sysfs_ops = &btrfs_root_attr_ops,
180 .release = btrfs_root_release,
181};
182
183static struct kobj_type btrfs_super_ktype = {
184 .default_attrs = btrfs_super_attrs,
185 .sysfs_ops = &btrfs_super_attr_ops,
186 .release = btrfs_super_release,
187};
188
189/* /sys/fs/btrfs/ entry */
190static struct kset *btrfs_kset;
191
192int btrfs_sysfs_add_super(struct btrfs_fs_info *fs)
193{
194 int error;
195 char *name;
196 char c;
197 int len = strlen(fs->sb->s_id) + 1;
198 int i;
199
200 name = kmalloc(len, GFP_NOFS);
201 if (!name) {
202 error = -ENOMEM;
203 goto fail;
204 }
205
 206	for (i = 0; i < len - 1; i++) {
 207		c = fs->sb->s_id[i];
 208		if (c == '/' || c == '\\')
 209			c = '!';
 210		name[i] = c;
 211	}
 212	name[len - 1] = '\0';
213
214 fs->super_kobj.kset = btrfs_kset;
215 error = kobject_init_and_add(&fs->super_kobj, &btrfs_super_ktype,
216 NULL, "%s", name);
217 kfree(name);
218 if (error)
219 goto fail;
220
221 return 0;
222
223fail:
224 printk(KERN_ERR "btrfs: sysfs creation for super failed\n");
225 return error;
226}
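
/*
 * With the kset created in btrfs_init_sysfs() below, a successful
 * add_super for a filesystem whose s_id is "sdb" would surface as,
 * for example:
 *
 *	/sys/fs/btrfs/sdb/blocks_used
 *	/sys/fs/btrfs/sdb/total_blocks
 *	/sys/fs/btrfs/sdb/blocksize
 *
 * ('/' and '\\' are rewritten to '!' above so the name is a valid
 * kobject name).
 */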
227
228int btrfs_sysfs_add_root(struct btrfs_root *root)
229{
230 int error;
231
232 error = kobject_init_and_add(&root->root_kobj, &btrfs_root_ktype,
233 &root->fs_info->super_kobj,
234 "%s", root->name);
235 if (error)
236 goto fail;
237
238 return 0;
239
240fail:
241 printk(KERN_ERR "btrfs: sysfs creation for root failed\n");
242 return error;
243}
244
245void btrfs_sysfs_del_root(struct btrfs_root *root)
246{
247 kobject_put(&root->root_kobj);
248 wait_for_completion(&root->kobj_unregister);
249}
250
251void btrfs_sysfs_del_super(struct btrfs_fs_info *fs)
252{
253 kobject_put(&fs->super_kobj);
254 wait_for_completion(&fs->kobj_unregister);
255}
256
257int btrfs_init_sysfs(void)
258{
259 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
260 if (!btrfs_kset)
261 return -ENOMEM;
262 return 0;
263}
264
265void btrfs_exit_sysfs(void)
266{
267 kset_unregister(btrfs_kset);
268}
269
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
new file mode 100644
index 000000000000..8a08f9443340
--- /dev/null
+++ b/fs/btrfs/transaction.c
@@ -0,0 +1,1097 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/fs.h>
20#include <linux/sched.h>
21#include <linux/writeback.h>
22#include <linux/pagemap.h>
23#include <linux/blkdev.h>
24#include "ctree.h"
25#include "disk-io.h"
26#include "transaction.h"
27#include "locking.h"
28#include "ref-cache.h"
29#include "tree-log.h"
30
31#define BTRFS_ROOT_TRANS_TAG 0
32
33static noinline void put_transaction(struct btrfs_transaction *transaction)
34{
35 WARN_ON(transaction->use_count == 0);
36 transaction->use_count--;
37 if (transaction->use_count == 0) {
38 list_del_init(&transaction->list);
39 memset(transaction, 0, sizeof(*transaction));
40 kmem_cache_free(btrfs_transaction_cachep, transaction);
41 }
42}
43
44/*
45 * either allocate a new transaction or hop into the existing one
46 */
47static noinline int join_transaction(struct btrfs_root *root)
48{
49 struct btrfs_transaction *cur_trans;
50 cur_trans = root->fs_info->running_transaction;
51 if (!cur_trans) {
52 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
53 GFP_NOFS);
54 BUG_ON(!cur_trans);
55 root->fs_info->generation++;
56 root->fs_info->last_alloc = 0;
57 root->fs_info->last_data_alloc = 0;
58 cur_trans->num_writers = 1;
59 cur_trans->num_joined = 0;
60 cur_trans->transid = root->fs_info->generation;
61 init_waitqueue_head(&cur_trans->writer_wait);
62 init_waitqueue_head(&cur_trans->commit_wait);
63 cur_trans->in_commit = 0;
64 cur_trans->blocked = 0;
65 cur_trans->use_count = 1;
66 cur_trans->commit_done = 0;
67 cur_trans->start_time = get_seconds();
68 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
69 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
70 extent_io_tree_init(&cur_trans->dirty_pages,
71 root->fs_info->btree_inode->i_mapping,
72 GFP_NOFS);
73 spin_lock(&root->fs_info->new_trans_lock);
74 root->fs_info->running_transaction = cur_trans;
75 spin_unlock(&root->fs_info->new_trans_lock);
76 } else {
77 cur_trans->num_writers++;
78 cur_trans->num_joined++;
79 }
80
81 return 0;
82}
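
/*
 * both paths in join_transaction() run under fs_info->trans_mutex,
 * taken by the callers below; that is what makes the bare check of
 * fs_info->running_transaction safe here.
 */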
83
84/*
85 * this does all the record keeping required to make sure that a reference
86 * counted root is properly recorded in a given transaction. This is required
87 * to make sure the old root from before we joined the transaction is deleted
88 * when the transaction commits
89 */
90noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
91{
92 struct btrfs_dirty_root *dirty;
93 u64 running_trans_id = root->fs_info->running_transaction->transid;
94 if (root->ref_cows && root->last_trans < running_trans_id) {
95 WARN_ON(root == root->fs_info->extent_root);
96 if (root->root_item.refs != 0) {
97 radix_tree_tag_set(&root->fs_info->fs_roots_radix,
98 (unsigned long)root->root_key.objectid,
99 BTRFS_ROOT_TRANS_TAG);
100
101 dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
102 BUG_ON(!dirty);
103 dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
104 BUG_ON(!dirty->root);
105 dirty->latest_root = root;
106 INIT_LIST_HEAD(&dirty->list);
107
108 root->commit_root = btrfs_root_node(root);
109
110 memcpy(dirty->root, root, sizeof(*root));
111 spin_lock_init(&dirty->root->node_lock);
112 spin_lock_init(&dirty->root->list_lock);
113 mutex_init(&dirty->root->objectid_mutex);
114 mutex_init(&dirty->root->log_mutex);
115 INIT_LIST_HEAD(&dirty->root->dead_list);
116 dirty->root->node = root->commit_root;
117 dirty->root->commit_root = NULL;
118
119 spin_lock(&root->list_lock);
120 list_add(&dirty->root->dead_list, &root->dead_list);
121 spin_unlock(&root->list_lock);
122
123 root->dirty_root = dirty;
124 } else {
125 WARN_ON(1);
126 }
127 root->last_trans = running_trans_id;
128 }
129 return 0;
130}
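
/*
 * note that dirty->root above is a byte copy of the live root taken
 * the first time it joins the transaction: it keeps pointing at the
 * old commit_root, so the pre-transaction version of the tree can
 * still be found and dropped when the transaction commits.
 */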
131
132/* wait for the commit against the current transaction to become unblocked;
133 * when this is done, it is safe to start a new transaction, but the current
134 * transaction might not be fully on disk.
135 */
136static void wait_current_trans(struct btrfs_root *root)
137{
138 struct btrfs_transaction *cur_trans;
139
140 cur_trans = root->fs_info->running_transaction;
141 if (cur_trans && cur_trans->blocked) {
142 DEFINE_WAIT(wait);
143 cur_trans->use_count++;
144 while (1) {
145 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
146 TASK_UNINTERRUPTIBLE);
147 if (cur_trans->blocked) {
148 mutex_unlock(&root->fs_info->trans_mutex);
149 schedule();
150 mutex_lock(&root->fs_info->trans_mutex);
151 finish_wait(&root->fs_info->transaction_wait,
152 &wait);
153 } else {
154 finish_wait(&root->fs_info->transaction_wait,
155 &wait);
156 break;
157 }
158 }
159 put_transaction(cur_trans);
160 }
161}
162
163static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
164 int num_blocks, int wait)
165{
166 struct btrfs_trans_handle *h =
167 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
168 int ret;
169
170 mutex_lock(&root->fs_info->trans_mutex);
171 if (!root->fs_info->log_root_recovering &&
172 ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2))
173 wait_current_trans(root);
174 ret = join_transaction(root);
175 BUG_ON(ret);
176
177 btrfs_record_root_in_trans(root);
178 h->transid = root->fs_info->running_transaction->transid;
179 h->transaction = root->fs_info->running_transaction;
180 h->blocks_reserved = num_blocks;
181 h->blocks_used = 0;
182 h->block_group = 0;
183 h->alloc_exclude_nr = 0;
184 h->alloc_exclude_start = 0;
185 root->fs_info->running_transaction->use_count++;
186 mutex_unlock(&root->fs_info->trans_mutex);
187 return h;
188}
189
190struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
191 int num_blocks)
192{
193 return start_transaction(root, num_blocks, 1);
194}
195struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
196 int num_blocks)
197{
198 return start_transaction(root, num_blocks, 0);
199}
200
201struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
202 int num_blocks)
203{
204 return start_transaction(r, num_blocks, 2);
205}
206
207/* wait for a transaction commit to be fully complete */
208static noinline int wait_for_commit(struct btrfs_root *root,
209 struct btrfs_transaction *commit)
210{
211 DEFINE_WAIT(wait);
212 mutex_lock(&root->fs_info->trans_mutex);
213 while (!commit->commit_done) {
214 prepare_to_wait(&commit->commit_wait, &wait,
215 TASK_UNINTERRUPTIBLE);
216 if (commit->commit_done)
217 break;
218 mutex_unlock(&root->fs_info->trans_mutex);
219 schedule();
220 mutex_lock(&root->fs_info->trans_mutex);
221 }
222 mutex_unlock(&root->fs_info->trans_mutex);
223 finish_wait(&commit->commit_wait, &wait);
224 return 0;
225}
226
227/*
228 * rate limit against the drop_snapshot code. This helps to slow down new
229 * operations if the drop_snapshot code isn't able to keep up.
230 */
231static void throttle_on_drops(struct btrfs_root *root)
232{
233 struct btrfs_fs_info *info = root->fs_info;
234 int harder_count = 0;
235
236harder:
237 if (atomic_read(&info->throttles)) {
238 DEFINE_WAIT(wait);
239 int thr;
240 thr = atomic_read(&info->throttle_gen);
241
242 do {
243 prepare_to_wait(&info->transaction_throttle,
244 &wait, TASK_UNINTERRUPTIBLE);
245 if (!atomic_read(&info->throttles)) {
246 finish_wait(&info->transaction_throttle, &wait);
247 break;
248 }
249 schedule();
250 finish_wait(&info->transaction_throttle, &wait);
251 } while (thr == atomic_read(&info->throttle_gen));
252 harder_count++;
253
254 if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
255 harder_count < 2)
256 goto harder;
257
258 if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
259 harder_count < 10)
260 goto harder;
261
262 if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
263 harder_count < 20)
264 goto harder;
265 }
266}
267
268void btrfs_throttle(struct btrfs_root *root)
269{
270 mutex_lock(&root->fs_info->trans_mutex);
271 if (!root->fs_info->open_ioctl_trans)
272 wait_current_trans(root);
273 mutex_unlock(&root->fs_info->trans_mutex);
274
275 throttle_on_drops(root);
276}
277
278static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
279 struct btrfs_root *root, int throttle)
280{
281 struct btrfs_transaction *cur_trans;
282 struct btrfs_fs_info *info = root->fs_info;
283
284 mutex_lock(&info->trans_mutex);
285 cur_trans = info->running_transaction;
286 WARN_ON(cur_trans != trans->transaction);
287 WARN_ON(cur_trans->num_writers < 1);
288 cur_trans->num_writers--;
289
290 if (waitqueue_active(&cur_trans->writer_wait))
291 wake_up(&cur_trans->writer_wait);
292 put_transaction(cur_trans);
293 mutex_unlock(&info->trans_mutex);
294 memset(trans, 0, sizeof(*trans));
295 kmem_cache_free(btrfs_trans_handle_cachep, trans);
296
297 if (throttle)
298 throttle_on_drops(root);
299
300 return 0;
301}
302
303int btrfs_end_transaction(struct btrfs_trans_handle *trans,
304 struct btrfs_root *root)
305{
306 return __btrfs_end_transaction(trans, root, 0);
307}
308
309int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
310 struct btrfs_root *root)
311{
312 return __btrfs_end_transaction(trans, root, 1);
313}
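
/*
 * The usual caller pattern for the handles above, sketched (the
 * num_blocks argument is the number of btree blocks the caller
 * expects to dirty):
 *
 *	trans = btrfs_start_transaction(root, 1);
 *	... modify btree items ...
 *	btrfs_end_transaction(trans, root);
 *
 * btrfs_join_transaction() is the same except it never waits for a
 * blocked commit, so it is safe to use from paths the committer may
 * itself be waiting on.
 */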
314
315/*
316 * when btree blocks are allocated, they have some corresponding bits set for
317 * them in one of two extent_io trees. This is used to make sure all of
318 * those extents are on disk for transaction or log commit
319 */
320int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
321 struct extent_io_tree *dirty_pages)
322{
323 int ret;
324 int err = 0;
325 int werr = 0;
326 struct page *page;
327 struct inode *btree_inode = root->fs_info->btree_inode;
328 u64 start = 0;
329 u64 end;
330 unsigned long index;
331
332 while (1) {
333 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
334 EXTENT_DIRTY);
335 if (ret)
336 break;
337 while (start <= end) {
338 cond_resched();
339
340 index = start >> PAGE_CACHE_SHIFT;
341 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
342 page = find_get_page(btree_inode->i_mapping, index);
343 if (!page)
344 continue;
345
346 btree_lock_page_hook(page);
347 if (!page->mapping) {
348 unlock_page(page);
349 page_cache_release(page);
350 continue;
351 }
352
353 if (PageWriteback(page)) {
354 if (PageDirty(page))
355 wait_on_page_writeback(page);
356 else {
357 unlock_page(page);
358 page_cache_release(page);
359 continue;
360 }
361 }
362 err = write_one_page(page, 0);
363 if (err)
364 werr = err;
365 page_cache_release(page);
366 }
367 }
368 while (1) {
369 ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
370 EXTENT_DIRTY);
371 if (ret)
372 break;
373
374 clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
375 while (start <= end) {
376 index = start >> PAGE_CACHE_SHIFT;
377 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
378 page = find_get_page(btree_inode->i_mapping, index);
379 if (!page)
380 continue;
381 if (PageDirty(page)) {
382 btree_lock_page_hook(page);
383 wait_on_page_writeback(page);
384 err = write_one_page(page, 0);
385 if (err)
386 werr = err;
387 }
388 wait_on_page_writeback(page);
389 page_cache_release(page);
390 cond_resched();
391 }
392 }
393 if (err)
394 werr = err;
395 return werr;
396}
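
/*
 * The index arithmetic above steps one page at a time: with 4k pages
 * (PAGE_CACHE_SHIFT == 12) a dirty range of [0, 16383] visits page
 * indexes 0..3, and start is bumped to (index + 1) << 12 each pass so
 * the inner loop ends once start moves past end.
 */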
397
398int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
399 struct btrfs_root *root)
400{
401 if (!trans || !trans->transaction) {
402 struct inode *btree_inode;
403 btree_inode = root->fs_info->btree_inode;
404 return filemap_write_and_wait(btree_inode->i_mapping);
405 }
406 return btrfs_write_and_wait_marked_extents(root,
407 &trans->transaction->dirty_pages);
408}
409
410/*
411 * this is used to update the root pointer in the tree of tree roots.
412 *
413 * But, in the case of the extent allocation tree, updating the root
414 * pointer may allocate blocks which may change the root of the extent
415 * allocation tree.
416 *
 417 * So, this loops and makes sure the cowonly root didn't
418 * change while the root pointer was being updated in the metadata.
419 */
420static int update_cowonly_root(struct btrfs_trans_handle *trans,
421 struct btrfs_root *root)
422{
423 int ret;
424 u64 old_root_bytenr;
425 struct btrfs_root *tree_root = root->fs_info->tree_root;
426
427 btrfs_extent_post_op(trans, root);
428 btrfs_write_dirty_block_groups(trans, root);
429 btrfs_extent_post_op(trans, root);
430
431 while (1) {
432 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
433 if (old_root_bytenr == root->node->start)
434 break;
435 btrfs_set_root_bytenr(&root->root_item,
436 root->node->start);
437 btrfs_set_root_level(&root->root_item,
438 btrfs_header_level(root->node));
439 btrfs_set_root_generation(&root->root_item, trans->transid);
440
441 btrfs_extent_post_op(trans, root);
442
443 ret = btrfs_update_root(trans, tree_root,
444 &root->root_key,
445 &root->root_item);
446 BUG_ON(ret);
447 btrfs_write_dirty_block_groups(trans, root);
448 btrfs_extent_post_op(trans, root);
449 }
450 return 0;
451}
452
453/*
454 * update all the cowonly tree roots on disk
455 */
456int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
457 struct btrfs_root *root)
458{
459 struct btrfs_fs_info *fs_info = root->fs_info;
460 struct list_head *next;
461 struct extent_buffer *eb;
462
463 btrfs_extent_post_op(trans, fs_info->tree_root);
464
465 eb = btrfs_lock_root_node(fs_info->tree_root);
466 btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0);
467 btrfs_tree_unlock(eb);
468 free_extent_buffer(eb);
469
470 btrfs_extent_post_op(trans, fs_info->tree_root);
471
472 while (!list_empty(&fs_info->dirty_cowonly_roots)) {
473 next = fs_info->dirty_cowonly_roots.next;
474 list_del_init(next);
475 root = list_entry(next, struct btrfs_root, dirty_list);
476
477 update_cowonly_root(trans, root);
478 }
479 return 0;
480}
481
482/*
483 * dead roots are old snapshots that need to be deleted. This allocates
484 * a dirty root struct and adds it into the list of dead roots that need to
485 * be deleted
486 */
487int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
488{
489 struct btrfs_dirty_root *dirty;
490
491 dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
492 if (!dirty)
493 return -ENOMEM;
494 dirty->root = root;
495 dirty->latest_root = latest;
496
497 mutex_lock(&root->fs_info->trans_mutex);
498 list_add(&dirty->list, &latest->fs_info->dead_roots);
499 mutex_unlock(&root->fs_info->trans_mutex);
500 return 0;
501}
502
503/*
504 * at transaction commit time we need to schedule the old roots for
505 * deletion via btrfs_drop_snapshot. This runs through all the
506 * reference counted roots that were modified in the current
507 * transaction and puts them into the drop list
508 */
509static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
510 struct radix_tree_root *radix,
511 struct list_head *list)
512{
513 struct btrfs_dirty_root *dirty;
514 struct btrfs_root *gang[8];
515 struct btrfs_root *root;
516 int i;
517 int ret;
518 int err = 0;
519 u32 refs;
520
521 while (1) {
522 ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0,
523 ARRAY_SIZE(gang),
524 BTRFS_ROOT_TRANS_TAG);
525 if (ret == 0)
526 break;
527 for (i = 0; i < ret; i++) {
528 root = gang[i];
529 radix_tree_tag_clear(radix,
530 (unsigned long)root->root_key.objectid,
531 BTRFS_ROOT_TRANS_TAG);
532
533 BUG_ON(!root->ref_tree);
534 dirty = root->dirty_root;
535
536 btrfs_free_log(trans, root);
537 btrfs_free_reloc_root(trans, root);
538
539 if (root->commit_root == root->node) {
540 WARN_ON(root->node->start !=
541 btrfs_root_bytenr(&root->root_item));
542
543 free_extent_buffer(root->commit_root);
544 root->commit_root = NULL;
545 root->dirty_root = NULL;
546
547 spin_lock(&root->list_lock);
548 list_del_init(&dirty->root->dead_list);
549 spin_unlock(&root->list_lock);
550
551 kfree(dirty->root);
552 kfree(dirty);
553
554 /* make sure to update the root on disk
555 * so we get any updates to the block used
556 * counts
557 */
558 err = btrfs_update_root(trans,
559 root->fs_info->tree_root,
560 &root->root_key,
561 &root->root_item);
562 continue;
563 }
564
565 memset(&root->root_item.drop_progress, 0,
566 sizeof(struct btrfs_disk_key));
567 root->root_item.drop_level = 0;
568 root->commit_root = NULL;
569 root->dirty_root = NULL;
570 root->root_key.offset = root->fs_info->generation;
571 btrfs_set_root_bytenr(&root->root_item,
572 root->node->start);
573 btrfs_set_root_level(&root->root_item,
574 btrfs_header_level(root->node));
575 btrfs_set_root_generation(&root->root_item,
576 root->root_key.offset);
577
578 err = btrfs_insert_root(trans, root->fs_info->tree_root,
579 &root->root_key,
580 &root->root_item);
581 if (err)
582 break;
583
584 refs = btrfs_root_refs(&dirty->root->root_item);
585 btrfs_set_root_refs(&dirty->root->root_item, refs - 1);
586 err = btrfs_update_root(trans, root->fs_info->tree_root,
587 &dirty->root->root_key,
588 &dirty->root->root_item);
589
590 BUG_ON(err);
591 if (refs == 1) {
592 list_add(&dirty->list, list);
593 } else {
594 WARN_ON(1);
595 free_extent_buffer(dirty->root->node);
596 kfree(dirty->root);
597 kfree(dirty);
598 }
599 }
600 }
601 return err;
602}
603
604/*
605 * defrag a given btree. If cacheonly == 1, this won't read from the disk,
606 * otherwise every leaf in the btree is read and defragged.
607 */
608int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
609{
610 struct btrfs_fs_info *info = root->fs_info;
611 int ret;
612 struct btrfs_trans_handle *trans;
613 unsigned long nr;
614
615 smp_mb();
616 if (root->defrag_running)
617 return 0;
618 trans = btrfs_start_transaction(root, 1);
619 while (1) {
620 root->defrag_running = 1;
621 ret = btrfs_defrag_leaves(trans, root, cacheonly);
622 nr = trans->blocks_used;
623 btrfs_end_transaction(trans, root);
624 btrfs_btree_balance_dirty(info->tree_root, nr);
625 cond_resched();
626
627 trans = btrfs_start_transaction(root, 1);
628 if (root->fs_info->closing || ret != -EAGAIN)
629 break;
630 }
631 root->defrag_running = 0;
632 smp_mb();
633 btrfs_end_transaction(trans, root);
634 return 0;
635}
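
/*
 * the -EAGAIN protocol above pairs with btrfs_defrag_leaves() in
 * tree-defrag.c: each pass records how far it got in
 * root->defrag_progress, ends its transaction to let other writers
 * in, and the next pass resumes from that key.
 */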
636
637/*
638 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
639 * all of them
640 */
641static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
642 struct list_head *list)
643{
644 struct btrfs_dirty_root *dirty;
645 struct btrfs_trans_handle *trans;
646 unsigned long nr;
647 u64 num_bytes;
648 u64 bytes_used;
649 u64 max_useless;
650 int ret = 0;
651 int err;
652
653 while (!list_empty(list)) {
654 struct btrfs_root *root;
655
656 dirty = list_entry(list->prev, struct btrfs_dirty_root, list);
657 list_del_init(&dirty->list);
658
659 num_bytes = btrfs_root_used(&dirty->root->root_item);
660 root = dirty->latest_root;
661 atomic_inc(&root->fs_info->throttles);
662
663 while (1) {
664 trans = btrfs_start_transaction(tree_root, 1);
665 mutex_lock(&root->fs_info->drop_mutex);
666 ret = btrfs_drop_snapshot(trans, dirty->root);
667 if (ret != -EAGAIN)
668 break;
669 mutex_unlock(&root->fs_info->drop_mutex);
670
671 err = btrfs_update_root(trans,
672 tree_root,
673 &dirty->root->root_key,
674 &dirty->root->root_item);
675 if (err)
676 ret = err;
677 nr = trans->blocks_used;
678 ret = btrfs_end_transaction(trans, tree_root);
679 BUG_ON(ret);
680
681 btrfs_btree_balance_dirty(tree_root, nr);
682 cond_resched();
683 }
684 BUG_ON(ret);
685 atomic_dec(&root->fs_info->throttles);
686 wake_up(&root->fs_info->transaction_throttle);
687
688 num_bytes -= btrfs_root_used(&dirty->root->root_item);
689 bytes_used = btrfs_root_used(&root->root_item);
690 if (num_bytes) {
691 btrfs_record_root_in_trans(root);
692 btrfs_set_root_used(&root->root_item,
693 bytes_used - num_bytes);
694 }
695
696 ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
697 if (ret) {
698 BUG();
699 break;
700 }
701 mutex_unlock(&root->fs_info->drop_mutex);
702
703 spin_lock(&root->list_lock);
704 list_del_init(&dirty->root->dead_list);
705 if (!list_empty(&root->dead_list)) {
706 struct btrfs_root *oldest;
707 oldest = list_entry(root->dead_list.prev,
708 struct btrfs_root, dead_list);
709 max_useless = oldest->root_key.offset - 1;
710 } else {
711 max_useless = root->root_key.offset - 1;
712 }
713 spin_unlock(&root->list_lock);
714
715 nr = trans->blocks_used;
716 ret = btrfs_end_transaction(trans, tree_root);
717 BUG_ON(ret);
718
719 ret = btrfs_remove_leaf_refs(root, max_useless, 0);
720 BUG_ON(ret);
721
722 free_extent_buffer(dirty->root->node);
723 kfree(dirty->root);
724 kfree(dirty);
725
726 btrfs_btree_balance_dirty(tree_root, nr);
727 cond_resched();
728 }
729 return ret;
730}
731
732/*
733 * new snapshots need to be created at a very specific time in the
734 * transaction commit. This does the actual creation
735 */
736static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
737 struct btrfs_fs_info *fs_info,
738 struct btrfs_pending_snapshot *pending)
739{
740 struct btrfs_key key;
741 struct btrfs_root_item *new_root_item;
742 struct btrfs_root *tree_root = fs_info->tree_root;
743 struct btrfs_root *root = pending->root;
744 struct extent_buffer *tmp;
745 struct extent_buffer *old;
746 int ret;
747 u64 objectid;
748
749 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
750 if (!new_root_item) {
751 ret = -ENOMEM;
752 goto fail;
753 }
754 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
755 if (ret)
756 goto fail;
757
758 btrfs_record_root_in_trans(root);
759 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
760 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
761
762 key.objectid = objectid;
763 key.offset = trans->transid;
764 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
765
766 old = btrfs_lock_root_node(root);
767 btrfs_cow_block(trans, root, old, NULL, 0, &old, 0);
768
769 btrfs_copy_root(trans, root, old, &tmp, objectid);
770 btrfs_tree_unlock(old);
771 free_extent_buffer(old);
772
773 btrfs_set_root_bytenr(new_root_item, tmp->start);
774 btrfs_set_root_level(new_root_item, btrfs_header_level(tmp));
775 btrfs_set_root_generation(new_root_item, trans->transid);
776 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
777 new_root_item);
778 btrfs_tree_unlock(tmp);
779 free_extent_buffer(tmp);
780 if (ret)
781 goto fail;
782
783 key.offset = (u64)-1;
784 memcpy(&pending->root_key, &key, sizeof(key));
785fail:
786 kfree(new_root_item);
787 return ret;
788}
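
/*
 * the root item was inserted with key.offset == transid above; the
 * (u64)-1 offset saved in pending->root_key makes the later lookup
 * find the most recent version of the new root when the snapshot is
 * linked into the directory tree.
 */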
789
790static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
791 struct btrfs_pending_snapshot *pending)
792{
793 int ret;
794 int namelen;
795 u64 index = 0;
796 struct btrfs_trans_handle *trans;
797 struct inode *parent_inode;
798 struct inode *inode;
799 struct btrfs_root *parent_root;
800
801 parent_inode = pending->dentry->d_parent->d_inode;
802 parent_root = BTRFS_I(parent_inode)->root;
803 trans = btrfs_join_transaction(parent_root, 1);
804
805 /*
806 * insert the directory item
807 */
808 namelen = strlen(pending->name);
 809	ret = btrfs_set_inode_index(parent_inode, &index);
 810	if (ret)
 811		goto fail;
810 ret = btrfs_insert_dir_item(trans, parent_root,
811 pending->name, namelen,
812 parent_inode->i_ino,
813 &pending->root_key, BTRFS_FT_DIR, index);
814
815 if (ret)
816 goto fail;
817
818 btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
819 ret = btrfs_update_inode(trans, parent_root, parent_inode);
820 BUG_ON(ret);
821
822 /* add the backref first */
823 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
824 pending->root_key.objectid,
825 BTRFS_ROOT_BACKREF_KEY,
826 parent_root->root_key.objectid,
827 parent_inode->i_ino, index, pending->name,
828 namelen);
829
830 BUG_ON(ret);
831
832 /* now add the forward ref */
833 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
834 parent_root->root_key.objectid,
835 BTRFS_ROOT_REF_KEY,
836 pending->root_key.objectid,
837 parent_inode->i_ino, index, pending->name,
838 namelen);
839
840 inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
841 d_instantiate(pending->dentry, inode);
842fail:
843 btrfs_end_transaction(trans, fs_info->fs_root);
844 return ret;
845}
846
847/*
848 * create all the snapshots we've scheduled for creation
849 */
850static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
851 struct btrfs_fs_info *fs_info)
852{
853 struct btrfs_pending_snapshot *pending;
854 struct list_head *head = &trans->transaction->pending_snapshots;
855 struct list_head *cur;
856 int ret;
857
858 list_for_each(cur, head) {
859 pending = list_entry(cur, struct btrfs_pending_snapshot, list);
860 ret = create_pending_snapshot(trans, fs_info, pending);
861 BUG_ON(ret);
862 }
863 return 0;
864}
865
866static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
867 struct btrfs_fs_info *fs_info)
868{
869 struct btrfs_pending_snapshot *pending;
870 struct list_head *head = &trans->transaction->pending_snapshots;
871 int ret;
872
873 while (!list_empty(head)) {
874 pending = list_entry(head->next,
875 struct btrfs_pending_snapshot, list);
876 ret = finish_pending_snapshot(fs_info, pending);
877 BUG_ON(ret);
878 list_del(&pending->list);
879 kfree(pending->name);
880 kfree(pending);
881 }
882 return 0;
883}
884
885int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
886 struct btrfs_root *root)
887{
888 unsigned long joined = 0;
889 unsigned long timeout = 1;
890 struct btrfs_transaction *cur_trans;
891 struct btrfs_transaction *prev_trans = NULL;
892 struct btrfs_root *chunk_root = root->fs_info->chunk_root;
893 struct list_head dirty_fs_roots;
894 struct extent_io_tree *pinned_copy;
895 DEFINE_WAIT(wait);
896 int ret;
897
898 INIT_LIST_HEAD(&dirty_fs_roots);
899 mutex_lock(&root->fs_info->trans_mutex);
900 if (trans->transaction->in_commit) {
901 cur_trans = trans->transaction;
902 trans->transaction->use_count++;
903 mutex_unlock(&root->fs_info->trans_mutex);
904 btrfs_end_transaction(trans, root);
905
906 ret = wait_for_commit(root, cur_trans);
907 BUG_ON(ret);
908
909 mutex_lock(&root->fs_info->trans_mutex);
910 put_transaction(cur_trans);
911 mutex_unlock(&root->fs_info->trans_mutex);
912
913 return 0;
914 }
915
916 pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
917 if (!pinned_copy)
918 return -ENOMEM;
919
920 extent_io_tree_init(pinned_copy,
921 root->fs_info->btree_inode->i_mapping, GFP_NOFS);
922
923 trans->transaction->in_commit = 1;
924 trans->transaction->blocked = 1;
925 cur_trans = trans->transaction;
926 if (cur_trans->list.prev != &root->fs_info->trans_list) {
927 prev_trans = list_entry(cur_trans->list.prev,
928 struct btrfs_transaction, list);
929 if (!prev_trans->commit_done) {
930 prev_trans->use_count++;
931 mutex_unlock(&root->fs_info->trans_mutex);
932
933 wait_for_commit(root, prev_trans);
934
935 mutex_lock(&root->fs_info->trans_mutex);
936 put_transaction(prev_trans);
937 }
938 }
939
940 do {
941 int snap_pending = 0;
942 joined = cur_trans->num_joined;
943 if (!list_empty(&trans->transaction->pending_snapshots))
944 snap_pending = 1;
945
946 WARN_ON(cur_trans != trans->transaction);
947 prepare_to_wait(&cur_trans->writer_wait, &wait,
948 TASK_UNINTERRUPTIBLE);
949
950 if (cur_trans->num_writers > 1)
951 timeout = MAX_SCHEDULE_TIMEOUT;
952 else
953 timeout = 1;
954
955 mutex_unlock(&root->fs_info->trans_mutex);
956
957 if (snap_pending) {
958 ret = btrfs_wait_ordered_extents(root, 1);
959 BUG_ON(ret);
960 }
961
962 schedule_timeout(timeout);
963
964 mutex_lock(&root->fs_info->trans_mutex);
965 finish_wait(&cur_trans->writer_wait, &wait);
966 } while (cur_trans->num_writers > 1 ||
967 (cur_trans->num_joined != joined));
968
969 ret = create_pending_snapshots(trans, root->fs_info);
970 BUG_ON(ret);
971
972 WARN_ON(cur_trans != trans->transaction);
973
974 /* btrfs_commit_tree_roots is responsible for getting the
975 * various roots consistent with each other. Every pointer
 976	 * in the tree of tree roots has to point to the most up-to-date
977 * root for every subvolume and other tree. So, we have to keep
978 * the tree logging code from jumping in and changing any
979 * of the trees.
980 *
981 * At this point in the commit, there can't be any tree-log
982 * writers, but a little lower down we drop the trans mutex
983 * and let new people in. By holding the tree_log_mutex
984 * from now until after the super is written, we avoid races
985 * with the tree-log code.
986 */
987 mutex_lock(&root->fs_info->tree_log_mutex);
988 /*
989 * keep tree reloc code from adding new reloc trees
990 */
991 mutex_lock(&root->fs_info->tree_reloc_mutex);
992
993
994 ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
995 &dirty_fs_roots);
996 BUG_ON(ret);
997
 998	/* add_dirty_roots gets rid of all the tree log roots; it is now
 999	 * safe to free the root of the tree of log roots
1000 */
1001 btrfs_free_log_root_tree(trans, root->fs_info);
1002
1003 ret = btrfs_commit_tree_roots(trans, root);
1004 BUG_ON(ret);
1005
1006 cur_trans = root->fs_info->running_transaction;
1007 spin_lock(&root->fs_info->new_trans_lock);
1008 root->fs_info->running_transaction = NULL;
1009 spin_unlock(&root->fs_info->new_trans_lock);
1010 btrfs_set_super_generation(&root->fs_info->super_copy,
1011 cur_trans->transid);
1012 btrfs_set_super_root(&root->fs_info->super_copy,
1013 root->fs_info->tree_root->node->start);
1014 btrfs_set_super_root_level(&root->fs_info->super_copy,
1015 btrfs_header_level(root->fs_info->tree_root->node));
1016
1017 btrfs_set_super_chunk_root(&root->fs_info->super_copy,
1018 chunk_root->node->start);
1019 btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,
1020 btrfs_header_level(chunk_root->node));
1021 btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy,
1022 btrfs_header_generation(chunk_root->node));
1023
1024 if (!root->fs_info->log_root_recovering) {
1025 btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
1026 btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
1027 }
1028
1029 memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
1030 sizeof(root->fs_info->super_copy));
1031
1032 btrfs_copy_pinned(root, pinned_copy);
1033
1034 trans->transaction->blocked = 0;
1035 wake_up(&root->fs_info->transaction_throttle);
1036 wake_up(&root->fs_info->transaction_wait);
1037
1038 mutex_unlock(&root->fs_info->trans_mutex);
1039 ret = btrfs_write_and_wait_transaction(trans, root);
1040 BUG_ON(ret);
1041 write_ctree_super(trans, root, 0);
1042
1043 /*
1044 * the super is written, we can safely allow the tree-loggers
1045 * to go about their business
1046 */
1047 mutex_unlock(&root->fs_info->tree_log_mutex);
1048
1049 btrfs_finish_extent_commit(trans, root, pinned_copy);
1050 kfree(pinned_copy);
1051
1052 btrfs_drop_dead_reloc_roots(root);
1053 mutex_unlock(&root->fs_info->tree_reloc_mutex);
1054
1055 /* do the directory inserts of any pending snapshot creations */
1056 finish_pending_snapshots(trans, root->fs_info);
1057
1058 mutex_lock(&root->fs_info->trans_mutex);
1059
1060 cur_trans->commit_done = 1;
1061 root->fs_info->last_trans_committed = cur_trans->transid;
1062 wake_up(&cur_trans->commit_wait);
1063
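	/* drop two references: one for this handle's join and one that
	 * was held by fs_info->running_transaction, cleared above */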
1064 put_transaction(cur_trans);
1065 put_transaction(cur_trans);
1066
1067 list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
1068 if (root->fs_info->closing)
1069 list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);
1070
1071 mutex_unlock(&root->fs_info->trans_mutex);
1072
1073 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1074
1075 if (root->fs_info->closing)
1076 drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
1077 return ret;
1078}
1079
1080/*
1081 * interface function to delete all the snapshots scheduled for deletion
1082 */
1083int btrfs_clean_old_snapshots(struct btrfs_root *root)
1084{
1085 struct list_head dirty_roots;
1086 INIT_LIST_HEAD(&dirty_roots);
1087again:
1088 mutex_lock(&root->fs_info->trans_mutex);
1089 list_splice_init(&root->fs_info->dead_roots, &dirty_roots);
1090 mutex_unlock(&root->fs_info->trans_mutex);
1091
1092 if (!list_empty(&dirty_roots)) {
1093 drop_dirty_roots(root, &dirty_roots);
1094 goto again;
1095 }
1096 return 0;
1097}
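
/*
 * the splice-and-retry loop above is there because dropping a batch
 * of dead roots commits transactions of its own, and those commits
 * can move more dead roots onto fs_info->dead_roots.
 */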
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
new file mode 100644
index 000000000000..ea292117f882
--- /dev/null
+++ b/fs/btrfs/transaction.h
@@ -0,0 +1,106 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_TRANSACTION__
20#define __BTRFS_TRANSACTION__
21#include "btrfs_inode.h"
22
23struct btrfs_transaction {
24 u64 transid;
25 unsigned long num_writers;
26 unsigned long num_joined;
27 int in_commit;
28 int use_count;
29 int commit_done;
30 int blocked;
31 struct list_head list;
32 struct extent_io_tree dirty_pages;
33 unsigned long start_time;
34 wait_queue_head_t writer_wait;
35 wait_queue_head_t commit_wait;
36 struct list_head pending_snapshots;
37};
38
39struct btrfs_trans_handle {
40 u64 transid;
41 unsigned long blocks_reserved;
42 unsigned long blocks_used;
43 struct btrfs_transaction *transaction;
44 u64 block_group;
45 u64 alloc_exclude_start;
46 u64 alloc_exclude_nr;
47};
48
49struct btrfs_pending_snapshot {
50 struct dentry *dentry;
51 struct btrfs_root *root;
52 char *name;
53 struct btrfs_key root_key;
54 struct list_head list;
55};
56
57struct btrfs_dirty_root {
58 struct list_head list;
59 struct btrfs_root *root;
60 struct btrfs_root *latest_root;
61};
62
63static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
64 struct inode *inode)
65{
66 trans->block_group = BTRFS_I(inode)->block_group;
67}
68
69static inline void btrfs_update_inode_block_group(
70 struct btrfs_trans_handle *trans,
71 struct inode *inode)
72{
73 BTRFS_I(inode)->block_group = trans->block_group;
74}
75
76static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
77 struct inode *inode)
78{
79 BTRFS_I(inode)->last_trans = trans->transaction->transid;
80}
81
82int btrfs_end_transaction(struct btrfs_trans_handle *trans,
83 struct btrfs_root *root);
84struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
85 int num_blocks);
86struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
87 int num_blocks);
88struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
89 int num_blocks);
90int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
91 struct btrfs_root *root);
92int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
93 struct btrfs_root *root);
94
95int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest);
96int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
97int btrfs_clean_old_snapshots(struct btrfs_root *root);
98int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
99 struct btrfs_root *root);
100int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
101 struct btrfs_root *root);
102void btrfs_throttle(struct btrfs_root *root);
103int btrfs_record_root_in_trans(struct btrfs_root *root);
104int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
105 struct extent_io_tree *dirty_pages);
106#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
new file mode 100644
index 000000000000..3e8358c36165
--- /dev/null
+++ b/fs/btrfs/tree-defrag.c
@@ -0,0 +1,147 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "disk-io.h"
22#include "print-tree.h"
23#include "transaction.h"
24#include "locking.h"
25
26/* defrag all the leaves in a given btree. If cache_only == 1, don't read
27 * things from disk, otherwise read all the leaves and try to get key order to
28 * better reflect disk order
29 */
30
31int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
32 struct btrfs_root *root, int cache_only)
33{
34 struct btrfs_path *path = NULL;
35 struct btrfs_key key;
36 int ret = 0;
37 int wret;
38 int level;
39 int orig_level;
40 int is_extent = 0;
41 int next_key_ret = 0;
42 u64 last_ret = 0;
43 u64 min_trans = 0;
44
45 if (cache_only)
46 goto out;
47
48 if (root->fs_info->extent_root == root) {
49 /*
50 * there's recursion here right now in the tree locking,
51 * we can't defrag the extent root without deadlock
52 */
53 goto out;
54 }
55
56 if (root->ref_cows == 0 && !is_extent)
57 goto out;
58
59 if (btrfs_test_opt(root, SSD))
60 goto out;
61
62 path = btrfs_alloc_path();
63 if (!path)
64 return -ENOMEM;
65
66 level = btrfs_header_level(root->node);
67 orig_level = level;
68
69 if (level == 0)
70 goto out;
71
72 if (root->defrag_progress.objectid == 0) {
73 struct extent_buffer *root_node;
74 u32 nritems;
75
76 root_node = btrfs_lock_root_node(root);
77 nritems = btrfs_header_nritems(root_node);
78 root->defrag_max.objectid = 0;
79 /* from above we know this is not a leaf */
80 btrfs_node_key_to_cpu(root_node, &root->defrag_max,
81 nritems - 1);
82 btrfs_tree_unlock(root_node);
83 free_extent_buffer(root_node);
84 memset(&key, 0, sizeof(key));
85 } else {
86 memcpy(&key, &root->defrag_progress, sizeof(key));
87 }
88
89 path->keep_locks = 1;
90 if (cache_only)
91 min_trans = root->defrag_trans_start;
92
93 ret = btrfs_search_forward(root, &key, NULL, path,
94 cache_only, min_trans);
95 if (ret < 0)
96 goto out;
97 if (ret > 0) {
98 ret = 0;
99 goto out;
100 }
101 btrfs_release_path(root, path);
102 wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
103
104 if (wret < 0) {
105 ret = wret;
106 goto out;
107 }
108 if (!path->nodes[1]) {
109 ret = 0;
110 goto out;
111 }
112 path->slots[1] = btrfs_header_nritems(path->nodes[1]);
113 next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only,
114 min_trans);
115 ret = btrfs_realloc_node(trans, root,
116 path->nodes[1], 0,
117 cache_only, &last_ret,
118 &root->defrag_progress);
119 WARN_ON(ret && ret != -EAGAIN);
120 if (next_key_ret == 0) {
121 memcpy(&root->defrag_progress, &key, sizeof(key));
122 ret = -EAGAIN;
123 }
124
125 btrfs_release_path(root, path);
126 if (is_extent)
127 btrfs_extent_post_op(trans, root);
128out:
129 if (path)
130 btrfs_free_path(path);
131 if (ret == -EAGAIN) {
132 if (root->defrag_max.objectid > root->defrag_progress.objectid)
133 goto done;
134 if (root->defrag_max.type > root->defrag_progress.type)
135 goto done;
136 if (root->defrag_max.offset > root->defrag_progress.offset)
137 goto done;
138 ret = 0;
139 }
140done:
141 if (ret != -EAGAIN) {
142 memset(&root->defrag_progress, 0,
143 sizeof(root->defrag_progress));
144 root->defrag_trans_start = trans->transid;
145 }
146 return ret;
147}
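/*
 * Illustrative sketch only, not part of this patch: btrfs_defrag_leaves()
 * returns -EAGAIN when it has made progress but is not done, with
 * root->defrag_progress recording where to resume.  A driver such as
 * btrfs_defrag_root() is expected to restart it in a fresh transaction
 * until it returns something other than -EAGAIN, roughly like this
 * (throttling and error handling elided):
 */
static int example_defrag_driver(struct btrfs_root *root, int cache_only)
{
	struct btrfs_trans_handle *trans;
	int ret = -EAGAIN;

	while (ret == -EAGAIN) {
		trans = btrfs_start_transaction(root, 1);
		ret = btrfs_defrag_leaves(trans, root, cache_only);
		btrfs_end_transaction(trans, root);
	}
	return ret;
}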
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
new file mode 100644
index 000000000000..d81cda2e077c
--- /dev/null
+++ b/fs/btrfs/tree-log.c
@@ -0,0 +1,2898 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include "ctree.h"
21#include "transaction.h"
22#include "disk-io.h"
23#include "locking.h"
24#include "print-tree.h"
25#include "compat.h"
26#include "tree-log.h"
27
28/* magic values for the inode_only field in btrfs_log_inode:
29 *
30 * LOG_INODE_ALL means to log everything
31 * LOG_INODE_EXISTS means to log just enough to recreate the inode
32 * during log replay
33 */
34#define LOG_INODE_ALL 0
35#define LOG_INODE_EXISTS 1
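/*
 * Example (illustrative, not from this patch): an fsync of a regular
 * file logs that inode with LOG_INODE_ALL, while the parent
 * directories naming it only need LOG_INODE_EXISTS, which is enough
 * for replay to recreate the path without copying directory contents.
 */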
36
37/*
38 * stages for the tree walking. The first
39 * stage (0) is to only pin down the blocks we find
40 * the second stage (1) is to make sure that all the inodes
41 * we find in the log are created in the subvolume.
42 *
43 * The last stage is to deal with directories and links and extents
44 * and all the other fun semantics
45 */
46#define LOG_WALK_PIN_ONLY 0
47#define LOG_WALK_REPLAY_INODES 1
48#define LOG_WALK_REPLAY_ALL 2
49
50static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
51 struct btrfs_root *root, struct inode *inode,
52 int inode_only);
53static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root,
55 struct btrfs_path *path, u64 objectid);
56
57/*
58 * tree logging is a special write ahead log used to make sure that
59 * fsyncs and O_SYNCs can happen without doing full tree commits.
60 *
61 * Full tree commits are expensive because they require commonly
62 * modified blocks to be recowed, creating many dirty pages in the
63 * extent tree and a 4x-6x higher write load than ext3.
64 *
65 * Instead of doing a tree commit on every fsync, we use the
66 * key ranges and transaction ids to find items for a given file or directory
67 * that have changed in this transaction. Those items are copied into
68 * a special tree (one per subvolume root), that tree is written to disk
69 * and then the fsync is considered complete.
70 *
71 * After a crash, items are copied out of the log-tree back into the
72 * subvolume tree. Any file data extents found are recorded in the extent
73 * allocation tree, and the log-tree freed.
74 *
75 * The log tree is read three times: once to pin down all the extents it is
76 * using in ram, once to create all the inodes logged in the tree
77 * and once to do all the other items.
78 */
79
80/*
81 * btrfs_add_log_tree adds a new per-subvolume log tree into the
82 * tree of log tree roots. This must be called with a tree log transaction
83 * running (see start_log_trans).
84 */
85static int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
86 struct btrfs_root *root)
87{
88 struct btrfs_key key;
89 struct btrfs_root_item root_item;
90 struct btrfs_inode_item *inode_item;
91 struct extent_buffer *leaf;
92 struct btrfs_root *new_root = root;
93 int ret;
94 u64 objectid = root->root_key.objectid;
95
96 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
97 BTRFS_TREE_LOG_OBJECTID,
98 trans->transid, 0, 0, 0);
99 if (IS_ERR(leaf)) {
100 ret = PTR_ERR(leaf);
101 return ret;
102 }
103
104 btrfs_set_header_nritems(leaf, 0);
105 btrfs_set_header_level(leaf, 0);
106 btrfs_set_header_bytenr(leaf, leaf->start);
107 btrfs_set_header_generation(leaf, trans->transid);
108 btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
109
110 write_extent_buffer(leaf, root->fs_info->fsid,
111 (unsigned long)btrfs_header_fsid(leaf),
112 BTRFS_FSID_SIZE);
113 btrfs_mark_buffer_dirty(leaf);
114
115 inode_item = &root_item.inode;
116 memset(inode_item, 0, sizeof(*inode_item));
117 inode_item->generation = cpu_to_le64(1);
118 inode_item->size = cpu_to_le64(3);
119 inode_item->nlink = cpu_to_le32(1);
120 inode_item->nbytes = cpu_to_le64(root->leafsize);
121 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
122
123 btrfs_set_root_bytenr(&root_item, leaf->start);
124 btrfs_set_root_generation(&root_item, trans->transid);
125 btrfs_set_root_level(&root_item, 0);
126 btrfs_set_root_refs(&root_item, 0);
127 btrfs_set_root_used(&root_item, 0);
128
129 memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
130 root_item.drop_level = 0;
131
132 btrfs_tree_unlock(leaf);
133 free_extent_buffer(leaf);
134 leaf = NULL;
135
136 btrfs_set_root_dirid(&root_item, 0);
137
138 key.objectid = BTRFS_TREE_LOG_OBJECTID;
139 key.offset = objectid;
140 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
141 ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
142 &root_item);
143 if (ret)
144 goto fail;
145
146 new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
147 &key);
148 BUG_ON(!new_root);
149
150 WARN_ON(root->log_root);
151 root->log_root = new_root;
152
153 /*
154 * log trees do not get reference counted because they go away
155 * before a real commit is actually done. They do store pointers
156 * to file data extents, and those reference counts still get
157 * updated (along with back refs to the log tree).
158 */
159 new_root->ref_cows = 0;
160 new_root->last_trans = trans->transid;
161
162 /*
163 * we need to make sure the root block for this new tree
164 * is marked as dirty in the dirty_log_pages tree. This
165 * is how it gets flushed down to disk at tree log commit time.
166 *
167 * the tree logging mutex keeps others from coming in and changing
168 * the new_root->node, so we can safely access it here
169 */
170 set_extent_dirty(&new_root->dirty_log_pages, new_root->node->start,
171 new_root->node->start + new_root->node->len - 1,
172 GFP_NOFS);
173
174fail:
175 return ret;
176}
177
178/*
179 * start a sub transaction and setup the log tree
180 * this increments the log tree writer count to make the people
181 * syncing the tree wait for us to finish
182 */
183static int start_log_trans(struct btrfs_trans_handle *trans,
184 struct btrfs_root *root)
185{
186 int ret;
187 mutex_lock(&root->fs_info->tree_log_mutex);
188 if (!root->fs_info->log_root_tree) {
189 ret = btrfs_init_log_root_tree(trans, root->fs_info);
190 BUG_ON(ret);
191 }
192 if (!root->log_root) {
193 ret = btrfs_add_log_tree(trans, root);
194 BUG_ON(ret);
195 }
196 atomic_inc(&root->fs_info->tree_log_writers);
197 root->fs_info->tree_log_batch++;
198 mutex_unlock(&root->fs_info->tree_log_mutex);
199 return 0;
200}
201
202/*
203 * returns 0 if there was a log transaction running and we were able
204 * to join, or returns -ENOENT if there was no transaction
205 * in progress
206 */
207static int join_running_log_trans(struct btrfs_root *root)
208{
209 int ret = -ENOENT;
210
211 smp_mb();
212 if (!root->log_root)
213 return -ENOENT;
214
215 mutex_lock(&root->fs_info->tree_log_mutex);
216 if (root->log_root) {
217 ret = 0;
218 atomic_inc(&root->fs_info->tree_log_writers);
219 root->fs_info->tree_log_batch++;
220 }
221 mutex_unlock(&root->fs_info->tree_log_mutex);
222 return ret;
223}
224
225/*
226 * indicate we're done making changes to the log tree
227 * and wake up anyone waiting to do a sync
228 */
229static int end_log_trans(struct btrfs_root *root)
230{
231 atomic_dec(&root->fs_info->tree_log_writers);
232 smp_mb();
233 if (waitqueue_active(&root->fs_info->tree_log_wait))
234 wake_up(&root->fs_info->tree_log_wait);
235 return 0;
236}
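/*
 * Illustrative sketch only, not part of this patch: a log writer is
 * expected to bracket its updates between start_log_trans() and
 * end_log_trans(), the way the fsync path does:
 */
static int example_log_one_inode(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct inode *inode)
{
	int ret;

	start_log_trans(trans, root);
	ret = __btrfs_log_inode(trans, root, inode, LOG_INODE_ALL);
	end_log_trans(root);
	return ret;
}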
237
238
239/*
240 * the walk control struct is used to pass state down the chain when
241 * processing the log tree. The stage field tells us which part
242 * of the log tree processing we are currently doing. The others
243 * are state fields used for that specific part
244 */
245struct walk_control {
246 /* should we free the extent on disk when done? This is used
247 * at transaction commit time while freeing a log tree
248 */
249 int free;
250
251 /* should we write out the extent buffer? This is used
252 * while flushing the log tree to disk during a sync
253 */
254 int write;
255
256 /* should we wait for the extent buffer io to finish? Also used
257 * while flushing the log tree to disk for a sync
258 */
259 int wait;
260
261 /* pin only walk, we record which extents on disk belong to the
262 * log trees
263 */
264 int pin;
265
266 /* what stage of the replay code we're currently in */
267 int stage;
268
269 /* the root we are currently replaying */
270 struct btrfs_root *replay_dest;
271
272 /* the trans handle for the current replay */
273 struct btrfs_trans_handle *trans;
274
275 /* the function that gets used to process blocks we find in the
276 * tree. Note the extent_buffer might not be up to date when it is
277 * passed in, and it must be checked or read if you need the data
278 * inside it
279 */
280 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
281 struct walk_control *wc, u64 gen);
282};
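/*
 * Example (illustrative): freeing a log tree at commit time uses
 * { .free = 1, .process_func = process_one_buffer }, while mount-time
 * replay starts with { .pin = 1 } and then runs the LOG_WALK_REPLAY_*
 * stages with replay_one_buffer() (see the sketch after
 * walk_log_tree() below).
 */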
283
284/*
285 * process_func used to pin down extents, write them or wait on them
286 */
287static int process_one_buffer(struct btrfs_root *log,
288 struct extent_buffer *eb,
289 struct walk_control *wc, u64 gen)
290{
291 if (wc->pin) {
292 mutex_lock(&log->fs_info->pinned_mutex);
293 btrfs_update_pinned_extents(log->fs_info->extent_root,
294 eb->start, eb->len, 1);
295 mutex_unlock(&log->fs_info->pinned_mutex);
296 }
297
298 if (btrfs_buffer_uptodate(eb, gen)) {
299 if (wc->write)
300 btrfs_write_tree_block(eb);
301 if (wc->wait)
302 btrfs_wait_tree_block_writeback(eb);
303 }
304 return 0;
305}
306
307/*
308 * Item overwrite used by replay and tree logging. eb, slot and key all refer
309 * to the src data we are copying out.
310 *
311 * root is the tree we are copying into, and path is a scratch
312 * path for use in this function (it should be released on entry and
313 * will be released on exit).
314 *
315 * If the key is already in the destination tree the existing item is
316 * overwritten. If the existing item isn't big enough, it is extended.
317 * If it is too large, it is truncated.
318 *
319 * If the key isn't in the destination yet, a new item is inserted.
320 */
321static noinline int overwrite_item(struct btrfs_trans_handle *trans,
322 struct btrfs_root *root,
323 struct btrfs_path *path,
324 struct extent_buffer *eb, int slot,
325 struct btrfs_key *key)
326{
327 int ret;
328 u32 item_size;
329 u64 saved_i_size = 0;
330 int save_old_i_size = 0;
331 unsigned long src_ptr;
332 unsigned long dst_ptr;
333 int overwrite_root = 0;
334
335 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
336 overwrite_root = 1;
337
338 item_size = btrfs_item_size_nr(eb, slot);
339 src_ptr = btrfs_item_ptr_offset(eb, slot);
340
341 /* look for the key in the destination tree */
342 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
343 if (ret == 0) {
344 char *src_copy;
345 char *dst_copy;
346 u32 dst_size = btrfs_item_size_nr(path->nodes[0],
347 path->slots[0]);
348 if (dst_size != item_size)
349 goto insert;
350
351 if (item_size == 0) {
352 btrfs_release_path(root, path);
353 return 0;
354 }
355 dst_copy = kmalloc(item_size, GFP_NOFS);
356 src_copy = kmalloc(item_size, GFP_NOFS);
357
358 read_extent_buffer(eb, src_copy, src_ptr, item_size);
359
360 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
361 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
362 item_size);
363 ret = memcmp(dst_copy, src_copy, item_size);
364
365 kfree(dst_copy);
366 kfree(src_copy);
367 /*
368 * they have the same contents, just return, this saves
369 * us from cowing blocks in the destination tree and doing
370 * extra writes that may not have been done by a previous
371 * sync
372 */
373 if (ret == 0) {
374 btrfs_release_path(root, path);
375 return 0;
376 }
377
378 }
379insert:
380 btrfs_release_path(root, path);
381 /* try to insert the key into the destination tree */
382 ret = btrfs_insert_empty_item(trans, root, path,
383 key, item_size);
384
385 /* make sure any existing item is the correct size */
386 if (ret == -EEXIST) {
387 u32 found_size;
388 found_size = btrfs_item_size_nr(path->nodes[0],
389 path->slots[0]);
390 if (found_size > item_size) {
391 btrfs_truncate_item(trans, root, path, item_size, 1);
392 } else if (found_size < item_size) {
393 ret = btrfs_extend_item(trans, root, path,
394 item_size - found_size);
395 BUG_ON(ret);
396 }
397 } else if (ret) {
398 BUG();
399 }
400 dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
401 path->slots[0]);
402
403 /* don't overwrite an existing inode if the generation number
404 * was logged as zero. This is done when the tree logging code
405 * is just logging an inode to make sure it exists after recovery.
406 *
407 * Also, don't overwrite i_size on directories during replay.
408 * log replay inserts and removes directory items based on the
409 * state of the tree found in the subvolume, and i_size is modified
410 * as it goes
411 */
412 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
413 struct btrfs_inode_item *src_item;
414 struct btrfs_inode_item *dst_item;
415
416 src_item = (struct btrfs_inode_item *)src_ptr;
417 dst_item = (struct btrfs_inode_item *)dst_ptr;
418
419 if (btrfs_inode_generation(eb, src_item) == 0)
420 goto no_copy;
421
422 if (overwrite_root &&
423 S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
424 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
425 save_old_i_size = 1;
426 saved_i_size = btrfs_inode_size(path->nodes[0],
427 dst_item);
428 }
429 }
430
431 copy_extent_buffer(path->nodes[0], eb, dst_ptr,
432 src_ptr, item_size);
433
434 if (save_old_i_size) {
435 struct btrfs_inode_item *dst_item;
436 dst_item = (struct btrfs_inode_item *)dst_ptr;
437 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
438 }
439
440 /* make sure the generation is filled in */
441 if (key->type == BTRFS_INODE_ITEM_KEY) {
442 struct btrfs_inode_item *dst_item;
443 dst_item = (struct btrfs_inode_item *)dst_ptr;
444 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
445 btrfs_set_inode_generation(path->nodes[0], dst_item,
446 trans->transid);
447 }
448 }
449no_copy:
450 btrfs_mark_buffer_dirty(path->nodes[0]);
451 btrfs_release_path(root, path);
452 return 0;
453}
454
455/*
456 * simple helper to read an inode off the disk from a given root
457 * This can only be called for subvolume roots and not for the log
458 */
459static noinline struct inode *read_one_inode(struct btrfs_root *root,
460 u64 objectid)
461{
462 struct inode *inode;
463 inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
464 if (inode->i_state & I_NEW) {
465 BTRFS_I(inode)->root = root;
466 BTRFS_I(inode)->location.objectid = objectid;
467 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
468 BTRFS_I(inode)->location.offset = 0;
469 btrfs_read_locked_inode(inode);
470 unlock_new_inode(inode);
471
472 }
473 if (is_bad_inode(inode)) {
474 iput(inode);
475 inode = NULL;
476 }
477 return inode;
478}
479
480/* replays a single extent in 'eb' at 'slot' with 'key' into the
481 * subvolume 'root'. path is released on entry and should be released
482 * on exit.
483 *
484 * extents in the log tree have not been allocated out of the extent
485 * tree yet. So, this completes the allocation, taking a reference
486 * as required if the extent already exists or creating a new extent
487 * if it isn't in the extent allocation tree yet.
488 *
489 * The extent is inserted into the file, dropping any existing extents
490 * from the file that overlap the new one.
491 */
492static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
493 struct btrfs_root *root,
494 struct btrfs_path *path,
495 struct extent_buffer *eb, int slot,
496 struct btrfs_key *key)
497{
498 int found_type;
499 u64 mask = root->sectorsize - 1;
500 u64 extent_end;
501 u64 alloc_hint;
502 u64 start = key->offset;
503 u64 saved_nbytes;
504 struct btrfs_file_extent_item *item;
505 struct inode *inode = NULL;
506 unsigned long size;
507 int ret = 0;
508
509 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
510 found_type = btrfs_file_extent_type(eb, item);
511
512 if (found_type == BTRFS_FILE_EXTENT_REG ||
513 found_type == BTRFS_FILE_EXTENT_PREALLOC)
514 extent_end = start + btrfs_file_extent_num_bytes(eb, item);
515 else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
516 size = btrfs_file_extent_inline_len(eb, item);
517 extent_end = (start + size + mask) & ~mask;
518 } else {
519 ret = 0;
520 goto out;
521 }
522
523 inode = read_one_inode(root, key->objectid);
524 if (!inode) {
525 ret = -EIO;
526 goto out;
527 }
528
529 /*
530 * first check to see if we already have this extent in the
531 * file. This must be done before the btrfs_drop_extents run
532 * so we don't try to drop this extent.
533 */
534 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
535 start, 0);
536
537 if (ret == 0 &&
538 (found_type == BTRFS_FILE_EXTENT_REG ||
539 found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
540 struct btrfs_file_extent_item cmp1;
541 struct btrfs_file_extent_item cmp2;
542 struct btrfs_file_extent_item *existing;
543 struct extent_buffer *leaf;
544
545 leaf = path->nodes[0];
546 existing = btrfs_item_ptr(leaf, path->slots[0],
547 struct btrfs_file_extent_item);
548
549 read_extent_buffer(eb, &cmp1, (unsigned long)item,
550 sizeof(cmp1));
551 read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
552 sizeof(cmp2));
553
554 /*
555 * we already have a pointer to this exact extent,
556 * we don't have to do anything
557 */
558 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
559 btrfs_release_path(root, path);
560 goto out;
561 }
562 }
563 btrfs_release_path(root, path);
564
565 saved_nbytes = inode_get_bytes(inode);
566 /* drop any overlapping extents */
567 ret = btrfs_drop_extents(trans, root, inode,
568 start, extent_end, start, &alloc_hint);
569 BUG_ON(ret);
570
571 if (found_type == BTRFS_FILE_EXTENT_REG ||
572 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
573 unsigned long dest_offset;
574 struct btrfs_key ins;
575
576 ret = btrfs_insert_empty_item(trans, root, path, key,
577 sizeof(*item));
578 BUG_ON(ret);
579 dest_offset = btrfs_item_ptr_offset(path->nodes[0],
580 path->slots[0]);
581 copy_extent_buffer(path->nodes[0], eb, dest_offset,
582 (unsigned long)item, sizeof(*item));
583
584 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
585 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
586 ins.type = BTRFS_EXTENT_ITEM_KEY;
587
588 if (ins.objectid > 0) {
589 u64 csum_start;
590 u64 csum_end;
591 LIST_HEAD(ordered_sums);
592 /*
593 * is this extent already allocated in the extent
594 * allocation tree? If so, just add a reference
595 */
596 ret = btrfs_lookup_extent(root, ins.objectid,
597 ins.offset);
598 if (ret == 0) {
599 ret = btrfs_inc_extent_ref(trans, root,
600 ins.objectid, ins.offset,
601 path->nodes[0]->start,
602 root->root_key.objectid,
603 trans->transid, key->objectid);
604 } else {
605 /*
606 * insert the extent pointer in the extent
607 * allocation tree
608 */
609 ret = btrfs_alloc_logged_extent(trans, root,
610 path->nodes[0]->start,
611 root->root_key.objectid,
612 trans->transid, key->objectid,
613 &ins);
614 BUG_ON(ret);
615 }
616 btrfs_release_path(root, path);
617
618 if (btrfs_file_extent_compression(eb, item)) {
619 csum_start = ins.objectid;
620 csum_end = csum_start + ins.offset;
621 } else {
622 csum_start = ins.objectid +
623 btrfs_file_extent_offset(eb, item);
624 csum_end = csum_start +
625 btrfs_file_extent_num_bytes(eb, item);
626 }
627
628 ret = btrfs_lookup_csums_range(root->log_root,
629 csum_start, csum_end - 1,
630 &ordered_sums);
631 BUG_ON(ret);
632 while (!list_empty(&ordered_sums)) {
633 struct btrfs_ordered_sum *sums;
634 sums = list_entry(ordered_sums.next,
635 struct btrfs_ordered_sum,
636 list);
637 ret = btrfs_csum_file_blocks(trans,
638 root->fs_info->csum_root,
639 sums);
640 BUG_ON(ret);
641 list_del(&sums->list);
642 kfree(sums);
643 }
644 } else {
645 btrfs_release_path(root, path);
646 }
647 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
648 /* inline extents are easy, we just overwrite them */
649 ret = overwrite_item(trans, root, path, eb, slot, key);
650 BUG_ON(ret);
651 }
652
653 inode_set_bytes(inode, saved_nbytes);
654 btrfs_update_inode(trans, root, inode);
655out:
656 if (inode)
657 iput(inode);
658 return ret;
659}
660
661/*
662 * when cleaning up conflicts between the directory names in the
663 * subvolume, directory names in the log and directory names in the
664 * inode back references, we may have to unlink inodes from directories.
665 *
666 * This is a helper function to do the unlink of a specific directory
667 * item
668 */
669static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
670 struct btrfs_root *root,
671 struct btrfs_path *path,
672 struct inode *dir,
673 struct btrfs_dir_item *di)
674{
675 struct inode *inode;
676 char *name;
677 int name_len;
678 struct extent_buffer *leaf;
679 struct btrfs_key location;
680 int ret;
681
682 leaf = path->nodes[0];
683
684 btrfs_dir_item_key_to_cpu(leaf, di, &location);
685 name_len = btrfs_dir_name_len(leaf, di);
686 name = kmalloc(name_len, GFP_NOFS);
687 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
688 btrfs_release_path(root, path);
689
690 inode = read_one_inode(root, location.objectid);
691 BUG_ON(!inode);
692
693 ret = link_to_fixup_dir(trans, root, path, location.objectid);
694 BUG_ON(ret);
695 ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
696 BUG_ON(ret);
697 kfree(name);
698
699 iput(inode);
700 return ret;
701}
702
703/*
704 * helper function to see if a given name and sequence number found
705 * in an inode back reference are already in a directory and correctly
706 * point to this inode
707 */
708static noinline int inode_in_dir(struct btrfs_root *root,
709 struct btrfs_path *path,
710 u64 dirid, u64 objectid, u64 index,
711 const char *name, int name_len)
712{
713 struct btrfs_dir_item *di;
714 struct btrfs_key location;
715 int match = 0;
716
717 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
718 index, name, name_len, 0);
719 if (di && !IS_ERR(di)) {
720 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
721 if (location.objectid != objectid)
722 goto out;
723 } else
724 goto out;
725 btrfs_release_path(root, path);
726
727 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
728 if (di && !IS_ERR(di)) {
729 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
730 if (location.objectid != objectid)
731 goto out;
732 } else
733 goto out;
734 match = 1;
735out:
736 btrfs_release_path(root, path);
737 return match;
738}
739
740/*
741 * helper function to check a log tree for a named back reference in
742 * an inode. This is used to decide if a back reference that is
743 * found in the subvolume conflicts with what we find in the log.
744 *
745 * inode backreferences may have multiple refs in a single item,
746 * during replay we process one reference at a time, and we don't
747 * want to delete valid links to a file from the subvolume if that
748 * link is also in the log.
749 */
750static noinline int backref_in_log(struct btrfs_root *log,
751 struct btrfs_key *key,
752 char *name, int namelen)
753{
754 struct btrfs_path *path;
755 struct btrfs_inode_ref *ref;
756 unsigned long ptr;
757 unsigned long ptr_end;
758 unsigned long name_ptr;
759 int found_name_len;
760 int item_size;
761 int ret;
762 int match = 0;
763
764 path = btrfs_alloc_path();
765 ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
766 if (ret != 0)
767 goto out;
768
769 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
770 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
771 ptr_end = ptr + item_size;
772 while (ptr < ptr_end) {
773 ref = (struct btrfs_inode_ref *)ptr;
774 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
775 if (found_name_len == namelen) {
776 name_ptr = (unsigned long)(ref + 1);
777 ret = memcmp_extent_buffer(path->nodes[0], name,
778 name_ptr, namelen);
779 if (ret == 0) {
780 match = 1;
781 goto out;
782 }
783 }
784 ptr = (unsigned long)(ref + 1) + found_name_len;
785 }
786out:
787 btrfs_free_path(path);
788 return match;
789}
790
791
792/*
793 * replay one inode back reference item found in the log tree.
794 * eb, slot and key refer to the buffer and key found in the log tree.
795 * root is the destination we are replaying into, and path is for temp
796 * use by this function. (it should be released on return).
797 */
798static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
799 struct btrfs_root *root,
800 struct btrfs_root *log,
801 struct btrfs_path *path,
802 struct extent_buffer *eb, int slot,
803 struct btrfs_key *key)
804{
805 struct inode *dir;
806 int ret;
807 struct btrfs_key location;
808 struct btrfs_inode_ref *ref;
809 struct btrfs_dir_item *di;
810 struct inode *inode;
811 char *name;
812 int namelen;
813 unsigned long ref_ptr;
814 unsigned long ref_end;
815
816 location.objectid = key->objectid;
817 location.type = BTRFS_INODE_ITEM_KEY;
818 location.offset = 0;
819
820 /*
821 * it is possible that we didn't log all the parent directories
822 * for a given inode. If we don't find the dir, just don't
823 * copy the back ref in. The link count fixup code will take
824 * care of the rest
825 */
826 dir = read_one_inode(root, key->offset);
827 if (!dir)
828 return -ENOENT;
829
830 inode = read_one_inode(root, key->objectid);
831 BUG_ON(!inode);
832
833 ref_ptr = btrfs_item_ptr_offset(eb, slot);
834 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
835
836again:
837 ref = (struct btrfs_inode_ref *)ref_ptr;
838
839 namelen = btrfs_inode_ref_name_len(eb, ref);
840 name = kmalloc(namelen, GFP_NOFS);
841 BUG_ON(!name);
842
843 read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
844
845 /* if we already have a perfect match, we're done */
846 if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
847 btrfs_inode_ref_index(eb, ref),
848 name, namelen)) {
849 goto out;
850 }
851
852 /*
853 * look for a conflicting back reference in the metadata.
854 * if we find one we have to unlink that name of the file
855 * before we add our new link. Later on, we overwrite any
856 * existing back reference, and we don't want to create
857 * dangling pointers in the directory.
858 */
859conflict_again:
860 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
861 if (ret == 0) {
862 char *victim_name;
863 int victim_name_len;
864 struct btrfs_inode_ref *victim_ref;
865 unsigned long ptr;
866 unsigned long ptr_end;
867 struct extent_buffer *leaf = path->nodes[0];
868
869 /* are we trying to overwrite a back ref for the root directory?
870 * If so, just jump out, we're done
871 */
872 if (key->objectid == key->offset)
873 goto out_nowrite;
874
875 /* check all the names in this back reference to see
876 * if they are in the log. if so, we allow them to stay
877 * otherwise they must be unlinked as a conflict
878 */
879 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
880 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
881 while (ptr < ptr_end) {
882 victim_ref = (struct btrfs_inode_ref *)ptr;
883 victim_name_len = btrfs_inode_ref_name_len(leaf,
884 victim_ref);
885 victim_name = kmalloc(victim_name_len, GFP_NOFS);
886 BUG_ON(!victim_name);
887
888 read_extent_buffer(leaf, victim_name,
889 (unsigned long)(victim_ref + 1),
890 victim_name_len);
891
892 if (!backref_in_log(log, key, victim_name,
893 victim_name_len)) {
894 btrfs_inc_nlink(inode);
895 btrfs_release_path(root, path);
896 ret = btrfs_unlink_inode(trans, root, dir,
897 inode, victim_name,
898 victim_name_len);
899 kfree(victim_name);
900 btrfs_release_path(root, path);
901 goto conflict_again;
902 }
903 kfree(victim_name);
904 ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
905 }
906 BUG_ON(ret);
907 }
908 btrfs_release_path(root, path);
909
910 /* look for a conflicting sequence number */
911 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
912 btrfs_inode_ref_index(eb, ref),
913 name, namelen, 0);
914 if (di && !IS_ERR(di)) {
915 ret = drop_one_dir_item(trans, root, path, dir, di);
916 BUG_ON(ret);
917 }
918 btrfs_release_path(root, path);
919
920
921 /* look for a conflicting name */
922 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
923 name, namelen, 0);
924 if (di && !IS_ERR(di)) {
925 ret = drop_one_dir_item(trans, root, path, dir, di);
926 BUG_ON(ret);
927 }
928 btrfs_release_path(root, path);
929
930 /* insert our name */
931 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
932 btrfs_inode_ref_index(eb, ref));
933 BUG_ON(ret);
934
935 btrfs_update_inode(trans, root, inode);
936
937out:
938 ref_ptr = (unsigned long)(ref + 1) + namelen;
939 kfree(name);
940 if (ref_ptr < ref_end)
941 goto again;
942
943 /* finally write the back reference in the inode */
944 ret = overwrite_item(trans, root, path, eb, slot, key);
945 BUG_ON(ret);
946
947out_nowrite:
948 btrfs_release_path(root, path);
949 iput(dir);
950 iput(inode);
951 return 0;
952}
953
954/*
955 * There are a few corners where the link count of the file can't
956 * be properly maintained during replay. So, instead of adding
957 * lots of complexity to the log code, we just scan the backrefs
958 * for any file that has been through replay.
959 *
960 * The scan will update the link count on the inode to reflect the
961 * number of back refs found. If it goes down to zero, the iput
962 * will free the inode.
963 */
964static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
965 struct btrfs_root *root,
966 struct inode *inode)
967{
968 struct btrfs_path *path;
969 int ret;
970 struct btrfs_key key;
971 u64 nlink = 0;
972 unsigned long ptr;
973 unsigned long ptr_end;
974 int name_len;
975
976 key.objectid = inode->i_ino;
977 key.type = BTRFS_INODE_REF_KEY;
978 key.offset = (u64)-1;
979
980 path = btrfs_alloc_path();
981
982 while (1) {
983 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
984 if (ret < 0)
985 break;
986 if (ret > 0) {
987 if (path->slots[0] == 0)
988 break;
989 path->slots[0]--;
990 }
991 btrfs_item_key_to_cpu(path->nodes[0], &key,
992 path->slots[0]);
993 if (key.objectid != inode->i_ino ||
994 key.type != BTRFS_INODE_REF_KEY)
995 break;
996 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
997 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
998 path->slots[0]);
999 while (ptr < ptr_end) {
1000 struct btrfs_inode_ref *ref;
1001
1002 ref = (struct btrfs_inode_ref *)ptr;
1003 name_len = btrfs_inode_ref_name_len(path->nodes[0],
1004 ref);
1005 ptr = (unsigned long)(ref + 1) + name_len;
1006 nlink++;
1007 }
1008
1009 if (key.offset == 0)
1010 break;
1011 key.offset--;
1012 btrfs_release_path(root, path);
1013 }
1014 btrfs_free_path(path);
1015 if (nlink != inode->i_nlink) {
1016 inode->i_nlink = nlink;
1017 btrfs_update_inode(trans, root, inode);
1018 }
1019 BTRFS_I(inode)->index_cnt = (u64)-1;
1020
1021 return 0;
1022}
1023
1024static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1025 struct btrfs_root *root,
1026 struct btrfs_path *path)
1027{
1028 int ret;
1029 struct btrfs_key key;
1030 struct inode *inode;
1031
1032 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1033 key.type = BTRFS_ORPHAN_ITEM_KEY;
1034 key.offset = (u64)-1;
1035 while (1) {
1036 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1037 if (ret < 0)
1038 break;
1039
1040 if (ret == 1) {
1041 if (path->slots[0] == 0)
1042 break;
1043 path->slots[0]--;
1044 }
1045
1046 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1047 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1048 key.type != BTRFS_ORPHAN_ITEM_KEY)
1049 break;
1050
1051 ret = btrfs_del_item(trans, root, path);
1052 BUG_ON(ret);
1053
1054 btrfs_release_path(root, path);
1055 inode = read_one_inode(root, key.offset);
1056 BUG_ON(!inode);
1057
1058 ret = fixup_inode_link_count(trans, root, inode);
1059 BUG_ON(ret);
1060
1061 iput(inode);
1062
1063 if (key.offset == 0)
1064 break;
1065 key.offset--;
1066 }
1067 btrfs_release_path(root, path);
1068 return 0;
1069}
1070
1071
1072/*
1073 * record a given inode in the fixup dir so we can check its link
1074 * count when replay is done. The link count is incremented here
1075 * so the inode won't go away until we check it
1076 */
1077static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1078 struct btrfs_root *root,
1079 struct btrfs_path *path,
1080 u64 objectid)
1081{
1082 struct btrfs_key key;
1083 int ret = 0;
1084 struct inode *inode;
1085
1086 inode = read_one_inode(root, objectid);
1087 BUG_ON(!inode);
1088
1089 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1090 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1091 key.offset = objectid;
1092
1093 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1094
1095 btrfs_release_path(root, path);
1096 if (ret == 0) {
1097 btrfs_inc_nlink(inode);
1098 btrfs_update_inode(trans, root, inode);
1099 } else if (ret == -EEXIST) {
1100 ret = 0;
1101 } else {
1102 BUG();
1103 }
1104 iput(inode);
1105
1106 return ret;
1107}
1108
1109/*
1110 * when replaying the log for a directory, we only insert names
1111 * for inodes that actually exist. This means an fsync on a directory
1112 * does not implicitly fsync all the new files in it
1113 */
1114static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1115 struct btrfs_root *root,
1116 struct btrfs_path *path,
1117 u64 dirid, u64 index,
1118 char *name, int name_len, u8 type,
1119 struct btrfs_key *location)
1120{
1121 struct inode *inode;
1122 struct inode *dir;
1123 int ret;
1124
1125 inode = read_one_inode(root, location->objectid);
1126 if (!inode)
1127 return -ENOENT;
1128
1129 dir = read_one_inode(root, dirid);
1130 if (!dir) {
1131 iput(inode);
1132 return -EIO;
1133 }
1134 ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
1135
1136 /* FIXME, put inode into FIXUP list */
1137
1138 iput(inode);
1139 iput(dir);
1140 return ret;
1141}
1142
1143/*
1144 * take a single entry in a log directory item and replay it into
1145 * the subvolume.
1146 *
1147 * if a conflicting item exists in the subdirectory already,
1148 * the inode it points to is unlinked and put into the link count
1149 * fix up tree.
1150 *
1151 * If a name from the log points to a file or directory that does
1152 * not exist in the FS, it is skipped. fsyncs on directories
1153 * do not force down inodes inside that directory, just changes to the
1154 * names or unlinks in a directory.
1155 */
1156static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1157 struct btrfs_root *root,
1158 struct btrfs_path *path,
1159 struct extent_buffer *eb,
1160 struct btrfs_dir_item *di,
1161 struct btrfs_key *key)
1162{
1163 char *name;
1164 int name_len;
1165 struct btrfs_dir_item *dst_di;
1166 struct btrfs_key found_key;
1167 struct btrfs_key log_key;
1168 struct inode *dir;
1169 u8 log_type;
1170 int exists;
1171 int ret;
1172
1173 dir = read_one_inode(root, key->objectid);
1174 BUG_ON(!dir);
1175
1176 name_len = btrfs_dir_name_len(eb, di);
1177 name = kmalloc(name_len, GFP_NOFS);
1178 log_type = btrfs_dir_type(eb, di);
1179 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1180 name_len);
1181
1182 btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1183 exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1184 if (exists == 0)
1185 exists = 1;
1186 else
1187 exists = 0;
1188 btrfs_release_path(root, path);
1189
1190 if (key->type == BTRFS_DIR_ITEM_KEY) {
1191 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1192 name, name_len, 1);
1193 } else if (key->type == BTRFS_DIR_INDEX_KEY) {
1194 dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1195 key->objectid,
1196 key->offset, name,
1197 name_len, 1);
1198 } else {
1199 BUG();
1200 }
1201 if (!dst_di || IS_ERR(dst_di)) {
1202 /* we need a sequence number to insert, so we only
1203 * do inserts for the BTRFS_DIR_INDEX_KEY types
1204 */
1205 if (key->type != BTRFS_DIR_INDEX_KEY)
1206 goto out;
1207 goto insert;
1208 }
1209
1210 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1211 /* the existing item matches the logged item */
1212 if (found_key.objectid == log_key.objectid &&
1213 found_key.type == log_key.type &&
1214 found_key.offset == log_key.offset &&
1215 btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1216 goto out;
1217 }
1218
1219 /*
1220 * don't drop the conflicting directory entry if the inode
1221 * for the new entry doesn't exist
1222 */
1223 if (!exists)
1224 goto out;
1225
1226 ret = drop_one_dir_item(trans, root, path, dir, dst_di);
1227 BUG_ON(ret);
1228
1229 if (key->type == BTRFS_DIR_INDEX_KEY)
1230 goto insert;
1231out:
1232 btrfs_release_path(root, path);
1233 kfree(name);
1234 iput(dir);
1235 return 0;
1236
1237insert:
1238 btrfs_release_path(root, path);
1239 ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1240 name, name_len, log_type, &log_key);
1241
1242 if (ret && ret != -ENOENT)
1243 BUG();
1244 goto out;
1245}
1246
1247/*
1248 * find all the names in a directory item and reconcile them into
1249 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
1250 * one name in a directory item, but the same code gets used for
1251 * both directory key types
1252 */
1253static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1254 struct btrfs_root *root,
1255 struct btrfs_path *path,
1256 struct extent_buffer *eb, int slot,
1257 struct btrfs_key *key)
1258{
1259 int ret;
1260 u32 item_size = btrfs_item_size_nr(eb, slot);
1261 struct btrfs_dir_item *di;
1262 int name_len;
1263 unsigned long ptr;
1264 unsigned long ptr_end;
1265
1266 ptr = btrfs_item_ptr_offset(eb, slot);
1267 ptr_end = ptr + item_size;
1268 while (ptr < ptr_end) {
1269 di = (struct btrfs_dir_item *)ptr;
1270 name_len = btrfs_dir_name_len(eb, di);
1271 ret = replay_one_name(trans, root, path, eb, di, key);
1272 BUG_ON(ret);
1273 ptr = (unsigned long)(di + 1);
1274 ptr += name_len;
1275 }
1276 return 0;
1277}
1278
1279/*
1280 * directory replay has two parts. There are the standard directory
1281 * items in the log copied from the subvolume, and range items
1282 * created in the log while the subvolume was logged.
1283 *
1284 * The range items tell us which parts of the key space the log
1285 * is authoritative for. During replay, if a key in the subvolume
1286 * directory is in a logged range item, but not actually in the log,
1287 * that means it was deleted from the directory before the fsync
1288 * and should be removed.
1289 */
1290static noinline int find_dir_range(struct btrfs_root *root,
1291 struct btrfs_path *path,
1292 u64 dirid, int key_type,
1293 u64 *start_ret, u64 *end_ret)
1294{
1295 struct btrfs_key key;
1296 u64 found_end;
1297 struct btrfs_dir_log_item *item;
1298 int ret;
1299 int nritems;
1300
1301 if (*start_ret == (u64)-1)
1302 return 1;
1303
1304 key.objectid = dirid;
1305 key.type = key_type;
1306 key.offset = *start_ret;
1307
1308 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1309 if (ret < 0)
1310 goto out;
1311 if (ret > 0) {
1312 if (path->slots[0] == 0)
1313 goto out;
1314 path->slots[0]--;
1315 }
1316 if (ret != 0)
1317 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1318
1319 if (key.type != key_type || key.objectid != dirid) {
1320 ret = 1;
1321 goto next;
1322 }
1323 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1324 struct btrfs_dir_log_item);
1325 found_end = btrfs_dir_log_end(path->nodes[0], item);
1326
1327 if (*start_ret >= key.offset && *start_ret <= found_end) {
1328 ret = 0;
1329 *start_ret = key.offset;
1330 *end_ret = found_end;
1331 goto out;
1332 }
1333 ret = 1;
1334next:
1335 /* check the next slot in the tree to see if it is a valid item */
1336 nritems = btrfs_header_nritems(path->nodes[0]);
1337 if (path->slots[0] >= nritems) {
1338 ret = btrfs_next_leaf(root, path);
1339 if (ret)
1340 goto out;
1341 } else {
1342 path->slots[0]++;
1343 }
1344
1345 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1346
1347 if (key.type != key_type || key.objectid != dirid) {
1348 ret = 1;
1349 goto out;
1350 }
1351 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1352 struct btrfs_dir_log_item);
1353 found_end = btrfs_dir_log_end(path->nodes[0], item);
1354 *start_ret = key.offset;
1355 *end_ret = found_end;
1356 ret = 0;
1357out:
1358 btrfs_release_path(root, path);
1359 return ret;
1360}
1361
1362/*
1363 * this looks for a given directory item in the log. If the directory
1364 * item is not in the log, the item is removed and the inode it points
1365 * to is unlinked
1366 */
1367static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
1368 struct btrfs_root *root,
1369 struct btrfs_root *log,
1370 struct btrfs_path *path,
1371 struct btrfs_path *log_path,
1372 struct inode *dir,
1373 struct btrfs_key *dir_key)
1374{
1375 int ret;
1376 struct extent_buffer *eb;
1377 int slot;
1378 u32 item_size;
1379 struct btrfs_dir_item *di;
1380 struct btrfs_dir_item *log_di;
1381 int name_len;
1382 unsigned long ptr;
1383 unsigned long ptr_end;
1384 char *name;
1385 struct inode *inode;
1386 struct btrfs_key location;
1387
1388again:
1389 eb = path->nodes[0];
1390 slot = path->slots[0];
1391 item_size = btrfs_item_size_nr(eb, slot);
1392 ptr = btrfs_item_ptr_offset(eb, slot);
1393 ptr_end = ptr + item_size;
1394 while (ptr < ptr_end) {
1395 di = (struct btrfs_dir_item *)ptr;
1396 name_len = btrfs_dir_name_len(eb, di);
1397 name = kmalloc(name_len, GFP_NOFS);
1398 if (!name) {
1399 ret = -ENOMEM;
1400 goto out;
1401 }
1402 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1403 name_len);
1404 log_di = NULL;
1405 if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
1406 log_di = btrfs_lookup_dir_item(trans, log, log_path,
1407 dir_key->objectid,
1408 name, name_len, 0);
1409 } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
1410 log_di = btrfs_lookup_dir_index_item(trans, log,
1411 log_path,
1412 dir_key->objectid,
1413 dir_key->offset,
1414 name, name_len, 0);
1415 }
1416 if (!log_di || IS_ERR(log_di)) {
1417 btrfs_dir_item_key_to_cpu(eb, di, &location);
1418 btrfs_release_path(root, path);
1419 btrfs_release_path(log, log_path);
1420 inode = read_one_inode(root, location.objectid);
1421 BUG_ON(!inode);
1422
1423 ret = link_to_fixup_dir(trans, root,
1424 path, location.objectid);
1425 BUG_ON(ret);
1426 btrfs_inc_nlink(inode);
1427 ret = btrfs_unlink_inode(trans, root, dir, inode,
1428 name, name_len);
1429 BUG_ON(ret);
1430 kfree(name);
1431 iput(inode);
1432
1433 /* there might still be more names under this key
1434 * check and repeat if required
1435 */
1436 ret = btrfs_search_slot(NULL, root, dir_key, path,
1437 0, 0);
1438 if (ret == 0)
1439 goto again;
1440 ret = 0;
1441 goto out;
1442 }
1443 btrfs_release_path(log, log_path);
1444 kfree(name);
1445
1446 ptr = (unsigned long)(di + 1);
1447 ptr += name_len;
1448 }
1449 ret = 0;
1450out:
1451 btrfs_release_path(root, path);
1452 btrfs_release_path(log, log_path);
1453 return ret;
1454}
1455
1456/*
1457 * deletion replay happens before we copy any new directory items
1458 * out of the log or out of backreferences from inodes. It
1459 * scans the log to find ranges of keys that the log is authoritative for,
1460 * and then scans the directory to find items in those ranges that are
1461 * not present in the log.
1462 *
1463 * Anything we don't find in the log is unlinked and removed from the
1464 * directory.
1465 */
1466static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1467 struct btrfs_root *root,
1468 struct btrfs_root *log,
1469 struct btrfs_path *path,
1470 u64 dirid)
1471{
1472 u64 range_start;
1473 u64 range_end;
1474 int key_type = BTRFS_DIR_LOG_ITEM_KEY;
1475 int ret = 0;
1476 struct btrfs_key dir_key;
1477 struct btrfs_key found_key;
1478 struct btrfs_path *log_path;
1479 struct inode *dir;
1480
1481 dir_key.objectid = dirid;
1482 dir_key.type = BTRFS_DIR_ITEM_KEY;
1483 log_path = btrfs_alloc_path();
1484 if (!log_path)
1485 return -ENOMEM;
1486
1487 dir = read_one_inode(root, dirid);
1488 /* it isn't an error if the inode isn't there, that can happen
1489 * because we replay the deletes before we copy in the inode item
1490 * from the log
1491 */
1492 if (!dir) {
1493 btrfs_free_path(log_path);
1494 return 0;
1495 }
1496again:
1497 range_start = 0;
1498 range_end = 0;
1499 while (1) {
1500 ret = find_dir_range(log, path, dirid, key_type,
1501 &range_start, &range_end);
1502 if (ret != 0)
1503 break;
1504
1505 dir_key.offset = range_start;
1506 while (1) {
1507 int nritems;
1508 ret = btrfs_search_slot(NULL, root, &dir_key, path,
1509 0, 0);
1510 if (ret < 0)
1511 goto out;
1512
1513 nritems = btrfs_header_nritems(path->nodes[0]);
1514 if (path->slots[0] >= nritems) {
1515 ret = btrfs_next_leaf(root, path);
1516 if (ret)
1517 break;
1518 }
1519 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1520 path->slots[0]);
1521 if (found_key.objectid != dirid ||
1522 found_key.type != dir_key.type)
1523 goto next_type;
1524
1525 if (found_key.offset > range_end)
1526 break;
1527
1528 ret = check_item_in_log(trans, root, log, path,
1529 log_path, dir, &found_key);
1530 BUG_ON(ret);
1531 if (found_key.offset == (u64)-1)
1532 break;
1533 dir_key.offset = found_key.offset + 1;
1534 }
1535 btrfs_release_path(root, path);
1536 if (range_end == (u64)-1)
1537 break;
1538 range_start = range_end + 1;
1539 }
1540
1541next_type:
1542 ret = 0;
1543 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
1544 key_type = BTRFS_DIR_LOG_INDEX_KEY;
1545 dir_key.type = BTRFS_DIR_INDEX_KEY;
1546 btrfs_release_path(root, path);
1547 goto again;
1548 }
1549out:
1550 btrfs_release_path(root, path);
1551 btrfs_free_path(log_path);
1552 iput(dir);
1553 return ret;
1554}
1555
1556/*
1557 * the process_func used to replay items from the log tree. This
1558 * gets called in two different stages. The first stage just looks
1559 * for inodes and makes sure they are all copied into the subvolume.
1560 *
1561 * The second stage copies all the other item types from the log into
1562 * the subvolume. The two stage approach is slower, but gets rid of
1563 * lots of complexity around inodes referencing other inodes that exist
1564 * only in the log (references come from either directory items or inode
1565 * back refs).
1566 */
1567static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1568 struct walk_control *wc, u64 gen)
1569{
1570 int nritems;
1571 struct btrfs_path *path;
1572 struct btrfs_root *root = wc->replay_dest;
1573 struct btrfs_key key;
1574 u32 item_size;
1575 int level;
1576 int i;
1577 int ret;
1578
1579 btrfs_read_buffer(eb, gen);
1580
1581 level = btrfs_header_level(eb);
1582
1583 if (level != 0)
1584 return 0;
1585
1586 path = btrfs_alloc_path();
1587 BUG_ON(!path);
1588
1589 nritems = btrfs_header_nritems(eb);
1590 for (i = 0; i < nritems; i++) {
1591 btrfs_item_key_to_cpu(eb, &key, i);
1592 item_size = btrfs_item_size_nr(eb, i);
1593
1594 /* inode keys are done during the first stage */
1595 if (key.type == BTRFS_INODE_ITEM_KEY &&
1596 wc->stage == LOG_WALK_REPLAY_INODES) {
1597 struct inode *inode;
1598 struct btrfs_inode_item *inode_item;
1599 u32 mode;
1600
1601 inode_item = btrfs_item_ptr(eb, i,
1602 struct btrfs_inode_item);
1603 mode = btrfs_inode_mode(eb, inode_item);
1604 if (S_ISDIR(mode)) {
1605 ret = replay_dir_deletes(wc->trans,
1606 root, log, path, key.objectid);
1607 BUG_ON(ret);
1608 }
1609 ret = overwrite_item(wc->trans, root, path,
1610 eb, i, &key);
1611 BUG_ON(ret);
1612
1613 /* for regular files, truncate away
1614 * extents past the new EOF
1615 */
1616 if (S_ISREG(mode)) {
1617 inode = read_one_inode(root,
1618 key.objectid);
1619 BUG_ON(!inode);
1620
1621 ret = btrfs_truncate_inode_items(wc->trans,
1622 root, inode, inode->i_size,
1623 BTRFS_EXTENT_DATA_KEY);
1624 BUG_ON(ret);
1625 iput(inode);
1626 }
1627 ret = link_to_fixup_dir(wc->trans, root,
1628 path, key.objectid);
1629 BUG_ON(ret);
1630 }
1631 if (wc->stage < LOG_WALK_REPLAY_ALL)
1632 continue;
1633
1634 /* these keys are simply copied */
1635 if (key.type == BTRFS_XATTR_ITEM_KEY) {
1636 ret = overwrite_item(wc->trans, root, path,
1637 eb, i, &key);
1638 BUG_ON(ret);
1639 } else if (key.type == BTRFS_INODE_REF_KEY) {
1640 ret = add_inode_ref(wc->trans, root, log, path,
1641 eb, i, &key);
1642 BUG_ON(ret && ret != -ENOENT);
1643 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
1644 ret = replay_one_extent(wc->trans, root, path,
1645 eb, i, &key);
1646 BUG_ON(ret);
1647 } else if (key.type == BTRFS_DIR_ITEM_KEY ||
1648 key.type == BTRFS_DIR_INDEX_KEY) {
1649 ret = replay_one_dir_item(wc->trans, root, path,
1650 eb, i, &key);
1651 BUG_ON(ret);
1652 }
1653 }
1654 btrfs_free_path(path);
1655 return 0;
1656}
1657
1658static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1659 struct btrfs_root *root,
1660 struct btrfs_path *path, int *level,
1661 struct walk_control *wc)
1662{
1663 u64 root_owner;
1664 u64 root_gen;
1665 u64 bytenr;
1666 u64 ptr_gen;
1667 struct extent_buffer *next;
1668 struct extent_buffer *cur;
1669 struct extent_buffer *parent;
1670 u32 blocksize;
1671 int ret = 0;
1672
1673 WARN_ON(*level < 0);
1674 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1675
1676 while (*level > 0) {
1677 WARN_ON(*level < 0);
1678 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1679 cur = path->nodes[*level];
1680
1681 if (btrfs_header_level(cur) != *level)
1682 WARN_ON(1);
1683
1684 if (path->slots[*level] >=
1685 btrfs_header_nritems(cur))
1686 break;
1687
1688 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
1689 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
1690 blocksize = btrfs_level_size(root, *level - 1);
1691
1692 parent = path->nodes[*level];
1693 root_owner = btrfs_header_owner(parent);
1694 root_gen = btrfs_header_generation(parent);
1695
1696 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1697
1698 wc->process_func(root, next, wc, ptr_gen);
1699
1700 if (*level == 1) {
1701 path->slots[*level]++;
1702 if (wc->free) {
1703 btrfs_read_buffer(next, ptr_gen);
1704
1705 btrfs_tree_lock(next);
1706 clean_tree_block(trans, root, next);
1707 btrfs_wait_tree_block_writeback(next);
1708 btrfs_tree_unlock(next);
1709
1710 ret = btrfs_drop_leaf_ref(trans, root, next);
1711 BUG_ON(ret);
1712
1713 WARN_ON(root_owner !=
1714 BTRFS_TREE_LOG_OBJECTID);
1715 ret = btrfs_free_reserved_extent(root,
1716 bytenr, blocksize);
1717 BUG_ON(ret);
1718 }
1719 free_extent_buffer(next);
1720 continue;
1721 }
1722 btrfs_read_buffer(next, ptr_gen);
1723
1724 WARN_ON(*level <= 0);
1725 if (path->nodes[*level-1])
1726 free_extent_buffer(path->nodes[*level-1]);
1727 path->nodes[*level-1] = next;
1728 *level = btrfs_header_level(next);
1729 path->slots[*level] = 0;
1730 cond_resched();
1731 }
1732 WARN_ON(*level < 0);
1733 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1734
1735 if (path->nodes[*level] == root->node)
1736 parent = path->nodes[*level];
1737 else
1738 parent = path->nodes[*level + 1];
1739
1740 bytenr = path->nodes[*level]->start;
1741
1742 blocksize = btrfs_level_size(root, *level);
1743 root_owner = btrfs_header_owner(parent);
1744 root_gen = btrfs_header_generation(parent);
1745
1746 wc->process_func(root, path->nodes[*level], wc,
1747 btrfs_header_generation(path->nodes[*level]));
1748
1749 if (wc->free) {
1750 next = path->nodes[*level];
1751 btrfs_tree_lock(next);
1752 clean_tree_block(trans, root, next);
1753 btrfs_wait_tree_block_writeback(next);
1754 btrfs_tree_unlock(next);
1755
1756 if (*level == 0) {
1757 ret = btrfs_drop_leaf_ref(trans, root, next);
1758 BUG_ON(ret);
1759 }
1760 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1761 ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
1762 BUG_ON(ret);
1763 }
1764 free_extent_buffer(path->nodes[*level]);
1765 path->nodes[*level] = NULL;
1766 *level += 1;
1767
1768 cond_resched();
1769 return 0;
1770}
1771
1772static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1773 struct btrfs_root *root,
1774 struct btrfs_path *path, int *level,
1775 struct walk_control *wc)
1776{
1777 u64 root_owner;
1778 u64 root_gen;
1779 int i;
1780 int slot;
1781 int ret;
1782
1783 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1784 slot = path->slots[i];
1785 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
1786 struct extent_buffer *node;
1787 node = path->nodes[i];
1788 path->slots[i]++;
1789 *level = i;
1790 WARN_ON(*level == 0);
1791 return 0;
1792 } else {
1793 struct extent_buffer *parent;
1794 if (path->nodes[*level] == root->node)
1795 parent = path->nodes[*level];
1796 else
1797 parent = path->nodes[*level + 1];
1798
1799 root_owner = btrfs_header_owner(parent);
1800 root_gen = btrfs_header_generation(parent);
1801 wc->process_func(root, path->nodes[*level], wc,
1802 btrfs_header_generation(path->nodes[*level]));
1803 if (wc->free) {
1804 struct extent_buffer *next;
1805
1806 next = path->nodes[*level];
1807
1808 btrfs_tree_lock(next);
1809 clean_tree_block(trans, root, next);
1810 btrfs_wait_tree_block_writeback(next);
1811 btrfs_tree_unlock(next);
1812
1813 if (*level == 0) {
1814 ret = btrfs_drop_leaf_ref(trans, root,
1815 next);
1816 BUG_ON(ret);
1817 }
1818
1819 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1820 ret = btrfs_free_reserved_extent(root,
1821 path->nodes[*level]->start,
1822 path->nodes[*level]->len);
1823 BUG_ON(ret);
1824 }
1825 free_extent_buffer(path->nodes[*level]);
1826 path->nodes[*level] = NULL;
1827 *level = i + 1;
1828 }
1829 }
1830 return 1;
1831}
1832
1833/*
1834 * drop the reference count on the tree rooted at 'snap'. This traverses
1835 * the tree freeing any blocks that have a ref count of zero after being
1836 * decremented.
1837 */
1838static int walk_log_tree(struct btrfs_trans_handle *trans,
1839 struct btrfs_root *log, struct walk_control *wc)
1840{
1841 int ret = 0;
1842 int wret;
1843 int level;
1844 struct btrfs_path *path;
1845 int i;
1846 int orig_level;
1847
1848 path = btrfs_alloc_path();
1849 BUG_ON(!path);
1850
1851 level = btrfs_header_level(log->node);
1852 orig_level = level;
1853 path->nodes[level] = log->node;
1854 extent_buffer_get(log->node);
1855 path->slots[level] = 0;
1856
1857 while (1) {
1858 wret = walk_down_log_tree(trans, log, path, &level, wc);
1859 if (wret > 0)
1860 break;
1861 if (wret < 0)
1862 ret = wret;
1863
1864 wret = walk_up_log_tree(trans, log, path, &level, wc);
1865 if (wret > 0)
1866 break;
1867 if (wret < 0)
1868 ret = wret;
1869 }
1870
1871 /* was the root node processed? if not, catch it here */
1872 if (path->nodes[orig_level]) {
1873 wc->process_func(log, path->nodes[orig_level], wc,
1874 btrfs_header_generation(path->nodes[orig_level]));
1875 if (wc->free) {
1876 struct extent_buffer *next;
1877
1878 next = path->nodes[orig_level];
1879
1880 btrfs_tree_lock(next);
1881 clean_tree_block(trans, log, next);
1882 btrfs_wait_tree_block_writeback(next);
1883 btrfs_tree_unlock(next);
1884
1885 if (orig_level == 0) {
1886 ret = btrfs_drop_leaf_ref(trans, log,
1887 next);
1888 BUG_ON(ret);
1889 }
1890 WARN_ON(log->root_key.objectid !=
1891 BTRFS_TREE_LOG_OBJECTID);
1892 ret = btrfs_free_reserved_extent(log, next->start,
1893 next->len);
1894 BUG_ON(ret);
1895 }
1896 }
1897
1898 for (i = 0; i <= orig_level; i++) {
1899 if (path->nodes[i]) {
1900 free_extent_buffer(path->nodes[i]);
1901 path->nodes[i] = NULL;
1902 }
1903 }
1904 btrfs_free_path(path);
1905 if (wc->free)
1906 free_extent_buffer(log->node);
1907 return ret;
1908}
1909
1910static int wait_log_commit(struct btrfs_root *log)
1911{
1912 DEFINE_WAIT(wait);
1913 u64 transid = log->fs_info->tree_log_transid;
1914
1915 do {
1916 prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1917 TASK_UNINTERRUPTIBLE);
1918 mutex_unlock(&log->fs_info->tree_log_mutex);
1919 if (atomic_read(&log->fs_info->tree_log_commit))
1920 schedule();
1921 finish_wait(&log->fs_info->tree_log_wait, &wait);
1922 mutex_lock(&log->fs_info->tree_log_mutex);
1923 } while (transid == log->fs_info->tree_log_transid &&
1924 atomic_read(&log->fs_info->tree_log_commit));
1925 return 0;
1926}
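/*
 * A minimal sketch, not part of the original patch: the loop above is the
 * standard open-coded waitqueue pattern. The stand-alone version below
 * (hypothetical names my_lock/my_wait/my_cond; <linux/wait.h>,
 * <linux/sched.h> and <linux/mutex.h> assumed) shows the same shape:
 * requeue on the wait queue, re-check the condition, drop the lock before
 * sleeping so the committer can make progress, and retake it before
 * re-testing.
 */
static void wait_on_cond_sketch(struct mutex *my_lock,
				wait_queue_head_t *my_wait,
				atomic_t *my_cond)
{
	DEFINE_WAIT(wait);

	while (atomic_read(my_cond)) {
		prepare_to_wait(my_wait, &wait, TASK_UNINTERRUPTIBLE);
		mutex_unlock(my_lock);
		if (atomic_read(my_cond))
			schedule();
		finish_wait(my_wait, &wait);
		mutex_lock(my_lock);
	}
}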
1927
1928/*
1929 * btrfs_sync_log sends a given tree log down to the disk and
1930 * updates the super blocks to record it. When this call is done,
1931 * you know that any inodes previously logged are safely on disk
1932 */
1933int btrfs_sync_log(struct btrfs_trans_handle *trans,
1934 struct btrfs_root *root)
1935{
1936 int ret;
1937 unsigned long batch;
1938 struct btrfs_root *log = root->log_root;
1939
1940 mutex_lock(&log->fs_info->tree_log_mutex);
1941 if (atomic_read(&log->fs_info->tree_log_commit)) {
1942 wait_log_commit(log);
1943 goto out;
1944 }
1945 atomic_set(&log->fs_info->tree_log_commit, 1);
1946
1947 while (1) {
1948 batch = log->fs_info->tree_log_batch;
1949 mutex_unlock(&log->fs_info->tree_log_mutex);
1950 schedule_timeout_uninterruptible(1);
1951 mutex_lock(&log->fs_info->tree_log_mutex);
1952
1953 while (atomic_read(&log->fs_info->tree_log_writers)) {
1954 DEFINE_WAIT(wait);
1955 prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1956 TASK_UNINTERRUPTIBLE);
1957 mutex_unlock(&log->fs_info->tree_log_mutex);
1958 if (atomic_read(&log->fs_info->tree_log_writers))
1959 schedule();
1960 mutex_lock(&log->fs_info->tree_log_mutex);
1961 finish_wait(&log->fs_info->tree_log_wait, &wait);
1962 }
1963 if (batch == log->fs_info->tree_log_batch)
1964 break;
1965 }
1966
1967 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1968 BUG_ON(ret);
1969 ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree,
1970 &root->fs_info->log_root_tree->dirty_log_pages);
1971 BUG_ON(ret);
1972
1973 btrfs_set_super_log_root(&root->fs_info->super_for_commit,
1974 log->fs_info->log_root_tree->node->start);
1975 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
1976 btrfs_header_level(log->fs_info->log_root_tree->node));
1977
1978 write_ctree_super(trans, log->fs_info->tree_root, 2);
1979 log->fs_info->tree_log_transid++;
1980 log->fs_info->tree_log_batch = 0;
1981 atomic_set(&log->fs_info->tree_log_commit, 0);
1982 smp_mb();
1983 if (waitqueue_active(&log->fs_info->tree_log_wait))
1984 wake_up(&log->fs_info->tree_log_wait);
1985out:
1986 mutex_unlock(&log->fs_info->tree_log_mutex);
1987 return 0;
1988}
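/*
 * A sketch, not from the original patch, of the writer-side protocol the
 * batch loop above waits on: join_running_log_trans() and end_log_trans()
 * (defined earlier in this file) bump tree_log_writers and tree_log_batch,
 * so btrfs_sync_log() spins until the batch counter stops moving and the
 * writer count drains to zero before writing the log.
 */
static void log_writer_sketch(struct btrfs_root *root)
{
	/* returns 0 only while a log transaction is running */
	if (join_running_log_trans(root) == 0) {
		/* ... insert items into root->log_root here ... */
		end_log_trans(root);
	}
}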
1989
1990/* free all the extents used by the tree log. This should be called
1991 * at commit time of the full transaction
1992 */
1993int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
1994{
1995 int ret;
1996 struct btrfs_root *log;
1998 u64 start;
1999 u64 end;
2000 struct walk_control wc = {
2001 .free = 1,
2002 .process_func = process_one_buffer
2003 };
2004
2005 if (!root->log_root || root->fs_info->log_root_recovering)
2006 return 0;
2007
2008 log = root->log_root;
2009 ret = walk_log_tree(trans, log, &wc);
2010 BUG_ON(ret);
2011
2012 while (1) {
2013 ret = find_first_extent_bit(&log->dirty_log_pages,
2014 0, &start, &end, EXTENT_DIRTY);
2015 if (ret)
2016 break;
2017
2018 clear_extent_dirty(&log->dirty_log_pages,
2019 start, end, GFP_NOFS);
2020 }
2021
2022 log = root->log_root;
2023 ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2024 &log->root_key);
2025 BUG_ON(ret);
2026 root->log_root = NULL;
2027 kfree(log);
2028 return 0;
2029}
2030
2031/*
2032 * helper function to update the item for a given subvolumes log root
2033 * in the tree of log roots
2034 */
2035static int update_log_root(struct btrfs_trans_handle *trans,
2036 struct btrfs_root *log)
2037{
2038 u64 bytenr = btrfs_root_bytenr(&log->root_item);
2039 int ret;
2040
2041 if (log->node->start == bytenr)
2042 return 0;
2043
2044 btrfs_set_root_bytenr(&log->root_item, log->node->start);
2045 btrfs_set_root_generation(&log->root_item, trans->transid);
2046 btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
2047 ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
2048 &log->root_key, &log->root_item);
2049 BUG_ON(ret);
2050 return ret;
2051}
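/*
 * Note, not in the original patch: the early return above means the root
 * item in the log-root tree is only rewritten when COW has actually moved
 * log->node since the last update, so repeated fsyncs within one
 * transaction avoid redundant root-item updates.
 */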
2052
2053/*
2054 * If both a file and directory are logged, and unlinks or renames are
2055 * mixed in, we have a few interesting corners:
2056 *
2057 * create file X in dir Y
2058 * link file X to X.link in dir Y
2059 * fsync file X
2060 * unlink file X but leave X.link
2061 * fsync dir Y
2062 *
2063 * After a crash we would expect only X.link to exist. But file X
2064 * didn't get fsync'd again so the log has back refs for X and X.link.
2065 *
2066 * We solve this by removing directory entries and inode backrefs from the
2067 * log when a file that was logged in the current transaction is
2068 * unlinked. Any later fsync will include the updated log entries, and
2069 * we'll be able to reconstruct the proper directory items from backrefs.
2070 *
2071 * This optimization allows us to avoid relogging the entire inode
2072 * or the entire directory.
2073 */
2074int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2075 struct btrfs_root *root,
2076 const char *name, int name_len,
2077 struct inode *dir, u64 index)
2078{
2079 struct btrfs_root *log;
2080 struct btrfs_dir_item *di;
2081 struct btrfs_path *path;
2082 int ret;
2083 int bytes_del = 0;
2084
2085 if (BTRFS_I(dir)->logged_trans < trans->transid)
2086 return 0;
2087
2088 ret = join_running_log_trans(root);
2089 if (ret)
2090 return 0;
2091
2092 mutex_lock(&BTRFS_I(dir)->log_mutex);
2093
2094 log = root->log_root;
2095 path = btrfs_alloc_path();
2096 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2097 name, name_len, -1);
2098 if (di && !IS_ERR(di)) {
2099 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2100 bytes_del += name_len;
2101 BUG_ON(ret);
2102 }
2103 btrfs_release_path(log, path);
2104 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
2105 index, name, name_len, -1);
2106 if (di && !IS_ERR(di)) {
2107 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2108 bytes_del += name_len;
2109 BUG_ON(ret);
2110 }
2111
2112 /* update the directory size in the log to reflect the names
2113 * we have removed
2114 */
2115 if (bytes_del) {
2116 struct btrfs_key key;
2117
2118 key.objectid = dir->i_ino;
2119 key.offset = 0;
2120 key.type = BTRFS_INODE_ITEM_KEY;
2121 btrfs_release_path(log, path);
2122
2123 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2124 if (ret == 0) {
2125 struct btrfs_inode_item *item;
2126 u64 i_size;
2127
2128 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2129 struct btrfs_inode_item);
2130 i_size = btrfs_inode_size(path->nodes[0], item);
2131 if (i_size > bytes_del)
2132 i_size -= bytes_del;
2133 else
2134 i_size = 0;
2135 btrfs_set_inode_size(path->nodes[0], item, i_size);
2136 btrfs_mark_buffer_dirty(path->nodes[0]);
2137 } else
2138 ret = 0;
2139 btrfs_release_path(log, path);
2140 }
2141
2142 btrfs_free_path(path);
2143 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2144 end_log_trans(root);
2145
2146 return 0;
2147}
2148
2149/* see comments for btrfs_del_dir_entries_in_log */
2150int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2151 struct btrfs_root *root,
2152 const char *name, int name_len,
2153 struct inode *inode, u64 dirid)
2154{
2155 struct btrfs_root *log;
2156 u64 index;
2157 int ret;
2158
2159 if (BTRFS_I(inode)->logged_trans < trans->transid)
2160 return 0;
2161
2162 ret = join_running_log_trans(root);
2163 if (ret)
2164 return 0;
2165 log = root->log_root;
2166 mutex_lock(&BTRFS_I(inode)->log_mutex);
2167
2168 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2169 dirid, &index);
2170 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2171 end_log_trans(root);
2172
2173 return ret;
2174}
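/*
 * A usage sketch, not from the original patch: the unlink path (the real
 * caller lives in fs/btrfs/inode.c) is expected to prune both the
 * directory entries and the inode backref from the log after removing the
 * name from the subvolume, so replay cannot resurrect the deleted name.
 */
static void unlink_log_prune_sketch(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct inode *dir, struct inode *inode,
				    const char *name, int name_len, u64 index)
{
	btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index);
	btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
				   dir->i_ino);
}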
2175
2176/*
2177 * creates a range item in the log for 'dirid'. first_offset and
2178 * last_offset tell us which parts of the key space the log should
2179 * be considered authoritative for.
2180 */
2181static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2182 struct btrfs_root *log,
2183 struct btrfs_path *path,
2184 int key_type, u64 dirid,
2185 u64 first_offset, u64 last_offset)
2186{
2187 int ret;
2188 struct btrfs_key key;
2189 struct btrfs_dir_log_item *item;
2190
2191 key.objectid = dirid;
2192 key.offset = first_offset;
2193 if (key_type == BTRFS_DIR_ITEM_KEY)
2194 key.type = BTRFS_DIR_LOG_ITEM_KEY;
2195 else
2196 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2197 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2198 BUG_ON(ret);
2199
2200 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2201 struct btrfs_dir_log_item);
2202 btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
2203 btrfs_mark_buffer_dirty(path->nodes[0]);
2204 btrfs_release_path(log, path);
2205 return 0;
2206}
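/*
 * Illustration, not in the original patch: a range item such as
 *   key = (dirid, BTRFS_DIR_LOG_INDEX_KEY, first_offset),
 *   dir_log_end = last_offset
 * tells replay that the log is authoritative for dir index offsets in
 * [first_offset, last_offset]; a key in that range present in the
 * subvolume but absent from the log must have been unlinked during the
 * transaction.
 */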
2207
2208/*
2209 * log all the items included in the current transaction for a given
2210 * directory. This also creates the range items in the log tree required
2211 * to replay anything deleted before the fsync
2212 */
2213static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2214 struct btrfs_root *root, struct inode *inode,
2215 struct btrfs_path *path,
2216 struct btrfs_path *dst_path, int key_type,
2217 u64 min_offset, u64 *last_offset_ret)
2218{
2219 struct btrfs_key min_key;
2220 struct btrfs_key max_key;
2221 struct btrfs_root *log = root->log_root;
2222 struct extent_buffer *src;
2223 int ret;
2224 int i;
2225 int nritems;
2226 u64 first_offset = min_offset;
2227 u64 last_offset = (u64)-1;
2228
2229 log = root->log_root;
2230 max_key.objectid = inode->i_ino;
2231 max_key.offset = (u64)-1;
2232 max_key.type = key_type;
2233
2234 min_key.objectid = inode->i_ino;
2235 min_key.type = key_type;
2236 min_key.offset = min_offset;
2237
2238 path->keep_locks = 1;
2239
2240 ret = btrfs_search_forward(root, &min_key, &max_key,
2241 path, 0, trans->transid);
2242
2243 /*
2244 * we didn't find anything from this transaction, see if there
2245 * is anything at all
2246 */
2247 if (ret != 0 || min_key.objectid != inode->i_ino ||
2248 min_key.type != key_type) {
2249 min_key.objectid = inode->i_ino;
2250 min_key.type = key_type;
2251 min_key.offset = (u64)-1;
2252 btrfs_release_path(root, path);
2253 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2254 if (ret < 0) {
2255 btrfs_release_path(root, path);
2256 return ret;
2257 }
2258 ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
2259
2260 /* if ret == 0 there are items for this type,
2261 * create a range to tell us the last key of this type.
2262 * otherwise, there are no items in this directory after
2263 * *min_offset, and we create a range to indicate that.
2264 */
2265 if (ret == 0) {
2266 struct btrfs_key tmp;
2267 btrfs_item_key_to_cpu(path->nodes[0], &tmp,
2268 path->slots[0]);
2269 if (key_type == tmp.type)
2270 first_offset = max(min_offset, tmp.offset) + 1;
2271 }
2272 goto done;
2273 }
2274
2275 /* go backward to find any previous key */
2276 ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
2277 if (ret == 0) {
2278 struct btrfs_key tmp;
2279 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2280 if (key_type == tmp.type) {
2281 first_offset = tmp.offset;
2282 ret = overwrite_item(trans, log, dst_path,
2283 path->nodes[0], path->slots[0],
2284 &tmp);
2285 }
2286 }
2287 btrfs_release_path(root, path);
2288
2289 /* find the first key from this transaction again */
2290 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2291 if (ret != 0) {
2292 WARN_ON(1);
2293 goto done;
2294 }
2295
2296 /*
2297 * we have a block from this transaction, log every item in it
2298 * from our directory
2299 */
2300 while (1) {
2301 struct btrfs_key tmp;
2302 src = path->nodes[0];
2303 nritems = btrfs_header_nritems(src);
2304 for (i = path->slots[0]; i < nritems; i++) {
2305 btrfs_item_key_to_cpu(src, &min_key, i);
2306
2307 if (min_key.objectid != inode->i_ino ||
2308 min_key.type != key_type)
2309 goto done;
2310 ret = overwrite_item(trans, log, dst_path, src, i,
2311 &min_key);
2312 BUG_ON(ret);
2313 }
2314 path->slots[0] = nritems;
2315
2316 /*
2317 * look ahead to the next item and see if it is also
2318 * from this directory and from this transaction
2319 */
2320 ret = btrfs_next_leaf(root, path);
2321 if (ret == 1) {
2322 last_offset = (u64)-1;
2323 goto done;
2324 }
2325 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2326 if (tmp.objectid != inode->i_ino || tmp.type != key_type) {
2327 last_offset = (u64)-1;
2328 goto done;
2329 }
2330 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
2331 ret = overwrite_item(trans, log, dst_path,
2332 path->nodes[0], path->slots[0],
2333 &tmp);
2334
2335 BUG_ON(ret);
2336 last_offset = tmp.offset;
2337 goto done;
2338 }
2339 }
2340done:
2341 *last_offset_ret = last_offset;
2342 btrfs_release_path(root, path);
2343 btrfs_release_path(log, dst_path);
2344
2345 /* insert the log range keys to indicate where the log is valid */
2346 ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
2347 first_offset, last_offset);
2348 BUG_ON(ret);
2349 return 0;
2350}
2351
2352/*
2353 * logging directories is very similar to logging inodes. We find all the items
2354 * from the current transaction and write them to the log.
2355 *
2356 * The recovery code scans the directory in the subvolume, and if it finds a
2357 * key in the range logged that is not present in the log tree, then it means
2358 * that dir entry was unlinked during the transaction.
2359 *
2360 * In order for that scan to work, we must include one key smaller than
2361 * the smallest key logged by this transaction and one key larger than the largest
2362 * key logged by this transaction.
2363 */
2364static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
2365 struct btrfs_root *root, struct inode *inode,
2366 struct btrfs_path *path,
2367 struct btrfs_path *dst_path)
2368{
2369 u64 min_key;
2370 u64 max_key;
2371 int ret;
2372 int key_type = BTRFS_DIR_ITEM_KEY;
2373
2374again:
2375 min_key = 0;
2376 max_key = 0;
2377 while (1) {
2378 ret = log_dir_items(trans, root, inode, path,
2379 dst_path, key_type, min_key,
2380 &max_key);
2381 BUG_ON(ret);
2382 if (max_key == (u64)-1)
2383 break;
2384 min_key = max_key + 1;
2385 }
2386
2387 if (key_type == BTRFS_DIR_ITEM_KEY) {
2388 key_type = BTRFS_DIR_INDEX_KEY;
2389 goto again;
2390 }
2391 return 0;
2392}
2393
2394/*
2395 * a helper function to drop items from the log before we relog an
2396 * inode. max_key_type indicates the highest item type to remove.
2397 * This cannot be run for file data extents because it does not
2398 * free the extents they point to.
2399 */
2400static int drop_objectid_items(struct btrfs_trans_handle *trans,
2401 struct btrfs_root *log,
2402 struct btrfs_path *path,
2403 u64 objectid, int max_key_type)
2404{
2405 int ret;
2406 struct btrfs_key key;
2407 struct btrfs_key found_key;
2408
2409 key.objectid = objectid;
2410 key.type = max_key_type;
2411 key.offset = (u64)-1;
2412
2413 while (1) {
2414 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
2415
2416 if (ret != 1)
2417 break;
2418
2419 if (path->slots[0] == 0)
2420 break;
2421
2422 path->slots[0]--;
2423 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2424 path->slots[0]);
2425
2426 if (found_key.objectid != objectid)
2427 break;
2428
2429 ret = btrfs_del_item(trans, log, path);
2430 BUG_ON(ret);
2431 btrfs_release_path(log, path);
2432 }
2433 btrfs_release_path(log, path);
2434 return 0;
2435}
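/*
 * Note on the loop above, not in the original patch: searching for
 * (objectid, max_key_type, (u64)-1) never finds an exact match, so the
 * loop steps back one slot to the greatest item sorting before that key
 * and deletes it, repeating until the slot's objectid changes. That clears
 * every logged item for the inode with a type <= max_key_type.
 */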
2436
2437static noinline int copy_items(struct btrfs_trans_handle *trans,
2438 struct btrfs_root *log,
2439 struct btrfs_path *dst_path,
2440 struct extent_buffer *src,
2441 int start_slot, int nr, int inode_only)
2442{
2443 unsigned long src_offset;
2444 unsigned long dst_offset;
2445 struct btrfs_file_extent_item *extent;
2446 struct btrfs_inode_item *inode_item;
2447 int ret;
2448 struct btrfs_key *ins_keys;
2449 u32 *ins_sizes;
2450 char *ins_data;
2451 int i;
2452 struct list_head ordered_sums;
2453
2454 INIT_LIST_HEAD(&ordered_sums);
2455
2456 ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
2457 nr * sizeof(u32), GFP_NOFS);
2458 ins_sizes = (u32 *)ins_data;
2459 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
2460
2461 for (i = 0; i < nr; i++) {
2462 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
2463 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
2464 }
2465 ret = btrfs_insert_empty_items(trans, log, dst_path,
2466 ins_keys, ins_sizes, nr);
2467 BUG_ON(ret);
2468
2469 for (i = 0; i < nr; i++) {
2470 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
2471 dst_path->slots[0]);
2472
2473 src_offset = btrfs_item_ptr_offset(src, start_slot + i);
2474
2475 copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
2476 src_offset, ins_sizes[i]);
2477
2478 if (inode_only == LOG_INODE_EXISTS &&
2479 ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
2480 inode_item = btrfs_item_ptr(dst_path->nodes[0],
2481 dst_path->slots[0],
2482 struct btrfs_inode_item);
2483 btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
2484
2485 /* set the generation to zero so the recovery code
2486 * can tell the difference between logging
2487 * just to say 'this inode exists' and logging
2488 * to say 'update this inode with these values'
2489 */
2490 btrfs_set_inode_generation(dst_path->nodes[0],
2491 inode_item, 0);
2492 }
2493 /* take a reference on file data extents so that truncates
2494 * or deletes of this inode don't have to relog the inode
2495 * again
2496 */
2497 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
2498 int found_type;
2499 extent = btrfs_item_ptr(src, start_slot + i,
2500 struct btrfs_file_extent_item);
2501
2502 found_type = btrfs_file_extent_type(src, extent);
2503 if (found_type == BTRFS_FILE_EXTENT_REG ||
2504 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
2505 u64 ds = btrfs_file_extent_disk_bytenr(src,
2506 extent);
2507 u64 dl = btrfs_file_extent_disk_num_bytes(src,
2508 extent);
2509 u64 cs = btrfs_file_extent_offset(src, extent);
2510 u64 cl = btrfs_file_extent_num_bytes(src,
2511 extent);
2512 if (btrfs_file_extent_compression(src,
2513 extent)) {
2514 cs = 0;
2515 cl = dl;
2516 }
2517 /* ds == 0 is a hole */
2518 if (ds != 0) {
2519 ret = btrfs_inc_extent_ref(trans, log,
2520 ds, dl,
2521 dst_path->nodes[0]->start,
2522 BTRFS_TREE_LOG_OBJECTID,
2523 trans->transid,
2524 ins_keys[i].objectid);
2525 BUG_ON(ret);
2526 ret = btrfs_lookup_csums_range(
2527 log->fs_info->csum_root,
2528 ds + cs, ds + cs + cl - 1,
2529 &ordered_sums);
2530 BUG_ON(ret);
2531 }
2532 }
2533 }
2534 dst_path->slots[0]++;
2535 }
2536
2537 btrfs_mark_buffer_dirty(dst_path->nodes[0]);
2538 btrfs_release_path(log, dst_path);
2539 kfree(ins_data);
2540
2541 /*
2542 * we have to do this after the loop above to avoid changing the
2543 * log tree while trying to change the log tree.
2544 */
2545 while (!list_empty(&ordered_sums)) {
2546 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
2547 struct btrfs_ordered_sum,
2548 list);
2549 ret = btrfs_csum_file_blocks(trans, log, sums);
2550 BUG_ON(ret);
2551 list_del(&sums->list);
2552 kfree(sums);
2553 }
2554 return 0;
2555}
2556
2557/* log a single inode in the tree log.
2558 * At least one parent directory for this inode must exist in the tree
2559 * or be logged already.
2560 *
2561 * Any items from this inode changed by the current transaction are copied
2562 * to the log tree. An extra reference is taken on any extents in this
2563 * file, allowing us to avoid a whole pile of corner cases around logging
2564 * blocks that have been removed from the tree.
2565 *
2566 * See LOG_INODE_ALL and related defines for a description of what inode_only
2567 * does.
2568 *
2569 * This handles both files and directories.
2570 */
2571static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
2572 struct btrfs_root *root, struct inode *inode,
2573 int inode_only)
2574{
2575 struct btrfs_path *path;
2576 struct btrfs_path *dst_path;
2577 struct btrfs_key min_key;
2578 struct btrfs_key max_key;
2579 struct btrfs_root *log = root->log_root;
2580 struct extent_buffer *src = NULL;
2581 u32 size;
2582 int ret;
2583 int nritems;
2584 int ins_start_slot = 0;
2585 int ins_nr;
2586
2587 log = root->log_root;
2588
2589 path = btrfs_alloc_path();
2590 dst_path = btrfs_alloc_path();
2591
2592 min_key.objectid = inode->i_ino;
2593 min_key.type = BTRFS_INODE_ITEM_KEY;
2594 min_key.offset = 0;
2595
2596 max_key.objectid = inode->i_ino;
2597 if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2598 max_key.type = BTRFS_XATTR_ITEM_KEY;
2599 else
2600 max_key.type = (u8)-1;
2601 max_key.offset = (u64)-1;
2602
2603 /*
2604 * if this inode has already been logged and we're in inode_only
2605 * mode, we don't want to delete the things that have already
2606 * been written to the log.
2607 *
2608 * But, if the inode has been through an inode_only log,
2609 * the logged_trans field is not set. This allows us to catch
2610 * any new names for this inode in the backrefs by logging it
2611 * again
2612 */
2613 if (inode_only == LOG_INODE_EXISTS &&
2614 BTRFS_I(inode)->logged_trans == trans->transid) {
2615 btrfs_free_path(path);
2616 btrfs_free_path(dst_path);
2617 goto out;
2618 }
2619 mutex_lock(&BTRFS_I(inode)->log_mutex);
2620
2621 /*
2622 * a brute-force approach to making sure we get the most up-to-date
2623 * copies of everything.
2624 */
2625 if (S_ISDIR(inode->i_mode)) {
2626 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
2627
2628 if (inode_only == LOG_INODE_EXISTS)
2629 max_key_type = BTRFS_XATTR_ITEM_KEY;
2630 ret = drop_objectid_items(trans, log, path,
2631 inode->i_ino, max_key_type);
2632 } else {
2633 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
2634 }
2635 BUG_ON(ret);
2636 path->keep_locks = 1;
2637
2638 while (1) {
2639 ins_nr = 0;
2640 ret = btrfs_search_forward(root, &min_key, &max_key,
2641 path, 0, trans->transid);
2642 if (ret != 0)
2643 break;
2644again:
2645 /* note, ins_nr might be > 0 here, cleanup outside the loop */
2646 if (min_key.objectid != inode->i_ino)
2647 break;
2648 if (min_key.type > max_key.type)
2649 break;
2650
2651 src = path->nodes[0];
2652 size = btrfs_item_size_nr(src, path->slots[0]);
2653 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
2654 ins_nr++;
2655 goto next_slot;
2656 } else if (!ins_nr) {
2657 ins_start_slot = path->slots[0];
2658 ins_nr = 1;
2659 goto next_slot;
2660 }
2661
2662 ret = copy_items(trans, log, dst_path, src, ins_start_slot,
2663 ins_nr, inode_only);
2664 BUG_ON(ret);
2665 ins_nr = 1;
2666 ins_start_slot = path->slots[0];
2667next_slot:
2668
2669 nritems = btrfs_header_nritems(path->nodes[0]);
2670 path->slots[0]++;
2671 if (path->slots[0] < nritems) {
2672 btrfs_item_key_to_cpu(path->nodes[0], &min_key,
2673 path->slots[0]);
2674 goto again;
2675 }
2676 if (ins_nr) {
2677 ret = copy_items(trans, log, dst_path, src,
2678 ins_start_slot,
2679 ins_nr, inode_only);
2680 BUG_ON(ret);
2681 ins_nr = 0;
2682 }
2683 btrfs_release_path(root, path);
2684
2685 if (min_key.offset < (u64)-1)
2686 min_key.offset++;
2687 else if (min_key.type < (u8)-1)
2688 min_key.type++;
2689 else if (min_key.objectid < (u64)-1)
2690 min_key.objectid++;
2691 else
2692 break;
2693 }
2694 if (ins_nr) {
2695 ret = copy_items(trans, log, dst_path, src,
2696 ins_start_slot,
2697 ins_nr, inode_only);
2698 BUG_ON(ret);
2699 ins_nr = 0;
2700 }
2701 WARN_ON(ins_nr);
2702 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2703 btrfs_release_path(root, path);
2704 btrfs_release_path(log, dst_path);
2705 BTRFS_I(inode)->log_dirty_trans = 0;
2706 ret = log_directory_changes(trans, root, inode, path, dst_path);
2707 BUG_ON(ret);
2708 }
2709 BTRFS_I(inode)->logged_trans = trans->transid;
2710 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2711
2712 btrfs_free_path(path);
2713 btrfs_free_path(dst_path);
2714
2715 mutex_lock(&root->fs_info->tree_log_mutex);
2716 ret = update_log_root(trans, log);
2717 BUG_ON(ret);
2718 mutex_unlock(&root->fs_info->tree_log_mutex);
2719out:
2720 return 0;
2721}
2722
2723int btrfs_log_inode(struct btrfs_trans_handle *trans,
2724 struct btrfs_root *root, struct inode *inode,
2725 int inode_only)
2726{
2727 int ret;
2728
2729 start_log_trans(trans, root);
2730 ret = __btrfs_log_inode(trans, root, inode, inode_only);
2731 end_log_trans(root);
2732 return ret;
2733}
2734
2735/*
2736 * helper function around btrfs_log_inode to make sure newly created
2737 * parent directories also end up in the log. A minimal inode-and-backref-only
2738 * logging pass is done for any parent directories that are newer than
2739 * the last committed transaction
2740 */
2741int btrfs_log_dentry(struct btrfs_trans_handle *trans,
2742 struct btrfs_root *root, struct dentry *dentry)
2743{
2744 int inode_only = LOG_INODE_ALL;
2745 struct super_block *sb;
2746 int ret;
2747
2748 start_log_trans(trans, root);
2749 sb = dentry->d_inode->i_sb;
2750 while (1) {
2751 ret = __btrfs_log_inode(trans, root, dentry->d_inode,
2752 inode_only);
2753 BUG_ON(ret);
2754 inode_only = LOG_INODE_EXISTS;
2755
2756 dentry = dentry->d_parent;
2757 if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb)
2758 break;
2759
2760 if (BTRFS_I(dentry->d_inode)->generation <=
2761 root->fs_info->last_trans_committed)
2762 break;
2763 }
2764 end_log_trans(root);
2765 return 0;
2766}
2767
2768/*
2769 * it is not safe to log dentry if the chunk root has added new
2770 * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
2771 * If this returns 1, you must commit the transaction to safely get your
2772 * data on disk.
2773 */
2774int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
2775 struct btrfs_root *root, struct dentry *dentry)
2776{
2777 u64 gen;
2778 gen = root->fs_info->last_trans_new_blockgroup;
2779 if (gen > root->fs_info->last_trans_committed)
2780 return 1;
2781 else
2782 return btrfs_log_dentry(trans, root, dentry);
2783}
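/*
 * A sketch of the expected fsync-path usage, not part of this patch (the
 * real caller is btrfs_sync_file() in fs/btrfs/file.c); error handling is
 * elided. Per the comment above, a return of 1 from
 * btrfs_log_dentry_safe() forces a full transaction commit instead.
 */
static int fsync_usage_sketch(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, struct dentry *dentry)
{
	int ret;

	ret = btrfs_log_dentry_safe(trans, root, dentry);
	if (ret > 0)
		return btrfs_commit_transaction(trans, root);

	btrfs_sync_log(trans, root);
	return btrfs_end_transaction(trans, root);
}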
2784
2785/*
2786 * should be called during mount to recover and replay any log trees
2787 * from the FS
2788 */
2789int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
2790{
2791 int ret;
2792 struct btrfs_path *path;
2793 struct btrfs_trans_handle *trans;
2794 struct btrfs_key key;
2795 struct btrfs_key found_key;
2796 struct btrfs_key tmp_key;
2797 struct btrfs_root *log;
2798 struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
2799 u64 highest_inode;
2800 struct walk_control wc = {
2801 .process_func = process_one_buffer,
2802 .stage = 0,
2803 };
2804
2805 fs_info->log_root_recovering = 1;
2806 path = btrfs_alloc_path();
2807 BUG_ON(!path);
2808
2809 trans = btrfs_start_transaction(fs_info->tree_root, 1);
2810
2811 wc.trans = trans;
2812 wc.pin = 1;
2813
2814 walk_log_tree(trans, log_root_tree, &wc);
2815
2816again:
2817 key.objectid = BTRFS_TREE_LOG_OBJECTID;
2818 key.offset = (u64)-1;
2819 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
2820
2821 while (1) {
2822 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
2823 if (ret < 0)
2824 break;
2825 if (ret > 0) {
2826 if (path->slots[0] == 0)
2827 break;
2828 path->slots[0]--;
2829 }
2830 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2831 path->slots[0]);
2832 btrfs_release_path(log_root_tree, path);
2833 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
2834 break;
2835
2836 log = btrfs_read_fs_root_no_radix(log_root_tree,
2837 &found_key);
2838 BUG_ON(!log);
2839
2840
2841 tmp_key.objectid = found_key.offset;
2842 tmp_key.type = BTRFS_ROOT_ITEM_KEY;
2843 tmp_key.offset = (u64)-1;
2844
2845 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
2846 BUG_ON(!wc.replay_dest);
2847
2848 wc.replay_dest->log_root = log;
2849 btrfs_record_root_in_trans(wc.replay_dest);
2850 ret = walk_log_tree(trans, log, &wc);
2851 BUG_ON(ret);
2852
2853 if (wc.stage == LOG_WALK_REPLAY_ALL) {
2854 ret = fixup_inode_link_counts(trans, wc.replay_dest,
2855 path);
2856 BUG_ON(ret);
2857 }
2858 ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
2859 if (ret == 0) {
2860 wc.replay_dest->highest_inode = highest_inode;
2861 wc.replay_dest->last_inode_alloc = highest_inode;
2862 }
2863
2864 key.offset = found_key.offset - 1;
2865 wc.replay_dest->log_root = NULL;
2866 free_extent_buffer(log->node);
2867 kfree(log);
2868
2869 if (found_key.offset == 0)
2870 break;
2871 }
2872 btrfs_release_path(log_root_tree, path);
2873
2874 /* step one is to pin it all, step two is to replay just inodes */
2875 if (wc.pin) {
2876 wc.pin = 0;
2877 wc.process_func = replay_one_buffer;
2878 wc.stage = LOG_WALK_REPLAY_INODES;
2879 goto again;
2880 }
2881 /* step three is to replay everything */
2882 if (wc.stage < LOG_WALK_REPLAY_ALL) {
2883 wc.stage++;
2884 goto again;
2885 }
2886
2887 btrfs_free_path(path);
2888
2889 free_extent_buffer(log_root_tree->node);
2890 log_root_tree->log_root = NULL;
2891 fs_info->log_root_recovering = 0;
2892
2893 /* step 4: commit the transaction, which also unpins the blocks */
2894 btrfs_commit_transaction(trans, fs_info->tree_root);
2895
2896 kfree(log_root_tree);
2897 return 0;
2898}
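/*
 * Recap of the replay pipeline above: the first walk pins every log block
 * so nothing is reallocated mid-replay, the second pass replays inode
 * items, the third pass replays everything else, and the final transaction
 * commit unpins the blocks.
 */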
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
new file mode 100644
index 000000000000..b9409b32ed02
--- /dev/null
+++ b/fs/btrfs/tree-log.h
@@ -0,0 +1,41 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __TREE_LOG_
20#define __TREE_LOG_
21
22int btrfs_sync_log(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root);
24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
25int btrfs_log_dentry(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root, struct dentry *dentry);
27int btrfs_recover_log_trees(struct btrfs_root *tree_root);
28int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
29 struct btrfs_root *root, struct dentry *dentry);
30int btrfs_log_inode(struct btrfs_trans_handle *trans,
31 struct btrfs_root *root, struct inode *inode,
32 int inode_only);
33int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
34 struct btrfs_root *root,
35 const char *name, int name_len,
36 struct inode *dir, u64 index);
37int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
38 struct btrfs_root *root,
39 const char *name, int name_len,
40 struct inode *inode, u64 dirid);
41#endif
diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h
new file mode 100644
index 000000000000..9bf3946d5ef2
--- /dev/null
+++ b/fs/btrfs/version.h
@@ -0,0 +1,4 @@
1#ifndef __BTRFS_VERSION_H
2#define __BTRFS_VERSION_H
3#define BTRFS_BUILD_VERSION "Btrfs"
4#endif
diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh
new file mode 100644
index 000000000000..1ca1952fd917
--- /dev/null
+++ b/fs/btrfs/version.sh
@@ -0,0 +1,43 @@
1#!/bin/bash
2#
3# determine-version -- report a useful version for releases
4#
5# Copyright 2008, Aron Griffis <agriffis@n01se.net>
6# Copyright 2008, Oracle
7# Released under the GNU GPLv2
8
9v="v0.16"
10
11which git &> /dev/null
12if [ $? == 0 ]; then
13 git branch >& /dev/null
14 if [ $? == 0 ]; then
15 if head=`git rev-parse --verify HEAD 2>/dev/null`; then
16 if tag=`git describe --tags 2>/dev/null`; then
17 v="$tag"
18 fi
19
20 # Are there uncommitted changes?
21 git update-index --refresh --unmerged > /dev/null
22 if git diff-index --name-only HEAD | \
23 grep -v "^scripts/package" \
24 | read dummy; then
25 v="$v"-dirty
26 fi
27 fi
28 fi
29fi
30
31echo "#ifndef __BUILD_VERSION" > .build-version.h
32echo "#define __BUILD_VERSION" >> .build-version.h
33echo "#define BTRFS_BUILD_VERSION \"Btrfs $v\"" >> .build-version.h
34echo "#endif" >> .build-version.h
35
36diff -q version.h .build-version.h >& /dev/null
37
38if [ $? == 0 ]; then
39 rm .build-version.h
40 exit 0
41fi
42
43mv .build-version.h version.h
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
new file mode 100644
index 000000000000..b187b537888e
--- /dev/null
+++ b/fs/btrfs/volumes.c
@@ -0,0 +1,3218 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#include <linux/sched.h>
19#include <linux/bio.h>
20#include <linux/buffer_head.h>
21#include <linux/blkdev.h>
22#include <linux/random.h>
23#include <linux/version.h>
24#include <asm/div64.h>
25#include "compat.h"
26#include "ctree.h"
27#include "extent_map.h"
28#include "disk-io.h"
29#include "transaction.h"
30#include "print-tree.h"
31#include "volumes.h"
32#include "async-thread.h"
33
34struct map_lookup {
35 u64 type;
36 int io_align;
37 int io_width;
38 int stripe_len;
39 int sector_size;
40 int num_stripes;
41 int sub_stripes;
42 struct btrfs_bio_stripe stripes[];
43};
44
45static int init_first_rw_device(struct btrfs_trans_handle *trans,
46 struct btrfs_root *root,
47 struct btrfs_device *device);
48static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
49
50#define map_lookup_size(n) (sizeof(struct map_lookup) + \
51 (sizeof(struct btrfs_bio_stripe) * (n)))
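/*
 * A minimal sketch, not in the original patch, of how map_lookup_size()
 * pairs with the flexible stripes[] array above: one allocation covers the
 * header plus num_stripes trailing btrfs_bio_stripe entries.
 */
static struct map_lookup *alloc_map_lookup_sketch(int num_stripes)
{
	struct map_lookup *map;

	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
	if (map)
		map->num_stripes = num_stripes;
	return map;
}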
52
53static DEFINE_MUTEX(uuid_mutex);
54static LIST_HEAD(fs_uuids);
55
56void btrfs_lock_volumes(void)
57{
58 mutex_lock(&uuid_mutex);
59}
60
61void btrfs_unlock_volumes(void)
62{
63 mutex_unlock(&uuid_mutex);
64}
65
66static void lock_chunks(struct btrfs_root *root)
67{
68 mutex_lock(&root->fs_info->chunk_mutex);
69}
70
71static void unlock_chunks(struct btrfs_root *root)
72{
73 mutex_unlock(&root->fs_info->chunk_mutex);
74}
75
76static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
77{
78 struct btrfs_device *device;
79 WARN_ON(fs_devices->opened);
80 while (!list_empty(&fs_devices->devices)) {
81 device = list_entry(fs_devices->devices.next,
82 struct btrfs_device, dev_list);
83 list_del(&device->dev_list);
84 kfree(device->name);
85 kfree(device);
86 }
87 kfree(fs_devices);
88}
89
90int btrfs_cleanup_fs_uuids(void)
91{
92 struct btrfs_fs_devices *fs_devices;
93
94 while (!list_empty(&fs_uuids)) {
95 fs_devices = list_entry(fs_uuids.next,
96 struct btrfs_fs_devices, list);
97 list_del(&fs_devices->list);
98 free_fs_devices(fs_devices);
99 }
100 return 0;
101}
102
103static noinline struct btrfs_device *__find_device(struct list_head *head,
104 u64 devid, u8 *uuid)
105{
106 struct btrfs_device *dev;
107 struct list_head *cur;
108
109 list_for_each(cur, head) {
110 dev = list_entry(cur, struct btrfs_device, dev_list);
111 if (dev->devid == devid &&
112 (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
113 return dev;
114 }
115 }
116 return NULL;
117}
118
119static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
120{
121 struct list_head *cur;
122 struct btrfs_fs_devices *fs_devices;
123
124 list_for_each(cur, &fs_uuids) {
125 fs_devices = list_entry(cur, struct btrfs_fs_devices, list);
126 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
127 return fs_devices;
128 }
129 return NULL;
130}
131
132/*
133 * we try to collect pending bios for a device so we don't get a large
134 * number of procs sending bios down to the same device. This greatly
135 * improves the scheduler's ability to collect and merge the bios.
136 *
137 * But, it also turns into a long list of bios to process and that is sure
138 * to eventually make the worker thread block. The solution here is to
139 * make some progress and then put this work struct back at the end of
140 * the list if the block device is congested. This way, multiple devices
141 * can make progress from a single worker thread.
142 */
143static noinline int run_scheduled_bios(struct btrfs_device *device)
144{
145 struct bio *pending;
146 struct backing_dev_info *bdi;
147 struct btrfs_fs_info *fs_info;
148 struct bio *tail;
149 struct bio *cur;
150 int again = 0;
151 unsigned long num_run = 0;
152 unsigned long limit;
153
154 bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
155 fs_info = device->dev_root->fs_info;
156 limit = btrfs_async_submit_limit(fs_info);
157 limit = limit * 2 / 3;
158
159loop:
160 spin_lock(&device->io_lock);
161
162 /* take all the bios off the list at once and process them
163 * later on (without the lock held). But, remember the
164 * tail and other pointers so the bios can be properly reinserted
165 * into the list if we hit congestion
166 */
167 pending = device->pending_bios;
168 tail = device->pending_bio_tail;
169 WARN_ON(pending && !tail);
170 device->pending_bios = NULL;
171 device->pending_bio_tail = NULL;
172
173 /*
174 * if pending was null this time around, no bios need processing
175 * at all and we can stop. Otherwise it'll loop back up again
176 * and do an additional check so no bios are missed.
177 *
178 * device->running_pending is used to synchronize with the
179 * schedule_bio code.
180 */
181 if (pending) {
182 again = 1;
183 device->running_pending = 1;
184 } else {
185 again = 0;
186 device->running_pending = 0;
187 }
188 spin_unlock(&device->io_lock);
189
190 while (pending) {
191 cur = pending;
192 pending = pending->bi_next;
193 cur->bi_next = NULL;
194 atomic_dec(&fs_info->nr_async_bios);
195
196 if (atomic_read(&fs_info->nr_async_bios) < limit &&
197 waitqueue_active(&fs_info->async_submit_wait))
198 wake_up(&fs_info->async_submit_wait);
199
200 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
201 bio_get(cur);
202 submit_bio(cur->bi_rw, cur);
203 bio_put(cur);
204 num_run++;
205
206 /*
207 * we made progress, there is more work to do and the bdi
208 * is now congested. Back off and let other work structs
209 * run instead
210 */
211 if (pending && bdi_write_congested(bdi) &&
212 fs_info->fs_devices->open_devices > 1) {
213 struct bio *old_head;
214
215 spin_lock(&device->io_lock);
216
217 old_head = device->pending_bios;
218 device->pending_bios = pending;
219 if (device->pending_bio_tail)
220 tail->bi_next = old_head;
221 else
222 device->pending_bio_tail = tail;
223
224 spin_unlock(&device->io_lock);
225 btrfs_requeue_work(&device->work);
226 goto done;
227 }
228 }
229 if (again)
230 goto loop;
231done:
232 return 0;
233}
234
235static void pending_bios_fn(struct btrfs_work *work)
236{
237 struct btrfs_device *device;
238
239 device = container_of(work, struct btrfs_device, work);
240 run_scheduled_bios(device);
241}
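/*
 * Producer-side sketch, not from the original patch (the real queueing
 * helper lives later in this file): bios are appended to the per-device
 * list under io_lock so that run_scheduled_bios() can drain them from a
 * worker thread. Queueing the device's work struct onto the fs_info
 * worker pool is elided here.
 */
static void schedule_bio_sketch(struct btrfs_device *device, struct bio *bio)
{
	spin_lock(&device->io_lock);
	bio->bi_next = NULL;
	if (device->pending_bio_tail)
		device->pending_bio_tail->bi_next = bio;
	device->pending_bio_tail = bio;
	if (!device->pending_bios)
		device->pending_bios = bio;
	spin_unlock(&device->io_lock);
}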
242
243static noinline int device_list_add(const char *path,
244 struct btrfs_super_block *disk_super,
245 u64 devid, struct btrfs_fs_devices **fs_devices_ret)
246{
247 struct btrfs_device *device;
248 struct btrfs_fs_devices *fs_devices;
249 u64 found_transid = btrfs_super_generation(disk_super);
250
251 fs_devices = find_fsid(disk_super->fsid);
252 if (!fs_devices) {
253 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
254 if (!fs_devices)
255 return -ENOMEM;
256 INIT_LIST_HEAD(&fs_devices->devices);
257 INIT_LIST_HEAD(&fs_devices->alloc_list);
258 list_add(&fs_devices->list, &fs_uuids);
259 memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
260 fs_devices->latest_devid = devid;
261 fs_devices->latest_trans = found_transid;
262 device = NULL;
263 } else {
264 device = __find_device(&fs_devices->devices, devid,
265 disk_super->dev_item.uuid);
266 }
267 if (!device) {
268 if (fs_devices->opened)
269 return -EBUSY;
270
271 device = kzalloc(sizeof(*device), GFP_NOFS);
272 if (!device) {
273 /* we can safely leave the fs_devices entry around */
274 return -ENOMEM;
275 }
276 device->devid = devid;
277 device->work.func = pending_bios_fn;
278 memcpy(device->uuid, disk_super->dev_item.uuid,
279 BTRFS_UUID_SIZE);
280 device->barriers = 1;
281 spin_lock_init(&device->io_lock);
282 device->name = kstrdup(path, GFP_NOFS);
283 if (!device->name) {
284 kfree(device);
285 return -ENOMEM;
286 }
287 INIT_LIST_HEAD(&device->dev_alloc_list);
288 list_add(&device->dev_list, &fs_devices->devices);
289 device->fs_devices = fs_devices;
290 fs_devices->num_devices++;
291 }
292
293 if (found_transid > fs_devices->latest_trans) {
294 fs_devices->latest_devid = devid;
295 fs_devices->latest_trans = found_transid;
296 }
297 *fs_devices_ret = fs_devices;
298 return 0;
299}
300
301static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
302{
303 struct btrfs_fs_devices *fs_devices;
304 struct btrfs_device *device;
305 struct btrfs_device *orig_dev;
306
307 fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
308 if (!fs_devices)
309 return ERR_PTR(-ENOMEM);
310
311 INIT_LIST_HEAD(&fs_devices->devices);
312 INIT_LIST_HEAD(&fs_devices->alloc_list);
313 INIT_LIST_HEAD(&fs_devices->list);
314 fs_devices->latest_devid = orig->latest_devid;
315 fs_devices->latest_trans = orig->latest_trans;
316 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
317
318 list_for_each_entry(orig_dev, &orig->devices, dev_list) {
319 device = kzalloc(sizeof(*device), GFP_NOFS);
320 if (!device)
321 goto error;
322
323 device->name = kstrdup(orig_dev->name, GFP_NOFS);
324 if (!device->name)
325 goto error;
326
327 device->devid = orig_dev->devid;
328 device->work.func = pending_bios_fn;
329 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
330 device->barriers = 1;
331 spin_lock_init(&device->io_lock);
332 INIT_LIST_HEAD(&device->dev_list);
333 INIT_LIST_HEAD(&device->dev_alloc_list);
334
335 list_add(&device->dev_list, &fs_devices->devices);
336 device->fs_devices = fs_devices;
337 fs_devices->num_devices++;
338 }
339 return fs_devices;
340error:
341 free_fs_devices(fs_devices);
342 return ERR_PTR(-ENOMEM);
343}
344
345int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
346{
347 struct list_head *tmp;
348 struct list_head *cur;
349 struct btrfs_device *device;
350
351 mutex_lock(&uuid_mutex);
352again:
353 list_for_each_safe(cur, tmp, &fs_devices->devices) {
354 device = list_entry(cur, struct btrfs_device, dev_list);
355 if (device->in_fs_metadata)
356 continue;
357
358 if (device->bdev) {
359 close_bdev_exclusive(device->bdev, device->mode);
360 device->bdev = NULL;
361 fs_devices->open_devices--;
362 }
363 if (device->writeable) {
364 list_del_init(&device->dev_alloc_list);
365 device->writeable = 0;
366 fs_devices->rw_devices--;
367 }
368 list_del_init(&device->dev_list);
369 fs_devices->num_devices--;
370 kfree(device->name);
371 kfree(device);
372 }
373
374 if (fs_devices->seed) {
375 fs_devices = fs_devices->seed;
376 goto again;
377 }
378
379 mutex_unlock(&uuid_mutex);
380 return 0;
381}
382
383static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
384{
385 struct list_head *cur;
386 struct btrfs_device *device;
387
388 if (--fs_devices->opened > 0)
389 return 0;
390
391 list_for_each(cur, &fs_devices->devices) {
392 device = list_entry(cur, struct btrfs_device, dev_list);
393 if (device->bdev) {
394 close_bdev_exclusive(device->bdev, device->mode);
395 fs_devices->open_devices--;
396 }
397 if (device->writeable) {
398 list_del_init(&device->dev_alloc_list);
399 fs_devices->rw_devices--;
400 }
401
402 device->bdev = NULL;
403 device->writeable = 0;
404 device->in_fs_metadata = 0;
405 }
406 WARN_ON(fs_devices->open_devices);
407 WARN_ON(fs_devices->rw_devices);
408 fs_devices->opened = 0;
409 fs_devices->seeding = 0;
410
411 return 0;
412}
413
414int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
415{
416 struct btrfs_fs_devices *seed_devices = NULL;
417 int ret;
418
419 mutex_lock(&uuid_mutex);
420 ret = __btrfs_close_devices(fs_devices);
421 if (!fs_devices->opened) {
422 seed_devices = fs_devices->seed;
423 fs_devices->seed = NULL;
424 }
425 mutex_unlock(&uuid_mutex);
426
427 while (seed_devices) {
428 fs_devices = seed_devices;
429 seed_devices = fs_devices->seed;
430 __btrfs_close_devices(fs_devices);
431 free_fs_devices(fs_devices);
432 }
433 return ret;
434}
435
436static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
437 fmode_t flags, void *holder)
438{
439 struct block_device *bdev;
440 struct list_head *head = &fs_devices->devices;
441 struct list_head *cur;
442 struct btrfs_device *device;
443 struct block_device *latest_bdev = NULL;
444 struct buffer_head *bh;
445 struct btrfs_super_block *disk_super;
446 u64 latest_devid = 0;
447 u64 latest_transid = 0;
448 u64 devid;
449 int seeding = 1;
450 int ret = 0;
451
452 list_for_each(cur, head) {
453 device = list_entry(cur, struct btrfs_device, dev_list);
454 if (device->bdev)
455 continue;
456 if (!device->name)
457 continue;
458
459 bdev = open_bdev_exclusive(device->name, flags, holder);
460 if (IS_ERR(bdev)) {
461 printk(KERN_INFO "open %s failed\n", device->name);
462 goto error;
463 }
464 set_blocksize(bdev, 4096);
465
466 bh = btrfs_read_dev_super(bdev);
467 if (!bh)
468 goto error_close;
469
470 disk_super = (struct btrfs_super_block *)bh->b_data;
471 devid = le64_to_cpu(disk_super->dev_item.devid);
472 if (devid != device->devid)
473 goto error_brelse;
474
475 if (memcmp(device->uuid, disk_super->dev_item.uuid,
476 BTRFS_UUID_SIZE))
477 goto error_brelse;
478
479 device->generation = btrfs_super_generation(disk_super);
480 if (!latest_transid || device->generation > latest_transid) {
481 latest_devid = devid;
482 latest_transid = device->generation;
483 latest_bdev = bdev;
484 }
485
486 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
487 device->writeable = 0;
488 } else {
489 device->writeable = !bdev_read_only(bdev);
490 seeding = 0;
491 }
492
493 device->bdev = bdev;
494 device->in_fs_metadata = 0;
495 device->mode = flags;
496
497 fs_devices->open_devices++;
498 if (device->writeable) {
499 fs_devices->rw_devices++;
500 list_add(&device->dev_alloc_list,
501 &fs_devices->alloc_list);
502 }
503 continue;
504
505error_brelse:
506 brelse(bh);
507error_close:
508 close_bdev_exclusive(bdev, FMODE_READ);
509error:
510 continue;
511 }
512 if (fs_devices->open_devices == 0) {
513 ret = -EIO;
514 goto out;
515 }
516 fs_devices->seeding = seeding;
517 fs_devices->opened = 1;
518 fs_devices->latest_bdev = latest_bdev;
519 fs_devices->latest_devid = latest_devid;
520 fs_devices->latest_trans = latest_transid;
521 fs_devices->total_rw_bytes = 0;
522out:
523 return ret;
524}
525
526int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
527 fmode_t flags, void *holder)
528{
529 int ret;
530
531 mutex_lock(&uuid_mutex);
532 if (fs_devices->opened) {
533 fs_devices->opened++;
534 ret = 0;
535 } else {
536 ret = __btrfs_open_devices(fs_devices, flags, holder);
537 }
538 mutex_unlock(&uuid_mutex);
539 return ret;
540}
541
542int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
543 struct btrfs_fs_devices **fs_devices_ret)
544{
545 struct btrfs_super_block *disk_super;
546 struct block_device *bdev;
547 struct buffer_head *bh;
548 int ret;
549 u64 devid;
550 u64 transid;
551
552 mutex_lock(&uuid_mutex);
553
554 bdev = open_bdev_exclusive(path, flags, holder);
555
556 if (IS_ERR(bdev)) {
557 ret = PTR_ERR(bdev);
558 goto error;
559 }
560
561 ret = set_blocksize(bdev, 4096);
562 if (ret)
563 goto error_close;
564 bh = btrfs_read_dev_super(bdev);
565 if (!bh) {
566 ret = -EIO;
567 goto error_close;
568 }
569 disk_super = (struct btrfs_super_block *)bh->b_data;
570 devid = le64_to_cpu(disk_super->dev_item.devid);
571 transid = btrfs_super_generation(disk_super);
572 if (disk_super->label[0])
573 printk(KERN_INFO "device label %s ", disk_super->label);
574 else {
575 /* FIXME, make a real uuid parser */
576 printk(KERN_INFO "device fsid %llx-%llx ",
577 *(unsigned long long *)disk_super->fsid,
578 *(unsigned long long *)(disk_super->fsid + 8));
579 }
580 printk(KERN_INFO "devid %llu transid %llu %s\n",
581 (unsigned long long)devid, (unsigned long long)transid, path);
582 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
583
584 brelse(bh);
585error_close:
586 close_bdev_exclusive(bdev, flags);
587error:
588 mutex_unlock(&uuid_mutex);
589 return ret;
590}
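/*
 * Usage sketch, not part of the original patch: the mount and device-scan
 * paths are expected to feed each candidate device through
 * btrfs_scan_one_device() so device_list_add() can group devices by fsid,
 * then open the assembled set. Error handling is reduced to the minimum.
 */
static int scan_then_open_sketch(const char *path, fmode_t flags,
				 void *holder)
{
	struct btrfs_fs_devices *fs_devices;
	int ret;

	ret = btrfs_scan_one_device(path, flags, holder, &fs_devices);
	if (ret)
		return ret;
	return btrfs_open_devices(fs_devices, flags, holder);
}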
591
592/*
593 * this uses a pretty simple search, the expectation is that it is
594 * called very infrequently and that a given device has a small number
595 * of extents
596 */
597static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
598 struct btrfs_device *device,
599 u64 num_bytes, u64 *start)
600{
601 struct btrfs_key key;
602 struct btrfs_root *root = device->dev_root;
603 struct btrfs_dev_extent *dev_extent = NULL;
604 struct btrfs_path *path;
605 u64 hole_size = 0;
606 u64 last_byte = 0;
607 u64 search_start = 0;
608 u64 search_end = device->total_bytes;
609 int ret;
610 int slot = 0;
611 int start_found;
612 struct extent_buffer *l;
613
614 path = btrfs_alloc_path();
615 if (!path)
616 return -ENOMEM;
617 path->reada = 2;
618 start_found = 0;
619
620 /* FIXME use last free of some kind */
621
622 /* we don't want to overwrite the superblock on the drive,
623 * so we make sure to start at an offset of at least 1MB
624 */
625 search_start = max((u64)1024 * 1024, search_start);
626
627 if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
628 search_start = max(root->fs_info->alloc_start, search_start);
629
630 key.objectid = device->devid;
631 key.offset = search_start;
632 key.type = BTRFS_DEV_EXTENT_KEY;
633 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
634 if (ret < 0)
635 goto error;
636 ret = btrfs_previous_item(root, path, 0, key.type);
637 if (ret < 0)
638 goto error;
639 l = path->nodes[0];
640 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
641 while (1) {
642 l = path->nodes[0];
643 slot = path->slots[0];
644 if (slot >= btrfs_header_nritems(l)) {
645 ret = btrfs_next_leaf(root, path);
646 if (ret == 0)
647 continue;
648 if (ret < 0)
649 goto error;
650no_more_items:
651 if (!start_found) {
652 if (search_start >= search_end) {
653 ret = -ENOSPC;
654 goto error;
655 }
656 *start = search_start;
657 start_found = 1;
658 goto check_pending;
659 }
660 *start = last_byte > search_start ?
661 last_byte : search_start;
662 if (search_end <= *start) {
663 ret = -ENOSPC;
664 goto error;
665 }
666 goto check_pending;
667 }
668 btrfs_item_key_to_cpu(l, &key, slot);
669
670 if (key.objectid < device->devid)
671 goto next;
672
673 if (key.objectid > device->devid)
674 goto no_more_items;
675
676 if (key.offset >= search_start && key.offset > last_byte &&
677 start_found) {
678 if (last_byte < search_start)
679 last_byte = search_start;
680 hole_size = key.offset - last_byte;
681 if (key.offset > last_byte &&
682 hole_size >= num_bytes) {
683 *start = last_byte;
684 goto check_pending;
685 }
686 }
687 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
688 goto next;
689
690 start_found = 1;
691 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
692 last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
693next:
694 path->slots[0]++;
695 cond_resched();
696 }
697check_pending:
698 /* we have to make sure we didn't find an extent that has already
699 * been allocated by the map tree or the original allocation
700 */
701 BUG_ON(*start < search_start);
702
703 if (*start + num_bytes > search_end) {
704 ret = -ENOSPC;
705 goto error;
706 }
707 /* check for pending inserts here */
708 ret = 0;
709
710error:
711 btrfs_free_path(path);
712 return ret;
713}
714
715static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
716 struct btrfs_device *device,
717 u64 start)
718{
719 int ret;
720 struct btrfs_path *path;
721 struct btrfs_root *root = device->dev_root;
722 struct btrfs_key key;
723 struct btrfs_key found_key;
724 struct extent_buffer *leaf = NULL;
725 struct btrfs_dev_extent *extent = NULL;
726
727 path = btrfs_alloc_path();
728 if (!path)
729 return -ENOMEM;
730
731 key.objectid = device->devid;
732 key.offset = start;
733 key.type = BTRFS_DEV_EXTENT_KEY;
734
735 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
736 if (ret > 0) {
737 ret = btrfs_previous_item(root, path, key.objectid,
738 BTRFS_DEV_EXTENT_KEY);
739 BUG_ON(ret);
740 leaf = path->nodes[0];
741 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
742 extent = btrfs_item_ptr(leaf, path->slots[0],
743 struct btrfs_dev_extent);
744 BUG_ON(found_key.offset > start || found_key.offset +
745 btrfs_dev_extent_length(leaf, extent) < start);
746 ret = 0;
747 } else if (ret == 0) {
748 leaf = path->nodes[0];
749 extent = btrfs_item_ptr(leaf, path->slots[0],
750 struct btrfs_dev_extent);
751 }
752 BUG_ON(ret);
753
754 if (device->bytes_used > 0)
755 device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
756 ret = btrfs_del_item(trans, root, path);
757 BUG_ON(ret);
758
759 btrfs_free_path(path);
760 return ret;
761}
762
763int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
764 struct btrfs_device *device,
765 u64 chunk_tree, u64 chunk_objectid,
766 u64 chunk_offset, u64 start, u64 num_bytes)
767{
768 int ret;
769 struct btrfs_path *path;
770 struct btrfs_root *root = device->dev_root;
771 struct btrfs_dev_extent *extent;
772 struct extent_buffer *leaf;
773 struct btrfs_key key;
774
775 WARN_ON(!device->in_fs_metadata);
776 path = btrfs_alloc_path();
777 if (!path)
778 return -ENOMEM;
779
780 key.objectid = device->devid;
781 key.offset = start;
782 key.type = BTRFS_DEV_EXTENT_KEY;
783 ret = btrfs_insert_empty_item(trans, root, path, &key,
784 sizeof(*extent));
785 BUG_ON(ret);
786
787 leaf = path->nodes[0];
788 extent = btrfs_item_ptr(leaf, path->slots[0],
789 struct btrfs_dev_extent);
790 btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
791 btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
792 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
793
794 write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
795 (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
796 BTRFS_UUID_SIZE);
797
798 btrfs_set_dev_extent_length(leaf, extent, num_bytes);
799 btrfs_mark_buffer_dirty(leaf);
800 btrfs_free_path(path);
801 return ret;
802}
803
804static noinline int find_next_chunk(struct btrfs_root *root,
805 u64 objectid, u64 *offset)
806{
807 struct btrfs_path *path;
808 int ret;
809 struct btrfs_key key;
810 struct btrfs_chunk *chunk;
811 struct btrfs_key found_key;
812
813 path = btrfs_alloc_path();
814 BUG_ON(!path);
815
816 key.objectid = objectid;
817 key.offset = (u64)-1;
818 key.type = BTRFS_CHUNK_ITEM_KEY;
819
820 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
821 if (ret < 0)
822 goto error;
823
824 BUG_ON(ret == 0);
825
826 ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
827 if (ret) {
828 *offset = 0;
829 } else {
830 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
831 path->slots[0]);
832 if (found_key.objectid != objectid)
833 *offset = 0;
834 else {
835 chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
836 struct btrfs_chunk);
837 *offset = found_key.offset +
838 btrfs_chunk_length(path->nodes[0], chunk);
839 }
840 }
841 ret = 0;
842error:
843 btrfs_free_path(path);
844 return ret;
845}
846
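/*
 * scan the device items in the chunk root and hand back the next unused
 * devid.  Device ids start at 1 when the tree is empty.
 */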
847static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
848{
849 int ret;
850 struct btrfs_key key;
851 struct btrfs_key found_key;
852 struct btrfs_path *path;
853
854 root = root->fs_info->chunk_root;
855
856 path = btrfs_alloc_path();
857 if (!path)
858 return -ENOMEM;
859
860 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
861 key.type = BTRFS_DEV_ITEM_KEY;
862 key.offset = (u64)-1;
863
864 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
865 if (ret < 0)
866 goto error;
867
868 BUG_ON(ret == 0);
869
870 ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
871 BTRFS_DEV_ITEM_KEY);
872 if (ret) {
873 *objectid = 1;
874 } else {
875 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
876 path->slots[0]);
877 *objectid = found_key.offset + 1;
878 }
879 ret = 0;
880error:
881 btrfs_free_path(path);
882 return ret;
883}
884
885/*
886 * the device information is stored in the chunk root;
887 * the btrfs_device struct should be fully filled in
888 */
889int btrfs_add_device(struct btrfs_trans_handle *trans,
890 struct btrfs_root *root,
891 struct btrfs_device *device)
892{
893 int ret;
894 struct btrfs_path *path;
895 struct btrfs_dev_item *dev_item;
896 struct extent_buffer *leaf;
897 struct btrfs_key key;
898 unsigned long ptr;
899
900 root = root->fs_info->chunk_root;
901
902 path = btrfs_alloc_path();
903 if (!path)
904 return -ENOMEM;
905
906 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
907 key.type = BTRFS_DEV_ITEM_KEY;
908 key.offset = device->devid;
909
910 ret = btrfs_insert_empty_item(trans, root, path, &key,
911 sizeof(*dev_item));
912 if (ret)
913 goto out;
914
915 leaf = path->nodes[0];
916 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
917
918 btrfs_set_device_id(leaf, dev_item, device->devid);
919 btrfs_set_device_generation(leaf, dev_item, 0);
920 btrfs_set_device_type(leaf, dev_item, device->type);
921 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
922 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
923 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
924 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
925 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
926 btrfs_set_device_group(leaf, dev_item, 0);
927 btrfs_set_device_seek_speed(leaf, dev_item, 0);
928 btrfs_set_device_bandwidth(leaf, dev_item, 0);
929 btrfs_set_device_start_offset(leaf, dev_item, 0);
930
931 ptr = (unsigned long)btrfs_device_uuid(dev_item);
932 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
933 ptr = (unsigned long)btrfs_device_fsid(dev_item);
934 write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
935 btrfs_mark_buffer_dirty(leaf);
936
937 ret = 0;
938out:
939 btrfs_free_path(path);
940 return ret;
941}
942
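/*
 * delete the device item for @device from the chunk root, in its own
 * transaction and under the chunk mutex.
 */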
943static int btrfs_rm_dev_item(struct btrfs_root *root,
944 struct btrfs_device *device)
945{
946 int ret;
947 struct btrfs_path *path;
948 struct btrfs_key key;
949 struct btrfs_trans_handle *trans;
950
951 root = root->fs_info->chunk_root;
952
953 path = btrfs_alloc_path();
954 if (!path)
955 return -ENOMEM;
956
957 trans = btrfs_start_transaction(root, 1);
958 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
959 key.type = BTRFS_DEV_ITEM_KEY;
960 key.offset = device->devid;
961 lock_chunks(root);
962
963 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
964 if (ret < 0)
965 goto out;
966
967 if (ret > 0) {
968 ret = -ENOENT;
969 goto out;
970 }
971
972 ret = btrfs_del_item(trans, root, path);
973 if (ret)
974 goto out;
975out:
976 btrfs_free_path(path);
977 unlock_chunks(root);
978 btrfs_commit_transaction(trans, root);
979 return ret;
980}
981
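/*
 * remove a device from the filesystem.  @device_path may be the special
 * string "missing", in which case the first device that is recorded in
 * the metadata but has no backing bdev is picked.  The device is shrunk
 * to zero, which relocates all of its chunks, its device item is deleted,
 * and the super block magic on the removed device is cleared so it is no
 * longer detected as part of the FS.
 */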
982int btrfs_rm_device(struct btrfs_root *root, char *device_path)
983{
984 struct btrfs_device *device;
985 struct btrfs_device *next_device;
986 struct block_device *bdev;
987 struct buffer_head *bh = NULL;
988 struct btrfs_super_block *disk_super;
989 u64 all_avail;
990 u64 devid;
991 u64 num_devices;
992 u8 *dev_uuid;
993 int ret = 0;
994
995 mutex_lock(&uuid_mutex);
996 mutex_lock(&root->fs_info->volume_mutex);
997
998 all_avail = root->fs_info->avail_data_alloc_bits |
999 root->fs_info->avail_system_alloc_bits |
1000 root->fs_info->avail_metadata_alloc_bits;
1001
1002 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
1003 root->fs_info->fs_devices->rw_devices <= 4) {
1004 printk(KERN_ERR "btrfs: unable to go below four devices "
1005 "on raid10\n");
1006 ret = -EINVAL;
1007 goto out;
1008 }
1009
1010 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
1011 root->fs_info->fs_devices->rw_devices <= 2) {
1012 printk(KERN_ERR "btrfs: unable to go below two "
1013 "devices on raid1\n");
1014 ret = -EINVAL;
1015 goto out;
1016 }
1017
1018 if (strcmp(device_path, "missing") == 0) {
1019 struct list_head *cur;
1020 struct list_head *devices;
1021 struct btrfs_device *tmp;
1022
1023 device = NULL;
1024 devices = &root->fs_info->fs_devices->devices;
1025 list_for_each(cur, devices) {
1026 tmp = list_entry(cur, struct btrfs_device, dev_list);
1027 if (tmp->in_fs_metadata && !tmp->bdev) {
1028 device = tmp;
1029 break;
1030 }
1031 }
1032 bdev = NULL;
1033 bh = NULL;
1034 disk_super = NULL;
1035 if (!device) {
1036 printk(KERN_ERR "btrfs: no missing devices found to "
1037 "remove\n");
ret = -ENOENT;
1038 goto out;
1039 }
1040 } else {
1041 bdev = open_bdev_exclusive(device_path, FMODE_READ,
1042 root->fs_info->bdev_holder);
1043 if (IS_ERR(bdev)) {
1044 ret = PTR_ERR(bdev);
1045 goto out;
1046 }
1047
1048 set_blocksize(bdev, 4096);
1049 bh = btrfs_read_dev_super(bdev);
1050 if (!bh) {
1051 ret = -EIO;
1052 goto error_close;
1053 }
1054 disk_super = (struct btrfs_super_block *)bh->b_data;
1055 devid = le64_to_cpu(disk_super->dev_item.devid);
1056 dev_uuid = disk_super->dev_item.uuid;
1057 device = btrfs_find_device(root, devid, dev_uuid,
1058 disk_super->fsid);
1059 if (!device) {
1060 ret = -ENOENT;
1061 goto error_brelse;
1062 }
1063 }
1064
1065 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1066 printk(KERN_ERR "btrfs: unable to remove the only writeable "
1067 "device\n");
1068 ret = -EINVAL;
1069 goto error_brelse;
1070 }
1071
1072 if (device->writeable) {
1073 list_del_init(&device->dev_alloc_list);
1074 root->fs_info->fs_devices->rw_devices--;
1075 }
1076
1077 ret = btrfs_shrink_device(device, 0);
1078 if (ret)
1079 goto error_brelse;
1080
1081 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1082 if (ret)
1083 goto error_brelse;
1084
1085 device->in_fs_metadata = 0;
1086 list_del_init(&device->dev_list);
1087 device->fs_devices->num_devices--;
1088
1089 next_device = list_entry(root->fs_info->fs_devices->devices.next,
1090 struct btrfs_device, dev_list);
1091 if (device->bdev == root->fs_info->sb->s_bdev)
1092 root->fs_info->sb->s_bdev = next_device->bdev;
1093 if (device->bdev == root->fs_info->fs_devices->latest_bdev)
1094 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1095
1096 if (device->bdev) {
1097 close_bdev_exclusive(device->bdev, device->mode);
1098 device->bdev = NULL;
1099 device->fs_devices->open_devices--;
1100 }
1101
1102 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
1103 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
1104
1105 if (device->fs_devices->open_devices == 0) {
1106 struct btrfs_fs_devices *fs_devices;
1107 fs_devices = root->fs_info->fs_devices;
1108 while (fs_devices) {
1109 if (fs_devices->seed == device->fs_devices)
1110 break;
1111 fs_devices = fs_devices->seed;
1112 }
1113 fs_devices->seed = device->fs_devices->seed;
1114 device->fs_devices->seed = NULL;
1115 __btrfs_close_devices(device->fs_devices);
1116 free_fs_devices(device->fs_devices);
1117 }
1118
1119 /*
1120 * at this point, the device is zero sized. We want to
1121 * remove it from the devices list and zero out the old super
1122 */
1123 if (device->writeable) {
1124 /* make sure this device isn't detected as part of
1125 * the FS anymore
1126 */
1127 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
1128 set_buffer_dirty(bh);
1129 sync_dirty_buffer(bh);
1130 }
1131
1132 kfree(device->name);
1133 kfree(device);
1134 ret = 0;
1135
1136error_brelse:
1137 brelse(bh);
1138error_close:
1139 if (bdev)
1140 close_bdev_exclusive(bdev, FMODE_READ);
1141out:
1142 mutex_unlock(&root->fs_info->volume_mutex);
1143 mutex_unlock(&uuid_mutex);
1144 return ret;
1145}
1146
1147/*
1148 * does all the dirty work required for changing the file system's UUID.
1149 */
1150static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1151 struct btrfs_root *root)
1152{
1153 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1154 struct btrfs_fs_devices *old_devices;
1155 struct btrfs_fs_devices *seed_devices;
1156 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
1157 struct btrfs_device *device;
1158 u64 super_flags;
1159
1160 BUG_ON(!mutex_is_locked(&uuid_mutex));
1161 if (!fs_devices->seeding)
1162 return -EINVAL;
1163
1164 seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
1165 if (!seed_devices)
1166 return -ENOMEM;
1167
1168 old_devices = clone_fs_devices(fs_devices);
1169 if (IS_ERR(old_devices)) {
1170 kfree(seed_devices);
1171 return PTR_ERR(old_devices);
1172 }
1173
1174 list_add(&old_devices->list, &fs_uuids);
1175
1176 memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
1177 seed_devices->opened = 1;
1178 INIT_LIST_HEAD(&seed_devices->devices);
1179 INIT_LIST_HEAD(&seed_devices->alloc_list);
1180 list_splice_init(&fs_devices->devices, &seed_devices->devices);
1181 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1182 list_for_each_entry(device, &seed_devices->devices, dev_list) {
1183 device->fs_devices = seed_devices;
1184 }
1185
1186 fs_devices->seeding = 0;
1187 fs_devices->num_devices = 0;
1188 fs_devices->open_devices = 0;
1189 fs_devices->seed = seed_devices;
1190
1191 generate_random_uuid(fs_devices->fsid);
1192 memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1193 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1194 super_flags = btrfs_super_flags(disk_super) &
1195 ~BTRFS_SUPER_FLAG_SEEDING;
1196 btrfs_set_super_flags(disk_super, super_flags);
1197
1198 return 0;
1199}
1200
1201/*
1202 * store the expected generation for seed devices in device items.
1203 */
1204static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
1205 struct btrfs_root *root)
1206{
1207 struct btrfs_path *path;
1208 struct extent_buffer *leaf;
1209 struct btrfs_dev_item *dev_item;
1210 struct btrfs_device *device;
1211 struct btrfs_key key;
1212 u8 fs_uuid[BTRFS_UUID_SIZE];
1213 u8 dev_uuid[BTRFS_UUID_SIZE];
1214 u64 devid;
1215 int ret;
1216
1217 path = btrfs_alloc_path();
1218 if (!path)
1219 return -ENOMEM;
1220
1221 root = root->fs_info->chunk_root;
1222 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1223 key.offset = 0;
1224 key.type = BTRFS_DEV_ITEM_KEY;
1225
1226 while (1) {
1227 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1228 if (ret < 0)
1229 goto error;
1230
1231 leaf = path->nodes[0];
1232next_slot:
1233 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1234 ret = btrfs_next_leaf(root, path);
1235 if (ret > 0)
1236 break;
1237 if (ret < 0)
1238 goto error;
1239 leaf = path->nodes[0];
1240 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1241 btrfs_release_path(root, path);
1242 continue;
1243 }
1244
1245 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1246 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
1247 key.type != BTRFS_DEV_ITEM_KEY)
1248 break;
1249
1250 dev_item = btrfs_item_ptr(leaf, path->slots[0],
1251 struct btrfs_dev_item);
1252 devid = btrfs_device_id(leaf, dev_item);
1253 read_extent_buffer(leaf, dev_uuid,
1254 (unsigned long)btrfs_device_uuid(dev_item),
1255 BTRFS_UUID_SIZE);
1256 read_extent_buffer(leaf, fs_uuid,
1257 (unsigned long)btrfs_device_fsid(dev_item),
1258 BTRFS_UUID_SIZE);
1259 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
1260 BUG_ON(!device);
1261
1262 if (device->fs_devices->seeding) {
1263 btrfs_set_device_generation(leaf, dev_item,
1264 device->generation);
1265 btrfs_mark_buffer_dirty(leaf);
1266 }
1267
1268 path->slots[0]++;
1269 goto next_slot;
1270 }
1271 ret = 0;
1272error:
1273 btrfs_free_path(path);
1274 return ret;
1275}
1276
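/*
 * add a new device to the filesystem.  If the existing filesystem is a
 * seeding one, this also sprouts a new writable filesystem on top of it:
 * the old devices are moved onto a seed fs_devices struct, a new fsid is
 * generated and the system chunks are relocated onto the new device.
 */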
1277int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1278{
1279 struct btrfs_trans_handle *trans;
1280 struct btrfs_device *device;
1281 struct block_device *bdev;
1282 struct list_head *cur;
1283 struct list_head *devices;
1284 struct super_block *sb = root->fs_info->sb;
1285 u64 total_bytes;
1286 int seeding_dev = 0;
1287 int ret = 0;
1288
1289 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1290 return -EINVAL;
1291
1292 bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
1293 if (IS_ERR(bdev))
1294 return PTR_ERR(bdev);
1295
1296 if (root->fs_info->fs_devices->seeding) {
1297 seeding_dev = 1;
1298 down_write(&sb->s_umount);
1299 mutex_lock(&uuid_mutex);
1300 }
1301
1302 filemap_write_and_wait(bdev->bd_inode->i_mapping);
1303 mutex_lock(&root->fs_info->volume_mutex);
1304
1305 devices = &root->fs_info->fs_devices->devices;
1306 list_for_each(cur, devices) {
1307 device = list_entry(cur, struct btrfs_device, dev_list);
1308 if (device->bdev == bdev) {
1309 ret = -EEXIST;
1310 goto error;
1311 }
1312 }
1313
1314 device = kzalloc(sizeof(*device), GFP_NOFS);
1315 if (!device) {
1316 /* we can safely leave the fs_devices entry around */
1317 ret = -ENOMEM;
1318 goto error;
1319 }
1320
1321 device->name = kstrdup(device_path, GFP_NOFS);
1322 if (!device->name) {
1323 kfree(device);
1324 ret = -ENOMEM;
1325 goto error;
1326 }
1327
1328 ret = find_next_devid(root, &device->devid);
1329 if (ret) {
1330 kfree(device);
1331 goto error;
1332 }
1333
1334 trans = btrfs_start_transaction(root, 1);
1335 lock_chunks(root);
1336
1337 device->barriers = 1;
1338 device->writeable = 1;
1339 device->work.func = pending_bios_fn;
1340 generate_random_uuid(device->uuid);
1341 spin_lock_init(&device->io_lock);
1342 device->generation = trans->transid;
1343 device->io_width = root->sectorsize;
1344 device->io_align = root->sectorsize;
1345 device->sector_size = root->sectorsize;
1346 device->total_bytes = i_size_read(bdev->bd_inode);
1347 device->dev_root = root->fs_info->dev_root;
1348 device->bdev = bdev;
1349 device->in_fs_metadata = 1;
1350 device->mode = 0;
1351 set_blocksize(device->bdev, 4096);
1352
1353 if (seeding_dev) {
1354 sb->s_flags &= ~MS_RDONLY;
1355 ret = btrfs_prepare_sprout(trans, root);
1356 BUG_ON(ret);
1357 }
1358
1359 device->fs_devices = root->fs_info->fs_devices;
1360 list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
1361 list_add(&device->dev_alloc_list,
1362 &root->fs_info->fs_devices->alloc_list);
1363 root->fs_info->fs_devices->num_devices++;
1364 root->fs_info->fs_devices->open_devices++;
1365 root->fs_info->fs_devices->rw_devices++;
1366 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1367
1368 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
1369 btrfs_set_super_total_bytes(&root->fs_info->super_copy,
1370 total_bytes + device->total_bytes);
1371
1372 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
1373 btrfs_set_super_num_devices(&root->fs_info->super_copy,
1374 total_bytes + 1);
1375
1376 if (seeding_dev) {
1377 ret = init_first_rw_device(trans, root, device);
1378 BUG_ON(ret);
1379 ret = btrfs_finish_sprout(trans, root);
1380 BUG_ON(ret);
1381 } else {
1382 ret = btrfs_add_device(trans, root, device);
1383 }
1384
1385 unlock_chunks(root);
1386 btrfs_commit_transaction(trans, root);
1387
1388 if (seeding_dev) {
1389 mutex_unlock(&uuid_mutex);
1390 up_write(&sb->s_umount);
1391
1392 ret = btrfs_relocate_sys_chunks(root);
1393 BUG_ON(ret);
1394 }
1395out:
1396 mutex_unlock(&root->fs_info->volume_mutex);
1397 return ret;
1398error:
1399 close_bdev_exclusive(bdev, 0);
1400 if (seeding_dev) {
1401 mutex_unlock(&uuid_mutex);
1402 up_write(&sb->s_umount);
1403 }
1404 goto out;
1405}
1406
1407static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1408 struct btrfs_device *device)
1409{
1410 int ret;
1411 struct btrfs_path *path;
1412 struct btrfs_root *root;
1413 struct btrfs_dev_item *dev_item;
1414 struct extent_buffer *leaf;
1415 struct btrfs_key key;
1416
1417 root = device->dev_root->fs_info->chunk_root;
1418
1419 path = btrfs_alloc_path();
1420 if (!path)
1421 return -ENOMEM;
1422
1423 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1424 key.type = BTRFS_DEV_ITEM_KEY;
1425 key.offset = device->devid;
1426
1427 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1428 if (ret < 0)
1429 goto out;
1430
1431 if (ret > 0) {
1432 ret = -ENOENT;
1433 goto out;
1434 }
1435
1436 leaf = path->nodes[0];
1437 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1438
1439 btrfs_set_device_id(leaf, dev_item, device->devid);
1440 btrfs_set_device_type(leaf, dev_item, device->type);
1441 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1442 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1443 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1444 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
1445 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1446 btrfs_mark_buffer_dirty(leaf);
1447
1448out:
1449 btrfs_free_path(path);
1450 return ret;
1451}
1452
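/*
 * grow @device to @new_size.  This helper assumes the chunk mutex is
 * already held; btrfs_grow_device() below is the locking wrapper.
 */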
1453static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1454 struct btrfs_device *device, u64 new_size)
1455{
1456 struct btrfs_super_block *super_copy =
1457 &device->dev_root->fs_info->super_copy;
1458 u64 old_total = btrfs_super_total_bytes(super_copy);
1459 u64 diff = new_size - device->total_bytes;
1460
1461 if (!device->writeable)
1462 return -EACCES;
1463 if (new_size <= device->total_bytes)
1464 return -EINVAL;
1465
1466 btrfs_set_super_total_bytes(super_copy, old_total + diff);
1467 device->fs_devices->total_rw_bytes += diff;
1468
1469 device->total_bytes = new_size;
1470 return btrfs_update_device(trans, device);
1471}
1472
1473int btrfs_grow_device(struct btrfs_trans_handle *trans,
1474 struct btrfs_device *device, u64 new_size)
1475{
1476 int ret;
1477 lock_chunks(device->dev_root);
1478 ret = __btrfs_grow_device(trans, device, new_size);
1479 unlock_chunks(device->dev_root);
1480 return ret;
1481}
1482
1483static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1484 struct btrfs_root *root,
1485 u64 chunk_tree, u64 chunk_objectid,
1486 u64 chunk_offset)
1487{
1488 int ret;
1489 struct btrfs_path *path;
1490 struct btrfs_key key;
1491
1492 root = root->fs_info->chunk_root;
1493 path = btrfs_alloc_path();
1494 if (!path)
1495 return -ENOMEM;
1496
1497 key.objectid = chunk_objectid;
1498 key.offset = chunk_offset;
1499 key.type = BTRFS_CHUNK_ITEM_KEY;
1500
1501 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1502 BUG_ON(ret);
1503
1504 ret = btrfs_del_item(trans, root, path);
1505 BUG_ON(ret);
1506
1507 btrfs_free_path(path);
1508 return 0;
1509}
1510
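/*
 * remove the copy of a system chunk from the sys_chunk_array in the
 * super block copy, shifting the remaining entries down.
 */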
1511static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid,
1512 u64 chunk_offset)
1513{
1514 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1515 struct btrfs_disk_key *disk_key;
1516 struct btrfs_chunk *chunk;
1517 u8 *ptr;
1518 int ret = 0;
1519 u32 num_stripes;
1520 u32 array_size;
1521 u32 len = 0;
1522 u32 cur;
1523 struct btrfs_key key;
1524
1525 array_size = btrfs_super_sys_array_size(super_copy);
1526
1527 ptr = super_copy->sys_chunk_array;
1528 cur = 0;
1529
1530 while (cur < array_size) {
1531 disk_key = (struct btrfs_disk_key *)ptr;
1532 btrfs_disk_key_to_cpu(&key, disk_key);
1533
1534 len = sizeof(*disk_key);
1535
1536 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1537 chunk = (struct btrfs_chunk *)(ptr + len);
1538 num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1539 len += btrfs_chunk_item_size(num_stripes);
1540 } else {
1541 ret = -EIO;
1542 break;
1543 }
1544 if (key.objectid == chunk_objectid &&
1545 key.offset == chunk_offset) {
1546 memmove(ptr, ptr + len, array_size - (cur + len));
1547 array_size -= len;
1548 btrfs_set_super_sys_array_size(super_copy, array_size);
1549 } else {
1550 ptr += len;
1551 cur += len;
1552 }
1553 }
1554 return ret;
1555}
1556
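/*
 * relocate a chunk: move all of its extents elsewhere, then delete the
 * device extents, the chunk item (plus its sys_chunk_array copy for
 * system chunks), the block group and the extent mapping.
 */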
1557static int btrfs_relocate_chunk(struct btrfs_root *root,
1558 u64 chunk_tree, u64 chunk_objectid,
1559 u64 chunk_offset)
1560{
1561 struct extent_map_tree *em_tree;
1562 struct btrfs_root *extent_root;
1563 struct btrfs_trans_handle *trans;
1564 struct extent_map *em;
1565 struct map_lookup *map;
1566 int ret;
1567 int i;
1568
1569 printk(KERN_INFO "btrfs relocating chunk %llu\n",
1570 (unsigned long long)chunk_offset);
1571 root = root->fs_info->chunk_root;
1572 extent_root = root->fs_info->extent_root;
1573 em_tree = &root->fs_info->mapping_tree.map_tree;
1574
1575 /* step one, relocate all the extents inside this chunk */
1576 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1577 BUG_ON(ret);
1578
1579 trans = btrfs_start_transaction(root, 1);
1580 BUG_ON(!trans);
1581
1582 lock_chunks(root);
1583
1584 /*
1585 * step two, delete the device extents and the
1586 * chunk tree entries
1587 */
1588 spin_lock(&em_tree->lock);
1589 em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1590 spin_unlock(&em_tree->lock);
1591
1592 BUG_ON(em->start > chunk_offset ||
1593 em->start + em->len < chunk_offset);
1594 map = (struct map_lookup *)em->bdev;
1595
1596 for (i = 0; i < map->num_stripes; i++) {
1597 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
1598 map->stripes[i].physical);
1599 BUG_ON(ret);
1600
1601 if (map->stripes[i].dev) {
1602 ret = btrfs_update_device(trans, map->stripes[i].dev);
1603 BUG_ON(ret);
1604 }
1605 }
1606 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
1607 chunk_offset);
1608
1609 BUG_ON(ret);
1610
1611 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
1612 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
1613 BUG_ON(ret);
1614 }
1615
1616 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
1617 BUG_ON(ret);
1618
1619 spin_lock(&em_tree->lock);
1620 remove_extent_mapping(em_tree, em);
1621 spin_unlock(&em_tree->lock);
1622
1623 kfree(map);
1624 em->bdev = NULL;
1625
1626 /* once for the tree */
1627 free_extent_map(em);
1628 /* once for us */
1629 free_extent_map(em);
1630
1631 unlock_chunks(root);
1632 btrfs_end_transaction(trans, root);
1633 return 0;
1634}
1635
1636static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
1637{
1638 struct btrfs_root *chunk_root = root->fs_info->chunk_root;
1639 struct btrfs_path *path;
1640 struct extent_buffer *leaf;
1641 struct btrfs_chunk *chunk;
1642 struct btrfs_key key;
1643 struct btrfs_key found_key;
1644 u64 chunk_tree = chunk_root->root_key.objectid;
1645 u64 chunk_type;
1646 int ret;
1647
1648 path = btrfs_alloc_path();
1649 if (!path)
1650 return -ENOMEM;
1651
1652 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1653 key.offset = (u64)-1;
1654 key.type = BTRFS_CHUNK_ITEM_KEY;
1655
1656 while (1) {
1657 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
1658 if (ret < 0)
1659 goto error;
1660 BUG_ON(ret == 0);
1661
1662 ret = btrfs_previous_item(chunk_root, path, key.objectid,
1663 key.type);
1664 if (ret < 0)
1665 goto error;
1666 if (ret > 0)
1667 break;
1668
1669 leaf = path->nodes[0];
1670 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1671
1672 chunk = btrfs_item_ptr(leaf, path->slots[0],
1673 struct btrfs_chunk);
1674 chunk_type = btrfs_chunk_type(leaf, chunk);
1675 btrfs_release_path(chunk_root, path);
1676
1677 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
1678 ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
1679 found_key.objectid,
1680 found_key.offset);
1681 BUG_ON(ret);
1682 }
1683
1684 if (found_key.offset == 0)
1685 break;
1686 key.offset = found_key.offset - 1;
1687 }
1688 ret = 0;
1689error:
1690 btrfs_free_path(path);
1691 return ret;
1692}
1693
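/* return @num scaled by @factor tenths, i.e. num * factor / 10 */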
1694static u64 div_factor(u64 num, int factor)
1695{
1696 if (factor == 10)
1697 return num;
1698 num *= factor;
1699 do_div(num, 10);
1700 return num;
1701}
1702
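/*
 * balance chunks across the devices: step one shrinks every writable
 * device slightly (by at most 1MB) and grows it back, forcing the chunks
 * at the tail of each device to relocate; step two walks the chunk tree
 * from the highest offset down and relocates each chunk, letting the
 * allocator spread it over the devices again.
 */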
1703int btrfs_balance(struct btrfs_root *dev_root)
1704{
1705 int ret;
1706 struct list_head *cur;
1707 struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
1708 struct btrfs_device *device;
1709 u64 old_size;
1710 u64 size_to_free;
1711 struct btrfs_path *path;
1712 struct btrfs_key key;
1713 struct btrfs_chunk *chunk;
1714 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
1715 struct btrfs_trans_handle *trans;
1716 struct btrfs_key found_key;
1717
1718 if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
1719 return -EROFS;
1720
1721 mutex_lock(&dev_root->fs_info->volume_mutex);
1722 dev_root = dev_root->fs_info->dev_root;
1723
1724 /* step one, make some room on all the devices */
1725 list_for_each(cur, devices) {
1726 device = list_entry(cur, struct btrfs_device, dev_list);
1727 old_size = device->total_bytes;
1728 size_to_free = div_factor(old_size, 1);
1729 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
1730 if (!device->writeable ||
1731 device->total_bytes - device->bytes_used > size_to_free)
1732 continue;
1733
1734 ret = btrfs_shrink_device(device, old_size - size_to_free);
1735 BUG_ON(ret);
1736
1737 trans = btrfs_start_transaction(dev_root, 1);
1738 BUG_ON(!trans);
1739
1740 ret = btrfs_grow_device(trans, device, old_size);
1741 BUG_ON(ret);
1742
1743 btrfs_end_transaction(trans, dev_root);
1744 }
1745
1746 /* step two, relocate all the chunks */
1747 path = btrfs_alloc_path();
1748 BUG_ON(!path);
1749
1750 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1751 key.offset = (u64)-1;
1752 key.type = BTRFS_CHUNK_ITEM_KEY;
1753
1754 while (1) {
1755 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
1756 if (ret < 0)
1757 goto error;
1758
1759 /*
1760 * this shouldn't happen, it means the last relocate
1761 * failed
1762 */
1763 if (ret == 0)
1764 break;
1765
1766 ret = btrfs_previous_item(chunk_root, path, 0,
1767 BTRFS_CHUNK_ITEM_KEY);
1768 if (ret)
1769 break;
1770
1771 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1772 path->slots[0]);
1773 if (found_key.objectid != key.objectid)
1774 break;
1775
1776 chunk = btrfs_item_ptr(path->nodes[0],
1777 path->slots[0],
1778 struct btrfs_chunk);
1779 key.offset = found_key.offset;
1780 /* chunk zero is special */
1781 if (key.offset == 0)
1782 break;
1783
1784 btrfs_release_path(chunk_root, path);
1785 ret = btrfs_relocate_chunk(chunk_root,
1786 chunk_root->root_key.objectid,
1787 found_key.objectid,
1788 found_key.offset);
1789 BUG_ON(ret);
1790 }
1791 ret = 0;
1792error:
1793 btrfs_free_path(path);
1794 mutex_unlock(&dev_root->fs_info->volume_mutex);
1795 return ret;
1796}
1797
1798/*
1799 * shrinking a device means finding all of the device extents past
1800 * the new size, and then following the back refs to the chunks.
1801 * The chunk relocation code actually frees the device extent.
1802 */
1803int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
1804{
1805 struct btrfs_trans_handle *trans;
1806 struct btrfs_root *root = device->dev_root;
1807 struct btrfs_dev_extent *dev_extent = NULL;
1808 struct btrfs_path *path;
1809 u64 length;
1810 u64 chunk_tree;
1811 u64 chunk_objectid;
1812 u64 chunk_offset;
1813 int ret;
1814 int slot;
1815 struct extent_buffer *l;
1816 struct btrfs_key key;
1817 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1818 u64 old_total = btrfs_super_total_bytes(super_copy);
1819 u64 diff = device->total_bytes - new_size;
1820
1821 if (new_size >= device->total_bytes)
1822 return -EINVAL;
1823
1824 path = btrfs_alloc_path();
1825 if (!path)
1826 return -ENOMEM;
1827
1828 trans = btrfs_start_transaction(root, 1);
1829 if (!trans) {
1830 ret = -ENOMEM;
1831 goto done;
1832 }
1833
1834 path->reada = 2;
1835
1836 lock_chunks(root);
1837
1838 device->total_bytes = new_size;
1839 if (device->writeable)
1840 device->fs_devices->total_rw_bytes -= diff;
1841 ret = btrfs_update_device(trans, device);
1842 if (ret) {
1843 unlock_chunks(root);
1844 btrfs_end_transaction(trans, root);
1845 goto done;
1846 }
1847 WARN_ON(diff > old_total);
1848 btrfs_set_super_total_bytes(super_copy, old_total - diff);
1849 unlock_chunks(root);
1850 btrfs_end_transaction(trans, root);
1851
1852 key.objectid = device->devid;
1853 key.offset = (u64)-1;
1854 key.type = BTRFS_DEV_EXTENT_KEY;
1855
1856 while (1) {
1857 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1858 if (ret < 0)
1859 goto done;
1860
1861 ret = btrfs_previous_item(root, path, 0, key.type);
1862 if (ret < 0)
1863 goto done;
1864 if (ret) {
1865 ret = 0;
1866 goto done;
1867 }
1868
1869 l = path->nodes[0];
1870 slot = path->slots[0];
1871 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
1872
1873 if (key.objectid != device->devid)
1874 goto done;
1875
1876 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1877 length = btrfs_dev_extent_length(l, dev_extent);
1878
1879 if (key.offset + length <= new_size)
1880 goto done;
1881
1882 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
1883 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
1884 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
1885 btrfs_release_path(root, path);
1886
1887 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
1888 chunk_offset);
1889 if (ret)
1890 goto done;
1891 }
1892
1893done:
1894 btrfs_free_path(path);
1895 return ret;
1896}
1897
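/*
 * append a copy of a system chunk item to the sys_chunk_array embedded
 * in the super block copy, so the chunk tree itself can be mapped at
 * mount time.
 */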
1898static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
1899 struct btrfs_root *root,
1900 struct btrfs_key *key,
1901 struct btrfs_chunk *chunk, int item_size)
1902{
1903 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1904 struct btrfs_disk_key disk_key;
1905 u32 array_size;
1906 u8 *ptr;
1907
1908 array_size = btrfs_super_sys_array_size(super_copy);
1909 if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
1910 return -EFBIG;
1911
1912 ptr = super_copy->sys_chunk_array + array_size;
1913 btrfs_cpu_key_to_disk(&disk_key, key);
1914 memcpy(ptr, &disk_key, sizeof(disk_key));
1915 ptr += sizeof(disk_key);
1916 memcpy(ptr, chunk, item_size);
1917 item_size += sizeof(disk_key);
1918 btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
1919 return 0;
1920}
1921
1922static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
1923 int num_stripes, int sub_stripes)
1924{
1925 if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
1926 return calc_size;
1927 else if (type & BTRFS_BLOCK_GROUP_RAID10)
1928 return calc_size * (num_stripes / sub_stripes);
1929 else
1930 return calc_size * num_stripes;
1931}
1932
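/*
 * worker for chunk allocation: pick a stripe count and stripe size for
 * @type, find free dev extents on enough writable devices and record the
 * resulting stripe map in the in-memory mapping tree.  The on-disk chunk
 * item is written later by __finish_chunk_alloc().
 */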
1933static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
1934 struct btrfs_root *extent_root,
1935 struct map_lookup **map_ret,
1936 u64 *num_bytes, u64 *stripe_size,
1937 u64 start, u64 type)
1938{
1939 struct btrfs_fs_info *info = extent_root->fs_info;
1940 struct btrfs_device *device = NULL;
1941 struct btrfs_fs_devices *fs_devices = info->fs_devices;
1942 struct list_head *cur;
1943 struct map_lookup *map = NULL;
1944 struct extent_map_tree *em_tree;
1945 struct extent_map *em;
1946 struct list_head private_devs;
1947 int min_stripe_size = 1 * 1024 * 1024;
1948 u64 calc_size = 1024 * 1024 * 1024;
1949 u64 max_chunk_size = calc_size;
1950 u64 min_free;
1951 u64 avail;
1952 u64 max_avail = 0;
1953 u64 dev_offset;
1954 int num_stripes = 1;
1955 int min_stripes = 1;
1956 int sub_stripes = 0;
1957 int looped = 0;
1958 int ret;
1959 int index;
1960 int stripe_len = 64 * 1024;
1961
1962 if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
1963 (type & BTRFS_BLOCK_GROUP_DUP)) {
1964 WARN_ON(1);
1965 type &= ~BTRFS_BLOCK_GROUP_DUP;
1966 }
1967 if (list_empty(&fs_devices->alloc_list))
1968 return -ENOSPC;
1969
1970 if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
1971 num_stripes = fs_devices->rw_devices;
1972 min_stripes = 2;
1973 }
1974 if (type & (BTRFS_BLOCK_GROUP_DUP)) {
1975 num_stripes = 2;
1976 min_stripes = 2;
1977 }
1978 if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
1979 num_stripes = min_t(u64, 2, fs_devices->rw_devices);
1980 if (num_stripes < 2)
1981 return -ENOSPC;
1982 min_stripes = 2;
1983 }
1984 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
1985 num_stripes = fs_devices->rw_devices;
1986 if (num_stripes < 4)
1987 return -ENOSPC;
1988 num_stripes &= ~(u32)1;
1989 sub_stripes = 2;
1990 min_stripes = 4;
1991 }
1992
1993 if (type & BTRFS_BLOCK_GROUP_DATA) {
1994 max_chunk_size = 10 * calc_size;
1995 min_stripe_size = 64 * 1024 * 1024;
1996 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
1997 max_chunk_size = 4 * calc_size;
1998 min_stripe_size = 32 * 1024 * 1024;
1999 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
2000 calc_size = 8 * 1024 * 1024;
2001 max_chunk_size = calc_size * 2;
2002 min_stripe_size = 1 * 1024 * 1024;
2003 }
2004
2005 /* we don't want a chunk larger than 10% of writeable space */
2006 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
2007 max_chunk_size);
2008
2009again:
2010 if (!map || map->num_stripes != num_stripes) {
2011 kfree(map);
2012 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2013 if (!map)
2014 return -ENOMEM;
2015 map->num_stripes = num_stripes;
2016 }
2017
2018 if (calc_size * num_stripes > max_chunk_size) {
2019 calc_size = max_chunk_size;
2020 do_div(calc_size, num_stripes);
2021 do_div(calc_size, stripe_len);
2022 calc_size *= stripe_len;
2023 }
2024 /* we don't want tiny stripes */
2025 calc_size = max_t(u64, min_stripe_size, calc_size);
2026
2027 do_div(calc_size, stripe_len);
2028 calc_size *= stripe_len;
2029
2030 cur = fs_devices->alloc_list.next;
2031 index = 0;
2032
2033 if (type & BTRFS_BLOCK_GROUP_DUP)
2034 min_free = calc_size * 2;
2035 else
2036 min_free = calc_size;
2037
2038 /*
2039 * we add 1MB because we never use the first 1MB of the device, unless
2040 * we've looped, in which case we are likely allocating the maximum
2041 * amount of space left already
2042 */
2043 if (!looped)
2044 min_free += 1024 * 1024;
2045
2046 INIT_LIST_HEAD(&private_devs);
2047 while (index < num_stripes) {
2048 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
2049 BUG_ON(!device->writeable);
2050 if (device->total_bytes > device->bytes_used)
2051 avail = device->total_bytes - device->bytes_used;
2052 else
2053 avail = 0;
2054 cur = cur->next;
2055
2056 if (device->in_fs_metadata && avail >= min_free) {
2057 ret = find_free_dev_extent(trans, device,
2058 min_free, &dev_offset);
2059 if (ret == 0) {
2060 list_move_tail(&device->dev_alloc_list,
2061 &private_devs);
2062 map->stripes[index].dev = device;
2063 map->stripes[index].physical = dev_offset;
2064 index++;
2065 if (type & BTRFS_BLOCK_GROUP_DUP) {
2066 map->stripes[index].dev = device;
2067 map->stripes[index].physical =
2068 dev_offset + calc_size;
2069 index++;
2070 }
2071 }
2072 } else if (device->in_fs_metadata && avail > max_avail)
2073 max_avail = avail;
2074 if (cur == &fs_devices->alloc_list)
2075 break;
2076 }
2077 list_splice(&private_devs, &fs_devices->alloc_list);
2078 if (index < num_stripes) {
2079 if (index >= min_stripes) {
2080 num_stripes = index;
2081 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2082 num_stripes /= sub_stripes;
2083 num_stripes *= sub_stripes;
2084 }
2085 looped = 1;
2086 goto again;
2087 }
2088 if (!looped && max_avail > 0) {
2089 looped = 1;
2090 calc_size = max_avail;
2091 goto again;
2092 }
2093 kfree(map);
2094 return -ENOSPC;
2095 }
2096 map->sector_size = extent_root->sectorsize;
2097 map->stripe_len = stripe_len;
2098 map->io_align = stripe_len;
2099 map->io_width = stripe_len;
2100 map->type = type;
2101 map->num_stripes = num_stripes;
2102 map->sub_stripes = sub_stripes;
2103
2104 *map_ret = map;
2105 *stripe_size = calc_size;
2106 *num_bytes = chunk_bytes_by_type(type, calc_size,
2107 num_stripes, sub_stripes);
2108
2109 em = alloc_extent_map(GFP_NOFS);
2110 if (!em) {
2111 kfree(map);
2112 return -ENOMEM;
2113 }
2114 em->bdev = (struct block_device *)map;
2115 em->start = start;
2116 em->len = *num_bytes;
2117 em->block_start = 0;
2118 em->block_len = em->len;
2119
2120 em_tree = &extent_root->fs_info->mapping_tree.map_tree;
2121 spin_lock(&em_tree->lock);
2122 ret = add_extent_mapping(em_tree, em);
2123 spin_unlock(&em_tree->lock);
2124 BUG_ON(ret);
2125 free_extent_map(em);
2126
2127 ret = btrfs_make_block_group(trans, extent_root, 0, type,
2128 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2129 start, *num_bytes);
2130 BUG_ON(ret);
2131
2132 index = 0;
2133 while (index < map->num_stripes) {
2134 device = map->stripes[index].dev;
2135 dev_offset = map->stripes[index].physical;
2136
2137 ret = btrfs_alloc_dev_extent(trans, device,
2138 info->chunk_root->root_key.objectid,
2139 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2140 start, dev_offset, calc_size);
2141 BUG_ON(ret);
2142 index++;
2143 }
2144
2145 return 0;
2146}
2147
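/*
 * second phase of chunk allocation: charge the stripe size to each
 * device, build the on-disk chunk item from the map and insert it into
 * the chunk tree (and into the sys_chunk_array for system chunks).
 */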
2148static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2149 struct btrfs_root *extent_root,
2150 struct map_lookup *map, u64 chunk_offset,
2151 u64 chunk_size, u64 stripe_size)
2152{
2153 u64 dev_offset;
2154 struct btrfs_key key;
2155 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2156 struct btrfs_device *device;
2157 struct btrfs_chunk *chunk;
2158 struct btrfs_stripe *stripe;
2159 size_t item_size = btrfs_chunk_item_size(map->num_stripes);
2160 int index = 0;
2161 int ret;
2162
2163 chunk = kzalloc(item_size, GFP_NOFS);
2164 if (!chunk)
2165 return -ENOMEM;
2166
2167 index = 0;
2168 while (index < map->num_stripes) {
2169 device = map->stripes[index].dev;
2170 device->bytes_used += stripe_size;
2171 ret = btrfs_update_device(trans, device);
2172 BUG_ON(ret);
2173 index++;
2174 }
2175
2176 index = 0;
2177 stripe = &chunk->stripe;
2178 while (index < map->num_stripes) {
2179 device = map->stripes[index].dev;
2180 dev_offset = map->stripes[index].physical;
2181
2182 btrfs_set_stack_stripe_devid(stripe, device->devid);
2183 btrfs_set_stack_stripe_offset(stripe, dev_offset);
2184 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
2185 stripe++;
2186 index++;
2187 }
2188
2189 btrfs_set_stack_chunk_length(chunk, chunk_size);
2190 btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
2191 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
2192 btrfs_set_stack_chunk_type(chunk, map->type);
2193 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
2194 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
2195 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
2196 btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
2197 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
2198
2199 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2200 key.type = BTRFS_CHUNK_ITEM_KEY;
2201 key.offset = chunk_offset;
2202
2203 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
2204 BUG_ON(ret);
2205
2206 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2207 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
2208 item_size);
2209 BUG_ON(ret);
2210 }
2211 kfree(chunk);
2212 return 0;
2213}
2214
2215/*
2216 * Chunk allocation falls into two parts. The first part does the work
2217 * that makes the newly allocated chunk usable, but does not perform any
2218 * operation that modifies the chunk tree. The second part does the work
2219 * that requires modifying the chunk tree. This division is important for the
2220 * bootstrap process of adding storage to a seed btrfs.
2221 */
2222int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2223 struct btrfs_root *extent_root, u64 type)
2224{
2225 u64 chunk_offset;
2226 u64 chunk_size;
2227 u64 stripe_size;
2228 struct map_lookup *map;
2229 struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2230 int ret;
2231
2232 ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2233 &chunk_offset);
2234 if (ret)
2235 return ret;
2236
2237 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2238 &stripe_size, chunk_offset, type);
2239 if (ret)
2240 return ret;
2241
2242 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2243 chunk_size, stripe_size);
2244 BUG_ON(ret);
2245 return 0;
2246}
2247
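/*
 * bootstrap a freshly sprouted filesystem: allocate the first metadata
 * and system chunks on the new writable device, then write out both
 * chunk items once the two block groups exist.
 */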
2248static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2249 struct btrfs_root *root,
2250 struct btrfs_device *device)
2251{
2252 u64 chunk_offset;
2253 u64 sys_chunk_offset;
2254 u64 chunk_size;
2255 u64 sys_chunk_size;
2256 u64 stripe_size;
2257 u64 sys_stripe_size;
2258 u64 alloc_profile;
2259 struct map_lookup *map;
2260 struct map_lookup *sys_map;
2261 struct btrfs_fs_info *fs_info = root->fs_info;
2262 struct btrfs_root *extent_root = fs_info->extent_root;
2263 int ret;
2264
2265 ret = find_next_chunk(fs_info->chunk_root,
2266 BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
2267 BUG_ON(ret);
2268
2269 alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
2270 (fs_info->metadata_alloc_profile &
2271 fs_info->avail_metadata_alloc_bits);
2272 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2273
2274 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2275 &stripe_size, chunk_offset, alloc_profile);
2276 BUG_ON(ret);
2277
2278 sys_chunk_offset = chunk_offset + chunk_size;
2279
2280 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
2281 (fs_info->system_alloc_profile &
2282 fs_info->avail_system_alloc_bits);
2283 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2284
2285 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
2286 &sys_chunk_size, &sys_stripe_size,
2287 sys_chunk_offset, alloc_profile);
2288 BUG_ON(ret);
2289
2290 ret = btrfs_add_device(trans, fs_info->chunk_root, device);
2291 BUG_ON(ret);
2292
2293 /*
2294 * Modifying the chunk tree requires allocating new blocks from both
2295 * the system block group and the metadata block group, so we can
2296 * only perform operations that modify the chunk tree after both
2297 * block groups have been created.
2298 */
2299 ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2300 chunk_size, stripe_size);
2301 BUG_ON(ret);
2302
2303 ret = __finish_chunk_alloc(trans, extent_root, sys_map,
2304 sys_chunk_offset, sys_chunk_size,
2305 sys_stripe_size);
2306 BUG_ON(ret);
2307 return 0;
2308}
2309
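/*
 * a chunk is considered read only if any of the devices backing its
 * stripes is not writable, or if no mapping exists for it at all.
 */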
2310int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
2311{
2312 struct extent_map *em;
2313 struct map_lookup *map;
2314 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
2315 int readonly = 0;
2316 int i;
2317
2318 spin_lock(&map_tree->map_tree.lock);
2319 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2320 spin_unlock(&map_tree->map_tree.lock);
2321 if (!em)
2322 return 1;
2323
2324 map = (struct map_lookup *)em->bdev;
2325 for (i = 0; i < map->num_stripes; i++) {
2326 if (!map->stripes[i].dev->writeable) {
2327 readonly = 1;
2328 break;
2329 }
2330 }
2331 free_extent_map(em);
2332 return readonly;
2333}
2334
2335void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
2336{
2337 extent_map_tree_init(&tree->map_tree, GFP_NOFS);
2338}
2339
2340void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
2341{
2342 struct extent_map *em;
2343
2344 while (1) {
2345 spin_lock(&tree->map_tree.lock);
2346 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
2347 if (em)
2348 remove_extent_mapping(&tree->map_tree, em);
2349 spin_unlock(&tree->map_tree.lock);
2350 if (!em)
2351 break;
2352 kfree(em->bdev);
2353 /* once for us */
2354 free_extent_map(em);
2355 /* once for the tree */
2356 free_extent_map(em);
2357 }
2358}
2359
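/*
 * return how many copies of the data at @logical exist: num_stripes for
 * raid1/dup, sub_stripes for raid10 and 1 for everything else.
 */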
2360int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
2361{
2362 struct extent_map *em;
2363 struct map_lookup *map;
2364 struct extent_map_tree *em_tree = &map_tree->map_tree;
2365 int ret;
2366
2367 spin_lock(&em_tree->lock);
2368 em = lookup_extent_mapping(em_tree, logical, len);
2369 spin_unlock(&em_tree->lock);
2370 BUG_ON(!em);
2371
2372 BUG_ON(em->start > logical || em->start + em->len < logical);
2373 map = (struct map_lookup *)em->bdev;
2374 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
2375 ret = map->num_stripes;
2376 else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
2377 ret = map->sub_stripes;
2378 else
2379 ret = 1;
2380 free_extent_map(em);
2381 return ret;
2382}
2383
2384static int find_live_mirror(struct map_lookup *map, int first, int num,
2385 int optimal)
2386{
2387 int i;
2388 if (map->stripes[optimal].dev->bdev)
2389 return optimal;
2390 for (i = first; i < first + num; i++) {
2391 if (map->stripes[i].dev->bdev)
2392 return i;
2393 }
2394 /* we couldn't find one that doesn't fail. Just return something
2395 * and the io error handling code will clean up eventually
2396 */
2397 return optimal;
2398}
2399
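/*
 * map a logical byte range to the physical stripes it lives on.  For
 * writes every mirror is returned; for reads a single live mirror is
 * chosen, honoring @mirror_num when it is set.  When @unplug_page is
 * given, the backing devices are unplugged instead of building a
 * multi_bio.
 */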
2400static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2401 u64 logical, u64 *length,
2402 struct btrfs_multi_bio **multi_ret,
2403 int mirror_num, struct page *unplug_page)
2404{
2405 struct extent_map *em;
2406 struct map_lookup *map;
2407 struct extent_map_tree *em_tree = &map_tree->map_tree;
2408 u64 offset;
2409 u64 stripe_offset;
2410 u64 stripe_nr;
2411 int stripes_allocated = 8;
2412 int stripes_required = 1;
2413 int stripe_index;
2414 int i;
2415 int num_stripes;
2416 int max_errors = 0;
2417 struct btrfs_multi_bio *multi = NULL;
2418
2419 if (multi_ret && !(rw & (1 << BIO_RW)))
2420 stripes_allocated = 1;
2421again:
2422 if (multi_ret) {
2423 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
2424 GFP_NOFS);
2425 if (!multi)
2426 return -ENOMEM;
2427
2428 atomic_set(&multi->error, 0);
2429 }
2430
2431 spin_lock(&em_tree->lock);
2432 em = lookup_extent_mapping(em_tree, logical, *length);
2433 spin_unlock(&em_tree->lock);
2434
2435 if (!em && unplug_page) {
kfree(multi);
2436 return 0;
}
2437
2438 if (!em) {
2439 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
2440 (unsigned long long)logical,
2441 (unsigned long long)*length);
2442 BUG();
2443 }
2444
2445 BUG_ON(em->start > logical || em->start + em->len < logical);
2446 map = (struct map_lookup *)em->bdev;
2447 offset = logical - em->start;
2448
2449 if (mirror_num > map->num_stripes)
2450 mirror_num = 0;
2451
2452 /* if our multi bio struct is too small, back off and try again */
2453 if (rw & (1 << BIO_RW)) {
2454 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2455 BTRFS_BLOCK_GROUP_DUP)) {
2456 stripes_required = map->num_stripes;
2457 max_errors = 1;
2458 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2459 stripes_required = map->sub_stripes;
2460 max_errors = 1;
2461 }
2462 }
2463 if (multi_ret && rw == WRITE &&
2464 stripes_allocated < stripes_required) {
2465 stripes_allocated = map->num_stripes;
2466 free_extent_map(em);
2467 kfree(multi);
2468 goto again;
2469 }
2470 stripe_nr = offset;
2471 /*
2472 * stripe_nr counts the total number of stripes we have to stride
2473 * to get to this block
2474 */
2475 do_div(stripe_nr, map->stripe_len);
2476
2477 stripe_offset = stripe_nr * map->stripe_len;
2478 BUG_ON(offset < stripe_offset);
2479
2480 /* stripe_offset is the offset of this block in its stripe */
2481 stripe_offset = offset - stripe_offset;
2482
2483 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
2484 BTRFS_BLOCK_GROUP_RAID10 |
2485 BTRFS_BLOCK_GROUP_DUP)) {
2486 /* we limit the length of each bio to what fits in a stripe */
2487 *length = min_t(u64, em->len - offset,
2488 map->stripe_len - stripe_offset);
2489 } else {
2490 *length = em->len - offset;
2491 }
2492
2493 if (!multi_ret && !unplug_page)
2494 goto out;
2495
2496 num_stripes = 1;
2497 stripe_index = 0;
2498 if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2499 if (unplug_page || (rw & (1 << BIO_RW)))
2500 num_stripes = map->num_stripes;
2501 else if (mirror_num)
2502 stripe_index = mirror_num - 1;
2503 else {
2504 stripe_index = find_live_mirror(map, 0,
2505 map->num_stripes,
2506 current->pid % map->num_stripes);
2507 }
2508
2509 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2510 if (rw & (1 << BIO_RW))
2511 num_stripes = map->num_stripes;
2512 else if (mirror_num)
2513 stripe_index = mirror_num - 1;
2514
2515 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2516 int factor = map->num_stripes / map->sub_stripes;
2517
2518 stripe_index = do_div(stripe_nr, factor);
2519 stripe_index *= map->sub_stripes;
2520
2521 if (unplug_page || (rw & (1 << BIO_RW)))
2522 num_stripes = map->sub_stripes;
2523 else if (mirror_num)
2524 stripe_index += mirror_num - 1;
2525 else {
2526 stripe_index = find_live_mirror(map, stripe_index,
2527 map->sub_stripes, stripe_index +
2528 current->pid % map->sub_stripes);
2529 }
2530 } else {
2531 /*
2532 * after this do_div call, stripe_nr is the number of stripes
2533 * on this device we have to walk to find the data, and
2534 * stripe_index is the number of our device in the stripe array
2535 */
2536 stripe_index = do_div(stripe_nr, map->num_stripes);
2537 }
2538 BUG_ON(stripe_index >= map->num_stripes);
2539
2540 for (i = 0; i < num_stripes; i++) {
2541 if (unplug_page) {
2542 struct btrfs_device *device;
2543 struct backing_dev_info *bdi;
2544
2545 device = map->stripes[stripe_index].dev;
2546 if (device->bdev) {
2547 bdi = blk_get_backing_dev_info(device->bdev);
2548 if (bdi->unplug_io_fn)
2549 bdi->unplug_io_fn(bdi, unplug_page);
2550 }
2551 } else {
2552 multi->stripes[i].physical =
2553 map->stripes[stripe_index].physical +
2554 stripe_offset + stripe_nr * map->stripe_len;
2555 multi->stripes[i].dev = map->stripes[stripe_index].dev;
2556 }
2557 stripe_index++;
2558 }
2559 if (multi_ret) {
2560 *multi_ret = multi;
2561 multi->num_stripes = num_stripes;
2562 multi->max_errors = max_errors;
2563 }
2564out:
2565 free_extent_map(em);
2566 return 0;
2567}
2568
2569int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2570 u64 logical, u64 *length,
2571 struct btrfs_multi_bio **multi_ret, int mirror_num)
2572{
2573 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
2574 mirror_num, NULL);
2575}
2576
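/*
 * reverse map: given a physical offset on a device, collect the logical
 * addresses (one per stripe copy) that map to it within the chunk
 * starting at @chunk_start.
 */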
2577int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
2578 u64 chunk_start, u64 physical, u64 devid,
2579 u64 **logical, int *naddrs, int *stripe_len)
2580{
2581 struct extent_map_tree *em_tree = &map_tree->map_tree;
2582 struct extent_map *em;
2583 struct map_lookup *map;
2584 u64 *buf;
2585 u64 bytenr;
2586 u64 length;
2587 u64 stripe_nr;
2588 int i, j, nr = 0;
2589
2590 spin_lock(&em_tree->lock);
2591 em = lookup_extent_mapping(em_tree, chunk_start, 1);
2592 spin_unlock(&em_tree->lock);
2593
2594 BUG_ON(!em || em->start != chunk_start);
2595 map = (struct map_lookup *)em->bdev;
2596
2597 length = em->len;
2598 if (map->type & BTRFS_BLOCK_GROUP_RAID10)
2599 do_div(length, map->num_stripes / map->sub_stripes);
2600 else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
2601 do_div(length, map->num_stripes);
2602
2603 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
2604 BUG_ON(!buf);
2605
2606 for (i = 0; i < map->num_stripes; i++) {
2607 if (devid && map->stripes[i].dev->devid != devid)
2608 continue;
2609 if (map->stripes[i].physical > physical ||
2610 map->stripes[i].physical + length <= physical)
2611 continue;
2612
2613 stripe_nr = physical - map->stripes[i].physical;
2614 do_div(stripe_nr, map->stripe_len);
2615
2616 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2617 stripe_nr = stripe_nr * map->num_stripes + i;
2618 do_div(stripe_nr, map->sub_stripes);
2619 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2620 stripe_nr = stripe_nr * map->num_stripes + i;
2621 }
2622 bytenr = chunk_start + stripe_nr * map->stripe_len;
2623 WARN_ON(nr >= map->num_stripes);
2624 for (j = 0; j < nr; j++) {
2625 if (buf[j] == bytenr)
2626 break;
2627 }
2628 if (j == nr) {
2629 WARN_ON(nr >= map->num_stripes);
2630 buf[nr++] = bytenr;
2631 }
2632 }
2633
2634 for (i = 0; i < nr; i++) {
2635 struct btrfs_multi_bio *multi;
2636 struct btrfs_bio_stripe *stripe;
2637 int ret;
2638
2639 length = 1;
2640 ret = btrfs_map_block(map_tree, WRITE, buf[i],
2641 &length, &multi, 0);
2642 BUG_ON(ret);
2643
2644 stripe = multi->stripes;
2645 for (j = 0; j < multi->num_stripes; j++) {
2646 if (physical >= stripe->physical &&
2647 physical < stripe->physical + length)
2648 break;
2649 }
2650 BUG_ON(j >= multi->num_stripes);
2651 kfree(multi);
2652 }
2653
2654 *logical = buf;
2655 *naddrs = nr;
2656 *stripe_len = map->stripe_len;
2657
2658 free_extent_map(em);
2659 return 0;
2660}
2661
2662int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
2663 u64 logical, struct page *page)
2664{
2665 u64 length = PAGE_CACHE_SIZE;
2666 return __btrfs_map_block(map_tree, READ, logical, &length,
2667 NULL, 0, page);
2668}
2669
2670static void end_bio_multi_stripe(struct bio *bio, int err)
2671{
2672 struct btrfs_multi_bio *multi = bio->bi_private;
2673 int is_orig_bio = 0;
2674
2675 if (err)
2676 atomic_inc(&multi->error);
2677
2678 if (bio == multi->orig_bio)
2679 is_orig_bio = 1;
2680
2681 if (atomic_dec_and_test(&multi->stripes_pending)) {
2682 if (!is_orig_bio) {
2683 bio_put(bio);
2684 bio = multi->orig_bio;
2685 }
2686 bio->bi_private = multi->private;
2687 bio->bi_end_io = multi->end_io;
2688 /* only send an error to the higher layers if it is
2689 * beyond the tolerance of the multi-bio
2690 */
2691 if (atomic_read(&multi->error) > multi->max_errors) {
2692 err = -EIO;
2693 } else if (err) {
2694 /*
2695 * this bio is actually up to date, we didn't
2696 * go over the max number of errors
2697 */
2698 set_bit(BIO_UPTODATE, &bio->bi_flags);
2699 err = 0;
2700 }
2701 kfree(multi);
2702
2703 bio_endio(bio, err);
2704 } else if (!is_orig_bio) {
2705 bio_put(bio);
2706 }
2707}
2708
2709struct async_sched {
2710 struct bio *bio;
2711 int rw;
2712 struct btrfs_fs_info *info;
2713 struct btrfs_work work;
2714};
2715
2716/*
2717 * see run_scheduled_bios for a description of why bios are collected for
2718 * async submit.
2719 *
2720 * This will add one bio to the pending list for a device and make sure
2721 * the work struct is scheduled.
2722 */
2723static noinline int schedule_bio(struct btrfs_root *root,
2724 struct btrfs_device *device,
2725 int rw, struct bio *bio)
2726{
2727 int should_queue = 1;
2728
2729 /* don't bother with additional async steps for reads, right now */
2730 if (!(rw & (1 << BIO_RW))) {
2731 bio_get(bio);
2732 submit_bio(rw, bio);
2733 bio_put(bio);
2734 return 0;
2735 }
2736
2737 /*
2738 * nr_async_bios allows us to reliably return congestion to the
2739 * higher layers. Otherwise, the async bio makes it appear we have
2740 * made progress against dirty pages when we've really just put it
2741 * on a queue for later
2742 */
2743 atomic_inc(&root->fs_info->nr_async_bios);
2744 WARN_ON(bio->bi_next);
2745 bio->bi_next = NULL;
2746 bio->bi_rw |= rw;
2747
2748 spin_lock(&device->io_lock);
2749
2750 if (device->pending_bio_tail)
2751 device->pending_bio_tail->bi_next = bio;
2752
2753 device->pending_bio_tail = bio;
2754 if (!device->pending_bios)
2755 device->pending_bios = bio;
2756 if (device->running_pending)
2757 should_queue = 0;
2758
2759 spin_unlock(&device->io_lock);
2760
2761 if (should_queue)
2762 btrfs_queue_worker(&root->fs_info->submit_workers,
2763 &device->work);
2764 return 0;
2765}
2766
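/*
 * main entry point for sending a bio down to the right devices.  The bio
 * is mapped through __btrfs_map_block(), cloned once per extra stripe,
 * and either submitted directly or queued for the async submit workers.
 */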
2767int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
2768 int mirror_num, int async_submit)
2769{
2770 struct btrfs_mapping_tree *map_tree;
2771 struct btrfs_device *dev;
2772 struct bio *first_bio = bio;
2773 u64 logical = (u64)bio->bi_sector << 9;
2774 u64 length = 0;
2775 u64 map_length;
2776 struct btrfs_multi_bio *multi = NULL;
2777 int ret;
2778 int dev_nr = 0;
2779 int total_devs = 1;
2780
2781 length = bio->bi_size;
2782 map_tree = &root->fs_info->mapping_tree;
2783 map_length = length;
2784
2785 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
2786 mirror_num);
2787 BUG_ON(ret);
2788
2789 total_devs = multi->num_stripes;
2790 if (map_length < length) {
2791 printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
2792 "len %llu\n", (unsigned long long)logical,
2793 (unsigned long long)length,
2794 (unsigned long long)map_length);
2795 BUG();
2796 }
2797 multi->end_io = first_bio->bi_end_io;
2798 multi->private = first_bio->bi_private;
2799 multi->orig_bio = first_bio;
2800 atomic_set(&multi->stripes_pending, multi->num_stripes);
2801
2802 while (dev_nr < total_devs) {
2803 if (total_devs > 1) {
2804 if (dev_nr < total_devs - 1) {
2805 bio = bio_clone(first_bio, GFP_NOFS);
2806 BUG_ON(!bio);
2807 } else {
2808 bio = first_bio;
2809 }
2810 bio->bi_private = multi;
2811 bio->bi_end_io = end_bio_multi_stripe;
2812 }
2813 bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
2814 dev = multi->stripes[dev_nr].dev;
2815 BUG_ON(rw == WRITE && !dev->writeable);
2816 if (dev && dev->bdev) {
2817 bio->bi_bdev = dev->bdev;
2818 if (async_submit)
2819 schedule_bio(root, dev, rw, bio);
2820 else
2821 submit_bio(rw, bio);
2822 } else {
2823 bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
2824 bio->bi_sector = logical >> 9;
2825 bio_endio(bio, -EIO);
2826 }
2827 dev_nr++;
2828 }
2829 if (total_devs == 1)
2830 kfree(multi);
2831 return 0;
2832}
2833
2834struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
2835 u8 *uuid, u8 *fsid)
2836{
2837 struct btrfs_device *device;
2838 struct btrfs_fs_devices *cur_devices;
2839
2840 cur_devices = root->fs_info->fs_devices;
2841 while (cur_devices) {
2842 if (!fsid ||
2843 !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
2844 device = __find_device(&cur_devices->devices,
2845 devid, uuid);
2846 if (device)
2847 return device;
2848 }
2849 cur_devices = cur_devices->seed;
2850 }
2851 return NULL;
2852}
2853
2854static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
2855 u64 devid, u8 *dev_uuid)
2856{
2857 struct btrfs_device *device;
2858 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
2859
2860 device = kzalloc(sizeof(*device), GFP_NOFS);
2861 if (!device)
2862 return NULL;
2863 list_add(&device->dev_list,
2864 &fs_devices->devices);
2865 device->barriers = 1;
2866 device->dev_root = root->fs_info->dev_root;
2867 device->devid = devid;
2868 device->work.func = pending_bios_fn;
2869 device->fs_devices = fs_devices;
2870 fs_devices->num_devices++;
2871 spin_lock_init(&device->io_lock);
2872 INIT_LIST_HEAD(&device->dev_alloc_list);
2873 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
2874 return device;
2875}
2876
2877static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
2878 struct extent_buffer *leaf,
2879 struct btrfs_chunk *chunk)
2880{
2881 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
2882 struct map_lookup *map;
2883 struct extent_map *em;
2884 u64 logical;
2885 u64 length;
2886 u64 devid;
2887 u8 uuid[BTRFS_UUID_SIZE];
2888 int num_stripes;
2889 int ret;
2890 int i;
2891
2892 logical = key->offset;
2893 length = btrfs_chunk_length(leaf, chunk);
2894
2895 spin_lock(&map_tree->map_tree.lock);
2896 em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
2897 spin_unlock(&map_tree->map_tree.lock);
2898
2899 /* already mapped? */
2900 if (em && em->start <= logical && em->start + em->len > logical) {
2901 free_extent_map(em);
2902 return 0;
2903 } else if (em) {
2904 free_extent_map(em);
2905 }
2906
2911 em = alloc_extent_map(GFP_NOFS);
2912 if (!em)
2913 return -ENOMEM;
2914 num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2915 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2916 if (!map) {
2917 free_extent_map(em);
2918 return -ENOMEM;
2919 }
2920
2921 em->bdev = (struct block_device *)map;
2922 em->start = logical;
2923 em->len = length;
2924 em->block_start = 0;
2925 em->block_len = em->len;
2926
2927 map->num_stripes = num_stripes;
2928 map->io_width = btrfs_chunk_io_width(leaf, chunk);
2929 map->io_align = btrfs_chunk_io_align(leaf, chunk);
2930 map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
2931 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
2932 map->type = btrfs_chunk_type(leaf, chunk);
2933 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
2934 for (i = 0; i < num_stripes; i++) {
2935 map->stripes[i].physical =
2936 btrfs_stripe_offset_nr(leaf, chunk, i);
2937 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
2938 read_extent_buffer(leaf, uuid, (unsigned long)
2939 btrfs_stripe_dev_uuid_nr(chunk, i),
2940 BTRFS_UUID_SIZE);
2941 map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
2942 NULL);
2943 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
2944 kfree(map);
2945 free_extent_map(em);
2946 return -EIO;
2947 }
2948 if (!map->stripes[i].dev) {
2949 map->stripes[i].dev =
2950 add_missing_dev(root, devid, uuid);
2951 if (!map->stripes[i].dev) {
2952 kfree(map);
2953 free_extent_map(em);
2954 return -EIO;
2955 }
2956 }
2957 map->stripes[i].dev->in_fs_metadata = 1;
2958 }
2959
2960 spin_lock(&map_tree->map_tree.lock);
2961 ret = add_extent_mapping(&map_tree->map_tree, em);
2962 spin_unlock(&map_tree->map_tree.lock);
2963 BUG_ON(ret);
2964 free_extent_map(em);
2965
2966 return 0;
2967}
2968
2969static int fill_device_from_item(struct extent_buffer *leaf,
2970 struct btrfs_dev_item *dev_item,
2971 struct btrfs_device *device)
2972{
2973 unsigned long ptr;
2974
2975 device->devid = btrfs_device_id(leaf, dev_item);
2976 device->total_bytes = btrfs_device_total_bytes(leaf, dev_item);
2977 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
2978 device->type = btrfs_device_type(leaf, dev_item);
2979 device->io_align = btrfs_device_io_align(leaf, dev_item);
2980 device->io_width = btrfs_device_io_width(leaf, dev_item);
2981 device->sector_size = btrfs_device_sector_size(leaf, dev_item);
2982
2983 ptr = (unsigned long)btrfs_device_uuid(dev_item);
2984 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
2985
2986 return 0;
2987}
2988
2989static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
2990{
2991 struct btrfs_fs_devices *fs_devices;
2992 int ret;
2993
2994 mutex_lock(&uuid_mutex);
2995
2996 fs_devices = root->fs_info->fs_devices->seed;
2997 while (fs_devices) {
2998 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
2999 ret = 0;
3000 goto out;
3001 }
3002 fs_devices = fs_devices->seed;
3003 }
3004
3005 fs_devices = find_fsid(fsid);
3006 if (!fs_devices) {
3007 ret = -ENOENT;
3008 goto out;
3009 }
3010
3011 fs_devices = clone_fs_devices(fs_devices);
3012 if (IS_ERR(fs_devices)) {
3013 ret = PTR_ERR(fs_devices);
3014 goto out;
3015 }
3016
3017 ret = __btrfs_open_devices(fs_devices, FMODE_READ,
3018 root->fs_info->bdev_holder);
3019 if (ret)
3020 goto out;
3021
3022 if (!fs_devices->seeding) {
3023 __btrfs_close_devices(fs_devices);
3024 free_fs_devices(fs_devices);
3025 ret = -EINVAL;
3026 goto out;
3027 }
3028
3029 fs_devices->seed = root->fs_info->fs_devices->seed;
3030 root->fs_info->fs_devices->seed = fs_devices;
3031out:
3032 mutex_unlock(&uuid_mutex);
3033 return ret;
3034}
3035
3036static int read_one_dev(struct btrfs_root *root,
3037 struct extent_buffer *leaf,
3038 struct btrfs_dev_item *dev_item)
3039{
3040 struct btrfs_device *device;
3041 u64 devid;
3042 int ret;
3043 u8 fs_uuid[BTRFS_UUID_SIZE];
3044 u8 dev_uuid[BTRFS_UUID_SIZE];
3045
3046 devid = btrfs_device_id(leaf, dev_item);
3047 read_extent_buffer(leaf, dev_uuid,
3048 (unsigned long)btrfs_device_uuid(dev_item),
3049 BTRFS_UUID_SIZE);
3050 read_extent_buffer(leaf, fs_uuid,
3051 (unsigned long)btrfs_device_fsid(dev_item),
3052 BTRFS_UUID_SIZE);
3053
3054 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
3055 ret = open_seed_devices(root, fs_uuid);
3056 if (ret && !btrfs_test_opt(root, DEGRADED))
3057 return ret;
3058 }
3059
3060 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
3061 if (!device || !device->bdev) {
3062 if (!btrfs_test_opt(root, DEGRADED))
3063 return -EIO;
3064
3065 if (!device) {
3066 printk(KERN_WARNING "devid %llu missing\n",
3067 (unsigned long long)devid);
3068 device = add_missing_dev(root, devid, dev_uuid);
3069 if (!device)
3070 return -ENOMEM;
3071 }
3072 }
3073
3074 if (device->fs_devices != root->fs_info->fs_devices) {
3075 BUG_ON(device->writeable);
3076 if (device->generation !=
3077 btrfs_device_generation(leaf, dev_item))
3078 return -EINVAL;
3079 }
3080
3081 fill_device_from_item(leaf, dev_item, device);
3082 device->dev_root = root->fs_info->dev_root;
3083 device->in_fs_metadata = 1;
3084 if (device->writeable)
3085 device->fs_devices->total_rw_bytes += device->total_bytes;
3086 ret = 0;
3087 return ret;
3088}
3089
3090int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
3091{
3092 struct btrfs_dev_item *dev_item;
3093
3094 dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
3095 dev_item);
3096 return read_one_dev(root, buf, dev_item);
3097}
3098
3099int btrfs_read_sys_array(struct btrfs_root *root)
3100{
3101 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
3102 struct extent_buffer *sb;
3103 struct btrfs_disk_key *disk_key;
3104 struct btrfs_chunk *chunk;
3105 u8 *ptr;
3106 unsigned long sb_ptr;
3107 int ret = 0;
3108 u32 num_stripes;
3109 u32 array_size;
3110 u32 len = 0;
3111 u32 cur;
3112 struct btrfs_key key;
3113
3114 sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
3115 BTRFS_SUPER_INFO_SIZE);
3116 if (!sb)
3117 return -ENOMEM;
3118 btrfs_set_buffer_uptodate(sb);
3119 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
3120 array_size = btrfs_super_sys_array_size(super_copy);
3121
3122 ptr = super_copy->sys_chunk_array;
3123 sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
3124 cur = 0;
3125
3126 while (cur < array_size) {
3127 disk_key = (struct btrfs_disk_key *)ptr;
3128 btrfs_disk_key_to_cpu(&key, disk_key);
3129
3130 len = sizeof(*disk_key); ptr += len;
3131 sb_ptr += len;
3132 cur += len;
3133
3134 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
3135 chunk = (struct btrfs_chunk *)sb_ptr;
3136 ret = read_one_chunk(root, &key, sb, chunk);
3137 if (ret)
3138 break;
3139 num_stripes = btrfs_chunk_num_stripes(sb, chunk);
3140 len = btrfs_chunk_item_size(num_stripes);
3141 } else {
3142 ret = -EIO;
3143 break;
3144 }
3145 ptr += len;
3146 sb_ptr += len;
3147 cur += len;
3148 }
3149 free_extent_buffer(sb);
3150 return ret;
3151}
3152
3153int btrfs_read_chunk_tree(struct btrfs_root *root)
3154{
3155 struct btrfs_path *path;
3156 struct extent_buffer *leaf;
3157 struct btrfs_key key;
3158 struct btrfs_key found_key;
3159 int ret;
3160 int slot;
3161
3162 root = root->fs_info->chunk_root;
3163
3164 path = btrfs_alloc_path();
3165 if (!path)
3166 return -ENOMEM;
3167
3168 /* first we search for all of the device items, and then we
3169 * read in all of the chunk items. This way we can create chunk
3170 * mappings that reference all of the devices that are found
3171 */
3172 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
3173 key.offset = 0;
3174 key.type = 0;
3175again:
3176 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3177 while (1) {
3178 leaf = path->nodes[0];
3179 slot = path->slots[0];
3180 if (slot >= btrfs_header_nritems(leaf)) {
3181 ret = btrfs_next_leaf(root, path);
3182 if (ret == 0)
3183 continue;
3184 if (ret < 0)
3185 goto error;
3186 break;
3187 }
3188 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3189 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3190 if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
3191 break;
3192 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
3193 struct btrfs_dev_item *dev_item;
3194 dev_item = btrfs_item_ptr(leaf, slot,
3195 struct btrfs_dev_item);
3196 ret = read_one_dev(root, leaf, dev_item);
3197 if (ret)
3198 goto error;
3199 }
3200 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
3201 struct btrfs_chunk *chunk;
3202 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3203 ret = read_one_chunk(root, &found_key, leaf, chunk);
3204 if (ret)
3205 goto error;
3206 }
3207 path->slots[0]++;
3208 }
3209 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3210 key.objectid = 0;
3211 btrfs_release_path(root, path);
3212 goto again;
3213 }
3214 ret = 0;
3215error:
3216 btrfs_free_path(path);
3217 return ret;
3218}
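
btrfs_map_bio() above fans one logical bio out across the stripes returned by btrfs_map_block(), cloning the bio for all but the last stripe, while end_bio_multi_stripe() earlier in this file counts completions and errors so the original bio completes exactly once, succeeding as long as no more than max_errors stripes failed. A user-space sketch of that clone-count-complete pattern (C11 atomics; the names and the two-mirror driver are illustrative, not kernel code):

#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct multi_req {
	atomic_int pending;	/* clones still in flight */
	atomic_int errors;	/* clones that failed */
	int max_errors;		/* tolerated failures, e.g. 1 for RAID1 */
	void (*end_io)(int err);/* the original completion callback */
};

/* called once per clone as it completes; err is nonzero on failure */
static void clone_end_io(struct multi_req *multi, int err)
{
	if (err)
		atomic_fetch_add(&multi->errors, 1);
	/* the last clone to finish reports the aggregate result */
	if (atomic_fetch_sub(&multi->pending, 1) == 1) {
		int final = atomic_load(&multi->errors) > multi->max_errors
				? -EIO : 0;
		multi->end_io(final);
		free(multi);
	}
}

static void orig_done(int err)
{
	printf("original request completed: %d\n", err);
}

int main(void)
{
	struct multi_req *multi = malloc(sizeof(*multi));

	atomic_init(&multi->pending, 2);	/* two mirrored stripes */
	atomic_init(&multi->errors, 0);
	multi->max_errors = 1;
	multi->end_io = orig_done;

	clone_end_io(multi, -EIO);	/* one mirror fails ... */
	clone_end_io(multi, 0);		/* ... the other succeeds: reports 0 */
	return 0;
}
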
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
new file mode 100644
index 000000000000..86c44e9ae110
--- /dev/null
+++ b/fs/btrfs/volumes.h
@@ -0,0 +1,162 @@
1/*
2 * Copyright (C) 2007 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_VOLUMES_
20#define __BTRFS_VOLUMES_
21
22#include <linux/bio.h>
23#include "async-thread.h"
24
25struct buffer_head;
26struct btrfs_device {
27 struct list_head dev_list;
28 struct list_head dev_alloc_list;
29 struct btrfs_fs_devices *fs_devices;
30 struct btrfs_root *dev_root;
31 struct bio *pending_bios;
32 struct bio *pending_bio_tail;
33 int running_pending;
34 u64 generation;
35
36 int barriers;
37 int writeable;
38 int in_fs_metadata;
39
40 spinlock_t io_lock;
41
42 struct block_device *bdev;
43
44 /* the mode sent to open_bdev_exclusive */
45 fmode_t mode;
46
47 char *name;
48
49 /* the internal btrfs device id */
50 u64 devid;
51
52 /* size of the device */
53 u64 total_bytes;
54
55 /* bytes used */
56 u64 bytes_used;
57
58 /* optimal io alignment for this device */
59 u32 io_align;
60
61 /* optimal io width for this device */
62 u32 io_width;
63
64 /* minimal io size for this device */
65 u32 sector_size;
66
67 /* type and info about this device */
68 u64 type;
69
70 /* physical drive uuid (or lvm uuid) */
71 u8 uuid[BTRFS_UUID_SIZE];
72
73 struct btrfs_work work;
74};
75
76struct btrfs_fs_devices {
77 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
78
79 /* the device with this id has the most recent copy of the super */
80 u64 latest_devid;
81 u64 latest_trans;
82 u64 num_devices;
83 u64 open_devices;
84 u64 rw_devices;
85 u64 total_rw_bytes;
86 struct block_device *latest_bdev;
87 /* all of the devices in the FS */
88 struct list_head devices;
89
90 /* devices not currently being allocated */
91 struct list_head alloc_list;
92 struct list_head list;
93
94 struct btrfs_fs_devices *seed;
95 int seeding;
96
97 int opened;
98};
99
100struct btrfs_bio_stripe {
101 struct btrfs_device *dev;
102 u64 physical;
103};
104
105struct btrfs_multi_bio {
106 atomic_t stripes_pending;
107 bio_end_io_t *end_io;
108 struct bio *orig_bio;
109 void *private;
110 atomic_t error;
111 int max_errors;
112 int num_stripes;
113 struct btrfs_bio_stripe stripes[];
114};
115
116#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
117 (sizeof(struct btrfs_bio_stripe) * (n)))
118
119int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
120 struct btrfs_device *device,
121 u64 chunk_tree, u64 chunk_objectid,
122 u64 chunk_offset, u64 start, u64 num_bytes);
123int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
124 u64 logical, u64 *length,
125 struct btrfs_multi_bio **multi_ret, int mirror_num);
126int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
127 u64 chunk_start, u64 physical, u64 devid,
128 u64 **logical, int *naddrs, int *stripe_len);
129int btrfs_read_sys_array(struct btrfs_root *root);
130int btrfs_read_chunk_tree(struct btrfs_root *root);
131int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
132 struct btrfs_root *extent_root, u64 type);
133void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
134void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
135int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
136 int mirror_num, int async_submit);
137int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
138int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
139 fmode_t flags, void *holder);
140int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
141 struct btrfs_fs_devices **fs_devices_ret);
142int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
143int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices);
144int btrfs_add_device(struct btrfs_trans_handle *trans,
145 struct btrfs_root *root,
146 struct btrfs_device *device);
147int btrfs_rm_device(struct btrfs_root *root, char *device_path);
148int btrfs_cleanup_fs_uuids(void);
149int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
150int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
151 u64 logical, struct page *page);
152int btrfs_grow_device(struct btrfs_trans_handle *trans,
153 struct btrfs_device *device, u64 new_size);
154struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
155 u8 *uuid, u8 *fsid);
156int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
157int btrfs_init_new_device(struct btrfs_root *root, char *path);
158int btrfs_balance(struct btrfs_root *dev_root);
159void btrfs_unlock_volumes(void);
160void btrfs_lock_volumes(void);
161int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
162#endif
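
struct btrfs_multi_bio above ends in a flexible array member, and btrfs_multi_bio_size(n) sizes a single allocation that holds the header plus n stripe records. A stand-alone sketch of the same allocation idiom (illustrative struct names, not the kernel types):

#include <stdlib.h>

struct stripe {
	void *dev;
	unsigned long long physical;
};

struct multi {
	int num_stripes;
	struct stripe stripes[];	/* flexible array member */
};

#define multi_size(n) (sizeof(struct multi) + sizeof(struct stripe) * (n))

/* one allocation covers the header and all n stripe records */
static struct multi *alloc_multi(int n)
{
	struct multi *m = calloc(1, multi_size(n));

	if (m)
		m->num_stripes = n;
	return m;
}
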
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
new file mode 100644
index 000000000000..7f332e270894
--- /dev/null
+++ b/fs/btrfs/xattr.c
@@ -0,0 +1,322 @@
1/*
2 * Copyright (C) 2007 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/init.h>
20#include <linux/fs.h>
21#include <linux/slab.h>
22#include <linux/rwsem.h>
23#include <linux/xattr.h>
24#include "ctree.h"
25#include "btrfs_inode.h"
26#include "transaction.h"
27#include "xattr.h"
28#include "disk-io.h"
29
30
31ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
32 void *buffer, size_t size)
33{
34 struct btrfs_dir_item *di;
35 struct btrfs_root *root = BTRFS_I(inode)->root;
36 struct btrfs_path *path;
37 struct extent_buffer *leaf;
38 int ret = 0;
39 unsigned long data_ptr;
40
41 path = btrfs_alloc_path();
42 if (!path)
43 return -ENOMEM;
44
45 /* lookup the xattr by name */
46 di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
47 strlen(name), 0);
48 if (!di || IS_ERR(di)) {
49 ret = -ENODATA;
50 goto out;
51 }
52
53 leaf = path->nodes[0];
54 /* if size is 0, that means we want the size of the attr */
55 if (!size) {
56 ret = btrfs_dir_data_len(leaf, di);
57 goto out;
58 }
59
60 /* now get the data out of our dir_item */
61 if (btrfs_dir_data_len(leaf, di) > size) {
62 ret = -ERANGE;
63 goto out;
64 }
65 data_ptr = (unsigned long)((char *)(di + 1) +
66 btrfs_dir_name_len(leaf, di));
67 read_extent_buffer(leaf, buffer, data_ptr,
68 btrfs_dir_data_len(leaf, di));
69 ret = btrfs_dir_data_len(leaf, di);
70
71out:
72 btrfs_free_path(path);
73 return ret;
74}
75
76int __btrfs_setxattr(struct inode *inode, const char *name,
77 const void *value, size_t size, int flags)
78{
79 struct btrfs_dir_item *di;
80 struct btrfs_root *root = BTRFS_I(inode)->root;
81 struct btrfs_trans_handle *trans;
82 struct btrfs_path *path;
83 int ret = 0, mod = 0;
84
85 path = btrfs_alloc_path();
86 if (!path)
87 return -ENOMEM;
88
89 trans = btrfs_start_transaction(root, 1);
90 btrfs_set_trans_block_group(trans, inode);
91
92 /* first let's see if we already have this xattr */
93 di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name,
94 strlen(name), -1);
95 if (IS_ERR(di)) {
96 ret = PTR_ERR(di);
97 goto out;
98 }
99
100 /* ok we already have this xattr, let's remove it */
101 if (di) {
102 /* if the caller wanted create only, exit */
103 if (flags & XATTR_CREATE) {
104 ret = -EEXIST;
105 goto out;
106 }
107
108 ret = btrfs_delete_one_dir_name(trans, root, path, di);
109 if (ret)
110 goto out;
111 btrfs_release_path(root, path);
112
113 /* if we don't have a value then we are removing the xattr */
114 if (!value) {
115 mod = 1;
116 goto out;
117 }
118 } else {
119 btrfs_release_path(root, path);
120
121 if (flags & XATTR_REPLACE) {
122 /* we couldn't find the attr to replace */
123 ret = -ENODATA;
124 goto out;
125 }
126 }
127
128 /* ok we have to create a completely new xattr */
129 ret = btrfs_insert_xattr_item(trans, root, name, strlen(name),
130 value, size, inode->i_ino);
131 if (ret)
132 goto out;
133 mod = 1;
134
135out:
136 if (mod) {
137 inode->i_ctime = CURRENT_TIME;
138 ret = btrfs_update_inode(trans, root, inode);
139 }
140
141 btrfs_end_transaction(trans, root);
142 btrfs_free_path(path);
143 return ret;
144}
145
146ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
147{
148 struct btrfs_key key, found_key;
149 struct inode *inode = dentry->d_inode;
150 struct btrfs_root *root = BTRFS_I(inode)->root;
151 struct btrfs_path *path;
152 struct btrfs_item *item;
153 struct extent_buffer *leaf;
154 struct btrfs_dir_item *di;
155 int ret = 0, slot, advance;
156 size_t total_size = 0, size_left = size;
157 unsigned long name_ptr;
158 size_t name_len;
159 u32 nritems;
160
161 /*
162 * ok we want all objects associated with this id.
163 * NOTE: we set key.offset = 0; because we want to start with the
164 * first xattr that we find and walk forward
165 */
166 key.objectid = inode->i_ino;
167 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
168 key.offset = 0;
169
170 path = btrfs_alloc_path();
171 if (!path)
172 return -ENOMEM;
173 path->reada = 2;
174
175 /* search for our xattrs */
176 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
177 if (ret < 0)
178 goto err;
179 ret = 0;
180 advance = 0;
181 while (1) {
182 leaf = path->nodes[0];
183 nritems = btrfs_header_nritems(leaf);
184 slot = path->slots[0];
185
186 /* this is where we start walking through the path */
187 if (advance || slot >= nritems) {
188 /*
189 * if we've reached the last slot in this leaf we need
190 * to go to the next leaf and reset everything
191 */
192 if (slot >= nritems-1) {
193 ret = btrfs_next_leaf(root, path);
194 if (ret)
195 break;
196 leaf = path->nodes[0];
197 nritems = btrfs_header_nritems(leaf);
198 slot = path->slots[0];
199 } else {
200 /*
201 * just walking through the slots on this leaf
202 */
203 slot++;
204 path->slots[0]++;
205 }
206 }
207 advance = 1;
208
209 item = btrfs_item_nr(leaf, slot);
210 btrfs_item_key_to_cpu(leaf, &found_key, slot);
211
212 /* check to make sure this item is what we want */
213 if (found_key.objectid != key.objectid)
214 break;
215 if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY)
216 break;
217
218 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
219
220 name_len = btrfs_dir_name_len(leaf, di);
221 total_size += name_len + 1;
222
223 /* we are just looking for how big our buffer needs to be */
224 if (!size)
225 continue;
226
227 if (!buffer || (name_len + 1) > size_left) {
228 ret = -ERANGE;
229 goto err;
230 }
231
232 name_ptr = (unsigned long)(di + 1);
233 read_extent_buffer(leaf, buffer, name_ptr, name_len);
234 buffer[name_len] = '\0';
235
236 size_left -= name_len + 1;
237 buffer += name_len + 1;
238 }
239 ret = total_size;
240
241err:
242 btrfs_free_path(path);
243
244 return ret;
245}
246
247/*
248 * List of handlers for synthetic system.* attributes. All real ondisk
249 * attributes are handled directly.
250 */
251struct xattr_handler *btrfs_xattr_handlers[] = {
252#ifdef CONFIG_FS_POSIX_ACL
253 &btrfs_xattr_acl_access_handler,
254 &btrfs_xattr_acl_default_handler,
255#endif
256 NULL,
257};
258
259/*
260 * Check if the attribute is in a supported namespace.
261 *
262 * This is applied after the check for the synthetic attributes in the system
263 * namespace.
264 */
265static bool btrfs_is_valid_xattr(const char *name)
266{
267 return !strncmp(name, XATTR_SECURITY_PREFIX,
268 XATTR_SECURITY_PREFIX_LEN) ||
269 !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
270 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
271 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
272}
273
274ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
275 void *buffer, size_t size)
276{
277 /*
278 * If this is a request for a synthetic attribute in the system.*
279 * namespace use the generic infrastructure to resolve a handler
280 * for it via sb->s_xattr.
281 */
282 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
283 return generic_getxattr(dentry, name, buffer, size);
284
285 if (!btrfs_is_valid_xattr(name))
286 return -EOPNOTSUPP;
287 return __btrfs_getxattr(dentry->d_inode, name, buffer, size);
288}
289
290int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
291 size_t size, int flags)
292{
293 /*
294 * If this is a request for a synthetic attribute in the system.*
295 * namespace use the generic infrastructure to resolve a handler
296 * for it via sb->s_xattr.
297 */
298 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
299 return generic_setxattr(dentry, name, value, size, flags);
300
301 if (!btrfs_is_valid_xattr(name))
302 return -EOPNOTSUPP;
303
304 if (size == 0)
305 value = ""; /* empty EA, do not remove */
306 return __btrfs_setxattr(dentry->d_inode, name, value, size, flags);
307}
308
309int btrfs_removexattr(struct dentry *dentry, const char *name)
310{
311 /*
312 * If this is a request for a synthetic attribute in the system.*
313 * namespace use the generic infrastructure to resolve a handler
314 * for it via sb->s_xattr.
315 */
316 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
317 return generic_removexattr(dentry, name);
318
319 if (!btrfs_is_valid_xattr(name))
320 return -EOPNOTSUPP;
321 return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
322}
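
The three VFS entry points above share one dispatch rule: system.* names are routed through the generic handler table, the other recognized prefixes go straight to the on-disk implementation, and anything else is rejected with -EOPNOTSUPP. A compact user-space sketch of that prefix dispatch (the two handler functions are placeholders, not the kernel API):

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* placeholder backends: in the kernel these would be the generic
 * sb->s_xattr handlers and the on-disk __btrfs_*xattr() helpers */
static int generic_handler(const char *name) { printf("generic: %s\n", name); return 0; }
static int disk_handler(const char *name)    { printf("on-disk: %s\n", name); return 0; }

static int is_valid_prefix(const char *name)
{
	static const char *prefixes[] = {
		"security.", "system.", "trusted.", "user.",
	};
	size_t i;

	for (i = 0; i < sizeof(prefixes) / sizeof(prefixes[0]); i++)
		if (!strncmp(name, prefixes[i], strlen(prefixes[i])))
			return 1;
	return 0;
}

static int dispatch_xattr(const char *name)
{
	if (!strncmp(name, "system.", strlen("system.")))
		return generic_handler(name);	/* synthetic attrs, e.g. ACLs */
	if (!is_valid_prefix(name))
		return -EOPNOTSUPP;		/* unknown namespace */
	return disk_handler(name);		/* stored as a btrfs dir item */
}

int main(void)
{
	dispatch_xattr("user.comment");			/* on-disk */
	dispatch_xattr("system.posix_acl_access");	/* generic */
	printf("%d\n", dispatch_xattr("bogus.attr"));	/* -EOPNOTSUPP */
	return 0;
}
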
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
new file mode 100644
index 000000000000..5b1d08f8e68d
--- /dev/null
+++ b/fs/btrfs/xattr.h
@@ -0,0 +1,39 @@
1/*
2 * Copyright (C) 2007 Red Hat. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __XATTR__
20#define __XATTR__
21
22#include <linux/xattr.h>
23
24extern struct xattr_handler btrfs_xattr_acl_access_handler;
25extern struct xattr_handler btrfs_xattr_acl_default_handler;
26extern struct xattr_handler *btrfs_xattr_handlers[];
27
28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
29 void *buffer, size_t size);
30extern int __btrfs_setxattr(struct inode *inode, const char *name,
31 const void *value, size_t size, int flags);
32
33extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
34 void *buffer, size_t size);
35extern int btrfs_setxattr(struct dentry *dentry, const char *name,
36 const void *value, size_t size, int flags);
37extern int btrfs_removexattr(struct dentry *dentry, const char *name);
38
39#endif /* __XATTR__ */
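
__btrfs_getxattr() declared above follows the usual getxattr contract: a zero-size call returns the attribute's length, and a buffer that is too small fails with -ERANGE. A sketch of how a caller typically sizes its buffer under that contract (getx is a hypothetical stand-in for any function with this signature, not a kernel API):

#include <stddef.h>
#include <stdlib.h>

/* any function with the getxattr contract: returns the attribute
 * length when called with a NULL/zero-size buffer, and a negative
 * error such as -ERANGE when the supplied buffer is too small */
typedef long (*getx_fn)(const char *name, void *buf, size_t size);

static void *read_attr(getx_fn getx, const char *name, long *len_out)
{
	long len = getx(name, NULL, 0);	/* probe for the size first */
	void *buf;

	if (len < 0)
		return NULL;
	buf = malloc(len);
	if (!buf)
		return NULL;
	len = getx(name, buf, len);	/* now fetch the data */
	if (len < 0) {			/* e.g. the attr changed meanwhile */
		free(buf);
		return NULL;
	}
	*len_out = len;
	return buf;
}
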
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
new file mode 100644
index 000000000000..ecfbce836d32
--- /dev/null
+++ b/fs/btrfs/zlib.c
@@ -0,0 +1,632 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 *
18 * Based on jffs2 zlib code:
19 * Copyright © 2001-2007 Red Hat, Inc.
20 * Created by David Woodhouse <dwmw2@infradead.org>
21 */
22
23#include <linux/kernel.h>
24#include <linux/slab.h>
25#include <linux/zlib.h>
26#include <linux/zutil.h>
27#include <linux/vmalloc.h>
28#include <linux/init.h>
29#include <linux/err.h>
30#include <linux/sched.h>
31#include <linux/pagemap.h>
32#include <linux/bio.h>
33#include "compression.h"
34
35/* Plan: call deflate() with avail_in == *sourcelen,
36 avail_out = *dstlen - 12 and flush == Z_FINISH.
37 If it doesn't manage to finish, call it again with
38 avail_in == 0 and avail_out set to the remaining 12
39 bytes for it to clean up.
40 Q: Is 12 bytes sufficient?
41*/
42#define STREAM_END_SPACE 12
43
44struct workspace {
45 z_stream inf_strm;
46 z_stream def_strm;
47 char *buf;
48 struct list_head list;
49};
50
51static LIST_HEAD(idle_workspace);
52static DEFINE_SPINLOCK(workspace_lock);
53static unsigned long num_workspace;
54static atomic_t alloc_workspace = ATOMIC_INIT(0);
55static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
56
57/*
58 * this finds an available zlib workspace or allocates a new one.
59 * An ERR_PTR is returned if things go bad.
60 */
61static struct workspace *find_zlib_workspace(void)
62{
63 struct workspace *workspace;
64 int ret;
65 int cpus = num_online_cpus();
66
67again:
68 spin_lock(&workspace_lock);
69 if (!list_empty(&idle_workspace)) {
70 workspace = list_entry(idle_workspace.next, struct workspace,
71 list);
72 list_del(&workspace->list);
73 num_workspace--;
74 spin_unlock(&workspace_lock);
75 return workspace;
76
77 }
78 spin_unlock(&workspace_lock);
79 if (atomic_read(&alloc_workspace) > cpus) {
80 DEFINE_WAIT(wait);
81 prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
82 if (atomic_read(&alloc_workspace) > cpus)
83 schedule();
84 finish_wait(&workspace_wait, &wait);
85 goto again;
86 }
87 atomic_inc(&alloc_workspace);
88 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
89 if (!workspace) {
90 ret = -ENOMEM;
91 goto fail;
92 }
93
94 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
95 if (!workspace->def_strm.workspace) {
96 ret = -ENOMEM;
97 goto fail;
98 }
99 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
100 if (!workspace->inf_strm.workspace) {
101 ret = -ENOMEM;
102 goto fail_inflate;
103 }
104 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
105 if (!workspace->buf) {
106 ret = -ENOMEM;
107 goto fail_kmalloc;
108 }
109 return workspace;
110
111fail_kmalloc:
112 vfree(workspace->inf_strm.workspace);
113fail_inflate:
114 vfree(workspace->def_strm.workspace);
115fail:
116 kfree(workspace);
117 atomic_dec(&alloc_workspace);
118 wake_up(&workspace_wait);
119 return ERR_PTR(ret);
120}
121
122/*
123 * put a workspace struct back on the list or free it if we have enough
124 * idle ones sitting around
125 */
126static int free_workspace(struct workspace *workspace)
127{
128 spin_lock(&workspace_lock);
129 if (num_workspace < num_online_cpus()) {
130 list_add_tail(&workspace->list, &idle_workspace);
131 num_workspace++;
132 spin_unlock(&workspace_lock);
133 if (waitqueue_active(&workspace_wait))
134 wake_up(&workspace_wait);
135 return 0;
136 }
137 spin_unlock(&workspace_lock);
138 vfree(workspace->def_strm.workspace);
139 vfree(workspace->inf_strm.workspace);
140 kfree(workspace->buf);
141 kfree(workspace);
142
143 atomic_dec(&alloc_workspace);
144 if (waitqueue_active(&workspace_wait))
145 wake_up(&workspace_wait);
146 return 0;
147}
148
149/*
150 * cleanup function for module exit
151 */
152static void free_workspaces(void)
153{
154 struct workspace *workspace;
155 while (!list_empty(&idle_workspace)) {
156 workspace = list_entry(idle_workspace.next, struct workspace,
157 list);
158 list_del(&workspace->list);
159 vfree(workspace->def_strm.workspace);
160 vfree(workspace->inf_strm.workspace);
161 kfree(workspace->buf);
162 kfree(workspace);
163 atomic_dec(&alloc_workspace);
164 }
165}
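
find_zlib_workspace() and free_workspace() above implement a bounded pool: idle workspaces are reused, the total number allocated is capped near num_online_cpus(), and callers sleep until a workspace is freed. A user-space sketch of the same bounded-pool discipline using a mutex and condition variable (illustrative only; the kernel version uses a spinlock plus a wait queue and allocates outside the lock):

#include <pthread.h>
#include <stddef.h>

struct item {
	struct item *next;
};

/* a bounded pool: reuse idle items, cap total allocations at 'limit',
 * and sleep until something is freed once the cap is reached */
struct pool {
	pthread_mutex_t lock;
	pthread_cond_t freed;
	struct item *idle;	/* singly linked idle list */
	int alloced;
	int limit;
};

/* returns an idle item, or NULL once the caller has been granted the
 * right to allocate a fresh one outside the lock */
static struct item *pool_get(struct pool *p)
{
	struct item *it = NULL;

	pthread_mutex_lock(&p->lock);
	for (;;) {
		if (p->idle) {			/* reuse an idle item */
			it = p->idle;
			p->idle = it->next;
			break;
		}
		if (p->alloced < p->limit) {	/* room for a new one */
			p->alloced++;
			break;
		}
		/* pool exhausted: wait for pool_put() to signal */
		pthread_cond_wait(&p->freed, &p->lock);
	}
	pthread_mutex_unlock(&p->lock);
	return it;
}

static void pool_put(struct pool *p, struct item *it)
{
	pthread_mutex_lock(&p->lock);
	it->next = p->idle;	/* keep it around for the next caller */
	p->idle = it;
	pthread_mutex_unlock(&p->lock);
	pthread_cond_signal(&p->freed);
}
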
166
167/*
168 * given an address space and start/len, compress the bytes.
169 *
170 * pages are allocated to hold the compressed result and stored
171 * in 'pages'
172 *
173 * out_pages is used to return the number of pages allocated. There
174 * may be pages allocated even if we return an error
175 *
176 * total_in is used to return the number of bytes actually read. It
177 * may be smaller than len if we had to exit early because we
178 * ran out of room in the pages array or because we crossed the
179 * max_out threshold.
180 *
181 * total_out is used to return the total number of compressed bytes
182 *
183 * max_out tells us the max number of bytes that we're allowed to
184 * stuff into pages
185 */
186int btrfs_zlib_compress_pages(struct address_space *mapping,
187 u64 start, unsigned long len,
188 struct page **pages,
189 unsigned long nr_dest_pages,
190 unsigned long *out_pages,
191 unsigned long *total_in,
192 unsigned long *total_out,
193 unsigned long max_out)
194{
195 int ret;
196 struct workspace *workspace;
197 char *data_in;
198 char *cpage_out;
199 int nr_pages = 0;
200 struct page *in_page = NULL;
201 struct page *out_page = NULL;
202 int out_written = 0;
203 int in_read = 0;
204 unsigned long bytes_left;
205
206 *out_pages = 0;
207 *total_out = 0;
208 *total_in = 0;
209
210 workspace = find_zlib_workspace();
211 if (IS_ERR(workspace))
212 return -1;
213
214 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
215 printk(KERN_WARNING "deflateInit failed\n");
216 ret = -1;
217 goto out;
218 }
219
220 workspace->def_strm.total_in = 0;
221 workspace->def_strm.total_out = 0;
222
223 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
224 data_in = kmap(in_page);
225
226 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
227 cpage_out = kmap(out_page);
228 pages[0] = out_page;
229 nr_pages = 1;
230
231 workspace->def_strm.next_in = data_in;
232 workspace->def_strm.next_out = cpage_out;
233 workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
234 workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
235
236 out_written = 0;
237 in_read = 0;
238
239 while (workspace->def_strm.total_in < len) {
240 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
241 if (ret != Z_OK) {
242 printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
243 ret);
244 zlib_deflateEnd(&workspace->def_strm);
245 ret = -1;
246 goto out;
247 }
248
249 /* we're making it bigger, give up */
250 if (workspace->def_strm.total_in > 8192 &&
251 workspace->def_strm.total_in <
252 workspace->def_strm.total_out) {
253 ret = -1;
254 goto out;
255 }
256 /* we need another page for writing out. Test this
257 * before the total_in so we will pull in a new page for
258 * the stream end if required
259 */
260 if (workspace->def_strm.avail_out == 0) {
261 kunmap(out_page);
262 if (nr_pages == nr_dest_pages) {
263 out_page = NULL;
264 ret = -1;
265 goto out;
266 }
267 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
268 cpage_out = kmap(out_page);
269 pages[nr_pages] = out_page;
270 nr_pages++;
271 workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
272 workspace->def_strm.next_out = cpage_out;
273 }
274 /* we're all done */
275 if (workspace->def_strm.total_in >= len)
276 break;
277
278 /* we've read in a full page, get a new one */
279 if (workspace->def_strm.avail_in == 0) {
280 if (workspace->def_strm.total_out > max_out)
281 break;
282
283 bytes_left = len - workspace->def_strm.total_in;
284 kunmap(in_page);
285 page_cache_release(in_page);
286
287 start += PAGE_CACHE_SIZE;
288 in_page = find_get_page(mapping,
289 start >> PAGE_CACHE_SHIFT);
290 data_in = kmap(in_page);
291 workspace->def_strm.avail_in = min(bytes_left,
292 PAGE_CACHE_SIZE);
293 workspace->def_strm.next_in = data_in;
294 }
295 }
296 workspace->def_strm.avail_in = 0;
297 ret = zlib_deflate(&workspace->def_strm, Z_FINISH);
298 zlib_deflateEnd(&workspace->def_strm);
299
300 if (ret != Z_STREAM_END) {
301 ret = -1;
302 goto out;
303 }
304
305 if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
306 ret = -1;
307 goto out;
308 }
309
310 ret = 0;
311 *total_out = workspace->def_strm.total_out;
312 *total_in = workspace->def_strm.total_in;
313out:
314 *out_pages = nr_pages;
315 if (out_page)
316 kunmap(out_page);
317
318 if (in_page) {
319 kunmap(in_page);
320 page_cache_release(in_page);
321 }
322 free_workspace(workspace);
323 return ret;
324}
325
326/*
327 * pages_in is an array of pages with compressed data.
328 *
329 * disk_start is the starting logical offset of this array in the file
330 *
331 * bvec is a bio_vec of pages from the file that we want to decompress into
332 *
333 * vcnt is the count of pages in the biovec
334 *
335 * srclen is the number of bytes in pages_in
336 *
337 * The basic idea is that we have a bio that was created by readpages.
338 * The pages in the bio are for the uncompressed data, and they may not
339 * be contiguous. They all correspond to the range of bytes covered by
340 * the compressed extent.
341 */
342int btrfs_zlib_decompress_biovec(struct page **pages_in,
343 u64 disk_start,
344 struct bio_vec *bvec,
345 int vcnt,
346 size_t srclen)
347{
348 int ret = 0;
349 int wbits = MAX_WBITS;
350 struct workspace *workspace;
351 char *data_in;
352 size_t total_out = 0;
353 unsigned long page_bytes_left;
354 unsigned long page_in_index = 0;
355 unsigned long page_out_index = 0;
356 struct page *page_out;
357 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
358 PAGE_CACHE_SIZE;
359 unsigned long buf_start;
360 unsigned long buf_offset;
361 unsigned long bytes;
362 unsigned long working_bytes;
363 unsigned long pg_offset;
364 unsigned long start_byte;
365 unsigned long current_buf_start;
366 char *kaddr;
367
368 workspace = find_zlib_workspace();
369 if (IS_ERR(workspace))
370 return -ENOMEM;
371
372 data_in = kmap(pages_in[page_in_index]);
373 workspace->inf_strm.next_in = data_in;
374 workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
375 workspace->inf_strm.total_in = 0;
376
377 workspace->inf_strm.total_out = 0;
378 workspace->inf_strm.next_out = workspace->buf;
379 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
380 page_out = bvec[page_out_index].bv_page;
381 page_bytes_left = PAGE_CACHE_SIZE;
382 pg_offset = 0;
383
384 /* If it's deflate, and it's got no preset dictionary, then
385 we can tell zlib to skip the adler32 check. */
386 if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
387 ((data_in[0] & 0x0f) == Z_DEFLATED) &&
388 !(((data_in[0]<<8) + data_in[1]) % 31)) {
389
390 wbits = -((data_in[0] >> 4) + 8);
391 workspace->inf_strm.next_in += 2;
392 workspace->inf_strm.avail_in -= 2;
393 }
394
395 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
396 printk(KERN_WARNING "inflateInit failed\n");
397 ret = -1;
398 goto out;
399 }
400 while (workspace->inf_strm.total_in < srclen) {
401 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
402 if (ret != Z_OK && ret != Z_STREAM_END)
403 break;
404 /*
405 * buf_start is the offset in the decompressed stream of the
406 * data currently held in our workspace buffer
407 */
408 buf_start = total_out;
409
410 /* total_out is the total number of decompressed bytes so far */
411 total_out = workspace->inf_strm.total_out;
412
413 working_bytes = total_out - buf_start;
414
415 /*
416 * start_byte is the first byte of the page we're currently
417 * copying into, relative to the start of the uncompressed data.
418 */
419 start_byte = page_offset(page_out) - disk_start;
420
421 if (working_bytes == 0) {
422 /* we didn't make progress in this inflate
423 * call, we're done
424 */
425 if (ret != Z_STREAM_END)
426 ret = -1;
427 break;
428 }
429
430 /* we haven't yet hit data corresponding to this page */
431 if (total_out <= start_byte)
432 goto next;
433
434 /*
435 * the start of the data we care about is offset into
436 * the middle of our working buffer
437 */
438 if (total_out > start_byte && buf_start < start_byte) {
439 buf_offset = start_byte - buf_start;
440 working_bytes -= buf_offset;
441 } else {
442 buf_offset = 0;
443 }
444 current_buf_start = buf_start;
445
446 /* copy bytes from the working buffer into the pages */
447 while (working_bytes > 0) {
448 bytes = min(PAGE_CACHE_SIZE - pg_offset,
449 PAGE_CACHE_SIZE - buf_offset);
450 bytes = min(bytes, working_bytes);
451 kaddr = kmap_atomic(page_out, KM_USER0);
452 memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
453 bytes);
454 kunmap_atomic(kaddr, KM_USER0);
455 flush_dcache_page(page_out);
456
457 pg_offset += bytes;
458 page_bytes_left -= bytes;
459 buf_offset += bytes;
460 working_bytes -= bytes;
461 current_buf_start += bytes;
462
463 /* check if we need to pick another page */
464 if (page_bytes_left == 0) {
465 page_out_index++;
466 if (page_out_index >= vcnt) {
467 ret = 0;
468 goto done;
469 }
470
471 page_out = bvec[page_out_index].bv_page;
472 pg_offset = 0;
473 page_bytes_left = PAGE_CACHE_SIZE;
474 start_byte = page_offset(page_out) - disk_start;
475
476 /*
477 * make sure our new page is covered by this
478 * working buffer
479 */
480 if (total_out <= start_byte)
481 goto next;
482
483 /* the next page in the biovec might not
484 * be adjacent to the last page, but it
485 * might still be found inside this working
486 * buffer. bump our offset pointer
487 */
488 if (total_out > start_byte &&
489 current_buf_start < start_byte) {
490 buf_offset = start_byte - buf_start;
491 working_bytes = total_out - start_byte;
492 current_buf_start = buf_start +
493 buf_offset;
494 }
495 }
496 }
497next:
498 workspace->inf_strm.next_out = workspace->buf;
499 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
500
501 if (workspace->inf_strm.avail_in == 0) {
502 unsigned long tmp;
503 kunmap(pages_in[page_in_index]);
504 page_in_index++;
505 if (page_in_index >= total_pages_in) {
506 data_in = NULL;
507 break;
508 }
509 data_in = kmap(pages_in[page_in_index]);
510 workspace->inf_strm.next_in = data_in;
511 tmp = srclen - workspace->inf_strm.total_in;
512 workspace->inf_strm.avail_in = min(tmp,
513 PAGE_CACHE_SIZE);
514 }
515 }
516 if (ret != Z_STREAM_END)
517 ret = -1;
518 else
519 ret = 0;
520done:
521 zlib_inflateEnd(&workspace->inf_strm);
522 if (data_in)
523 kunmap(pages_in[page_in_index]);
524out:
525 free_workspace(workspace);
526 return ret;
527}
528
529/*
530 * a less complex decompression routine. Our compressed data fits in a
531 * single page, and we want to read a single page out of it.
532 * start_byte tells us the offset into the compressed data we're interested in
533 */
534int btrfs_zlib_decompress(unsigned char *data_in,
535 struct page *dest_page,
536 unsigned long start_byte,
537 size_t srclen, size_t destlen)
538{
539 int ret = 0;
540 int wbits = MAX_WBITS;
541 struct workspace *workspace;
542 unsigned long bytes_left = destlen;
543 unsigned long total_out = 0;
544 char *kaddr;
545
546 if (destlen > PAGE_CACHE_SIZE)
547 return -ENOMEM;
548
549 workspace = find_zlib_workspace();
550 if (IS_ERR(workspace))
551 return -ENOMEM;
552
553 workspace->inf_strm.next_in = data_in;
554 workspace->inf_strm.avail_in = srclen;
555 workspace->inf_strm.total_in = 0;
556
557 workspace->inf_strm.next_out = workspace->buf;
558 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
559 workspace->inf_strm.total_out = 0;
560 /* If it's deflate, and it's got no preset dictionary, then
561 we can tell zlib to skip the adler32 check. */
562 if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
563 ((data_in[0] & 0x0f) == Z_DEFLATED) &&
564 !(((data_in[0]<<8) + data_in[1]) % 31)) {
565
566 wbits = -((data_in[0] >> 4) + 8);
567 workspace->inf_strm.next_in += 2;
568 workspace->inf_strm.avail_in -= 2;
569 }
570
571 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
572 printk(KERN_WARNING "inflateInit failed\n");
573 ret = -1;
574 goto out;
575 }
576
577 while (bytes_left > 0) {
578 unsigned long buf_start;
579 unsigned long buf_offset;
580 unsigned long bytes;
581 unsigned long pg_offset = 0;
582
583 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
584 if (ret != Z_OK && ret != Z_STREAM_END)
585 break;
586
587 buf_start = total_out;
588 total_out = workspace->inf_strm.total_out;
589
590 if (total_out == buf_start) {
591 ret = -1;
592 break;
593 }
594
595 if (total_out <= start_byte)
596 goto next;
597
598 if (total_out > start_byte && buf_start < start_byte)
599 buf_offset = start_byte - buf_start;
600 else
601 buf_offset = 0;
602
603 bytes = min(PAGE_CACHE_SIZE - pg_offset,
604 PAGE_CACHE_SIZE - buf_offset);
605 bytes = min(bytes, bytes_left);
606
607 kaddr = kmap_atomic(dest_page, KM_USER0);
608 memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
609 kunmap_atomic(kaddr, KM_USER0);
610
611 pg_offset += bytes;
612 bytes_left -= bytes;
613next:
614 workspace->inf_strm.next_out = workspace->buf;
615 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
616 }
617
618 if (ret != Z_STREAM_END && bytes_left != 0)
619 ret = -1;
620 else
621 ret = 0;
622
623 zlib_inflateEnd(&workspace->inf_strm);
624out:
625 free_workspace(workspace);
626 return ret;
627}
628
629void btrfs_zlib_exit(void)
630{
631 free_workspaces();
632}
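
Both decompression paths above use the same probe: if the buffer starts with a valid zlib header (the CM field is Z_DEFLATED, the header checksum divides by 31) and carries no preset dictionary, the two header bytes are skipped and zlib is initialized with negative windowBits, which selects raw-deflate mode and skips the adler32 verification. A user-space sketch of that probe against the ordinary zlib API (assumes the RFC 1950 stream layout; illustrative, not the kernel helper):

#include <stddef.h>
#include <zlib.h>

/* Return the windowBits to pass to inflateInit2(): negative (raw
 * deflate, no adler32) when buf carries a zlib header without a
 * preset dictionary, MAX_WBITS otherwise; *consumed is the number of
 * header bytes to skip */
static int probe_wbits(const unsigned char *buf, size_t len, size_t *consumed)
{
	*consumed = 0;
	if (len > 2 &&
	    (buf[0] & 0x0f) == Z_DEFLATED &&	  /* CM == 8: deflate */
	    !(buf[1] & 0x20) &&			  /* FDICT: no preset dict */
	    ((buf[0] << 8) + buf[1]) % 31 == 0) { /* FCHECK passes */
		*consumed = 2;
		/* CINFO is log2(window size) - 8; negative selects raw mode */
		return -((buf[0] >> 4) + 8);
	}
	return MAX_WBITS;
}
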
diff --git a/fs/buffer.c b/fs/buffer.c
index 10179cfa1152..b6e8b8632e2f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -99,10 +99,18 @@ __clear_page_buffers(struct page *page)
 	page_cache_release(page);
 }
 
+
+static int quiet_error(struct buffer_head *bh)
+{
+	if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
+		return 0;
+	return 1;
+}
+
+
 static void buffer_io_error(struct buffer_head *bh)
 {
 	char b[BDEVNAME_SIZE];
-
 	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
 			bdevname(bh->b_bdev, b),
 			(unsigned long long)bh->b_blocknr);
@@ -144,7 +152,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
+		if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) {
 			buffer_io_error(bh);
 			printk(KERN_WARNING "lost page write due to "
 					"I/O error on %s\n",
@@ -195,10 +203,25 @@ int fsync_bdev(struct block_device *bdev)
  * happen on bdev until thaw_bdev() is called.
  * If a superblock is found on this device, we take the s_umount semaphore
  * on it to make sure nobody unmounts until the snapshot creation is done.
+ * The reference counter (bd_fsfreeze_count) guarantees that only the last
+ * unfreeze process can unfreeze the frozen filesystem actually when multiple
+ * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
+ * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
+ * actually.
  */
 struct super_block *freeze_bdev(struct block_device *bdev)
 {
 	struct super_block *sb;
+	int error = 0;
+
+	mutex_lock(&bdev->bd_fsfreeze_mutex);
+	if (bdev->bd_fsfreeze_count > 0) {
+		bdev->bd_fsfreeze_count++;
+		sb = get_super(bdev);
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		return sb;
+	}
+	bdev->bd_fsfreeze_count++;
 
 	down(&bdev->bd_mount_sem);
 	sb = get_super(bdev);
@@ -213,11 +236,24 @@ struct super_block *freeze_bdev(struct block_device *bdev)
 
 		sync_blockdev(sb->s_bdev);
 
-		if (sb->s_op->write_super_lockfs)
-			sb->s_op->write_super_lockfs(sb);
+		if (sb->s_op->freeze_fs) {
+			error = sb->s_op->freeze_fs(sb);
+			if (error) {
+				printk(KERN_ERR
+					"VFS:Filesystem freeze failed\n");
+				sb->s_frozen = SB_UNFROZEN;
+				drop_super(sb);
+				up(&bdev->bd_mount_sem);
+				bdev->bd_fsfreeze_count--;
+				mutex_unlock(&bdev->bd_fsfreeze_mutex);
+				return ERR_PTR(error);
+			}
+		}
 	}
 
 	sync_blockdev(bdev);
+	mutex_unlock(&bdev->bd_fsfreeze_mutex);
+
 	return sb;	/* thaw_bdev releases s->s_umount and bd_mount_sem */
 }
 EXPORT_SYMBOL(freeze_bdev);
@@ -229,20 +265,48 @@ EXPORT_SYMBOL(freeze_bdev);
  *
  * Unlocks the filesystem and marks it writeable again after freeze_bdev().
  */
-void thaw_bdev(struct block_device *bdev, struct super_block *sb)
+int thaw_bdev(struct block_device *bdev, struct super_block *sb)
 {
+	int error = 0;
+
+	mutex_lock(&bdev->bd_fsfreeze_mutex);
+	if (!bdev->bd_fsfreeze_count) {
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		return -EINVAL;
+	}
+
+	bdev->bd_fsfreeze_count--;
+	if (bdev->bd_fsfreeze_count > 0) {
+		if (sb)
+			drop_super(sb);
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		return 0;
+	}
+
 	if (sb) {
 		BUG_ON(sb->s_bdev != bdev);
-
-		if (sb->s_op->unlockfs)
-			sb->s_op->unlockfs(sb);
-		sb->s_frozen = SB_UNFROZEN;
-		smp_wmb();
-		wake_up(&sb->s_wait_unfrozen);
+		if (!(sb->s_flags & MS_RDONLY)) {
+			if (sb->s_op->unfreeze_fs) {
+				error = sb->s_op->unfreeze_fs(sb);
+				if (error) {
+					printk(KERN_ERR
+						"VFS:Filesystem thaw failed\n");
+					sb->s_frozen = SB_FREEZE_TRANS;
+					bdev->bd_fsfreeze_count++;
+					mutex_unlock(&bdev->bd_fsfreeze_mutex);
+					return error;
+				}
+			}
+			sb->s_frozen = SB_UNFROZEN;
+			smp_wmb();
+			wake_up(&sb->s_wait_unfrozen);
+		}
 		drop_super(sb);
 	}
 
 	up(&bdev->bd_mount_sem);
+	mutex_unlock(&bdev->bd_fsfreeze_mutex);
+	return 0;
 }
 EXPORT_SYMBOL(thaw_bdev);
 
@@ -394,7 +458,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		set_buffer_uptodate(bh);
 	} else {
 		clear_buffer_uptodate(bh);
-		if (printk_ratelimit())
+		if (!quiet_error(bh))
 			buffer_io_error(bh);
 		SetPageError(page);
 	}
@@ -455,7 +519,7 @@ static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (printk_ratelimit()) {
+		if (!quiet_error(bh)) {
 			buffer_io_error(bh);
 			printk(KERN_WARNING "lost page write due to "
 					"I/O error on %s\n",
@@ -1988,7 +2052,7 @@ int block_write_begin(struct file *file, struct address_space *mapping,
 	page = *pagep;
 	if (page == NULL) {
 		ownpage = 1;
-		page = __grab_cache_page(mapping, index);
+		page = grab_cache_page_write_begin(mapping, index, flags);
 		if (!page) {
 			status = -ENOMEM;
 			goto out;
@@ -2014,7 +2078,6 @@ int block_write_begin(struct file *file, struct address_space *mapping,
 			if (pos + len > inode->i_size)
 				vmtruncate(inode, inode->i_size);
 		}
-		goto out;
 	}
 
 out:
@@ -2494,7 +2557,7 @@ int nobh_write_begin(struct file *file, struct address_space *mapping,
 	from = pos & (PAGE_CACHE_SIZE - 1);
 	to = from + len;
 
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
 	*pagep = page;
@@ -2913,6 +2976,9 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
 		set_bit(BH_Eopnotsupp, &bh->b_state);
 	}
 
+	if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
+		set_bit(BH_Quiet, &bh->b_state);
+
 	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
 	bio_put(bio);
 }
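
The freeze_bdev()/thaw_bdev() rework above introduces a nesting counter, bd_fsfreeze_count: only the first freeze actually does the work and only the balancing last thaw unfreezes, with a mutex serializing the transitions. A minimal sketch of that counting discipline (user-space pthread version; do_freeze/do_thaw are placeholders for the real work):

#include <errno.h>
#include <pthread.h>

static pthread_mutex_t freeze_lock = PTHREAD_MUTEX_INITIALIZER;
static int freeze_count;

static void do_freeze(void) { /* flush data and block writers */ }
static void do_thaw(void)   { /* let writers through again */ }

static void freeze(void)
{
	pthread_mutex_lock(&freeze_lock);
	if (freeze_count++ == 0)	/* only the first caller freezes */
		do_freeze();
	pthread_mutex_unlock(&freeze_lock);
}

static int thaw(void)
{
	int ret = 0;

	pthread_mutex_lock(&freeze_lock);
	if (freeze_count == 0)
		ret = -EINVAL;			/* unbalanced thaw */
	else if (--freeze_count == 0)		/* only the last caller thaws */
		do_thaw();
	pthread_mutex_unlock(&freeze_lock);
	return ret;
}
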
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 700697a72618..38f71222a552 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -120,7 +120,7 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
 	cd->major = major;
 	cd->baseminor = baseminor;
 	cd->minorct = minorct;
-	strncpy(cd->name,name, 64);
+	strlcpy(cd->name, name, sizeof(cd->name));
 
 	i = major_to_index(major);
 
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 6ba43fb346fb..9948c0030e86 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -5,7 +5,7 @@ obj-$(CONFIG_CIFS) += cifs.o
 
 cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
 	  link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
-	  md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o fcntl.o \
+	  md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
 	  readdir.o ioctl.o sess.o export.o cifsacl.o
 
 cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 0005a194a75c..13ea53251dcf 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -747,7 +747,6 @@ const struct file_operations cifs_file_ops = {
 #endif /* CONFIG_CIFS_POSIX */
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-	.dir_notify = cifs_dir_notify,
 	.setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
@@ -768,7 +767,6 @@ const struct file_operations cifs_file_direct_ops = {
 #endif /* CONFIG_CIFS_POSIX */
 	.llseek = cifs_llseek,
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-	.dir_notify = cifs_dir_notify,
 	.setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
@@ -789,7 +787,6 @@ const struct file_operations cifs_file_nobrl_ops = {
 #endif /* CONFIG_CIFS_POSIX */
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-	.dir_notify = cifs_dir_notify,
 	.setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
@@ -809,7 +806,6 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
 #endif /* CONFIG_CIFS_POSIX */
 	.llseek = cifs_llseek,
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-	.dir_notify = cifs_dir_notify,
 	.setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
@@ -818,9 +814,6 @@ const struct file_operations cifs_dir_ops = {
 	.readdir = cifs_readdir,
 	.release = cifs_closedir,
 	.read    = generic_read_dir,
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-	.dir_notify = cifs_dir_notify,
-#endif /* CONFIG_CIFS_EXPERIMENTAL */
 	.unlocked_ioctl  = cifs_ioctl,
 	.llseek = generic_file_llseek,
 };
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 2ce04c73d74e..7ac481841f87 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -76,7 +76,6 @@ extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
76extern const struct file_operations cifs_dir_ops; 76extern const struct file_operations cifs_dir_ops;
77extern int cifs_dir_open(struct inode *inode, struct file *file); 77extern int cifs_dir_open(struct inode *inode, struct file *file);
78extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir); 78extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
79extern int cifs_dir_notify(struct file *, unsigned long arg);
80 79
81/* Functions related to dir entries */ 80/* Functions related to dir entries */
82extern struct dentry_operations cifs_dentry_ops; 81extern struct dentry_operations cifs_dentry_ops;
diff --git a/fs/cifs/fcntl.c b/fs/cifs/fcntl.c
deleted file mode 100644
index 5a57581eb4b2..000000000000
--- a/fs/cifs/fcntl.c
+++ /dev/null
@@ -1,118 +0,0 @@
1/*
2 * fs/cifs/fcntl.c
3 *
4 * vfs operations that deal with the file control API
5 *
6 * Copyright (C) International Business Machines Corp., 2003,2004
7 * Author(s): Steve French (sfrench@us.ibm.com)
8 *
9 * This library is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU Lesser General Public License as published
11 * by the Free Software Foundation; either version 2.1 of the License, or
12 * (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
17 * the GNU Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public License
20 * along with this library; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
23#include <linux/fs.h>
24#include <linux/stat.h>
25#include <linux/fcntl.h>
26#include "cifsglob.h"
27#include "cifsproto.h"
28#include "cifs_unicode.h"
29#include "cifs_debug.h"
30#include "cifsfs.h"
31
32static __u32 convert_to_cifs_notify_flags(unsigned long fcntl_notify_flags)
33{
34 __u32 cifs_ntfy_flags = 0;
35
36 /* No way on Linux VFS to ask to monitor xattr
37 changes (and no stream support either) */
38 if (fcntl_notify_flags & DN_ACCESS)
39 cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_ACCESS;
40 if (fcntl_notify_flags & DN_MODIFY) {
41 /* What does this mean on directories? */
42 cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_WRITE |
43 FILE_NOTIFY_CHANGE_SIZE;
44 }
45 if (fcntl_notify_flags & DN_CREATE) {
46 cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_CREATION |
47 FILE_NOTIFY_CHANGE_LAST_WRITE;
48 }
49 if (fcntl_notify_flags & DN_DELETE)
50 cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_WRITE;
51 if (fcntl_notify_flags & DN_RENAME) {
52 /* BB review this - checking various server behaviors */
53 cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_DIR_NAME |
54 FILE_NOTIFY_CHANGE_FILE_NAME;
55 }
56 if (fcntl_notify_flags & DN_ATTRIB) {
57 cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_SECURITY |
58 FILE_NOTIFY_CHANGE_ATTRIBUTES;
59 }
60/* if (fcntl_notify_flags & DN_MULTISHOT) {
61 cifs_ntfy_flags |= ;
62 } */ /* BB fixme - not sure how to handle this with CIFS yet */
63
64 return cifs_ntfy_flags;
65}
66
67int cifs_dir_notify(struct file *file, unsigned long arg)
68{
69 int xid;
70 int rc = -EINVAL;
71 int oplock = 0;
72 struct cifs_sb_info *cifs_sb;
73 struct cifsTconInfo *pTcon;
74 char *full_path = NULL;
75 __u32 filter = FILE_NOTIFY_CHANGE_NAME | FILE_NOTIFY_CHANGE_ATTRIBUTES;
76 __u16 netfid;
77
78 if (experimEnabled == 0)
79 return 0;
80
81 xid = GetXid();
82 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
83 pTcon = cifs_sb->tcon;
84
85 full_path = build_path_from_dentry(file->f_path.dentry);
86
87 if (full_path == NULL) {
88 rc = -ENOMEM;
89 } else {
90 cFYI(1, ("dir notify on file %s Arg 0x%lx", full_path, arg));
91 rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
92 GENERIC_READ | SYNCHRONIZE, 0 /* create options */,
93 &netfid, &oplock, NULL, cifs_sb->local_nls,
94 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
95 /* BB fixme - add this handle to a notify handle list */
96 if (rc) {
97 cFYI(1, ("Could not open directory for notify"));
98 } else {
99 filter = convert_to_cifs_notify_flags(arg);
100 if (filter != 0) {
101 rc = CIFSSMBNotify(xid, pTcon,
102 0 /* no subdirs */, netfid,
103 filter, file, arg & DN_MULTISHOT,
104 cifs_sb->local_nls);
105 } else {
106 rc = -EINVAL;
107 }
108 /* BB add code to close file eventually (at unmount
109 it would close automatically but may be a way
110 to do it easily when inode freed or when
111 notify info is cleared/changed) */
112 cFYI(1, ("notify rc %d", rc));
113 }
114 }
115
116 FreeXid(xid);
117 return rc;
118}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index b1e1fc6a6e6a..12bb656fbe75 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2074,7 +2074,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
2074 2074
2075 cFYI(1, ("write_begin from %lld len %d", (long long)pos, len)); 2075 cFYI(1, ("write_begin from %lld len %d", (long long)pos, len));
2076 2076
2077 page = __grab_cache_page(mapping, index); 2077 page = grab_cache_page_write_begin(mapping, index, flags);
2078 if (!page) { 2078 if (!page) {
2079 rc = -ENOMEM; 2079 rc = -ENOMEM;
2080 goto out; 2080 goto out;
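grab_cache_page_write_begin() supersedes __grab_cache_page() so that the AOP flags handed to ->write_begin (notably AOP_FLAG_NOFS) reach the page allocation, and reclaim cannot recurse back into the filesystem. A sketch of the 2.6.29-era calling pattern; foo_write_begin is illustrative, not a function from this patch:

static int foo_write_begin(struct file *file, struct address_space *mapping,
			   loff_t pos, unsigned len, unsigned flags,
			   struct page **pagep, void **fsdata)
{
	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
	struct page *page;

	/* flags is forwarded so AOP_FLAG_NOFS affects the allocation */
	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;

	*pagep = page;
	return 0;
}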
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index f247da9f4edc..5ab9896fdcb2 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1641,7 +1641,7 @@ do_expand:
1641 i_size_write(inode, offset); 1641 i_size_write(inode, offset);
1642 spin_unlock(&inode->i_lock); 1642 spin_unlock(&inode->i_lock);
1643out_truncate: 1643out_truncate:
1644 if (inode->i_op && inode->i_op->truncate) 1644 if (inode->i_op->truncate)
1645 inode->i_op->truncate(inode); 1645 inode->i_op->truncate(inode);
1646 return 0; 1646 return 0;
1647out_sig: 1647out_sig:
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 466303db2df6..6a347fbc998a 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -201,8 +201,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
201int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync) 201int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
202{ 202{
203 struct file *host_file; 203 struct file *host_file;
204 struct dentry *host_dentry; 204 struct inode *coda_inode = coda_dentry->d_inode;
205 struct inode *host_inode, *coda_inode = coda_dentry->d_inode;
206 struct coda_file_info *cfi; 205 struct coda_file_info *cfi;
207 int err = 0; 206 int err = 0;
208 207
@@ -214,14 +213,7 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
214 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); 213 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
215 host_file = cfi->cfi_container; 214 host_file = cfi->cfi_container;
216 215
217 if (host_file->f_op && host_file->f_op->fsync) { 216 err = vfs_fsync(host_file, host_file->f_path.dentry, datasync);
218 host_dentry = host_file->f_path.dentry;
219 host_inode = host_dentry->d_inode;
220 mutex_lock(&host_inode->i_mutex);
221 err = host_file->f_op->fsync(host_file, host_dentry, datasync);
222 mutex_unlock(&host_inode->i_mutex);
223 }
224
225 if ( !err && !datasync ) { 217 if ( !err && !datasync ) {
226 lock_kernel(); 218 lock_kernel();
227 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); 219 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
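vfs_fsync() folds in what coda_fsync() used to open-code: checking that the host file implements ->fsync, taking i_mutex around the call, and also writing back dirty pagecache first. A simplified sketch of roughly what the helper does with the 2.6.29-era three-argument prototype (the real implementation lives in fs/sync.c and does more):

int vfs_fsync_sketch(struct file *file, struct dentry *dentry, int datasync)
{
	struct inode *inode = dentry->d_inode;
	int err = -EINVAL;

	if (file->f_op && file->f_op->fsync) {
		/* the real helper also flushes dirty pages beforehand */
		mutex_lock(&inode->i_mutex);
		err = file->f_op->fsync(file, dentry, datasync);
		mutex_unlock(&inode->i_mutex);
	}
	return err;
}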
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index 81b7771c6465..43c96ce29614 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -11,7 +11,9 @@
11 11
12#include "coda_int.h" 12#include "coda_int.h"
13 13
14#ifdef CONFIG_SYSCTL
14static struct ctl_table_header *fs_table_header; 15static struct ctl_table_header *fs_table_header;
16#endif
15 17
16static ctl_table coda_table[] = { 18static ctl_table coda_table[] = {
17 { 19 {
@@ -41,6 +43,7 @@ static ctl_table coda_table[] = {
41 {} 43 {}
42}; 44};
43 45
46#ifdef CONFIG_SYSCTL
44static ctl_table fs_table[] = { 47static ctl_table fs_table[] = {
45 { 48 {
46 .ctl_name = CTL_UNNUMBERED, 49 .ctl_name = CTL_UNNUMBERED,
@@ -50,7 +53,7 @@ static ctl_table fs_table[] = {
50 }, 53 },
51 {} 54 {}
52}; 55};
53 56#endif
54 57
55void coda_sysctl_init(void) 58void coda_sysctl_init(void)
56{ 59{
diff --git a/fs/compat.c b/fs/compat.c
index d1ece79b6411..30f2faa22f5c 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1187,6 +1187,9 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsign
1187 ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos); 1187 ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos);
1188 1188
1189out: 1189out:
1190 if (ret > 0)
1191 add_rchar(current, ret);
1192 inc_syscr(current);
1190 fput(file); 1193 fput(file);
1191 return ret; 1194 return ret;
1192} 1195}
@@ -1210,6 +1213,9 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsig
1210 ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos); 1213 ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos);
1211 1214
1212out: 1215out:
1216 if (ret > 0)
1217 add_wchar(current, ret);
1218 inc_syscw(current);
1213 fput(file); 1219 fput(file);
1214 return ret; 1220 return ret;
1215} 1221}
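These two hunks bring the compat readv/writev paths in line with the native ones, which already account per-task bytes and syscall counts; the numbers surface in /proc/<pid>/io. A quick userspace way to watch the counters (assumes the kernel was built with the task I/O accounting options):

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/io", "r");

	if (!f) {
		perror("fopen /proc/self/io");
		return 1;
	}
	/* prints rchar/wchar/syscr/syscw among others */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}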
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 4803ccc94480..5d349d38e056 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -117,8 +117,6 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
117static inline void set_default_inode_attr(struct inode * inode, mode_t mode) 117static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
118{ 118{
119 inode->i_mode = mode; 119 inode->i_mode = mode;
120 inode->i_uid = 0;
121 inode->i_gid = 0;
122 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 120 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
123} 121}
124 122
@@ -136,7 +134,6 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
136{ 134{
137 struct inode * inode = new_inode(configfs_sb); 135 struct inode * inode = new_inode(configfs_sb);
138 if (inode) { 136 if (inode) {
139 inode->i_blocks = 0;
140 inode->i_mapping->a_ops = &configfs_aops; 137 inode->i_mapping->a_ops = &configfs_aops;
141 inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; 138 inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
142 inode->i_op = &configfs_inode_operations; 139 inode->i_op = &configfs_inode_operations;
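The deleted assignments here (and in the cramfs, dcache and debugfs hunks below) are redundant: new_inode() hands back a fully zeroed inode, so i_uid, i_gid, i_blocks and i_size start at 0 and only non-default fields need touching. A minimal sketch of the resulting idiom (foo_new_inode is illustrative):

static struct inode *foo_new_inode(struct super_block *sb, mode_t mode)
{
	struct inode *inode = new_inode(sb);

	if (inode) {
		/* set only what differs from the zeroed defaults */
		inode->i_mode = mode;
		inode->i_atime = inode->i_mtime = inode->i_ctime =
			CURRENT_TIME;
	}
	return inode;
}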
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index f40423eb1a14..a07338d2d140 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -83,8 +83,6 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
83 inode->i_op = &page_symlink_inode_operations; 83 inode->i_op = &page_symlink_inode_operations;
84 inode->i_data.a_ops = &cramfs_aops; 84 inode->i_data.a_ops = &cramfs_aops;
85 } else { 85 } else {
86 inode->i_size = 0;
87 inode->i_blocks = 0;
88 init_special_inode(inode, inode->i_mode, 86 init_special_inode(inode, inode->i_mode,
89 old_decode_dev(cramfs_inode->size)); 87 old_decode_dev(cramfs_inode->size));
90 } 88 }
diff --git a/fs/dcache.c b/fs/dcache.c
index a1d86c7f3e66..4547f66884a0 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -34,7 +34,6 @@
34#include <linux/bootmem.h> 34#include <linux/bootmem.h>
35#include "internal.h" 35#include "internal.h"
36 36
37
38int sysctl_vfs_cache_pressure __read_mostly = 100; 37int sysctl_vfs_cache_pressure __read_mostly = 100;
39EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); 38EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
40 39
@@ -948,9 +947,6 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
948 dentry->d_op = NULL; 947 dentry->d_op = NULL;
949 dentry->d_fsdata = NULL; 948 dentry->d_fsdata = NULL;
950 dentry->d_mounted = 0; 949 dentry->d_mounted = 0;
951#ifdef CONFIG_PROFILING
952 dentry->d_cookie = NULL;
953#endif
954 INIT_HLIST_NODE(&dentry->d_hash); 950 INIT_HLIST_NODE(&dentry->d_hash);
955 INIT_LIST_HEAD(&dentry->d_lru); 951 INIT_LIST_HEAD(&dentry->d_lru);
956 INIT_LIST_HEAD(&dentry->d_subdirs); 952 INIT_LIST_HEAD(&dentry->d_subdirs);
@@ -1336,7 +1332,7 @@ err_out:
1336 * 1332 *
1337 * Searches the children of the parent dentry for the name in question. If 1333 * Searches the children of the parent dentry for the name in question. If
1338 * the dentry is found its reference count is incremented and the dentry 1334 * the dentry is found its reference count is incremented and the dentry
1339 * is returned. The caller must use d_put to free the entry when it has 1335 * is returned. The caller must use dput to free the entry when it has
1340 * finished using it. %NULL is returned on failure. 1336 * finished using it. %NULL is returned on failure.
1341 * 1337 *
1342 * __d_lookup is dcache_lock free. The hash list is protected using RCU. 1338 * __d_lookup is dcache_lock free. The hash list is protected using RCU.
@@ -1571,10 +1567,6 @@ void d_rehash(struct dentry * entry)
1571 spin_unlock(&dcache_lock); 1567 spin_unlock(&dcache_lock);
1572} 1568}
1573 1569
1574#define do_switch(x,y) do { \
1575 __typeof__ (x) __tmp = x; \
1576 x = y; y = __tmp; } while (0)
1577
1578/* 1570/*
1579 * When switching names, the actual string doesn't strictly have to 1571 * When switching names, the actual string doesn't strictly have to
1580 * be preserved in the target - because we're dropping the target 1572 * be preserved in the target - because we're dropping the target
@@ -1593,7 +1585,7 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
1593 /* 1585 /*
1594 * Both external: swap the pointers 1586 * Both external: swap the pointers
1595 */ 1587 */
1596 do_switch(target->d_name.name, dentry->d_name.name); 1588 swap(target->d_name.name, dentry->d_name.name);
1597 } else { 1589 } else {
1598 /* 1590 /*
1599 * dentry:internal, target:external. Steal target's 1591 * dentry:internal, target:external. Steal target's
@@ -1620,8 +1612,11 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
1620 */ 1612 */
1621 memcpy(dentry->d_iname, target->d_name.name, 1613 memcpy(dentry->d_iname, target->d_name.name,
1622 target->d_name.len + 1); 1614 target->d_name.len + 1);
1615 dentry->d_name.len = target->d_name.len;
1616 return;
1623 } 1617 }
1624 } 1618 }
1619 swap(dentry->d_name.len, target->d_name.len);
1625} 1620}
1626 1621
1627/* 1622/*
@@ -1681,8 +1676,7 @@ already_unhashed:
1681 1676
1682 /* Switch the names.. */ 1677 /* Switch the names.. */
1683 switch_names(dentry, target); 1678 switch_names(dentry, target);
1684 do_switch(dentry->d_name.len, target->d_name.len); 1679 swap(dentry->d_name.hash, target->d_name.hash);
1685 do_switch(dentry->d_name.hash, target->d_name.hash);
1686 1680
1687 /* ... and switch the parents */ 1681 /* ... and switch the parents */
1688 if (IS_ROOT(dentry)) { 1682 if (IS_ROOT(dentry)) {
@@ -1690,7 +1684,7 @@ already_unhashed:
1690 target->d_parent = target; 1684 target->d_parent = target;
1691 INIT_LIST_HEAD(&target->d_u.d_child); 1685 INIT_LIST_HEAD(&target->d_u.d_child);
1692 } else { 1686 } else {
1693 do_switch(dentry->d_parent, target->d_parent); 1687 swap(dentry->d_parent, target->d_parent);
1694 1688
1695 /* And add them back to the (new) parent lists */ 1689 /* And add them back to the (new) parent lists */
1696 list_add(&target->d_u.d_child, &target->d_parent->d_subdirs); 1690 list_add(&target->d_u.d_child, &target->d_parent->d_subdirs);
@@ -1791,8 +1785,7 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
1791 struct dentry *dparent, *aparent; 1785 struct dentry *dparent, *aparent;
1792 1786
1793 switch_names(dentry, anon); 1787 switch_names(dentry, anon);
1794 do_switch(dentry->d_name.len, anon->d_name.len); 1788 swap(dentry->d_name.hash, anon->d_name.hash);
1795 do_switch(dentry->d_name.hash, anon->d_name.hash);
1796 1789
1797 dparent = dentry->d_parent; 1790 dparent = dentry->d_parent;
1798 aparent = anon->d_parent; 1791 aparent = anon->d_parent;
@@ -1911,7 +1904,8 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
1911 * Convert a dentry into an ASCII path name. If the entry has been deleted 1904 * Convert a dentry into an ASCII path name. If the entry has been deleted
1912 * the string " (deleted)" is appended. Note that this is ambiguous. 1905 * the string " (deleted)" is appended. Note that this is ambiguous.
1913 * 1906 *
1914 * Returns the buffer or an error code if the path was too long. 1907 * Returns a pointer into the buffer or an error code if the
1908 * path was too long.
1915 * 1909 *
1916 * "buflen" should be positive. Caller holds the dcache_lock. 1910 * "buflen" should be positive. Caller holds the dcache_lock.
1917 * 1911 *
@@ -1987,7 +1981,10 @@ Elong:
1987 * Convert a dentry into an ASCII path name. If the entry has been deleted 1981 * Convert a dentry into an ASCII path name. If the entry has been deleted
1988 * the string " (deleted)" is appended. Note that this is ambiguous. 1982 * the string " (deleted)" is appended. Note that this is ambiguous.
1989 * 1983 *
1990 * Returns the buffer or an error code if the path was too long. 1984 * Returns a pointer into the buffer or an error code if the path was
1985 * too long. Note: Callers should use the returned pointer, not the passed
1986 * in buffer, to use the name! The implementation often starts at an offset
1987 * into the buffer, and may leave 0 bytes at the start.
1991 * 1988 *
1992 * "buflen" should be positive. 1989 * "buflen" should be positive.
1993 */ 1990 */
@@ -2313,9 +2310,6 @@ static void __init dcache_init(void)
2313/* SLAB cache for __getname() consumers */ 2310/* SLAB cache for __getname() consumers */
2314struct kmem_cache *names_cachep __read_mostly; 2311struct kmem_cache *names_cachep __read_mostly;
2315 2312
2316/* SLAB cache for file structures */
2317struct kmem_cache *filp_cachep __read_mostly;
2318
2319EXPORT_SYMBOL(d_genocide); 2313EXPORT_SYMBOL(d_genocide);
2320 2314
2321void __init vfs_caches_init_early(void) 2315void __init vfs_caches_init_early(void)
@@ -2337,9 +2331,6 @@ void __init vfs_caches_init(unsigned long mempages)
2337 names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0, 2331 names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0,
2338 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 2332 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2339 2333
2340 filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
2341 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2342
2343 dcache_init(); 2334 dcache_init();
2344 inode_init(); 2335 inode_init();
2345 files_init(mempages); 2336 files_init(mempages);
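The private do_switch() macro duplicated the generic swap() helper from <linux/kernel.h>, which works for pointers, lengths and hashes alike because typeof captures the operand type. A runnable userspace equivalent of the macro:

#include <stdio.h>

#define swap(a, b) \
	do { __typeof__(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

int main(void)
{
	const char *x = "dentry", *y = "target";
	unsigned int hx = 0x1234, hy = 0xabcd;

	swap(x, y);	/* pointer swap, as in switch_names() */
	swap(hx, hy);	/* integer swap, as for d_name.hash */

	printf("%s %s %x %x\n", x, y, hx, hy);
	return 0;
}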
diff --git a/fs/dcookies.c b/fs/dcookies.c
index 855d4b1d619a..180e9fec4ad8 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -93,10 +93,15 @@ static struct dcookie_struct *alloc_dcookie(struct path *path)
93{ 93{
94 struct dcookie_struct *dcs = kmem_cache_alloc(dcookie_cache, 94 struct dcookie_struct *dcs = kmem_cache_alloc(dcookie_cache,
95 GFP_KERNEL); 95 GFP_KERNEL);
96 struct dentry *d;
96 if (!dcs) 97 if (!dcs)
97 return NULL; 98 return NULL;
98 99
99 path->dentry->d_cookie = dcs; 100 d = path->dentry;
101 spin_lock(&d->d_lock);
102 d->d_flags |= DCACHE_COOKIE;
103 spin_unlock(&d->d_lock);
104
100 dcs->path = *path; 105 dcs->path = *path;
101 path_get(path); 106 path_get(path);
102 hash_dcookie(dcs); 107 hash_dcookie(dcs);
@@ -119,14 +124,14 @@ int get_dcookie(struct path *path, unsigned long *cookie)
119 goto out; 124 goto out;
120 } 125 }
121 126
122 dcs = path->dentry->d_cookie; 127 if (path->dentry->d_flags & DCACHE_COOKIE) {
123 128 dcs = find_dcookie((unsigned long)path->dentry);
124 if (!dcs) 129 } else {
125 dcs = alloc_dcookie(path); 130 dcs = alloc_dcookie(path);
126 131 if (!dcs) {
127 if (!dcs) { 132 err = -ENOMEM;
128 err = -ENOMEM; 133 goto out;
129 goto out; 134 }
130 } 135 }
131 136
132 *cookie = dcookie_value(dcs); 137 *cookie = dcookie_value(dcs);
@@ -251,7 +256,12 @@ out_kmem:
251 256
252static void free_dcookie(struct dcookie_struct * dcs) 257static void free_dcookie(struct dcookie_struct * dcs)
253{ 258{
254 dcs->path.dentry->d_cookie = NULL; 259 struct dentry *d = dcs->path.dentry;
260
261 spin_lock(&d->d_lock);
262 d->d_flags &= ~DCACHE_COOKIE;
263 spin_unlock(&d->d_lock);
264
255 path_put(&dcs->path); 265 path_put(&dcs->path);
256 kmem_cache_free(dcookie_cache, dcs); 266 kmem_cache_free(dcookie_cache, dcs);
257} 267}
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 159a5efd6a8a..33a90120f6ad 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -294,6 +294,38 @@ struct dentry *debugfs_create_x32(const char *name, mode_t mode,
294} 294}
295EXPORT_SYMBOL_GPL(debugfs_create_x32); 295EXPORT_SYMBOL_GPL(debugfs_create_x32);
296 296
297
298static int debugfs_size_t_set(void *data, u64 val)
299{
300 *(size_t *)data = val;
301 return 0;
302}
303static int debugfs_size_t_get(void *data, u64 *val)
304{
305 *val = *(size_t *)data;
306 return 0;
307}
308DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
309 "%llu\n"); /* %llu and %zu are more or less the same */
310
311/**
312 * debugfs_create_size_t - create a debugfs file that is used to read and write a size_t value
313 * @name: a pointer to a string containing the name of the file to create.
314 * @mode: the permission that the file should have
315 * @parent: a pointer to the parent dentry for this file. This should be a
316 * directory dentry if set. If this parameter is %NULL, then the
317 * file will be created in the root of the debugfs filesystem.
318 * @value: a pointer to the variable that the file should read from and
319 * write to.
320 */
321struct dentry *debugfs_create_size_t(const char *name, mode_t mode,
322 struct dentry *parent, size_t *value)
323{
324 return debugfs_create_file(name, mode, parent, value, &fops_size_t);
325}
326EXPORT_SYMBOL_GPL(debugfs_create_size_t);
327
328
297static ssize_t read_file_bool(struct file *file, char __user *user_buf, 329static ssize_t read_file_bool(struct file *file, char __user *user_buf,
298 size_t count, loff_t *ppos) 330 size_t count, loff_t *ppos)
299{ 331{
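debugfs_create_size_t() lets a driver expose a size_t variable directly instead of casting it through a u32 or u64 attribute. A hypothetical caller (the names are invented; in this era the helper returns NULL on failure, or an ERR_PTR when debugfs is not compiled in):

static size_t foo_cache_bytes;
static struct dentry *foo_dbg_file;

static int __init foo_debugfs_init(void)
{
	/* a NULL parent places the file in the debugfs root */
	foo_dbg_file = debugfs_create_size_t("cache_bytes", 0644, NULL,
					     &foo_cache_bytes);
	if (!foo_dbg_file)
		return -ENOMEM;
	return 0;
}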
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 3dbe2169cf36..81ae9ea3c6e1 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -37,9 +37,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
37 37
38 if (inode) { 38 if (inode) {
39 inode->i_mode = mode; 39 inode->i_mode = mode;
40 inode->i_uid = 0;
41 inode->i_gid = 0;
42 inode->i_blocks = 0;
43 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 40 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
44 switch (mode & S_IFMT) { 41 switch (mode & S_IFMT) {
45 default: 42 default:
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 5d61b7c06e13..5f3231b9633f 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -27,25 +27,32 @@
27#define DEVPTS_SUPER_MAGIC 0x1cd1 27#define DEVPTS_SUPER_MAGIC 0x1cd1
28 28
29#define DEVPTS_DEFAULT_MODE 0600 29#define DEVPTS_DEFAULT_MODE 0600
30/*
31 * ptmx is a new node in /dev/pts and will be unused in legacy (single-
32 * instance) mode. To prevent surprises in user space, set permissions of
33 * ptmx to 0. Use 'chmod' or remount with '-o ptmxmode' to set meaningful
34 * permissions.
35 */
36#define DEVPTS_DEFAULT_PTMX_MODE 0000
30#define PTMX_MINOR 2 37#define PTMX_MINOR 2
31 38
32extern int pty_limit; /* Config limit on Unix98 ptys */ 39extern int pty_limit; /* Config limit on Unix98 ptys */
33static DEFINE_IDA(allocated_ptys);
34static DEFINE_MUTEX(allocated_ptys_lock); 40static DEFINE_MUTEX(allocated_ptys_lock);
35 41
36static struct vfsmount *devpts_mnt; 42static struct vfsmount *devpts_mnt;
37static struct dentry *devpts_root;
38 43
39static struct { 44struct pts_mount_opts {
40 int setuid; 45 int setuid;
41 int setgid; 46 int setgid;
42 uid_t uid; 47 uid_t uid;
43 gid_t gid; 48 gid_t gid;
44 umode_t mode; 49 umode_t mode;
45} config = {.mode = DEVPTS_DEFAULT_MODE}; 50 umode_t ptmxmode;
51 int newinstance;
52};
46 53
47enum { 54enum {
48 Opt_uid, Opt_gid, Opt_mode, 55 Opt_uid, Opt_gid, Opt_mode, Opt_ptmxmode, Opt_newinstance,
49 Opt_err 56 Opt_err
50}; 57};
51 58
@@ -53,18 +60,50 @@ static const match_table_t tokens = {
53 {Opt_uid, "uid=%u"}, 60 {Opt_uid, "uid=%u"},
54 {Opt_gid, "gid=%u"}, 61 {Opt_gid, "gid=%u"},
55 {Opt_mode, "mode=%o"}, 62 {Opt_mode, "mode=%o"},
63#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
64 {Opt_ptmxmode, "ptmxmode=%o"},
65 {Opt_newinstance, "newinstance"},
66#endif
56 {Opt_err, NULL} 67 {Opt_err, NULL}
57}; 68};
58 69
59static int devpts_remount(struct super_block *sb, int *flags, char *data) 70struct pts_fs_info {
71 struct ida allocated_ptys;
72 struct pts_mount_opts mount_opts;
73 struct dentry *ptmx_dentry;
74};
75
76static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb)
77{
78 return sb->s_fs_info;
79}
80
81static inline struct super_block *pts_sb_from_inode(struct inode *inode)
82{
83#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
84 if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
85 return inode->i_sb;
86#endif
87 return devpts_mnt->mnt_sb;
88}
89
90#define PARSE_MOUNT 0
91#define PARSE_REMOUNT 1
92
93static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
60{ 94{
61 char *p; 95 char *p;
62 96
63 config.setuid = 0; 97 opts->setuid = 0;
64 config.setgid = 0; 98 opts->setgid = 0;
65 config.uid = 0; 99 opts->uid = 0;
66 config.gid = 0; 100 opts->gid = 0;
67 config.mode = DEVPTS_DEFAULT_MODE; 101 opts->mode = DEVPTS_DEFAULT_MODE;
102 opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
103
104 /* newinstance makes sense only on initial mount */
105 if (op == PARSE_MOUNT)
106 opts->newinstance = 0;
68 107
69 while ((p = strsep(&data, ",")) != NULL) { 108 while ((p = strsep(&data, ",")) != NULL) {
70 substring_t args[MAX_OPT_ARGS]; 109 substring_t args[MAX_OPT_ARGS];
@@ -79,20 +118,32 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
79 case Opt_uid: 118 case Opt_uid:
80 if (match_int(&args[0], &option)) 119 if (match_int(&args[0], &option))
81 return -EINVAL; 120 return -EINVAL;
82 config.uid = option; 121 opts->uid = option;
83 config.setuid = 1; 122 opts->setuid = 1;
84 break; 123 break;
85 case Opt_gid: 124 case Opt_gid:
86 if (match_int(&args[0], &option)) 125 if (match_int(&args[0], &option))
87 return -EINVAL; 126 return -EINVAL;
88 config.gid = option; 127 opts->gid = option;
89 config.setgid = 1; 128 opts->setgid = 1;
90 break; 129 break;
91 case Opt_mode: 130 case Opt_mode:
92 if (match_octal(&args[0], &option)) 131 if (match_octal(&args[0], &option))
93 return -EINVAL; 132 return -EINVAL;
94 config.mode = option & S_IALLUGO; 133 opts->mode = option & S_IALLUGO;
134 break;
135#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
136 case Opt_ptmxmode:
137 if (match_octal(&args[0], &option))
138 return -EINVAL;
139 opts->ptmxmode = option & S_IALLUGO;
140 break;
141 case Opt_newinstance:
142 /* newinstance makes sense only on initial mount */
143 if (op == PARSE_MOUNT)
144 opts->newinstance = 1;
95 break; 145 break;
146#endif
96 default: 147 default:
97 printk(KERN_ERR "devpts: called with bogus options\n"); 148 printk(KERN_ERR "devpts: called with bogus options\n");
98 return -EINVAL; 149 return -EINVAL;
@@ -102,13 +153,106 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
102 return 0; 153 return 0;
103} 154}
104 155
156#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
157static int mknod_ptmx(struct super_block *sb)
158{
159 int mode;
160 int rc = -ENOMEM;
161 struct dentry *dentry;
162 struct inode *inode;
163 struct dentry *root = sb->s_root;
164 struct pts_fs_info *fsi = DEVPTS_SB(sb);
165 struct pts_mount_opts *opts = &fsi->mount_opts;
166
167 mutex_lock(&root->d_inode->i_mutex);
168
169 /* If we have already created ptmx node, return */
170 if (fsi->ptmx_dentry) {
171 rc = 0;
172 goto out;
173 }
174
175 dentry = d_alloc_name(root, "ptmx");
176 if (!dentry) {
177 printk(KERN_NOTICE "Unable to alloc dentry for ptmx node\n");
178 goto out;
179 }
180
181 /*
182 * Create a new 'ptmx' node in this mount of devpts.
183 */
184 inode = new_inode(sb);
185 if (!inode) {
186 printk(KERN_ERR "Unable to alloc inode for ptmx node\n");
187 dput(dentry);
188 goto out;
189 }
190
191 inode->i_ino = 2;
192 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
193
194 mode = S_IFCHR|opts->ptmxmode;
195 init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2));
196
197 d_add(dentry, inode);
198
199 fsi->ptmx_dentry = dentry;
200 rc = 0;
201
202 printk(KERN_DEBUG "Created ptmx node in devpts ino %lu\n",
203 inode->i_ino);
204out:
205 mutex_unlock(&root->d_inode->i_mutex);
206 return rc;
207}
208
209static void update_ptmx_mode(struct pts_fs_info *fsi)
210{
211 struct inode *inode;
212 if (fsi->ptmx_dentry) {
213 inode = fsi->ptmx_dentry->d_inode;
214 inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode;
215 }
216}
217#else
218static inline void update_ptmx_mode(struct pts_fs_info *fsi)
219{
220 return;
221}
222#endif
223
224static int devpts_remount(struct super_block *sb, int *flags, char *data)
225{
226 int err;
227 struct pts_fs_info *fsi = DEVPTS_SB(sb);
228 struct pts_mount_opts *opts = &fsi->mount_opts;
229
230 err = parse_mount_options(data, PARSE_REMOUNT, opts);
231
232 /*
233 * parse_mount_options() restores options to default values
234 * before parsing and may have changed ptmxmode. So, update the
235 * mode in the inode too. Bogus options don't fail the remount,
236 * so do this even on error return.
237 */
238 update_ptmx_mode(fsi);
239
240 return err;
241}
242
105static int devpts_show_options(struct seq_file *seq, struct vfsmount *vfs) 243static int devpts_show_options(struct seq_file *seq, struct vfsmount *vfs)
106{ 244{
107 if (config.setuid) 245 struct pts_fs_info *fsi = DEVPTS_SB(vfs->mnt_sb);
108 seq_printf(seq, ",uid=%u", config.uid); 246 struct pts_mount_opts *opts = &fsi->mount_opts;
109 if (config.setgid) 247
110 seq_printf(seq, ",gid=%u", config.gid); 248 if (opts->setuid)
111 seq_printf(seq, ",mode=%03o", config.mode); 249 seq_printf(seq, ",uid=%u", opts->uid);
250 if (opts->setgid)
251 seq_printf(seq, ",gid=%u", opts->gid);
252 seq_printf(seq, ",mode=%03o", opts->mode);
253#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
254 seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode);
255#endif
112 256
113 return 0; 257 return 0;
114} 258}
@@ -119,10 +263,25 @@ static const struct super_operations devpts_sops = {
119 .show_options = devpts_show_options, 263 .show_options = devpts_show_options,
120}; 264};
121 265
266static void *new_pts_fs_info(void)
267{
268 struct pts_fs_info *fsi;
269
270 fsi = kzalloc(sizeof(struct pts_fs_info), GFP_KERNEL);
271 if (!fsi)
272 return NULL;
273
274 ida_init(&fsi->allocated_ptys);
275 fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE;
276 fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
277
278 return fsi;
279}
280
122static int 281static int
123devpts_fill_super(struct super_block *s, void *data, int silent) 282devpts_fill_super(struct super_block *s, void *data, int silent)
124{ 283{
125 struct inode * inode; 284 struct inode *inode;
126 285
127 s->s_blocksize = 1024; 286 s->s_blocksize = 1024;
128 s->s_blocksize_bits = 10; 287 s->s_blocksize_bits = 10;
@@ -130,39 +289,240 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
130 s->s_op = &devpts_sops; 289 s->s_op = &devpts_sops;
131 s->s_time_gran = 1; 290 s->s_time_gran = 1;
132 291
292 s->s_fs_info = new_pts_fs_info();
293 if (!s->s_fs_info)
294 goto fail;
295
133 inode = new_inode(s); 296 inode = new_inode(s);
134 if (!inode) 297 if (!inode)
135 goto fail; 298 goto free_fsi;
136 inode->i_ino = 1; 299 inode->i_ino = 1;
137 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 300 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
138 inode->i_blocks = 0;
139 inode->i_uid = inode->i_gid = 0;
140 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; 301 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
141 inode->i_op = &simple_dir_inode_operations; 302 inode->i_op = &simple_dir_inode_operations;
142 inode->i_fop = &simple_dir_operations; 303 inode->i_fop = &simple_dir_operations;
143 inode->i_nlink = 2; 304 inode->i_nlink = 2;
144 305
145 devpts_root = s->s_root = d_alloc_root(inode); 306 s->s_root = d_alloc_root(inode);
146 if (s->s_root) 307 if (s->s_root)
147 return 0; 308 return 0;
148 309
149 printk("devpts: get root dentry failed\n"); 310 printk(KERN_ERR "devpts: get root dentry failed\n");
150 iput(inode); 311 iput(inode);
312
313free_fsi:
314 kfree(s->s_fs_info);
151fail: 315fail:
152 return -ENOMEM; 316 return -ENOMEM;
153} 317}
154 318
319#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
320static int compare_init_pts_sb(struct super_block *s, void *p)
321{
322 if (devpts_mnt)
323 return devpts_mnt->mnt_sb == s;
324 return 0;
325}
326
327/*
328 * Safely parse the mount options in @data and update @opts.
329 *
330 * devpts ends up parsing options two times during mount, due to the
331 * two modes of operation it supports. The first parse occurs in
332 * devpts_get_sb() when determining the mode (single-instance or
333 * multi-instance mode). The second parse happens in devpts_remount()
334 * or new_pts_mount() depending on the mode.
335 *
336 * Parsing of options modifies the @data making subsequent parsing
337 * incorrect. So make a local copy of @data and parse it.
338 *
339 * Return: 0 On success, -errno on error
340 */
341static int safe_parse_mount_options(void *data, struct pts_mount_opts *opts)
342{
343 int rc;
344 void *datacp;
345
346 if (!data)
347 return 0;
348
349 /* Use kstrdup() ? */
350 datacp = kmalloc(PAGE_SIZE, GFP_KERNEL);
351 if (!datacp)
352 return -ENOMEM;
353
354 memcpy(datacp, data, PAGE_SIZE);
355 rc = parse_mount_options((char *)datacp, PARSE_MOUNT, opts);
356 kfree(datacp);
357
358 return rc;
359}
360
361/*
362 * Mount a new (private) instance of devpts. PTYs created in this
363 * instance are independent of the PTYs in other devpts instances.
364 */
365static int new_pts_mount(struct file_system_type *fs_type, int flags,
366 void *data, struct vfsmount *mnt)
367{
368 int err;
369 struct pts_fs_info *fsi;
370 struct pts_mount_opts *opts;
371
372 printk(KERN_NOTICE "devpts: newinstance mount\n");
373
374 err = get_sb_nodev(fs_type, flags, data, devpts_fill_super, mnt);
375 if (err)
376 return err;
377
378 fsi = DEVPTS_SB(mnt->mnt_sb);
379 opts = &fsi->mount_opts;
380
381 err = parse_mount_options(data, PARSE_MOUNT, opts);
382 if (err)
383 goto fail;
384
385 err = mknod_ptmx(mnt->mnt_sb);
386 if (err)
387 goto fail;
388
389 return 0;
390
391fail:
392 dput(mnt->mnt_sb->s_root);
393 deactivate_super(mnt->mnt_sb);
394 return err;
395}
396
397/*
398 * Check if 'newinstance' mount option was specified in @data.
399 *
400 * Return: -errno on error (eg: invalid mount options specified)
401 * : 1 if 'newinstance' mount option was specified
402 * : 0 if 'newinstance' mount option was NOT specified
403 */
404static int is_new_instance_mount(void *data)
405{
406 int rc;
407 struct pts_mount_opts opts;
408
409 if (!data)
410 return 0;
411
412 rc = safe_parse_mount_options(data, &opts);
413 if (!rc)
414 rc = opts.newinstance;
415
416 return rc;
417}
418
419/*
420 * get_init_pts_sb()
421 *
422 * This interface is needed to support multiple namespace semantics in
423 * devpts while preserving backward compatibility of the current 'single-
424 * namespace' semantics. i.e all mounts of devpts without the 'newinstance'
425 * mount option should bind to the initial kernel mount, like
426 * get_sb_single().
427 *
428 * Mounts with 'newinstance' option create a new private namespace.
429 *
430 * But for single-mount semantics, devpts cannot use get_sb_single(),
431 * because get_sb_single()/sget() find and use the super-block from
432 * the most recent mount of devpts. But that recent mount may be a
433 * 'newinstance' mount and get_sb_single() would pick the newinstance
434 * super-block instead of the initial super-block.
435 *
436 * This interface is identical to get_sb_single() except that it
437 * consistently selects the 'single-namespace' superblock even in the
438 * presence of the private namespace (i.e 'newinstance') super-blocks.
439 */
440static int get_init_pts_sb(struct file_system_type *fs_type, int flags,
441 void *data, struct vfsmount *mnt)
442{
443 struct super_block *s;
444 int error;
445
446 s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL);
447 if (IS_ERR(s))
448 return PTR_ERR(s);
449
450 if (!s->s_root) {
451 s->s_flags = flags;
452 error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
453 if (error) {
454 up_write(&s->s_umount);
455 deactivate_super(s);
456 return error;
457 }
458 s->s_flags |= MS_ACTIVE;
459 }
460 do_remount_sb(s, flags, data, 0);
461 return simple_set_mnt(mnt, s);
462}
463
464/*
465 * Mount or remount the initial kernel mount of devpts. This type of
466 * mount maintains the legacy, single-instance semantics, while the
467 * kernel still allows multiple-instances.
468 */
469static int init_pts_mount(struct file_system_type *fs_type, int flags,
470 void *data, struct vfsmount *mnt)
471{
472 int err;
473
474 err = get_init_pts_sb(fs_type, flags, data, mnt);
475 if (err)
476 return err;
477
478 err = mknod_ptmx(mnt->mnt_sb);
479 if (err) {
480 dput(mnt->mnt_sb->s_root);
481 deactivate_super(mnt->mnt_sb);
482 }
483
484 return err;
485}
486
155static int devpts_get_sb(struct file_system_type *fs_type, 487static int devpts_get_sb(struct file_system_type *fs_type,
156 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 488 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
157{ 489{
490 int new;
491
492 new = is_new_instance_mount(data);
493 if (new < 0)
494 return new;
495
496 if (new)
497 return new_pts_mount(fs_type, flags, data, mnt);
498
499 return init_pts_mount(fs_type, flags, data, mnt);
500}
501#else
502/*
503 * This supports only the legacy single-instance semantics (no
504 * multiple-instance semantics)
505 */
506static int devpts_get_sb(struct file_system_type *fs_type, int flags,
507 const char *dev_name, void *data, struct vfsmount *mnt)
508{
158 return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt); 509 return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt);
159} 510}
511#endif
512
513static void devpts_kill_sb(struct super_block *sb)
514{
515 struct pts_fs_info *fsi = DEVPTS_SB(sb);
516
517 kfree(fsi);
518 kill_litter_super(sb);
519}
160 520
161static struct file_system_type devpts_fs_type = { 521static struct file_system_type devpts_fs_type = {
162 .owner = THIS_MODULE, 522 .owner = THIS_MODULE,
163 .name = "devpts", 523 .name = "devpts",
164 .get_sb = devpts_get_sb, 524 .get_sb = devpts_get_sb,
165 .kill_sb = kill_anon_super, 525 .kill_sb = devpts_kill_sb,
166}; 526};
167 527
168/* 528/*
@@ -172,16 +532,17 @@ static struct file_system_type devpts_fs_type = {
172 532
173int devpts_new_index(struct inode *ptmx_inode) 533int devpts_new_index(struct inode *ptmx_inode)
174{ 534{
535 struct super_block *sb = pts_sb_from_inode(ptmx_inode);
536 struct pts_fs_info *fsi = DEVPTS_SB(sb);
175 int index; 537 int index;
176 int ida_ret; 538 int ida_ret;
177 539
178retry: 540retry:
179 if (!ida_pre_get(&allocated_ptys, GFP_KERNEL)) { 541 if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL))
180 return -ENOMEM; 542 return -ENOMEM;
181 }
182 543
183 mutex_lock(&allocated_ptys_lock); 544 mutex_lock(&allocated_ptys_lock);
184 ida_ret = ida_get_new(&allocated_ptys, &index); 545 ida_ret = ida_get_new(&fsi->allocated_ptys, &index);
185 if (ida_ret < 0) { 546 if (ida_ret < 0) {
186 mutex_unlock(&allocated_ptys_lock); 547 mutex_unlock(&allocated_ptys_lock);
187 if (ida_ret == -EAGAIN) 548 if (ida_ret == -EAGAIN)
@@ -190,7 +551,7 @@ retry:
190 } 551 }
191 552
192 if (index >= pty_limit) { 553 if (index >= pty_limit) {
193 ida_remove(&allocated_ptys, index); 554 ida_remove(&fsi->allocated_ptys, index);
194 mutex_unlock(&allocated_ptys_lock); 555 mutex_unlock(&allocated_ptys_lock);
195 return -EIO; 556 return -EIO;
196 } 557 }
@@ -200,18 +561,26 @@ retry:
200 561
201void devpts_kill_index(struct inode *ptmx_inode, int idx) 562void devpts_kill_index(struct inode *ptmx_inode, int idx)
202{ 563{
564 struct super_block *sb = pts_sb_from_inode(ptmx_inode);
565 struct pts_fs_info *fsi = DEVPTS_SB(sb);
566
203 mutex_lock(&allocated_ptys_lock); 567 mutex_lock(&allocated_ptys_lock);
204 ida_remove(&allocated_ptys, idx); 568 ida_remove(&fsi->allocated_ptys, idx);
205 mutex_unlock(&allocated_ptys_lock); 569 mutex_unlock(&allocated_ptys_lock);
206} 570}
207 571
208int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty) 572int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
209{ 573{
210 int number = tty->index; /* tty layer puts index from devpts_new_index() in here */ 574 /* tty layer puts index from devpts_new_index() in here */
575 int number = tty->index;
211 struct tty_driver *driver = tty->driver; 576 struct tty_driver *driver = tty->driver;
212 dev_t device = MKDEV(driver->major, driver->minor_start+number); 577 dev_t device = MKDEV(driver->major, driver->minor_start+number);
213 struct dentry *dentry; 578 struct dentry *dentry;
214 struct inode *inode = new_inode(devpts_mnt->mnt_sb); 579 struct super_block *sb = pts_sb_from_inode(ptmx_inode);
580 struct inode *inode = new_inode(sb);
581 struct dentry *root = sb->s_root;
582 struct pts_fs_info *fsi = DEVPTS_SB(sb);
583 struct pts_mount_opts *opts = &fsi->mount_opts;
215 char s[12]; 584 char s[12];
216 585
217 /* We're supposed to be given the slave end of a pty */ 586 /* We're supposed to be given the slave end of a pty */
@@ -221,25 +590,25 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
221 if (!inode) 590 if (!inode)
222 return -ENOMEM; 591 return -ENOMEM;
223 592
224 inode->i_ino = number+2; 593 inode->i_ino = number + 3;
225 inode->i_uid = config.setuid ? config.uid : current_fsuid(); 594 inode->i_uid = opts->setuid ? opts->uid : current_fsuid();
226 inode->i_gid = config.setgid ? config.gid : current_fsgid(); 595 inode->i_gid = opts->setgid ? opts->gid : current_fsgid();
227 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 596 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
228 init_special_inode(inode, S_IFCHR|config.mode, device); 597 init_special_inode(inode, S_IFCHR|opts->mode, device);
229 inode->i_private = tty; 598 inode->i_private = tty;
230 tty->driver_data = inode; 599 tty->driver_data = inode;
231 600
232 sprintf(s, "%d", number); 601 sprintf(s, "%d", number);
233 602
234 mutex_lock(&devpts_root->d_inode->i_mutex); 603 mutex_lock(&root->d_inode->i_mutex);
235 604
236 dentry = d_alloc_name(devpts_root, s); 605 dentry = d_alloc_name(root, s);
237 if (!IS_ERR(dentry)) { 606 if (!IS_ERR(dentry)) {
238 d_add(dentry, inode); 607 d_add(dentry, inode);
239 fsnotify_create(devpts_root->d_inode, dentry); 608 fsnotify_create(root->d_inode, dentry);
240 } 609 }
241 610
242 mutex_unlock(&devpts_root->d_inode->i_mutex); 611 mutex_unlock(&root->d_inode->i_mutex);
243 612
244 return 0; 613 return 0;
245} 614}
@@ -256,20 +625,27 @@ struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
256void devpts_pty_kill(struct tty_struct *tty) 625void devpts_pty_kill(struct tty_struct *tty)
257{ 626{
258 struct inode *inode = tty->driver_data; 627 struct inode *inode = tty->driver_data;
628 struct super_block *sb = pts_sb_from_inode(inode);
629 struct dentry *root = sb->s_root;
259 struct dentry *dentry; 630 struct dentry *dentry;
260 631
261 BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); 632 BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
262 633
263 mutex_lock(&devpts_root->d_inode->i_mutex); 634 mutex_lock(&root->d_inode->i_mutex);
264 635
265 dentry = d_find_alias(inode); 636 dentry = d_find_alias(inode);
266 if (dentry && !IS_ERR(dentry)) { 637 if (IS_ERR(dentry))
638 goto out;
639
640 if (dentry) {
267 inode->i_nlink--; 641 inode->i_nlink--;
268 d_delete(dentry); 642 d_delete(dentry);
269 dput(dentry); 643 dput(dentry); /* d_alloc_name() in devpts_pty_new() */
270 } 644 }
271 645
272 mutex_unlock(&devpts_root->d_inode->i_mutex); 646 dput(dentry); /* d_find_alias above */
647out:
648 mutex_unlock(&root->d_inode->i_mutex);
273} 649}
274 650
275static int __init init_devpts_fs(void) 651static int __init init_devpts_fs(void)
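With CONFIG_DEVPTS_MULTIPLE_INSTANCES, a mount passing 'newinstance' gets a private superblock, its own IDA of pty indices and its own ptmx node (mode 0000 by default, overridable via ptmxmode), while mounts without the option keep binding to the initial kernel mount exactly as before. A userspace sketch of creating a private instance (the path is illustrative and the caller needs CAP_SYS_ADMIN):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* /mnt/newpts must already exist */
	if (mount("devpts", "/mnt/newpts", "devpts", 0,
		  "newinstance,ptmxmode=0666") < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}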
diff --git a/fs/direct-io.c b/fs/direct-io.c
index af0558dbe8b7..b6d43908ff7a 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1209,6 +1209,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1209 retval = direct_io_worker(rw, iocb, inode, iov, offset, 1209 retval = direct_io_worker(rw, iocb, inode, iov, offset,
1210 nr_segs, blkbits, get_block, end_io, dio); 1210 nr_segs, blkbits, get_block, end_io, dio);
1211 1211
1212 /*
1213 * In case of error, an extending write may have instantiated a few
1214 * blocks outside i_size. Trim these off again for DIO_LOCKING.
1215 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this on
1216 * their own.
1217 */
1218 if (unlikely(retval < 0 && (rw & WRITE))) {
1219 loff_t isize = i_size_read(inode);
1220
1221 if (end > isize && dio_lock_type == DIO_LOCKING)
1222 vmtruncate(inode, isize);
1223 }
1224
1212 if (rw == READ && dio_lock_type == DIO_LOCKING) 1225 if (rw == READ && dio_lock_type == DIO_LOCKING)
1213 release_i_mutex = 0; 1226 release_i_mutex = 0;
1214 1227
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 8bf31e3fbf01..dc2ad6008b2d 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -33,10 +33,10 @@ void dlm_del_ast(struct dlm_lkb *lkb)
33 spin_unlock(&ast_queue_lock); 33 spin_unlock(&ast_queue_lock);
34} 34}
35 35
36void dlm_add_ast(struct dlm_lkb *lkb, int type) 36void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
37{ 37{
38 if (lkb->lkb_flags & DLM_IFL_USER) { 38 if (lkb->lkb_flags & DLM_IFL_USER) {
39 dlm_user_add_ast(lkb, type); 39 dlm_user_add_ast(lkb, type, bastmode);
40 return; 40 return;
41 } 41 }
42 42
@@ -46,6 +46,8 @@ void dlm_add_ast(struct dlm_lkb *lkb, int type)
46 list_add_tail(&lkb->lkb_astqueue, &ast_queue); 46 list_add_tail(&lkb->lkb_astqueue, &ast_queue);
47 } 47 }
48 lkb->lkb_ast_type |= type; 48 lkb->lkb_ast_type |= type;
49 if (bastmode)
50 lkb->lkb_bastmode = bastmode;
49 spin_unlock(&ast_queue_lock); 51 spin_unlock(&ast_queue_lock);
50 52
51 set_bit(WAKE_ASTS, &astd_wakeflags); 53 set_bit(WAKE_ASTS, &astd_wakeflags);
@@ -59,50 +61,40 @@ static void process_asts(void)
59 struct dlm_lkb *lkb; 61 struct dlm_lkb *lkb;
60 void (*cast) (void *astparam); 62 void (*cast) (void *astparam);
61 void (*bast) (void *astparam, int mode); 63 void (*bast) (void *astparam, int mode);
62 int type = 0, found, bmode; 64 int type = 0, bastmode;
63 65
64 for (;;) { 66repeat:
65 found = 0; 67 spin_lock(&ast_queue_lock);
66 spin_lock(&ast_queue_lock); 68 list_for_each_entry(lkb, &ast_queue, lkb_astqueue) {
67 list_for_each_entry(lkb, &ast_queue, lkb_astqueue) { 69 r = lkb->lkb_resource;
68 r = lkb->lkb_resource; 70 ls = r->res_ls;
69 ls = r->res_ls; 71
70 72 if (dlm_locking_stopped(ls))
71 if (dlm_locking_stopped(ls)) 73 continue;
72 continue;
73
74 list_del(&lkb->lkb_astqueue);
75 type = lkb->lkb_ast_type;
76 lkb->lkb_ast_type = 0;
77 found = 1;
78 break;
79 }
80 spin_unlock(&ast_queue_lock);
81 74
82 if (!found) 75 list_del(&lkb->lkb_astqueue);
83 break; 76 type = lkb->lkb_ast_type;
77 lkb->lkb_ast_type = 0;
78 bastmode = lkb->lkb_bastmode;
84 79
80 spin_unlock(&ast_queue_lock);
85 cast = lkb->lkb_astfn; 81 cast = lkb->lkb_astfn;
86 bast = lkb->lkb_bastfn; 82 bast = lkb->lkb_bastfn;
87 bmode = lkb->lkb_bastmode;
88 83
89 if ((type & AST_COMP) && cast) 84 if ((type & AST_COMP) && cast)
90 cast(lkb->lkb_astparam); 85 cast(lkb->lkb_astparam);
91 86
92 /* FIXME: Is it safe to look at lkb_grmode here
93 without doing a lock_rsb() ?
94 Look at other checks in v1 to avoid basts. */
95
96 if ((type & AST_BAST) && bast) 87 if ((type & AST_BAST) && bast)
97 if (!dlm_modes_compat(lkb->lkb_grmode, bmode)) 88 bast(lkb->lkb_astparam, bastmode);
98 bast(lkb->lkb_astparam, bmode);
99 89
100 /* this removes the reference added by dlm_add_ast 90 /* this removes the reference added by dlm_add_ast
101 and may result in the lkb being freed */ 91 and may result in the lkb being freed */
102 dlm_put_lkb(lkb); 92 dlm_put_lkb(lkb);
103 93
104 schedule(); 94 cond_resched();
95 goto repeat;
105 } 96 }
97 spin_unlock(&ast_queue_lock);
106} 98}
107 99
108static inline int no_asts(void) 100static inline int no_asts(void)
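Latching lkb_bastmode under ast_queue_lock at queue time means the delivery thread no longer reads a mode that may have changed since the bast was queued; callers now hand the mode to dlm_add_ast() with the request. A sketch of the updated call sites, inferred from the new prototype (pass 0 for completion ASTs, where no blocking mode applies):

	/* completion AST: no blocking mode involved */
	dlm_add_ast(lkb, AST_COMP, 0);

	/* blocking AST: record the contending request's mode */
	dlm_add_ast(lkb, AST_BAST, rqmode);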
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index 6ee276c74c52..1b5fc5f428fd 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -13,7 +13,7 @@
13#ifndef __ASTD_DOT_H__ 13#ifndef __ASTD_DOT_H__
14#define __ASTD_DOT_H__ 14#define __ASTD_DOT_H__
15 15
16void dlm_add_ast(struct dlm_lkb *lkb, int type); 16void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode);
17void dlm_del_ast(struct dlm_lkb *lkb); 17void dlm_del_ast(struct dlm_lkb *lkb);
18 18
19void dlm_astd_wake(void); 19void dlm_astd_wake(void);
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 8fc24f4507a3..2f107d1a6a45 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -27,7 +27,7 @@ static struct dentry *dlm_root;
27 27
28struct rsb_iter { 28struct rsb_iter {
29 int entry; 29 int entry;
30 int locks; 30 int format;
31 int header; 31 int header;
32 struct dlm_ls *ls; 32 struct dlm_ls *ls;
33 struct list_head *next; 33 struct list_head *next;
@@ -60,8 +60,8 @@ static char *print_lockmode(int mode)
60 } 60 }
61} 61}
62 62
63static void print_resource_lock(struct seq_file *s, struct dlm_lkb *lkb, 63static void print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb,
64 struct dlm_rsb *res) 64 struct dlm_rsb *res)
65{ 65{
66 seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode)); 66 seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
67 67
@@ -83,7 +83,7 @@ static void print_resource_lock(struct seq_file *s, struct dlm_lkb *lkb,
83 seq_printf(s, "\n"); 83 seq_printf(s, "\n");
84} 84}
85 85
86static int print_resource(struct dlm_rsb *res, struct seq_file *s) 86static int print_format1(struct dlm_rsb *res, struct seq_file *s)
87{ 87{
88 struct dlm_lkb *lkb; 88 struct dlm_lkb *lkb;
89 int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list; 89 int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list;
@@ -134,15 +134,15 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s)
134 /* Print the locks attached to this resource */ 134 /* Print the locks attached to this resource */
135 seq_printf(s, "Granted Queue\n"); 135 seq_printf(s, "Granted Queue\n");
136 list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue) 136 list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue)
137 print_resource_lock(s, lkb, res); 137 print_format1_lock(s, lkb, res);
138 138
139 seq_printf(s, "Conversion Queue\n"); 139 seq_printf(s, "Conversion Queue\n");
140 list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue) 140 list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue)
141 print_resource_lock(s, lkb, res); 141 print_format1_lock(s, lkb, res);
142 142
143 seq_printf(s, "Waiting Queue\n"); 143 seq_printf(s, "Waiting Queue\n");
144 list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue) 144 list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue)
145 print_resource_lock(s, lkb, res); 145 print_format1_lock(s, lkb, res);
146 146
147 if (list_empty(&res->res_lookup)) 147 if (list_empty(&res->res_lookup))
148 goto out; 148 goto out;
@@ -160,23 +160,24 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s)
160 return 0; 160 return 0;
161} 161}
162 162
163static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, struct dlm_rsb *r) 163static void print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb,
164 struct dlm_rsb *r)
164{ 165{
165 unsigned int waiting = 0; 166 u64 xid = 0;
166 uint64_t xid = 0; 167 u64 us;
167 168
168 if (lkb->lkb_flags & DLM_IFL_USER) { 169 if (lkb->lkb_flags & DLM_IFL_USER) {
169 if (lkb->lkb_ua) 170 if (lkb->lkb_ua)
170 xid = lkb->lkb_ua->xid; 171 xid = lkb->lkb_ua->xid;
171 } 172 }
172 173
173 if (lkb->lkb_timestamp) 174 /* microseconds since lkb was added to current queue */
174 waiting = jiffies_to_msecs(jiffies - lkb->lkb_timestamp); 175 us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_timestamp));
175 176
176 /* id nodeid remid pid xid exflags flags sts grmode rqmode time_ms 177 /* id nodeid remid pid xid exflags flags sts grmode rqmode time_us
177 r_nodeid r_len r_name */ 178 r_nodeid r_len r_name */
178 179
179 seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %u %u %d \"%s\"\n", 180 seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %llu %u %d \"%s\"\n",
180 lkb->lkb_id, 181 lkb->lkb_id,
181 lkb->lkb_nodeid, 182 lkb->lkb_nodeid,
182 lkb->lkb_remid, 183 lkb->lkb_remid,
@@ -187,26 +188,114 @@ static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, struct dlm_rsb *
187 lkb->lkb_status, 188 lkb->lkb_status,
188 lkb->lkb_grmode, 189 lkb->lkb_grmode,
189 lkb->lkb_rqmode, 190 lkb->lkb_rqmode,
190 waiting, 191 (unsigned long long)us,
191 r->res_nodeid, 192 r->res_nodeid,
192 r->res_length, 193 r->res_length,
193 r->res_name); 194 r->res_name);
194} 195}
195 196
196static int print_locks(struct dlm_rsb *r, struct seq_file *s) 197static int print_format2(struct dlm_rsb *r, struct seq_file *s)
197{ 198{
198 struct dlm_lkb *lkb; 199 struct dlm_lkb *lkb;
199 200
200 lock_rsb(r); 201 lock_rsb(r);
201 202
202 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) 203 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
203 print_lock(s, lkb, r); 204 print_format2_lock(s, lkb, r);
204 205
205 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) 206 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
206 print_lock(s, lkb, r); 207 print_format2_lock(s, lkb, r);
207 208
208 list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) 209 list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
209 print_lock(s, lkb, r); 210 print_format2_lock(s, lkb, r);
211
212 unlock_rsb(r);
213 return 0;
214}
215
216static void print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
217 int rsb_lookup)
218{
219 u64 xid = 0;
220
221 if (lkb->lkb_flags & DLM_IFL_USER) {
222 if (lkb->lkb_ua)
223 xid = lkb->lkb_ua->xid;
224 }
225
226 seq_printf(s, "lkb %x %d %x %u %llu %x %x %d %d %d %d %d %d %u %llu %llu\n",
227 lkb->lkb_id,
228 lkb->lkb_nodeid,
229 lkb->lkb_remid,
230 lkb->lkb_ownpid,
231 (unsigned long long)xid,
232 lkb->lkb_exflags,
233 lkb->lkb_flags,
234 lkb->lkb_status,
235 lkb->lkb_grmode,
236 lkb->lkb_rqmode,
237 lkb->lkb_highbast,
238 rsb_lookup,
239 lkb->lkb_wait_type,
240 lkb->lkb_lvbseq,
241 (unsigned long long)ktime_to_ns(lkb->lkb_timestamp),
242 (unsigned long long)ktime_to_ns(lkb->lkb_time_bast));
243}
244
245static int print_format3(struct dlm_rsb *r, struct seq_file *s)
246{
247 struct dlm_lkb *lkb;
248 int i, lvblen = r->res_ls->ls_lvblen;
249 int print_name = 1;
250
251 lock_rsb(r);
252
253 seq_printf(s, "rsb %p %d %x %lx %d %d %u %d ",
254 r,
255 r->res_nodeid,
256 r->res_first_lkid,
257 r->res_flags,
258 !list_empty(&r->res_root_list),
259 !list_empty(&r->res_recover_list),
260 r->res_recover_locks_count,
261 r->res_length);
262
263 for (i = 0; i < r->res_length; i++) {
264 if (!isascii(r->res_name[i]) || !isprint(r->res_name[i]))
265 print_name = 0;
266 }
267
268 seq_printf(s, "%s", print_name ? "str " : "hex");
269
270 for (i = 0; i < r->res_length; i++) {
271 if (print_name)
272 seq_printf(s, "%c", r->res_name[i]);
273 else
274 seq_printf(s, " %02x", (unsigned char)r->res_name[i]);
275 }
276 seq_printf(s, "\n");
277
278 if (!r->res_lvbptr)
279 goto do_locks;
280
281 seq_printf(s, "lvb %u %d", r->res_lvbseq, lvblen);
282
283 for (i = 0; i < lvblen; i++)
284 seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]);
285 seq_printf(s, "\n");
286
287 do_locks:
288 list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
289 print_format3_lock(s, lkb, 0);
290
291 list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
292 print_format3_lock(s, lkb, 0);
293
294 list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
295 print_format3_lock(s, lkb, 0);
296
297 list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup)
298 print_format3_lock(s, lkb, 1);
210 299
211 unlock_rsb(r); 300 unlock_rsb(r);
212 return 0; 301 return 0;
@@ -231,7 +320,7 @@ static int rsb_iter_next(struct rsb_iter *ri)
231 break; 320 break;
232 } 321 }
233 read_unlock(&ls->ls_rsbtbl[i].lock); 322 read_unlock(&ls->ls_rsbtbl[i].lock);
234 } 323 }
235 ri->entry = i; 324 ri->entry = i;
236 325
237 if (ri->entry >= ls->ls_rsbtbl_size) 326 if (ri->entry >= ls->ls_rsbtbl_size)
@@ -248,7 +337,7 @@ static int rsb_iter_next(struct rsb_iter *ri)
248 read_unlock(&ls->ls_rsbtbl[i].lock); 337 read_unlock(&ls->ls_rsbtbl[i].lock);
249 dlm_put_rsb(old); 338 dlm_put_rsb(old);
250 goto top; 339 goto top;
251 } 340 }
252 ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain); 341 ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
253 dlm_hold_rsb(ri->rsb); 342 dlm_hold_rsb(ri->rsb);
254 read_unlock(&ls->ls_rsbtbl[i].lock); 343 read_unlock(&ls->ls_rsbtbl[i].lock);
@@ -274,6 +363,7 @@ static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
274 ri->ls = ls; 363 ri->ls = ls;
275 ri->entry = 0; 364 ri->entry = 0;
276 ri->next = NULL; 365 ri->next = NULL;
366 ri->format = 1;
277 367
278 if (rsb_iter_next(ri)) { 368 if (rsb_iter_next(ri)) {
279 rsb_iter_free(ri); 369 rsb_iter_free(ri);
@@ -325,16 +415,26 @@ static int rsb_seq_show(struct seq_file *file, void *iter_ptr)
325{ 415{
326 struct rsb_iter *ri = iter_ptr; 416 struct rsb_iter *ri = iter_ptr;
327 417
328 if (ri->locks) { 418 switch (ri->format) {
419 case 1:
420 print_format1(ri->rsb, file);
421 break;
422 case 2:
329 if (ri->header) { 423 if (ri->header) {
330 seq_printf(file, "id nodeid remid pid xid exflags flags " 424 seq_printf(file, "id nodeid remid pid xid exflags "
331 "sts grmode rqmode time_ms r_nodeid " 425 "flags sts grmode rqmode time_ms "
332 "r_len r_name\n"); 426 "r_nodeid r_len r_name\n");
333 ri->header = 0; 427 ri->header = 0;
334 } 428 }
335 print_locks(ri->rsb, file); 429 print_format2(ri->rsb, file);
336 } else { 430 break;
337 print_resource(ri->rsb, file); 431 case 3:
432 if (ri->header) {
433 seq_printf(file, "version rsb 1.1 lvb 1.1 lkb 1.1\n");
434 ri->header = 0;
435 }
436 print_format3(ri->rsb, file);
437 break;
338 } 438 }
339 439
340 return 0; 440 return 0;
@@ -385,7 +485,7 @@ static struct rsb_iter *locks_iter_init(struct dlm_ls *ls, loff_t *pos)
385 ri->ls = ls; 485 ri->ls = ls;
386 ri->entry = 0; 486 ri->entry = 0;
387 ri->next = NULL; 487 ri->next = NULL;
388 ri->locks = 1; 488 ri->format = 2;
389 489
390 if (*pos == 0) 490 if (*pos == 0)
391 ri->header = 1; 491 ri->header = 1;
@@ -448,6 +548,84 @@ static const struct file_operations locks_fops = {
448}; 548};
449 549
450/* 550/*
551 * Dump all rsb/lvb/lkb state in compact listing, more complete than _locks
552 * This can replace both formats 1 and 2 eventually.
553 */
554
555static struct rsb_iter *all_iter_init(struct dlm_ls *ls, loff_t *pos)
556{
557 struct rsb_iter *ri;
558
559 ri = kzalloc(sizeof *ri, GFP_KERNEL);
560 if (!ri)
561 return NULL;
562
563 ri->ls = ls;
564 ri->entry = 0;
565 ri->next = NULL;
566 ri->format = 3;
567
568 if (*pos == 0)
569 ri->header = 1;
570
571 if (rsb_iter_next(ri)) {
572 rsb_iter_free(ri);
573 return NULL;
574 }
575
576 return ri;
577}
578
579static void *all_seq_start(struct seq_file *file, loff_t *pos)
580{
581 struct rsb_iter *ri;
582 loff_t n = *pos;
583
584 ri = all_iter_init(file->private, pos);
585 if (!ri)
586 return NULL;
587
588 while (n--) {
589 if (rsb_iter_next(ri)) {
590 rsb_iter_free(ri);
591 return NULL;
592 }
593 }
594
595 return ri;
596}
597
598static struct seq_operations all_seq_ops = {
599 .start = all_seq_start,
600 .next = rsb_seq_next,
601 .stop = rsb_seq_stop,
602 .show = rsb_seq_show,
603};
604
605static int all_open(struct inode *inode, struct file *file)
606{
607 struct seq_file *seq;
608 int ret;
609
610 ret = seq_open(file, &all_seq_ops);
611 if (ret)
612 return ret;
613
614 seq = file->private_data;
615 seq->private = inode->i_private;
616
617 return 0;
618}
619
620static const struct file_operations all_fops = {
621 .owner = THIS_MODULE,
622 .open = all_open,
623 .read = seq_read,
624 .llseek = seq_lseek,
625 .release = seq_release
626};
627
628/*
451 * dump lkb's on the ls_waiters list 629 * dump lkb's on the ls_waiters list
452 */ 630 */
453 631
@@ -489,30 +667,33 @@ static const struct file_operations waiters_fops = {
489 .read = waiters_read 667 .read = waiters_read
490}; 668};
491 669
670void dlm_delete_debug_file(struct dlm_ls *ls)
671{
672 if (ls->ls_debug_rsb_dentry)
673 debugfs_remove(ls->ls_debug_rsb_dentry);
674 if (ls->ls_debug_waiters_dentry)
675 debugfs_remove(ls->ls_debug_waiters_dentry);
676 if (ls->ls_debug_locks_dentry)
677 debugfs_remove(ls->ls_debug_locks_dentry);
678 if (ls->ls_debug_all_dentry)
679 debugfs_remove(ls->ls_debug_all_dentry);
680}
681
492int dlm_create_debug_file(struct dlm_ls *ls) 682int dlm_create_debug_file(struct dlm_ls *ls)
493{ 683{
494 char name[DLM_LOCKSPACE_LEN+8]; 684 char name[DLM_LOCKSPACE_LEN+8];
495 685
686 /* format 1 */
687
496 ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name, 688 ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name,
497 S_IFREG | S_IRUGO, 689 S_IFREG | S_IRUGO,
498 dlm_root, 690 dlm_root,
499 ls, 691 ls,
500 &rsb_fops); 692 &rsb_fops);
501 if (!ls->ls_debug_rsb_dentry) 693 if (!ls->ls_debug_rsb_dentry)
502 return -ENOMEM; 694 goto fail;
503 695
504 memset(name, 0, sizeof(name)); 696 /* format 2 */
505 snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name);
506
507 ls->ls_debug_waiters_dentry = debugfs_create_file(name,
508 S_IFREG | S_IRUGO,
509 dlm_root,
510 ls,
511 &waiters_fops);
512 if (!ls->ls_debug_waiters_dentry) {
513 debugfs_remove(ls->ls_debug_rsb_dentry);
514 return -ENOMEM;
515 }
516 697
517 memset(name, 0, sizeof(name)); 698 memset(name, 0, sizeof(name));
518 snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_locks", ls->ls_name); 699 snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_locks", ls->ls_name);
@@ -522,23 +703,38 @@ int dlm_create_debug_file(struct dlm_ls *ls)
522 dlm_root, 703 dlm_root,
523 ls, 704 ls,
524 &locks_fops); 705 &locks_fops);
525 if (!ls->ls_debug_locks_dentry) { 706 if (!ls->ls_debug_locks_dentry)
526 debugfs_remove(ls->ls_debug_waiters_dentry); 707 goto fail;
527 debugfs_remove(ls->ls_debug_rsb_dentry); 708
528 return -ENOMEM; 709 /* format 3 */
529 } 710
711 memset(name, 0, sizeof(name));
712 snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_all", ls->ls_name);
713
714 ls->ls_debug_all_dentry = debugfs_create_file(name,
715 S_IFREG | S_IRUGO,
716 dlm_root,
717 ls,
718 &all_fops);
719 if (!ls->ls_debug_all_dentry)
720 goto fail;
721
722 memset(name, 0, sizeof(name));
723 snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name);
724
725 ls->ls_debug_waiters_dentry = debugfs_create_file(name,
726 S_IFREG | S_IRUGO,
727 dlm_root,
728 ls,
729 &waiters_fops);
730 if (!ls->ls_debug_waiters_dentry)
731 goto fail;
530 732
531 return 0; 733 return 0;
532}
533 734
534void dlm_delete_debug_file(struct dlm_ls *ls) 735 fail:
535{ 736 dlm_delete_debug_file(ls);
536 if (ls->ls_debug_rsb_dentry) 737 return -ENOMEM;
537 debugfs_remove(ls->ls_debug_rsb_dentry);
538 if (ls->ls_debug_waiters_dentry)
539 debugfs_remove(ls->ls_debug_waiters_dentry);
540 if (ls->ls_debug_locks_dentry)
541 debugfs_remove(ls->ls_debug_locks_dentry);
542} 738}
543 739
544int __init dlm_register_debugfs(void) 740int __init dlm_register_debugfs(void)
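
The debug.c rework above ends with a single error path: dlm_create_debug_file() now creates four debugfs entries (the format-1 rsb file, the format-2 "_locks" file, the new format-3 "_all" file, and "_waiters"), and any failed debugfs_create_file() jumps to one fail label that calls dlm_delete_debug_file(), which checks each dentry before removing it and is therefore safe on partially-created state. A minimal userspace sketch of that goto-fail pattern, with calloc/free standing in for debugfs_create_file/debugfs_remove (struct and helper names are invented):

#include <stdlib.h>

struct dbg_files {
        void *rsb;      /* stands in for ls_debug_rsb_dentry */
        void *locks;    /* stands in for ls_debug_locks_dentry */
        void *all;      /* stands in for ls_debug_all_dentry */
};

static void delete_dbg_files(struct dbg_files *d)
{
        /* one teardown path, safe on partially-built state
         * (free(NULL) is a no-op, like the NULL checks in
         * dlm_delete_debug_file()) */
        free(d->rsb);
        free(d->locks);
        free(d->all);
}

static int create_dbg_files(struct dbg_files *d)
{
        d->rsb = calloc(1, 16);
        if (!d->rsb)
                goto fail;
        d->locks = calloc(1, 16);
        if (!d->locks)
                goto fail;
        d->all = calloc(1, 16);
        if (!d->all)
                goto fail;
        return 0;
fail:
        delete_dbg_files(d);
        return -1;      /* the kernel code returns -ENOMEM */
}

int main(void)
{
        struct dbg_files d = { 0 };

        return create_dbg_files(&d);
}

Centralizing teardown this way removes the duplicated, order-sensitive debugfs_remove() calls that the old error paths carried.
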
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 85defeb64df4..92969f879a17 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -374,7 +374,7 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
374 struct list_head *list; 374 struct list_head *list;
375 struct dlm_rsb *r; 375 struct dlm_rsb *r;
376 int offset = 0, dir_nodeid; 376 int offset = 0, dir_nodeid;
377 uint16_t be_namelen; 377 __be16 be_namelen;
378 378
379 down_read(&ls->ls_root_sem); 379 down_read(&ls->ls_root_sem);
380 380
@@ -410,15 +410,15 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
410 410
411 if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) { 411 if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) {
412 /* Write end-of-block record */ 412 /* Write end-of-block record */
413 be_namelen = 0; 413 be_namelen = cpu_to_be16(0);
414 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t)); 414 memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
415 offset += sizeof(uint16_t); 415 offset += sizeof(__be16);
416 goto out; 416 goto out;
417 } 417 }
418 418
419 be_namelen = cpu_to_be16(r->res_length); 419 be_namelen = cpu_to_be16(r->res_length);
420 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t)); 420 memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
421 offset += sizeof(uint16_t); 421 offset += sizeof(__be16);
422 memcpy(outbuf + offset, r->res_name, r->res_length); 422 memcpy(outbuf + offset, r->res_name, r->res_length);
423 offset += r->res_length; 423 offset += r->res_length;
424 } 424 }
@@ -430,9 +430,9 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
430 430
431 if ((list == &ls->ls_root_list) && 431 if ((list == &ls->ls_root_list) &&
432 (offset + sizeof(uint16_t) <= outlen)) { 432 (offset + sizeof(uint16_t) <= outlen)) {
433 be_namelen = 0xFFFF; 433 be_namelen = cpu_to_be16(0xFFFF);
434 memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t)); 434 memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
435 offset += sizeof(uint16_t); 435 offset += sizeof(__be16);
436 } 436 }
437 437
438 out: 438 out:
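
The dir.c hunks are an endianness-annotation cleanup rather than a behavior change: the 16-bit length words written into the master-names buffer were already big-endian on the wire, and switching be_namelen to __be16 with explicit cpu_to_be16() lets sparse verify that no native-endian value leaks in (the old code stored 0 and 0xFFFF unconverted, which only happened to be byte-order invariant). A standalone sketch of the resulting layout, with the byte swap written out by hand (put_be16 is illustrative, not a kernel helper):

#include <stdint.h>
#include <stdio.h>

static size_t put_be16(unsigned char *buf, size_t off, uint16_t v)
{
        buf[off]     = v >> 8;          /* most significant byte first */
        buf[off + 1] = v & 0xff;
        return off + 2;
}

int main(void)
{
        unsigned char out[4];
        size_t off = 0;

        off = put_be16(out, off, 5);            /* a record length */
        off = put_be16(out, off, 0xFFFF);       /* end-of-list marker */
        printf("%02x %02x %02x %02x\n", out[0], out[1], out[2], out[3]);
        return 0;
}
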
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 868e4c9ef127..ef2f1e353966 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -245,7 +245,8 @@ struct dlm_lkb {
245 struct list_head lkb_astqueue; /* need ast to be sent */ 245 struct list_head lkb_astqueue; /* need ast to be sent */
246 struct list_head lkb_ownqueue; /* list of locks for a process */ 246 struct list_head lkb_ownqueue; /* list of locks for a process */
247 struct list_head lkb_time_list; 247 struct list_head lkb_time_list;
248 unsigned long lkb_timestamp; 248 ktime_t lkb_time_bast; /* for debugging */
249 ktime_t lkb_timestamp;
249 unsigned long lkb_timeout_cs; 250 unsigned long lkb_timeout_cs;
250 251
251 char *lkb_lvbptr; 252 char *lkb_lvbptr;
@@ -481,6 +482,7 @@ struct dlm_ls {
481 struct dentry *ls_debug_rsb_dentry; /* debugfs */ 482 struct dentry *ls_debug_rsb_dentry; /* debugfs */
482 struct dentry *ls_debug_waiters_dentry; /* debugfs */ 483 struct dentry *ls_debug_waiters_dentry; /* debugfs */
483 struct dentry *ls_debug_locks_dentry; /* debugfs */ 484 struct dentry *ls_debug_locks_dentry; /* debugfs */
485 struct dentry *ls_debug_all_dentry; /* debugfs */
484 486
485 wait_queue_head_t ls_uevent_wait; /* user part of join/leave */ 487 wait_queue_head_t ls_uevent_wait; /* user part of join/leave */
486 int ls_uevent_result; 488 int ls_uevent_result;
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 724ddac91538..6cfe65bbf4a2 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -307,7 +307,7 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
307 lkb->lkb_lksb->sb_status = rv; 307 lkb->lkb_lksb->sb_status = rv;
308 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags; 308 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
309 309
310 dlm_add_ast(lkb, AST_COMP); 310 dlm_add_ast(lkb, AST_COMP, 0);
311} 311}
312 312
313static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) 313static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -318,12 +318,12 @@ static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
318 318
319static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode) 319static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
320{ 320{
321 lkb->lkb_time_bast = ktime_get();
322
321 if (is_master_copy(lkb)) 323 if (is_master_copy(lkb))
322 send_bast(r, lkb, rqmode); 324 send_bast(r, lkb, rqmode);
323 else { 325 else
324 lkb->lkb_bastmode = rqmode; 326 dlm_add_ast(lkb, AST_BAST, rqmode);
325 dlm_add_ast(lkb, AST_BAST);
326 }
327} 327}
328 328
329/* 329/*
@@ -744,6 +744,8 @@ static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
744 744
745 DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb);); 745 DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
746 746
747 lkb->lkb_timestamp = ktime_get();
748
747 lkb->lkb_status = status; 749 lkb->lkb_status = status;
748 750
749 switch (status) { 751 switch (status) {
@@ -1013,10 +1015,8 @@ static void add_timeout(struct dlm_lkb *lkb)
1013{ 1015{
1014 struct dlm_ls *ls = lkb->lkb_resource->res_ls; 1016 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
1015 1017
1016 if (is_master_copy(lkb)) { 1018 if (is_master_copy(lkb))
1017 lkb->lkb_timestamp = jiffies;
1018 return; 1019 return;
1019 }
1020 1020
1021 if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) && 1021 if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) &&
1022 !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) { 1022 !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) {
@@ -1031,7 +1031,6 @@ static void add_timeout(struct dlm_lkb *lkb)
1031 DLM_ASSERT(list_empty(&lkb->lkb_time_list), dlm_print_lkb(lkb);); 1031 DLM_ASSERT(list_empty(&lkb->lkb_time_list), dlm_print_lkb(lkb););
1032 mutex_lock(&ls->ls_timeout_mutex); 1032 mutex_lock(&ls->ls_timeout_mutex);
1033 hold_lkb(lkb); 1033 hold_lkb(lkb);
1034 lkb->lkb_timestamp = jiffies;
1035 list_add_tail(&lkb->lkb_time_list, &ls->ls_timeout); 1034 list_add_tail(&lkb->lkb_time_list, &ls->ls_timeout);
1036 mutex_unlock(&ls->ls_timeout_mutex); 1035 mutex_unlock(&ls->ls_timeout_mutex);
1037} 1036}
@@ -1059,6 +1058,7 @@ void dlm_scan_timeout(struct dlm_ls *ls)
1059 struct dlm_rsb *r; 1058 struct dlm_rsb *r;
1060 struct dlm_lkb *lkb; 1059 struct dlm_lkb *lkb;
1061 int do_cancel, do_warn; 1060 int do_cancel, do_warn;
1061 s64 wait_us;
1062 1062
1063 for (;;) { 1063 for (;;) {
1064 if (dlm_locking_stopped(ls)) 1064 if (dlm_locking_stopped(ls))
@@ -1069,14 +1069,15 @@ void dlm_scan_timeout(struct dlm_ls *ls)
1069 mutex_lock(&ls->ls_timeout_mutex); 1069 mutex_lock(&ls->ls_timeout_mutex);
1070 list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) { 1070 list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) {
1071 1071
1072 wait_us = ktime_to_us(ktime_sub(ktime_get(),
1073 lkb->lkb_timestamp));
1074
1072 if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) && 1075 if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) &&
1073 time_after_eq(jiffies, lkb->lkb_timestamp + 1076 wait_us >= (lkb->lkb_timeout_cs * 10000))
1074 lkb->lkb_timeout_cs * HZ/100))
1075 do_cancel = 1; 1077 do_cancel = 1;
1076 1078
1077 if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) && 1079 if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) &&
1078 time_after_eq(jiffies, lkb->lkb_timestamp + 1080 wait_us >= dlm_config.ci_timewarn_cs * 10000)
1079 dlm_config.ci_timewarn_cs * HZ/100))
1080 do_warn = 1; 1081 do_warn = 1;
1081 1082
1082 if (!do_cancel && !do_warn) 1083 if (!do_cancel && !do_warn)
@@ -1122,12 +1123,12 @@ void dlm_scan_timeout(struct dlm_ls *ls)
1122void dlm_adjust_timeouts(struct dlm_ls *ls) 1123void dlm_adjust_timeouts(struct dlm_ls *ls)
1123{ 1124{
1124 struct dlm_lkb *lkb; 1125 struct dlm_lkb *lkb;
1125 long adj = jiffies - ls->ls_recover_begin; 1126 u64 adj_us = jiffies_to_usecs(jiffies - ls->ls_recover_begin);
1126 1127
1127 ls->ls_recover_begin = 0; 1128 ls->ls_recover_begin = 0;
1128 mutex_lock(&ls->ls_timeout_mutex); 1129 mutex_lock(&ls->ls_timeout_mutex);
1129 list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) 1130 list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list)
1130 lkb->lkb_timestamp += adj; 1131 lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us);
1131 mutex_unlock(&ls->ls_timeout_mutex); 1132 mutex_unlock(&ls->ls_timeout_mutex);
1132} 1133}
1133 1134
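
The lock.c changes move lock timestamps from jiffies to ktime_t: add_lkb() stamps lkb_timestamp with ktime_get() whenever an lkb is placed on a queue, elapsed time becomes a plain subtraction converted to microseconds, and the configured centisecond timeouts are compared after scaling by 10,000 (1 cs = 10,000 us, replacing the old HZ/100 arithmetic). A rough userspace analogue using a monotonic clock (dlm names are used loosely; this is a sketch, not kernel code):

#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static int64_t now_us(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);    /* like ktime_get() */
        return (int64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
}

int main(void)
{
        uint64_t timeout_cs = 1;                /* 10 ms timeout */
        int64_t timestamp = now_us();           /* lkb_timestamp at queue time */

        usleep(20000);                          /* the lock waits 20 ms */

        int64_t wait_us = now_us() - timestamp;
        if (wait_us >= (int64_t)(timeout_cs * 10000))
                printf("cancel after %lld us\n", (long long)wait_us);
        return 0;
}

Because every queued lkb is now stamped, dlm_adjust_timeouts() can compensate for time spent in recovery with ktime_add_us() instead of adding raw jiffies.
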
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 3962262f991a..103a5ebd1371 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -295,6 +295,7 @@ static int add_sock(struct socket *sock, struct connection *con)
295 con->sock->sk->sk_write_space = lowcomms_write_space; 295 con->sock->sk->sk_write_space = lowcomms_write_space;
296 con->sock->sk->sk_state_change = lowcomms_state_change; 296 con->sock->sk->sk_state_change = lowcomms_state_change;
297 con->sock->sk->sk_user_data = con; 297 con->sock->sk->sk_user_data = con;
298 con->sock->sk->sk_allocation = GFP_NOFS;
298 return 0; 299 return 0;
299} 300}
300 301
@@ -823,7 +824,6 @@ static void sctp_init_assoc(struct connection *con)
823 len = e->len; 824 len = e->len;
824 offset = e->offset; 825 offset = e->offset;
825 spin_unlock(&con->writequeue_lock); 826 spin_unlock(&con->writequeue_lock);
826 kmap(e->page);
827 827
828 /* Send the first block off the write queue */ 828 /* Send the first block off the write queue */
829 iov[0].iov_base = page_address(e->page)+offset; 829 iov[0].iov_base = page_address(e->page)+offset;
@@ -854,7 +854,6 @@ static void sctp_init_assoc(struct connection *con)
854 854
855 if (e->len == 0 && e->users == 0) { 855 if (e->len == 0 && e->users == 0) {
856 list_del(&e->list); 856 list_del(&e->list);
857 kunmap(e->page);
858 free_entry(e); 857 free_entry(e);
859 } 858 }
860 spin_unlock(&con->writequeue_lock); 859 spin_unlock(&con->writequeue_lock);
@@ -1203,8 +1202,6 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
1203 1202
1204 if (e) { 1203 if (e) {
1205 got_one: 1204 got_one:
1206 if (users == 0)
1207 kmap(e->page);
1208 *ppc = page_address(e->page) + offset; 1205 *ppc = page_address(e->page) + offset;
1209 return e; 1206 return e;
1210 } 1207 }
@@ -1233,7 +1230,6 @@ void dlm_lowcomms_commit_buffer(void *mh)
1233 if (users) 1230 if (users)
1234 goto out; 1231 goto out;
1235 e->len = e->end - e->offset; 1232 e->len = e->end - e->offset;
1236 kunmap(e->page);
1237 spin_unlock(&con->writequeue_lock); 1233 spin_unlock(&con->writequeue_lock);
1238 1234
1239 if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) { 1235 if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) {
@@ -1272,7 +1268,6 @@ static void send_to_sock(struct connection *con)
1272 offset = e->offset; 1268 offset = e->offset;
1273 BUG_ON(len == 0 && e->users == 0); 1269 BUG_ON(len == 0 && e->users == 0);
1274 spin_unlock(&con->writequeue_lock); 1270 spin_unlock(&con->writequeue_lock);
1275 kmap(e->page);
1276 1271
1277 ret = 0; 1272 ret = 0;
1278 if (len) { 1273 if (len) {
@@ -1294,7 +1289,6 @@ static void send_to_sock(struct connection *con)
1294 1289
1295 if (e->len == 0 && e->users == 0) { 1290 if (e->len == 0 && e->users == 0) {
1296 list_del(&e->list); 1291 list_del(&e->list);
1297 kunmap(e->page);
1298 free_entry(e); 1292 free_entry(e);
1299 continue; 1293 continue;
1300 } 1294 }
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index 54c14c6d06cb..c1775b84ebab 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -39,7 +39,7 @@ char *dlm_allocate_lvb(struct dlm_ls *ls)
39{ 39{
40 char *p; 40 char *p;
41 41
42 p = kzalloc(ls->ls_lvblen, GFP_KERNEL); 42 p = kzalloc(ls->ls_lvblen, ls->ls_allocation);
43 return p; 43 return p;
44} 44}
45 45
@@ -57,7 +57,7 @@ struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen)
57 57
58 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,); 58 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
59 59
60 r = kzalloc(sizeof(*r) + namelen, GFP_KERNEL); 60 r = kzalloc(sizeof(*r) + namelen, ls->ls_allocation);
61 return r; 61 return r;
62} 62}
63 63
@@ -72,7 +72,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls)
72{ 72{
73 struct dlm_lkb *lkb; 73 struct dlm_lkb *lkb;
74 74
75 lkb = kmem_cache_zalloc(lkb_cache, GFP_KERNEL); 75 lkb = kmem_cache_zalloc(lkb_cache, ls->ls_allocation);
76 return lkb; 76 return lkb;
77} 77}
78 78
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index 07ac709f3ed7..f3396c622aec 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -112,7 +112,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base,
112 ordinary messages). */ 112 ordinary messages). */
113 113
114 if (msglen > sizeof(__tmp) && p == &__tmp.p) { 114 if (msglen > sizeof(__tmp) && p == &__tmp.p) {
115 p = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL); 115 p = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS);
116 if (p == NULL) 116 if (p == NULL)
117 return ret; 117 return ret;
118 } 118 }
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index aa2a5775a027..ccc9d62c462d 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -115,7 +115,6 @@ static void fill_data(struct dlm_lock_data *data, struct dlm_lkb *lkb)
115 data->status = lkb->lkb_status; 115 data->status = lkb->lkb_status;
116 data->grmode = lkb->lkb_grmode; 116 data->grmode = lkb->lkb_grmode;
117 data->rqmode = lkb->lkb_rqmode; 117 data->rqmode = lkb->lkb_rqmode;
118 data->timestamp = lkb->lkb_timestamp;
119 if (lkb->lkb_ua) 118 if (lkb->lkb_ua)
120 data->xid = lkb->lkb_ua->xid; 119 data->xid = lkb->lkb_ua->xid;
121 if (r) { 120 if (r) {
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index b3832c67194a..065149e84f42 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -175,7 +175,7 @@ static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type)
175/* we could possibly check if the cancel of an orphan has resulted in the lkb 175/* we could possibly check if the cancel of an orphan has resulted in the lkb
176 being removed and then remove that lkb from the orphans list and free it */ 176 being removed and then remove that lkb from the orphans list and free it */
177 177
178void dlm_user_add_ast(struct dlm_lkb *lkb, int type) 178void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
179{ 179{
180 struct dlm_ls *ls; 180 struct dlm_ls *ls;
181 struct dlm_user_args *ua; 181 struct dlm_user_args *ua;
@@ -208,6 +208,8 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
208 208
209 ast_type = lkb->lkb_ast_type; 209 ast_type = lkb->lkb_ast_type;
210 lkb->lkb_ast_type |= type; 210 lkb->lkb_ast_type |= type;
211 if (bastmode)
212 lkb->lkb_bastmode = bastmode;
211 213
212 if (!ast_type) { 214 if (!ast_type) {
213 kref_get(&lkb->lkb_ref); 215 kref_get(&lkb->lkb_ref);
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index 35eb6a13d616..1c9686492286 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -9,7 +9,7 @@
9#ifndef __USER_DOT_H__ 9#ifndef __USER_DOT_H__
10#define __USER_DOT_H__ 10#define __USER_DOT_H__
11 11
12void dlm_user_add_ast(struct dlm_lkb *lkb, int type); 12void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode);
13int dlm_user_init(void); 13int dlm_user_init(void);
14void dlm_user_exit(void); 14void dlm_user_exit(void);
15int dlm_device_deregister(struct dlm_ls *ls); 15int dlm_device_deregister(struct dlm_ls *ls);
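
In user.c, pending callback types accumulate as bit flags in lkb_ast_type, and the blocking-ast mode is now passed into dlm_user_add_ast() and stored there, together with the type update, instead of being written into lkb_bastmode by the caller before queuing (queue_bast() in lock.c now hands rqmode through dlm_add_ast()). A minimal model of that flow, with plain ints in place of the kernel's locking and kref handling (the names mirror dlm fields, but the code is illustrative):

#include <stdio.h>

#define AST_COMP 1
#define AST_BAST 2

struct lkb {
        int ast_type;   /* pending callback types, OR-ed together */
        int bastmode;   /* mode from the most recent blocking ast */
};

static void add_ast(struct lkb *lkb, int type, int bastmode)
{
        int was_queued = lkb->ast_type;

        lkb->ast_type |= type;
        if (bastmode)
                lkb->bastmode = bastmode;
        if (!was_queued) {
                /* first pending callback: the kernel takes a kref and
                 * wakes the reader here */
        }
}

int main(void)
{
        struct lkb lkb = { 0, 0 };

        add_ast(&lkb, AST_BAST, 3);     /* blocking ast, requested mode 3 */
        add_ast(&lkb, AST_COMP, 0);     /* completion ast, no mode */
        printf("type=%d bastmode=%d\n", lkb.ast_type, lkb.bastmode);
        return 0;
}
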
diff --git a/fs/dquot.c b/fs/dquot.c
index c237ccc8581c..48c0571f831d 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -211,8 +211,6 @@ static struct hlist_head *dquot_hash;
211 211
212struct dqstats dqstats; 212struct dqstats dqstats;
213 213
214static void dqput(struct dquot *dquot);
215
216static inline unsigned int 214static inline unsigned int
217hashfn(const struct super_block *sb, unsigned int id, int type) 215hashfn(const struct super_block *sb, unsigned int id, int type)
218{ 216{
@@ -415,6 +413,17 @@ out_dqlock:
415 return ret; 413 return ret;
416} 414}
417 415
416void dquot_destroy(struct dquot *dquot)
417{
418 kmem_cache_free(dquot_cachep, dquot);
419}
420EXPORT_SYMBOL(dquot_destroy);
421
422static inline void do_destroy_dquot(struct dquot *dquot)
423{
424 dquot->dq_sb->dq_op->destroy_dquot(dquot);
425}
426
418/* Invalidate all dquots on the list. Note that this function is called after 427/* Invalidate all dquots on the list. Note that this function is called after
419 * quota is disabled and pointers from inodes removed so there cannot be new 428 * quota is disabled and pointers from inodes removed so there cannot be new
420 * quota users. There can still be some users of quotas due to inodes being 429 * quota users. There can still be some users of quotas due to inodes being
@@ -463,9 +472,44 @@ restart:
463 remove_dquot_hash(dquot); 472 remove_dquot_hash(dquot);
464 remove_free_dquot(dquot); 473 remove_free_dquot(dquot);
465 remove_inuse(dquot); 474 remove_inuse(dquot);
466 kmem_cache_free(dquot_cachep, dquot); 475 do_destroy_dquot(dquot);
476 }
477 spin_unlock(&dq_list_lock);
478}
479
480/* Call callback for every active dquot on given filesystem */
481int dquot_scan_active(struct super_block *sb,
482 int (*fn)(struct dquot *dquot, unsigned long priv),
483 unsigned long priv)
484{
485 struct dquot *dquot, *old_dquot = NULL;
486 int ret = 0;
487
488 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
489 spin_lock(&dq_list_lock);
490 list_for_each_entry(dquot, &inuse_list, dq_inuse) {
491 if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags))
492 continue;
493 if (dquot->dq_sb != sb)
494 continue;
495 /* Now we have active dquot so we can just increase use count */
496 atomic_inc(&dquot->dq_count);
497 dqstats.lookups++;
498 spin_unlock(&dq_list_lock);
499 dqput(old_dquot);
500 old_dquot = dquot;
501 ret = fn(dquot, priv);
502 if (ret < 0)
503 goto out;
504 spin_lock(&dq_list_lock);
505 /* We are safe to continue now because our dquot could not
506 * be moved out of the inuse list while we hold the reference */
467 } 507 }
468 spin_unlock(&dq_list_lock); 508 spin_unlock(&dq_list_lock);
509out:
510 dqput(old_dquot);
511 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
512 return ret;
469} 513}
470 514
471int vfs_quota_sync(struct super_block *sb, int type) 515int vfs_quota_sync(struct super_block *sb, int type)
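
dquot_scan_active() is a standard pinned-iteration pattern: walk the in-use list under dq_list_lock, take a reference on each active dquot, drop the spinlock for the callback (which may sleep), then re-take the lock and continue from the pinned entry, which the held reference keeps on the list; the previous entry is released only after the next one is pinned. A simplified userspace model, with a mutex and plain counters in place of the spinlock and atomic refcount:

#include <pthread.h>
#include <stdio.h>

struct node {
        struct node *next;
        int refs;
        int active;
        int id;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static int scan_active(struct node *head, int (*fn)(struct node *))
{
        struct node *n, *old = NULL;
        int ret = 0;

        pthread_mutex_lock(&list_lock);
        for (n = head; n; n = n->next) {
                if (!n->active)
                        continue;
                n->refs++;                      /* pin the entry */
                pthread_mutex_unlock(&list_lock);
                if (old)
                        old->refs--;            /* like dqput(old_dquot) */
                old = n;
                ret = fn(n);                    /* callback may sleep */
                if (ret < 0)
                        goto out;
                pthread_mutex_lock(&list_lock); /* resume from pinned node */
        }
        pthread_mutex_unlock(&list_lock);
out:
        if (old)
                old->refs--;
        return ret;
}

static int show(struct node *n)
{
        printf("dquot %d\n", n->id);
        return 0;
}

int main(void)
{
        struct node c = { NULL, 0, 1, 3 };
        struct node b = { &c, 0, 0, 2 };
        struct node a = { &b, 0, 1, 1 };

        return scan_active(&a, show);
}
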
@@ -479,7 +523,7 @@ int vfs_quota_sync(struct super_block *sb, int type)
479 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 523 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
480 if (type != -1 && cnt != type) 524 if (type != -1 && cnt != type)
481 continue; 525 continue;
482 if (!sb_has_quota_enabled(sb, cnt)) 526 if (!sb_has_quota_active(sb, cnt))
483 continue; 527 continue;
484 spin_lock(&dq_list_lock); 528 spin_lock(&dq_list_lock);
485 dirty = &dqopt->info[cnt].dqi_dirty_list; 529 dirty = &dqopt->info[cnt].dqi_dirty_list;
@@ -504,8 +548,8 @@ int vfs_quota_sync(struct super_block *sb, int type)
504 } 548 }
505 549
506 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 550 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
507 if ((cnt == type || type == -1) && sb_has_quota_enabled(sb, cnt) 551 if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt)
508 && info_dirty(&dqopt->info[cnt])) 552 && info_dirty(&dqopt->info[cnt]))
509 sb->dq_op->write_info(sb, cnt); 553 sb->dq_op->write_info(sb, cnt);
510 spin_lock(&dq_list_lock); 554 spin_lock(&dq_list_lock);
511 dqstats.syncs++; 555 dqstats.syncs++;
@@ -527,7 +571,7 @@ static void prune_dqcache(int count)
527 remove_dquot_hash(dquot); 571 remove_dquot_hash(dquot);
528 remove_free_dquot(dquot); 572 remove_free_dquot(dquot);
529 remove_inuse(dquot); 573 remove_inuse(dquot);
530 kmem_cache_free(dquot_cachep, dquot); 574 do_destroy_dquot(dquot);
531 count--; 575 count--;
532 head = free_dquots.prev; 576 head = free_dquots.prev;
533 } 577 }
@@ -558,7 +602,7 @@ static struct shrinker dqcache_shrinker = {
558 * NOTE: If you change this function please check whether dqput_blocks() works right... 602 * NOTE: If you change this function please check whether dqput_blocks() works right...
559 * MUST be called with either dqptr_sem or dqonoff_mutex held 603 * MUST be called with either dqptr_sem or dqonoff_mutex held
560 */ 604 */
561static void dqput(struct dquot *dquot) 605void dqput(struct dquot *dquot)
562{ 606{
563 int ret; 607 int ret;
564 608
@@ -584,7 +628,7 @@ we_slept:
584 /* We have more than one user... nothing to do */ 628 /* We have more than one user... nothing to do */
585 atomic_dec(&dquot->dq_count); 629 atomic_dec(&dquot->dq_count);
586 /* Releasing dquot during quotaoff phase? */ 630 /* Releasing dquot during quotaoff phase? */
587 if (!sb_has_quota_enabled(dquot->dq_sb, dquot->dq_type) && 631 if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_type) &&
588 atomic_read(&dquot->dq_count) == 1) 632 atomic_read(&dquot->dq_count) == 1)
589 wake_up(&dquot->dq_wait_unused); 633 wake_up(&dquot->dq_wait_unused);
590 spin_unlock(&dq_list_lock); 634 spin_unlock(&dq_list_lock);
@@ -625,11 +669,17 @@ we_slept:
625 spin_unlock(&dq_list_lock); 669 spin_unlock(&dq_list_lock);
626} 670}
627 671
672struct dquot *dquot_alloc(struct super_block *sb, int type)
673{
674 return kmem_cache_zalloc(dquot_cachep, GFP_NOFS);
675}
676EXPORT_SYMBOL(dquot_alloc);
677
628static struct dquot *get_empty_dquot(struct super_block *sb, int type) 678static struct dquot *get_empty_dquot(struct super_block *sb, int type)
629{ 679{
630 struct dquot *dquot; 680 struct dquot *dquot;
631 681
632 dquot = kmem_cache_zalloc(dquot_cachep, GFP_NOFS); 682 dquot = sb->dq_op->alloc_dquot(sb, type);
633 if(!dquot) 683 if(!dquot)
634 return NODQUOT; 684 return NODQUOT;
635 685
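
Turning allocation and freeing into operations (alloc_dquot/destroy_dquot, with the exported dquot_alloc()/dquot_destroy() as defaults) lets a filesystem embed the generic struct dquot inside a larger structure of its own and hand the quota core only the embedded part. A sketch of that shape (the myfs wrapper and its fields are invented):

#include <stdlib.h>

struct dquot { int id; };

struct myfs_dquot {                     /* hypothetical wrapper */
        struct dquot dq;                /* placed first; a real kernel
                                         * user could also use container_of */
        int fs_private;
};

struct dquot_operations {
        struct dquot *(*alloc_dquot)(void);
        void (*destroy_dquot)(struct dquot *);
};

static struct dquot *myfs_alloc_dquot(void)
{
        struct myfs_dquot *d = calloc(1, sizeof(*d));

        return d ? &d->dq : NULL;
}

static void myfs_destroy_dquot(struct dquot *dq)
{
        /* dq sits at offset 0, so the cast recovers the wrapper */
        free((struct myfs_dquot *)dq);
}

static const struct dquot_operations myfs_ops = {
        .alloc_dquot   = myfs_alloc_dquot,
        .destroy_dquot = myfs_destroy_dquot,
};

int main(void)
{
        struct dquot *dq = myfs_ops.alloc_dquot();

        if (dq)
                myfs_ops.destroy_dquot(dq);
        return 0;
}
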
@@ -647,15 +697,33 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
647} 697}
648 698
649/* 699/*
700 * Check whether dquot is in memory.
701 * MUST be called with either dqptr_sem or dqonoff_mutex held
702 */
703int dquot_is_cached(struct super_block *sb, unsigned int id, int type)
704{
705 unsigned int hashent = hashfn(sb, id, type);
706 int ret = 0;
707
708 if (!sb_has_quota_active(sb, type))
709 return 0;
710 spin_lock(&dq_list_lock);
711 if (find_dquot(hashent, sb, id, type) != NODQUOT)
712 ret = 1;
713 spin_unlock(&dq_list_lock);
714 return ret;
715}
716
717/*
650 * Get reference to dquot 718 * Get reference to dquot
651 * MUST be called with either dqptr_sem or dqonoff_mutex held 719 * MUST be called with either dqptr_sem or dqonoff_mutex held
652 */ 720 */
653static struct dquot *dqget(struct super_block *sb, unsigned int id, int type) 721struct dquot *dqget(struct super_block *sb, unsigned int id, int type)
654{ 722{
655 unsigned int hashent = hashfn(sb, id, type); 723 unsigned int hashent = hashfn(sb, id, type);
656 struct dquot *dquot, *empty = NODQUOT; 724 struct dquot *dquot, *empty = NODQUOT;
657 725
658 if (!sb_has_quota_enabled(sb, type)) 726 if (!sb_has_quota_active(sb, type))
659 return NODQUOT; 727 return NODQUOT;
660we_slept: 728we_slept:
661 spin_lock(&dq_list_lock); 729 spin_lock(&dq_list_lock);
@@ -682,7 +750,7 @@ we_slept:
682 dqstats.lookups++; 750 dqstats.lookups++;
683 spin_unlock(&dq_list_lock); 751 spin_unlock(&dq_list_lock);
684 if (empty) 752 if (empty)
685 kmem_cache_free(dquot_cachep, empty); 753 do_destroy_dquot(empty);
686 } 754 }
687 /* Wait for dq_lock - after this we know that either dquot_release() is already 755 /* Wait for dq_lock - after this we know that either dquot_release() is already
688 * finished or it will be canceled due to dq_count > 1 test */ 756 * finished or it will be canceled due to dq_count > 1 test */
@@ -820,7 +888,7 @@ static void drop_dquot_ref(struct super_block *sb, int type)
820 } 888 }
821} 889}
822 890
823static inline void dquot_incr_inodes(struct dquot *dquot, unsigned long number) 891static inline void dquot_incr_inodes(struct dquot *dquot, qsize_t number)
824{ 892{
825 dquot->dq_dqb.dqb_curinodes += number; 893 dquot->dq_dqb.dqb_curinodes += number;
826} 894}
@@ -830,9 +898,10 @@ static inline void dquot_incr_space(struct dquot *dquot, qsize_t number)
830 dquot->dq_dqb.dqb_curspace += number; 898 dquot->dq_dqb.dqb_curspace += number;
831} 899}
832 900
833static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number) 901static inline void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
834{ 902{
835 if (dquot->dq_dqb.dqb_curinodes > number) 903 if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
904 dquot->dq_dqb.dqb_curinodes >= number)
836 dquot->dq_dqb.dqb_curinodes -= number; 905 dquot->dq_dqb.dqb_curinodes -= number;
837 else 906 else
838 dquot->dq_dqb.dqb_curinodes = 0; 907 dquot->dq_dqb.dqb_curinodes = 0;
@@ -843,11 +912,12 @@ static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number)
843 912
844static inline void dquot_decr_space(struct dquot *dquot, qsize_t number) 913static inline void dquot_decr_space(struct dquot *dquot, qsize_t number)
845{ 914{
846 if (dquot->dq_dqb.dqb_curspace > number) 915 if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
916 dquot->dq_dqb.dqb_curspace >= number)
847 dquot->dq_dqb.dqb_curspace -= number; 917 dquot->dq_dqb.dqb_curspace -= number;
848 else 918 else
849 dquot->dq_dqb.dqb_curspace = 0; 919 dquot->dq_dqb.dqb_curspace = 0;
850 if (toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit) 920 if (dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit)
851 dquot->dq_dqb.dqb_btime = (time_t) 0; 921 dquot->dq_dqb.dqb_btime = (time_t) 0;
852 clear_bit(DQ_BLKS_B, &dquot->dq_flags); 922 clear_bit(DQ_BLKS_B, &dquot->dq_flags);
853} 923}
@@ -1023,10 +1093,11 @@ static inline char ignore_hardlimit(struct dquot *dquot)
1023} 1093}
1024 1094
1025/* needs dq_data_lock */ 1095/* needs dq_data_lock */
1026static int check_idq(struct dquot *dquot, ulong inodes, char *warntype) 1096static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
1027{ 1097{
1028 *warntype = QUOTA_NL_NOWARN; 1098 *warntype = QUOTA_NL_NOWARN;
1029 if (inodes <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags)) 1099 if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
1100 test_bit(DQ_FAKE_B, &dquot->dq_flags))
1030 return QUOTA_OK; 1101 return QUOTA_OK;
1031 1102
1032 if (dquot->dq_dqb.dqb_ihardlimit && 1103 if (dquot->dq_dqb.dqb_ihardlimit &&
@@ -1058,11 +1129,12 @@ static int check_idq(struct dquot *dquot, ulong inodes, char *warntype)
1058static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype) 1129static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype)
1059{ 1130{
1060 *warntype = QUOTA_NL_NOWARN; 1131 *warntype = QUOTA_NL_NOWARN;
1061 if (space <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags)) 1132 if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
1133 test_bit(DQ_FAKE_B, &dquot->dq_flags))
1062 return QUOTA_OK; 1134 return QUOTA_OK;
1063 1135
1064 if (dquot->dq_dqb.dqb_bhardlimit && 1136 if (dquot->dq_dqb.dqb_bhardlimit &&
1065 toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bhardlimit && 1137 dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bhardlimit &&
1066 !ignore_hardlimit(dquot)) { 1138 !ignore_hardlimit(dquot)) {
1067 if (!prealloc) 1139 if (!prealloc)
1068 *warntype = QUOTA_NL_BHARDWARN; 1140 *warntype = QUOTA_NL_BHARDWARN;
@@ -1070,7 +1142,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1070 } 1142 }
1071 1143
1072 if (dquot->dq_dqb.dqb_bsoftlimit && 1144 if (dquot->dq_dqb.dqb_bsoftlimit &&
1073 toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit && 1145 dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit &&
1074 dquot->dq_dqb.dqb_btime && get_seconds() >= dquot->dq_dqb.dqb_btime && 1146 dquot->dq_dqb.dqb_btime && get_seconds() >= dquot->dq_dqb.dqb_btime &&
1075 !ignore_hardlimit(dquot)) { 1147 !ignore_hardlimit(dquot)) {
1076 if (!prealloc) 1148 if (!prealloc)
@@ -1079,7 +1151,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1079 } 1151 }
1080 1152
1081 if (dquot->dq_dqb.dqb_bsoftlimit && 1153 if (dquot->dq_dqb.dqb_bsoftlimit &&
1082 toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit && 1154 dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit &&
1083 dquot->dq_dqb.dqb_btime == 0) { 1155 dquot->dq_dqb.dqb_btime == 0) {
1084 if (!prealloc) { 1156 if (!prealloc) {
1085 *warntype = QUOTA_NL_BSOFTWARN; 1157 *warntype = QUOTA_NL_BSOFTWARN;
@@ -1096,10 +1168,11 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
1096 return QUOTA_OK; 1168 return QUOTA_OK;
1097} 1169}
1098 1170
1099static int info_idq_free(struct dquot *dquot, ulong inodes) 1171static int info_idq_free(struct dquot *dquot, qsize_t inodes)
1100{ 1172{
1101 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || 1173 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
1102 dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit) 1174 dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit ||
1175 !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type))
1103 return QUOTA_NL_NOWARN; 1176 return QUOTA_NL_NOWARN;
1104 1177
1105 if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit) 1178 if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit)
@@ -1113,15 +1186,13 @@ static int info_idq_free(struct dquot *dquot, ulong inodes)
1113static int info_bdq_free(struct dquot *dquot, qsize_t space) 1186static int info_bdq_free(struct dquot *dquot, qsize_t space)
1114{ 1187{
1115 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || 1188 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
1116 toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit) 1189 dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit)
1117 return QUOTA_NL_NOWARN; 1190 return QUOTA_NL_NOWARN;
1118 1191
1119 if (toqb(dquot->dq_dqb.dqb_curspace - space) <= 1192 if (dquot->dq_dqb.dqb_curspace - space <= dquot->dq_dqb.dqb_bsoftlimit)
1120 dquot->dq_dqb.dqb_bsoftlimit)
1121 return QUOTA_NL_BSOFTBELOW; 1193 return QUOTA_NL_BSOFTBELOW;
1122 if (toqb(dquot->dq_dqb.dqb_curspace) >= dquot->dq_dqb.dqb_bhardlimit && 1194 if (dquot->dq_dqb.dqb_curspace >= dquot->dq_dqb.dqb_bhardlimit &&
1123 toqb(dquot->dq_dqb.dqb_curspace - space) < 1195 dquot->dq_dqb.dqb_curspace - space < dquot->dq_dqb.dqb_bhardlimit)
1124 dquot->dq_dqb.dqb_bhardlimit)
1125 return QUOTA_NL_BHARDBELOW; 1196 return QUOTA_NL_BHARDBELOW;
1126 return QUOTA_NL_NOWARN; 1197 return QUOTA_NL_NOWARN;
1127} 1198}
@@ -1166,17 +1237,23 @@ out_err:
1166 * Release all quotas referenced by inode 1237 * Release all quotas referenced by inode
1167 * Transaction must be started at an entry 1238 * Transaction must be started at an entry
1168 */ 1239 */
1169int dquot_drop(struct inode *inode) 1240int dquot_drop_locked(struct inode *inode)
1170{ 1241{
1171 int cnt; 1242 int cnt;
1172 1243
1173 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1174 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1244 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1175 if (inode->i_dquot[cnt] != NODQUOT) { 1245 if (inode->i_dquot[cnt] != NODQUOT) {
1176 dqput(inode->i_dquot[cnt]); 1246 dqput(inode->i_dquot[cnt]);
1177 inode->i_dquot[cnt] = NODQUOT; 1247 inode->i_dquot[cnt] = NODQUOT;
1178 } 1248 }
1179 } 1249 }
1250 return 0;
1251}
1252
1253int dquot_drop(struct inode *inode)
1254{
1255 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1256 dquot_drop_locked(inode);
1180 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1257 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1181 return 0; 1258 return 0;
1182} 1259}
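
Splitting dquot_drop() into dquot_drop_locked() plus a thin wrapper is the usual locked/unlocked API split: the _locked variant assumes the caller already holds dqptr_sem, so a caller that batches several operations under one lock does not re-acquire it. Userspace model of the shape:

#include <pthread.h>

#define MAXQUOTAS 2

static pthread_mutex_t dqptr_sem = PTHREAD_MUTEX_INITIALIZER;
static int i_dquot[MAXQUOTAS];

static int drop_locked(void)
{
        int cnt;

        /* caller must hold dqptr_sem */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                i_dquot[cnt] = 0;       /* dqput() + NODQUOT in the kernel */
        return 0;
}

static int drop(void)
{
        pthread_mutex_lock(&dqptr_sem);
        drop_locked();
        pthread_mutex_unlock(&dqptr_sem);
        return 0;
}

int main(void)
{
        return drop();
}
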
@@ -1264,7 +1341,7 @@ warn_put_all:
1264/* 1341/*
1265 * This operation can block, but only after everything is updated 1342 * This operation can block, but only after everything is updated
1266 */ 1343 */
1267int dquot_alloc_inode(const struct inode *inode, unsigned long number) 1344int dquot_alloc_inode(const struct inode *inode, qsize_t number)
1268{ 1345{
1269 int cnt, ret = NO_QUOTA; 1346 int cnt, ret = NO_QUOTA;
1270 char warntype[MAXQUOTAS]; 1347 char warntype[MAXQUOTAS];
@@ -1349,7 +1426,7 @@ out_sub:
1349/* 1426/*
1350 * This operation can block, but only after everything is updated 1427 * This operation can block, but only after everything is updated
1351 */ 1428 */
1352int dquot_free_inode(const struct inode *inode, unsigned long number) 1429int dquot_free_inode(const struct inode *inode, qsize_t number)
1353{ 1430{
1354 unsigned int cnt; 1431 unsigned int cnt;
1355 char warntype[MAXQUOTAS]; 1432 char warntype[MAXQUOTAS];
@@ -1495,7 +1572,7 @@ warn_put_all:
1495/* Wrapper for transferring ownership of an inode */ 1572/* Wrapper for transferring ownership of an inode */
1496int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) 1573int vfs_dq_transfer(struct inode *inode, struct iattr *iattr)
1497{ 1574{
1498 if (sb_any_quota_enabled(inode->i_sb) && !IS_NOQUOTA(inode)) { 1575 if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) {
1499 vfs_dq_init(inode); 1576 vfs_dq_init(inode);
1500 if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA) 1577 if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA)
1501 return 1; 1578 return 1;
@@ -1533,54 +1610,27 @@ struct dquot_operations dquot_operations = {
1533 .acquire_dquot = dquot_acquire, 1610 .acquire_dquot = dquot_acquire,
1534 .release_dquot = dquot_release, 1611 .release_dquot = dquot_release,
1535 .mark_dirty = dquot_mark_dquot_dirty, 1612 .mark_dirty = dquot_mark_dquot_dirty,
1536 .write_info = dquot_commit_info 1613 .write_info = dquot_commit_info,
1614 .alloc_dquot = dquot_alloc,
1615 .destroy_dquot = dquot_destroy,
1537}; 1616};
1538 1617
1539static inline void set_enable_flags(struct quota_info *dqopt, int type)
1540{
1541 switch (type) {
1542 case USRQUOTA:
1543 dqopt->flags |= DQUOT_USR_ENABLED;
1544 dqopt->flags &= ~DQUOT_USR_SUSPENDED;
1545 break;
1546 case GRPQUOTA:
1547 dqopt->flags |= DQUOT_GRP_ENABLED;
1548 dqopt->flags &= ~DQUOT_GRP_SUSPENDED;
1549 break;
1550 }
1551}
1552
1553static inline void reset_enable_flags(struct quota_info *dqopt, int type,
1554 int remount)
1555{
1556 switch (type) {
1557 case USRQUOTA:
1558 dqopt->flags &= ~DQUOT_USR_ENABLED;
1559 if (remount)
1560 dqopt->flags |= DQUOT_USR_SUSPENDED;
1561 else
1562 dqopt->flags &= ~DQUOT_USR_SUSPENDED;
1563 break;
1564 case GRPQUOTA:
1565 dqopt->flags &= ~DQUOT_GRP_ENABLED;
1566 if (remount)
1567 dqopt->flags |= DQUOT_GRP_SUSPENDED;
1568 else
1569 dqopt->flags &= ~DQUOT_GRP_SUSPENDED;
1570 break;
1571 }
1572}
1573
1574
1575/* 1618/*
1576 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) 1619 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
1577 */ 1620 */
1578int vfs_quota_off(struct super_block *sb, int type, int remount) 1621int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
1579{ 1622{
1580 int cnt, ret = 0; 1623 int cnt, ret = 0;
1581 struct quota_info *dqopt = sb_dqopt(sb); 1624 struct quota_info *dqopt = sb_dqopt(sb);
1582 struct inode *toputinode[MAXQUOTAS]; 1625 struct inode *toputinode[MAXQUOTAS];
1583 1626
1627 /* Cannot turn off usage accounting without turning off limits, or
1628 * suspend quotas and simultaneously turn quotas off. */
1629 if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED))
1630 || (flags & DQUOT_SUSPENDED && flags & (DQUOT_LIMITS_ENABLED |
1631 DQUOT_USAGE_ENABLED)))
1632 return -EINVAL;
1633
1584 /* We need to serialize quota_off() for device */ 1634 /* We need to serialize quota_off() for device */
1585 mutex_lock(&dqopt->dqonoff_mutex); 1635 mutex_lock(&dqopt->dqonoff_mutex);
1586 1636
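
The per-type state above (DQUOT_USAGE_ENABLED, DQUOT_LIMITS_ENABLED, DQUOT_SUSPENDED) replaces the old per-type ENABLED/SUSPENDED flag pairs, and vfs_quota_disable() begins by rejecting incoherent requests: usage accounting cannot be turned off while limits stay enabled, and a suspend cannot be combined with a plain disable. The dquot_state_flag() helper used throughout packs one set of state bits into a lane per quota type; a model of that packing (the 3-bit lane width is invented for illustration, not taken from the kernel headers):

#include <stdio.h>

#define DQUOT_USAGE_ENABLED  (1u << 0)
#define DQUOT_LIMITS_ENABLED (1u << 1)
#define DQUOT_SUSPENDED      (1u << 2)
#define STATE_BITS 3

static unsigned int state_flag(unsigned int flags, int type)
{
        return flags << (STATE_BITS * type);    /* one lane per quota type */
}

int main(void)
{
        unsigned int dq_flags = 0;

        /* user quota fully on, group quota suspended */
        dq_flags |= state_flag(DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED, 0);
        dq_flags |= state_flag(DQUOT_SUSPENDED, 1);
        printf("%#x\n", dq_flags);
        return 0;
}
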
@@ -1589,7 +1639,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
1589 * sometimes we are called when fill_super() failed and calling 1639 * sometimes we are called when fill_super() failed and calling
1590 * sync_fs() in such cases does no good. 1640 * sync_fs() in such cases does no good.
1591 */ 1641 */
1592 if (!sb_any_quota_enabled(sb) && !sb_any_quota_suspended(sb)) { 1642 if (!sb_any_quota_loaded(sb)) {
1593 mutex_unlock(&dqopt->dqonoff_mutex); 1643 mutex_unlock(&dqopt->dqonoff_mutex);
1594 return 0; 1644 return 0;
1595 } 1645 }
@@ -1597,17 +1647,28 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
1597 toputinode[cnt] = NULL; 1647 toputinode[cnt] = NULL;
1598 if (type != -1 && cnt != type) 1648 if (type != -1 && cnt != type)
1599 continue; 1649 continue;
1600 /* If we keep inodes of quota files after remount and quotaoff 1650 if (!sb_has_quota_loaded(sb, cnt))
1601 * is called, drop kept inodes. */
1602 if (!remount && sb_has_quota_suspended(sb, cnt)) {
1603 iput(dqopt->files[cnt]);
1604 dqopt->files[cnt] = NULL;
1605 reset_enable_flags(dqopt, cnt, 0);
1606 continue; 1651 continue;
1652
1653 if (flags & DQUOT_SUSPENDED) {
1654 dqopt->flags |=
1655 dquot_state_flag(DQUOT_SUSPENDED, cnt);
1656 } else {
1657 dqopt->flags &= ~dquot_state_flag(flags, cnt);
1658 /* Turning off suspended quotas? */
1659 if (!sb_has_quota_loaded(sb, cnt) &&
1660 sb_has_quota_suspended(sb, cnt)) {
1661 dqopt->flags &= ~dquot_state_flag(
1662 DQUOT_SUSPENDED, cnt);
1663 iput(dqopt->files[cnt]);
1664 dqopt->files[cnt] = NULL;
1665 continue;
1666 }
1607 } 1667 }
1608 if (!sb_has_quota_enabled(sb, cnt)) 1668
1669 /* We still have to keep quota loaded? */
1670 if (sb_has_quota_loaded(sb, cnt) && !(flags & DQUOT_SUSPENDED))
1609 continue; 1671 continue;
1610 reset_enable_flags(dqopt, cnt, remount);
1611 1672
1612 /* Note: these are blocking operations */ 1673 /* Note: these are blocking operations */
1613 drop_dquot_ref(sb, cnt); 1674 drop_dquot_ref(sb, cnt);
@@ -1623,7 +1684,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
1623 put_quota_format(dqopt->info[cnt].dqi_format); 1684 put_quota_format(dqopt->info[cnt].dqi_format);
1624 1685
1625 toputinode[cnt] = dqopt->files[cnt]; 1686 toputinode[cnt] = dqopt->files[cnt];
1626 if (!remount) 1687 if (!sb_has_quota_loaded(sb, cnt))
1627 dqopt->files[cnt] = NULL; 1688 dqopt->files[cnt] = NULL;
1628 dqopt->info[cnt].dqi_flags = 0; 1689 dqopt->info[cnt].dqi_flags = 0;
1629 dqopt->info[cnt].dqi_igrace = 0; 1690 dqopt->info[cnt].dqi_igrace = 0;
@@ -1631,6 +1692,11 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
1631 dqopt->ops[cnt] = NULL; 1692 dqopt->ops[cnt] = NULL;
1632 } 1693 }
1633 mutex_unlock(&dqopt->dqonoff_mutex); 1694 mutex_unlock(&dqopt->dqonoff_mutex);
1695
1696 /* Skip syncing and setting flags if quota files are hidden */
1697 if (dqopt->flags & DQUOT_QUOTA_SYS_FILE)
1698 goto put_inodes;
1699
1634 /* Sync the superblock so that buffers with quota data are written to 1700 /* Sync the superblock so that buffers with quota data are written to
1635 * disk (and so userspace sees correct data afterwards). */ 1701 * disk (and so userspace sees correct data afterwards). */
1636 if (sb->s_op->sync_fs) 1702 if (sb->s_op->sync_fs)
@@ -1646,7 +1712,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
1646 mutex_lock(&dqopt->dqonoff_mutex); 1712 mutex_lock(&dqopt->dqonoff_mutex);
1647 /* If quota was reenabled in the meantime, we have 1713 /* If quota was reenabled in the meantime, we have
1648 * nothing to do */ 1714 * nothing to do */
1649 if (!sb_has_quota_enabled(sb, cnt)) { 1715 if (!sb_has_quota_loaded(sb, cnt)) {
1650 mutex_lock_nested(&toputinode[cnt]->i_mutex, I_MUTEX_QUOTA); 1716 mutex_lock_nested(&toputinode[cnt]->i_mutex, I_MUTEX_QUOTA);
1651 toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | 1717 toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
1652 S_NOATIME | S_NOQUOTA); 1718 S_NOATIME | S_NOQUOTA);
@@ -1655,26 +1721,43 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
1655 mark_inode_dirty(toputinode[cnt]); 1721 mark_inode_dirty(toputinode[cnt]);
1656 } 1722 }
1657 mutex_unlock(&dqopt->dqonoff_mutex); 1723 mutex_unlock(&dqopt->dqonoff_mutex);
1724 }
1725 if (sb->s_bdev)
1726 invalidate_bdev(sb->s_bdev);
1727put_inodes:
1728 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1729 if (toputinode[cnt]) {
1658 /* On remount RO, we keep the inode pointer so that we 1730 /* On remount RO, we keep the inode pointer so that we
1659 * can reenable quota on the subsequent remount RW. 1731 * can reenable quota on the subsequent remount RW. We
1660 * But we have better not keep inode pointer when there 1732 * have to check 'flags' variable and not use sb_has_
1661 * is pending delete on the quota file... */ 1733 * function because another quotaon / quotaoff could
1662 if (!remount) 1734 * change global state before we got here. We refuse
1735 * to suspend quotas when there is pending delete on
1736 * the quota file... */
1737 if (!(flags & DQUOT_SUSPENDED))
1663 iput(toputinode[cnt]); 1738 iput(toputinode[cnt]);
1664 else if (!toputinode[cnt]->i_nlink) 1739 else if (!toputinode[cnt]->i_nlink)
1665 ret = -EBUSY; 1740 ret = -EBUSY;
1666 } 1741 }
1667 if (sb->s_bdev)
1668 invalidate_bdev(sb->s_bdev);
1669 return ret; 1742 return ret;
1670} 1743}
1671 1744
1745int vfs_quota_off(struct super_block *sb, int type, int remount)
1746{
1747 return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED :
1748 (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED));
1749}
1750
1672/* 1751/*
1673 * Turn quotas on on a device 1752 * Turn quotas on on a device
1674 */ 1753 */
1675 1754
1676/* Helper function when we already have the inode */ 1755/*
1677static int vfs_quota_on_inode(struct inode *inode, int type, int format_id) 1756 * Helper function to turn quotas on when we already have the inode of
1757 * quota file and no quota information is loaded.
1758 */
1759static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
1760 unsigned int flags)
1678{ 1761{
1679 struct quota_format_type *fmt = find_quota_format(format_id); 1762 struct quota_format_type *fmt = find_quota_format(format_id);
1680 struct super_block *sb = inode->i_sb; 1763 struct super_block *sb = inode->i_sb;
@@ -1696,27 +1779,37 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
1696 error = -EINVAL; 1779 error = -EINVAL;
1697 goto out_fmt; 1780 goto out_fmt;
1698 } 1781 }
1782 /* Usage always has to be set... */
1783 if (!(flags & DQUOT_USAGE_ENABLED)) {
1784 error = -EINVAL;
1785 goto out_fmt;
1786 }
1699 1787
1700 /* As we bypass the pagecache we must now flush the inode so that 1788 if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
1701 * we see all the changes from userspace... */ 1789 /* As we bypass the pagecache we must now flush the inode so
1702 write_inode_now(inode, 1); 1790 * that we see all the changes from userspace... */
1703 /* And now flush the block cache so that kernel sees the changes */ 1791 write_inode_now(inode, 1);
1704 invalidate_bdev(sb->s_bdev); 1792 /* And now flush the block cache so that kernel sees the
1793 * changes */
1794 invalidate_bdev(sb->s_bdev);
1795 }
1705 mutex_lock(&inode->i_mutex); 1796 mutex_lock(&inode->i_mutex);
1706 mutex_lock(&dqopt->dqonoff_mutex); 1797 mutex_lock(&dqopt->dqonoff_mutex);
1707 if (sb_has_quota_enabled(sb, type) || 1798 if (sb_has_quota_loaded(sb, type)) {
1708 sb_has_quota_suspended(sb, type)) {
1709 error = -EBUSY; 1799 error = -EBUSY;
1710 goto out_lock; 1800 goto out_lock;
1711 } 1801 }
1712 /* We don't want quota and atime on quota files (deadlocks possible) 1802
1713 * Also nobody should write to the file - we use special IO operations 1803 if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
1714 * which ignore the immutable bit. */ 1804 /* We don't want quota and atime on quota files (deadlocks
1715 down_write(&dqopt->dqptr_sem); 1805 * possible) Also nobody should write to the file - we use
1716 oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); 1806 * special IO operations which ignore the immutable bit. */
1717 inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; 1807 down_write(&dqopt->dqptr_sem);
1718 up_write(&dqopt->dqptr_sem); 1808 oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA);
1719 sb->dq_op->drop(inode); 1809 inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
1810 up_write(&dqopt->dqptr_sem);
1811 sb->dq_op->drop(inode);
1812 }
1720 1813
1721 error = -EIO; 1814 error = -EIO;
1722 dqopt->files[type] = igrab(inode); 1815 dqopt->files[type] = igrab(inode);
@@ -1737,7 +1830,7 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
1737 } 1830 }
1738 mutex_unlock(&dqopt->dqio_mutex); 1831 mutex_unlock(&dqopt->dqio_mutex);
1739 mutex_unlock(&inode->i_mutex); 1832 mutex_unlock(&inode->i_mutex);
1740 set_enable_flags(dqopt, type); 1833 dqopt->flags |= dquot_state_flag(flags, type);
1741 1834
1742 add_dquot_ref(sb, type); 1835 add_dquot_ref(sb, type);
1743 mutex_unlock(&dqopt->dqonoff_mutex); 1836 mutex_unlock(&dqopt->dqonoff_mutex);
@@ -1770,20 +1863,23 @@ static int vfs_quota_on_remount(struct super_block *sb, int type)
1770 struct quota_info *dqopt = sb_dqopt(sb); 1863 struct quota_info *dqopt = sb_dqopt(sb);
1771 struct inode *inode; 1864 struct inode *inode;
1772 int ret; 1865 int ret;
1866 unsigned int flags;
1773 1867
1774 mutex_lock(&dqopt->dqonoff_mutex); 1868 mutex_lock(&dqopt->dqonoff_mutex);
1775 if (!sb_has_quota_suspended(sb, type)) { 1869 if (!sb_has_quota_suspended(sb, type)) {
1776 mutex_unlock(&dqopt->dqonoff_mutex); 1870 mutex_unlock(&dqopt->dqonoff_mutex);
1777 return 0; 1871 return 0;
1778 } 1872 }
1779 BUG_ON(sb_has_quota_enabled(sb, type));
1780
1781 inode = dqopt->files[type]; 1873 inode = dqopt->files[type];
1782 dqopt->files[type] = NULL; 1874 dqopt->files[type] = NULL;
1783 reset_enable_flags(dqopt, type, 0); 1875 flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
1876 DQUOT_LIMITS_ENABLED, type);
1877 dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type);
1784 mutex_unlock(&dqopt->dqonoff_mutex); 1878 mutex_unlock(&dqopt->dqonoff_mutex);
1785 1879
1786 ret = vfs_quota_on_inode(inode, type, dqopt->info[type].dqi_fmt_id); 1880 flags = dquot_generic_flag(flags, type);
1881 ret = vfs_load_quota_inode(inode, type, dqopt->info[type].dqi_fmt_id,
1882 flags);
1787 iput(inode); 1883 iput(inode);
1788 1884
1789 return ret; 1885 return ret;
@@ -1799,12 +1895,12 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
1799 if (path->mnt->mnt_sb != sb) 1895 if (path->mnt->mnt_sb != sb)
1800 error = -EXDEV; 1896 error = -EXDEV;
1801 else 1897 else
1802 error = vfs_quota_on_inode(path->dentry->d_inode, type, 1898 error = vfs_load_quota_inode(path->dentry->d_inode, type,
1803 format_id); 1899 format_id, DQUOT_USAGE_ENABLED |
1900 DQUOT_LIMITS_ENABLED);
1804 return error; 1901 return error;
1805} 1902}
1806 1903
1807/* Actual function called from quotactl() */
1808int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name, 1904int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
1809 int remount) 1905 int remount)
1810{ 1906{
@@ -1823,6 +1919,50 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
1823} 1919}
1824 1920
1825/* 1921/*
1922 * More powerful function for turning on quotas allowing setting
1923 * of individual quota flags
1924 */
1925int vfs_quota_enable(struct inode *inode, int type, int format_id,
1926 unsigned int flags)
1927{
1928 int ret = 0;
1929 struct super_block *sb = inode->i_sb;
1930 struct quota_info *dqopt = sb_dqopt(sb);
1931
1932 /* Just unsuspend quotas? */
1933 if (flags & DQUOT_SUSPENDED)
1934 return vfs_quota_on_remount(sb, type);
1935 if (!flags)
1936 return 0;
1937 /* Just updating flags needed? */
1938 if (sb_has_quota_loaded(sb, type)) {
1939 mutex_lock(&dqopt->dqonoff_mutex);
1940 /* Now do a reliable test... */
1941 if (!sb_has_quota_loaded(sb, type)) {
1942 mutex_unlock(&dqopt->dqonoff_mutex);
1943 goto load_quota;
1944 }
1945 if (flags & DQUOT_USAGE_ENABLED &&
1946 sb_has_quota_usage_enabled(sb, type)) {
1947 ret = -EBUSY;
1948 goto out_lock;
1949 }
1950 if (flags & DQUOT_LIMITS_ENABLED &&
1951 sb_has_quota_limits_enabled(sb, type)) {
1952 ret = -EBUSY;
1953 goto out_lock;
1954 }
1955 sb_dqopt(sb)->flags |= dquot_state_flag(flags, type);
1956out_lock:
1957 mutex_unlock(&dqopt->dqonoff_mutex);
1958 return ret;
1959 }
1960
1961load_quota:
1962 return vfs_load_quota_inode(inode, type, format_id, flags);
1963}
1964
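vfs_quota_enable() splits the old single on/off state into independent usage-accounting and limit-enforcement flags. A hedged sketch of a filesystem-side caller, assuming the declarations from linux/quota.h and linux/quotaops.h; example_enable_quota and quota_inode are illustrative names, and only vfs_quota_enable() and the DQUOT_* flags come from the code above:

#include <linux/fs.h>
#include <linux/quota.h>
#include <linux/quotaops.h>

static int example_enable_quota(struct inode *quota_inode)
{
	int err;

	/* Start accounting blocks and inodes without enforcing limits. */
	err = vfs_quota_enable(quota_inode, USRQUOTA, QFMT_VFS_V0,
			       DQUOT_USAGE_ENABLED);
	if (err)
		return err;
	/* Later, additionally enforce limits.  Re-passing a flag that is
	 * already set would return -EBUSY, so only the new flag goes in. */
	return vfs_quota_enable(quota_inode, USRQUOTA, QFMT_VFS_V0,
				DQUOT_LIMITS_ENABLED);
}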
1965/*
1826 * This function is used when filesystem needs to initialize quotas 1966 * This function is used when filesystem needs to initialize quotas
1827 * during mount time. 1967 * during mount time.
1828 */ 1968 */
@@ -1843,7 +1983,8 @@ int vfs_quota_on_mount(struct super_block *sb, char *qf_name,
1843 1983
1844 error = security_quota_on(dentry); 1984 error = security_quota_on(dentry);
1845 if (!error) 1985 if (!error)
1846 error = vfs_quota_on_inode(dentry->d_inode, type, format_id); 1986 error = vfs_load_quota_inode(dentry->d_inode, type, format_id,
1987 DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
1847 1988
1848out: 1989out:
1849 dput(dentry); 1990 dput(dentry);
@@ -1866,14 +2007,24 @@ int vfs_dq_quota_on_remount(struct super_block *sb)
1866 return ret; 2007 return ret;
1867} 2008}
1868 2009
2010static inline qsize_t qbtos(qsize_t blocks)
2011{
2012 return blocks << QIF_DQBLKSIZE_BITS;
2013}
2014
2015static inline qsize_t stoqb(qsize_t space)
2016{
2017 return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS;
2018}
2019
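These two helpers convert between the 1 KiB quota blocks exposed in struct if_dqblk and the byte counts the quota core now keeps in struct mem_dqblk; do_get_dqblk() and do_set_dqblk() below apply them at the quotactl boundary. A minimal userspace sketch of the arithmetic, assuming QIF_DQBLKSIZE_BITS == 10 as defined in include/linux/quota.h:

#include <stdio.h>

#define QIF_DQBLKSIZE_BITS 10
#define QIF_DQBLKSIZE (1ULL << QIF_DQBLKSIZE_BITS)

typedef unsigned long long qsize_t;

static qsize_t qbtos(qsize_t blocks)	/* quota blocks -> bytes */
{
	return blocks << QIF_DQBLKSIZE_BITS;
}

static qsize_t stoqb(qsize_t space)	/* bytes -> quota blocks, rounded up */
{
	return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS;
}

int main(void)
{
	/* 5 blocks are 5120 bytes; 5121 bytes round up to 6 blocks, so a
	 * partially used block still counts against the block limit. */
	printf("%llu %llu %llu\n", qbtos(5), stoqb(5120), stoqb(5121));
	return 0;
}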
1869/* Generic routine for getting common part of quota structure */ 2020/* Generic routine for getting common part of quota structure */
1870static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di) 2021static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di)
1871{ 2022{
1872 struct mem_dqblk *dm = &dquot->dq_dqb; 2023 struct mem_dqblk *dm = &dquot->dq_dqb;
1873 2024
1874 spin_lock(&dq_data_lock); 2025 spin_lock(&dq_data_lock);
1875 di->dqb_bhardlimit = dm->dqb_bhardlimit; 2026 di->dqb_bhardlimit = stoqb(dm->dqb_bhardlimit);
1876 di->dqb_bsoftlimit = dm->dqb_bsoftlimit; 2027 di->dqb_bsoftlimit = stoqb(dm->dqb_bsoftlimit);
1877 di->dqb_curspace = dm->dqb_curspace; 2028 di->dqb_curspace = dm->dqb_curspace;
1878 di->dqb_ihardlimit = dm->dqb_ihardlimit; 2029 di->dqb_ihardlimit = dm->dqb_ihardlimit;
1879 di->dqb_isoftlimit = dm->dqb_isoftlimit; 2030 di->dqb_isoftlimit = dm->dqb_isoftlimit;
@@ -1918,28 +2069,38 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
1918 if (di->dqb_valid & QIF_SPACE) { 2069 if (di->dqb_valid & QIF_SPACE) {
1919 dm->dqb_curspace = di->dqb_curspace; 2070 dm->dqb_curspace = di->dqb_curspace;
1920 check_blim = 1; 2071 check_blim = 1;
2072 __set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
1921 } 2073 }
1922 if (di->dqb_valid & QIF_BLIMITS) { 2074 if (di->dqb_valid & QIF_BLIMITS) {
1923 dm->dqb_bsoftlimit = di->dqb_bsoftlimit; 2075 dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit);
1924 dm->dqb_bhardlimit = di->dqb_bhardlimit; 2076 dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit);
1925 check_blim = 1; 2077 check_blim = 1;
2078 __set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
1926 } 2079 }
1927 if (di->dqb_valid & QIF_INODES) { 2080 if (di->dqb_valid & QIF_INODES) {
1928 dm->dqb_curinodes = di->dqb_curinodes; 2081 dm->dqb_curinodes = di->dqb_curinodes;
1929 check_ilim = 1; 2082 check_ilim = 1;
2083 __set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
1930 } 2084 }
1931 if (di->dqb_valid & QIF_ILIMITS) { 2085 if (di->dqb_valid & QIF_ILIMITS) {
1932 dm->dqb_isoftlimit = di->dqb_isoftlimit; 2086 dm->dqb_isoftlimit = di->dqb_isoftlimit;
1933 dm->dqb_ihardlimit = di->dqb_ihardlimit; 2087 dm->dqb_ihardlimit = di->dqb_ihardlimit;
1934 check_ilim = 1; 2088 check_ilim = 1;
2089 __set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
1935 } 2090 }
1936 if (di->dqb_valid & QIF_BTIME) 2091 if (di->dqb_valid & QIF_BTIME) {
1937 dm->dqb_btime = di->dqb_btime; 2092 dm->dqb_btime = di->dqb_btime;
1938 if (di->dqb_valid & QIF_ITIME) 2093 check_blim = 1;
2094 __set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
2095 }
2096 if (di->dqb_valid & QIF_ITIME) {
1939 dm->dqb_itime = di->dqb_itime; 2097 dm->dqb_itime = di->dqb_itime;
2098 check_ilim = 1;
2099 __set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
2100 }
1940 2101
1941 if (check_blim) { 2102 if (check_blim) {
1942 if (!dm->dqb_bsoftlimit || toqb(dm->dqb_curspace) < dm->dqb_bsoftlimit) { 2103 if (!dm->dqb_bsoftlimit || dm->dqb_curspace < dm->dqb_bsoftlimit) {
1943 dm->dqb_btime = 0; 2104 dm->dqb_btime = 0;
1944 clear_bit(DQ_BLKS_B, &dquot->dq_flags); 2105 clear_bit(DQ_BLKS_B, &dquot->dq_flags);
1945 } 2106 }
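The quotactl interface keeps expressing limits in QIF_DQBLKSIZE (1 KiB) units; only the in-memory copy is byte-based, with qbtos()/stoqb() doing the translation in do_set_dqblk()/do_get_dqblk(). A hedged userspace sketch of setting block limits; the device path and uid are illustrative:

#include <sys/quota.h>
#include <sys/types.h>
#include <stdio.h>

int main(void)
{
	/* Soft limit 1 MiB, hard limit 2 MiB, in 1 KiB quota blocks;
	 * do_set_dqblk() converts both to bytes via qbtos(). */
	struct dqblk dq = {
		.dqb_bsoftlimit	= 1024,
		.dqb_bhardlimit	= 2048,
		.dqb_valid	= QIF_BLIMITS,
	};

	if (quotactl(QCMD(Q_SETQUOTA, USRQUOTA), "/dev/sda1", 1000,
		     (caddr_t)&dq) != 0)
		perror("quotactl");
	return 0;
}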
@@ -1970,12 +2131,14 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
1970 int rc; 2131 int rc;
1971 2132
1972 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); 2133 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
1973 if (!(dquot = dqget(sb, id, type))) { 2134 dquot = dqget(sb, id, type);
1974 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2135 if (!dquot) {
1975 return -ESRCH; 2136 rc = -ESRCH;
2137 goto out;
1976 } 2138 }
1977 rc = do_set_dqblk(dquot, di); 2139 rc = do_set_dqblk(dquot, di);
1978 dqput(dquot); 2140 dqput(dquot);
2141out:
1979 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2142 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
1980 return rc; 2143 return rc;
1981} 2144}
@@ -1986,7 +2149,7 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
1986 struct mem_dqinfo *mi; 2149 struct mem_dqinfo *mi;
1987 2150
1988 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); 2151 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
1989 if (!sb_has_quota_enabled(sb, type)) { 2152 if (!sb_has_quota_active(sb, type)) {
1990 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2153 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
1991 return -ESRCH; 2154 return -ESRCH;
1992 } 2155 }
@@ -2005,11 +2168,12 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2005int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) 2168int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2006{ 2169{
2007 struct mem_dqinfo *mi; 2170 struct mem_dqinfo *mi;
2171 int err = 0;
2008 2172
2009 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); 2173 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
2010 if (!sb_has_quota_enabled(sb, type)) { 2174 if (!sb_has_quota_active(sb, type)) {
2011 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2175 err = -ESRCH;
2012 return -ESRCH; 2176 goto out;
2013 } 2177 }
2014 mi = sb_dqopt(sb)->info + type; 2178 mi = sb_dqopt(sb)->info + type;
2015 spin_lock(&dq_data_lock); 2179 spin_lock(&dq_data_lock);
@@ -2023,8 +2187,9 @@ int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2023 mark_info_dirty(sb, type); 2187 mark_info_dirty(sb, type);
2024 /* Force write to disk */ 2188 /* Force write to disk */
2025 sb->dq_op->write_info(sb, type); 2189 sb->dq_op->write_info(sb, type);
2190out:
2026 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2191 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
2027 return 0; 2192 return err;
2028} 2193}
2029 2194
2030struct quotactl_ops vfs_quotactl_ops = { 2195struct quotactl_ops vfs_quotactl_ops = {
@@ -2186,10 +2351,13 @@ EXPORT_SYMBOL(register_quota_format);
2186EXPORT_SYMBOL(unregister_quota_format); 2351EXPORT_SYMBOL(unregister_quota_format);
2187EXPORT_SYMBOL(dqstats); 2352EXPORT_SYMBOL(dqstats);
2188EXPORT_SYMBOL(dq_data_lock); 2353EXPORT_SYMBOL(dq_data_lock);
2354EXPORT_SYMBOL(vfs_quota_enable);
2189EXPORT_SYMBOL(vfs_quota_on); 2355EXPORT_SYMBOL(vfs_quota_on);
2190EXPORT_SYMBOL(vfs_quota_on_path); 2356EXPORT_SYMBOL(vfs_quota_on_path);
2191EXPORT_SYMBOL(vfs_quota_on_mount); 2357EXPORT_SYMBOL(vfs_quota_on_mount);
2358EXPORT_SYMBOL(vfs_quota_disable);
2192EXPORT_SYMBOL(vfs_quota_off); 2359EXPORT_SYMBOL(vfs_quota_off);
2360EXPORT_SYMBOL(dquot_scan_active);
2193EXPORT_SYMBOL(vfs_quota_sync); 2361EXPORT_SYMBOL(vfs_quota_sync);
2194EXPORT_SYMBOL(vfs_get_dqinfo); 2362EXPORT_SYMBOL(vfs_get_dqinfo);
2195EXPORT_SYMBOL(vfs_set_dqinfo); 2363EXPORT_SYMBOL(vfs_set_dqinfo);
@@ -2202,7 +2370,11 @@ EXPORT_SYMBOL(dquot_release);
2202EXPORT_SYMBOL(dquot_mark_dquot_dirty); 2370EXPORT_SYMBOL(dquot_mark_dquot_dirty);
2203EXPORT_SYMBOL(dquot_initialize); 2371EXPORT_SYMBOL(dquot_initialize);
2204EXPORT_SYMBOL(dquot_drop); 2372EXPORT_SYMBOL(dquot_drop);
2373EXPORT_SYMBOL(dquot_drop_locked);
2205EXPORT_SYMBOL(vfs_dq_drop); 2374EXPORT_SYMBOL(vfs_dq_drop);
2375EXPORT_SYMBOL(dqget);
2376EXPORT_SYMBOL(dqput);
2377EXPORT_SYMBOL(dquot_is_cached);
2206EXPORT_SYMBOL(dquot_alloc_space); 2378EXPORT_SYMBOL(dquot_alloc_space);
2207EXPORT_SYMBOL(dquot_alloc_inode); 2379EXPORT_SYMBOL(dquot_alloc_inode);
2208EXPORT_SYMBOL(dquot_free_space); 2380EXPORT_SYMBOL(dquot_free_space);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 6046239465a1..c01e043670e2 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -175,8 +175,8 @@ out:
175 * 175 *
176 * Returns zero on success; non-zero on error. 176 * Returns zero on success; non-zero on error.
177 */ 177 */
178static int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, 178int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
179 loff_t offset) 179 loff_t offset)
180{ 180{
181 int rc = 0; 181 int rc = 0;
182 char dst[MD5_DIGEST_SIZE]; 182 char dst[MD5_DIGEST_SIZE];
@@ -924,6 +924,15 @@ static void ecryptfs_copy_mount_wide_flags_to_inode_flags(
924 crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; 924 crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
925 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) 925 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)
926 crypt_stat->flags |= ECRYPTFS_VIEW_AS_ENCRYPTED; 926 crypt_stat->flags |= ECRYPTFS_VIEW_AS_ENCRYPTED;
927 if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
928 crypt_stat->flags |= ECRYPTFS_ENCRYPT_FILENAMES;
929 if (mount_crypt_stat->flags
930 & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)
931 crypt_stat->flags |= ECRYPTFS_ENCFN_USE_MOUNT_FNEK;
932 else if (mount_crypt_stat->flags
933 & ECRYPTFS_GLOBAL_ENCFN_USE_FEK)
934 crypt_stat->flags |= ECRYPTFS_ENCFN_USE_FEK;
935 }
927} 936}
928 937
929static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs( 938static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs(
@@ -1060,7 +1069,8 @@ struct ecryptfs_flag_map_elem {
1060static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = { 1069static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = {
1061 {0x00000001, ECRYPTFS_ENABLE_HMAC}, 1070 {0x00000001, ECRYPTFS_ENABLE_HMAC},
1062 {0x00000002, ECRYPTFS_ENCRYPTED}, 1071 {0x00000002, ECRYPTFS_ENCRYPTED},
1063 {0x00000004, ECRYPTFS_METADATA_IN_XATTR} 1072 {0x00000004, ECRYPTFS_METADATA_IN_XATTR},
1073 {0x00000008, ECRYPTFS_ENCRYPT_FILENAMES}
1064}; 1074};
1065 1075
1066/** 1076/**
@@ -1149,19 +1159,20 @@ ecryptfs_cipher_code_str_map[] = {
1149 1159
1150/** 1160/**
1151 * ecryptfs_code_for_cipher_string 1161 * ecryptfs_code_for_cipher_string
1152 * @crypt_stat: The cryptographic context 1162 * @cipher_name: The string alias for the cipher
1163 * @key_bytes: Length of key in bytes; used for AES code selection
1153 * 1164 *
1154 * Returns zero on no match, or the cipher code on match 1165 * Returns zero on no match, or the cipher code on match
1155 */ 1166 */
1156u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat) 1167u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes)
1157{ 1168{
1158 int i; 1169 int i;
1159 u8 code = 0; 1170 u8 code = 0;
1160 struct ecryptfs_cipher_code_str_map_elem *map = 1171 struct ecryptfs_cipher_code_str_map_elem *map =
1161 ecryptfs_cipher_code_str_map; 1172 ecryptfs_cipher_code_str_map;
1162 1173
1163 if (strcmp(crypt_stat->cipher, "aes") == 0) { 1174 if (strcmp(cipher_name, "aes") == 0) {
1164 switch (crypt_stat->key_size) { 1175 switch (key_bytes) {
1165 case 16: 1176 case 16:
1166 code = RFC2440_CIPHER_AES_128; 1177 code = RFC2440_CIPHER_AES_128;
1167 break; 1178 break;
@@ -1173,7 +1184,7 @@ u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat)
1173 } 1184 }
1174 } else { 1185 } else {
1175 for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++) 1186 for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++)
1176 if (strcmp(crypt_stat->cipher, map[i].cipher_str) == 0){ 1187 if (strcmp(cipher_name, map[i].cipher_str) == 0) {
1177 code = map[i].cipher_code; 1188 code = map[i].cipher_code;
1178 break; 1189 break;
1179 } 1190 }
@@ -1212,6 +1223,8 @@ int ecryptfs_read_and_validate_header_region(char *data,
1212 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); 1223 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
1213 int rc; 1224 int rc;
1214 1225
1226 if (crypt_stat->extent_size == 0)
1227 crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE;
1215 rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size, 1228 rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size,
1216 ecryptfs_inode); 1229 ecryptfs_inode);
1217 if (rc) { 1230 if (rc) {
@@ -1221,7 +1234,6 @@ int ecryptfs_read_and_validate_header_region(char *data,
1221 } 1234 }
1222 if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) { 1235 if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) {
1223 rc = -EINVAL; 1236 rc = -EINVAL;
1224 ecryptfs_printk(KERN_DEBUG, "Valid marker not found\n");
1225 } 1237 }
1226out: 1238out:
1227 return rc; 1239 return rc;
@@ -1628,95 +1640,95 @@ out:
1628} 1640}
1629 1641
1630/** 1642/**
1631 * ecryptfs_encode_filename - converts a plaintext file name to cipher text 1643 * ecryptfs_encrypt_filename - encrypt filename
1632 * @crypt_stat: The crypt_stat struct associated with the file anem to encode
1633 * @name: The plaintext name
1634 * @length: The length of the plaintext
1635 * @encoded_name: The encypted name
1636 * 1644 *
1637 * Encrypts and encodes a filename into something that constitutes a 1645 * CBC-encrypts the filename. We do not want to encrypt the same
1638 * valid filename for a filesystem, with printable characters. 1646 * filename with the same key and IV, which may happen with hard
1647 * links, so we prepend random bits to each filename.
1639 * 1648 *
1640 * We assume that we have a properly initialized crypto context, 1649 * Returns zero on success; non-zero otherwise
1641 * pointed to by crypt_stat->tfm.
1642 *
1643 * TODO: Implement filename decoding and decryption here, in place of
1644 * memcpy. We are keeping the framework around for now to (1)
1645 * facilitate testing of the components needed to implement filename
1646 * encryption and (2) to provide a code base from which other
1647 * developers in the community can easily implement this feature.
1648 *
1649 * Returns the length of encoded filename; negative if error
1650 */ 1650 */
1651int 1651static int
1652ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat, 1652ecryptfs_encrypt_filename(struct ecryptfs_filename *filename,
1653 const char *name, int length, char **encoded_name) 1653 struct ecryptfs_crypt_stat *crypt_stat,
1654 struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
1654{ 1655{
1655 int error = 0; 1656 int rc = 0;
1656 1657
1657 (*encoded_name) = kmalloc(length + 2, GFP_KERNEL); 1658 filename->encrypted_filename = NULL;
1658 if (!(*encoded_name)) { 1659 filename->encrypted_filename_size = 0;
1659 error = -ENOMEM; 1660 if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
1661 || (mount_crypt_stat && (mount_crypt_stat->flags
1662 & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) {
1663 size_t packet_size;
1664 size_t remaining_bytes;
1665
1666 rc = ecryptfs_write_tag_70_packet(
1667 NULL, NULL,
1668 &filename->encrypted_filename_size,
1669 mount_crypt_stat, NULL,
1670 filename->filename_size);
1671 if (rc) {
1672 printk(KERN_ERR "%s: Error attempting to get packet "
 1673                                "size for tag 70; rc = [%d]\n", __func__,
1674 rc);
1675 filename->encrypted_filename_size = 0;
1676 goto out;
1677 }
1678 filename->encrypted_filename =
1679 kmalloc(filename->encrypted_filename_size, GFP_KERNEL);
1680 if (!filename->encrypted_filename) {
1681 printk(KERN_ERR "%s: Out of memory whilst attempting "
1682 "to kmalloc [%zd] bytes\n", __func__,
1683 filename->encrypted_filename_size);
1684 rc = -ENOMEM;
1685 goto out;
1686 }
1687 remaining_bytes = filename->encrypted_filename_size;
1688 rc = ecryptfs_write_tag_70_packet(filename->encrypted_filename,
1689 &remaining_bytes,
1690 &packet_size,
1691 mount_crypt_stat,
1692 filename->filename,
1693 filename->filename_size);
1694 if (rc) {
1695 printk(KERN_ERR "%s: Error attempting to generate "
1696 "tag 70 packet; rc = [%d]\n", __func__,
1697 rc);
1698 kfree(filename->encrypted_filename);
1699 filename->encrypted_filename = NULL;
1700 filename->encrypted_filename_size = 0;
1701 goto out;
1702 }
1703 filename->encrypted_filename_size = packet_size;
1704 } else {
1705 printk(KERN_ERR "%s: No support for requested filename "
1706 "encryption method in this release\n", __func__);
1707 rc = -ENOTSUPP;
1660 goto out; 1708 goto out;
1661 } 1709 }
1662 /* TODO: Filename encryption is a scheduled feature for a
1663 * future version of eCryptfs. This function is here only for
1664 * the purpose of providing a framework for other developers
1665 * to easily implement filename encryption. Hint: Replace this
1666 * memcpy() with a call to encrypt and encode the
1667 * filename, the set the length accordingly. */
1668 memcpy((void *)(*encoded_name), (void *)name, length);
1669 (*encoded_name)[length] = '\0';
1670 error = length + 1;
1671out: 1710out:
1672 return error; 1711 return rc;
1673} 1712}
1674 1713
1675/** 1714static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size,
1676 * ecryptfs_decode_filename - converts the cipher text name to plaintext 1715 const char *name, size_t name_size)
1677 * @crypt_stat: The crypt_stat struct associated with the file
1678 * @name: The filename in cipher text
1679 * @length: The length of the cipher text name
1680 * @decrypted_name: The plaintext name
1681 *
1682 * Decodes and decrypts the filename.
1683 *
1684 * We assume that we have a properly initialized crypto context,
1685 * pointed to by crypt_stat->tfm.
1686 *
1687 * TODO: Implement filename decoding and decryption here, in place of
1688 * memcpy. We are keeping the framework around for now to (1)
1689 * facilitate testing of the components needed to implement filename
1690 * encryption and (2) to provide a code base from which other
1691 * developers in the community can easily implement this feature.
1692 *
1693 * Returns the length of decoded filename; negative if error
1694 */
1695int
1696ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat,
1697 const char *name, int length, char **decrypted_name)
1698{ 1716{
1699 int error = 0; 1717 int rc = 0;
1700 1718
1701 (*decrypted_name) = kmalloc(length + 2, GFP_KERNEL); 1719 (*copied_name) = kmalloc((name_size + 2), GFP_KERNEL);
1702 if (!(*decrypted_name)) { 1720 if (!(*copied_name)) {
1703 error = -ENOMEM; 1721 rc = -ENOMEM;
1704 goto out; 1722 goto out;
1705 } 1723 }
1706 /* TODO: Filename encryption is a scheduled feature for a 1724 memcpy((void *)(*copied_name), (void *)name, name_size);
1707 * future version of eCryptfs. This function is here only for 1725 (*copied_name)[(name_size)] = '\0'; /* Only for convenience
1708 * the purpose of providing a framework for other developers
1709 * to easily implement filename encryption. Hint: Replace this
1710 * memcpy() with a call to decode and decrypt the
1711 * filename, the set the length accordingly. */
1712 memcpy((void *)(*decrypted_name), (void *)name, length);
1713 (*decrypted_name)[length + 1] = '\0'; /* Only for convenience
1714 * in printing out the 1726 * in printing out the
1715 * string in debug 1727 * string in debug
1716 * messages */ 1728 * messages */
1717 error = length; 1729 (*copied_name_size) = (name_size + 1);
1718out: 1730out:
1719 return error; 1731 return rc;
1720} 1732}
1721 1733
1722/** 1734/**
@@ -1740,7 +1752,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
1740 *key_tfm = NULL; 1752 *key_tfm = NULL;
1741 if (*key_size > ECRYPTFS_MAX_KEY_BYTES) { 1753 if (*key_size > ECRYPTFS_MAX_KEY_BYTES) {
1742 rc = -EINVAL; 1754 rc = -EINVAL;
1743 printk(KERN_ERR "Requested key size is [%Zd] bytes; maximum " 1755 printk(KERN_ERR "Requested key size is [%zd] bytes; maximum "
1744 "allowable is [%d]\n", *key_size, ECRYPTFS_MAX_KEY_BYTES); 1756 "allowable is [%d]\n", *key_size, ECRYPTFS_MAX_KEY_BYTES);
1745 goto out; 1757 goto out;
1746 } 1758 }
@@ -1765,7 +1777,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
1765 get_random_bytes(dummy_key, *key_size); 1777 get_random_bytes(dummy_key, *key_size);
1766 rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size); 1778 rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size);
1767 if (rc) { 1779 if (rc) {
1768 printk(KERN_ERR "Error attempting to set key of size [%Zd] for " 1780 printk(KERN_ERR "Error attempting to set key of size [%zd] for "
1769 "cipher [%s]; rc = [%d]\n", *key_size, cipher_name, rc); 1781 "cipher [%s]; rc = [%d]\n", *key_size, cipher_name, rc);
1770 rc = -EINVAL; 1782 rc = -EINVAL;
1771 goto out; 1783 goto out;
@@ -1910,3 +1922,341 @@ out:
1910 mutex_unlock(&key_tfm_list_mutex); 1922 mutex_unlock(&key_tfm_list_mutex);
1911 return rc; 1923 return rc;
1912} 1924}
1925
1926/* 64 characters forming a 6-bit target field */
1927static unsigned char *portable_filename_chars = ("-.0123456789ABCD"
1928 "EFGHIJKLMNOPQRST"
1929 "UVWXYZabcdefghij"
1930 "klmnopqrstuvwxyz");
1931
1932/* We could either offset on every reverse map or just pad some 0x00's
1933 * at the front here */
1934static const unsigned char filename_rev_map[] = {
1935 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */
1936 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */
1937 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */
1938 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 31 */
1939 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 39 */
1940 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* 47 */
1941 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, /* 55 */
1942 0x0A, 0x0B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 63 */
1943 0x00, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, /* 71 */
1944 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, /* 79 */
1945 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, /* 87 */
1946 0x23, 0x24, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, /* 95 */
1947 0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */
1948 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */
1949 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */
1950 0x3D, 0x3E, 0x3F
1951};
1952
1953/**
1954 * ecryptfs_encode_for_filename
1955 * @dst: Destination location for encoded filename
1956 * @dst_size: Size of the encoded filename in bytes
1957 * @src: Source location for the filename to encode
1958 * @src_size: Size of the source in bytes
1959 */
1960void ecryptfs_encode_for_filename(unsigned char *dst, size_t *dst_size,
1961 unsigned char *src, size_t src_size)
1962{
1963 size_t num_blocks;
1964 size_t block_num = 0;
1965 size_t dst_offset = 0;
1966 unsigned char last_block[3];
1967
1968 if (src_size == 0) {
1969 (*dst_size) = 0;
1970 goto out;
1971 }
1972 num_blocks = (src_size / 3);
1973 if ((src_size % 3) == 0) {
1974 memcpy(last_block, (&src[src_size - 3]), 3);
1975 } else {
1976 num_blocks++;
1977 last_block[2] = 0x00;
1978 switch (src_size % 3) {
1979 case 1:
1980 last_block[0] = src[src_size - 1];
1981 last_block[1] = 0x00;
1982 break;
1983 case 2:
1984 last_block[0] = src[src_size - 2];
1985 last_block[1] = src[src_size - 1];
1986 }
1987 }
1988 (*dst_size) = (num_blocks * 4);
1989 if (!dst)
1990 goto out;
1991 while (block_num < num_blocks) {
1992 unsigned char *src_block;
1993 unsigned char dst_block[4];
1994
1995 if (block_num == (num_blocks - 1))
1996 src_block = last_block;
1997 else
1998 src_block = &src[block_num * 3];
1999 dst_block[0] = ((src_block[0] >> 2) & 0x3F);
2000 dst_block[1] = (((src_block[0] << 4) & 0x30)
2001 | ((src_block[1] >> 4) & 0x0F));
2002 dst_block[2] = (((src_block[1] << 2) & 0x3C)
2003 | ((src_block[2] >> 6) & 0x03));
2004 dst_block[3] = (src_block[2] & 0x3F);
2005 dst[dst_offset++] = portable_filename_chars[dst_block[0]];
2006 dst[dst_offset++] = portable_filename_chars[dst_block[1]];
2007 dst[dst_offset++] = portable_filename_chars[dst_block[2]];
2008 dst[dst_offset++] = portable_filename_chars[dst_block[3]];
2009 block_num++;
2010 }
2011out:
2012 return;
2013}
2014
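The encoder above is a base64 variant over a filesystem-safe alphabet: every 3 source bytes become 4 characters from the 64-character set, and the final partial block is zero-padded. A standalone sketch of the same packing; only the alphabet is copied from portable_filename_chars, the rest is illustrative:

#include <stdio.h>
#include <string.h>

static const char charset[] =
	"-.0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

int main(void)
{
	const unsigned char src[] = "ABC";	/* 0x41 0x42 0x43 */
	size_t src_size = 3;
	size_t i;

	for (i = 0; i < src_size; i += 3) {
		unsigned char b[3] = { 0, 0, 0 };

		memcpy(b, src + i, src_size - i < 3 ? src_size - i : 3);
		putchar(charset[(b[0] >> 2) & 0x3F]);
		putchar(charset[((b[0] << 4) & 0x30) | ((b[1] >> 4) & 0x0F)]);
		putchar(charset[((b[1] << 2) & 0x3C) | ((b[2] >> 6) & 0x03)]);
		putchar(charset[b[2] & 0x3F]);
	}
	putchar('\n');	/* prints "EI71" for the bytes 0x41 0x42 0x43 */
	return 0;
}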
2015/**
2016 * ecryptfs_decode_from_filename
2017 * @dst: If NULL, this function only sets @dst_size and returns. If
2018 * non-NULL, this function decodes the encoded octets in @src
2019 * into the memory that @dst points to.
2020 * @dst_size: Set to the size of the decoded string.
2021 * @src: The encoded set of octets to decode.
2022 * @src_size: The size of the encoded set of octets to decode.
2023 */
2024static void
2025ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size,
2026 const unsigned char *src, size_t src_size)
2027{
2028 u8 current_bit_offset = 0;
2029 size_t src_byte_offset = 0;
2030 size_t dst_byte_offset = 0;
2031
2032 if (dst == NULL) {
2033 /* Not exact; conservatively long. Every block of 4
2034 * encoded characters decodes into a block of 3
2035 * decoded characters. This segment of code provides
2036 * the caller with the maximum amount of allocated
2037 * space that @dst will need to point to in a
2038 * subsequent call. */
2039 (*dst_size) = (((src_size + 1) * 3) / 4);
2040 goto out;
2041 }
2042 while (src_byte_offset < src_size) {
2043 unsigned char src_byte =
2044 filename_rev_map[(int)src[src_byte_offset]];
2045
2046 switch (current_bit_offset) {
2047 case 0:
2048 dst[dst_byte_offset] = (src_byte << 2);
2049 current_bit_offset = 6;
2050 break;
2051 case 6:
2052 dst[dst_byte_offset++] |= (src_byte >> 4);
2053 dst[dst_byte_offset] = ((src_byte & 0xF)
2054 << 4);
2055 current_bit_offset = 4;
2056 break;
2057 case 4:
2058 dst[dst_byte_offset++] |= (src_byte >> 2);
2059 dst[dst_byte_offset] = (src_byte << 6);
2060 current_bit_offset = 2;
2061 break;
2062 case 2:
2063 dst[dst_byte_offset++] |= (src_byte);
2064 dst[dst_byte_offset] = 0;
2065 current_bit_offset = 0;
2066 break;
2067 }
2068 src_byte_offset++;
2069 }
2070 (*dst_size) = dst_byte_offset;
2071out:
2072 return;
2073}
2074
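The NULL-dst sizing pass reserves ((src_size + 1) * 3) / 4 bytes, roughly 3 output bytes per 4 encoded characters, and the second pass then reports the exact count in dst_byte_offset. A quick standalone check of the two quantities (illustrative only):

#include <assert.h>
#include <stdio.h>

int main(void)
{
	size_t s;

	for (s = 1; s <= 12; s++) {
		size_t reserved = ((s + 1) * 3) / 4;	/* NULL-dst estimate */
		size_t exact = (s * 6) / 8;	/* whole bytes of 6-bit payload */

		assert(reserved >= exact);
		printf("src=%zu reserved=%zu exact=%zu\n", s, reserved, exact);
	}
	return 0;
}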
2075/**
2076 * ecryptfs_encrypt_and_encode_filename - converts a plaintext file name to cipher text
 2077 * @encoded_name: Set to the encrypted and encoded name
 2078 * @encoded_name_size: Set to the size of the encrypted, encoded name
 2079 * @crypt_stat: The crypt_stat struct associated with the file name to encode
 2080 * @mount_crypt_stat: The mount-wide cryptographic context
2081 *
2082 * Encrypts and encodes a filename into something that constitutes a
2083 * valid filename for a filesystem, with printable characters.
2084 *
2085 * We assume that we have a properly initialized crypto context,
2086 * pointed to by crypt_stat->tfm.
2087 *
 2088 * Returns zero on success; non-zero otherwise
2089 */
2090int ecryptfs_encrypt_and_encode_filename(
2091 char **encoded_name,
2092 size_t *encoded_name_size,
2093 struct ecryptfs_crypt_stat *crypt_stat,
2094 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
2095 const char *name, size_t name_size)
2096{
2097 size_t encoded_name_no_prefix_size;
2098 int rc = 0;
2099
2100 (*encoded_name) = NULL;
2101 (*encoded_name_size) = 0;
2102 if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES))
2103 || (mount_crypt_stat && (mount_crypt_stat->flags
2104 & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) {
2105 struct ecryptfs_filename *filename;
2106
2107 filename = kzalloc(sizeof(*filename), GFP_KERNEL);
2108 if (!filename) {
2109 printk(KERN_ERR "%s: Out of memory whilst attempting "
2110 "to kzalloc [%zd] bytes\n", __func__,
2111 sizeof(*filename));
2112 rc = -ENOMEM;
2113 goto out;
2114 }
2115 filename->filename = (char *)name;
2116 filename->filename_size = name_size;
2117 rc = ecryptfs_encrypt_filename(filename, crypt_stat,
2118 mount_crypt_stat);
2119 if (rc) {
2120 printk(KERN_ERR "%s: Error attempting to encrypt "
2121 "filename; rc = [%d]\n", __func__, rc);
2122 kfree(filename);
2123 goto out;
2124 }
2125 ecryptfs_encode_for_filename(
2126 NULL, &encoded_name_no_prefix_size,
2127 filename->encrypted_filename,
2128 filename->encrypted_filename_size);
2129 if ((crypt_stat && (crypt_stat->flags
2130 & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
2131 || (mount_crypt_stat
2132 && (mount_crypt_stat->flags
2133 & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)))
2134 (*encoded_name_size) =
2135 (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
2136 + encoded_name_no_prefix_size);
2137 else
2138 (*encoded_name_size) =
2139 (ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE
2140 + encoded_name_no_prefix_size);
2141 (*encoded_name) = kmalloc((*encoded_name_size) + 1, GFP_KERNEL);
2142 if (!(*encoded_name)) {
2143 printk(KERN_ERR "%s: Out of memory whilst attempting "
 2144                                "to kmalloc [%zd] bytes\n", __func__,
2145 (*encoded_name_size));
2146 rc = -ENOMEM;
2147 kfree(filename->encrypted_filename);
2148 kfree(filename);
2149 goto out;
2150 }
2151 if ((crypt_stat && (crypt_stat->flags
2152 & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
2153 || (mount_crypt_stat
2154 && (mount_crypt_stat->flags
2155 & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) {
2156 memcpy((*encoded_name),
2157 ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
2158 ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE);
2159 ecryptfs_encode_for_filename(
2160 ((*encoded_name)
2161 + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE),
2162 &encoded_name_no_prefix_size,
2163 filename->encrypted_filename,
2164 filename->encrypted_filename_size);
2165 (*encoded_name_size) =
2166 (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
2167 + encoded_name_no_prefix_size);
2168 (*encoded_name)[(*encoded_name_size)] = '\0';
2169 (*encoded_name_size)++;
2170 } else {
2171 rc = -ENOTSUPP;
2172 }
2173 if (rc) {
2174 printk(KERN_ERR "%s: Error attempting to encode "
2175 "encrypted filename; rc = [%d]\n", __func__,
2176 rc);
2177 kfree((*encoded_name));
2178 (*encoded_name) = NULL;
2179 (*encoded_name_size) = 0;
2180 }
2181 kfree(filename->encrypted_filename);
2182 kfree(filename);
2183 } else {
2184 rc = ecryptfs_copy_filename(encoded_name,
2185 encoded_name_size,
2186 name, name_size);
2187 }
2188out:
2189 return rc;
2190}
2191
2192/**
2193 * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext
2194 * @plaintext_name: The plaintext name
2195 * @plaintext_name_size: The plaintext name size
2196 * @ecryptfs_dir_dentry: eCryptfs directory dentry
2197 * @name: The filename in cipher text
2198 * @name_size: The cipher text name size
2199 *
2200 * Decrypts and decodes the filename.
2201 *
 2202 * Returns zero on success; non-zero otherwise
2203 */
2204int ecryptfs_decode_and_decrypt_filename(char **plaintext_name,
2205 size_t *plaintext_name_size,
2206 struct dentry *ecryptfs_dir_dentry,
2207 const char *name, size_t name_size)
2208{
2209 char *decoded_name;
2210 size_t decoded_name_size;
2211 size_t packet_size;
2212 int rc = 0;
2213
2214 if ((name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE)
2215 && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
2216 ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) {
2217 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
2218 &ecryptfs_superblock_to_private(
2219 ecryptfs_dir_dentry->d_sb)->mount_crypt_stat;
2220 const char *orig_name = name;
2221 size_t orig_name_size = name_size;
2222
2223 name += ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
2224 name_size -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
2225 ecryptfs_decode_from_filename(NULL, &decoded_name_size,
2226 name, name_size);
2227 decoded_name = kmalloc(decoded_name_size, GFP_KERNEL);
2228 if (!decoded_name) {
2229 printk(KERN_ERR "%s: Out of memory whilst attempting "
2230 "to kmalloc [%zd] bytes\n", __func__,
2231 decoded_name_size);
2232 rc = -ENOMEM;
2233 goto out;
2234 }
2235 ecryptfs_decode_from_filename(decoded_name, &decoded_name_size,
2236 name, name_size);
2237 rc = ecryptfs_parse_tag_70_packet(plaintext_name,
2238 plaintext_name_size,
2239 &packet_size,
2240 mount_crypt_stat,
2241 decoded_name,
2242 decoded_name_size);
2243 if (rc) {
2244 printk(KERN_INFO "%s: Could not parse tag 70 packet "
2245 "from filename; copying through filename "
2246 "as-is\n", __func__);
2247 rc = ecryptfs_copy_filename(plaintext_name,
2248 plaintext_name_size,
2249 orig_name, orig_name_size);
2250 goto out_free;
2251 }
2252 } else {
2253 rc = ecryptfs_copy_filename(plaintext_name,
2254 plaintext_name_size,
2255 name, name_size);
2256 goto out;
2257 }
2258out_free:
2259 kfree(decoded_name);
2260out:
2261 return rc;
2262}
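Only names carrying the literal FNEK prefix are routed through decode-and-decrypt; everything else is copied through verbatim. A userspace sketch of that routing test, with the prefix and its 24-byte length taken from ecryptfs_kernel.h below; the sample names are illustrative:

#include <stdio.h>
#include <string.h>

#define PREFIX		"ECRYPTFS_FNEK_ENCRYPTED."
#define PREFIX_SIZE	24

static int is_encrypted_name(const char *name, size_t name_size)
{
	return name_size > PREFIX_SIZE &&
	       strncmp(name, PREFIX, PREFIX_SIZE) == 0;
}

int main(void)
{
	const char *a = "ECRYPTFS_FNEK_ENCRYPTED.EI71";
	const char *b = "plain.txt";

	printf("%d %d\n", is_encrypted_name(a, strlen(a)),
	       is_encrypted_name(b, strlen(b)));	/* prints "1 0" */
	return 0;
}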
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index a75026d35d16..c11fc95714ab 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -51,12 +51,16 @@
51#define ECRYPTFS_VERSIONING_XATTR 0x00000010 51#define ECRYPTFS_VERSIONING_XATTR 0x00000010
52#define ECRYPTFS_VERSIONING_MULTKEY 0x00000020 52#define ECRYPTFS_VERSIONING_MULTKEY 0x00000020
53#define ECRYPTFS_VERSIONING_DEVMISC 0x00000040 53#define ECRYPTFS_VERSIONING_DEVMISC 0x00000040
54#define ECRYPTFS_VERSIONING_HMAC 0x00000080
55#define ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION 0x00000100
56#define ECRYPTFS_VERSIONING_GCM 0x00000200
54#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ 57#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \
55 | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \ 58 | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \
56 | ECRYPTFS_VERSIONING_PUBKEY \ 59 | ECRYPTFS_VERSIONING_PUBKEY \
57 | ECRYPTFS_VERSIONING_XATTR \ 60 | ECRYPTFS_VERSIONING_XATTR \
58 | ECRYPTFS_VERSIONING_MULTKEY \ 61 | ECRYPTFS_VERSIONING_MULTKEY \
59 | ECRYPTFS_VERSIONING_DEVMISC) 62 | ECRYPTFS_VERSIONING_DEVMISC \
63 | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION)
60#define ECRYPTFS_MAX_PASSWORD_LENGTH 64 64#define ECRYPTFS_MAX_PASSWORD_LENGTH 64
61#define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH 65#define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH
62#define ECRYPTFS_SALT_SIZE 8 66#define ECRYPTFS_SALT_SIZE 8
@@ -199,6 +203,7 @@ ecryptfs_get_key_payload_data(struct key *key)
199#define ECRYPTFS_DEFAULT_CIPHER "aes" 203#define ECRYPTFS_DEFAULT_CIPHER "aes"
200#define ECRYPTFS_DEFAULT_KEY_BYTES 16 204#define ECRYPTFS_DEFAULT_KEY_BYTES 16
201#define ECRYPTFS_DEFAULT_HASH "md5" 205#define ECRYPTFS_DEFAULT_HASH "md5"
206#define ECRYPTFS_TAG_70_DIGEST ECRYPTFS_DEFAULT_HASH
202#define ECRYPTFS_TAG_1_PACKET_TYPE 0x01 207#define ECRYPTFS_TAG_1_PACKET_TYPE 0x01
203#define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C 208#define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C
204#define ECRYPTFS_TAG_11_PACKET_TYPE 0xED 209#define ECRYPTFS_TAG_11_PACKET_TYPE 0xED
@@ -206,30 +211,64 @@ ecryptfs_get_key_payload_data(struct key *key)
206#define ECRYPTFS_TAG_65_PACKET_TYPE 0x41 211#define ECRYPTFS_TAG_65_PACKET_TYPE 0x41
207#define ECRYPTFS_TAG_66_PACKET_TYPE 0x42 212#define ECRYPTFS_TAG_66_PACKET_TYPE 0x42
208#define ECRYPTFS_TAG_67_PACKET_TYPE 0x43 213#define ECRYPTFS_TAG_67_PACKET_TYPE 0x43
214#define ECRYPTFS_TAG_70_PACKET_TYPE 0x46 /* FNEK-encrypted filename
215 * as dentry name */
216#define ECRYPTFS_TAG_71_PACKET_TYPE 0x47 /* FNEK-encrypted filename in
217 * metadata */
218#define ECRYPTFS_TAG_72_PACKET_TYPE 0x48 /* FEK-encrypted filename as
219 * dentry name */
220#define ECRYPTFS_TAG_73_PACKET_TYPE 0x49 /* FEK-encrypted filename as
221 * metadata */
222/* Constraint: ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES >=
223 * ECRYPTFS_MAX_IV_BYTES */
224#define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16
225#define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */
209#define MD5_DIGEST_SIZE 16 226#define MD5_DIGEST_SIZE 16
227#define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE
228#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FEK_ENCRYPTED."
229#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE 23
230#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FNEK_ENCRYPTED."
231#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 24
232#define ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN (18 + 1 + 4 + 1 + 32)
210 233
211struct ecryptfs_key_sig { 234struct ecryptfs_key_sig {
212 struct list_head crypt_stat_list; 235 struct list_head crypt_stat_list;
213 char keysig[ECRYPTFS_SIG_SIZE_HEX]; 236 char keysig[ECRYPTFS_SIG_SIZE_HEX];
214}; 237};
215 238
239struct ecryptfs_filename {
240 struct list_head crypt_stat_list;
241#define ECRYPTFS_FILENAME_CONTAINS_DECRYPTED 0x00000001
242 u32 flags;
243 u32 seq_no;
244 char *filename;
245 char *encrypted_filename;
246 size_t filename_size;
247 size_t encrypted_filename_size;
248 char fnek_sig[ECRYPTFS_SIG_SIZE_HEX];
249 char dentry_name[ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN + 1];
250};
251
216/** 252/**
217 * This is the primary struct associated with each encrypted file. 253 * This is the primary struct associated with each encrypted file.
218 * 254 *
219 * TODO: cache align/pack? 255 * TODO: cache align/pack?
220 */ 256 */
221struct ecryptfs_crypt_stat { 257struct ecryptfs_crypt_stat {
222#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001 258#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001
223#define ECRYPTFS_POLICY_APPLIED 0x00000002 259#define ECRYPTFS_POLICY_APPLIED 0x00000002
224#define ECRYPTFS_NEW_FILE 0x00000004 260#define ECRYPTFS_NEW_FILE 0x00000004
225#define ECRYPTFS_ENCRYPTED 0x00000008 261#define ECRYPTFS_ENCRYPTED 0x00000008
226#define ECRYPTFS_SECURITY_WARNING 0x00000010 262#define ECRYPTFS_SECURITY_WARNING 0x00000010
227#define ECRYPTFS_ENABLE_HMAC 0x00000020 263#define ECRYPTFS_ENABLE_HMAC 0x00000020
228#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000040 264#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000040
229#define ECRYPTFS_KEY_VALID 0x00000080 265#define ECRYPTFS_KEY_VALID 0x00000080
230#define ECRYPTFS_METADATA_IN_XATTR 0x00000100 266#define ECRYPTFS_METADATA_IN_XATTR 0x00000100
231#define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000200 267#define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000200
232#define ECRYPTFS_KEY_SET 0x00000400 268#define ECRYPTFS_KEY_SET 0x00000400
269#define ECRYPTFS_ENCRYPT_FILENAMES 0x00000800
270#define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00001000
271#define ECRYPTFS_ENCFN_USE_FEK 0x00002000
233 u32 flags; 272 u32 flags;
234 unsigned int file_version; 273 unsigned int file_version;
235 size_t iv_bytes; 274 size_t iv_bytes;
@@ -332,13 +371,20 @@ struct ecryptfs_mount_crypt_stat {
332#define ECRYPTFS_XATTR_METADATA_ENABLED 0x00000002 371#define ECRYPTFS_XATTR_METADATA_ENABLED 0x00000002
333#define ECRYPTFS_ENCRYPTED_VIEW_ENABLED 0x00000004 372#define ECRYPTFS_ENCRYPTED_VIEW_ENABLED 0x00000004
334#define ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED 0x00000008 373#define ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED 0x00000008
374#define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES 0x00000010
375#define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK 0x00000020
376#define ECRYPTFS_GLOBAL_ENCFN_USE_FEK 0x00000040
335 u32 flags; 377 u32 flags;
336 struct list_head global_auth_tok_list; 378 struct list_head global_auth_tok_list;
337 struct mutex global_auth_tok_list_mutex; 379 struct mutex global_auth_tok_list_mutex;
338 size_t num_global_auth_toks; 380 size_t num_global_auth_toks;
339 size_t global_default_cipher_key_size; 381 size_t global_default_cipher_key_size;
382 size_t global_default_fn_cipher_key_bytes;
340 unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE 383 unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE
341 + 1]; 384 + 1];
385 unsigned char global_default_fn_cipher_name[
386 ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
387 char global_default_fnek_sig[ECRYPTFS_SIG_SIZE_HEX + 1];
342}; 388};
343 389
344/* superblock private data. */ 390/* superblock private data. */
@@ -571,13 +617,22 @@ struct ecryptfs_open_req {
571int ecryptfs_interpose(struct dentry *hidden_dentry, 617int ecryptfs_interpose(struct dentry *hidden_dentry,
572 struct dentry *this_dentry, struct super_block *sb, 618 struct dentry *this_dentry, struct super_block *sb,
573 u32 flags); 619 u32 flags);
620int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
621 struct dentry *lower_dentry,
622 struct ecryptfs_crypt_stat *crypt_stat,
623 struct inode *ecryptfs_dir_inode,
624 struct nameidata *ecryptfs_nd);
625int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
626 size_t *decrypted_name_size,
627 struct dentry *ecryptfs_dentry,
628 const char *name, size_t name_size);
574int ecryptfs_fill_zeros(struct file *file, loff_t new_length); 629int ecryptfs_fill_zeros(struct file *file, loff_t new_length);
575int ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat, 630int ecryptfs_encrypt_and_encode_filename(
576 const char *name, int length, 631 char **encoded_name,
577 char **decrypted_name); 632 size_t *encoded_name_size,
578int ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat, 633 struct ecryptfs_crypt_stat *crypt_stat,
579 const char *name, int length, 634 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
580 char **encoded_name); 635 const char *name, size_t name_size);
581struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry); 636struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry);
582void ecryptfs_dump_hex(char *data, int bytes); 637void ecryptfs_dump_hex(char *data, int bytes);
583int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, 638int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
@@ -599,7 +654,7 @@ int ecryptfs_read_and_validate_header_region(char *data,
599 struct inode *ecryptfs_inode); 654 struct inode *ecryptfs_inode);
600int ecryptfs_read_and_validate_xattr_region(char *page_virt, 655int ecryptfs_read_and_validate_xattr_region(char *page_virt,
601 struct dentry *ecryptfs_dentry); 656 struct dentry *ecryptfs_dentry);
602u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat); 657u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes);
603int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code); 658int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code);
604void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat); 659void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat);
605int ecryptfs_generate_key_packet_set(char *dest_base, 660int ecryptfs_generate_key_packet_set(char *dest_base,
@@ -694,5 +749,17 @@ int ecryptfs_privileged_open(struct file **lower_file,
694 struct vfsmount *lower_mnt, 749 struct vfsmount *lower_mnt,
695 const struct cred *cred); 750 const struct cred *cred);
696int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry); 751int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry);
752int
753ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
754 size_t *packet_size,
755 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
756 char *filename, size_t filename_size);
757int
758ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
759 size_t *packet_size,
760 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
761 char *data, size_t max_packet_size);
762int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
763 loff_t offset);
697 764
698#endif /* #ifndef ECRYPTFS_KERNEL_H */ 765#endif /* #ifndef ECRYPTFS_KERNEL_H */
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index eb3dc4c7ac06..9e944057001b 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -77,27 +77,27 @@ struct ecryptfs_getdents_callback {
77 77
78/* Inspired by generic filldir in fs/readdir.c */ 78/* Inspired by generic filldir in fs/readdir.c */
79static int 79static int
80ecryptfs_filldir(void *dirent, const char *name, int namelen, loff_t offset, 80ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen,
81 u64 ino, unsigned int d_type) 81 loff_t offset, u64 ino, unsigned int d_type)
82{ 82{
83 struct ecryptfs_crypt_stat *crypt_stat;
84 struct ecryptfs_getdents_callback *buf = 83 struct ecryptfs_getdents_callback *buf =
85 (struct ecryptfs_getdents_callback *)dirent; 84 (struct ecryptfs_getdents_callback *)dirent;
85 size_t name_size;
86 char *name;
86 int rc; 87 int rc;
87 int decoded_length;
88 char *decoded_name;
89 88
90 crypt_stat = ecryptfs_dentry_to_private(buf->dentry)->crypt_stat;
91 buf->filldir_called++; 89 buf->filldir_called++;
92 decoded_length = ecryptfs_decode_filename(crypt_stat, name, namelen, 90 rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size,
93 &decoded_name); 91 buf->dentry, lower_name,
94 if (decoded_length < 0) { 92 lower_namelen);
95 rc = decoded_length; 93 if (rc) {
94 printk(KERN_ERR "%s: Error attempting to decode and decrypt "
95 "filename [%s]; rc = [%d]\n", __func__, lower_name,
96 rc);
96 goto out; 97 goto out;
97 } 98 }
98 rc = buf->filldir(buf->dirent, decoded_name, decoded_length, offset, 99 rc = buf->filldir(buf->dirent, name, name_size, offset, ino, d_type);
99 ino, d_type); 100 kfree(name);
100 kfree(decoded_name);
101 if (rc >= 0) 101 if (rc >= 0)
102 buf->entries_written++; 102 buf->entries_written++;
103out: 103out:
@@ -106,8 +106,8 @@ out:
106 106
107/** 107/**
108 * ecryptfs_readdir 108 * ecryptfs_readdir
109 * @file: The ecryptfs file struct 109 * @file: The eCryptfs directory file
110 * @dirent: Directory entry 110 * @dirent: Directory entry handle
111 * @filldir: The filldir callback function 111 * @filldir: The filldir callback function
112 */ 112 */
113static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir) 113static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir)
@@ -275,18 +275,9 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
275static int 275static int
276ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync) 276ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync)
277{ 277{
278 struct file *lower_file = ecryptfs_file_to_lower(file); 278 return vfs_fsync(ecryptfs_file_to_lower(file),
279 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); 279 ecryptfs_dentry_to_lower(dentry),
280 struct inode *lower_inode = lower_dentry->d_inode; 280 datasync);
281 int rc = -EINVAL;
282
283 if (lower_inode->i_fop->fsync) {
284 mutex_lock(&lower_inode->i_mutex);
285 rc = lower_inode->i_fop->fsync(lower_file, lower_dentry,
286 datasync);
287 mutex_unlock(&lower_inode->i_mutex);
288 }
289 return rc;
290} 281}
291 282
292static int ecryptfs_fasync(int fd, struct file *file, int flag) 283static int ecryptfs_fasync(int fd, struct file *file, int flag)
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 89209f00f9c7..5697899a168d 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -52,8 +52,7 @@ static void unlock_dir(struct dentry *dir)
52/** 52/**
53 * ecryptfs_create_underlying_file 53 * ecryptfs_create_underlying_file
54 * @lower_dir_inode: inode of the parent in the lower fs of the new file 54 * @lower_dir_inode: inode of the parent in the lower fs of the new file
55 * @lower_dentry: New file's dentry in the lower fs 55 * @dentry: New file's dentry
56 * @ecryptfs_dentry: New file's dentry in ecryptfs
57 * @mode: The mode of the new file 56 * @mode: The mode of the new file
58 * @nd: nameidata of ecryptfs' parent's dentry & vfsmount 57 * @nd: nameidata of ecryptfs' parent's dentry & vfsmount
59 * 58 *
@@ -228,8 +227,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
228{ 227{
229 int rc; 228 int rc;
230 229
231 /* ecryptfs_do_create() calls ecryptfs_interpose(), which opens 230 /* ecryptfs_do_create() calls ecryptfs_interpose() */
232 * the crypt_stat->lower_file (persistent file) */
233 rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode, nd); 231 rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode, nd);
234 if (unlikely(rc)) { 232 if (unlikely(rc)) {
235 ecryptfs_printk(KERN_WARNING, "Failed to create file in" 233 ecryptfs_printk(KERN_WARNING, "Failed to create file in"
@@ -244,141 +242,91 @@ out:
244} 242}
245 243
246/** 244/**
247 * ecryptfs_lookup 245 * ecryptfs_lookup_and_interpose_lower - Perform a lookup
248 * @dir: inode
249 * @dentry: The dentry
250 * @nd: nameidata, may be NULL
251 *
252 * Find a file on disk. If the file does not exist, then we'll add it to the
253 * dentry cache and continue on to read it from the disk.
254 */ 246 */
255static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, 247int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
256 struct nameidata *nd) 248 struct dentry *lower_dentry,
249 struct ecryptfs_crypt_stat *crypt_stat,
250 struct inode *ecryptfs_dir_inode,
251 struct nameidata *ecryptfs_nd)
257{ 252{
258 int rc = 0;
259 struct dentry *lower_dir_dentry; 253 struct dentry *lower_dir_dentry;
260 struct dentry *lower_dentry;
261 struct vfsmount *lower_mnt; 254 struct vfsmount *lower_mnt;
262 char *encoded_name; 255 struct inode *lower_inode;
263 int encoded_namelen;
264 struct ecryptfs_crypt_stat *crypt_stat = NULL;
265 struct ecryptfs_mount_crypt_stat *mount_crypt_stat; 256 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
266 char *page_virt = NULL; 257 char *page_virt = NULL;
267 struct inode *lower_inode;
268 u64 file_size; 258 u64 file_size;
259 int rc = 0;
269 260
270 lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent); 261 lower_dir_dentry = lower_dentry->d_parent;
271 dentry->d_op = &ecryptfs_dops; 262 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
272 if ((dentry->d_name.len == 1 && !strcmp(dentry->d_name.name, ".")) 263 ecryptfs_dentry->d_parent));
273 || (dentry->d_name.len == 2
274 && !strcmp(dentry->d_name.name, ".."))) {
275 d_drop(dentry);
276 goto out;
277 }
278 encoded_namelen = ecryptfs_encode_filename(crypt_stat,
279 dentry->d_name.name,
280 dentry->d_name.len,
281 &encoded_name);
282 if (encoded_namelen < 0) {
283 rc = encoded_namelen;
284 d_drop(dentry);
285 goto out;
286 }
287 ecryptfs_printk(KERN_DEBUG, "encoded_name = [%s]; encoded_namelen "
288 "= [%d]\n", encoded_name, encoded_namelen);
289 lower_dentry = lookup_one_len(encoded_name, lower_dir_dentry,
290 encoded_namelen - 1);
291 kfree(encoded_name);
292 if (IS_ERR(lower_dentry)) {
293 ecryptfs_printk(KERN_ERR, "ERR from lower_dentry\n");
294 rc = PTR_ERR(lower_dentry);
295 d_drop(dentry);
296 goto out;
297 }
298 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
299 ecryptfs_printk(KERN_DEBUG, "lower_dentry = [%p]; lower_dentry->"
300 "d_name.name = [%s]\n", lower_dentry,
301 lower_dentry->d_name.name);
302 lower_inode = lower_dentry->d_inode; 264 lower_inode = lower_dentry->d_inode;
303 fsstack_copy_attr_atime(dir, lower_dir_dentry->d_inode); 265 fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode);
304 BUG_ON(!atomic_read(&lower_dentry->d_count)); 266 BUG_ON(!atomic_read(&lower_dentry->d_count));
305 ecryptfs_set_dentry_private(dentry, 267 ecryptfs_set_dentry_private(ecryptfs_dentry,
306 kmem_cache_alloc(ecryptfs_dentry_info_cache, 268 kmem_cache_alloc(ecryptfs_dentry_info_cache,
307 GFP_KERNEL)); 269 GFP_KERNEL));
308 if (!ecryptfs_dentry_to_private(dentry)) { 270 if (!ecryptfs_dentry_to_private(ecryptfs_dentry)) {
309 rc = -ENOMEM; 271 rc = -ENOMEM;
310 ecryptfs_printk(KERN_ERR, "Out of memory whilst attempting " 272 printk(KERN_ERR "%s: Out of memory whilst attempting "
311 "to allocate ecryptfs_dentry_info struct\n"); 273 "to allocate ecryptfs_dentry_info struct\n",
274 __func__);
312 goto out_dput; 275 goto out_dput;
313 } 276 }
314 ecryptfs_set_dentry_lower(dentry, lower_dentry); 277 ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry);
315 ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt); 278 ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt);
316 if (!lower_dentry->d_inode) { 279 if (!lower_dentry->d_inode) {
317 /* We want to add because we couldn't find in lower */ 280 /* We want to add because we couldn't find in lower */
318 d_add(dentry, NULL); 281 d_add(ecryptfs_dentry, NULL);
319 goto out; 282 goto out;
320 } 283 }
321 rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 284 rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
322 ECRYPTFS_INTERPOSE_FLAG_D_ADD); 285 ecryptfs_dir_inode->i_sb, 1);
323 if (rc) { 286 if (rc) {
324 ecryptfs_printk(KERN_ERR, "Error interposing\n"); 287 printk(KERN_ERR "%s: Error interposing; rc = [%d]\n",
288 __func__, rc);
325 goto out; 289 goto out;
326 } 290 }
327 if (S_ISDIR(lower_inode->i_mode)) { 291 if (S_ISDIR(lower_inode->i_mode))
328 ecryptfs_printk(KERN_DEBUG, "Is a directory; returning\n");
329 goto out; 292 goto out;
330 } 293 if (S_ISLNK(lower_inode->i_mode))
331 if (S_ISLNK(lower_inode->i_mode)) {
332 ecryptfs_printk(KERN_DEBUG, "Is a symlink; returning\n");
333 goto out; 294 goto out;
334 } 295 if (special_file(lower_inode->i_mode))
335 if (special_file(lower_inode->i_mode)) {
336 ecryptfs_printk(KERN_DEBUG, "Is a special file; returning\n");
337 goto out; 296 goto out;
338 } 297 if (!ecryptfs_nd)
339 if (!nd) {
340 ecryptfs_printk(KERN_DEBUG, "We have a NULL nd, just leave"
341 "as we *think* we are about to unlink\n");
342 goto out; 298 goto out;
343 }
344 /* Released in this function */ 299 /* Released in this function */
345 page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, 300 page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER);
346 GFP_USER);
347 if (!page_virt) { 301 if (!page_virt) {
302 printk(KERN_ERR "%s: Cannot kmem_cache_zalloc() a page\n",
303 __func__);
348 rc = -ENOMEM; 304 rc = -ENOMEM;
349 ecryptfs_printk(KERN_ERR,
350 "Cannot ecryptfs_kmalloc a page\n");
351 goto out; 305 goto out;
352 } 306 }
353 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; 307 if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) {
354 if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) 308 rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
355 ecryptfs_set_default_sizes(crypt_stat);
356 if (!ecryptfs_inode_to_private(dentry->d_inode)->lower_file) {
357 rc = ecryptfs_init_persistent_file(dentry);
358 if (rc) { 309 if (rc) {
359 printk(KERN_ERR "%s: Error attempting to initialize " 310 printk(KERN_ERR "%s: Error attempting to initialize "
360 "the persistent file for the dentry with name " 311 "the persistent file for the dentry with name "
361 "[%s]; rc = [%d]\n", __func__, 312 "[%s]; rc = [%d]\n", __func__,
362 dentry->d_name.name, rc); 313 ecryptfs_dentry->d_name.name, rc);
363 goto out; 314 goto out_free_kmem;
364 } 315 }
365 } 316 }
366 rc = ecryptfs_read_and_validate_header_region(page_virt, 317 rc = ecryptfs_read_and_validate_header_region(page_virt,
367 dentry->d_inode); 318 ecryptfs_dentry->d_inode);
368 if (rc) { 319 if (rc) {
369 rc = ecryptfs_read_and_validate_xattr_region(page_virt, dentry); 320 rc = ecryptfs_read_and_validate_xattr_region(page_virt,
321 ecryptfs_dentry);
370 if (rc) { 322 if (rc) {
371 printk(KERN_DEBUG "Valid metadata not found in header "
372 "region or xattr region; treating file as "
373 "unencrypted\n");
374 rc = 0; 323 rc = 0;
375 kmem_cache_free(ecryptfs_header_cache_2, page_virt); 324 goto out_free_kmem;
376 goto out;
377 } 325 }
378 crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; 326 crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
379 } 327 }
380 mount_crypt_stat = &ecryptfs_superblock_to_private( 328 mount_crypt_stat = &ecryptfs_superblock_to_private(
381 dentry->d_sb)->mount_crypt_stat; 329 ecryptfs_dentry->d_sb)->mount_crypt_stat;
382 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) { 330 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) {
383 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) 331 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
384 file_size = (crypt_stat->num_header_bytes_at_front 332 file_size = (crypt_stat->num_header_bytes_at_front
@@ -388,14 +336,103 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
388 } else { 336 } else {
389 file_size = get_unaligned_be64(page_virt); 337 file_size = get_unaligned_be64(page_virt);
390 } 338 }
391 i_size_write(dentry->d_inode, (loff_t)file_size); 339 i_size_write(ecryptfs_dentry->d_inode, (loff_t)file_size);
340out_free_kmem:
392 kmem_cache_free(ecryptfs_header_cache_2, page_virt); 341 kmem_cache_free(ecryptfs_header_cache_2, page_virt);
393 goto out; 342 goto out;
394
395out_dput: 343out_dput:
396 dput(lower_dentry); 344 dput(lower_dentry);
397 d_drop(dentry); 345 d_drop(ecryptfs_dentry);
398out: 346out:
347 return rc;
348}
349
350/**
351 * ecryptfs_lookup
352 * @ecryptfs_dir_inode: The eCryptfs directory inode
353 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
354 * @ecryptfs_nd: nameidata; may be NULL
355 *
 356 * Find a file on disk. If the file does not exist, a (negative) dentry is
 357 * still added to the dentry cache; otherwise its metadata is read from disk.
358 */
359static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
360 struct dentry *ecryptfs_dentry,
361 struct nameidata *ecryptfs_nd)
362{
363 char *encrypted_and_encoded_name = NULL;
364 size_t encrypted_and_encoded_name_size;
365 struct ecryptfs_crypt_stat *crypt_stat = NULL;
366 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
367 struct ecryptfs_inode_info *inode_info;
368 struct dentry *lower_dir_dentry, *lower_dentry;
369 int rc = 0;
370
371 ecryptfs_dentry->d_op = &ecryptfs_dops;
372 if ((ecryptfs_dentry->d_name.len == 1
373 && !strcmp(ecryptfs_dentry->d_name.name, "."))
374 || (ecryptfs_dentry->d_name.len == 2
375 && !strcmp(ecryptfs_dentry->d_name.name, ".."))) {
376 goto out_d_drop;
377 }
378 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
379 lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
380 lower_dir_dentry,
381 ecryptfs_dentry->d_name.len);
382 if (IS_ERR(lower_dentry)) {
383 rc = PTR_ERR(lower_dentry);
384 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
385 "lower_dentry = [%s]\n", __func__, rc,
386 ecryptfs_dentry->d_name.name);
387 goto out_d_drop;
388 }
389 if (lower_dentry->d_inode)
390 goto lookup_and_interpose;
391 inode_info = ecryptfs_inode_to_private(ecryptfs_dentry->d_inode);
392 if (inode_info) {
393 crypt_stat = &inode_info->crypt_stat;
394 /* TODO: lock for crypt_stat comparison */
395 if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
396 ecryptfs_set_default_sizes(crypt_stat);
397 }
398 if (crypt_stat)
399 mount_crypt_stat = crypt_stat->mount_crypt_stat;
400 else
401 mount_crypt_stat = &ecryptfs_superblock_to_private(
402 ecryptfs_dentry->d_sb)->mount_crypt_stat;
403 if (!(crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES))
404 && !(mount_crypt_stat && (mount_crypt_stat->flags
405 & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)))
406 goto lookup_and_interpose;
407 dput(lower_dentry);
408 rc = ecryptfs_encrypt_and_encode_filename(
409 &encrypted_and_encoded_name, &encrypted_and_encoded_name_size,
410 crypt_stat, mount_crypt_stat, ecryptfs_dentry->d_name.name,
411 ecryptfs_dentry->d_name.len);
412 if (rc) {
413 printk(KERN_ERR "%s: Error attempting to encrypt and encode "
414 "filename; rc = [%d]\n", __func__, rc);
415 goto out_d_drop;
416 }
417 lower_dentry = lookup_one_len(encrypted_and_encoded_name,
418 lower_dir_dentry,
419 encrypted_and_encoded_name_size - 1);
420 if (IS_ERR(lower_dentry)) {
421 rc = PTR_ERR(lower_dentry);
422 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
423 "lower_dentry = [%s]\n", __func__, rc,
424 encrypted_and_encoded_name);
425 goto out_d_drop;
426 }
427lookup_and_interpose:
428 rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry,
429 crypt_stat, ecryptfs_dir_inode,
430 ecryptfs_nd);
431 goto out;
432out_d_drop:
433 d_drop(ecryptfs_dentry);
434out:
435 kfree(encrypted_and_encoded_name);
399 return ERR_PTR(rc); 436 return ERR_PTR(rc);
400} 437}
401 438
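The lookup path above reduces to one predicate: look up the plaintext name unless either the inode's crypt_stat or the mount-wide crypt_stat enables filename encryption. A minimal sketch of that decision as a helper (hypothetical; not part of this patch):

static bool ecryptfs_name_needs_encoding(
	struct ecryptfs_crypt_stat *crypt_stat,
	struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
{
	/* Encode if either the per-inode or the mount-wide flag is set */
	return (crypt_stat
		&& (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES))
	       || (mount_crypt_stat
		   && (mount_crypt_stat->flags
		       & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES));
}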
@@ -466,19 +503,21 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
466 struct dentry *lower_dentry; 503 struct dentry *lower_dentry;
467 struct dentry *lower_dir_dentry; 504 struct dentry *lower_dir_dentry;
468 char *encoded_symname; 505 char *encoded_symname;
469 int encoded_symlen; 506 size_t encoded_symlen;
470 struct ecryptfs_crypt_stat *crypt_stat = NULL; 507 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
471 508
472 lower_dentry = ecryptfs_dentry_to_lower(dentry); 509 lower_dentry = ecryptfs_dentry_to_lower(dentry);
473 dget(lower_dentry); 510 dget(lower_dentry);
474 lower_dir_dentry = lock_parent(lower_dentry); 511 lower_dir_dentry = lock_parent(lower_dentry);
475 encoded_symlen = ecryptfs_encode_filename(crypt_stat, symname, 512 mount_crypt_stat = &ecryptfs_superblock_to_private(
476 strlen(symname), 513 dir->i_sb)->mount_crypt_stat;
477 &encoded_symname); 514 rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname,
478 if (encoded_symlen < 0) { 515 &encoded_symlen,
479 rc = encoded_symlen; 516 NULL,
517 mount_crypt_stat, symname,
518 strlen(symname));
519 if (rc)
480 goto out_lock; 520 goto out_lock;
481 }
482 rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry, 521 rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry,
483 encoded_symname); 522 encoded_symname);
484 kfree(encoded_symname); 523 kfree(encoded_symname);
@@ -602,53 +641,54 @@ out_lock:
602} 641}
603 642
604static int 643static int
605ecryptfs_readlink(struct dentry *dentry, char __user * buf, int bufsiz) 644ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
606{ 645{
607 int rc;
608 struct dentry *lower_dentry;
609 char *decoded_name;
610 char *lower_buf; 646 char *lower_buf;
611 mm_segment_t old_fs; 647 struct dentry *lower_dentry;
612 struct ecryptfs_crypt_stat *crypt_stat; 648 struct ecryptfs_crypt_stat *crypt_stat;
649 char *plaintext_name;
650 size_t plaintext_name_size;
651 mm_segment_t old_fs;
652 int rc;
613 653
614 lower_dentry = ecryptfs_dentry_to_lower(dentry); 654 lower_dentry = ecryptfs_dentry_to_lower(dentry);
615 if (!lower_dentry->d_inode->i_op || 655 if (!lower_dentry->d_inode->i_op->readlink) {
616 !lower_dentry->d_inode->i_op->readlink) {
617 rc = -EINVAL; 656 rc = -EINVAL;
618 goto out; 657 goto out;
619 } 658 }
659 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
620 /* Released in this function */ 660 /* Released in this function */
621 lower_buf = kmalloc(bufsiz, GFP_KERNEL); 661 lower_buf = kmalloc(bufsiz, GFP_KERNEL);
622 if (lower_buf == NULL) { 662 if (lower_buf == NULL) {
623 ecryptfs_printk(KERN_ERR, "Out of memory\n"); 663 printk(KERN_ERR "%s: Out of memory whilst attempting to "
664 "kmalloc [%d] bytes\n", __func__, bufsiz);
624 rc = -ENOMEM; 665 rc = -ENOMEM;
625 goto out; 666 goto out;
626 } 667 }
627 old_fs = get_fs(); 668 old_fs = get_fs();
628 set_fs(get_ds()); 669 set_fs(get_ds());
629 ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ "
630 "lower_dentry->d_name.name = [%s]\n",
631 lower_dentry->d_name.name);
632 rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, 670 rc = lower_dentry->d_inode->i_op->readlink(lower_dentry,
633 (char __user *)lower_buf, 671 (char __user *)lower_buf,
634 bufsiz); 672 bufsiz);
635 set_fs(old_fs); 673 set_fs(old_fs);
636 if (rc >= 0) { 674 if (rc >= 0) {
637 crypt_stat = NULL; 675 rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name,
638 rc = ecryptfs_decode_filename(crypt_stat, lower_buf, rc, 676 &plaintext_name_size,
639 &decoded_name); 677 dentry, lower_buf,
640 if (rc == -ENOMEM) 678 rc);
679 if (rc) {
680 printk(KERN_ERR "%s: Error attempting to decode and "
681 "decrypt filename; rc = [%d]\n", __func__,
682 rc);
641 goto out_free_lower_buf; 683 goto out_free_lower_buf;
642 if (rc > 0) {
643 ecryptfs_printk(KERN_DEBUG, "Copying [%d] bytes "
644 "to userspace: [%*s]\n", rc,
645 decoded_name);
646 if (copy_to_user(buf, decoded_name, rc))
647 rc = -EFAULT;
648 } 684 }
649 kfree(decoded_name); 685 rc = copy_to_user(buf, plaintext_name, plaintext_name_size);
650 fsstack_copy_attr_atime(dentry->d_inode, 686 if (rc)
651 lower_dentry->d_inode); 687 rc = -EFAULT;
688 else
689 rc = plaintext_name_size;
690 kfree(plaintext_name);
691 fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode);
652 } 692 }
653out_free_lower_buf: 693out_free_lower_buf:
654 kfree(lower_buf); 694 kfree(lower_buf);
@@ -670,13 +710,12 @@ static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
670 } 710 }
671 old_fs = get_fs(); 711 old_fs = get_fs();
672 set_fs(get_ds()); 712 set_fs(get_ds());
673 ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ "
674 "dentry->d_name.name = [%s]\n", dentry->d_name.name);
675 rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len); 713 rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len);
676 buf[rc] = '\0';
677 set_fs(old_fs); 714 set_fs(old_fs);
678 if (rc < 0) 715 if (rc < 0)
679 goto out_free; 716 goto out_free;
717 else
718 buf[rc] = '\0';
680 rc = 0; 719 rc = 0;
681 nd_set_link(nd, buf); 720 nd_set_link(nd, buf);
682 goto out; 721 goto out;
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 0d713b691941..ff539420cc6f 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -358,7 +358,7 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec,
358 /* verify that everything through the encrypted FEK size is present */ 358 /* verify that everything through the encrypted FEK size is present */
359 if (message_len < 4) { 359 if (message_len < 4) {
360 rc = -EIO; 360 rc = -EIO;
361 printk(KERN_ERR "%s: message_len is [%Zd]; minimum acceptable " 361 printk(KERN_ERR "%s: message_len is [%zd]; minimum acceptable "
362 "message length is [%d]\n", __func__, message_len, 4); 362 "message length is [%d]\n", __func__, message_len, 4);
363 goto out; 363 goto out;
364 } 364 }
@@ -385,13 +385,13 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec,
385 i += data_len; 385 i += data_len;
386 if (message_len < (i + key_rec->enc_key_size)) { 386 if (message_len < (i + key_rec->enc_key_size)) {
387 rc = -EIO; 387 rc = -EIO;
388 printk(KERN_ERR "%s: message_len [%Zd]; max len is [%Zd]\n", 388 printk(KERN_ERR "%s: message_len [%zd]; max len is [%zd]\n",
389 __func__, message_len, (i + key_rec->enc_key_size)); 389 __func__, message_len, (i + key_rec->enc_key_size));
390 goto out; 390 goto out;
391 } 391 }
392 if (key_rec->enc_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { 392 if (key_rec->enc_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) {
393 rc = -EIO; 393 rc = -EIO;
394 printk(KERN_ERR "%s: Encrypted key_size [%Zd] larger than " 394 printk(KERN_ERR "%s: Encrypted key_size [%zd] larger than "
395 "the maximum key size [%d]\n", __func__, 395 "the maximum key size [%d]\n", __func__,
396 key_rec->enc_key_size, 396 key_rec->enc_key_size,
397 ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES); 397 ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES);
@@ -403,6 +403,580 @@ out:
403} 403}
404 404
405static int 405static int
406ecryptfs_find_global_auth_tok_for_sig(
407 struct ecryptfs_global_auth_tok **global_auth_tok,
408 struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig)
409{
410 struct ecryptfs_global_auth_tok *walker;
411 int rc = 0;
412
413 (*global_auth_tok) = NULL;
414 mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
415 list_for_each_entry(walker,
416 &mount_crypt_stat->global_auth_tok_list,
417 mount_crypt_stat_list) {
418 if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) {
419 (*global_auth_tok) = walker;
420 goto out;
421 }
422 }
423 rc = -EINVAL;
424out:
425 mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
426 return rc;
427}
428
429/**
430 * ecryptfs_find_auth_tok_for_sig
431 * @auth_tok: Set to the matching auth_tok; NULL if not found
 432 * @mount_crypt_stat: Mount-wide crypto context holding the registered auth toks
433 * @sig: Sig of auth_tok to find
434 *
435 * For now, this function simply looks at the registered auth_tok's
436 * linked off the mount_crypt_stat, so all the auth_toks that can be
437 * used must be registered at mount time. This function could
438 * potentially try a lot harder to find auth_tok's (e.g., by calling
439 * out to ecryptfsd to dynamically retrieve an auth_tok object) so
440 * that static registration of auth_tok's will no longer be necessary.
441 *
442 * Returns zero on no error; non-zero on error
443 */
444static int
445ecryptfs_find_auth_tok_for_sig(
446 struct ecryptfs_auth_tok **auth_tok,
447 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
448 char *sig)
449{
450 struct ecryptfs_global_auth_tok *global_auth_tok;
451 int rc = 0;
452
453 (*auth_tok) = NULL;
454 if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok,
455 mount_crypt_stat, sig)) {
456 struct key *auth_tok_key;
457
458 rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok,
459 sig);
460 } else
461 (*auth_tok) = global_auth_tok->global_auth_tok;
462 return rc;
463}
464
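A caller-side sketch of the two-stage lookup this implements (mount-registered auth toks first, then the kernel keyring); the error handling is illustrative only:

	struct ecryptfs_auth_tok *auth_tok;
	int rc;

	rc = ecryptfs_find_auth_tok_for_sig(
		&auth_tok, mount_crypt_stat,
		mount_crypt_stat->global_default_fnek_sig);
	if (rc || !auth_tok)
		printk(KERN_ERR "No auth tok found for sig [%s]; rc = [%d]\n",
		       mount_crypt_stat->global_default_fnek_sig, rc);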
465/**
466 * write_tag_70_packet can gobble a lot of stack space. We stuff most
467 * of the function's parameters in a kmalloc'd struct to help reduce
468 * eCryptfs' overall stack usage.
469 */
470struct ecryptfs_write_tag_70_packet_silly_stack {
471 u8 cipher_code;
472 size_t max_packet_size;
473 size_t packet_size_len;
474 size_t block_aligned_filename_size;
475 size_t block_size;
476 size_t i;
477 size_t j;
478 size_t num_rand_bytes;
479 struct mutex *tfm_mutex;
480 char *block_aligned_filename;
481 struct ecryptfs_auth_tok *auth_tok;
482 struct scatterlist src_sg;
483 struct scatterlist dst_sg;
484 struct blkcipher_desc desc;
485 char iv[ECRYPTFS_MAX_IV_BYTES];
486 char hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
487 char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
488 struct hash_desc hash_desc;
489 struct scatterlist hash_sg;
490};
491
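The kmalloc'd-context trick above is a general pattern for keeping large scratch state off the limited kernel stack; a minimal sketch (struct name and sizes are hypothetical):

struct frob_ctx {			/* would otherwise cost ~530 stack bytes */
	char scratch[512];
	struct scatterlist sg;
	size_t len;
};

static int frobnicate(void)
{
	struct frob_ctx *c = kmalloc(sizeof(*c), GFP_KERNEL);
	int rc = 0;

	if (!c)
		return -ENOMEM;
	/* ... work with c->scratch, c->sg, c->len instead of locals ... */
	kfree(c);
	return rc;
}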
492/**
493 * write_tag_70_packet - Write encrypted filename (EFN) packet against FNEK
494 * @filename: NULL-terminated filename string
495 *
496 * This is the simplest mechanism for achieving filename encryption in
497 * eCryptfs. It encrypts the given filename with the mount-wide
498 * filename encryption key (FNEK) and stores it in a packet to @dest,
 499 * which the caller will encode and write directly into the dentry
500 * name.
501 */
502int
503ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
504 size_t *packet_size,
505 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
506 char *filename, size_t filename_size)
507{
508 struct ecryptfs_write_tag_70_packet_silly_stack *s;
509 int rc = 0;
510
511 s = kmalloc(sizeof(*s), GFP_KERNEL);
512 if (!s) {
513 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
514 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
 515 rc = -ENOMEM; goto out;
516 }
517 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
518 (*packet_size) = 0;
519 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(
520 &s->desc.tfm,
521 &s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name);
522 if (unlikely(rc)) {
523 printk(KERN_ERR "Internal error whilst attempting to get "
524 "tfm and mutex for cipher name [%s]; rc = [%d]\n",
525 mount_crypt_stat->global_default_fn_cipher_name, rc);
526 goto out;
527 }
528 mutex_lock(s->tfm_mutex);
529 s->block_size = crypto_blkcipher_blocksize(s->desc.tfm);
530 /* Plus one for the \0 separator between the random prefix
531 * and the plaintext filename */
532 s->num_rand_bytes = (ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + 1);
533 s->block_aligned_filename_size = (s->num_rand_bytes + filename_size);
534 if ((s->block_aligned_filename_size % s->block_size) != 0) {
535 s->num_rand_bytes += (s->block_size
536 - (s->block_aligned_filename_size
537 % s->block_size));
538 s->block_aligned_filename_size = (s->num_rand_bytes
539 + filename_size);
540 }
541 /* Octet 0: Tag 70 identifier
542 * Octets 1-N1: Tag 70 packet size (includes cipher identifier
543 * and block-aligned encrypted filename size)
544 * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE)
545 * Octet N2-N3: Cipher identifier (1 octet)
546 * Octets N3-N4: Block-aligned encrypted filename
547 * - Consists of a minimum number of random characters, a \0
548 * separator, and then the filename */
549 s->max_packet_size = (1 /* Tag 70 identifier */
550 + 3 /* Max Tag 70 packet size */
551 + ECRYPTFS_SIG_SIZE /* FNEK sig */
552 + 1 /* Cipher identifier */
553 + s->block_aligned_filename_size);
554 if (dest == NULL) {
555 (*packet_size) = s->max_packet_size;
556 goto out_unlock;
557 }
558 if (s->max_packet_size > (*remaining_bytes)) {
559 printk(KERN_WARNING "%s: Require [%zd] bytes to write; only "
560 "[%zd] available\n", __func__, s->max_packet_size,
561 (*remaining_bytes));
562 rc = -EINVAL;
563 goto out_unlock;
564 }
565 s->block_aligned_filename = kzalloc(s->block_aligned_filename_size,
566 GFP_KERNEL);
567 if (!s->block_aligned_filename) {
568 printk(KERN_ERR "%s: Out of kernel memory whilst attempting to "
569 "kzalloc [%zd] bytes\n", __func__,
570 s->block_aligned_filename_size);
571 rc = -ENOMEM;
572 goto out_unlock;
573 }
574 s->i = 0;
575 dest[s->i++] = ECRYPTFS_TAG_70_PACKET_TYPE;
576 rc = ecryptfs_write_packet_length(&dest[s->i],
577 (ECRYPTFS_SIG_SIZE
578 + 1 /* Cipher code */
579 + s->block_aligned_filename_size),
580 &s->packet_size_len);
581 if (rc) {
582 printk(KERN_ERR "%s: Error generating tag 70 packet "
583 "header; cannot generate packet length; rc = [%d]\n",
584 __func__, rc);
585 goto out_free_unlock;
586 }
587 s->i += s->packet_size_len;
588 ecryptfs_from_hex(&dest[s->i],
589 mount_crypt_stat->global_default_fnek_sig,
590 ECRYPTFS_SIG_SIZE);
591 s->i += ECRYPTFS_SIG_SIZE;
592 s->cipher_code = ecryptfs_code_for_cipher_string(
593 mount_crypt_stat->global_default_fn_cipher_name,
594 mount_crypt_stat->global_default_fn_cipher_key_bytes);
595 if (s->cipher_code == 0) {
596 printk(KERN_WARNING "%s: Unable to generate code for "
597 "cipher [%s] with key bytes [%zd]\n", __func__,
598 mount_crypt_stat->global_default_fn_cipher_name,
599 mount_crypt_stat->global_default_fn_cipher_key_bytes);
600 rc = -EINVAL;
601 goto out_free_unlock;
602 }
603 dest[s->i++] = s->cipher_code;
604 rc = ecryptfs_find_auth_tok_for_sig(
605 &s->auth_tok, mount_crypt_stat,
606 mount_crypt_stat->global_default_fnek_sig);
607 if (rc) {
608 printk(KERN_ERR "%s: Error attempting to find auth tok for "
609 "fnek sig [%s]; rc = [%d]\n", __func__,
610 mount_crypt_stat->global_default_fnek_sig, rc);
611 goto out_free_unlock;
612 }
613 /* TODO: Support other key modules than passphrase for
614 * filename encryption */
615 BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD);
616 sg_init_one(
617 &s->hash_sg,
618 (u8 *)s->auth_tok->token.password.session_key_encryption_key,
619 s->auth_tok->token.password.session_key_encryption_key_bytes);
620 s->hash_desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
621 s->hash_desc.tfm = crypto_alloc_hash(ECRYPTFS_TAG_70_DIGEST, 0,
622 CRYPTO_ALG_ASYNC);
623 if (IS_ERR(s->hash_desc.tfm)) {
624 rc = PTR_ERR(s->hash_desc.tfm);
625 printk(KERN_ERR "%s: Error attempting to "
626 "allocate hash crypto context; rc = [%d]\n",
627 __func__, rc);
628 goto out_free_unlock;
629 }
630 rc = crypto_hash_init(&s->hash_desc);
631 if (rc) {
632 printk(KERN_ERR
633 "%s: Error initializing crypto hash; rc = [%d]\n",
634 __func__, rc);
635 goto out_release_free_unlock;
636 }
637 rc = crypto_hash_update(
638 &s->hash_desc, &s->hash_sg,
639 s->auth_tok->token.password.session_key_encryption_key_bytes);
640 if (rc) {
641 printk(KERN_ERR
642 "%s: Error updating crypto hash; rc = [%d]\n",
643 __func__, rc);
644 goto out_release_free_unlock;
645 }
646 rc = crypto_hash_final(&s->hash_desc, s->hash);
647 if (rc) {
648 printk(KERN_ERR
649 "%s: Error finalizing crypto hash; rc = [%d]\n",
650 __func__, rc);
651 goto out_release_free_unlock;
652 }
653 for (s->j = 0; s->j < (s->num_rand_bytes - 1); s->j++) {
654 s->block_aligned_filename[s->j] =
655 s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)];
656 if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)
657 == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) {
658 sg_init_one(&s->hash_sg, (u8 *)s->hash,
659 ECRYPTFS_TAG_70_DIGEST_SIZE);
660 rc = crypto_hash_init(&s->hash_desc);
661 if (rc) {
662 printk(KERN_ERR
663 "%s: Error initializing crypto hash; "
664 "rc = [%d]\n", __func__, rc);
665 goto out_release_free_unlock;
666 }
667 rc = crypto_hash_update(&s->hash_desc, &s->hash_sg,
668 ECRYPTFS_TAG_70_DIGEST_SIZE);
669 if (rc) {
670 printk(KERN_ERR
671 "%s: Error updating crypto hash; "
672 "rc = [%d]\n", __func__, rc);
673 goto out_release_free_unlock;
674 }
675 rc = crypto_hash_final(&s->hash_desc, s->tmp_hash);
676 if (rc) {
677 printk(KERN_ERR
678 "%s: Error finalizing crypto hash; "
679 "rc = [%d]\n", __func__, rc);
680 goto out_release_free_unlock;
681 }
682 memcpy(s->hash, s->tmp_hash,
683 ECRYPTFS_TAG_70_DIGEST_SIZE);
684 }
685 if (s->block_aligned_filename[s->j] == '\0')
686 s->block_aligned_filename[s->j] = ECRYPTFS_NON_NULL;
687 }
688 memcpy(&s->block_aligned_filename[s->num_rand_bytes], filename,
689 filename_size);
690 rc = virt_to_scatterlist(s->block_aligned_filename,
691 s->block_aligned_filename_size, &s->src_sg, 1);
692 if (rc != 1) {
693 printk(KERN_ERR "%s: Internal error whilst attempting to "
694 "convert filename memory to scatterlist; "
695 "expected rc = 1; got rc = [%d]. "
696 "block_aligned_filename_size = [%zd]\n", __func__, rc,
697 s->block_aligned_filename_size);
698 goto out_release_free_unlock;
699 }
700 rc = virt_to_scatterlist(&dest[s->i], s->block_aligned_filename_size,
701 &s->dst_sg, 1);
702 if (rc != 1) {
703 printk(KERN_ERR "%s: Internal error whilst attempting to "
704 "convert encrypted filename memory to scatterlist; "
705 "expected rc = 1; got rc = [%d]. "
706 "block_aligned_filename_size = [%zd]\n", __func__, rc,
707 s->block_aligned_filename_size);
708 goto out_release_free_unlock;
709 }
710 /* The characters in the first block effectively do the job
711 * of the IV here, so we just use 0's for the IV. Note the
712 * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES
713 * >= ECRYPTFS_MAX_IV_BYTES. */
714 memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
715 s->desc.info = s->iv;
716 rc = crypto_blkcipher_setkey(
717 s->desc.tfm,
718 s->auth_tok->token.password.session_key_encryption_key,
719 mount_crypt_stat->global_default_fn_cipher_key_bytes);
720 if (rc < 0) {
721 printk(KERN_ERR "%s: Error setting key for crypto context; "
722 "rc = [%d]. s->auth_tok->token.password.session_key_"
723 "encryption_key = [0x%p]; mount_crypt_stat->"
724 "global_default_fn_cipher_key_bytes = [%zd]\n", __func__,
725 rc,
726 s->auth_tok->token.password.session_key_encryption_key,
727 mount_crypt_stat->global_default_fn_cipher_key_bytes);
728 goto out_release_free_unlock;
729 }
730 rc = crypto_blkcipher_encrypt_iv(&s->desc, &s->dst_sg, &s->src_sg,
731 s->block_aligned_filename_size);
732 if (rc) {
733 printk(KERN_ERR "%s: Error attempting to encrypt filename; "
734 "rc = [%d]\n", __func__, rc);
735 goto out_release_free_unlock;
736 }
737 s->i += s->block_aligned_filename_size;
738 (*packet_size) = s->i;
739 (*remaining_bytes) -= (*packet_size);
740out_release_free_unlock:
741 crypto_free_hash(s->hash_desc.tfm);
742out_free_unlock:
743 memset(s->block_aligned_filename, 0, s->block_aligned_filename_size);
744 kfree(s->block_aligned_filename);
745out_unlock:
746 mutex_unlock(s->tfm_mutex);
747out:
748 kfree(s);
749 return rc;
750}
751
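Working the size arithmetic from write_tag_70_packet by hand is useful; this standalone sketch assumes ECRYPTFS_SIG_SIZE == 8, a 16-byte cipher block, and a 16-byte minimum random prepend (the values used in this tree) and reproduces max_packet_size for a 7-byte filename:

#include <stdio.h>

int main(void)
{
	size_t filename_size = 7;	/* e.g. "foo.txt" */
	size_t block_size = 16;		/* AES block size */
	size_t sig_size = 8;		/* ECRYPTFS_SIG_SIZE */
	size_t num_rand_bytes = 16 + 1;	/* random prefix + '\0' separator */
	size_t aligned = num_rand_bytes + filename_size;	/* 24 */

	if (aligned % block_size)
		num_rand_bytes += block_size - (aligned % block_size);
	aligned = num_rand_bytes + filename_size;		/* 32 */
	/* tag byte + up to 3 length octets + FNEK sig + cipher code + name */
	printf("max_packet_size = %zu\n",
	       (size_t)(1 + 3 + sig_size + 1 + aligned));	/* prints 45 */
	return 0;
}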
752struct ecryptfs_parse_tag_70_packet_silly_stack {
753 u8 cipher_code;
754 size_t max_packet_size;
755 size_t packet_size_len;
756 size_t parsed_tag_70_packet_size;
757 size_t block_aligned_filename_size;
758 size_t block_size;
759 size_t i;
760 struct mutex *tfm_mutex;
761 char *decrypted_filename;
762 struct ecryptfs_auth_tok *auth_tok;
763 struct scatterlist src_sg;
764 struct scatterlist dst_sg;
765 struct blkcipher_desc desc;
766 char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1];
767 char iv[ECRYPTFS_MAX_IV_BYTES];
768 char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE];
769};
770
771/**
772 * parse_tag_70_packet - Parse and process FNEK-encrypted passphrase packet
773 * @filename: This function kmalloc's the memory for the filename
 774 * @filename_size: This function sets this to the length of the decrypted
 775 * filename (one byte less than the amount kmalloc'd for it)
 776 * @packet_size: This function sets this to the number of octets
777 * in the packet parsed
778 * @mount_crypt_stat: The mount-wide cryptographic context
779 * @data: The memory location containing the start of the tag 70
780 * packet
781 * @max_packet_size: The maximum legal size of the packet to be parsed
782 * from @data
783 *
784 * Returns zero on success; non-zero otherwise
785 */
786int
787ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
788 size_t *packet_size,
789 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
790 char *data, size_t max_packet_size)
791{
792 struct ecryptfs_parse_tag_70_packet_silly_stack *s;
793 int rc = 0;
794
795 (*packet_size) = 0;
796 (*filename_size) = 0;
797 (*filename) = NULL;
798 s = kmalloc(sizeof(*s), GFP_KERNEL);
799 if (!s) {
800 printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
801 "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
 802 rc = -ENOMEM; goto out;
803 }
804 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
805 if (max_packet_size < (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)) {
806 printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be "
807 "at least [%d]\n", __func__, max_packet_size,
808 (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1));
809 rc = -EINVAL;
810 goto out;
811 }
812 /* Octet 0: Tag 70 identifier
813 * Octets 1-N1: Tag 70 packet size (includes cipher identifier
814 * and block-aligned encrypted filename size)
815 * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE)
816 * Octet N2-N3: Cipher identifier (1 octet)
817 * Octets N3-N4: Block-aligned encrypted filename
 818 * - Consists of a minimum number of random characters, a \0
819 * separator, and then the filename */
820 if (data[(*packet_size)++] != ECRYPTFS_TAG_70_PACKET_TYPE) {
821 printk(KERN_WARNING "%s: Invalid packet tag [0x%.2x]; must be "
822 "tag [0x%.2x]\n", __func__,
823 data[((*packet_size) - 1)], ECRYPTFS_TAG_70_PACKET_TYPE);
824 rc = -EINVAL;
825 goto out;
826 }
827 rc = ecryptfs_parse_packet_length(&data[(*packet_size)],
828 &s->parsed_tag_70_packet_size,
829 &s->packet_size_len);
830 if (rc) {
831 printk(KERN_WARNING "%s: Error parsing packet length; "
832 "rc = [%d]\n", __func__, rc);
833 goto out;
834 }
835 s->block_aligned_filename_size = (s->parsed_tag_70_packet_size
836 - ECRYPTFS_SIG_SIZE - 1);
837 if ((1 + s->packet_size_len + s->parsed_tag_70_packet_size)
838 > max_packet_size) {
839 printk(KERN_WARNING "%s: max_packet_size is [%zd]; real packet "
840 "size is [%zd]\n", __func__, max_packet_size,
841 (1 + s->packet_size_len + 1
842 + s->block_aligned_filename_size));
843 rc = -EINVAL;
844 goto out;
845 }
846 (*packet_size) += s->packet_size_len;
847 ecryptfs_to_hex(s->fnek_sig_hex, &data[(*packet_size)],
848 ECRYPTFS_SIG_SIZE);
849 s->fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX] = '\0';
850 (*packet_size) += ECRYPTFS_SIG_SIZE;
851 s->cipher_code = data[(*packet_size)++];
852 rc = ecryptfs_cipher_code_to_string(s->cipher_string, s->cipher_code);
853 if (rc) {
854 printk(KERN_WARNING "%s: Cipher code [%d] is invalid\n",
855 __func__, s->cipher_code);
856 goto out;
857 }
858 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->desc.tfm,
859 &s->tfm_mutex,
860 s->cipher_string);
861 if (unlikely(rc)) {
862 printk(KERN_ERR "Internal error whilst attempting to get "
863 "tfm and mutex for cipher name [%s]; rc = [%d]\n",
864 s->cipher_string, rc);
865 goto out;
866 }
867 mutex_lock(s->tfm_mutex);
868 rc = virt_to_scatterlist(&data[(*packet_size)],
869 s->block_aligned_filename_size, &s->src_sg, 1);
870 if (rc != 1) {
871 printk(KERN_ERR "%s: Internal error whilst attempting to "
872 "convert encrypted filename memory to scatterlist; "
873 "expected rc = 1; got rc = [%d]. "
874 "block_aligned_filename_size = [%zd]\n", __func__, rc,
875 s->block_aligned_filename_size);
876 goto out_unlock;
877 }
878 (*packet_size) += s->block_aligned_filename_size;
879 s->decrypted_filename = kmalloc(s->block_aligned_filename_size,
880 GFP_KERNEL);
881 if (!s->decrypted_filename) {
882 printk(KERN_ERR "%s: Out of memory whilst attempting to "
883 "kmalloc [%zd] bytes\n", __func__,
884 s->block_aligned_filename_size);
885 rc = -ENOMEM;
886 goto out_unlock;
887 }
888 rc = virt_to_scatterlist(s->decrypted_filename,
889 s->block_aligned_filename_size, &s->dst_sg, 1);
890 if (rc != 1) {
891 printk(KERN_ERR "%s: Internal error whilst attempting to "
892 "convert decrypted filename memory to scatterlist; "
893 "expected rc = 1; got rc = [%d]. "
894 "block_aligned_filename_size = [%zd]\n", __func__, rc,
895 s->block_aligned_filename_size);
896 goto out_free_unlock;
897 }
898 /* The characters in the first block effectively do the job of
899 * the IV here, so we just use 0's for the IV. Note the
900 * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES
901 * >= ECRYPTFS_MAX_IV_BYTES. */
902 memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
903 s->desc.info = s->iv;
904 rc = ecryptfs_find_auth_tok_for_sig(&s->auth_tok, mount_crypt_stat,
905 s->fnek_sig_hex);
906 if (rc) {
907 printk(KERN_ERR "%s: Error attempting to find auth tok for "
908 "fnek sig [%s]; rc = [%d]\n", __func__, s->fnek_sig_hex,
909 rc);
910 goto out_free_unlock;
911 }
912 /* TODO: Support other key modules than passphrase for
913 * filename encryption */
914 BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD);
915 rc = crypto_blkcipher_setkey(
916 s->desc.tfm,
917 s->auth_tok->token.password.session_key_encryption_key,
918 mount_crypt_stat->global_default_fn_cipher_key_bytes);
919 if (rc < 0) {
920 printk(KERN_ERR "%s: Error setting key for crypto context; "
921 "rc = [%d]. s->auth_tok->token.password.session_key_"
922 "encryption_key = [0x%p]; mount_crypt_stat->"
923 "global_default_fn_cipher_key_bytes = [%zd]\n", __func__,
924 rc,
925 s->auth_tok->token.password.session_key_encryption_key,
926 mount_crypt_stat->global_default_fn_cipher_key_bytes);
927 goto out_free_unlock;
928 }
929 rc = crypto_blkcipher_decrypt_iv(&s->desc, &s->dst_sg, &s->src_sg,
930 s->block_aligned_filename_size);
931 if (rc) {
932 printk(KERN_ERR "%s: Error attempting to decrypt filename; "
933 "rc = [%d]\n", __func__, rc);
934 goto out_free_unlock;
935 }
936 s->i = 0;
 937 while (s->i < s->block_aligned_filename_size
 938        && s->decrypted_filename[s->i] != '\0')
939 s->i++;
940 if (s->i == s->block_aligned_filename_size) {
941 printk(KERN_WARNING "%s: Invalid tag 70 packet; could not "
942 "find valid separator between random characters and "
943 "the filename\n", __func__);
944 rc = -EINVAL;
945 goto out_free_unlock;
946 }
947 s->i++;
948 (*filename_size) = (s->block_aligned_filename_size - s->i);
 949 if (!((*filename_size) > 0 && (*filename_size) < PATH_MAX)) {
950 printk(KERN_WARNING "%s: Filename size is [%zd], which is "
951 "invalid\n", __func__, (*filename_size));
952 rc = -EINVAL;
953 goto out_free_unlock;
954 }
955 (*filename) = kmalloc(((*filename_size) + 1), GFP_KERNEL);
956 if (!(*filename)) {
957 printk(KERN_ERR "%s: Out of memory whilst attempting to "
958 "kmalloc [%zd] bytes\n", __func__,
959 ((*filename_size) + 1));
960 rc = -ENOMEM;
961 goto out_free_unlock;
962 }
963 memcpy((*filename), &s->decrypted_filename[s->i], (*filename_size));
964 (*filename)[(*filename_size)] = '\0';
965out_free_unlock:
966 kfree(s->decrypted_filename);
967out_unlock:
968 mutex_unlock(s->tfm_mutex);
969out:
970 if (rc) {
971 (*packet_size) = 0;
972 (*filename_size) = 0;
973 (*filename) = NULL;
974 }
975 kfree(s);
976 return rc;
977}
978
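A caller-side sketch of the parse entry point, mirroring how ecryptfs_decode_and_decrypt_filename() is expected to consume it (buffer names are illustrative):

	char *filename;
	size_t filename_size, packet_size;
	int rc;

	rc = ecryptfs_parse_tag_70_packet(&filename, &filename_size,
					  &packet_size, mount_crypt_stat,
					  data, data_size);
	if (!rc) {
		/* filename is NUL-terminated; the caller owns and must free it */
		printk(KERN_DEBUG "Decrypted [%zu]-byte filename [%s]\n",
		       filename_size, filename);
		kfree(filename);
	}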
979static int
406ecryptfs_get_auth_tok_sig(char **sig, struct ecryptfs_auth_tok *auth_tok) 980ecryptfs_get_auth_tok_sig(char **sig, struct ecryptfs_auth_tok *auth_tok)
407{ 981{
408 int rc = 0; 982 int rc = 0;
@@ -897,30 +1471,6 @@ out:
897 return rc; 1471 return rc;
898} 1472}
899 1473
900static int
901ecryptfs_find_global_auth_tok_for_sig(
902 struct ecryptfs_global_auth_tok **global_auth_tok,
903 struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig)
904{
905 struct ecryptfs_global_auth_tok *walker;
906 int rc = 0;
907
908 (*global_auth_tok) = NULL;
909 mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
910 list_for_each_entry(walker,
911 &mount_crypt_stat->global_auth_tok_list,
912 mount_crypt_stat_list) {
913 if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) {
914 (*global_auth_tok) = walker;
915 goto out;
916 }
917 }
918 rc = -EINVAL;
919out:
920 mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
921 return rc;
922}
923
924/** 1474/**
925 * ecryptfs_verify_version 1475 * ecryptfs_verify_version
926 * @version: The version number to confirm 1476 * @version: The version number to confirm
@@ -990,43 +1540,6 @@ out:
990} 1540}
991 1541
992/** 1542/**
993 * ecryptfs_find_auth_tok_for_sig
994 * @auth_tok: Set to the matching auth_tok; NULL if not found
995 * @crypt_stat: inode crypt_stat crypto context
996 * @sig: Sig of auth_tok to find
997 *
998 * For now, this function simply looks at the registered auth_tok's
999 * linked off the mount_crypt_stat, so all the auth_toks that can be
1000 * used must be registered at mount time. This function could
1001 * potentially try a lot harder to find auth_tok's (e.g., by calling
1002 * out to ecryptfsd to dynamically retrieve an auth_tok object) so
1003 * that static registration of auth_tok's will no longer be necessary.
1004 *
1005 * Returns zero on no error; non-zero on error
1006 */
1007static int
1008ecryptfs_find_auth_tok_for_sig(
1009 struct ecryptfs_auth_tok **auth_tok,
1010 struct ecryptfs_crypt_stat *crypt_stat, char *sig)
1011{
1012 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
1013 crypt_stat->mount_crypt_stat;
1014 struct ecryptfs_global_auth_tok *global_auth_tok;
1015 int rc = 0;
1016
1017 (*auth_tok) = NULL;
1018 if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok,
1019 mount_crypt_stat, sig)) {
1020 struct key *auth_tok_key;
1021
1022 rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok,
1023 sig);
1024 } else
1025 (*auth_tok) = global_auth_tok->global_auth_tok;
1026 return rc;
1027}
1028
1029/**
1030 * decrypt_passphrase_encrypted_session_key - Decrypt the session key with the given auth_tok. 1543 * decrypt_passphrase_encrypted_session_key - Decrypt the session key with the given auth_tok.
1031 * @auth_tok: The passphrase authentication token to use to encrypt the FEK 1544 * @auth_tok: The passphrase authentication token to use to encrypt the FEK
1032 * @crypt_stat: The cryptographic context 1545 * @crypt_stat: The cryptographic context
@@ -1256,7 +1769,8 @@ find_next_matching_auth_tok:
1256 rc = -EINVAL; 1769 rc = -EINVAL;
1257 goto out_wipe_list; 1770 goto out_wipe_list;
1258 } 1771 }
1259 ecryptfs_find_auth_tok_for_sig(&matching_auth_tok, crypt_stat, 1772 ecryptfs_find_auth_tok_for_sig(&matching_auth_tok,
1773 crypt_stat->mount_crypt_stat,
1260 candidate_auth_tok_sig); 1774 candidate_auth_tok_sig);
1261 if (matching_auth_tok) { 1775 if (matching_auth_tok) {
1262 found_auth_tok = 1; 1776 found_auth_tok = 1;
@@ -1336,7 +1850,9 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok,
1336 int rc; 1850 int rc;
1337 1851
1338 rc = write_tag_66_packet(auth_tok->token.private_key.signature, 1852 rc = write_tag_66_packet(auth_tok->token.private_key.signature,
1339 ecryptfs_code_for_cipher_string(crypt_stat), 1853 ecryptfs_code_for_cipher_string(
1854 crypt_stat->cipher,
1855 crypt_stat->key_size),
1340 crypt_stat, &payload, &payload_len); 1856 crypt_stat, &payload, &payload_len);
1341 if (rc) { 1857 if (rc) {
1342 ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n"); 1858 ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n");
@@ -1696,7 +2212,8 @@ encrypted_session_key_set:
1696 dest[(*packet_size)++] = 0x04; /* version 4 */ 2212 dest[(*packet_size)++] = 0x04; /* version 4 */
1697 /* TODO: Break from RFC2440 so that arbitrary ciphers can be 2213 /* TODO: Break from RFC2440 so that arbitrary ciphers can be
1698 * specified with strings */ 2214 * specified with strings */
1699 cipher_code = ecryptfs_code_for_cipher_string(crypt_stat); 2215 cipher_code = ecryptfs_code_for_cipher_string(crypt_stat->cipher,
2216 crypt_stat->key_size);
1700 if (cipher_code == 0) { 2217 if (cipher_code == 0) {
1701 ecryptfs_printk(KERN_WARNING, "Unable to generate code for " 2218 ecryptfs_printk(KERN_WARNING, "Unable to generate code for "
1702 "cipher [%s]\n", crypt_stat->cipher); 2219 "cipher [%s]\n", crypt_stat->cipher);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index fd630713c5c7..789cf2e1be1e 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -206,7 +206,9 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
206 ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher, 206 ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher,
207 ecryptfs_opt_ecryptfs_key_bytes, 207 ecryptfs_opt_ecryptfs_key_bytes,
208 ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, 208 ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata,
209 ecryptfs_opt_encrypted_view, ecryptfs_opt_err }; 209 ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig,
210 ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes,
211 ecryptfs_opt_err };
210 212
211static const match_table_t tokens = { 213static const match_table_t tokens = {
212 {ecryptfs_opt_sig, "sig=%s"}, 214 {ecryptfs_opt_sig, "sig=%s"},
@@ -217,6 +219,9 @@ static const match_table_t tokens = {
217 {ecryptfs_opt_passthrough, "ecryptfs_passthrough"}, 219 {ecryptfs_opt_passthrough, "ecryptfs_passthrough"},
218 {ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"}, 220 {ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"},
219 {ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"}, 221 {ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"},
222 {ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"},
223 {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"},
224 {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"},
220 {ecryptfs_opt_err, NULL} 225 {ecryptfs_opt_err, NULL}
221}; 226};
222 227
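Taken together, the three new tokens let filename encryption be configured entirely from the mount data string; a hypothetical example (signature values shortened for illustration):

/* e.g. the options string handed to ecryptfs_parse_options() below */
static const char example_opts[] =
	"sig=0123456789abcdef,"
	"ecryptfs_fnek_sig=0123456789abcdef,"
	"ecryptfs_fn_cipher=aes,"
	"ecryptfs_fn_key_bytes=16";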
@@ -281,8 +286,11 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
281 int rc = 0; 286 int rc = 0;
282 int sig_set = 0; 287 int sig_set = 0;
283 int cipher_name_set = 0; 288 int cipher_name_set = 0;
289 int fn_cipher_name_set = 0;
284 int cipher_key_bytes; 290 int cipher_key_bytes;
285 int cipher_key_bytes_set = 0; 291 int cipher_key_bytes_set = 0;
292 int fn_cipher_key_bytes;
293 int fn_cipher_key_bytes_set = 0;
286 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = 294 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
287 &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; 295 &ecryptfs_superblock_to_private(sb)->mount_crypt_stat;
288 substring_t args[MAX_OPT_ARGS]; 296 substring_t args[MAX_OPT_ARGS];
@@ -290,7 +298,12 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
290 char *sig_src; 298 char *sig_src;
291 char *cipher_name_dst; 299 char *cipher_name_dst;
292 char *cipher_name_src; 300 char *cipher_name_src;
301 char *fn_cipher_name_dst;
302 char *fn_cipher_name_src;
303 char *fnek_dst;
304 char *fnek_src;
293 char *cipher_key_bytes_src; 305 char *cipher_key_bytes_src;
306 char *fn_cipher_key_bytes_src;
294 307
295 if (!options) { 308 if (!options) {
296 rc = -EINVAL; 309 rc = -EINVAL;
@@ -322,10 +335,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
322 global_default_cipher_name; 335 global_default_cipher_name;
323 strncpy(cipher_name_dst, cipher_name_src, 336 strncpy(cipher_name_dst, cipher_name_src,
324 ECRYPTFS_MAX_CIPHER_NAME_SIZE); 337 ECRYPTFS_MAX_CIPHER_NAME_SIZE);
325 ecryptfs_printk(KERN_DEBUG, 338 cipher_name_dst[ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0';
326 "The mount_crypt_stat "
327 "global_default_cipher_name set to: "
328 "[%s]\n", cipher_name_dst);
329 cipher_name_set = 1; 339 cipher_name_set = 1;
330 break; 340 break;
331 case ecryptfs_opt_ecryptfs_key_bytes: 341 case ecryptfs_opt_ecryptfs_key_bytes:
@@ -335,11 +345,6 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
335 &cipher_key_bytes_src, 0); 345 &cipher_key_bytes_src, 0);
336 mount_crypt_stat->global_default_cipher_key_size = 346 mount_crypt_stat->global_default_cipher_key_size =
337 cipher_key_bytes; 347 cipher_key_bytes;
338 ecryptfs_printk(KERN_DEBUG,
339 "The mount_crypt_stat "
340 "global_default_cipher_key_size "
341 "set to: [%d]\n", mount_crypt_stat->
342 global_default_cipher_key_size);
343 cipher_key_bytes_set = 1; 348 cipher_key_bytes_set = 1;
344 break; 349 break;
345 case ecryptfs_opt_passthrough: 350 case ecryptfs_opt_passthrough:
@@ -356,11 +361,51 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
356 mount_crypt_stat->flags |= 361 mount_crypt_stat->flags |=
357 ECRYPTFS_ENCRYPTED_VIEW_ENABLED; 362 ECRYPTFS_ENCRYPTED_VIEW_ENABLED;
358 break; 363 break;
364 case ecryptfs_opt_fnek_sig:
365 fnek_src = args[0].from;
366 fnek_dst =
367 mount_crypt_stat->global_default_fnek_sig;
368 strncpy(fnek_dst, fnek_src, ECRYPTFS_SIG_SIZE_HEX);
369 mount_crypt_stat->global_default_fnek_sig[
370 ECRYPTFS_SIG_SIZE_HEX] = '\0';
371 rc = ecryptfs_add_global_auth_tok(
372 mount_crypt_stat,
373 mount_crypt_stat->global_default_fnek_sig);
374 if (rc) {
375 printk(KERN_ERR "Error attempting to register "
376 "global fnek sig [%s]; rc = [%d]\n",
377 mount_crypt_stat->global_default_fnek_sig,
378 rc);
379 goto out;
380 }
381 mount_crypt_stat->flags |=
382 (ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES
383 | ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK);
384 break;
385 case ecryptfs_opt_fn_cipher:
386 fn_cipher_name_src = args[0].from;
387 fn_cipher_name_dst =
388 mount_crypt_stat->global_default_fn_cipher_name;
389 strncpy(fn_cipher_name_dst, fn_cipher_name_src,
390 ECRYPTFS_MAX_CIPHER_NAME_SIZE);
391 mount_crypt_stat->global_default_fn_cipher_name[
392 ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0';
393 fn_cipher_name_set = 1;
394 break;
395 case ecryptfs_opt_fn_cipher_key_bytes:
396 fn_cipher_key_bytes_src = args[0].from;
397 fn_cipher_key_bytes =
398 (int)simple_strtol(fn_cipher_key_bytes_src,
399 &fn_cipher_key_bytes_src, 0);
400 mount_crypt_stat->global_default_fn_cipher_key_bytes =
401 fn_cipher_key_bytes;
402 fn_cipher_key_bytes_set = 1;
403 break;
359 case ecryptfs_opt_err: 404 case ecryptfs_opt_err:
360 default: 405 default:
361 ecryptfs_printk(KERN_WARNING, 406 printk(KERN_WARNING
362 "eCryptfs: unrecognized option '%s'\n", 407 "%s: eCryptfs: unrecognized option [%s]\n",
363 p); 408 __func__, p);
364 } 409 }
365 } 410 }
366 if (!sig_set) { 411 if (!sig_set) {
@@ -374,33 +419,60 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
374 int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); 419 int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER);
375 420
376 BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE); 421 BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE);
377
378 strcpy(mount_crypt_stat->global_default_cipher_name, 422 strcpy(mount_crypt_stat->global_default_cipher_name,
379 ECRYPTFS_DEFAULT_CIPHER); 423 ECRYPTFS_DEFAULT_CIPHER);
380 } 424 }
381 if (!cipher_key_bytes_set) { 425 if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
426 && !fn_cipher_name_set)
427 strcpy(mount_crypt_stat->global_default_fn_cipher_name,
428 mount_crypt_stat->global_default_cipher_name);
429 if (!cipher_key_bytes_set)
382 mount_crypt_stat->global_default_cipher_key_size = 0; 430 mount_crypt_stat->global_default_cipher_key_size = 0;
383 } 431 if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
432 && !fn_cipher_key_bytes_set)
433 mount_crypt_stat->global_default_fn_cipher_key_bytes =
434 mount_crypt_stat->global_default_cipher_key_size;
384 mutex_lock(&key_tfm_list_mutex); 435 mutex_lock(&key_tfm_list_mutex);
385 if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name, 436 if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name,
386 NULL)) 437 NULL)) {
387 rc = ecryptfs_add_new_key_tfm( 438 rc = ecryptfs_add_new_key_tfm(
388 NULL, mount_crypt_stat->global_default_cipher_name, 439 NULL, mount_crypt_stat->global_default_cipher_name,
389 mount_crypt_stat->global_default_cipher_key_size); 440 mount_crypt_stat->global_default_cipher_key_size);
390 mutex_unlock(&key_tfm_list_mutex); 441 if (rc) {
391 if (rc) { 442 printk(KERN_ERR "Error attempting to initialize "
392 printk(KERN_ERR "Error attempting to initialize cipher with " 443 "cipher with name = [%s] and key size = [%td]; "
393 "name = [%s] and key size = [%td]; rc = [%d]\n", 444 "rc = [%d]\n",
394 mount_crypt_stat->global_default_cipher_name, 445 mount_crypt_stat->global_default_cipher_name,
395 mount_crypt_stat->global_default_cipher_key_size, rc); 446 mount_crypt_stat->global_default_cipher_key_size,
396 rc = -EINVAL; 447 rc);
397 goto out; 448 rc = -EINVAL;
449 mutex_unlock(&key_tfm_list_mutex);
450 goto out;
451 }
398 } 452 }
453 if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
454 && !ecryptfs_tfm_exists(
455 mount_crypt_stat->global_default_fn_cipher_name, NULL)) {
456 rc = ecryptfs_add_new_key_tfm(
457 NULL, mount_crypt_stat->global_default_fn_cipher_name,
458 mount_crypt_stat->global_default_fn_cipher_key_bytes);
459 if (rc) {
460 printk(KERN_ERR "Error attempting to initialize "
461 "cipher with name = [%s] and key size = [%td]; "
462 "rc = [%d]\n",
463 mount_crypt_stat->global_default_fn_cipher_name,
464 mount_crypt_stat->global_default_fn_cipher_key_bytes,
465 rc);
466 rc = -EINVAL;
467 mutex_unlock(&key_tfm_list_mutex);
468 goto out;
469 }
470 }
471 mutex_unlock(&key_tfm_list_mutex);
399 rc = ecryptfs_init_global_auth_toks(mount_crypt_stat); 472 rc = ecryptfs_init_global_auth_toks(mount_crypt_stat);
400 if (rc) { 473 if (rc)
401 printk(KERN_WARNING "One or more global auth toks could not " 474 printk(KERN_WARNING "One or more global auth toks could not "
402 "properly register; rc = [%d]\n", rc); 475 "properly register; rc = [%d]\n", rc);
403 }
404out: 476out:
405 return rc; 477 return rc;
406} 478}
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 6913f727624d..96ef51489e01 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -193,7 +193,7 @@ ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid,
193 (*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL); 193 (*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL);
194 if (!(*daemon)) { 194 if (!(*daemon)) {
195 rc = -ENOMEM; 195 rc = -ENOMEM;
196 printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of " 196 printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of "
197 "GFP_KERNEL memory\n", __func__, sizeof(**daemon)); 197 "GFP_KERNEL memory\n", __func__, sizeof(**daemon));
198 goto out; 198 goto out;
199 } 199 }
@@ -435,7 +435,7 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
435 msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL); 435 msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL);
436 if (!msg_ctx->msg) { 436 if (!msg_ctx->msg) {
437 rc = -ENOMEM; 437 rc = -ENOMEM;
438 printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of " 438 printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of "
439 "GFP_KERNEL memory\n", __func__, msg_size); 439 "GFP_KERNEL memory\n", __func__, msg_size);
440 goto unlock; 440 goto unlock;
441 } 441 }
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index efd95a0ed1ea..a67fea655f49 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -199,7 +199,7 @@ int ecryptfs_send_miscdev(char *data, size_t data_size,
199 if (!msg_ctx->msg) { 199 if (!msg_ctx->msg) {
200 rc = -ENOMEM; 200 rc = -ENOMEM;
201 printk(KERN_ERR "%s: Out of memory whilst attempting " 201 printk(KERN_ERR "%s: Out of memory whilst attempting "
202 "to kmalloc(%Zd, GFP_KERNEL)\n", __func__, 202 "to kmalloc(%zd, GFP_KERNEL)\n", __func__,
203 (sizeof(*msg_ctx->msg) + data_size)); 203 (sizeof(*msg_ctx->msg) + data_size));
204 goto out_unlock; 204 goto out_unlock;
205 } 205 }
@@ -322,7 +322,7 @@ check_list:
322 if (count < total_length) { 322 if (count < total_length) {
323 rc = 0; 323 rc = 0;
324 printk(KERN_WARNING "%s: Only given user buffer of " 324 printk(KERN_WARNING "%s: Only given user buffer of "
325 "size [%Zd], but we need [%Zd] to read the " 325 "size [%zd], but we need [%zd] to read the "
326 "pending message\n", __func__, count, total_length); 326 "pending message\n", __func__, count, total_length);
327 goto out_unlock_msg_ctx; 327 goto out_unlock_msg_ctx;
328 } 328 }
@@ -376,7 +376,7 @@ static int ecryptfs_miscdev_response(char *data, size_t data_size,
376 376
377 if ((sizeof(*msg) + msg->data_len) != data_size) { 377 if ((sizeof(*msg) + msg->data_len) != data_size) {
378 printk(KERN_WARNING "%s: (sizeof(*msg) + msg->data_len) = " 378 printk(KERN_WARNING "%s: (sizeof(*msg) + msg->data_len) = "
379 "[%Zd]; data_size = [%Zd]. Invalid packet.\n", __func__, 379 "[%zd]; data_size = [%zd]. Invalid packet.\n", __func__,
380 (sizeof(*msg) + msg->data_len), data_size); 380 (sizeof(*msg) + msg->data_len), data_size);
381 rc = -EINVAL; 381 rc = -EINVAL;
382 goto out; 382 goto out;
@@ -421,7 +421,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
421 data = kmalloc(count, GFP_KERNEL); 421 data = kmalloc(count, GFP_KERNEL);
422 if (!data) { 422 if (!data) {
423 printk(KERN_ERR "%s: Out of memory whilst attempting to " 423 printk(KERN_ERR "%s: Out of memory whilst attempting to "
424 "kmalloc([%Zd], GFP_KERNEL)\n", __func__, count); 424 "kmalloc([%zd], GFP_KERNEL)\n", __func__, count);
425 goto out; 425 goto out;
426 } 426 }
427 rc = copy_from_user(data, buf, count); 427 rc = copy_from_user(data, buf, count);
@@ -436,8 +436,8 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
436 case ECRYPTFS_MSG_RESPONSE: 436 case ECRYPTFS_MSG_RESPONSE:
437 if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) { 437 if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) {
438 printk(KERN_WARNING "%s: Minimum acceptable packet " 438 printk(KERN_WARNING "%s: Minimum acceptable packet "
439 "size is [%Zd], but amount of data written is " 439 "size is [%zd], but amount of data written is "
440 "only [%Zd]. Discarding response packet.\n", 440 "only [%zd]. Discarding response packet.\n",
441 __func__, 441 __func__,
442 (1 + 4 + 1 + sizeof(struct ecryptfs_message)), 442 (1 + 4 + 1 + sizeof(struct ecryptfs_message)),
443 count); 443 count);
@@ -455,9 +455,9 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
455 } 455 }
456 i += packet_size_length; 456 i += packet_size_length;
457 if ((1 + 4 + packet_size_length + packet_size) != count) { 457 if ((1 + 4 + packet_size_length + packet_size) != count) {
458 printk(KERN_WARNING "%s: (1 + packet_size_length([%Zd])" 458 printk(KERN_WARNING "%s: (1 + packet_size_length([%zd])"
459 " + packet_size([%Zd]))([%Zd]) != " 459 " + packet_size([%zd]))([%zd]) != "
460 "count([%Zd]). Invalid packet format.\n", 460 "count([%zd]). Invalid packet format.\n",
461 __func__, packet_size_length, packet_size, 461 __func__, packet_size_length, packet_size,
462 (1 + packet_size_length + packet_size), count); 462 (1 + packet_size_length + packet_size), count);
463 goto out_free; 463 goto out_free;
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 04d7b3fa1ac6..46cec2b69796 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -288,7 +288,7 @@ static int ecryptfs_write_begin(struct file *file,
288 loff_t prev_page_end_size; 288 loff_t prev_page_end_size;
289 int rc = 0; 289 int rc = 0;
290 290
291 page = __grab_cache_page(mapping, index); 291 page = grab_cache_page_write_begin(mapping, index, flags);
292 if (!page) 292 if (!page)
293 return -ENOMEM; 293 return -ENOMEM;
294 *pagep = page; 294 *pagep = page;
diff --git a/fs/exec.c b/fs/exec.c
index 911dd0fd7e09..605be573fe87 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -52,17 +52,13 @@
52#include <linux/audit.h> 52#include <linux/audit.h>
53#include <linux/tracehook.h> 53#include <linux/tracehook.h>
54#include <linux/kmod.h> 54#include <linux/kmod.h>
55#include <linux/fsnotify.h>
55 56
56#include <asm/uaccess.h> 57#include <asm/uaccess.h>
57#include <asm/mmu_context.h> 58#include <asm/mmu_context.h>
58#include <asm/tlb.h> 59#include <asm/tlb.h>
59#include "internal.h" 60#include "internal.h"
60 61
61#ifdef __alpha__
62/* for /sbin/loader handling in search_binary_handler() */
63#include <linux/a.out.h>
64#endif
65
66int core_uses_pid; 62int core_uses_pid;
67char core_pattern[CORENAME_MAX_SIZE] = "core"; 63char core_pattern[CORENAME_MAX_SIZE] = "core";
68int suid_dumpable = 0; 64int suid_dumpable = 0;
@@ -128,7 +124,8 @@ asmlinkage long sys_uselib(const char __user * library)
128 if (nd.path.mnt->mnt_flags & MNT_NOEXEC) 124 if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
129 goto exit; 125 goto exit;
130 126
131 error = vfs_permission(&nd, MAY_READ | MAY_EXEC | MAY_OPEN); 127 error = inode_permission(nd.path.dentry->d_inode,
128 MAY_READ | MAY_EXEC | MAY_OPEN);
132 if (error) 129 if (error)
133 goto exit; 130 goto exit;
134 131
@@ -137,6 +134,8 @@ asmlinkage long sys_uselib(const char __user * library)
137 if (IS_ERR(file)) 134 if (IS_ERR(file))
138 goto out; 135 goto out;
139 136
137 fsnotify_open(file->f_path.dentry);
138
140 error = -ENOEXEC; 139 error = -ENOEXEC;
141 if(file->f_op) { 140 if(file->f_op) {
142 struct linux_binfmt * fmt; 141 struct linux_binfmt * fmt;
@@ -234,13 +233,13 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
234 233
235static int __bprm_mm_init(struct linux_binprm *bprm) 234static int __bprm_mm_init(struct linux_binprm *bprm)
236{ 235{
237 int err = -ENOMEM; 236 int err;
238 struct vm_area_struct *vma = NULL; 237 struct vm_area_struct *vma = NULL;
239 struct mm_struct *mm = bprm->mm; 238 struct mm_struct *mm = bprm->mm;
240 239
241 bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); 240 bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
242 if (!vma) 241 if (!vma)
243 goto err; 242 return -ENOMEM;
244 243
245 down_write(&mm->mmap_sem); 244 down_write(&mm->mmap_sem);
246 vma->vm_mm = mm; 245 vma->vm_mm = mm;
@@ -253,28 +252,20 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
253 */ 252 */
254 vma->vm_end = STACK_TOP_MAX; 253 vma->vm_end = STACK_TOP_MAX;
255 vma->vm_start = vma->vm_end - PAGE_SIZE; 254 vma->vm_start = vma->vm_end - PAGE_SIZE;
256
257 vma->vm_flags = VM_STACK_FLAGS; 255 vma->vm_flags = VM_STACK_FLAGS;
258 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 256 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
259 err = insert_vm_struct(mm, vma); 257 err = insert_vm_struct(mm, vma);
260 if (err) { 258 if (err)
261 up_write(&mm->mmap_sem);
262 goto err; 259 goto err;
263 }
264 260
265 mm->stack_vm = mm->total_vm = 1; 261 mm->stack_vm = mm->total_vm = 1;
266 up_write(&mm->mmap_sem); 262 up_write(&mm->mmap_sem);
267
268 bprm->p = vma->vm_end - sizeof(void *); 263 bprm->p = vma->vm_end - sizeof(void *);
269
270 return 0; 264 return 0;
271
272err: 265err:
273 if (vma) { 266 up_write(&mm->mmap_sem);
274 bprm->vma = NULL; 267 bprm->vma = NULL;
275 kmem_cache_free(vm_area_cachep, vma); 268 kmem_cache_free(vm_area_cachep, vma);
276 }
277
278 return err; 269 return err;
279} 270}
280 271
@@ -681,7 +672,7 @@ struct file *open_exec(const char *name)
681 if (nd.path.mnt->mnt_flags & MNT_NOEXEC) 672 if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
682 goto out_path_put; 673 goto out_path_put;
683 674
684 err = vfs_permission(&nd, MAY_EXEC | MAY_OPEN); 675 err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN);
685 if (err) 676 if (err)
686 goto out_path_put; 677 goto out_path_put;
687 678
@@ -689,6 +680,8 @@ struct file *open_exec(const char *name)
689 if (IS_ERR(file)) 680 if (IS_ERR(file))
690 return file; 681 return file;
691 682
683 fsnotify_open(file->f_path.dentry);
684
692 err = deny_write_access(file); 685 err = deny_write_access(file);
693 if (err) { 686 if (err) {
694 fput(file); 687 fput(file);
@@ -774,7 +767,6 @@ static int de_thread(struct task_struct *tsk)
774 struct signal_struct *sig = tsk->signal; 767 struct signal_struct *sig = tsk->signal;
775 struct sighand_struct *oldsighand = tsk->sighand; 768 struct sighand_struct *oldsighand = tsk->sighand;
776 spinlock_t *lock = &oldsighand->siglock; 769 spinlock_t *lock = &oldsighand->siglock;
777 struct task_struct *leader = NULL;
778 int count; 770 int count;
779 771
780 if (thread_group_empty(tsk)) 772 if (thread_group_empty(tsk))
@@ -812,7 +804,7 @@ static int de_thread(struct task_struct *tsk)
812 * and to assume its PID: 804 * and to assume its PID:
813 */ 805 */
814 if (!thread_group_leader(tsk)) { 806 if (!thread_group_leader(tsk)) {
815 leader = tsk->group_leader; 807 struct task_struct *leader = tsk->group_leader;
816 808
817 sig->notify_count = -1; /* for exit_notify() */ 809 sig->notify_count = -1; /* for exit_notify() */
818 for (;;) { 810 for (;;) {
@@ -864,8 +856,9 @@ static int de_thread(struct task_struct *tsk)
864 856
865 BUG_ON(leader->exit_state != EXIT_ZOMBIE); 857 BUG_ON(leader->exit_state != EXIT_ZOMBIE);
866 leader->exit_state = EXIT_DEAD; 858 leader->exit_state = EXIT_DEAD;
867
868 write_unlock_irq(&tasklist_lock); 859 write_unlock_irq(&tasklist_lock);
860
861 release_task(leader);
869 } 862 }
870 863
871 sig->group_exit_task = NULL; 864 sig->group_exit_task = NULL;
@@ -874,8 +867,6 @@ static int de_thread(struct task_struct *tsk)
874no_thread_group: 867no_thread_group:
875 exit_itimers(sig); 868 exit_itimers(sig);
876 flush_itimer_signals(); 869 flush_itimer_signals();
877 if (leader)
878 release_task(leader);
879 870
880 if (atomic_read(&oldsighand->count) != 1) { 871 if (atomic_read(&oldsighand->count) != 1) {
881 struct sighand_struct *newsighand; 872 struct sighand_struct *newsighand;
@@ -1181,41 +1172,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
1181 unsigned int depth = bprm->recursion_depth; 1172 unsigned int depth = bprm->recursion_depth;
1182 int try,retval; 1173 int try,retval;
1183 struct linux_binfmt *fmt; 1174 struct linux_binfmt *fmt;
1184#ifdef __alpha__
1185 /* handle /sbin/loader.. */
1186 {
1187 struct exec * eh = (struct exec *) bprm->buf;
1188
1189 if (!bprm->loader && eh->fh.f_magic == 0x183 &&
1190 (eh->fh.f_flags & 0x3000) == 0x3000)
1191 {
1192 struct file * file;
1193 unsigned long loader;
1194 1175
1195 allow_write_access(bprm->file);
1196 fput(bprm->file);
1197 bprm->file = NULL;
1198
1199 loader = bprm->vma->vm_end - sizeof(void *);
1200
1201 file = open_exec("/sbin/loader");
1202 retval = PTR_ERR(file);
1203 if (IS_ERR(file))
1204 return retval;
1205
1206 /* Remember if the application is TASO. */
1207 bprm->taso = eh->ah.entry < 0x100000000UL;
1208
1209 bprm->file = file;
1210 bprm->loader = loader;
1211 retval = prepare_binprm(bprm);
1212 if (retval<0)
1213 return retval;
1214 /* should call search_binary_handler recursively here,
1215 but it does not matter */
1216 }
1217 }
1218#endif
1219 retval = security_bprm_check(bprm); 1176 retval = security_bprm_check(bprm);
1220 if (retval) 1177 if (retval)
1221 return retval; 1178 return retval;
@@ -1737,7 +1694,7 @@ int get_dumpable(struct mm_struct *mm)
1737 return (ret >= 2) ? 2 : ret; 1694 return (ret >= 2) ? 2 : ret;
1738} 1695}
1739 1696
1740int do_coredump(long signr, int exit_code, struct pt_regs * regs) 1697void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1741{ 1698{
1742 struct core_state core_state; 1699 struct core_state core_state;
1743 char corename[CORENAME_MAX_SIZE + 1]; 1700 char corename[CORENAME_MAX_SIZE + 1];
@@ -1821,6 +1778,11 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
1821 1778
1822 if (ispipe) { 1779 if (ispipe) {
1823 helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); 1780 helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc);
1781 if (!helper_argv) {
1782 printk(KERN_WARNING "%s failed to allocate memory\n",
1783 __func__);
1784 goto fail_unlock;
1785 }
1824 /* Terminate the string before the first option */ 1786 /* Terminate the string before the first option */
1825 delimit = strchr(corename, ' '); 1787 delimit = strchr(corename, ' ');
1826 if (delimit) 1788 if (delimit)
@@ -1888,5 +1850,5 @@ fail_unlock:
1888 put_cred(cred); 1850 put_cred(cred);
1889 coredump_finish(mm); 1851 coredump_finish(mm);
1890fail: 1852fail:
1891 return retval; 1853 return;
1892} 1854}
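The do_coredump() hunk adds a NULL check on the argv_split() result before the pipe helper is used. A standalone sketch of the same guard, with strdup/strtok standing in for the kernel's argv_split() (assumed names, not the kernel code):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
        char *corename = strdup("|/usr/bin/coredump-helper --fast");

        if (!corename)
                return 1;
        char *helper = strtok(corename + 1, " ");   /* skip the '|' */
        if (!helper) {                              /* mirrors the new NULL check */
                fprintf(stderr, "%s failed to parse helper\n", __func__);
                free(corename);
                return 1;
        }
        printf("helper = %s\n", helper);
        free(corename);
        return 0;
}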
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 8d0add625870..66321a877e74 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -565,12 +565,8 @@ got:
565 inode->i_blocks = 0; 565 inode->i_blocks = 0;
566 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 566 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
567 memset(ei->i_data, 0, sizeof(ei->i_data)); 567 memset(ei->i_data, 0, sizeof(ei->i_data));
568 ei->i_flags = EXT2_I(dir)->i_flags & ~EXT2_BTREE_FL; 568 ei->i_flags =
569 if (S_ISLNK(mode)) 569 ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED);
570 ei->i_flags &= ~(EXT2_IMMUTABLE_FL|EXT2_APPEND_FL);
571 /* dirsync is only applied to directories */
572 if (!S_ISDIR(mode))
573 ei->i_flags &= ~EXT2_DIRSYNC_FL;
574 ei->i_faddr = 0; 570 ei->i_faddr = 0;
575 ei->i_frag_no = 0; 571 ei->i_frag_no = 0;
576 ei->i_frag_size = 0; 572 ei->i_frag_size = 0;
@@ -585,7 +581,10 @@ got:
585 spin_lock(&sbi->s_next_gen_lock); 581 spin_lock(&sbi->s_next_gen_lock);
586 inode->i_generation = sbi->s_next_generation++; 582 inode->i_generation = sbi->s_next_generation++;
587 spin_unlock(&sbi->s_next_gen_lock); 583 spin_unlock(&sbi->s_next_gen_lock);
588 insert_inode_hash(inode); 584 if (insert_inode_locked(inode) < 0) {
585 err = -EINVAL;
586 goto fail_drop;
587 }
589 588
590 if (DQUOT_ALLOC_INODE(inode)) { 589 if (DQUOT_ALLOC_INODE(inode)) {
591 err = -EDQUOT; 590 err = -EDQUOT;
@@ -612,6 +611,7 @@ fail_drop:
612 DQUOT_DROP(inode); 611 DQUOT_DROP(inode);
613 inode->i_flags |= S_NOQUOTA; 612 inode->i_flags |= S_NOQUOTA;
614 inode->i_nlink = 0; 613 inode->i_nlink = 0;
614 unlock_new_inode(inode);
615 iput(inode); 615 iput(inode);
616 return ERR_PTR(err); 616 return ERR_PTR(err);
617 617
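Both ext2 hunks above replace open-coded flag stripping with ext2_mask_flags(), which filters inherited flags by file type. A hedged sketch of what such a helper centralizes; the flag values and exact masks here are illustrative:

#include <stdio.h>
#include <sys/stat.h>

#define FL_DIRSYNC   0x00010000
#define FL_IMMUTABLE 0x00000010
#define FL_APPEND    0x00000020

static unsigned int mask_flags(mode_t mode, unsigned int flags)
{
        if (S_ISDIR(mode))
                return flags;                   /* directories keep everything */
        if (S_ISREG(mode))
                return flags & ~FL_DIRSYNC;     /* dirsync is dir-only */
        /* symlinks, devices, ...: also drop immutable/append */
        return flags & ~(FL_DIRSYNC | FL_IMMUTABLE | FL_APPEND);
}

int main(void)
{
        unsigned int dirflags = FL_DIRSYNC | FL_IMMUTABLE;

        printf("reg: %#x link: %#x\n",
               mask_flags(S_IFREG, dirflags), mask_flags(S_IFLNK, dirflags));
        return 0;
}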
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 7658b33e2653..23fff2f87783 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -32,6 +32,7 @@
32#include <linux/buffer_head.h> 32#include <linux/buffer_head.h>
33#include <linux/mpage.h> 33#include <linux/mpage.h>
34#include <linux/fiemap.h> 34#include <linux/fiemap.h>
35#include <linux/namei.h>
35#include "ext2.h" 36#include "ext2.h"
36#include "acl.h" 37#include "acl.h"
37#include "xip.h" 38#include "xip.h"
@@ -497,8 +498,6 @@ static int ext2_alloc_branch(struct inode *inode,
497 * ext2_splice_branch - splice the allocated branch onto inode. 498 * ext2_splice_branch - splice the allocated branch onto inode.
498 * @inode: owner 499 * @inode: owner
499 * @block: (logical) number of block we are adding 500 * @block: (logical) number of block we are adding
500 * @chain: chain of indirect blocks (with a missing link - see
501 * ext2_alloc_branch)
502 * @where: location of missing link 501 * @where: location of missing link
503 * @num: number of indirect blocks we are adding 502 * @num: number of indirect blocks we are adding
504 * @blks: number of direct blocks we are adding 503 * @blks: number of direct blocks we are adding
@@ -1286,9 +1285,11 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
1286 else 1285 else
1287 inode->i_mapping->a_ops = &ext2_aops; 1286 inode->i_mapping->a_ops = &ext2_aops;
1288 } else if (S_ISLNK(inode->i_mode)) { 1287 } else if (S_ISLNK(inode->i_mode)) {
1289 if (ext2_inode_is_fast_symlink(inode)) 1288 if (ext2_inode_is_fast_symlink(inode)) {
1290 inode->i_op = &ext2_fast_symlink_inode_operations; 1289 inode->i_op = &ext2_fast_symlink_inode_operations;
1291 else { 1290 nd_terminate_link(ei->i_data, inode->i_size,
1291 sizeof(ei->i_data) - 1);
1292 } else {
1292 inode->i_op = &ext2_symlink_inode_operations; 1293 inode->i_op = &ext2_symlink_inode_operations;
1293 if (test_opt(inode->i_sb, NOBH)) 1294 if (test_opt(inode->i_sb, NOBH))
1294 inode->i_mapping->a_ops = &ext2_nobh_aops; 1295 inode->i_mapping->a_ops = &ext2_nobh_aops;
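nd_terminate_link() exists because a fast symlink's target is stored raw in the inode's i_data and is not guaranteed to be NUL-terminated on disk. A minimal sketch of the termination step (sizes illustrative):

#include <stdio.h>
#include <string.h>

static void terminate_link(char *name, size_t len, size_t maxlen)
{
        /* clamp to the buffer and cut the string at i_size */
        name[len > maxlen ? maxlen : len] = '\0';
}

int main(void)
{
        char i_data[60];

        /* 28 bytes, no NUL: simulates stale bytes after the target */
        memcpy(i_data, "/etc/alternatives/editorJUNK", 28);
        terminate_link(i_data, 24, sizeof(i_data) - 1);   /* i_size = 24 */
        printf("target = %s\n", i_data);
        return 0;
}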
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index de876fa793e1..7cb4badef927 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -50,8 +50,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
50 goto setflags_out; 50 goto setflags_out;
51 } 51 }
52 52
53 if (!S_ISDIR(inode->i_mode)) 53 flags = ext2_mask_flags(inode->i_mode, flags);
54 flags &= ~EXT2_DIRSYNC_FL;
55 54
56 mutex_lock(&inode->i_mutex); 55 mutex_lock(&inode->i_mutex);
57 /* Is it quota file? Do not allow user to mess with it */ 56 /* Is it quota file? Do not allow user to mess with it */
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 2a747252ec12..90ea17998a73 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -41,9 +41,11 @@ static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode)
41 int err = ext2_add_link(dentry, inode); 41 int err = ext2_add_link(dentry, inode);
42 if (!err) { 42 if (!err) {
43 d_instantiate(dentry, inode); 43 d_instantiate(dentry, inode);
44 unlock_new_inode(inode);
44 return 0; 45 return 0;
45 } 46 }
46 inode_dec_link_count(inode); 47 inode_dec_link_count(inode);
48 unlock_new_inode(inode);
47 iput(inode); 49 iput(inode);
48 return err; 50 return err;
49} 51}
@@ -170,6 +172,7 @@ out:
170 172
171out_fail: 173out_fail:
172 inode_dec_link_count(inode); 174 inode_dec_link_count(inode);
175 unlock_new_inode(inode);
173 iput (inode); 176 iput (inode);
174 goto out; 177 goto out;
175} 178}
@@ -178,6 +181,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
178 struct dentry *dentry) 181 struct dentry *dentry)
179{ 182{
180 struct inode *inode = old_dentry->d_inode; 183 struct inode *inode = old_dentry->d_inode;
184 int err;
181 185
182 if (inode->i_nlink >= EXT2_LINK_MAX) 186 if (inode->i_nlink >= EXT2_LINK_MAX)
183 return -EMLINK; 187 return -EMLINK;
@@ -186,7 +190,14 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
186 inode_inc_link_count(inode); 190 inode_inc_link_count(inode);
187 atomic_inc(&inode->i_count); 191 atomic_inc(&inode->i_count);
188 192
189 return ext2_add_nondir(dentry, inode); 193 err = ext2_add_link(dentry, inode);
194 if (!err) {
195 d_instantiate(dentry, inode);
196 return 0;
197 }
198 inode_dec_link_count(inode);
199 iput(inode);
200 return err;
190} 201}
191 202
192static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode) 203static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
@@ -222,12 +233,14 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
222 goto out_fail; 233 goto out_fail;
223 234
224 d_instantiate(dentry, inode); 235 d_instantiate(dentry, inode);
236 unlock_new_inode(inode);
225out: 237out:
226 return err; 238 return err;
227 239
228out_fail: 240out_fail:
229 inode_dec_link_count(inode); 241 inode_dec_link_count(inode);
230 inode_dec_link_count(inode); 242 inode_dec_link_count(inode);
243 unlock_new_inode(inode);
231 iput(inode); 244 iput(inode);
232out_dir: 245out_dir:
233 inode_dec_link_count(dir); 246 inode_dec_link_count(dir);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 647cd888ac87..da8bdeaa2e6d 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -132,6 +132,7 @@ static void ext2_put_super (struct super_block * sb)
132 percpu_counter_destroy(&sbi->s_dirs_counter); 132 percpu_counter_destroy(&sbi->s_dirs_counter);
133 brelse (sbi->s_sbh); 133 brelse (sbi->s_sbh);
134 sb->s_fs_info = NULL; 134 sb->s_fs_info = NULL;
135 kfree(sbi->s_blockgroup_lock);
135 kfree(sbi); 136 kfree(sbi);
136 137
137 return; 138 return;
@@ -756,6 +757,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
756 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 757 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
757 if (!sbi) 758 if (!sbi)
758 return -ENOMEM; 759 return -ENOMEM;
760
761 sbi->s_blockgroup_lock =
762 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
763 if (!sbi->s_blockgroup_lock) {
764 kfree(sbi);
765 return -ENOMEM;
766 }
759 sb->s_fs_info = sbi; 767 sb->s_fs_info = sbi;
760 sbi->s_sb_block = sb_block; 768 sbi->s_sb_block = sb_block;
761 769
@@ -983,7 +991,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
983 printk ("EXT2-fs: not enough memory\n"); 991 printk ("EXT2-fs: not enough memory\n");
984 goto failed_mount; 992 goto failed_mount;
985 } 993 }
986 bgl_lock_init(&sbi->s_blockgroup_lock); 994 bgl_lock_init(sbi->s_blockgroup_lock);
987 sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL); 995 sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL);
988 if (!sbi->s_debts) { 996 if (!sbi->s_debts) {
989 printk ("EXT2-fs: not enough memory\n"); 997 printk ("EXT2-fs: not enough memory\n");
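The super.c hunks move s_blockgroup_lock out of the sbi structure into its own allocation (it is an array of spinlocks and can be large), so ext2_fill_super() must unwind the partial allocation on failure and ext2_put_super() must free it. A user-space sketch of that alloc/free pairing, with illustrative types:

#include <stdio.h>
#include <stdlib.h>

struct biglock { long slots[128]; };    /* stands in for blockgroup_lock */
struct sb_info { struct biglock *s_blockgroup_lock; /* ... */ };

static struct sb_info *fill_super(void)
{
        struct sb_info *sbi = calloc(1, sizeof(*sbi));

        if (!sbi)
                return NULL;
        sbi->s_blockgroup_lock = calloc(1, sizeof(struct biglock));
        if (!sbi->s_blockgroup_lock) {
                free(sbi);                      /* mirrors kfree(sbi) */
                return NULL;
        }
        return sbi;
}

static void put_super(struct sb_info *sbi)
{
        free(sbi->s_blockgroup_lock);           /* mirrors the new kfree() */
        free(sbi);
}

int main(void)
{
        struct sb_info *sbi = fill_super();

        printf("mounted: %s\n", sbi ? "yes" : "no");
        if (sbi)
                put_super(sbi);
        return 0;
}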
diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c
index c30e149fbd2e..7d215b4d4f2e 100644
--- a/fs/ext3/hash.c
+++ b/fs/ext3/hash.c
@@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
35 35
36 36
37/* The old legacy hash */ 37/* The old legacy hash */
38static __u32 dx_hack_hash (const char *name, int len) 38static __u32 dx_hack_hash_unsigned(const char *name, int len)
39{ 39{
40 __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; 40 __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
41 const unsigned char *ucp = (const unsigned char *) name;
42
43 while (len--) {
44 hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
45
46 if (hash & 0x80000000)
47 hash -= 0x7fffffff;
48 hash1 = hash0;
49 hash0 = hash;
50 }
51 return hash0 << 1;
52}
53
54static __u32 dx_hack_hash_signed(const char *name, int len)
55{
56 __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
57 const signed char *scp = (const signed char *) name;
58
41 while (len--) { 59 while (len--) {
42 __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); 60 hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
43 61
44 if (hash & 0x80000000) hash -= 0x7fffffff; 62 if (hash & 0x80000000)
63 hash -= 0x7fffffff;
45 hash1 = hash0; 64 hash1 = hash0;
46 hash0 = hash; 65 hash0 = hash;
47 } 66 }
48 return (hash0 << 1); 67 return hash0 << 1;
49} 68}
50 69
51static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) 70static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
52{ 71{
53 __u32 pad, val; 72 __u32 pad, val;
54 int i; 73 int i;
74 const signed char *scp = (const signed char *) msg;
75
76 pad = (__u32)len | ((__u32)len << 8);
77 pad |= pad << 16;
78
79 val = pad;
80 if (len > num*4)
81 len = num * 4;
82 for (i = 0; i < len; i++) {
83 if ((i % 4) == 0)
84 val = pad;
85 val = ((int) scp[i]) + (val << 8);
86 if ((i % 4) == 3) {
87 *buf++ = val;
88 val = pad;
89 num--;
90 }
91 }
92 if (--num >= 0)
93 *buf++ = val;
94 while (--num >= 0)
95 *buf++ = pad;
96}
97
98static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
99{
100 __u32 pad, val;
101 int i;
102 const unsigned char *ucp = (const unsigned char *) msg;
55 103
56 pad = (__u32)len | ((__u32)len << 8); 104 pad = (__u32)len | ((__u32)len << 8);
57 pad |= pad << 16; 105 pad |= pad << 16;
@@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
62 for (i=0; i < len; i++) { 110 for (i=0; i < len; i++) {
63 if ((i % 4) == 0) 111 if ((i % 4) == 0)
64 val = pad; 112 val = pad;
65 val = msg[i] + (val << 8); 113 val = ((int) ucp[i]) + (val << 8);
66 if ((i % 4) == 3) { 114 if ((i % 4) == 3) {
67 *buf++ = val; 115 *buf++ = val;
68 val = pad; 116 val = pad;
@@ -95,6 +143,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
95 const char *p; 143 const char *p;
96 int i; 144 int i;
97 __u32 in[8], buf[4]; 145 __u32 in[8], buf[4];
146 void (*str2hashbuf)(const char *, int, __u32 *, int) =
147 str2hashbuf_signed;
98 148
99 /* Initialize the default seed for the hash checksum functions */ 149 /* Initialize the default seed for the hash checksum functions */
100 buf[0] = 0x67452301; 150 buf[0] = 0x67452301;
@@ -113,13 +163,18 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
113 } 163 }
114 164
115 switch (hinfo->hash_version) { 165 switch (hinfo->hash_version) {
166 case DX_HASH_LEGACY_UNSIGNED:
167 hash = dx_hack_hash_unsigned(name, len);
168 break;
116 case DX_HASH_LEGACY: 169 case DX_HASH_LEGACY:
117 hash = dx_hack_hash(name, len); 170 hash = dx_hack_hash_signed(name, len);
118 break; 171 break;
172 case DX_HASH_HALF_MD4_UNSIGNED:
173 str2hashbuf = str2hashbuf_unsigned;
119 case DX_HASH_HALF_MD4: 174 case DX_HASH_HALF_MD4:
120 p = name; 175 p = name;
121 while (len > 0) { 176 while (len > 0) {
122 str2hashbuf(p, len, in, 8); 177 (*str2hashbuf)(p, len, in, 8);
123 half_md4_transform(buf, in); 178 half_md4_transform(buf, in);
124 len -= 32; 179 len -= 32;
125 p += 32; 180 p += 32;
@@ -127,10 +182,12 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
127 minor_hash = buf[2]; 182 minor_hash = buf[2];
128 hash = buf[1]; 183 hash = buf[1];
129 break; 184 break;
185 case DX_HASH_TEA_UNSIGNED:
186 str2hashbuf = str2hashbuf_unsigned;
130 case DX_HASH_TEA: 187 case DX_HASH_TEA:
131 p = name; 188 p = name;
132 while (len > 0) { 189 while (len > 0) {
133 str2hashbuf(p, len, in, 4); 190 (*str2hashbuf)(p, len, in, 4);
134 TEA_transform(buf, in); 191 TEA_transform(buf, in);
135 len -= 16; 192 len -= 16;
136 p += 16; 193 p += 16;
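The hash.c change splits each legacy hash into signed and unsigned variants because plain `char` signedness is architecture-dependent: name bytes >= 0x80 promote to different int values, changing the hash and therefore where htree directory entries land. A standalone sketch showing the two results diverging on a non-ASCII name:

#include <stdio.h>

static unsigned int dx_hack_hash(const char *name, int len, int is_signed)
{
        unsigned int hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;

        while (len--) {
                int c = is_signed ? (int)(signed char)*name
                                  : (int)(unsigned char)*name;
                name++;
                hash = hash1 + (hash0 ^ (c * 7152373));
                if (hash & 0x80000000)
                        hash -= 0x7fffffff;
                hash1 = hash0;
                hash0 = hash;
        }
        return hash0 << 1;
}

int main(void)
{
        const char name[] = "caf\xc3\xa9";      /* UTF-8 "café": bytes >= 0x80 */

        printf("signed:   %#010x\n", dx_hack_hash(name, 5, 1));
        printf("unsigned: %#010x\n", dx_hack_hash(name, 5, 0));
        return 0;
}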
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 490bd0ed7896..8de6c720e510 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -559,12 +559,8 @@ got:
559 ei->i_dir_start_lookup = 0; 559 ei->i_dir_start_lookup = 0;
560 ei->i_disksize = 0; 560 ei->i_disksize = 0;
561 561
562 ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; 562 ei->i_flags =
563 if (S_ISLNK(mode)) 563 ext3_mask_flags(mode, EXT3_I(dir)->i_flags & EXT3_FL_INHERITED);
564 ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL);
565 /* dirsync only applies to directories */
566 if (!S_ISDIR(mode))
567 ei->i_flags &= ~EXT3_DIRSYNC_FL;
568#ifdef EXT3_FRAGMENTS 564#ifdef EXT3_FRAGMENTS
569 ei->i_faddr = 0; 565 ei->i_faddr = 0;
570 ei->i_frag_no = 0; 566 ei->i_frag_no = 0;
@@ -579,7 +575,10 @@ got:
579 ext3_set_inode_flags(inode); 575 ext3_set_inode_flags(inode);
580 if (IS_DIRSYNC(inode)) 576 if (IS_DIRSYNC(inode))
581 handle->h_sync = 1; 577 handle->h_sync = 1;
582 insert_inode_hash(inode); 578 if (insert_inode_locked(inode) < 0) {
579 err = -EINVAL;
580 goto fail_drop;
581 }
583 spin_lock(&sbi->s_next_gen_lock); 582 spin_lock(&sbi->s_next_gen_lock);
584 inode->i_generation = sbi->s_next_generation++; 583 inode->i_generation = sbi->s_next_generation++;
585 spin_unlock(&sbi->s_next_gen_lock); 584 spin_unlock(&sbi->s_next_gen_lock);
@@ -627,6 +626,7 @@ fail_drop:
627 DQUOT_DROP(inode); 626 DQUOT_DROP(inode);
628 inode->i_flags |= S_NOQUOTA; 627 inode->i_flags |= S_NOQUOTA;
629 inode->i_nlink = 0; 628 inode->i_nlink = 0;
629 unlock_new_inode(inode);
630 iput(inode); 630 iput(inode);
631 brelse(bitmap_bh); 631 brelse(bitmap_bh);
632 return ERR_PTR(err); 632 return ERR_PTR(err);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index f8424ad89971..5fa453b49a64 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -37,6 +37,7 @@
37#include <linux/uio.h> 37#include <linux/uio.h>
38#include <linux/bio.h> 38#include <linux/bio.h>
39#include <linux/fiemap.h> 39#include <linux/fiemap.h>
40#include <linux/namei.h>
40#include "xattr.h" 41#include "xattr.h"
41#include "acl.h" 42#include "acl.h"
42 43
@@ -1160,7 +1161,7 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping,
1160 to = from + len; 1161 to = from + len;
1161 1162
1162retry: 1163retry:
1163 page = __grab_cache_page(mapping, index); 1164 page = grab_cache_page_write_begin(mapping, index, flags);
1164 if (!page) 1165 if (!page)
1165 return -ENOMEM; 1166 return -ENOMEM;
1166 *pagep = page; 1167 *pagep = page;
@@ -2817,9 +2818,11 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
2817 inode->i_op = &ext3_dir_inode_operations; 2818 inode->i_op = &ext3_dir_inode_operations;
2818 inode->i_fop = &ext3_dir_operations; 2819 inode->i_fop = &ext3_dir_operations;
2819 } else if (S_ISLNK(inode->i_mode)) { 2820 } else if (S_ISLNK(inode->i_mode)) {
2820 if (ext3_inode_is_fast_symlink(inode)) 2821 if (ext3_inode_is_fast_symlink(inode)) {
2821 inode->i_op = &ext3_fast_symlink_inode_operations; 2822 inode->i_op = &ext3_fast_symlink_inode_operations;
2822 else { 2823 nd_terminate_link(ei->i_data, inode->i_size,
2824 sizeof(ei->i_data) - 1);
2825 } else {
2823 inode->i_op = &ext3_symlink_inode_operations; 2826 inode->i_op = &ext3_symlink_inode_operations;
2824 ext3_set_aops(inode); 2827 ext3_set_aops(inode);
2825 } 2828 }
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index b7394d05ee8e..5e86ce9a86e0 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -53,8 +53,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
53 goto flags_out; 53 goto flags_out;
54 } 54 }
55 55
56 if (!S_ISDIR(inode->i_mode)) 56 flags = ext3_mask_flags(inode->i_mode, flags);
57 flags &= ~EXT3_DIRSYNC_FL;
58 57
59 mutex_lock(&inode->i_mutex); 58 mutex_lock(&inode->i_mutex);
60 /* Is it quota file? Do not allow user to mess with it */ 59 /* Is it quota file? Do not allow user to mess with it */
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 3e5edc92aa0b..69a3d19ca9fd 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -74,10 +74,6 @@ static struct buffer_head *ext3_append(handle_t *handle,
74#define assert(test) J_ASSERT(test) 74#define assert(test) J_ASSERT(test)
75#endif 75#endif
76 76
77#ifndef swap
78#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
79#endif
80
81#ifdef DX_DEBUG 77#ifdef DX_DEBUG
82#define dxtrace(command) command 78#define dxtrace(command) command
83#else 79#else
@@ -368,6 +364,8 @@ dx_probe(struct qstr *entry, struct inode *dir,
368 goto fail; 364 goto fail;
369 } 365 }
370 hinfo->hash_version = root->info.hash_version; 366 hinfo->hash_version = root->info.hash_version;
367 if (hinfo->hash_version <= DX_HASH_TEA)
368 hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
371 hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed; 369 hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed;
372 if (entry) 370 if (entry)
373 ext3fs_dirhash(entry->name, entry->len, hinfo); 371 ext3fs_dirhash(entry->name, entry->len, hinfo);
@@ -636,6 +634,9 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
636 dir = dir_file->f_path.dentry->d_inode; 634 dir = dir_file->f_path.dentry->d_inode;
637 if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) { 635 if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
638 hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; 636 hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
637 if (hinfo.hash_version <= DX_HASH_TEA)
638 hinfo.hash_version +=
639 EXT3_SB(dir->i_sb)->s_hash_unsigned;
639 hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; 640 hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
640 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, 641 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
641 start_hash, start_minor_hash); 642 start_hash, start_minor_hash);
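The dx_probe() and ext3_htree_fill_tree() hunks bump the hash version by s_hash_unsigned (0 or 3). This assumes the three *_UNSIGNED constants are numbered exactly three above their signed counterparts, matching the new enum ordering in hash.c. A toy sketch of the mapping:

#include <stdio.h>

enum { HASH_LEGACY, HASH_HALF_MD4, HASH_TEA,
       HASH_LEGACY_UNSIGNED, HASH_HALF_MD4_UNSIGNED, HASH_TEA_UNSIGNED };

int main(void)
{
        int s_hash_unsigned = 3;    /* set when the fs was created with
                                       unsigned-char hashing */

        for (int v = HASH_LEGACY; v <= HASH_TEA; v++)
                printf("version %d -> %d\n", v, v + s_hash_unsigned);
        return 0;
}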
@@ -1156,9 +1157,9 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1156 u32 hash2; 1157 u32 hash2;
1157 struct dx_map_entry *map; 1158 struct dx_map_entry *map;
1158 char *data1 = (*bh)->b_data, *data2; 1159 char *data1 = (*bh)->b_data, *data2;
1159 unsigned split, move, size, i; 1160 unsigned split, move, size;
1160 struct ext3_dir_entry_2 *de = NULL, *de2; 1161 struct ext3_dir_entry_2 *de = NULL, *de2;
1161 int err = 0; 1162 int err = 0, i;
1162 1163
1163 bh2 = ext3_append (handle, dir, &newblock, &err); 1164 bh2 = ext3_append (handle, dir, &newblock, &err);
1164 if (!(bh2)) { 1165 if (!(bh2)) {
@@ -1398,6 +1399,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1398 1399
1399 /* Initialize as for dx_probe */ 1400 /* Initialize as for dx_probe */
1400 hinfo.hash_version = root->info.hash_version; 1401 hinfo.hash_version = root->info.hash_version;
1402 if (hinfo.hash_version <= DX_HASH_TEA)
1403 hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
1401 hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; 1404 hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
1402 ext3fs_dirhash(name, namelen, &hinfo); 1405 ext3fs_dirhash(name, namelen, &hinfo);
1403 frame = frames; 1406 frame = frames;
@@ -1652,9 +1655,11 @@ static int ext3_add_nondir(handle_t *handle,
1652 if (!err) { 1655 if (!err) {
1653 ext3_mark_inode_dirty(handle, inode); 1656 ext3_mark_inode_dirty(handle, inode);
1654 d_instantiate(dentry, inode); 1657 d_instantiate(dentry, inode);
1658 unlock_new_inode(inode);
1655 return 0; 1659 return 0;
1656 } 1660 }
1657 drop_nlink(inode); 1661 drop_nlink(inode);
1662 unlock_new_inode(inode);
1658 iput(inode); 1663 iput(inode);
1659 return err; 1664 return err;
1660} 1665}
@@ -1765,6 +1770,7 @@ retry:
1765 dir_block = ext3_bread (handle, inode, 0, 1, &err); 1770 dir_block = ext3_bread (handle, inode, 0, 1, &err);
1766 if (!dir_block) { 1771 if (!dir_block) {
1767 drop_nlink(inode); /* is this nlink == 0? */ 1772 drop_nlink(inode); /* is this nlink == 0? */
1773 unlock_new_inode(inode);
1768 ext3_mark_inode_dirty(handle, inode); 1774 ext3_mark_inode_dirty(handle, inode);
1769 iput (inode); 1775 iput (inode);
1770 goto out_stop; 1776 goto out_stop;
@@ -1792,6 +1798,7 @@ retry:
1792 err = ext3_add_entry (handle, dentry, inode); 1798 err = ext3_add_entry (handle, dentry, inode);
1793 if (err) { 1799 if (err) {
1794 inode->i_nlink = 0; 1800 inode->i_nlink = 0;
1801 unlock_new_inode(inode);
1795 ext3_mark_inode_dirty(handle, inode); 1802 ext3_mark_inode_dirty(handle, inode);
1796 iput (inode); 1803 iput (inode);
1797 goto out_stop; 1804 goto out_stop;
@@ -1800,6 +1807,7 @@ retry:
1800 ext3_update_dx_flag(dir); 1807 ext3_update_dx_flag(dir);
1801 ext3_mark_inode_dirty(handle, dir); 1808 ext3_mark_inode_dirty(handle, dir);
1802 d_instantiate(dentry, inode); 1809 d_instantiate(dentry, inode);
1810 unlock_new_inode(inode);
1803out_stop: 1811out_stop:
1804 ext3_journal_stop(handle); 1812 ext3_journal_stop(handle);
1805 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) 1813 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
@@ -2170,10 +2178,10 @@ retry:
2170 * We have a transaction open. All is sweetness. It also sets 2178 * We have a transaction open. All is sweetness. It also sets
2171 * i_size in generic_commit_write(). 2179 * i_size in generic_commit_write().
2172 */ 2180 */
2173 err = __page_symlink(inode, symname, l, 2181 err = __page_symlink(inode, symname, l, 1);
2174 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
2175 if (err) { 2182 if (err) {
2176 drop_nlink(inode); 2183 drop_nlink(inode);
2184 unlock_new_inode(inode);
2177 ext3_mark_inode_dirty(handle, inode); 2185 ext3_mark_inode_dirty(handle, inode);
2178 iput (inode); 2186 iput (inode);
2179 goto out_stop; 2187 goto out_stop;
@@ -2221,7 +2229,14 @@ retry:
2221 inc_nlink(inode); 2229 inc_nlink(inode);
2222 atomic_inc(&inode->i_count); 2230 atomic_inc(&inode->i_count);
2223 2231
2224 err = ext3_add_nondir(handle, dentry, inode); 2232 err = ext3_add_entry(handle, dentry, inode);
2233 if (!err) {
2234 ext3_mark_inode_dirty(handle, inode);
2235 d_instantiate(dentry, inode);
2236 } else {
2237 drop_nlink(inode);
2238 iput(inode);
2239 }
2225 ext3_journal_stop(handle); 2240 ext3_journal_stop(handle);
2226 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) 2241 if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
2227 goto retry; 2242 goto retry;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index f6c94f232ec1..b70d90e08a3c 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -48,8 +48,8 @@ static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
48 unsigned long journal_devnum); 48 unsigned long journal_devnum);
49static int ext3_create_journal(struct super_block *, struct ext3_super_block *, 49static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
50 unsigned int); 50 unsigned int);
51static void ext3_commit_super (struct super_block * sb, 51static int ext3_commit_super(struct super_block *sb,
52 struct ext3_super_block * es, 52 struct ext3_super_block *es,
53 int sync); 53 int sync);
54static void ext3_mark_recovery_complete(struct super_block * sb, 54static void ext3_mark_recovery_complete(struct super_block * sb,
55 struct ext3_super_block * es); 55 struct ext3_super_block * es);
@@ -60,9 +60,9 @@ static const char *ext3_decode_error(struct super_block * sb, int errno,
60 char nbuf[16]); 60 char nbuf[16]);
61static int ext3_remount (struct super_block * sb, int * flags, char * data); 61static int ext3_remount (struct super_block * sb, int * flags, char * data);
62static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf); 62static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf);
63static void ext3_unlockfs(struct super_block *sb); 63static int ext3_unfreeze(struct super_block *sb);
64static void ext3_write_super (struct super_block * sb); 64static void ext3_write_super (struct super_block * sb);
65static void ext3_write_super_lockfs(struct super_block *sb); 65static int ext3_freeze(struct super_block *sb);
66 66
67/* 67/*
68 * Wrappers for journal_start/end. 68 * Wrappers for journal_start/end.
@@ -439,6 +439,7 @@ static void ext3_put_super (struct super_block * sb)
439 ext3_blkdev_remove(sbi); 439 ext3_blkdev_remove(sbi);
440 } 440 }
441 sb->s_fs_info = NULL; 441 sb->s_fs_info = NULL;
442 kfree(sbi->s_blockgroup_lock);
442 kfree(sbi); 443 kfree(sbi);
443 return; 444 return;
444} 445}
@@ -682,6 +683,26 @@ static struct dentry *ext3_fh_to_parent(struct super_block *sb, struct fid *fid,
682 ext3_nfs_get_inode); 683 ext3_nfs_get_inode);
683} 684}
684 685
686/*
687 * Try to release metadata pages (indirect blocks, directories) which are
688 * mapped via the block device. Since these pages could have journal heads
689 * which would prevent try_to_free_buffers() from freeing them, we must use
690 * jbd layer's try_to_free_buffers() function to release them.
691 */
692static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
693 gfp_t wait)
694{
695 journal_t *journal = EXT3_SB(sb)->s_journal;
696
697 WARN_ON(PageChecked(page));
698 if (!page_has_buffers(page))
699 return 0;
700 if (journal)
701 return journal_try_to_free_buffers(journal, page,
702 wait & ~__GFP_WAIT);
703 return try_to_free_buffers(page);
704}
705
685#ifdef CONFIG_QUOTA 706#ifdef CONFIG_QUOTA
686#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") 707#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
687#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) 708#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
@@ -713,7 +734,9 @@ static struct dquot_operations ext3_quota_operations = {
713 .acquire_dquot = ext3_acquire_dquot, 734 .acquire_dquot = ext3_acquire_dquot,
714 .release_dquot = ext3_release_dquot, 735 .release_dquot = ext3_release_dquot,
715 .mark_dirty = ext3_mark_dquot_dirty, 736 .mark_dirty = ext3_mark_dquot_dirty,
716 .write_info = ext3_write_info 737 .write_info = ext3_write_info,
738 .alloc_dquot = dquot_alloc,
739 .destroy_dquot = dquot_destroy,
717}; 740};
718 741
719static struct quotactl_ops ext3_qctl_operations = { 742static struct quotactl_ops ext3_qctl_operations = {
@@ -736,8 +759,8 @@ static const struct super_operations ext3_sops = {
736 .put_super = ext3_put_super, 759 .put_super = ext3_put_super,
737 .write_super = ext3_write_super, 760 .write_super = ext3_write_super,
738 .sync_fs = ext3_sync_fs, 761 .sync_fs = ext3_sync_fs,
739 .write_super_lockfs = ext3_write_super_lockfs, 762 .freeze_fs = ext3_freeze,
740 .unlockfs = ext3_unlockfs, 763 .unfreeze_fs = ext3_unfreeze,
741 .statfs = ext3_statfs, 764 .statfs = ext3_statfs,
742 .remount_fs = ext3_remount, 765 .remount_fs = ext3_remount,
743 .clear_inode = ext3_clear_inode, 766 .clear_inode = ext3_clear_inode,
@@ -746,6 +769,7 @@ static const struct super_operations ext3_sops = {
746 .quota_read = ext3_quota_read, 769 .quota_read = ext3_quota_read,
747 .quota_write = ext3_quota_write, 770 .quota_write = ext3_quota_write,
748#endif 771#endif
772 .bdev_try_to_free_page = bdev_try_to_free_page,
749}; 773};
750 774
751static const struct export_operations ext3_export_ops = { 775static const struct export_operations ext3_export_ops = {
@@ -1035,8 +1059,7 @@ static int parse_options (char *options, struct super_block *sb,
1035 case Opt_grpjquota: 1059 case Opt_grpjquota:
1036 qtype = GRPQUOTA; 1060 qtype = GRPQUOTA;
1037set_qf_name: 1061set_qf_name:
1038 if ((sb_any_quota_enabled(sb) || 1062 if (sb_any_quota_loaded(sb) &&
1039 sb_any_quota_suspended(sb)) &&
1040 !sbi->s_qf_names[qtype]) { 1063 !sbi->s_qf_names[qtype]) {
1041 printk(KERN_ERR 1064 printk(KERN_ERR
1042 "EXT3-fs: Cannot change journaled " 1065 "EXT3-fs: Cannot change journaled "
@@ -1075,8 +1098,7 @@ set_qf_name:
1075 case Opt_offgrpjquota: 1098 case Opt_offgrpjquota:
1076 qtype = GRPQUOTA; 1099 qtype = GRPQUOTA;
1077clear_qf_name: 1100clear_qf_name:
1078 if ((sb_any_quota_enabled(sb) || 1101 if (sb_any_quota_loaded(sb) &&
1079 sb_any_quota_suspended(sb)) &&
1080 sbi->s_qf_names[qtype]) { 1102 sbi->s_qf_names[qtype]) {
1081 printk(KERN_ERR "EXT3-fs: Cannot change " 1103 printk(KERN_ERR "EXT3-fs: Cannot change "
1082 "journaled quota options when " 1104 "journaled quota options when "
@@ -1095,8 +1117,7 @@ clear_qf_name:
1095 case Opt_jqfmt_vfsv0: 1117 case Opt_jqfmt_vfsv0:
1096 qfmt = QFMT_VFS_V0; 1118 qfmt = QFMT_VFS_V0;
1097set_qf_format: 1119set_qf_format:
1098 if ((sb_any_quota_enabled(sb) || 1120 if (sb_any_quota_loaded(sb) &&
1099 sb_any_quota_suspended(sb)) &&
1100 sbi->s_jquota_fmt != qfmt) { 1121 sbi->s_jquota_fmt != qfmt) {
1101 printk(KERN_ERR "EXT3-fs: Cannot change " 1122 printk(KERN_ERR "EXT3-fs: Cannot change "
1102 "journaled quota options when " 1123 "journaled quota options when "
@@ -1115,8 +1136,7 @@ set_qf_format:
1115 set_opt(sbi->s_mount_opt, GRPQUOTA); 1136 set_opt(sbi->s_mount_opt, GRPQUOTA);
1116 break; 1137 break;
1117 case Opt_noquota: 1138 case Opt_noquota:
1118 if (sb_any_quota_enabled(sb) || 1139 if (sb_any_quota_loaded(sb)) {
1119 sb_any_quota_suspended(sb)) {
1120 printk(KERN_ERR "EXT3-fs: Cannot change quota " 1140 printk(KERN_ERR "EXT3-fs: Cannot change quota "
1121 "options when quota turned on.\n"); 1141 "options when quota turned on.\n");
1122 return 0; 1142 return 0;
@@ -1548,6 +1568,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1548 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 1568 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
1549 if (!sbi) 1569 if (!sbi)
1550 return -ENOMEM; 1570 return -ENOMEM;
1571
1572 sbi->s_blockgroup_lock =
1573 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
1574 if (!sbi->s_blockgroup_lock) {
1575 kfree(sbi);
1576 return -ENOMEM;
1577 }
1551 sb->s_fs_info = sbi; 1578 sb->s_fs_info = sbi;
1552 sbi->s_mount_opt = 0; 1579 sbi->s_mount_opt = 0;
1553 sbi->s_resuid = EXT3_DEF_RESUID; 1580 sbi->s_resuid = EXT3_DEF_RESUID;
@@ -1744,6 +1771,18 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1744 for (i=0; i < 4; i++) 1771 for (i=0; i < 4; i++)
1745 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); 1772 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
1746 sbi->s_def_hash_version = es->s_def_hash_version; 1773 sbi->s_def_hash_version = es->s_def_hash_version;
1774 i = le32_to_cpu(es->s_flags);
1775 if (i & EXT2_FLAGS_UNSIGNED_HASH)
1776 sbi->s_hash_unsigned = 3;
1777 else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
1778#ifdef __CHAR_UNSIGNED__
1779 es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
1780 sbi->s_hash_unsigned = 3;
1781#else
1782 es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
1783#endif
1784 sb->s_dirt = 1;
1785 }
1747 1786
1748 if (sbi->s_blocks_per_group > blocksize * 8) { 1787 if (sbi->s_blocks_per_group > blocksize * 8) {
1749 printk (KERN_ERR 1788 printk (KERN_ERR
@@ -1788,7 +1827,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1788 goto failed_mount; 1827 goto failed_mount;
1789 } 1828 }
1790 1829
1791 bgl_lock_init(&sbi->s_blockgroup_lock); 1830 bgl_lock_init(sbi->s_blockgroup_lock);
1792 1831
1793 for (i = 0; i < db_count; i++) { 1832 for (i = 0; i < db_count; i++) {
1794 block = descriptor_loc(sb, logic_sb_block, i); 1833 block = descriptor_loc(sb, logic_sb_block, i);
@@ -2272,21 +2311,23 @@ static int ext3_create_journal(struct super_block * sb,
2272 return 0; 2311 return 0;
2273} 2312}
2274 2313
2275static void ext3_commit_super (struct super_block * sb, 2314static int ext3_commit_super(struct super_block *sb,
2276 struct ext3_super_block * es, 2315 struct ext3_super_block *es,
2277 int sync) 2316 int sync)
2278{ 2317{
2279 struct buffer_head *sbh = EXT3_SB(sb)->s_sbh; 2318 struct buffer_head *sbh = EXT3_SB(sb)->s_sbh;
2319 int error = 0;
2280 2320
2281 if (!sbh) 2321 if (!sbh)
2282 return; 2322 return error;
2283 es->s_wtime = cpu_to_le32(get_seconds()); 2323 es->s_wtime = cpu_to_le32(get_seconds());
2284 es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb)); 2324 es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb));
2285 es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); 2325 es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
2286 BUFFER_TRACE(sbh, "marking dirty"); 2326 BUFFER_TRACE(sbh, "marking dirty");
2287 mark_buffer_dirty(sbh); 2327 mark_buffer_dirty(sbh);
2288 if (sync) 2328 if (sync)
2289 sync_dirty_buffer(sbh); 2329 error = sync_dirty_buffer(sbh);
2330 return error;
2290} 2331}
2291 2332
2292 2333
@@ -2400,12 +2441,14 @@ static int ext3_sync_fs(struct super_block *sb, int wait)
2400 * LVM calls this function before a (read-only) snapshot is created. This 2441 * LVM calls this function before a (read-only) snapshot is created. This
2401 * gives us a chance to flush the journal completely and mark the fs clean. 2442 * gives us a chance to flush the journal completely and mark the fs clean.
2402 */ 2443 */
2403static void ext3_write_super_lockfs(struct super_block *sb) 2444static int ext3_freeze(struct super_block *sb)
2404{ 2445{
2446 int error = 0;
2447 journal_t *journal;
2405 sb->s_dirt = 0; 2448 sb->s_dirt = 0;
2406 2449
2407 if (!(sb->s_flags & MS_RDONLY)) { 2450 if (!(sb->s_flags & MS_RDONLY)) {
2408 journal_t *journal = EXT3_SB(sb)->s_journal; 2451 journal = EXT3_SB(sb)->s_journal;
2409 2452
2410 /* Now we set up the journal barrier. */ 2453 /* Now we set up the journal barrier. */
2411 journal_lock_updates(journal); 2454 journal_lock_updates(journal);
@@ -2414,20 +2457,28 @@ static void ext3_write_super_lockfs(struct super_block *sb)
2414 * We don't want to clear needs_recovery flag when we failed 2457 * We don't want to clear needs_recovery flag when we failed
2415 * to flush the journal. 2458 * to flush the journal.
2416 */ 2459 */
2417 if (journal_flush(journal) < 0) 2460 error = journal_flush(journal);
2418 return; 2461 if (error < 0)
2462 goto out;
2419 2463
2420 /* Journal blocked and flushed, clear needs_recovery flag. */ 2464 /* Journal blocked and flushed, clear needs_recovery flag. */
2421 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); 2465 EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
2422 ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); 2466 error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
2467 if (error)
2468 goto out;
2423 } 2469 }
2470 return 0;
2471
2472out:
2473 journal_unlock_updates(journal);
2474 return error;
2424} 2475}
2425 2476
2426/* 2477/*
2427 * Called by LVM after the snapshot is done. We need to reset the RECOVER 2478 * Called by LVM after the snapshot is done. We need to reset the RECOVER
2428 * flag here, even though the filesystem is not technically dirty yet. 2479 * flag here, even though the filesystem is not technically dirty yet.
2429 */ 2480 */
2430static void ext3_unlockfs(struct super_block *sb) 2481static int ext3_unfreeze(struct super_block *sb)
2431{ 2482{
2432 if (!(sb->s_flags & MS_RDONLY)) { 2483 if (!(sb->s_flags & MS_RDONLY)) {
2433 lock_super(sb); 2484 lock_super(sb);
@@ -2437,6 +2488,7 @@ static void ext3_unlockfs(struct super_block *sb)
2437 unlock_super(sb); 2488 unlock_super(sb);
2438 journal_unlock_updates(EXT3_SB(sb)->s_journal); 2489 journal_unlock_updates(EXT3_SB(sb)->s_journal);
2439 } 2490 }
2491 return 0;
2440} 2492}
2441 2493
2442static int ext3_remount (struct super_block * sb, int * flags, char * data) 2494static int ext3_remount (struct super_block * sb, int * flags, char * data)
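The old write_super_lockfs/unlockfs hooks returned void, so a failed journal flush could leave the filesystem frozen with updates still blocked. The renamed freeze_fs/unfreeze_fs hooks return an error, and ext3_freeze() now unblocks journal updates on any failure path. A minimal sketch of that contract (names illustrative, -5 simulating -EIO):

#include <stdio.h>

static int journal_locked;

static void journal_lock(void)       { journal_locked = 1; }
static void journal_unlock(void)     { journal_locked = 0; }
static int  journal_flush_fail(void) { return -5; /* simulated -EIO */ }

static int fs_freeze(void)
{
        int error;

        journal_lock();                 /* block new transactions */
        error = journal_flush_fail();
        if (error < 0)
                goto out;               /* propagate instead of bare return */
        return 0;
out:
        journal_unlock();               /* don't leave updates blocked */
        return error;
}

int main(void)
{
        printf("freeze=%d journal_locked=%d\n", fs_freeze(), journal_locked);
        return 0;
}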
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 38b3acf5683b..6bba06b09dd1 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -20,6 +20,7 @@
20#include "ext4.h" 20#include "ext4.h"
21#include "ext4_jbd2.h" 21#include "ext4_jbd2.h"
22#include "group.h" 22#include "group.h"
23#include "mballoc.h"
23 24
24/* 25/*
25 * balloc.c contains the blocks allocation and deallocation routines 26 * balloc.c contains the blocks allocation and deallocation routines
@@ -100,10 +101,10 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
100 * essentially implementing a per-group read-only flag. */ 101 * essentially implementing a per-group read-only flag. */
101 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { 102 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
102 ext4_error(sb, __func__, 103 ext4_error(sb, __func__,
103 "Checksum bad for group %lu\n", block_group); 104 "Checksum bad for group %u", block_group);
104 gdp->bg_free_blocks_count = 0; 105 ext4_free_blks_set(sb, gdp, 0);
105 gdp->bg_free_inodes_count = 0; 106 ext4_free_inodes_set(sb, gdp, 0);
106 gdp->bg_itable_unused = 0; 107 ext4_itable_unused_set(sb, gdp, 0);
107 memset(bh->b_data, 0xff, sb->s_blocksize); 108 memset(bh->b_data, 0xff, sb->s_blocksize);
108 return 0; 109 return 0;
109 } 110 }
@@ -205,15 +206,15 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
205 ext4_group_t block_group, 206 ext4_group_t block_group,
206 struct buffer_head **bh) 207 struct buffer_head **bh)
207{ 208{
208 unsigned long group_desc; 209 unsigned int group_desc;
209 unsigned long offset; 210 unsigned int offset;
210 struct ext4_group_desc *desc; 211 struct ext4_group_desc *desc;
211 struct ext4_sb_info *sbi = EXT4_SB(sb); 212 struct ext4_sb_info *sbi = EXT4_SB(sb);
212 213
213 if (block_group >= sbi->s_groups_count) { 214 if (block_group >= sbi->s_groups_count) {
214 ext4_error(sb, "ext4_get_group_desc", 215 ext4_error(sb, "ext4_get_group_desc",
215 "block_group >= groups_count - " 216 "block_group >= groups_count - "
216 "block_group = %lu, groups_count = %lu", 217 "block_group = %u, groups_count = %u",
217 block_group, sbi->s_groups_count); 218 block_group, sbi->s_groups_count);
218 219
219 return NULL; 220 return NULL;
@@ -225,7 +226,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
225 if (!sbi->s_group_desc[group_desc]) { 226 if (!sbi->s_group_desc[group_desc]) {
226 ext4_error(sb, "ext4_get_group_desc", 227 ext4_error(sb, "ext4_get_group_desc",
227 "Group descriptor not loaded - " 228 "Group descriptor not loaded - "
228 "block_group = %lu, group_desc = %lu, desc = %lu", 229 "block_group = %u, group_desc = %u, desc = %u",
229 block_group, group_desc, offset); 230 block_group, group_desc, offset);
230 return NULL; 231 return NULL;
231 } 232 }
@@ -315,29 +316,50 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
315 if (unlikely(!bh)) { 316 if (unlikely(!bh)) {
316 ext4_error(sb, __func__, 317 ext4_error(sb, __func__,
317 "Cannot read block bitmap - " 318 "Cannot read block bitmap - "
318 "block_group = %lu, block_bitmap = %llu", 319 "block_group = %u, block_bitmap = %llu",
319 block_group, bitmap_blk); 320 block_group, bitmap_blk);
320 return NULL; 321 return NULL;
321 } 322 }
322 if (buffer_uptodate(bh) && 323
323 !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) 324 if (bitmap_uptodate(bh))
324 return bh; 325 return bh;
325 326
326 lock_buffer(bh); 327 lock_buffer(bh);
328 if (bitmap_uptodate(bh)) {
329 unlock_buffer(bh);
330 return bh;
331 }
327 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); 332 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
328 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 333 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
329 ext4_init_block_bitmap(sb, bh, block_group, desc); 334 ext4_init_block_bitmap(sb, bh, block_group, desc);
335 set_bitmap_uptodate(bh);
330 set_buffer_uptodate(bh); 336 set_buffer_uptodate(bh);
331 unlock_buffer(bh);
332 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); 337 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
338 unlock_buffer(bh);
333 return bh; 339 return bh;
334 } 340 }
335 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); 341 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
342 if (buffer_uptodate(bh)) {
343 /*
344 * if not uninit if bh is uptodate,
345 * bitmap is also uptodate
346 */
347 set_bitmap_uptodate(bh);
348 unlock_buffer(bh);
349 return bh;
350 }
351 /*
352 * submit the buffer_head for read. We can
353 * safely mark the bitmap as uptodate now.
354 * We do it here so the bitmap uptodate bit
 355 * gets set with the buffer lock held.
356 */
357 set_bitmap_uptodate(bh);
336 if (bh_submit_read(bh) < 0) { 358 if (bh_submit_read(bh) < 0) {
337 put_bh(bh); 359 put_bh(bh);
338 ext4_error(sb, __func__, 360 ext4_error(sb, __func__,
339 "Cannot read block bitmap - " 361 "Cannot read block bitmap - "
340 "block_group = %lu, block_bitmap = %llu", 362 "block_group = %u, block_bitmap = %llu",
341 block_group, bitmap_blk); 363 block_group, bitmap_blk);
342 return NULL; 364 return NULL;
343 } 365 }
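ext4_read_block_bitmap() now tracks two conditions: buffer_uptodate (the bytes were read) and a new bitmap_uptodate bit (the bitmap content is valid, possibly built in memory for an uninit group without any I/O), with the bitmap bit always set under the buffer lock. A user-space sketch of the flag dance, with booleans standing in for the buffer flags:

#include <stdio.h>
#include <stdbool.h>

struct bh { bool buffer_uptodate; bool bitmap_uptodate; bool locked; };

static struct bh *read_bitmap(struct bh *bh, bool group_uninit)
{
        if (bh->bitmap_uptodate)        /* fast path, no lock needed */
                return bh;
        bh->locked = true;              /* lock_buffer() */
        if (bh->bitmap_uptodate) {      /* re-check under the lock */
                bh->locked = false;
                return bh;
        }
        if (group_uninit) {             /* build the bitmap in memory */
                bh->bitmap_uptodate = bh->buffer_uptodate = true;
                bh->locked = false;
                return bh;
        }
        if (bh->buffer_uptodate) {      /* bytes already read from disk */
                bh->bitmap_uptodate = true;
                bh->locked = false;
                return bh;
        }
        bh->bitmap_uptodate = true;     /* set before submitting the read,
                                           while the buffer is locked */
        /* bh_submit_read(bh) would go here */
        bh->buffer_uptodate = true;
        bh->locked = false;
        return bh;
}

int main(void)
{
        struct bh bh = {0};

        read_bitmap(&bh, false);
        printf("buffer=%d bitmap=%d\n", bh.buffer_uptodate, bh.bitmap_uptodate);
        return 0;
}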
@@ -350,62 +372,44 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
350} 372}
351 373
352/** 374/**
353 * ext4_free_blocks_sb() -- Free given blocks and update quota 375 * ext4_add_groupblocks() -- Add given blocks to an existing group
354 * @handle: handle to this transaction 376 * @handle: handle to this transaction
355 * @sb: super block 377 * @sb: super block
356 * @block: start physical block to free 378 * @block: start physical block to add to the block group
357 * @count: number of blocks to free 379 * @count: number of blocks to free
358 * @pdquot_freed_blocks: pointer to quota
359 * 380 *
360 * XXX This function is only used by the on-line resizing code, which 381 * This marks the blocks as free in the bitmap. We ask the
361 * should probably be fixed up to call the mballoc variant. There 382 * mballoc to reload the buddy after this by setting group
362 * this needs to be cleaned up later; in fact, I'm not convinced this 383 * EXT4_GROUP_INFO_NEED_INIT_BIT flag
363 * is 100% correct in the face of the mballoc code. The online resizing
364 * code needs to be fixed up to more tightly (and correctly) interlock
365 * with the mballoc code.
366 */ 384 */
367void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, 385void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
368 ext4_fsblk_t block, unsigned long count, 386 ext4_fsblk_t block, unsigned long count)
369 unsigned long *pdquot_freed_blocks)
370{ 387{
371 struct buffer_head *bitmap_bh = NULL; 388 struct buffer_head *bitmap_bh = NULL;
372 struct buffer_head *gd_bh; 389 struct buffer_head *gd_bh;
373 ext4_group_t block_group; 390 ext4_group_t block_group;
374 ext4_grpblk_t bit; 391 ext4_grpblk_t bit;
375 unsigned long i; 392 unsigned int i;
376 unsigned long overflow;
377 struct ext4_group_desc *desc; 393 struct ext4_group_desc *desc;
378 struct ext4_super_block *es; 394 struct ext4_super_block *es;
379 struct ext4_sb_info *sbi; 395 struct ext4_sb_info *sbi;
380 int err = 0, ret; 396 int err = 0, ret, blk_free_count;
381 ext4_grpblk_t group_freed; 397 ext4_grpblk_t blocks_freed;
398 struct ext4_group_info *grp;
382 399
383 *pdquot_freed_blocks = 0;
384 sbi = EXT4_SB(sb); 400 sbi = EXT4_SB(sb);
385 es = sbi->s_es; 401 es = sbi->s_es;
386 if (block < le32_to_cpu(es->s_first_data_block) || 402 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
387 block + count < block ||
388 block + count > ext4_blocks_count(es)) {
389 ext4_error(sb, "ext4_free_blocks",
390 "Freeing blocks not in datazone - "
391 "block = %llu, count = %lu", block, count);
392 goto error_return;
393 }
394
395 ext4_debug("freeing block(s) %llu-%llu\n", block, block + count - 1);
396 403
397do_more:
398 overflow = 0;
399 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 404 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
405 grp = ext4_get_group_info(sb, block_group);
400 /* 406 /*
401 * Check to see if we are freeing blocks across a group 407 * Check to see if we are freeing blocks across a group
402 * boundary. 408 * boundary.
403 */ 409 */
404 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { 410 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
405 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); 411 goto error_return;
406 count -= overflow;
407 } 412 }
408 brelse(bitmap_bh);
409 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 413 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
410 if (!bitmap_bh) 414 if (!bitmap_bh)
411 goto error_return; 415 goto error_return;
@@ -418,18 +422,17 @@ do_more:
418 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || 422 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
419 in_range(block + count - 1, ext4_inode_table(sb, desc), 423 in_range(block + count - 1, ext4_inode_table(sb, desc),
420 sbi->s_itb_per_group)) { 424 sbi->s_itb_per_group)) {
421 ext4_error(sb, "ext4_free_blocks", 425 ext4_error(sb, __func__,
422 "Freeing blocks in system zones - " 426 "Adding blocks in system zones - "
423 "Block = %llu, count = %lu", 427 "Block = %llu, count = %lu",
424 block, count); 428 block, count);
425 goto error_return; 429 goto error_return;
426 } 430 }
427 431
428 /* 432 /*
429 * We are about to start releasing blocks in the bitmap, 433 * We are about to add blocks to the bitmap,
430 * so we need undo access. 434 * so we need undo access.
431 */ 435 */
432 /* @@@ check errors */
433 BUFFER_TRACE(bitmap_bh, "getting undo access"); 436 BUFFER_TRACE(bitmap_bh, "getting undo access");
434 err = ext4_journal_get_undo_access(handle, bitmap_bh); 437 err = ext4_journal_get_undo_access(handle, bitmap_bh);
435 if (err) 438 if (err)
@@ -444,107 +447,55 @@ do_more:
444 err = ext4_journal_get_write_access(handle, gd_bh); 447 err = ext4_journal_get_write_access(handle, gd_bh);
445 if (err) 448 if (err)
446 goto error_return; 449 goto error_return;
447 450 /*
448 jbd_lock_bh_state(bitmap_bh); 451 * make sure we don't allow a parallel init on other groups in the
449 452 * same buddy cache
450 for (i = 0, group_freed = 0; i < count; i++) { 453 */
451 /* 454 down_write(&grp->alloc_sem);
452 * An HJ special. This is expensive... 455 for (i = 0, blocks_freed = 0; i < count; i++) {
453 */
454#ifdef CONFIG_JBD2_DEBUG
455 jbd_unlock_bh_state(bitmap_bh);
456 {
457 struct buffer_head *debug_bh;
458 debug_bh = sb_find_get_block(sb, block + i);
459 if (debug_bh) {
460 BUFFER_TRACE(debug_bh, "Deleted!");
461 if (!bh2jh(bitmap_bh)->b_committed_data)
462 BUFFER_TRACE(debug_bh,
463 "No commited data in bitmap");
464 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
465 __brelse(debug_bh);
466 }
467 }
468 jbd_lock_bh_state(bitmap_bh);
469#endif
470 if (need_resched()) {
471 jbd_unlock_bh_state(bitmap_bh);
472 cond_resched();
473 jbd_lock_bh_state(bitmap_bh);
474 }
475 /* @@@ This prevents newly-allocated data from being
476 * freed and then reallocated within the same
477 * transaction.
478 *
479 * Ideally we would want to allow that to happen, but to
480 * do so requires making jbd2_journal_forget() capable of
481 * revoking the queued write of a data block, which
482 * implies blocking on the journal lock. *forget()
483 * cannot block due to truncate races.
484 *
485 * Eventually we can fix this by making jbd2_journal_forget()
486 * return a status indicating whether or not it was able
487 * to revoke the buffer. On successful revoke, it is
488 * safe not to set the allocation bit in the committed
489 * bitmap, because we know that there is no outstanding
490 * activity on the buffer any more and so it is safe to
491 * reallocate it.
492 */
493 BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
494 J_ASSERT_BH(bitmap_bh,
495 bh2jh(bitmap_bh)->b_committed_data != NULL);
496 ext4_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
497 bh2jh(bitmap_bh)->b_committed_data);
498
499 /*
500 * We clear the bit in the bitmap after setting the committed
501 * data bit, because this is the reverse order to that which
502 * the allocator uses.
503 */
504 BUFFER_TRACE(bitmap_bh, "clear bit"); 456 BUFFER_TRACE(bitmap_bh, "clear bit");
505 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), 457 if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
506 bit + i, bitmap_bh->b_data)) { 458 bit + i, bitmap_bh->b_data)) {
507 jbd_unlock_bh_state(bitmap_bh);
508 ext4_error(sb, __func__, 459 ext4_error(sb, __func__,
509 "bit already cleared for block %llu", 460 "bit already cleared for block %llu",
510 (ext4_fsblk_t)(block + i)); 461 (ext4_fsblk_t)(block + i));
511 jbd_lock_bh_state(bitmap_bh);
512 BUFFER_TRACE(bitmap_bh, "bit already cleared"); 462 BUFFER_TRACE(bitmap_bh, "bit already cleared");
513 } else { 463 } else {
514 group_freed++; 464 blocks_freed++;
515 } 465 }
516 } 466 }
517 jbd_unlock_bh_state(bitmap_bh);
518
519 spin_lock(sb_bgl_lock(sbi, block_group)); 467 spin_lock(sb_bgl_lock(sbi, block_group));
520 le16_add_cpu(&desc->bg_free_blocks_count, group_freed); 468 blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
469 ext4_free_blks_set(sb, desc, blk_free_count);
521 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); 470 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
522 spin_unlock(sb_bgl_lock(sbi, block_group)); 471 spin_unlock(sb_bgl_lock(sbi, block_group));
523 percpu_counter_add(&sbi->s_freeblocks_counter, count); 472 percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
524 473
525 if (sbi->s_log_groups_per_flex) { 474 if (sbi->s_log_groups_per_flex) {
526 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 475 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
527 spin_lock(sb_bgl_lock(sbi, flex_group)); 476 spin_lock(sb_bgl_lock(sbi, flex_group));
528 sbi->s_flex_groups[flex_group].free_blocks += count; 477 sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
529 spin_unlock(sb_bgl_lock(sbi, flex_group)); 478 spin_unlock(sb_bgl_lock(sbi, flex_group));
530 } 479 }
480 /*
481 * request to reload the buddy with the
482 * new bitmap information
483 */
484 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
485 ext4_mb_update_group_info(grp, blocks_freed);
486 up_write(&grp->alloc_sem);
531 487
532 /* We dirtied the bitmap block */ 488 /* We dirtied the bitmap block */
533 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 489 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
534 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 490 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
535 491
536 /* And the group descriptor block */ 492 /* And the group descriptor block */
537 BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); 493 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
538 ret = ext4_journal_dirty_metadata(handle, gd_bh); 494 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
539 if (!err) err = ret; 495 if (!err)
540 *pdquot_freed_blocks += group_freed; 496 err = ret;
541
542 if (overflow && !err) {
543 block += count;
544 count = overflow;
545 goto do_more;
546 }
547 sb->s_dirt = 1; 497 sb->s_dirt = 1;
498
548error_return: 499error_return:
549 brelse(bitmap_bh); 500 brelse(bitmap_bh);
550 ext4_std_error(sb, err); 501 ext4_std_error(sb, err);
@@ -614,7 +565,7 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
614 if (dirty_blocks < 0) { 565 if (dirty_blocks < 0) {
615 printk(KERN_CRIT "Dirty block accounting " 566 printk(KERN_CRIT "Dirty block accounting "
616 "went wrong %lld\n", 567 "went wrong %lld\n",
617 dirty_blocks); 568 (long long)dirty_blocks);
618 } 569 }
619 } 570 }
620 /* Check whether we have space after 571 /* Check whether we have space after
@@ -666,101 +617,45 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
666 return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); 617 return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
667} 618}
668 619
669#define EXT4_META_BLOCK 0x1
670
671static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
672 ext4_lblk_t iblock, ext4_fsblk_t goal,
673 unsigned long *count, int *errp, int flags)
674{
675 struct ext4_allocation_request ar;
676 ext4_fsblk_t ret;
677
678 memset(&ar, 0, sizeof(ar));
679 /* Fill with neighbour allocated blocks */
680
681 ar.inode = inode;
682 ar.goal = goal;
683 ar.len = *count;
684 ar.logical = iblock;
685
686 if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
687 /* enable in-core preallocation for data block allocation */
688 ar.flags = EXT4_MB_HINT_DATA;
689 else
690 /* disable in-core preallocation for non-regular files */
691 ar.flags = 0;
692
693 ret = ext4_mb_new_blocks(handle, &ar, errp);
694 *count = ar.len;
695 return ret;
696}
697
698/* 620/*
699 * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks 621 * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
700 * 622 *
701 * @handle: handle to this transaction 623 * @handle: handle to this transaction
702 * @inode: file inode 624 * @inode: file inode
703 * @goal: given target block(filesystem wide) 625 * @goal: given target block(filesystem wide)
704 * @count: total number of blocks need 626 * @count: pointer to total number of blocks needed
705 * @errp: error code 627 * @errp: error code
706 * 628 *
707 * Return 1st allocated block numberon success, *count stores total account 629 * Return 1st allocated block number on success, *count stores total account
708 * error stores in errp pointer 630 * error stores in errp pointer
709 */ 631 */
710ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, 632ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
711 ext4_fsblk_t goal, unsigned long *count, int *errp) 633 ext4_fsblk_t goal, unsigned long *count, int *errp)
712{ 634{
635 struct ext4_allocation_request ar;
713 ext4_fsblk_t ret; 636 ext4_fsblk_t ret;
714 ret = do_blk_alloc(handle, inode, 0, goal, 637
715 count, errp, EXT4_META_BLOCK); 638 memset(&ar, 0, sizeof(ar));
639 /* Fill with neighbour allocated blocks */
640 ar.inode = inode;
641 ar.goal = goal;
642 ar.len = count ? *count : 1;
643
644 ret = ext4_mb_new_blocks(handle, &ar, errp);
645 if (count)
646 *count = ar.len;
647
716 /* 648 /*
717 * Account for the allocated meta blocks 649 * Account for the allocated meta blocks
718 */ 650 */
719 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { 651 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
720 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 652 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
721 EXT4_I(inode)->i_allocated_meta_blocks += *count; 653 EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
722 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 654 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
723 } 655 }
724 return ret; 656 return ret;
725} 657}
726 658
727/*
728 * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks
729 *
730 * @handle: handle to this transaction
731 * @inode: file inode
732 * @goal: given target block(filesystem wide)
733 * @errp: error code
734 *
735 * Return allocated block number on success
736 */
737ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
738 ext4_fsblk_t goal, int *errp)
739{
740 unsigned long count = 1;
741 return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
742}
743
744/*
745 * ext4_new_blocks() -- allocate data blocks
746 *
747 * @handle: handle to this transaction
748 * @inode: file inode
749 * @goal: given target block(filesystem wide)
750 * @count: total number of blocks need
751 * @errp: error code
752 *
753 * Return 1st allocated block numberon success, *count stores total account
754 * error stores in errp pointer
755 */
756
757ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
758 ext4_lblk_t iblock, ext4_fsblk_t goal,
759 unsigned long *count, int *errp)
760{
761 return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
762}
763
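With do_blk_alloc(), ext4_new_meta_block() and ext4_new_blocks() gone, single-block metadata allocations go through ext4_new_meta_blocks() directly, passing NULL for count (which the new code above defaults to one block). A minimal caller sketch, with a hypothetical helper name:

	/* Allocate exactly one metadata block near "goal"; a NULL count
	 * makes ext4_new_meta_blocks() default ar.len to 1. */
	static ext4_fsblk_t alloc_one_meta_block(handle_t *handle,
						 struct inode *inode,
						 ext4_fsblk_t goal, int *errp)
	{
		return ext4_new_meta_blocks(handle, inode, goal, NULL, errp);
	}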
764/** 659/**
765 * ext4_count_free_blocks() -- count filesystem free blocks 660 * ext4_count_free_blocks() -- count filesystem free blocks
766 * @sb: superblock 661 * @sb: superblock
@@ -776,7 +671,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
776#ifdef EXT4FS_DEBUG 671#ifdef EXT4FS_DEBUG
777 struct ext4_super_block *es; 672 struct ext4_super_block *es;
778 ext4_fsblk_t bitmap_count; 673 ext4_fsblk_t bitmap_count;
779 unsigned long x; 674 unsigned int x;
780 struct buffer_head *bitmap_bh = NULL; 675 struct buffer_head *bitmap_bh = NULL;
781 676
782 es = EXT4_SB(sb)->s_es; 677 es = EXT4_SB(sb)->s_es;
@@ -796,7 +691,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
796 continue; 691 continue;
797 692
798 x = ext4_count_free(bitmap_bh, sb->s_blocksize); 693 x = ext4_count_free(bitmap_bh, sb->s_blocksize);
799 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", 694 printk(KERN_DEBUG "group %lu: stored = %d, counted = %u\n",
800 i, le16_to_cpu(gdp->bg_free_blocks_count), x); 695 i, le16_to_cpu(gdp->bg_free_blocks_count), x);
801 bitmap_count += x; 696 bitmap_count += x;
802 } 697 }
@@ -812,7 +707,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
812 gdp = ext4_get_group_desc(sb, i, NULL); 707 gdp = ext4_get_group_desc(sb, i, NULL);
813 if (!gdp) 708 if (!gdp)
814 continue; 709 continue;
815 desc_count += le16_to_cpu(gdp->bg_free_blocks_count); 710 desc_count += ext4_free_blks_count(sb, gdp);
816 } 711 }
817 712
818 return desc_count; 713 return desc_count;
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index 0a7a6663c190..fa3af81ac565 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -15,10 +15,9 @@
15 15
16static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; 16static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
17 17
18unsigned long ext4_count_free(struct buffer_head *map, unsigned int numchars) 18unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars)
19{ 19{
20 unsigned int i; 20 unsigned int i, sum = 0;
21 unsigned long sum = 0;
22 21
23 if (!map) 22 if (!map)
24 return 0; 23 return 0;
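For reference, the counting loop itself (not shown in this hunk) sums the zero bits of each nibble using the table above; a sketch of that idea, assuming the usual buffer_head layout:

	for (i = 0; i < numchars; i++)
		/* e.g. byte 0xF0: low nibble 0x0 contributes nibblemap[0] = 4
		 * free bits, high nibble 0xF contributes nibblemap[15] = 0 */
		sum += nibblemap[map->b_data[i] & 0xf] +
		       nibblemap[(map->b_data[i] >> 4) & 0xf];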
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index fed5b610df5a..2df2e40b01af 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -64,7 +64,7 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
64int ext4_check_dir_entry(const char *function, struct inode *dir, 64int ext4_check_dir_entry(const char *function, struct inode *dir,
65 struct ext4_dir_entry_2 *de, 65 struct ext4_dir_entry_2 *de,
66 struct buffer_head *bh, 66 struct buffer_head *bh,
67 unsigned long offset) 67 unsigned int offset)
68{ 68{
69 const char *error_msg = NULL; 69 const char *error_msg = NULL;
70 const int rlen = ext4_rec_len_from_disk(de->rec_len); 70 const int rlen = ext4_rec_len_from_disk(de->rec_len);
@@ -84,9 +84,9 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
84 if (error_msg != NULL) 84 if (error_msg != NULL)
85 ext4_error(dir->i_sb, function, 85 ext4_error(dir->i_sb, function,
86 "bad entry in directory #%lu: %s - " 86 "bad entry in directory #%lu: %s - "
87 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", 87 "offset=%u, inode=%u, rec_len=%d, name_len=%d",
88 dir->i_ino, error_msg, offset, 88 dir->i_ino, error_msg, offset,
89 (unsigned long) le32_to_cpu(de->inode), 89 le32_to_cpu(de->inode),
90 rlen, de->name_len); 90 rlen, de->name_len);
91 return error_msg == NULL ? 1 : 0; 91 return error_msg == NULL ? 1 : 0;
92} 92}
@@ -95,7 +95,7 @@ static int ext4_readdir(struct file *filp,
95 void *dirent, filldir_t filldir) 95 void *dirent, filldir_t filldir)
96{ 96{
97 int error = 0; 97 int error = 0;
98 unsigned long offset; 98 unsigned int offset;
99 int i, stored; 99 int i, stored;
100 struct ext4_dir_entry_2 *de; 100 struct ext4_dir_entry_2 *de;
101 struct super_block *sb; 101 struct super_block *sb;
@@ -405,7 +405,7 @@ static int call_filldir(struct file *filp, void *dirent,
405 sb = inode->i_sb; 405 sb = inode->i_sb;
406 406
407 if (!fname) { 407 if (!fname) {
408 printk(KERN_ERR "ext4: call_filldir: called with " 408 printk(KERN_ERR "EXT4-fs: call_filldir: called with "
409 "null fname?!?\n"); 409 "null fname?!?\n");
410 return 0; 410 return 0;
411 } 411 }
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b0537c827024..c668e4377d76 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -19,6 +19,7 @@
19#include <linux/types.h> 19#include <linux/types.h>
20#include <linux/blkdev.h> 20#include <linux/blkdev.h>
21#include <linux/magic.h> 21#include <linux/magic.h>
22#include <linux/jbd2.h>
22#include "ext4_i.h" 23#include "ext4_i.h"
23 24
24/* 25/*
@@ -94,9 +95,9 @@ struct ext4_allocation_request {
94 /* phys. block for ^^^ */ 95 /* phys. block for ^^^ */
95 ext4_fsblk_t pright; 96 ext4_fsblk_t pright;
96 /* how many blocks we want to allocate */ 97 /* how many blocks we want to allocate */
97 unsigned long len; 98 unsigned int len;
98 /* flags. see above EXT4_MB_HINT_* */ 99 /* flags. see above EXT4_MB_HINT_* */
99 unsigned long flags; 100 unsigned int flags;
100}; 101};
101 102
102/* 103/*
@@ -156,12 +157,12 @@ struct ext4_group_desc
156 __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ 157 __le32 bg_block_bitmap_lo; /* Blocks bitmap block */
157 __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ 158 __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */
158 __le32 bg_inode_table_lo; /* Inodes table block */ 159 __le32 bg_inode_table_lo; /* Inodes table block */
159 __le16 bg_free_blocks_count; /* Free blocks count */ 160 __le16 bg_free_blocks_count_lo;/* Free blocks count */
160 __le16 bg_free_inodes_count; /* Free inodes count */ 161 __le16 bg_free_inodes_count_lo;/* Free inodes count */
161 __le16 bg_used_dirs_count; /* Directories count */ 162 __le16 bg_used_dirs_count_lo; /* Directories count */
162 __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ 163 __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */
163 __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */ 164 __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */
164 __le16 bg_itable_unused; /* Unused inodes count */ 165 __le16 bg_itable_unused_lo; /* Unused inodes count */
165 __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ 166 __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */
166 __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ 167 __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */
167 __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ 168 __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */
@@ -169,7 +170,7 @@ struct ext4_group_desc
169 __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ 170 __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */
170 __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ 171 __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */
171 __le16 bg_used_dirs_count_hi; /* Directories count MSB */ 172 __le16 bg_used_dirs_count_hi; /* Directories count MSB */
172 __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ 173 __le16 bg_itable_unused_hi; /* Unused inodes count MSB */
173 __u32 bg_reserved2[3]; 174 __u32 bg_reserved2[3];
174}; 175};
175 176
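The renamed _lo fields pair with the existing _hi fields, and the new ext4_free_blks_count()-style accessors declared later in this header combine the halves. A plausible shape for one of them, assuming the 64-bit descriptor-size check ext4 uses elsewhere:

	__u32 ext4_free_blks_count(struct super_block *sb,
				   struct ext4_group_desc *bg)
	{
		__u32 count = le16_to_cpu(bg->bg_free_blocks_count_lo);

		/* Wide descriptors carry the upper 16 bits in the _hi field. */
		if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
			count |= (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16;
		return count;
	}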
@@ -328,6 +329,7 @@ struct ext4_mount_options {
328 uid_t s_resuid; 329 uid_t s_resuid;
329 gid_t s_resgid; 330 gid_t s_resgid;
330 unsigned long s_commit_interval; 331 unsigned long s_commit_interval;
332 u32 s_min_batch_time, s_max_batch_time;
331#ifdef CONFIG_QUOTA 333#ifdef CONFIG_QUOTA
332 int s_jquota_fmt; 334 int s_jquota_fmt;
333 char *s_qf_names[MAXQUOTAS]; 335 char *s_qf_names[MAXQUOTAS];
@@ -534,7 +536,6 @@ do { \
534#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ 536#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
535#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ 537#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
536#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ 538#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
537#define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */
538#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ 539#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
539#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 540#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
540#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 541#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
@@ -726,11 +727,11 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
726 */ 727 */
727 728
728#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ 729#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \
729 (EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) 730 ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0)
730#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ 731#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \
731 (EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) 732 ((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0)
732#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ 733#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \
733 (EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) 734 ((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0)
734#define EXT4_SET_COMPAT_FEATURE(sb,mask) \ 735#define EXT4_SET_COMPAT_FEATURE(sb,mask) \
735 EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) 736 EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
736#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ 737#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \
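The added "!= 0" matters because the masked value may have only a high bit set, and assigning the raw result to a narrower type silently drops it. An illustrative case (not from this diff):

	/* EXT4_FEATURE_INCOMPAT_FLEX_BG is 0x0200: with the old macro,
	 * storing the result in a char truncates 0x0200 to 0, wrongly
	 * reporting the feature absent; "!= 0" yields a clean 0/1 first. */
	char has_flex_bg = EXT4_HAS_INCOMPAT_FEATURE(sb,
					EXT4_FEATURE_INCOMPAT_FLEX_BG);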
@@ -806,6 +807,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
806#define EXT4_DEFM_JMODE_WBACK 0x0060 807#define EXT4_DEFM_JMODE_WBACK 0x0060
807 808
808/* 809/*
810 * Default journal batch times
811 */
812#define EXT4_DEF_MIN_BATCH_TIME 0
813#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
814
815/*
809 * Structure of a directory entry 816 * Structure of a directory entry
810 */ 817 */
811#define EXT4_NAME_LEN 255 818#define EXT4_NAME_LEN 255
@@ -891,6 +898,9 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len)
891#define DX_HASH_LEGACY 0 898#define DX_HASH_LEGACY 0
892#define DX_HASH_HALF_MD4 1 899#define DX_HASH_HALF_MD4 1
893#define DX_HASH_TEA 2 900#define DX_HASH_TEA 2
901#define DX_HASH_LEGACY_UNSIGNED 3
902#define DX_HASH_HALF_MD4_UNSIGNED 4
903#define DX_HASH_TEA_UNSIGNED 5
894 904
895#ifdef __KERNEL__ 905#ifdef __KERNEL__
896 906
@@ -955,7 +965,7 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
955#define ERR_BAD_DX_DIR -75000 965#define ERR_BAD_DX_DIR -75000
956 966
957void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, 967void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
958 unsigned long *blockgrpp, ext4_grpblk_t *offsetp); 968 ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
959 969
960extern struct proc_dir_entry *ext4_proc_root; 970extern struct proc_dir_entry *ext4_proc_root;
961 971
@@ -987,6 +997,9 @@ do { \
987# define ATTRIB_NORET __attribute__((noreturn)) 997# define ATTRIB_NORET __attribute__((noreturn))
988# define NORET_AND noreturn, 998# define NORET_AND noreturn,
989 999
1000/* bitmap.c */
1001extern unsigned int ext4_count_free(struct buffer_head *, unsigned);
1002
990/* balloc.c */ 1003/* balloc.c */
991extern unsigned int ext4_block_group(struct super_block *sb, 1004extern unsigned int ext4_block_group(struct super_block *sb,
992 ext4_fsblk_t blocknr); 1005 ext4_fsblk_t blocknr);
@@ -995,20 +1008,14 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
995extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); 1008extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
996extern unsigned long ext4_bg_num_gdb(struct super_block *sb, 1009extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
997 ext4_group_t group); 1010 ext4_group_t group);
998extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
999 ext4_fsblk_t goal, int *errp);
1000extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, 1011extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
1001 ext4_fsblk_t goal, unsigned long *count, int *errp); 1012 ext4_fsblk_t goal, unsigned long *count, int *errp);
1002extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
1003 ext4_lblk_t iblock, ext4_fsblk_t goal,
1004 unsigned long *count, int *errp);
1005extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); 1013extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
1006extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); 1014extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
1007extern void ext4_free_blocks(handle_t *handle, struct inode *inode, 1015extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
1008 ext4_fsblk_t block, unsigned long count, int metadata); 1016 ext4_fsblk_t block, unsigned long count, int metadata);
1009extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, 1017extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
1010 ext4_fsblk_t block, unsigned long count, 1018 ext4_fsblk_t block, unsigned long count);
1011 unsigned long *pdquot_freed_blocks);
1012extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); 1019extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
1013extern void ext4_check_blocks_bitmap(struct super_block *); 1020extern void ext4_check_blocks_bitmap(struct super_block *);
1014extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, 1021extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
@@ -1019,7 +1026,7 @@ extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
1019/* dir.c */ 1026/* dir.c */
1020extern int ext4_check_dir_entry(const char *, struct inode *, 1027extern int ext4_check_dir_entry(const char *, struct inode *,
1021 struct ext4_dir_entry_2 *, 1028 struct ext4_dir_entry_2 *,
1022 struct buffer_head *, unsigned long); 1029 struct buffer_head *, unsigned int);
1023extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, 1030extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
1024 __u32 minor_hash, 1031 __u32 minor_hash,
1025 struct ext4_dir_entry_2 *dirent); 1032 struct ext4_dir_entry_2 *dirent);
@@ -1039,7 +1046,6 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
1039extern unsigned long ext4_count_free_inodes(struct super_block *); 1046extern unsigned long ext4_count_free_inodes(struct super_block *);
1040extern unsigned long ext4_count_dirs(struct super_block *); 1047extern unsigned long ext4_count_dirs(struct super_block *);
1041extern void ext4_check_inodes_bitmap(struct super_block *); 1048extern void ext4_check_inodes_bitmap(struct super_block *);
1042extern unsigned long ext4_count_free(struct buffer_head *, unsigned);
1043 1049
1044/* mballoc.c */ 1050/* mballoc.c */
1045extern long ext4_mb_stats; 1051extern long ext4_mb_stats;
@@ -1054,12 +1060,13 @@ extern int __init init_ext4_mballoc(void);
1054extern void exit_ext4_mballoc(void); 1060extern void exit_ext4_mballoc(void);
1055extern void ext4_mb_free_blocks(handle_t *, struct inode *, 1061extern void ext4_mb_free_blocks(handle_t *, struct inode *,
1056 unsigned long, unsigned long, int, unsigned long *); 1062 unsigned long, unsigned long, int, unsigned long *);
1057extern int ext4_mb_add_more_groupinfo(struct super_block *sb, 1063extern int ext4_mb_add_groupinfo(struct super_block *sb,
1058 ext4_group_t i, struct ext4_group_desc *desc); 1064 ext4_group_t i, struct ext4_group_desc *desc);
1059extern void ext4_mb_update_group_info(struct ext4_group_info *grp, 1065extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
1060 ext4_grpblk_t add); 1066 ext4_grpblk_t add);
1061 1067extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
1062 1068extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
1069 ext4_group_t, int);
1063/* inode.c */ 1070/* inode.c */
1064int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, 1071int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
1065 struct buffer_head *bh, ext4_fsblk_t blocknr); 1072 struct buffer_head *bh, ext4_fsblk_t blocknr);
@@ -1069,10 +1076,6 @@ struct buffer_head *ext4_bread(handle_t *, struct inode *,
1069 ext4_lblk_t, int, int *); 1076 ext4_lblk_t, int, int *);
1070int ext4_get_block(struct inode *inode, sector_t iblock, 1077int ext4_get_block(struct inode *inode, sector_t iblock,
1071 struct buffer_head *bh_result, int create); 1078 struct buffer_head *bh_result, int create);
1072int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
1073 ext4_lblk_t iblock, unsigned long maxblocks,
1074 struct buffer_head *bh_result,
1075 int create, int extend_disksize);
1076 1079
1077extern struct inode *ext4_iget(struct super_block *, unsigned long); 1080extern struct inode *ext4_iget(struct super_block *, unsigned long);
1078extern int ext4_write_inode(struct inode *, int); 1081extern int ext4_write_inode(struct inode *, int);
@@ -1123,6 +1126,9 @@ extern void ext4_abort(struct super_block *, const char *, const char *, ...)
1123 __attribute__ ((format (printf, 3, 4))); 1126 __attribute__ ((format (printf, 3, 4)));
1124extern void ext4_warning(struct super_block *, const char *, const char *, ...) 1127extern void ext4_warning(struct super_block *, const char *, const char *, ...)
1125 __attribute__ ((format (printf, 3, 4))); 1128 __attribute__ ((format (printf, 3, 4)));
1129extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
1130 const char *, const char *, ...)
1131 __attribute__ ((format (printf, 4, 5)));
1126extern void ext4_update_dynamic_rev(struct super_block *sb); 1132extern void ext4_update_dynamic_rev(struct super_block *sb);
1127extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, 1133extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
1128 __u32 compat); 1134 __u32 compat);
@@ -1136,12 +1142,28 @@ extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
1136 struct ext4_group_desc *bg); 1142 struct ext4_group_desc *bg);
1137extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, 1143extern ext4_fsblk_t ext4_inode_table(struct super_block *sb,
1138 struct ext4_group_desc *bg); 1144 struct ext4_group_desc *bg);
1145extern __u32 ext4_free_blks_count(struct super_block *sb,
1146 struct ext4_group_desc *bg);
1147extern __u32 ext4_free_inodes_count(struct super_block *sb,
1148 struct ext4_group_desc *bg);
1149extern __u32 ext4_used_dirs_count(struct super_block *sb,
1150 struct ext4_group_desc *bg);
1151extern __u32 ext4_itable_unused_count(struct super_block *sb,
1152 struct ext4_group_desc *bg);
1139extern void ext4_block_bitmap_set(struct super_block *sb, 1153extern void ext4_block_bitmap_set(struct super_block *sb,
1140 struct ext4_group_desc *bg, ext4_fsblk_t blk); 1154 struct ext4_group_desc *bg, ext4_fsblk_t blk);
1141extern void ext4_inode_bitmap_set(struct super_block *sb, 1155extern void ext4_inode_bitmap_set(struct super_block *sb,
1142 struct ext4_group_desc *bg, ext4_fsblk_t blk); 1156 struct ext4_group_desc *bg, ext4_fsblk_t blk);
1143extern void ext4_inode_table_set(struct super_block *sb, 1157extern void ext4_inode_table_set(struct super_block *sb,
1144 struct ext4_group_desc *bg, ext4_fsblk_t blk); 1158 struct ext4_group_desc *bg, ext4_fsblk_t blk);
1159extern void ext4_free_blks_set(struct super_block *sb,
1160 struct ext4_group_desc *bg, __u32 count);
1161extern void ext4_free_inodes_set(struct super_block *sb,
1162 struct ext4_group_desc *bg, __u32 count);
1163extern void ext4_used_dirs_set(struct super_block *sb,
1164 struct ext4_group_desc *bg, __u32 count);
1165extern void ext4_itable_unused_set(struct super_block *sb,
1166 struct ext4_group_desc *bg, __u32 count);
1145 1167
1146static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) 1168static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
1147{ 1169{
@@ -1225,11 +1247,11 @@ do { \
1225} while (0) 1247} while (0)
1226 1248
1227#ifdef CONFIG_SMP 1249#ifdef CONFIG_SMP
1228/* Each CPU can accumulate FBC_BATCH blocks in its local 1250/* Each CPU can accumulate percpu_counter_batch blocks in its local
1229 * counters. So we need to make sure we have more free blocks 1251 * counters. So we need to make sure we have more free blocks
1230 * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times. 1252 * than percpu_counter_batch * nr_cpu_ids, plus a 4x safety margin.
1231 */ 1253 */
1232#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids)) 1254#define EXT4_FREEBLOCKS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))
1233#else 1255#else
1234#define EXT4_FREEBLOCKS_WATERMARK 0 1256#define EXT4_FREEBLOCKS_WATERMARK 0
1235#endif 1257#endif
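As a worked example with assumed values: if percpu_counter_batch is 32 and nr_cpu_ids is 4, EXT4_FREEBLOCKS_WATERMARK = 4 * (32 * 4) = 512 free blocks. Below roughly that point, each CPU's unflushed per-CPU delta could hide up to a batch worth of blocks, so the cheap percpu estimate can no longer be trusted and the exact summed counter must be consulted instead.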
@@ -1246,6 +1268,50 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
1246 return ; 1268 return ;
1247} 1269}
1248 1270
1271struct ext4_group_info {
1272 unsigned long bb_state;
1273 struct rb_root bb_free_root;
1274 unsigned short bb_first_free;
1275 unsigned short bb_free;
1276 unsigned short bb_fragments;
1277 struct list_head bb_prealloc_list;
1278#ifdef DOUBLE_CHECK
1279 void *bb_bitmap;
1280#endif
1281 struct rw_semaphore alloc_sem;
1282 unsigned short bb_counters[];
1283};
1284
1285#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
1286#define EXT4_GROUP_INFO_LOCKED_BIT 1
1287
1288#define EXT4_MB_GRP_NEED_INIT(grp) \
1289 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
1290
1291static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
1292{
1293 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
1294
1295 bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
1296}
1297
1298static inline void ext4_unlock_group(struct super_block *sb,
1299 ext4_group_t group)
1300{
1301 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
1302
1303 bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
1304}
1305
1306static inline int ext4_is_group_locked(struct super_block *sb,
1307 ext4_group_t group)
1308{
1309 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
1310
1311 return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
1312 &(grinfo->bb_state));
1313}
1314
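Typical use of the new group-lock helpers, as a sketch (the accounting in the middle is illustrative):

	ext4_lock_group(sb, group);
	/* mutate this group's bitmap/buddy-derived state under the
	 * per-group bit spinlock rather than a filesystem-wide lock */
	ext4_unlock_group(sb, group);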
1249/* 1315/*
1250 * Inodes and files operations 1316 * Inodes and files operations
1251 */ 1317 */
@@ -1271,18 +1337,38 @@ extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
1271extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, 1337extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
1272 int chunk); 1338 int chunk);
1273extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 1339extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1274 ext4_lblk_t iblock, 1340 ext4_lblk_t iblock, unsigned int max_blocks,
1275 unsigned long max_blocks, struct buffer_head *bh_result, 1341 struct buffer_head *bh_result,
1276 int create, int extend_disksize); 1342 int create, int extend_disksize);
1277extern void ext4_ext_truncate(struct inode *); 1343extern void ext4_ext_truncate(struct inode *);
1278extern void ext4_ext_init(struct super_block *); 1344extern void ext4_ext_init(struct super_block *);
1279extern void ext4_ext_release(struct super_block *); 1345extern void ext4_ext_release(struct super_block *);
1280extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, 1346extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1281 loff_t len); 1347 loff_t len);
1282extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, 1348extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
1283 sector_t block, unsigned long max_blocks, 1349 sector_t block, unsigned int max_blocks,
1284 struct buffer_head *bh, int create, 1350 struct buffer_head *bh, int create,
1285 int extend_disksize, int flag); 1351 int extend_disksize, int flag);
1352extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1353 __u64 start, __u64 len);
1354
1355/*
1356 * Add a new method to test whether block and inode bitmaps are properly
1357 * initialized. With uninit_bg, reading the block from disk is not enough
1358 * to mark the bitmap uptodate; we also need to zero out the bitmap.
1359 */
1360#define BH_BITMAP_UPTODATE BH_JBDPrivateStart
1361
1362static inline int bitmap_uptodate(struct buffer_head *bh)
1363{
1364 return (buffer_uptodate(bh) &&
1365 test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state));
1366}
1367static inline void set_bitmap_uptodate(struct buffer_head *bh)
1368{
1369 set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
1370}
1371
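A sketch of the intended read path, under the assumption described in the comment above (the initialization step is illustrative):

	if (!bitmap_uptodate(bh)) {
		/* uninit_bg: the on-disk block may be stale, so the bitmap
		 * contents are constructed in memory rather than trusted */
		/* ... zero-fill / initialize bh->b_data here ... */
		set_bitmap_uptodate(bh);
	}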
1286#endif /* __KERNEL__ */ 1372#endif /* __KERNEL__ */
1287 1373
1288#endif /* _EXT4_H */ 1374#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index bec7ce59fc0d..18cb67b2cbbc 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -194,11 +194,6 @@ static inline unsigned short ext_depth(struct inode *inode)
194 return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); 194 return le16_to_cpu(ext_inode_hdr(inode)->eh_depth);
195} 195}
196 196
197static inline void ext4_ext_tree_changed(struct inode *inode)
198{
199 EXT4_I(inode)->i_ext_generation++;
200}
201
202static inline void 197static inline void
203ext4_ext_invalidate_cache(struct inode *inode) 198ext4_ext_invalidate_cache(struct inode *inode)
204{ 199{
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 5c124c0ac6d3..e69acc16f5c4 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -31,7 +31,7 @@ typedef unsigned long long ext4_fsblk_t;
31typedef __u32 ext4_lblk_t; 31typedef __u32 ext4_lblk_t;
32 32
33/* data type for block group number */ 33/* data type for block group number */
34typedef unsigned long ext4_group_t; 34typedef unsigned int ext4_group_t;
35 35
36#define rsv_start rsv_window._rsv_start 36#define rsv_start rsv_window._rsv_start
37#define rsv_end rsv_window._rsv_end 37#define rsv_end rsv_window._rsv_end
@@ -100,9 +100,6 @@ struct ext4_inode_info {
100 */ 100 */
101 loff_t i_disksize; 101 loff_t i_disksize;
102 102
103 /* on-disk additional length */
104 __u16 i_extra_isize;
105
106 /* 103 /*
107 * i_data_sem is for serialising ext4_truncate() against 104 * i_data_sem is for serialising ext4_truncate() against
108 * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's 105 * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's
@@ -117,7 +114,6 @@ struct ext4_inode_info {
117 struct inode vfs_inode; 114 struct inode vfs_inode;
118 struct jbd2_inode jinode; 115 struct jbd2_inode jinode;
119 116
120 unsigned long i_ext_generation;
121 struct ext4_ext_cache i_cached_extent; 117 struct ext4_ext_cache i_cached_extent;
122 /* 118 /*
123 * File creation time. Its function is same as that of 119 * File creation time. Its function is same as that of
@@ -130,10 +126,14 @@ struct ext4_inode_info {
130 spinlock_t i_prealloc_lock; 126 spinlock_t i_prealloc_lock;
131 127
132 /* allocation reservation info for delalloc */ 128 /* allocation reservation info for delalloc */
133 unsigned long i_reserved_data_blocks; 129 unsigned int i_reserved_data_blocks;
134 unsigned long i_reserved_meta_blocks; 130 unsigned int i_reserved_meta_blocks;
135 unsigned long i_allocated_meta_blocks; 131 unsigned int i_allocated_meta_blocks;
136 unsigned short i_delalloc_reserved_flag; 132 unsigned short i_delalloc_reserved_flag;
133
134 /* on-disk additional length */
135 __u16 i_extra_isize;
136
137 spinlock_t i_block_reservation_lock; 137 spinlock_t i_block_reservation_lock;
138}; 138};
139 139
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index c75384b34f2c..ad13a84644e1 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -7,53 +7,96 @@
7int __ext4_journal_get_undo_access(const char *where, handle_t *handle, 7int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
8 struct buffer_head *bh) 8 struct buffer_head *bh)
9{ 9{
10 int err = jbd2_journal_get_undo_access(handle, bh); 10 int err = 0;
11 if (err) 11
12 ext4_journal_abort_handle(where, __func__, bh, handle, err); 12 if (ext4_handle_valid(handle)) {
13 err = jbd2_journal_get_undo_access(handle, bh);
14 if (err)
15 ext4_journal_abort_handle(where, __func__, bh,
16 handle, err);
17 }
13 return err; 18 return err;
14} 19}
15 20
16int __ext4_journal_get_write_access(const char *where, handle_t *handle, 21int __ext4_journal_get_write_access(const char *where, handle_t *handle,
17 struct buffer_head *bh) 22 struct buffer_head *bh)
18{ 23{
19 int err = jbd2_journal_get_write_access(handle, bh); 24 int err = 0;
20 if (err) 25
21 ext4_journal_abort_handle(where, __func__, bh, handle, err); 26 if (ext4_handle_valid(handle)) {
27 err = jbd2_journal_get_write_access(handle, bh);
28 if (err)
29 ext4_journal_abort_handle(where, __func__, bh,
30 handle, err);
31 }
22 return err; 32 return err;
23} 33}
24 34
25int __ext4_journal_forget(const char *where, handle_t *handle, 35int __ext4_journal_forget(const char *where, handle_t *handle,
26 struct buffer_head *bh) 36 struct buffer_head *bh)
27{ 37{
28 int err = jbd2_journal_forget(handle, bh); 38 int err = 0;
29 if (err) 39
30 ext4_journal_abort_handle(where, __func__, bh, handle, err); 40 if (ext4_handle_valid(handle)) {
41 err = jbd2_journal_forget(handle, bh);
42 if (err)
43 ext4_journal_abort_handle(where, __func__, bh,
44 handle, err);
45 }
31 return err; 46 return err;
32} 47}
33 48
34int __ext4_journal_revoke(const char *where, handle_t *handle, 49int __ext4_journal_revoke(const char *where, handle_t *handle,
35 ext4_fsblk_t blocknr, struct buffer_head *bh) 50 ext4_fsblk_t blocknr, struct buffer_head *bh)
36{ 51{
37 int err = jbd2_journal_revoke(handle, blocknr, bh); 52 int err = 0;
38 if (err) 53
39 ext4_journal_abort_handle(where, __func__, bh, handle, err); 54 if (ext4_handle_valid(handle)) {
55 err = jbd2_journal_revoke(handle, blocknr, bh);
56 if (err)
57 ext4_journal_abort_handle(where, __func__, bh,
58 handle, err);
59 }
40 return err; 60 return err;
41} 61}
42 62
43int __ext4_journal_get_create_access(const char *where, 63int __ext4_journal_get_create_access(const char *where,
44 handle_t *handle, struct buffer_head *bh) 64 handle_t *handle, struct buffer_head *bh)
45{ 65{
46 int err = jbd2_journal_get_create_access(handle, bh); 66 int err = 0;
47 if (err) 67
48 ext4_journal_abort_handle(where, __func__, bh, handle, err); 68 if (ext4_handle_valid(handle)) {
69 err = jbd2_journal_get_create_access(handle, bh);
70 if (err)
71 ext4_journal_abort_handle(where, __func__, bh,
72 handle, err);
73 }
49 return err; 74 return err;
50} 75}
51 76
52int __ext4_journal_dirty_metadata(const char *where, 77int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
53 handle_t *handle, struct buffer_head *bh) 78 struct inode *inode, struct buffer_head *bh)
54{ 79{
55 int err = jbd2_journal_dirty_metadata(handle, bh); 80 int err = 0;
56 if (err) 81
57 ext4_journal_abort_handle(where, __func__, bh, handle, err); 82 if (ext4_handle_valid(handle)) {
83 err = jbd2_journal_dirty_metadata(handle, bh);
84 if (err)
85 ext4_journal_abort_handle(where, __func__, bh,
86 handle, err);
87 } else {
88 mark_buffer_dirty(bh);
89 if (inode && inode_needs_sync(inode)) {
90 sync_dirty_buffer(bh);
91 if (buffer_req(bh) && !buffer_uptodate(bh)) {
92 ext4_error(inode->i_sb, __func__,
93 "IO error syncing inode, "
94 "inode=%lu, block=%llu",
95 inode->i_ino,
96 (unsigned long long) bh->b_blocknr);
97 err = -EIO;
98 }
99 }
100 }
58 return err; 101 return err;
59} 102}
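Passing the inode lets the handle-less branch honor inode_needs_sync(): without a journal the call degrades to an ordinary mark_buffer_dirty(), syncing immediately only for inodes that require it. Callers keep the same one-line pattern (sketch):

	err = ext4_handle_dirty_metadata(handle, inode, bh);
	if (err)
		goto out;	/* journaled or not, errors surface here */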
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b455c685a98b..be2f426f6805 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -32,8 +32,8 @@
32 * 5 levels of tree + root which are stored in the inode. */ 32 * 5 levels of tree + root which are stored in the inode. */
33 33
34#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ 34#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \
35 (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ 35 (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \
36 || test_opt(sb, EXTENTS) ? 27U : 8U) 36 ? 27U : 8U)
37 37
38/* Extended attribute operations touch at most two data buffers, 38/* Extended attribute operations touch at most two data buffers,
39 * two bitmap buffers, and two group summaries, in addition to the inode 39 * two bitmap buffers, and two group summaries, in addition to the inode
@@ -122,12 +122,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
122 * been done yet. 122 * been done yet.
123 */ 123 */
124 124
125static inline void ext4_journal_release_buffer(handle_t *handle,
126 struct buffer_head *bh)
127{
128 jbd2_journal_release_buffer(handle, bh);
129}
130
131void ext4_journal_abort_handle(const char *caller, const char *err_fn, 125void ext4_journal_abort_handle(const char *caller, const char *err_fn,
132 struct buffer_head *bh, handle_t *handle, int err); 126 struct buffer_head *bh, handle_t *handle, int err);
133 127
@@ -146,8 +140,8 @@ int __ext4_journal_revoke(const char *where, handle_t *handle,
146int __ext4_journal_get_create_access(const char *where, 140int __ext4_journal_get_create_access(const char *where,
147 handle_t *handle, struct buffer_head *bh); 141 handle_t *handle, struct buffer_head *bh);
148 142
149int __ext4_journal_dirty_metadata(const char *where, 143int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
150 handle_t *handle, struct buffer_head *bh); 144 struct inode *inode, struct buffer_head *bh);
151 145
152#define ext4_journal_get_undo_access(handle, bh) \ 146#define ext4_journal_get_undo_access(handle, bh) \
153 __ext4_journal_get_undo_access(__func__, (handle), (bh)) 147 __ext4_journal_get_undo_access(__func__, (handle), (bh))
@@ -157,14 +151,57 @@ int __ext4_journal_dirty_metadata(const char *where,
157 __ext4_journal_revoke(__func__, (handle), (blocknr), (bh)) 151 __ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
158#define ext4_journal_get_create_access(handle, bh) \ 152#define ext4_journal_get_create_access(handle, bh) \
159 __ext4_journal_get_create_access(__func__, (handle), (bh)) 153 __ext4_journal_get_create_access(__func__, (handle), (bh))
160#define ext4_journal_dirty_metadata(handle, bh) \
161 __ext4_journal_dirty_metadata(__func__, (handle), (bh))
162#define ext4_journal_forget(handle, bh) \ 154#define ext4_journal_forget(handle, bh) \
163 __ext4_journal_forget(__func__, (handle), (bh)) 155 __ext4_journal_forget(__func__, (handle), (bh))
156#define ext4_handle_dirty_metadata(handle, inode, bh) \
157 __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh))
164 158
165handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); 159handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
166int __ext4_journal_stop(const char *where, handle_t *handle); 160int __ext4_journal_stop(const char *where, handle_t *handle);
167 161
162#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1)
163
164static inline int ext4_handle_valid(handle_t *handle)
165{
166 if (handle == EXT4_NOJOURNAL_HANDLE)
167 return 0;
168 return 1;
169}
170
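How the sentinel is handed out is implied rather than shown in this hunk; presumably the journal-start path does something like the following when no journal is present (assumption, not part of this diff):

	if (EXT4_SB(sb)->s_journal == NULL) {
		/* no-journal mode: return the magic cookie so every
		 * ext4_handle_*() wrapper above becomes a no-op */
		return EXT4_NOJOURNAL_HANDLE;
	}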
171static inline void ext4_handle_sync(handle_t *handle)
172{
173 if (ext4_handle_valid(handle))
174 handle->h_sync = 1;
175}
176
177static inline void ext4_handle_release_buffer(handle_t *handle,
178 struct buffer_head *bh)
179{
180 if (ext4_handle_valid(handle))
181 jbd2_journal_release_buffer(handle, bh);
182}
183
184static inline int ext4_handle_is_aborted(handle_t *handle)
185{
186 if (ext4_handle_valid(handle))
187 return is_handle_aborted(handle);
188 return 0;
189}
190
191static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
192{
193 if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed)
194 return 0;
195 return 1;
196}
197
198static inline void ext4_journal_release_buffer(handle_t *handle,
199 struct buffer_head *bh)
200{
201 if (ext4_handle_valid(handle))
202 jbd2_journal_release_buffer(handle, bh);
203}
204
168static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) 205static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
169{ 206{
170 return ext4_journal_start_sb(inode->i_sb, nblocks); 207 return ext4_journal_start_sb(inode->i_sb, nblocks);
@@ -180,27 +217,37 @@ static inline handle_t *ext4_journal_current_handle(void)
180 217
181static inline int ext4_journal_extend(handle_t *handle, int nblocks) 218static inline int ext4_journal_extend(handle_t *handle, int nblocks)
182{ 219{
183 return jbd2_journal_extend(handle, nblocks); 220 if (ext4_handle_valid(handle))
221 return jbd2_journal_extend(handle, nblocks);
222 return 0;
184} 223}
185 224
186static inline int ext4_journal_restart(handle_t *handle, int nblocks) 225static inline int ext4_journal_restart(handle_t *handle, int nblocks)
187{ 226{
188 return jbd2_journal_restart(handle, nblocks); 227 if (ext4_handle_valid(handle))
228 return jbd2_journal_restart(handle, nblocks);
229 return 0;
189} 230}
190 231
191static inline int ext4_journal_blocks_per_page(struct inode *inode) 232static inline int ext4_journal_blocks_per_page(struct inode *inode)
192{ 233{
193 return jbd2_journal_blocks_per_page(inode); 234 if (EXT4_JOURNAL(inode) != NULL)
235 return jbd2_journal_blocks_per_page(inode);
236 return 0;
194} 237}
195 238
196static inline int ext4_journal_force_commit(journal_t *journal) 239static inline int ext4_journal_force_commit(journal_t *journal)
197{ 240{
198 return jbd2_journal_force_commit(journal); 241 if (journal)
242 return jbd2_journal_force_commit(journal);
243 return 0;
199} 244}
200 245
201static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) 246static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
202{ 247{
203 return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); 248 if (ext4_handle_valid(handle))
249 return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
250 return 0;
204} 251}
205 252
206/* super.c */ 253/* super.c */
@@ -208,6 +255,8 @@ int ext4_force_commit(struct super_block *sb);
208 255
209static inline int ext4_should_journal_data(struct inode *inode) 256static inline int ext4_should_journal_data(struct inode *inode)
210{ 257{
258 if (EXT4_JOURNAL(inode) == NULL)
259 return 0;
211 if (!S_ISREG(inode->i_mode)) 260 if (!S_ISREG(inode->i_mode))
212 return 1; 261 return 1;
213 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) 262 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
@@ -219,6 +268,8 @@ static inline int ext4_should_journal_data(struct inode *inode)
219 268
220static inline int ext4_should_order_data(struct inode *inode) 269static inline int ext4_should_order_data(struct inode *inode)
221{ 270{
271 if (EXT4_JOURNAL(inode) == NULL)
272 return 0;
222 if (!S_ISREG(inode->i_mode)) 273 if (!S_ISREG(inode->i_mode))
223 return 0; 274 return 0;
224 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 275 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
@@ -230,6 +281,8 @@ static inline int ext4_should_order_data(struct inode *inode)
230 281
231static inline int ext4_should_writeback_data(struct inode *inode) 282static inline int ext4_should_writeback_data(struct inode *inode)
232{ 283{
284 if (EXT4_JOURNAL(inode) == NULL)
285 return 0;
233 if (!S_ISREG(inode->i_mode)) 286 if (!S_ISREG(inode->i_mode))
234 return 0; 287 return 0;
235 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 288 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 445fde603df8..039b6ea1a042 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -57,6 +57,7 @@ struct ext4_sb_info {
57 u32 s_next_generation; 57 u32 s_next_generation;
58 u32 s_hash_seed[4]; 58 u32 s_hash_seed[4];
59 int s_def_hash_version; 59 int s_def_hash_version;
60 int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */
60 struct percpu_counter s_freeblocks_counter; 61 struct percpu_counter s_freeblocks_counter;
61 struct percpu_counter s_freeinodes_counter; 62 struct percpu_counter s_freeinodes_counter;
62 struct percpu_counter s_dirs_counter; 63 struct percpu_counter s_dirs_counter;
@@ -73,6 +74,8 @@ struct ext4_sb_info {
73 struct journal_s *s_journal; 74 struct journal_s *s_journal;
74 struct list_head s_orphan; 75 struct list_head s_orphan;
75 unsigned long s_commit_interval; 76 unsigned long s_commit_interval;
77 u32 s_max_batch_time;
78 u32 s_min_batch_time;
76 struct block_device *journal_bdev; 79 struct block_device *journal_bdev;
77#ifdef CONFIG_JBD2_DEBUG 80#ifdef CONFIG_JBD2_DEBUG
78 struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ 81 struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
@@ -101,7 +104,8 @@ struct ext4_sb_info {
101 spinlock_t s_reserve_lock; 104 spinlock_t s_reserve_lock;
102 spinlock_t s_md_lock; 105 spinlock_t s_md_lock;
103 tid_t s_last_transaction; 106 tid_t s_last_transaction;
104 unsigned short *s_mb_offsets, *s_mb_maxs; 107 unsigned short *s_mb_offsets;
108 unsigned int *s_mb_maxs;
105 109
106 /* tunables */ 110 /* tunables */
107 unsigned long s_stripe; 111 unsigned long s_stripe;
@@ -146,4 +150,10 @@ struct ext4_sb_info {
146 struct flex_groups *s_flex_groups; 150 struct flex_groups *s_flex_groups;
147}; 151};
148 152
153static inline spinlock_t *
154sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
155{
156 return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group);
157}
158
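sb_bgl_lock() just resolves the per-group spinlock out of the shared blockgroup lock; usage follows the usual pattern (sketch):

	spin_lock(sb_bgl_lock(sbi, group));
	/* ... update this group's descriptor fields ... */
	spin_unlock(sb_bgl_lock(sbi, group));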
149#endif /* _EXT4_SB */ 159#endif /* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ea2ce3c0ae66..54bf0623a9ae 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -97,6 +97,8 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed)
97{ 97{
98 int err; 98 int err;
99 99
100 if (!ext4_handle_valid(handle))
101 return 0;
100 if (handle->h_buffer_credits > needed) 102 if (handle->h_buffer_credits > needed)
101 return 0; 103 return 0;
102 err = ext4_journal_extend(handle, needed); 104 err = ext4_journal_extend(handle, needed);
@@ -134,7 +136,7 @@ static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
134 int err; 136 int err;
135 if (path->p_bh) { 137 if (path->p_bh) {
136 /* path points to block */ 138 /* path points to block */
137 err = ext4_journal_dirty_metadata(handle, path->p_bh); 139 err = ext4_handle_dirty_metadata(handle, inode, path->p_bh);
138 } else { 140 } else {
139 /* path points to leaf/index in inode body */ 141 /* path points to leaf/index in inode body */
140 err = ext4_mark_inode_dirty(handle, inode); 142 err = ext4_mark_inode_dirty(handle, inode);
@@ -191,7 +193,7 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
191 ext4_fsblk_t goal, newblock; 193 ext4_fsblk_t goal, newblock;
192 194
193 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); 195 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
194 newblock = ext4_new_meta_block(handle, inode, goal, err); 196 newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
195 return newblock; 197 return newblock;
196} 198}
197 199
@@ -780,7 +782,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
780 set_buffer_uptodate(bh); 782 set_buffer_uptodate(bh);
781 unlock_buffer(bh); 783 unlock_buffer(bh);
782 784
783 err = ext4_journal_dirty_metadata(handle, bh); 785 err = ext4_handle_dirty_metadata(handle, inode, bh);
784 if (err) 786 if (err)
785 goto cleanup; 787 goto cleanup;
786 brelse(bh); 788 brelse(bh);
@@ -859,7 +861,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
859 set_buffer_uptodate(bh); 861 set_buffer_uptodate(bh);
860 unlock_buffer(bh); 862 unlock_buffer(bh);
861 863
862 err = ext4_journal_dirty_metadata(handle, bh); 864 err = ext4_handle_dirty_metadata(handle, inode, bh);
863 if (err) 865 if (err)
864 goto cleanup; 866 goto cleanup;
865 brelse(bh); 867 brelse(bh);
@@ -955,7 +957,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
955 set_buffer_uptodate(bh); 957 set_buffer_uptodate(bh);
956 unlock_buffer(bh); 958 unlock_buffer(bh);
957 959
958 err = ext4_journal_dirty_metadata(handle, bh); 960 err = ext4_handle_dirty_metadata(handle, inode, bh);
959 if (err) 961 if (err)
960 goto out; 962 goto out;
961 963
@@ -1160,15 +1162,13 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1160 while (--depth >= 0) { 1162 while (--depth >= 0) {
1161 ix = path[depth].p_idx; 1163 ix = path[depth].p_idx;
1162 if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) 1164 if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1163 break; 1165 goto got_index;
1164 } 1166 }
1165 1167
1166 if (depth < 0) { 1168 /* we've gone up to the root and found no index to the right */
1167 /* we've gone up to the root and 1169 return 0;
1168 * found no index to the right */
1169 return 0;
1170 }
1171 1170
1171got_index:
1172 /* we've found index to the right, let's 1172 /* we've found index to the right, let's
1173 * follow it and find the closest allocated 1173 * follow it and find the closest allocated
1174 * block to the right */ 1174 * block to the right */
@@ -1201,7 +1201,6 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
1201 *phys = ext_pblock(ex); 1201 *phys = ext_pblock(ex);
1202 put_bh(bh); 1202 put_bh(bh);
1203 return 0; 1203 return 0;
1204
1205} 1204}
1206 1205
1207/* 1206/*
@@ -1622,7 +1621,6 @@ cleanup:
1622 ext4_ext_drop_refs(npath); 1621 ext4_ext_drop_refs(npath);
1623 kfree(npath); 1622 kfree(npath);
1624 } 1623 }
1625 ext4_ext_tree_changed(inode);
1626 ext4_ext_invalidate_cache(inode); 1624 ext4_ext_invalidate_cache(inode);
1627 return err; 1625 return err;
1628} 1626}
@@ -2233,7 +2231,6 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2233 } 2231 }
2234 } 2232 }
2235out: 2233out:
2236 ext4_ext_tree_changed(inode);
2237 ext4_ext_drop_refs(path); 2234 ext4_ext_drop_refs(path);
2238 kfree(path); 2235 kfree(path);
2239 ext4_journal_stop(handle); 2236 ext4_journal_stop(handle);
@@ -2250,7 +2247,7 @@ void ext4_ext_init(struct super_block *sb)
2250 * possible initialization would be here 2247 * possible initialization would be here
2251 */ 2248 */
2252 2249
2253 if (test_opt(sb, EXTENTS)) { 2250 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2254 printk(KERN_INFO "EXT4-fs: file extents enabled"); 2251 printk(KERN_INFO "EXT4-fs: file extents enabled");
2255#ifdef AGGRESSIVE_TEST 2252#ifdef AGGRESSIVE_TEST
2256 printk(", aggressive tests"); 2253 printk(", aggressive tests");
@@ -2275,7 +2272,7 @@ void ext4_ext_init(struct super_block *sb)
2275 */ 2272 */
2276void ext4_ext_release(struct super_block *sb) 2273void ext4_ext_release(struct super_block *sb)
2277{ 2274{
2278 if (!test_opt(sb, EXTENTS)) 2275 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
2279 return; 2276 return;
2280 2277
2281#ifdef EXTENTS_STATS 2278#ifdef EXTENTS_STATS
@@ -2380,7 +2377,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2380 struct inode *inode, 2377 struct inode *inode,
2381 struct ext4_ext_path *path, 2378 struct ext4_ext_path *path,
2382 ext4_lblk_t iblock, 2379 ext4_lblk_t iblock,
2383 unsigned long max_blocks) 2380 unsigned int max_blocks)
2384{ 2381{
2385 struct ext4_extent *ex, newex, orig_ex; 2382 struct ext4_extent *ex, newex, orig_ex;
2386 struct ext4_extent *ex1 = NULL; 2383 struct ext4_extent *ex1 = NULL;
@@ -2536,7 +2533,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2536 */ 2533 */
2537 newdepth = ext_depth(inode); 2534 newdepth = ext_depth(inode);
2538 /* 2535 /*
2539 * update the extent length after successfull insert of the 2536 * update the extent length after successful insert of the
2540 * split extent 2537 * split extent
2541 */ 2538 */
2542 orig_ex.ee_len = cpu_to_le16(ee_len - 2539 orig_ex.ee_len = cpu_to_le16(ee_len -
@@ -2678,26 +2675,26 @@ fix_extent_len:
2678 */ 2675 */
2679int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 2676int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2680 ext4_lblk_t iblock, 2677 ext4_lblk_t iblock,
2681 unsigned long max_blocks, struct buffer_head *bh_result, 2678 unsigned int max_blocks, struct buffer_head *bh_result,
2682 int create, int extend_disksize) 2679 int create, int extend_disksize)
2683{ 2680{
2684 struct ext4_ext_path *path = NULL; 2681 struct ext4_ext_path *path = NULL;
2685 struct ext4_extent_header *eh; 2682 struct ext4_extent_header *eh;
2686 struct ext4_extent newex, *ex; 2683 struct ext4_extent newex, *ex;
2687 ext4_fsblk_t goal, newblock; 2684 ext4_fsblk_t newblock;
2688 int err = 0, depth, ret; 2685 int err = 0, depth, ret, cache_type;
2689 unsigned long allocated = 0; 2686 unsigned int allocated = 0;
2690 struct ext4_allocation_request ar; 2687 struct ext4_allocation_request ar;
2691 loff_t disksize; 2688 loff_t disksize;
2692 2689
2693 __clear_bit(BH_New, &bh_result->b_state); 2690 __clear_bit(BH_New, &bh_result->b_state);
2694 ext_debug("blocks %u/%lu requested for inode %u\n", 2691 ext_debug("blocks %u/%u requested for inode %u\n",
2695 iblock, max_blocks, inode->i_ino); 2692 iblock, max_blocks, inode->i_ino);
2696 2693
2697 /* check in cache */ 2694 /* check in cache */
2698 goal = ext4_ext_in_cache(inode, iblock, &newex); 2695 cache_type = ext4_ext_in_cache(inode, iblock, &newex);
2699 if (goal) { 2696 if (cache_type) {
2700 if (goal == EXT4_EXT_CACHE_GAP) { 2697 if (cache_type == EXT4_EXT_CACHE_GAP) {
2701 if (!create) { 2698 if (!create) {
2702 /* 2699 /*
2703 * block isn't allocated yet and 2700 * block isn't allocated yet and
@@ -2706,7 +2703,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2706 goto out2; 2703 goto out2;
2707 } 2704 }
2708 /* we should allocate requested block */ 2705 /* we should allocate requested block */
2709 } else if (goal == EXT4_EXT_CACHE_EXTENT) { 2706 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
2710 /* block is already allocated */ 2707 /* block is already allocated */
2711 newblock = iblock 2708 newblock = iblock
2712 - le32_to_cpu(newex.ee_block) 2709 - le32_to_cpu(newex.ee_block)
@@ -2854,7 +2851,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2854 if (!newblock) 2851 if (!newblock)
2855 goto out2; 2852 goto out2;
2856 ext_debug("allocate new block: goal %llu, found %llu/%lu\n", 2853 ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
2857 goal, newblock, allocated); 2854 ar.goal, newblock, allocated);
2858 2855
2859 /* try to insert new extent into found leaf and return */ 2856 /* try to insert new extent into found leaf and return */
2860 ext4_ext_store_pblock(&newex, newblock); 2857 ext4_ext_store_pblock(&newex, newblock);
@@ -2950,7 +2947,7 @@ void ext4_ext_truncate(struct inode *inode)
2950 * transaction synchronous. 2947 * transaction synchronous.
2951 */ 2948 */
2952 if (IS_SYNC(inode)) 2949 if (IS_SYNC(inode))
2953 handle->h_sync = 1; 2950 ext4_handle_sync(handle);
2954 2951
2955out_stop: 2952out_stop:
2956 up_write(&EXT4_I(inode)->i_data_sem); 2953 up_write(&EXT4_I(inode)->i_data_sem);
@@ -3004,7 +3001,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3004 handle_t *handle; 3001 handle_t *handle;
3005 ext4_lblk_t block; 3002 ext4_lblk_t block;
3006 loff_t new_size; 3003 loff_t new_size;
3007 unsigned long max_blocks; 3004 unsigned int max_blocks;
3008 int ret = 0; 3005 int ret = 0;
3009 int ret2 = 0; 3006 int ret2 = 0;
3010 int retries = 0; 3007 int retries = 0;
@@ -3083,7 +3080,7 @@ retry:
3083/* 3080/*
3084 * Callback function called for each extent to gather FIEMAP information. 3081 * Callback function called for each extent to gather FIEMAP information.
3085 */ 3082 */
3086int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, 3083static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3087 struct ext4_ext_cache *newex, struct ext4_extent *ex, 3084 struct ext4_ext_cache *newex, struct ext4_extent *ex,
3088 void *data) 3085 void *data)
3089{ 3086{
@@ -3152,7 +3149,8 @@ int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3152/* fiemap flags we can handle specified here */ 3149/* fiemap flags we can handle specified here */
3153#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) 3150#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
3154 3151
3155int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) 3152static int ext4_xattr_fiemap(struct inode *inode,
3153 struct fiemap_extent_info *fieinfo)
3156{ 3154{
3157 __u64 physical = 0; 3155 __u64 physical = 0;
3158 __u64 length; 3156 __u64 length;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6bd11fba71f7..f731cb545a03 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -140,9 +140,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
140 return 0; 140 return 0;
141} 141}
142 142
143extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
144 __u64 start, __u64 len);
145
146const struct file_operations ext4_file_operations = { 143const struct file_operations ext4_file_operations = {
147 .llseek = generic_file_llseek, 144 .llseek = generic_file_llseek,
148 .read = do_sync_read, 145 .read = do_sync_read,
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 556ca8eba3db..ac8f168c8ab4 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
35 35
36 36
37/* The old legacy hash */ 37/* The old legacy hash */
38static __u32 dx_hack_hash(const char *name, int len) 38static __u32 dx_hack_hash_unsigned(const char *name, int len)
39{ 39{
40 __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; 40 __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
41 const unsigned char *ucp = (const unsigned char *) name;
42
43 while (len--) {
44 hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
45
46 if (hash & 0x80000000)
47 hash -= 0x7fffffff;
48 hash1 = hash0;
49 hash0 = hash;
50 }
51 return hash0 << 1;
52}
53
54static __u32 dx_hack_hash_signed(const char *name, int len)
55{
56 __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
57 const signed char *scp = (const signed char *) name;
58
41 while (len--) { 59 while (len--) {
42 __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); 60 hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
43 61
44 if (hash & 0x80000000) hash -= 0x7fffffff; 62 if (hash & 0x80000000)
63 hash -= 0x7fffffff;
45 hash1 = hash0; 64 hash1 = hash0;
46 hash0 = hash; 65 hash0 = hash;
47 } 66 }
48 return (hash0 << 1); 67 return hash0 << 1;
68}
69
70static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
71{
72 __u32 pad, val;
73 int i;
74 const signed char *scp = (const signed char *) msg;
75
76 pad = (__u32)len | ((__u32)len << 8);
77 pad |= pad << 16;
78
79 val = pad;
80 if (len > num*4)
81 len = num * 4;
82 for (i = 0; i < len; i++) {
83 if ((i % 4) == 0)
84 val = pad;
85 val = ((int) scp[i]) + (val << 8);
86 if ((i % 4) == 3) {
87 *buf++ = val;
88 val = pad;
89 num--;
90 }
91 }
92 if (--num >= 0)
93 *buf++ = val;
94 while (--num >= 0)
95 *buf++ = pad;
49} 96}
50 97
51static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) 98static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
52{ 99{
53 __u32 pad, val; 100 __u32 pad, val;
54 int i; 101 int i;
102 const unsigned char *ucp = (const unsigned char *) msg;
55 103
56 pad = (__u32)len | ((__u32)len << 8); 104 pad = (__u32)len | ((__u32)len << 8);
57 pad |= pad << 16; 105 pad |= pad << 16;
@@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
62 for (i = 0; i < len; i++) { 110 for (i = 0; i < len; i++) {
63 if ((i % 4) == 0) 111 if ((i % 4) == 0)
64 val = pad; 112 val = pad;
65 val = msg[i] + (val << 8); 113 val = ((int) ucp[i]) + (val << 8);
66 if ((i % 4) == 3) { 114 if ((i % 4) == 3) {
67 *buf++ = val; 115 *buf++ = val;
68 val = pad; 116 val = pad;
@@ -95,6 +143,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
95 const char *p; 143 const char *p;
96 int i; 144 int i;
97 __u32 in[8], buf[4]; 145 __u32 in[8], buf[4];
146 void (*str2hashbuf)(const char *, int, __u32 *, int) =
147 str2hashbuf_signed;
98 148
99 /* Initialize the default seed for the hash checksum functions */ 149 /* Initialize the default seed for the hash checksum functions */
100 buf[0] = 0x67452301; 150 buf[0] = 0x67452301;
@@ -113,13 +163,18 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
113 } 163 }
114 164
115 switch (hinfo->hash_version) { 165 switch (hinfo->hash_version) {
166 case DX_HASH_LEGACY_UNSIGNED:
167 hash = dx_hack_hash_unsigned(name, len);
168 break;
116 case DX_HASH_LEGACY: 169 case DX_HASH_LEGACY:
117 hash = dx_hack_hash(name, len); 170 hash = dx_hack_hash_signed(name, len);
118 break; 171 break;
172 case DX_HASH_HALF_MD4_UNSIGNED:
173 str2hashbuf = str2hashbuf_unsigned;
119 case DX_HASH_HALF_MD4: 174 case DX_HASH_HALF_MD4:
120 p = name; 175 p = name;
121 while (len > 0) { 176 while (len > 0) {
122 str2hashbuf(p, len, in, 8); 177 (*str2hashbuf)(p, len, in, 8);
123 half_md4_transform(buf, in); 178 half_md4_transform(buf, in);
124 len -= 32; 179 len -= 32;
125 p += 32; 180 p += 32;
@@ -127,10 +182,12 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
127 minor_hash = buf[2]; 182 minor_hash = buf[2];
128 hash = buf[1]; 183 hash = buf[1];
129 break; 184 break;
185 case DX_HASH_TEA_UNSIGNED:
186 str2hashbuf = str2hashbuf_unsigned;
130 case DX_HASH_TEA: 187 case DX_HASH_TEA:
131 p = name; 188 p = name;
132 while (len > 0) { 189 while (len > 0) {
133 str2hashbuf(p, len, in, 4); 190 (*str2hashbuf)(p, len, in, 4);
134 TEA_transform(buf, in); 191 TEA_transform(buf, in);
135 len -= 16; 192 len -= 16;
136 p += 16; 193 p += 16;
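
The split into _signed and _unsigned variants above exists because plain char is signed on x86 but unsigned on several architectures (ARM, PowerPC, s390), so any filename byte >= 0x80 used to hash differently depending on where the directory was created; the new DX_HASH_*_UNSIGNED variants let a filesystem record which convention its htrees were built with. A standalone userspace sketch of the divergence for a single byte:

	#include <stdio.h>

	/* one round of the legacy hash for byte c, as in dx_hack_hash_* above */
	static unsigned int round_one(int c)
	{
		unsigned int hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
		unsigned int hash = hash1 + (hash0 ^ (c * 7152373));

		if (hash & 0x80000000)
			hash -= 0x7fffffff;
		return hash << 1;
	}

	int main(void)
	{
		char b = '\xe9';	/* e.g. latin-1 e-acute in a name */

		/* signed char promotes to -23, unsigned char to 233 */
		printf("signed:   %08x\n", round_one((signed char) b));
		printf("unsigned: %08x\n", round_one((unsigned char) b));
		return 0;
	}
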
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 08cac9fcace2..4fb86a0061d0 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -74,17 +74,17 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
 74 /* If checksum is bad mark all blocks and inodes in use to prevent 74 /* If checksum is bad mark all blocks and inodes in use to prevent
75 * allocation, essentially implementing a per-group read-only flag. */ 75 * allocation, essentially implementing a per-group read-only flag. */
76 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { 76 if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
77 ext4_error(sb, __func__, "Checksum bad for group %lu\n", 77 ext4_error(sb, __func__, "Checksum bad for group %u",
78 block_group); 78 block_group);
79 gdp->bg_free_blocks_count = 0; 79 ext4_free_blks_set(sb, gdp, 0);
80 gdp->bg_free_inodes_count = 0; 80 ext4_free_inodes_set(sb, gdp, 0);
81 gdp->bg_itable_unused = 0; 81 ext4_itable_unused_set(sb, gdp, 0);
82 memset(bh->b_data, 0xff, sb->s_blocksize); 82 memset(bh->b_data, 0xff, sb->s_blocksize);
83 return 0; 83 return 0;
84 } 84 }
85 85
86 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); 86 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
87 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb), 87 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
88 bh->b_data); 88 bh->b_data);
89 89
90 return EXT4_INODES_PER_GROUP(sb); 90 return EXT4_INODES_PER_GROUP(sb);
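
The second argument to mark_bitmap_end() changes because the inode bitmap occupies exactly one block, so the padding of unusable bits has to run to the last bit of that block; EXT4_BLOCKS_PER_GROUP(sb) only coincides with that bound by accident. A worked instance of the new bound, assuming 4 KiB blocks and 8192 inodes per group:

	/* bits mark_bitmap_end() pads with 1s in a one-block inode bitmap */
	static unsigned int padded_bits(unsigned int blocksize,
					unsigned int inodes_per_group)
	{
		unsigned int bits_in_bitmap = blocksize * 8; /* 4096*8 = 32768 */

		/* bits [8192, 32767] get set so a bitmap scan can never
		 * return an inode past the end of the group */
		return bits_in_bitmap - inodes_per_group;    /* 24576 */
	}
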
@@ -111,29 +111,49 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
111 if (unlikely(!bh)) { 111 if (unlikely(!bh)) {
112 ext4_error(sb, __func__, 112 ext4_error(sb, __func__,
113 "Cannot read inode bitmap - " 113 "Cannot read inode bitmap - "
114 "block_group = %lu, inode_bitmap = %llu", 114 "block_group = %u, inode_bitmap = %llu",
115 block_group, bitmap_blk); 115 block_group, bitmap_blk);
116 return NULL; 116 return NULL;
117 } 117 }
118 if (buffer_uptodate(bh) && 118 if (bitmap_uptodate(bh))
119 !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
120 return bh; 119 return bh;
121 120
122 lock_buffer(bh); 121 lock_buffer(bh);
122 if (bitmap_uptodate(bh)) {
123 unlock_buffer(bh);
124 return bh;
125 }
123 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); 126 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
124 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { 127 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
125 ext4_init_inode_bitmap(sb, bh, block_group, desc); 128 ext4_init_inode_bitmap(sb, bh, block_group, desc);
129 set_bitmap_uptodate(bh);
126 set_buffer_uptodate(bh); 130 set_buffer_uptodate(bh);
127 unlock_buffer(bh);
128 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); 131 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
132 unlock_buffer(bh);
129 return bh; 133 return bh;
130 } 134 }
131 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); 135 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
136 if (buffer_uptodate(bh)) {
137 /*
 138 * if the group is not uninit and bh is uptodate,
 139 * the bitmap is also uptodate
140 */
141 set_bitmap_uptodate(bh);
142 unlock_buffer(bh);
143 return bh;
144 }
145 /*
146 * submit the buffer_head for read. We can
147 * safely mark the bitmap as uptodate now.
148 * We do it here so the bitmap uptodate bit
 149 * gets set with the buffer lock held.
150 */
151 set_bitmap_uptodate(bh);
132 if (bh_submit_read(bh) < 0) { 152 if (bh_submit_read(bh) < 0) {
133 put_bh(bh); 153 put_bh(bh);
134 ext4_error(sb, __func__, 154 ext4_error(sb, __func__,
135 "Cannot read inode bitmap - " 155 "Cannot read inode bitmap - "
136 "block_group = %lu, inode_bitmap = %llu", 156 "block_group = %u, inode_bitmap = %llu",
137 block_group, bitmap_blk); 157 block_group, bitmap_blk);
138 return NULL; 158 return NULL;
139 } 159 }
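
The rework above separates "the buffer was read" (buffer_uptodate) from "the bitmap contents are valid" (bitmap_uptodate, an ext4-private buffer_head state bit this series introduces in fs/ext4/ext4.h), and re-tests the latter under the buffer lock so two CPUs entering ext4_read_inode_bitmap() for an uninit group cannot both initialize it. Reduced to its skeleton, this is the classic re-check-under-lock pattern:

	#include <linux/buffer_head.h>

	/* sketch only: bitmap_uptodate()/set_bitmap_uptodate() are the
	 * ext4-private helpers used in the hunk above */
	static struct buffer_head *read_bitmap_once(struct buffer_head *bh)
	{
		if (bitmap_uptodate(bh))	/* unlocked fast path */
			return bh;
		lock_buffer(bh);
		if (bitmap_uptodate(bh)) {	/* lost the race; already done */
			unlock_buffer(bh);
			return bh;
		}
		/* ... initialize the bitmap or submit the read here ... */
		set_bitmap_uptodate(bh);	/* only ever set under the lock */
		unlock_buffer(bh);
		return bh;
	}
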
@@ -168,7 +188,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
168 struct ext4_group_desc *gdp; 188 struct ext4_group_desc *gdp;
169 struct ext4_super_block *es; 189 struct ext4_super_block *es;
170 struct ext4_sb_info *sbi; 190 struct ext4_sb_info *sbi;
171 int fatal = 0, err; 191 int fatal = 0, err, count;
172 ext4_group_t flex_group; 192 ext4_group_t flex_group;
173 193
174 if (atomic_read(&inode->i_count) > 1) { 194 if (atomic_read(&inode->i_count) > 1) {
@@ -190,6 +210,11 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
190 210
191 ino = inode->i_ino; 211 ino = inode->i_ino;
192 ext4_debug("freeing inode %lu\n", ino); 212 ext4_debug("freeing inode %lu\n", ino);
213 trace_mark(ext4_free_inode,
214 "dev %s ino %lu mode %d uid %lu gid %lu bocks %llu",
215 sb->s_id, inode->i_ino, inode->i_mode,
216 (unsigned long) inode->i_uid, (unsigned long) inode->i_gid,
217 (unsigned long long) inode->i_blocks);
193 218
194 /* 219 /*
195 * Note: we must free any quota before locking the superblock, 220 * Note: we must free any quota before locking the superblock,
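
trace_mark() is the kernel markers API of this era (<linux/marker.h>): a named instrumentation point with a printf-style format that costs almost nothing until a probe attaches. Roughly how a tracer would have hooked the marker added above; the signatures are from memory of 2.6.29, so treat this as a sketch:

	#include <linux/module.h>
	#include <linux/marker.h>

	static void probe_free_inode(void *probe_private, void *call_private,
				     const char *fmt, va_list *args)
	{
		/* pull the marker's arguments out of *args and log them */
	}

	static int __init hook_init(void)
	{
		/* the format string must match the marker byte for byte */
		return marker_probe_register("ext4_free_inode",
			"dev %s ino %lu mode %d uid %lu gid %lu blocks %llu",
			probe_free_inode, NULL);
	}
	module_init(hook_init);
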
@@ -236,9 +261,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
236 261
237 if (gdp) { 262 if (gdp) {
238 spin_lock(sb_bgl_lock(sbi, block_group)); 263 spin_lock(sb_bgl_lock(sbi, block_group));
239 le16_add_cpu(&gdp->bg_free_inodes_count, 1); 264 count = ext4_free_inodes_count(sb, gdp) + 1;
240 if (is_directory) 265 ext4_free_inodes_set(sb, gdp, count);
241 le16_add_cpu(&gdp->bg_used_dirs_count, -1); 266 if (is_directory) {
267 count = ext4_used_dirs_count(sb, gdp) - 1;
268 ext4_used_dirs_set(sb, gdp, count);
269 }
242 gdp->bg_checksum = ext4_group_desc_csum(sbi, 270 gdp->bg_checksum = ext4_group_desc_csum(sbi,
243 block_group, gdp); 271 block_group, gdp);
244 spin_unlock(sb_bgl_lock(sbi, block_group)); 272 spin_unlock(sb_bgl_lock(sbi, block_group));
@@ -253,12 +281,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
253 spin_unlock(sb_bgl_lock(sbi, flex_group)); 281 spin_unlock(sb_bgl_lock(sbi, flex_group));
254 } 282 }
255 } 283 }
256 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); 284 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
257 err = ext4_journal_dirty_metadata(handle, bh2); 285 err = ext4_handle_dirty_metadata(handle, NULL, bh2);
258 if (!fatal) fatal = err; 286 if (!fatal) fatal = err;
259 } 287 }
260 BUFFER_TRACE(bitmap_bh, "call ext4_journal_dirty_metadata"); 288 BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
261 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 289 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
262 if (!fatal) 290 if (!fatal)
263 fatal = err; 291 fatal = err;
264 sb->s_dirt = 1; 292 sb->s_dirt = 1;
@@ -291,13 +319,13 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
291 319
292 for (group = 0; group < ngroups; group++) { 320 for (group = 0; group < ngroups; group++) {
293 desc = ext4_get_group_desc(sb, group, NULL); 321 desc = ext4_get_group_desc(sb, group, NULL);
294 if (!desc || !desc->bg_free_inodes_count) 322 if (!desc || !ext4_free_inodes_count(sb, desc))
295 continue; 323 continue;
296 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) 324 if (ext4_free_inodes_count(sb, desc) < avefreei)
297 continue; 325 continue;
298 if (!best_desc || 326 if (!best_desc ||
299 (le16_to_cpu(desc->bg_free_blocks_count) > 327 (ext4_free_blks_count(sb, desc) >
300 le16_to_cpu(best_desc->bg_free_blocks_count))) { 328 ext4_free_blks_count(sb, best_desc))) {
301 *best_group = group; 329 *best_group = group;
302 best_desc = desc; 330 best_desc = desc;
303 ret = 0; 331 ret = 0;
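
The switch from raw le16_to_cpu(desc->bg_free_inodes_count) reads to accessors like ext4_free_inodes_count(sb, desc), here and in the hunks below, is not cosmetic: with 64-bit support the group descriptor's count fields gained _hi halves that only exist when the on-disk descriptor is at least 64 bytes, so every reader has to splice the halves together. My reconstruction of the accessor's shape (a sketch; the real definition lives with the other descriptor helpers):

	__u32 ext4_free_inodes_count(struct super_block *sb,
				     struct ext4_group_desc *bg)
	{
		__u32 count = le16_to_cpu(bg->bg_free_inodes_count_lo);

		/* the _hi word only exists with 64-byte-or-larger descriptors */
		if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
			count |= (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16;
		return count;
	}
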
@@ -369,7 +397,7 @@ found_flexbg:
369 for (i = best_flex * flex_size; i < ngroups && 397 for (i = best_flex * flex_size; i < ngroups &&
370 i < (best_flex + 1) * flex_size; i++) { 398 i < (best_flex + 1) * flex_size; i++) {
371 desc = ext4_get_group_desc(sb, i, &bh); 399 desc = ext4_get_group_desc(sb, i, &bh);
372 if (le16_to_cpu(desc->bg_free_inodes_count)) { 400 if (ext4_free_inodes_count(sb, desc)) {
373 *best_group = i; 401 *best_group = i;
374 goto out; 402 goto out;
375 } 403 }
@@ -443,17 +471,17 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
443 for (i = 0; i < ngroups; i++) { 471 for (i = 0; i < ngroups; i++) {
444 grp = (parent_group + i) % ngroups; 472 grp = (parent_group + i) % ngroups;
445 desc = ext4_get_group_desc(sb, grp, NULL); 473 desc = ext4_get_group_desc(sb, grp, NULL);
446 if (!desc || !desc->bg_free_inodes_count) 474 if (!desc || !ext4_free_inodes_count(sb, desc))
447 continue; 475 continue;
448 if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir) 476 if (ext4_used_dirs_count(sb, desc) >= best_ndir)
449 continue; 477 continue;
450 if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) 478 if (ext4_free_inodes_count(sb, desc) < avefreei)
451 continue; 479 continue;
452 if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb) 480 if (ext4_free_blks_count(sb, desc) < avefreeb)
453 continue; 481 continue;
454 *group = grp; 482 *group = grp;
455 ret = 0; 483 ret = 0;
456 best_ndir = le16_to_cpu(desc->bg_used_dirs_count); 484 best_ndir = ext4_used_dirs_count(sb, desc);
457 } 485 }
458 if (ret == 0) 486 if (ret == 0)
459 return ret; 487 return ret;
@@ -479,13 +507,13 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
479 for (i = 0; i < ngroups; i++) { 507 for (i = 0; i < ngroups; i++) {
480 *group = (parent_group + i) % ngroups; 508 *group = (parent_group + i) % ngroups;
481 desc = ext4_get_group_desc(sb, *group, NULL); 509 desc = ext4_get_group_desc(sb, *group, NULL);
482 if (!desc || !desc->bg_free_inodes_count) 510 if (!desc || !ext4_free_inodes_count(sb, desc))
483 continue; 511 continue;
484 if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) 512 if (ext4_used_dirs_count(sb, desc) >= max_dirs)
485 continue; 513 continue;
486 if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes) 514 if (ext4_free_inodes_count(sb, desc) < min_inodes)
487 continue; 515 continue;
488 if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks) 516 if (ext4_free_blks_count(sb, desc) < min_blocks)
489 continue; 517 continue;
490 return 0; 518 return 0;
491 } 519 }
@@ -494,8 +522,8 @@ fallback:
494 for (i = 0; i < ngroups; i++) { 522 for (i = 0; i < ngroups; i++) {
495 *group = (parent_group + i) % ngroups; 523 *group = (parent_group + i) % ngroups;
496 desc = ext4_get_group_desc(sb, *group, NULL); 524 desc = ext4_get_group_desc(sb, *group, NULL);
497 if (desc && desc->bg_free_inodes_count && 525 if (desc && ext4_free_inodes_count(sb, desc) &&
498 le16_to_cpu(desc->bg_free_inodes_count) >= avefreei) 526 ext4_free_inodes_count(sb, desc) >= avefreei)
499 return 0; 527 return 0;
500 } 528 }
501 529
@@ -524,8 +552,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
524 */ 552 */
525 *group = parent_group; 553 *group = parent_group;
526 desc = ext4_get_group_desc(sb, *group, NULL); 554 desc = ext4_get_group_desc(sb, *group, NULL);
527 if (desc && le16_to_cpu(desc->bg_free_inodes_count) && 555 if (desc && ext4_free_inodes_count(sb, desc) &&
528 le16_to_cpu(desc->bg_free_blocks_count)) 556 ext4_free_blks_count(sb, desc))
529 return 0; 557 return 0;
530 558
531 /* 559 /*
@@ -548,8 +576,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
548 if (*group >= ngroups) 576 if (*group >= ngroups)
549 *group -= ngroups; 577 *group -= ngroups;
550 desc = ext4_get_group_desc(sb, *group, NULL); 578 desc = ext4_get_group_desc(sb, *group, NULL);
551 if (desc && le16_to_cpu(desc->bg_free_inodes_count) && 579 if (desc && ext4_free_inodes_count(sb, desc) &&
552 le16_to_cpu(desc->bg_free_blocks_count)) 580 ext4_free_blks_count(sb, desc))
553 return 0; 581 return 0;
554 } 582 }
555 583
@@ -562,7 +590,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
562 if (++*group >= ngroups) 590 if (++*group >= ngroups)
563 *group = 0; 591 *group = 0;
564 desc = ext4_get_group_desc(sb, *group, NULL); 592 desc = ext4_get_group_desc(sb, *group, NULL);
565 if (desc && le16_to_cpu(desc->bg_free_inodes_count)) 593 if (desc && ext4_free_inodes_count(sb, desc))
566 return 0; 594 return 0;
567 } 595 }
568 596
@@ -570,6 +598,79 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
570} 598}
571 599
572/* 600/*
 601 * Claim the inode from the inode bitmap. If the group
 602 * is uninit we need to take the group's sb_bgl_lock
 603 * and clear the uninit flag. The inode bitmap update
 604 * and group desc uninit flag clear should be done
 605 * while holding sb_bgl_lock so that ext4_read_inode_bitmap
 606 * doesn't race with ext4_claim_inode.
607 */
608static int ext4_claim_inode(struct super_block *sb,
609 struct buffer_head *inode_bitmap_bh,
610 unsigned long ino, ext4_group_t group, int mode)
611{
612 int free = 0, retval = 0, count;
613 struct ext4_sb_info *sbi = EXT4_SB(sb);
614 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
615
616 spin_lock(sb_bgl_lock(sbi, group));
617 if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
618 /* not a free inode */
619 retval = 1;
620 goto err_ret;
621 }
622 ino++;
623 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
624 ino > EXT4_INODES_PER_GROUP(sb)) {
625 spin_unlock(sb_bgl_lock(sbi, group));
626 ext4_error(sb, __func__,
627 "reserved inode or inode > inodes count - "
628 "block_group = %u, inode=%lu", group,
629 ino + group * EXT4_INODES_PER_GROUP(sb));
630 return 1;
631 }
632 /* If we didn't allocate from within the initialized part of the inode
633 * table then we need to initialize up to this inode. */
634 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
635
636 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
637 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
638 /* When marking the block group with
639 * ~EXT4_BG_INODE_UNINIT we don't want to depend
640 * on the value of bg_itable_unused even though
641 * mke2fs could have initialized the same for us.
642 * Instead we calculated the value below
643 */
644
645 free = 0;
646 } else {
647 free = EXT4_INODES_PER_GROUP(sb) -
648 ext4_itable_unused_count(sb, gdp);
649 }
650
651 /*
652 * Check the relative inode number against the last used
 653 * relative inode number in this group. If it is greater
 654 * we need to update the bg_itable_unused count.
655 *
656 */
657 if (ino > free)
658 ext4_itable_unused_set(sb, gdp,
659 (EXT4_INODES_PER_GROUP(sb) - ino));
660 }
661 count = ext4_free_inodes_count(sb, gdp) - 1;
662 ext4_free_inodes_set(sb, gdp, count);
663 if (S_ISDIR(mode)) {
664 count = ext4_used_dirs_count(sb, gdp) + 1;
665 ext4_used_dirs_set(sb, gdp, count);
666 }
667 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
668err_ret:
669 spin_unlock(sb_bgl_lock(sbi, group));
670 return retval;
671}
672
673/*
573 * There are two policies for allocating an inode. If the new inode is 674 * There are two policies for allocating an inode. If the new inode is
574 * a directory, then a forward search is made for a block group with both 675 * a directory, then a forward search is made for a block group with both
575 * free space and a low directory-to-inode ratio; if that fails, then of 676 * free space and a low directory-to-inode ratio; if that fails, then of
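
Note what sb_bgl_lock(sbi, group) in ext4_claim_inode() above actually returns: not a literal per-group lock but a slot in a small cacheline-padded spinlock table that all groups hash into (include/linux/blockgroup_lock.h), which bounds lock memory on filesystems with huge group counts. Approximately:

	#include <linux/spinlock.h>

	#define NR_BG_LOCKS	128	/* scales with NR_CPUS in the real header */

	struct bgl_lock {
		spinlock_t lock;
	} ____cacheline_aligned_in_smp;

	struct blockgroup_lock {
		struct bgl_lock locks[NR_BG_LOCKS];
	};

	static inline spinlock_t *bgl_lock_ptr(struct blockgroup_lock *bgl,
					       unsigned int block_group)
	{
		/* many groups share one lock; harmless, since collisions
		 * only cost contention, never correctness */
		return &bgl->locks[block_group & (NR_BG_LOCKS - 1)].lock;
	}
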
@@ -582,8 +683,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
582struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) 683struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
583{ 684{
584 struct super_block *sb; 685 struct super_block *sb;
585 struct buffer_head *bitmap_bh = NULL; 686 struct buffer_head *inode_bitmap_bh = NULL;
586 struct buffer_head *bh2; 687 struct buffer_head *group_desc_bh;
587 ext4_group_t group = 0; 688 ext4_group_t group = 0;
588 unsigned long ino = 0; 689 unsigned long ino = 0;
589 struct inode *inode; 690 struct inode *inode;
@@ -602,6 +703,8 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
602 return ERR_PTR(-EPERM); 703 return ERR_PTR(-EPERM);
603 704
604 sb = dir->i_sb; 705 sb = dir->i_sb;
706 trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id,
707 dir->i_ino, mode);
605 inode = new_inode(sb); 708 inode = new_inode(sb);
606 if (!inode) 709 if (!inode)
607 return ERR_PTR(-ENOMEM); 710 return ERR_PTR(-ENOMEM);
@@ -631,40 +734,52 @@ got_group:
631 for (i = 0; i < sbi->s_groups_count; i++) { 734 for (i = 0; i < sbi->s_groups_count; i++) {
632 err = -EIO; 735 err = -EIO;
633 736
634 gdp = ext4_get_group_desc(sb, group, &bh2); 737 gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
635 if (!gdp) 738 if (!gdp)
636 goto fail; 739 goto fail;
637 740
638 brelse(bitmap_bh); 741 brelse(inode_bitmap_bh);
639 bitmap_bh = ext4_read_inode_bitmap(sb, group); 742 inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
640 if (!bitmap_bh) 743 if (!inode_bitmap_bh)
641 goto fail; 744 goto fail;
642 745
643 ino = 0; 746 ino = 0;
644 747
645repeat_in_this_group: 748repeat_in_this_group:
646 ino = ext4_find_next_zero_bit((unsigned long *) 749 ino = ext4_find_next_zero_bit((unsigned long *)
647 bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino); 750 inode_bitmap_bh->b_data,
751 EXT4_INODES_PER_GROUP(sb), ino);
752
648 if (ino < EXT4_INODES_PER_GROUP(sb)) { 753 if (ino < EXT4_INODES_PER_GROUP(sb)) {
649 754
650 BUFFER_TRACE(bitmap_bh, "get_write_access"); 755 BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
651 err = ext4_journal_get_write_access(handle, bitmap_bh); 756 err = ext4_journal_get_write_access(handle,
757 inode_bitmap_bh);
652 if (err) 758 if (err)
653 goto fail; 759 goto fail;
654 760
655 if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group), 761 BUFFER_TRACE(group_desc_bh, "get_write_access");
656 ino, bitmap_bh->b_data)) { 762 err = ext4_journal_get_write_access(handle,
763 group_desc_bh);
764 if (err)
765 goto fail;
766 if (!ext4_claim_inode(sb, inode_bitmap_bh,
767 ino, group, mode)) {
657 /* we won it */ 768 /* we won it */
658 BUFFER_TRACE(bitmap_bh, 769 BUFFER_TRACE(inode_bitmap_bh,
659 "call ext4_journal_dirty_metadata"); 770 "call ext4_handle_dirty_metadata");
660 err = ext4_journal_dirty_metadata(handle, 771 err = ext4_handle_dirty_metadata(handle,
661 bitmap_bh); 772 inode,
773 inode_bitmap_bh);
662 if (err) 774 if (err)
663 goto fail; 775 goto fail;
 776 /* zero bit is inode number 1 */
777 ino++;
664 goto got; 778 goto got;
665 } 779 }
666 /* we lost it */ 780 /* we lost it */
667 jbd2_journal_release_buffer(handle, bitmap_bh); 781 ext4_handle_release_buffer(handle, inode_bitmap_bh);
782 ext4_handle_release_buffer(handle, group_desc_bh);
668 783
669 if (++ino < EXT4_INODES_PER_GROUP(sb)) 784 if (++ino < EXT4_INODES_PER_GROUP(sb))
670 goto repeat_in_this_group; 785 goto repeat_in_this_group;
@@ -684,30 +799,16 @@ repeat_in_this_group:
684 goto out; 799 goto out;
685 800
686got: 801got:
687 ino++;
688 if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
689 ino > EXT4_INODES_PER_GROUP(sb)) {
690 ext4_error(sb, __func__,
691 "reserved inode or inode > inodes count - "
692 "block_group = %lu, inode=%lu", group,
693 ino + group * EXT4_INODES_PER_GROUP(sb));
694 err = -EIO;
695 goto fail;
696 }
697
698 BUFFER_TRACE(bh2, "get_write_access");
699 err = ext4_journal_get_write_access(handle, bh2);
700 if (err) goto fail;
701
702 /* We may have to initialize the block bitmap if it isn't already */ 802 /* We may have to initialize the block bitmap if it isn't already */
703 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && 803 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
704 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 804 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
705 struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group); 805 struct buffer_head *block_bitmap_bh;
706 806
707 BUFFER_TRACE(block_bh, "get block bitmap access"); 807 block_bitmap_bh = ext4_read_block_bitmap(sb, group);
708 err = ext4_journal_get_write_access(handle, block_bh); 808 BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
809 err = ext4_journal_get_write_access(handle, block_bitmap_bh);
709 if (err) { 810 if (err) {
710 brelse(block_bh); 811 brelse(block_bitmap_bh);
711 goto fail; 812 goto fail;
712 } 813 }
713 814
@@ -715,9 +816,9 @@ got:
715 spin_lock(sb_bgl_lock(sbi, group)); 816 spin_lock(sb_bgl_lock(sbi, group));
716 /* recheck and clear flag under lock if we still need to */ 817 /* recheck and clear flag under lock if we still need to */
717 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 818 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
718 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
719 free = ext4_free_blocks_after_init(sb, group, gdp); 819 free = ext4_free_blocks_after_init(sb, group, gdp);
720 gdp->bg_free_blocks_count = cpu_to_le16(free); 820 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
821 ext4_free_blks_set(sb, gdp, free);
721 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, 822 gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
722 gdp); 823 gdp);
723 } 824 }
@@ -725,55 +826,19 @@ got:
725 826
726 /* Don't need to dirty bitmap block if we didn't change it */ 827 /* Don't need to dirty bitmap block if we didn't change it */
727 if (free) { 828 if (free) {
728 BUFFER_TRACE(block_bh, "dirty block bitmap"); 829 BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
729 err = ext4_journal_dirty_metadata(handle, block_bh); 830 err = ext4_handle_dirty_metadata(handle,
831 NULL, block_bitmap_bh);
730 } 832 }
731 833
732 brelse(block_bh); 834 brelse(block_bitmap_bh);
733 if (err) 835 if (err)
734 goto fail; 836 goto fail;
735 } 837 }
736 838 BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
737 spin_lock(sb_bgl_lock(sbi, group)); 839 err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
738 /* If we didn't allocate from within the initialized part of the inode 840 if (err)
739 * table then we need to initialize up to this inode. */ 841 goto fail;
740 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
741 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
742 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
743
744 /* When marking the block group with
745 * ~EXT4_BG_INODE_UNINIT we don't want to depend
746 * on the value of bg_itable_unused even though
747 * mke2fs could have initialized the same for us.
748 * Instead we calculated the value below
749 */
750
751 free = 0;
752 } else {
753 free = EXT4_INODES_PER_GROUP(sb) -
754 le16_to_cpu(gdp->bg_itable_unused);
755 }
756
757 /*
758 * Check the relative inode number against the last used
759 * relative inode number in this group. if it is greater
760 * we need to update the bg_itable_unused count
761 *
762 */
763 if (ino > free)
764 gdp->bg_itable_unused =
765 cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
766 }
767
768 le16_add_cpu(&gdp->bg_free_inodes_count, -1);
769 if (S_ISDIR(mode)) {
770 le16_add_cpu(&gdp->bg_used_dirs_count, 1);
771 }
772 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
773 spin_unlock(sb_bgl_lock(sbi, group));
774 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
775 err = ext4_journal_dirty_metadata(handle, bh2);
776 if (err) goto fail;
777 842
778 percpu_counter_dec(&sbi->s_freeinodes_counter); 843 percpu_counter_dec(&sbi->s_freeinodes_counter);
779 if (S_ISDIR(mode)) 844 if (S_ISDIR(mode))
@@ -825,8 +890,11 @@ got:
825 890
826 ext4_set_inode_flags(inode); 891 ext4_set_inode_flags(inode);
827 if (IS_DIRSYNC(inode)) 892 if (IS_DIRSYNC(inode))
828 handle->h_sync = 1; 893 ext4_handle_sync(handle);
829 insert_inode_hash(inode); 894 if (insert_inode_locked(inode) < 0) {
895 err = -EINVAL;
896 goto fail_drop;
897 }
830 spin_lock(&sbi->s_next_gen_lock); 898 spin_lock(&sbi->s_next_gen_lock);
831 inode->i_generation = sbi->s_next_generation++; 899 inode->i_generation = sbi->s_next_generation++;
832 spin_unlock(&sbi->s_next_gen_lock); 900 spin_unlock(&sbi->s_next_gen_lock);
@@ -849,7 +917,7 @@ got:
849 if (err) 917 if (err)
850 goto fail_free_drop; 918 goto fail_free_drop;
851 919
852 if (test_opt(sb, EXTENTS)) { 920 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
853 /* set extent flag only for directory, file and normal symlink*/ 921 /* set extent flag only for directory, file and normal symlink*/
854 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { 922 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
855 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; 923 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
@@ -864,6 +932,8 @@ got:
864 } 932 }
865 933
866 ext4_debug("allocating inode %lu\n", inode->i_ino); 934 ext4_debug("allocating inode %lu\n", inode->i_ino);
935 trace_mark(ext4_allocate_inode, "dev %s ino %lu dir %lu mode %d",
936 sb->s_id, inode->i_ino, dir->i_ino, mode);
867 goto really_out; 937 goto really_out;
868fail: 938fail:
869 ext4_std_error(sb, err); 939 ext4_std_error(sb, err);
@@ -871,7 +941,7 @@ out:
871 iput(inode); 941 iput(inode);
872 ret = ERR_PTR(err); 942 ret = ERR_PTR(err);
873really_out: 943really_out:
874 brelse(bitmap_bh); 944 brelse(inode_bitmap_bh);
875 return ret; 945 return ret;
876 946
877fail_free_drop: 947fail_free_drop:
@@ -881,8 +951,9 @@ fail_drop:
881 DQUOT_DROP(inode); 951 DQUOT_DROP(inode);
882 inode->i_flags |= S_NOQUOTA; 952 inode->i_flags |= S_NOQUOTA;
883 inode->i_nlink = 0; 953 inode->i_nlink = 0;
954 unlock_new_inode(inode);
884 iput(inode); 955 iput(inode);
885 brelse(bitmap_bh); 956 brelse(inode_bitmap_bh);
886 return ERR_PTR(err); 957 return ERR_PTR(err);
887} 958}
888 959
@@ -981,7 +1052,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
981 gdp = ext4_get_group_desc(sb, i, NULL); 1052 gdp = ext4_get_group_desc(sb, i, NULL);
982 if (!gdp) 1053 if (!gdp)
983 continue; 1054 continue;
984 desc_count += le16_to_cpu(gdp->bg_free_inodes_count); 1055 desc_count += ext4_free_inodes_count(sb, gdp);
985 brelse(bitmap_bh); 1056 brelse(bitmap_bh);
986 bitmap_bh = ext4_read_inode_bitmap(sb, i); 1057 bitmap_bh = ext4_read_inode_bitmap(sb, i);
987 if (!bitmap_bh) 1058 if (!bitmap_bh)
@@ -989,7 +1060,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
989 1060
990 x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); 1061 x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
991 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", 1062 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
992 i, le16_to_cpu(gdp->bg_free_inodes_count), x); 1063 i, ext4_free_inodes_count(sb, gdp), x);
993 bitmap_count += x; 1064 bitmap_count += x;
994 } 1065 }
995 brelse(bitmap_bh); 1066 brelse(bitmap_bh);
@@ -1003,7 +1074,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
1003 gdp = ext4_get_group_desc(sb, i, NULL); 1074 gdp = ext4_get_group_desc(sb, i, NULL);
1004 if (!gdp) 1075 if (!gdp)
1005 continue; 1076 continue;
1006 desc_count += le16_to_cpu(gdp->bg_free_inodes_count); 1077 desc_count += ext4_free_inodes_count(sb, gdp);
1007 cond_resched(); 1078 cond_resched();
1008 } 1079 }
1009 return desc_count; 1080 return desc_count;
@@ -1020,8 +1091,7 @@ unsigned long ext4_count_dirs(struct super_block * sb)
1020 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); 1091 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
1021 if (!gdp) 1092 if (!gdp)
1022 continue; 1093 continue;
1023 count += le16_to_cpu(gdp->bg_used_dirs_count); 1094 count += ext4_used_dirs_count(sb, gdp);
1024 } 1095 }
1025 return count; 1096 return count;
1026} 1097}
1027
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index be21a5ae33cb..a6444cee0c7e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -34,6 +34,7 @@
34#include <linux/writeback.h> 34#include <linux/writeback.h>
35#include <linux/pagevec.h> 35#include <linux/pagevec.h>
36#include <linux/mpage.h> 36#include <linux/mpage.h>
37#include <linux/namei.h>
37#include <linux/uio.h> 38#include <linux/uio.h>
38#include <linux/bio.h> 39#include <linux/bio.h>
39#include "ext4_jbd2.h" 40#include "ext4_jbd2.h"
@@ -71,12 +72,17 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
71 * "bh" may be NULL: a metadata block may have been freed from memory 72 * "bh" may be NULL: a metadata block may have been freed from memory
72 * but there may still be a record of it in the journal, and that record 73 * but there may still be a record of it in the journal, and that record
73 * still needs to be revoked. 74 * still needs to be revoked.
75 *
76 * If the handle isn't valid we're not journaling so there's nothing to do.
74 */ 77 */
75int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, 78int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
76 struct buffer_head *bh, ext4_fsblk_t blocknr) 79 struct buffer_head *bh, ext4_fsblk_t blocknr)
77{ 80{
78 int err; 81 int err;
79 82
83 if (!ext4_handle_valid(handle))
84 return 0;
85
80 might_sleep(); 86 might_sleep();
81 87
82 BUFFER_TRACE(bh, "enter"); 88 BUFFER_TRACE(bh, "enter");
@@ -169,7 +175,9 @@ static handle_t *start_transaction(struct inode *inode)
169 */ 175 */
170static int try_to_extend_transaction(handle_t *handle, struct inode *inode) 176static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
171{ 177{
172 if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS) 178 if (!ext4_handle_valid(handle))
179 return 0;
180 if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
173 return 0; 181 return 0;
174 if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) 182 if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
175 return 0; 183 return 0;
@@ -183,6 +191,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
183 */ 191 */
184static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) 192static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
185{ 193{
194 BUG_ON(EXT4_JOURNAL(inode) == NULL);
186 jbd_debug(2, "restarting handle %p\n", handle); 195 jbd_debug(2, "restarting handle %p\n", handle);
187 return ext4_journal_restart(handle, blocks_for_truncate(inode)); 196 return ext4_journal_restart(handle, blocks_for_truncate(inode));
188} 197}
@@ -215,7 +224,7 @@ void ext4_delete_inode(struct inode *inode)
215 } 224 }
216 225
217 if (IS_SYNC(inode)) 226 if (IS_SYNC(inode))
218 handle->h_sync = 1; 227 ext4_handle_sync(handle);
219 inode->i_size = 0; 228 inode->i_size = 0;
220 err = ext4_mark_inode_dirty(handle, inode); 229 err = ext4_mark_inode_dirty(handle, inode);
221 if (err) { 230 if (err) {
@@ -232,7 +241,7 @@ void ext4_delete_inode(struct inode *inode)
232 * enough credits left in the handle to remove the inode from 241 * enough credits left in the handle to remove the inode from
233 * the orphan list and set the dtime field. 242 * the orphan list and set the dtime field.
234 */ 243 */
235 if (handle->h_buffer_credits < 3) { 244 if (!ext4_handle_has_enough_credits(handle, 3)) {
236 err = ext4_journal_extend(handle, 3); 245 err = ext4_journal_extend(handle, 3);
237 if (err > 0) 246 if (err > 0)
238 err = ext4_journal_restart(handle, 3); 247 err = ext4_journal_restart(handle, 3);
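
ext4_handle_valid() and ext4_handle_has_enough_credits(), used in the two hunks above, come from the no-journal support merged in this window: without a journal, ext4_journal_start() returns a sentinel rather than a real jbd2 handle, and the journaling wrappers short-circuit on it. From fs/ext4/ext4_jbd2.h of this era, approximately:

	/* the sentinel handed out when there is no journal */
	#define EXT4_NOJOURNAL_HANDLE	((handle_t *) 0x1)

	static inline int ext4_handle_valid(handle_t *handle)
	{
		if (handle == EXT4_NOJOURNAL_HANDLE)
			return 0;
		return 1;
	}

	static inline int ext4_handle_has_enough_credits(handle_t *handle,
							 int needed)
	{
		/* with no journal there are no credits to run out of */
		if (ext4_handle_valid(handle) &&
		    handle->h_buffer_credits < needed)
			return 0;
		return 1;
	}
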
@@ -505,10 +514,10 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
505 * return the total number of blocks to be allocate, including the 514 * return the total number of blocks to be allocate, including the
506 * direct and indirect blocks. 515 * direct and indirect blocks.
507 */ 516 */
508static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks, 517static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
509 int blocks_to_boundary) 518 int blocks_to_boundary)
510{ 519{
511 unsigned long count = 0; 520 unsigned int count = 0;
512 521
513 /* 522 /*
514 * Simple case, [t,d]Indirect block(s) has not allocated yet 523 * Simple case, [t,d]Indirect block(s) has not allocated yet
@@ -546,6 +555,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
546 int indirect_blks, int blks, 555 int indirect_blks, int blks,
547 ext4_fsblk_t new_blocks[4], int *err) 556 ext4_fsblk_t new_blocks[4], int *err)
548{ 557{
558 struct ext4_allocation_request ar;
549 int target, i; 559 int target, i;
550 unsigned long count = 0, blk_allocated = 0; 560 unsigned long count = 0, blk_allocated = 0;
551 int index = 0; 561 int index = 0;
@@ -594,10 +604,17 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
594 if (!target) 604 if (!target)
595 goto allocated; 605 goto allocated;
596 /* Now allocate data blocks */ 606 /* Now allocate data blocks */
597 count = target; 607 memset(&ar, 0, sizeof(ar));
598 /* allocating blocks for data blocks */ 608 ar.inode = inode;
599 current_block = ext4_new_blocks(handle, inode, iblock, 609 ar.goal = goal;
600 goal, &count, err); 610 ar.len = target;
611 ar.logical = iblock;
612 if (S_ISREG(inode->i_mode))
613 /* enable in-core preallocation only for regular files */
614 ar.flags = EXT4_MB_HINT_DATA;
615
616 current_block = ext4_mb_new_blocks(handle, &ar, err);
617
601 if (*err && (target == blks)) { 618 if (*err && (target == blks)) {
602 /* 619 /*
603 * if the allocation failed and we didn't allocate 620 * if the allocation failed and we didn't allocate
@@ -613,7 +630,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
613 */ 630 */
614 new_blocks[index] = current_block; 631 new_blocks[index] = current_block;
615 } 632 }
616 blk_allocated += count; 633 blk_allocated += ar.len;
617 } 634 }
618allocated: 635allocated:
619 /* total number of blocks allocated for direct blocks */ 636 /* total number of blocks allocated for direct blocks */
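
This hunk moves the old indirect-block path onto mballoc: ext4_mb_new_blocks() takes its inputs as a struct ext4_allocation_request rather than a bare goal/count pair, and the memset() matters because the struct also carries left/right neighbor hints (lleft/pleft, lright/pright) that this caller has nothing to put in. EXT4_MB_HINT_DATA marks the request as file data so it is eligible for the per-inode preallocation logic. The call pattern, isolated from the hunk:

	struct ext4_allocation_request ar;
	ext4_fsblk_t block;
	int err;

	memset(&ar, 0, sizeof(ar));	/* zero the hints we cannot supply */
	ar.inode = inode;
	ar.logical = iblock;		/* file-relative block being mapped */
	ar.goal = goal;			/* preferred physical block */
	ar.len = target;		/* how many blocks we would like */
	if (S_ISREG(inode->i_mode))
		ar.flags = EXT4_MB_HINT_DATA;

	block = ext4_mb_new_blocks(handle, &ar, &err);
	/* on return ar.len holds the count actually allocated, which is
	 * why the hunk above switches to blk_allocated += ar.len */
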
@@ -708,8 +725,8 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
708 set_buffer_uptodate(bh); 725 set_buffer_uptodate(bh);
709 unlock_buffer(bh); 726 unlock_buffer(bh);
710 727
711 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 728 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
712 err = ext4_journal_dirty_metadata(handle, bh); 729 err = ext4_handle_dirty_metadata(handle, inode, bh);
713 if (err) 730 if (err)
714 goto failed; 731 goto failed;
715 } 732 }
@@ -791,8 +808,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
791 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. 808 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
792 */ 809 */
793 jbd_debug(5, "splicing indirect only\n"); 810 jbd_debug(5, "splicing indirect only\n");
794 BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata"); 811 BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
795 err = ext4_journal_dirty_metadata(handle, where->bh); 812 err = ext4_handle_dirty_metadata(handle, inode, where->bh);
796 if (err) 813 if (err)
797 goto err_out; 814 goto err_out;
798 } else { 815 } else {
@@ -839,10 +856,10 @@ err_out:
839 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block 856 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
840 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) 857 * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
841 */ 858 */
842int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, 859static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
843 ext4_lblk_t iblock, unsigned long maxblocks, 860 ext4_lblk_t iblock, unsigned int maxblocks,
844 struct buffer_head *bh_result, 861 struct buffer_head *bh_result,
845 int create, int extend_disksize) 862 int create, int extend_disksize)
846{ 863{
847 int err = -EIO; 864 int err = -EIO;
848 ext4_lblk_t offsets[4]; 865 ext4_lblk_t offsets[4];
@@ -1044,7 +1061,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1044 * It returns the error in case of allocation failure. 1061 * It returns the error in case of allocation failure.
1045 */ 1062 */
1046int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, 1063int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
1047 unsigned long max_blocks, struct buffer_head *bh, 1064 unsigned int max_blocks, struct buffer_head *bh,
1048 int create, int extend_disksize, int flag) 1065 int create, int extend_disksize, int flag)
1049{ 1066{
1050 int retval; 1067 int retval;
@@ -1220,8 +1237,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1220 set_buffer_uptodate(bh); 1237 set_buffer_uptodate(bh);
1221 } 1238 }
1222 unlock_buffer(bh); 1239 unlock_buffer(bh);
1223 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 1240 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1224 err = ext4_journal_dirty_metadata(handle, bh); 1241 err = ext4_handle_dirty_metadata(handle, inode, bh);
1225 if (!fatal) 1242 if (!fatal)
1226 fatal = err; 1243 fatal = err;
1227 } else { 1244 } else {
@@ -1334,6 +1351,10 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
1334 pgoff_t index; 1351 pgoff_t index;
1335 unsigned from, to; 1352 unsigned from, to;
1336 1353
1354 trace_mark(ext4_write_begin,
1355 "dev %s ino %lu pos %llu len %u flags %u",
1356 inode->i_sb->s_id, inode->i_ino,
1357 (unsigned long long) pos, len, flags);
1337 index = pos >> PAGE_CACHE_SHIFT; 1358 index = pos >> PAGE_CACHE_SHIFT;
1338 from = pos & (PAGE_CACHE_SIZE - 1); 1359 from = pos & (PAGE_CACHE_SIZE - 1);
1339 to = from + len; 1360 to = from + len;
@@ -1345,7 +1366,7 @@ retry:
1345 goto out; 1366 goto out;
1346 } 1367 }
1347 1368
1348 page = __grab_cache_page(mapping, index); 1369 page = grab_cache_page_write_begin(mapping, index, flags);
1349 if (!page) { 1370 if (!page) {
1350 ext4_journal_stop(handle); 1371 ext4_journal_stop(handle);
1351 ret = -ENOMEM; 1372 ret = -ENOMEM;
@@ -1386,7 +1407,7 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
1386 if (!buffer_mapped(bh) || buffer_freed(bh)) 1407 if (!buffer_mapped(bh) || buffer_freed(bh))
1387 return 0; 1408 return 0;
1388 set_buffer_uptodate(bh); 1409 set_buffer_uptodate(bh);
1389 return ext4_journal_dirty_metadata(handle, bh); 1410 return ext4_handle_dirty_metadata(handle, NULL, bh);
1390} 1411}
1391 1412
1392/* 1413/*
@@ -1405,6 +1426,10 @@ static int ext4_ordered_write_end(struct file *file,
1405 struct inode *inode = mapping->host; 1426 struct inode *inode = mapping->host;
1406 int ret = 0, ret2; 1427 int ret = 0, ret2;
1407 1428
1429 trace_mark(ext4_ordered_write_end,
1430 "dev %s ino %lu pos %llu len %u copied %u",
1431 inode->i_sb->s_id, inode->i_ino,
1432 (unsigned long long) pos, len, copied);
1408 ret = ext4_jbd2_file_inode(handle, inode); 1433 ret = ext4_jbd2_file_inode(handle, inode);
1409 1434
1410 if (ret == 0) { 1435 if (ret == 0) {
@@ -1443,6 +1468,10 @@ static int ext4_writeback_write_end(struct file *file,
1443 int ret = 0, ret2; 1468 int ret = 0, ret2;
1444 loff_t new_i_size; 1469 loff_t new_i_size;
1445 1470
1471 trace_mark(ext4_writeback_write_end,
1472 "dev %s ino %lu pos %llu len %u copied %u",
1473 inode->i_sb->s_id, inode->i_ino,
1474 (unsigned long long) pos, len, copied);
1446 new_i_size = pos + copied; 1475 new_i_size = pos + copied;
1447 if (new_i_size > EXT4_I(inode)->i_disksize) { 1476 if (new_i_size > EXT4_I(inode)->i_disksize) {
1448 ext4_update_i_disksize(inode, new_i_size); 1477 ext4_update_i_disksize(inode, new_i_size);
@@ -1478,6 +1507,10 @@ static int ext4_journalled_write_end(struct file *file,
1478 unsigned from, to; 1507 unsigned from, to;
1479 loff_t new_i_size; 1508 loff_t new_i_size;
1480 1509
1510 trace_mark(ext4_journalled_write_end,
1511 "dev %s ino %lu pos %llu len %u copied %u",
1512 inode->i_sb->s_id, inode->i_ino,
1513 (unsigned long long) pos, len, copied);
1481 from = pos & (PAGE_CACHE_SIZE - 1); 1514 from = pos & (PAGE_CACHE_SIZE - 1);
1482 to = from + len; 1515 to = from + len;
1483 1516
@@ -1624,7 +1657,7 @@ struct mpage_da_data {
1624 get_block_t *get_block; 1657 get_block_t *get_block;
1625 struct writeback_control *wbc; 1658 struct writeback_control *wbc;
1626 int io_done; 1659 int io_done;
1627 long pages_written; 1660 int pages_written;
1628 int retval; 1661 int retval;
1629}; 1662};
1630 1663
@@ -1644,35 +1677,39 @@ struct mpage_da_data {
1644 */ 1677 */
1645static int mpage_da_submit_io(struct mpage_da_data *mpd) 1678static int mpage_da_submit_io(struct mpage_da_data *mpd)
1646{ 1679{
1647 struct address_space *mapping = mpd->inode->i_mapping;
1648 int ret = 0, err, nr_pages, i;
1649 unsigned long index, end;
1650 struct pagevec pvec;
1651 long pages_skipped; 1680 long pages_skipped;
1681 struct pagevec pvec;
1682 unsigned long index, end;
1683 int ret = 0, err, nr_pages, i;
1684 struct inode *inode = mpd->inode;
1685 struct address_space *mapping = inode->i_mapping;
1652 1686
1653 BUG_ON(mpd->next_page <= mpd->first_page); 1687 BUG_ON(mpd->next_page <= mpd->first_page);
1654 pagevec_init(&pvec, 0); 1688 /*
1689 * We need to start from the first_page to the next_page - 1
1690 * to make sure we also write the mapped dirty buffer_heads.
1691 * If we look at mpd->lbh.b_blocknr we would only be looking
1692 * at the currently mapped buffer_heads.
1693 */
1655 index = mpd->first_page; 1694 index = mpd->first_page;
1656 end = mpd->next_page - 1; 1695 end = mpd->next_page - 1;
1657 1696
1697 pagevec_init(&pvec, 0);
1658 while (index <= end) { 1698 while (index <= end) {
1659 /* 1699 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1660 * We can use PAGECACHE_TAG_DIRTY lookup here because
1661 * even though we have cleared the dirty flag on the page
1662 * We still keep the page in the radix tree with tag
1663 * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io.
1664 * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback
1665 * which is called via the below writepage callback.
1666 */
1667 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
1668 PAGECACHE_TAG_DIRTY,
1669 min(end - index,
1670 (pgoff_t)PAGEVEC_SIZE-1) + 1);
1671 if (nr_pages == 0) 1700 if (nr_pages == 0)
1672 break; 1701 break;
1673 for (i = 0; i < nr_pages; i++) { 1702 for (i = 0; i < nr_pages; i++) {
1674 struct page *page = pvec.pages[i]; 1703 struct page *page = pvec.pages[i];
1675 1704
1705 index = page->index;
1706 if (index > end)
1707 break;
1708 index++;
1709
1710 BUG_ON(!PageLocked(page));
1711 BUG_ON(PageWriteback(page));
1712
1676 pages_skipped = mpd->wbc->pages_skipped; 1713 pages_skipped = mpd->wbc->pages_skipped;
1677 err = mapping->a_ops->writepage(page, mpd->wbc); 1714 err = mapping->a_ops->writepage(page, mpd->wbc);
1678 if (!err && (pages_skipped == mpd->wbc->pages_skipped)) 1715 if (!err && (pages_skipped == mpd->wbc->pages_skipped))
@@ -1830,13 +1867,13 @@ static void ext4_print_free_blocks(struct inode *inode)
1830 ext4_count_free_blocks(inode->i_sb)); 1867 ext4_count_free_blocks(inode->i_sb));
1831 printk(KERN_EMERG "Free/Dirty block details\n"); 1868 printk(KERN_EMERG "Free/Dirty block details\n");
1832 printk(KERN_EMERG "free_blocks=%lld\n", 1869 printk(KERN_EMERG "free_blocks=%lld\n",
1833 percpu_counter_sum(&sbi->s_freeblocks_counter)); 1870 (long long)percpu_counter_sum(&sbi->s_freeblocks_counter));
1834 printk(KERN_EMERG "dirty_blocks=%lld\n", 1871 printk(KERN_EMERG "dirty_blocks=%lld\n",
1835 percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 1872 (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter));
1836 printk(KERN_EMERG "Block reservation details\n"); 1873 printk(KERN_EMERG "Block reservation details\n");
1837 printk(KERN_EMERG "i_reserved_data_blocks=%lu\n", 1874 printk(KERN_EMERG "i_reserved_data_blocks=%u\n",
1838 EXT4_I(inode)->i_reserved_data_blocks); 1875 EXT4_I(inode)->i_reserved_data_blocks);
1839 printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n", 1876 printk(KERN_EMERG "i_reserved_meta_blocks=%u\n",
1840 EXT4_I(inode)->i_reserved_meta_blocks); 1877 EXT4_I(inode)->i_reserved_meta_blocks);
1841 return; 1878 return;
1842} 1879}
@@ -2086,11 +2123,29 @@ static int __mpage_da_writepage(struct page *page,
2086 bh = head; 2123 bh = head;
2087 do { 2124 do {
2088 BUG_ON(buffer_locked(bh)); 2125 BUG_ON(buffer_locked(bh));
2126 /*
2127 * We need to try to allocate
2128 * unmapped blocks in the same page.
2129 * Otherwise we won't make progress
2130 * with the page in ext4_da_writepage
2131 */
2089 if (buffer_dirty(bh) && 2132 if (buffer_dirty(bh) &&
2090 (!buffer_mapped(bh) || buffer_delay(bh))) { 2133 (!buffer_mapped(bh) || buffer_delay(bh))) {
2091 mpage_add_bh_to_extent(mpd, logical, bh); 2134 mpage_add_bh_to_extent(mpd, logical, bh);
2092 if (mpd->io_done) 2135 if (mpd->io_done)
2093 return MPAGE_DA_EXTENT_TAIL; 2136 return MPAGE_DA_EXTENT_TAIL;
2137 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
2138 /*
2139 * mapped dirty buffer. We need to update
2140 * the b_state because we look at
2141 * b_state in mpage_da_map_blocks. We don't
2142 * update b_size because if we find an
2143 * unmapped buffer_head later we need to
2144 * use the b_state flag of that buffer_head.
2145 */
2146 if (mpd->lbh.b_size == 0)
2147 mpd->lbh.b_state =
2148 bh->b_state & BH_FLAGS;
2094 } 2149 }
2095 logical++; 2150 logical++;
2096 } while ((bh = bh->b_this_page) != head); 2151 } while ((bh = bh->b_this_page) != head);
@@ -2268,10 +2323,13 @@ static int ext4_da_writepage(struct page *page,
2268{ 2323{
2269 int ret = 0; 2324 int ret = 0;
2270 loff_t size; 2325 loff_t size;
2271 unsigned long len; 2326 unsigned int len;
2272 struct buffer_head *page_bufs; 2327 struct buffer_head *page_bufs;
2273 struct inode *inode = page->mapping->host; 2328 struct inode *inode = page->mapping->host;
2274 2329
2330 trace_mark(ext4_da_writepage,
2331 "dev %s ino %lu page_index %lu",
2332 inode->i_sb->s_id, inode->i_ino, page->index);
2275 size = i_size_read(inode); 2333 size = i_size_read(inode);
2276 if (page->index == size >> PAGE_CACHE_SHIFT) 2334 if (page->index == size >> PAGE_CACHE_SHIFT)
2277 len = size & ~PAGE_CACHE_MASK; 2335 len = size & ~PAGE_CACHE_MASK;
@@ -2377,10 +2435,25 @@ static int ext4_da_writepages(struct address_space *mapping,
2377 struct mpage_da_data mpd; 2435 struct mpage_da_data mpd;
2378 struct inode *inode = mapping->host; 2436 struct inode *inode = mapping->host;
2379 int no_nrwrite_index_update; 2437 int no_nrwrite_index_update;
2380 long pages_written = 0, pages_skipped; 2438 int pages_written = 0;
2439 long pages_skipped;
2381 int needed_blocks, ret = 0, nr_to_writebump = 0; 2440 int needed_blocks, ret = 0, nr_to_writebump = 0;
2382 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2441 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2383 2442
2443 trace_mark(ext4_da_writepages,
2444 "dev %s ino %lu nr_t_write %ld "
2445 "pages_skipped %ld range_start %llu "
2446 "range_end %llu nonblocking %d "
2447 "for_kupdate %d for_reclaim %d "
2448 "for_writepages %d range_cyclic %d",
2449 inode->i_sb->s_id, inode->i_ino,
2450 wbc->nr_to_write, wbc->pages_skipped,
2451 (unsigned long long) wbc->range_start,
2452 (unsigned long long) wbc->range_end,
2453 wbc->nonblocking, wbc->for_kupdate,
2454 wbc->for_reclaim, wbc->for_writepages,
2455 wbc->range_cyclic);
2456
2384 /* 2457 /*
2385 * No pages to write? This is mainly a kludge to avoid starting 2458 * No pages to write? This is mainly a kludge to avoid starting
2386 * a transaction for special inodes like journal inode on last iput() 2459 * a transaction for special inodes like journal inode on last iput()
@@ -2388,6 +2461,20 @@ static int ext4_da_writepages(struct address_space *mapping,
2388 */ 2461 */
2389 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 2462 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
2390 return 0; 2463 return 0;
2464
2465 /*
2466 * If the filesystem has aborted, it is read-only, so return
2467 * right away instead of dumping stack traces later on that
2468 * will obscure the real source of the problem. We test
 2469 * EXT4_MOUNT_ABORT instead of MS_RDONLY in sb->s_flags because
2470 * the latter could be true if the filesystem is mounted
2471 * read-only, and in that case, ext4_da_writepages should
2472 * *never* be called, so if that ever happens, we would want
2473 * the stack trace.
2474 */
2475 if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT))
2476 return -EROFS;
2477
2391 /* 2478 /*
2392 * Make sure nr_to_write is >= sbi->s_mb_stream_request 2479 * Make sure nr_to_write is >= sbi->s_mb_stream_request
 2393 * This makes sure small file blocks are allocated in 2480 * This makes sure small file blocks are allocated in
@@ -2432,7 +2519,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2432 handle = ext4_journal_start(inode, needed_blocks); 2519 handle = ext4_journal_start(inode, needed_blocks);
2433 if (IS_ERR(handle)) { 2520 if (IS_ERR(handle)) {
2434 ret = PTR_ERR(handle); 2521 ret = PTR_ERR(handle);
2435 printk(KERN_EMERG "%s: jbd2_start: " 2522 printk(KERN_CRIT "%s: jbd2_start: "
2436 "%ld pages, ino %lu; err %d\n", __func__, 2523 "%ld pages, ino %lu; err %d\n", __func__,
2437 wbc->nr_to_write, inode->i_ino, ret); 2524 wbc->nr_to_write, inode->i_ino, ret);
2438 dump_stack(); 2525 dump_stack();
@@ -2485,6 +2572,14 @@ out_writepages:
2485 if (!no_nrwrite_index_update) 2572 if (!no_nrwrite_index_update)
2486 wbc->no_nrwrite_index_update = 0; 2573 wbc->no_nrwrite_index_update = 0;
2487 wbc->nr_to_write -= nr_to_writebump; 2574 wbc->nr_to_write -= nr_to_writebump;
2575 trace_mark(ext4_da_writepage_result,
2576 "dev %s ino %lu ret %d pages_written %d "
2577 "pages_skipped %ld congestion %d "
2578 "more_io %d no_nrwrite_index_update %d",
2579 inode->i_sb->s_id, inode->i_ino, ret,
2580 pages_written, wbc->pages_skipped,
2581 wbc->encountered_congestion, wbc->more_io,
2582 wbc->no_nrwrite_index_update);
2488 return ret; 2583 return ret;
2489} 2584}
2490 2585
@@ -2497,7 +2592,7 @@ static int ext4_nonda_switch(struct super_block *sb)
2497 /* 2592 /*
2498 * switch to non delalloc mode if we are running low 2593 * switch to non delalloc mode if we are running low
2499 * on free block. The free block accounting via percpu 2594 * on free block. The free block accounting via percpu
2500 * counters can get slightly wrong with FBC_BATCH getting 2595 * counters can get slightly wrong with percpu_counter_batch getting
2501 * accumulated on each CPU without updating global counters 2596 * accumulated on each CPU without updating global counters
 2502 * Delalloc needs an accurate free block accounting. So switch 2597 * Delalloc needs an accurate free block accounting. So switch
2503 * to non delalloc when we are near to error range. 2598 * to non delalloc when we are near to error range.
@@ -2536,6 +2631,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2536 len, flags, pagep, fsdata); 2631 len, flags, pagep, fsdata);
2537 } 2632 }
2538 *fsdata = (void *)0; 2633 *fsdata = (void *)0;
2634
2635 trace_mark(ext4_da_write_begin,
2636 "dev %s ino %lu pos %llu len %u flags %u",
2637 inode->i_sb->s_id, inode->i_ino,
2638 (unsigned long long) pos, len, flags);
2539retry: 2639retry:
2540 /* 2640 /*
2541 * With delayed allocation, we don't log the i_disksize update 2641 * With delayed allocation, we don't log the i_disksize update
@@ -2549,7 +2649,7 @@ retry:
2549 goto out; 2649 goto out;
2550 } 2650 }
2551 2651
2552 page = __grab_cache_page(mapping, index); 2652 page = grab_cache_page_write_begin(mapping, index, flags);
2553 if (!page) { 2653 if (!page) {
2554 ext4_journal_stop(handle); 2654 ext4_journal_stop(handle);
2555 ret = -ENOMEM; 2655 ret = -ENOMEM;
@@ -2625,6 +2725,10 @@ static int ext4_da_write_end(struct file *file,
2625 } 2725 }
2626 } 2726 }
2627 2727
2728 trace_mark(ext4_da_write_end,
2729 "dev %s ino %lu pos %llu len %u copied %u",
2730 inode->i_sb->s_id, inode->i_ino,
2731 (unsigned long long) pos, len, copied);
2628 start = pos & (PAGE_CACHE_SIZE - 1); 2732 start = pos & (PAGE_CACHE_SIZE - 1);
2629 end = start + copied - 1; 2733 end = start + copied - 1;
2630 2734
@@ -2717,7 +2821,10 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
2717 filemap_write_and_wait(mapping); 2821 filemap_write_and_wait(mapping);
2718 } 2822 }
2719 2823
2720 if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { 2824 BUG_ON(!EXT4_JOURNAL(inode) &&
2825 EXT4_I(inode)->i_state & EXT4_STATE_JDATA);
2826
2827 if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
2721 /* 2828 /*
2722 * This is a REALLY heavyweight approach, but the use of 2829 * This is a REALLY heavyweight approach, but the use of
2723 * bmap on dirty files is expected to be extremely rare: 2830 * bmap on dirty files is expected to be extremely rare:
@@ -2835,6 +2942,9 @@ static int ext4_normal_writepage(struct page *page,
2835 loff_t size = i_size_read(inode); 2942 loff_t size = i_size_read(inode);
2836 loff_t len; 2943 loff_t len;
2837 2944
2945 trace_mark(ext4_normal_writepage,
2946 "dev %s ino %lu page_index %lu",
2947 inode->i_sb->s_id, inode->i_ino, page->index);
2838 J_ASSERT(PageLocked(page)); 2948 J_ASSERT(PageLocked(page));
2839 if (page->index == size >> PAGE_CACHE_SHIFT) 2949 if (page->index == size >> PAGE_CACHE_SHIFT)
2840 len = size & ~PAGE_CACHE_MASK; 2950 len = size & ~PAGE_CACHE_MASK;
@@ -2920,6 +3030,9 @@ static int ext4_journalled_writepage(struct page *page,
2920 loff_t size = i_size_read(inode); 3030 loff_t size = i_size_read(inode);
2921 loff_t len; 3031 loff_t len;
2922 3032
3033 trace_mark(ext4_journalled_writepage,
3034 "dev %s ino %lu page_index %lu",
3035 inode->i_sb->s_id, inode->i_ino, page->index);
2923 J_ASSERT(PageLocked(page)); 3036 J_ASSERT(PageLocked(page));
2924 if (page->index == size >> PAGE_CACHE_SHIFT) 3037 if (page->index == size >> PAGE_CACHE_SHIFT)
2925 len = size & ~PAGE_CACHE_MASK; 3038 len = size & ~PAGE_CACHE_MASK;
@@ -2988,7 +3101,10 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset)
2988 if (offset == 0) 3101 if (offset == 0)
2989 ClearPageChecked(page); 3102 ClearPageChecked(page);
2990 3103
2991 jbd2_journal_invalidatepage(journal, page, offset); 3104 if (journal)
3105 jbd2_journal_invalidatepage(journal, page, offset);
3106 else
3107 block_invalidatepage(page, offset);
2992} 3108}
2993 3109
2994static int ext4_releasepage(struct page *page, gfp_t wait) 3110static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -2998,7 +3114,10 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
2998 WARN_ON(PageChecked(page)); 3114 WARN_ON(PageChecked(page));
2999 if (!page_has_buffers(page)) 3115 if (!page_has_buffers(page))
3000 return 0; 3116 return 0;
3001 return jbd2_journal_try_to_free_buffers(journal, page, wait); 3117 if (journal)
3118 return jbd2_journal_try_to_free_buffers(journal, page, wait);
3119 else
3120 return try_to_free_buffers(page);
3002} 3121}
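
The invalidatepage and releasepage hunks above both follow the pattern this series introduces for no-journal operation: call into jbd2 only when the inode actually has a journal, otherwise fall back to the generic buffer-layer helper. For context, EXT4_JOURNAL() (paraphrased from ext4_jbd2.h in the same series) is just the superblock's journal pointer, which is NULL on a journal-less ext4:

	#define EXT4_JOURNAL(inode) (EXT4_SB((inode)->i_sb)->s_journal)
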
3003 3122
3004/* 3123/*
@@ -3270,7 +3389,7 @@ int ext4_block_truncate_page(handle_t *handle,
3270 3389
3271 err = 0; 3390 err = 0;
3272 if (ext4_should_journal_data(inode)) { 3391 if (ext4_should_journal_data(inode)) {
3273 err = ext4_journal_dirty_metadata(handle, bh); 3392 err = ext4_handle_dirty_metadata(handle, inode, bh);
3274 } else { 3393 } else {
3275 if (ext4_should_order_data(inode)) 3394 if (ext4_should_order_data(inode))
3276 err = ext4_jbd2_file_inode(handle, inode); 3395 err = ext4_jbd2_file_inode(handle, inode);
@@ -3394,8 +3513,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
3394 __le32 *p; 3513 __le32 *p;
3395 if (try_to_extend_transaction(handle, inode)) { 3514 if (try_to_extend_transaction(handle, inode)) {
3396 if (bh) { 3515 if (bh) {
3397 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 3516 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
3398 ext4_journal_dirty_metadata(handle, bh); 3517 ext4_handle_dirty_metadata(handle, inode, bh);
3399 } 3518 }
3400 ext4_mark_inode_dirty(handle, inode); 3519 ext4_mark_inode_dirty(handle, inode);
3401 ext4_journal_test_restart(handle, inode); 3520 ext4_journal_test_restart(handle, inode);
@@ -3495,7 +3614,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
3495 count, block_to_free_p, p); 3614 count, block_to_free_p, p);
3496 3615
3497 if (this_bh) { 3616 if (this_bh) {
3498 BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata"); 3617 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
3499 3618
3500 /* 3619 /*
3501 * The buffer head should have an attached journal head at this 3620 * The buffer head should have an attached journal head at this
@@ -3504,7 +3623,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
3504 * the block was cleared. Check for this instead of OOPSing. 3623 * the block was cleared. Check for this instead of OOPSing.
3505 */ 3624 */
3506 if (bh2jh(this_bh)) 3625 if (bh2jh(this_bh))
3507 ext4_journal_dirty_metadata(handle, this_bh); 3626 ext4_handle_dirty_metadata(handle, inode, this_bh);
3508 else 3627 else
3509 ext4_error(inode->i_sb, __func__, 3628 ext4_error(inode->i_sb, __func__,
3510 "circular indirect block detected, " 3629 "circular indirect block detected, "
@@ -3534,7 +3653,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
3534 ext4_fsblk_t nr; 3653 ext4_fsblk_t nr;
3535 __le32 *p; 3654 __le32 *p;
3536 3655
3537 if (is_handle_aborted(handle)) 3656 if (ext4_handle_is_aborted(handle))
3538 return; 3657 return;
3539 3658
3540 if (depth--) { 3659 if (depth--) {
@@ -3604,7 +3723,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
3604 * will merely complain about releasing a free block, 3723 * will merely complain about releasing a free block,
3605 * rather than leaking blocks. 3724 * rather than leaking blocks.
3606 */ 3725 */
3607 if (is_handle_aborted(handle)) 3726 if (ext4_handle_is_aborted(handle))
3608 return; 3727 return;
3609 if (try_to_extend_transaction(handle, inode)) { 3728 if (try_to_extend_transaction(handle, inode)) {
3610 ext4_mark_inode_dirty(handle, inode); 3729 ext4_mark_inode_dirty(handle, inode);
@@ -3623,9 +3742,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
3623 parent_bh)){ 3742 parent_bh)){
3624 *p = 0; 3743 *p = 0;
3625 BUFFER_TRACE(parent_bh, 3744 BUFFER_TRACE(parent_bh,
3626 "call ext4_journal_dirty_metadata"); 3745 "call ext4_handle_dirty_metadata");
3627 ext4_journal_dirty_metadata(handle, 3746 ext4_handle_dirty_metadata(handle,
3628 parent_bh); 3747 inode,
3748 parent_bh);
3629 } 3749 }
3630 } 3750 }
3631 } 3751 }
@@ -3813,7 +3933,7 @@ do_indirects:
3813 * synchronous 3933 * synchronous
3814 */ 3934 */
3815 if (IS_SYNC(inode)) 3935 if (IS_SYNC(inode))
3816 handle->h_sync = 1; 3936 ext4_handle_sync(handle);
3817out_stop: 3937out_stop:
3818 /* 3938 /*
3819 * If this was a simple ftruncate(), and the file will remain alive 3939 * If this was a simple ftruncate(), and the file will remain alive
@@ -3843,7 +3963,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
3843 ext4_fsblk_t block; 3963 ext4_fsblk_t block;
3844 int inodes_per_block, inode_offset; 3964 int inodes_per_block, inode_offset;
3845 3965
3846 iloc->bh = 0; 3966 iloc->bh = NULL;
3847 if (!ext4_valid_inum(sb, inode->i_ino)) 3967 if (!ext4_valid_inum(sb, inode->i_ino))
3848 return -EIO; 3968 return -EIO;
3849 3969
@@ -3950,7 +4070,7 @@ make_io:
3950 num = EXT4_INODES_PER_GROUP(sb); 4070 num = EXT4_INODES_PER_GROUP(sb);
3951 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 4071 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3952 EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) 4072 EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
3953 num -= le16_to_cpu(gdp->bg_itable_unused); 4073 num -= ext4_itable_unused_count(sb, gdp);
3954 table += num / inodes_per_block; 4074 table += num / inodes_per_block;
3955 if (end > table) 4075 if (end > table)
3956 end = table; 4076 end = table;
@@ -4164,9 +4284,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4164 inode->i_op = &ext4_dir_inode_operations; 4284 inode->i_op = &ext4_dir_inode_operations;
4165 inode->i_fop = &ext4_dir_operations; 4285 inode->i_fop = &ext4_dir_operations;
4166 } else if (S_ISLNK(inode->i_mode)) { 4286 } else if (S_ISLNK(inode->i_mode)) {
4167 if (ext4_inode_is_fast_symlink(inode)) 4287 if (ext4_inode_is_fast_symlink(inode)) {
4168 inode->i_op = &ext4_fast_symlink_inode_operations; 4288 inode->i_op = &ext4_fast_symlink_inode_operations;
4169 else { 4289 nd_terminate_link(ei->i_data, inode->i_size,
4290 sizeof(ei->i_data) - 1);
4291 } else {
4170 inode->i_op = &ext4_symlink_inode_operations; 4292 inode->i_op = &ext4_symlink_inode_operations;
4171 ext4_set_aops(inode); 4293 ext4_set_aops(inode);
4172 } 4294 }
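
nd_terminate_link() NUL-terminates the fast-symlink target stored inline in ei->i_data, so a corrupt on-disk size cannot make link traversal read past the buffer. Its definition in include/linux/namei.h of this era is essentially (paraphrased):

	static inline void nd_terminate_link(void *name, size_t len,
					     size_t maxlen)
	{
		((char *) name)[min(len, maxlen)] = '\0';
	}
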
@@ -4310,8 +4432,8 @@ static int ext4_do_update_inode(handle_t *handle,
4310 EXT4_SET_RO_COMPAT_FEATURE(sb, 4432 EXT4_SET_RO_COMPAT_FEATURE(sb,
4311 EXT4_FEATURE_RO_COMPAT_LARGE_FILE); 4433 EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
4312 sb->s_dirt = 1; 4434 sb->s_dirt = 1;
4313 handle->h_sync = 1; 4435 ext4_handle_sync(handle);
4314 err = ext4_journal_dirty_metadata(handle, 4436 err = ext4_handle_dirty_metadata(handle, inode,
4315 EXT4_SB(sb)->s_sbh); 4437 EXT4_SB(sb)->s_sbh);
4316 } 4438 }
4317 } 4439 }
@@ -4338,9 +4460,8 @@ static int ext4_do_update_inode(handle_t *handle,
4338 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 4460 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
4339 } 4461 }
4340 4462
4341 4463 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4342 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 4464 rc = ext4_handle_dirty_metadata(handle, inode, bh);
4343 rc = ext4_journal_dirty_metadata(handle, bh);
4344 if (!err) 4465 if (!err)
4345 err = rc; 4466 err = rc;
4346 ei->i_state &= ~EXT4_STATE_NEW; 4467 ei->i_state &= ~EXT4_STATE_NEW;
@@ -4403,6 +4524,25 @@ int ext4_write_inode(struct inode *inode, int wait)
4403 return ext4_force_commit(inode->i_sb); 4524 return ext4_force_commit(inode->i_sb);
4404} 4525}
4405 4526
4527int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh)
4528{
4529 int err = 0;
4530
4531 mark_buffer_dirty(bh);
4532 if (inode && inode_needs_sync(inode)) {
4533 sync_dirty_buffer(bh);
4534 if (buffer_req(bh) && !buffer_uptodate(bh)) {
4535 ext4_error(inode->i_sb, __func__,
4536 "IO error syncing inode, "
4537 "inode=%lu, block=%llu",
4538 inode->i_ino,
4539 (unsigned long long)bh->b_blocknr);
4540 err = -EIO;
4541 }
4542 }
4543 return err;
4544}
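
The new helper gives journal-less code a way to push modified metadata straight to disk, honouring O_SYNC-style inodes via inode_needs_sync(). A hypothetical caller for illustration (the function and the mutation are invented):

	static int example_stamp_block(struct inode *inode,
				       struct buffer_head *bh, __le32 val)
	{
		*(__le32 *) bh->b_data = val;	/* mutate the metadata block */
		/* no handle to log against: write through instead */
		return __ext4_write_dirty_metadata(inode, bh);
	}
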
4545
4406/* 4546/*
4407 * ext4_setattr() 4547 * ext4_setattr()
4408 * 4548 *
@@ -4707,16 +4847,15 @@ int
4707ext4_reserve_inode_write(handle_t *handle, struct inode *inode, 4847ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
4708 struct ext4_iloc *iloc) 4848 struct ext4_iloc *iloc)
4709{ 4849{
4710 int err = 0; 4850 int err;
4711 if (handle) { 4851
4712 err = ext4_get_inode_loc(inode, iloc); 4852 err = ext4_get_inode_loc(inode, iloc);
4713 if (!err) { 4853 if (!err) {
4714 BUFFER_TRACE(iloc->bh, "get_write_access"); 4854 BUFFER_TRACE(iloc->bh, "get_write_access");
4715 err = ext4_journal_get_write_access(handle, iloc->bh); 4855 err = ext4_journal_get_write_access(handle, iloc->bh);
4716 if (err) { 4856 if (err) {
4717 brelse(iloc->bh); 4857 brelse(iloc->bh);
4718 iloc->bh = NULL; 4858 iloc->bh = NULL;
4719 }
4720 } 4859 }
4721 } 4860 }
4722 ext4_std_error(inode->i_sb, err); 4861 ext4_std_error(inode->i_sb, err);
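
The if (handle) test could be dropped because, with this series, ext4_journal_start() returns a non-NULL handle even without a journal: in no-journal mode the "handle" is a small fake pointer that encodes a reference count, and ext4_handle_valid() tells the two apart. Roughly, paraphrased from ext4_jbd2.h (the threshold constant's exact value is from memory):

	#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)

	/* not for NULL handles: distinguishes a jbd2 handle from a fake one */
	static inline int ext4_handle_valid(handle_t *handle)
	{
		if ((unsigned long) handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
			return 0;
		return 1;
	}
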
@@ -4788,7 +4927,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
4788 4927
4789 might_sleep(); 4928 might_sleep();
4790 err = ext4_reserve_inode_write(handle, inode, &iloc); 4929 err = ext4_reserve_inode_write(handle, inode, &iloc);
4791 if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 4930 if (ext4_handle_valid(handle) &&
4931 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
4792 !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { 4932 !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
4793 /* 4933 /*
4794 * We need extra buffer credits since we may write into EA block 4934 * We need extra buffer credits since we may write into EA block
@@ -4840,6 +4980,11 @@ void ext4_dirty_inode(struct inode *inode)
4840 handle_t *current_handle = ext4_journal_current_handle(); 4980 handle_t *current_handle = ext4_journal_current_handle();
4841 handle_t *handle; 4981 handle_t *handle;
4842 4982
4983 if (!ext4_handle_valid(current_handle)) {
4984 ext4_mark_inode_dirty(current_handle, inode);
4985 return;
4986 }
4987
4843 handle = ext4_journal_start(inode, 2); 4988 handle = ext4_journal_start(inode, 2);
4844 if (IS_ERR(handle)) 4989 if (IS_ERR(handle))
4845 goto out; 4990 goto out;
@@ -4877,8 +5022,9 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode)
4877 BUFFER_TRACE(iloc.bh, "get_write_access"); 5022 BUFFER_TRACE(iloc.bh, "get_write_access");
4878 err = jbd2_journal_get_write_access(handle, iloc.bh); 5023 err = jbd2_journal_get_write_access(handle, iloc.bh);
4879 if (!err) 5024 if (!err)
4880 err = ext4_journal_dirty_metadata(handle, 5025 err = ext4_handle_dirty_metadata(handle,
4881 iloc.bh); 5026 inode,
5027 iloc.bh);
4882 brelse(iloc.bh); 5028 brelse(iloc.bh);
4883 } 5029 }
4884 } 5030 }
@@ -4904,6 +5050,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
4904 */ 5050 */
4905 5051
4906 journal = EXT4_JOURNAL(inode); 5052 journal = EXT4_JOURNAL(inode);
5053 if (!journal)
5054 return 0;
4907 if (is_journal_aborted(journal)) 5055 if (is_journal_aborted(journal))
4908 return -EROFS; 5056 return -EROFS;
4909 5057
@@ -4933,7 +5081,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
4933 return PTR_ERR(handle); 5081 return PTR_ERR(handle);
4934 5082
4935 err = ext4_mark_inode_dirty(handle, inode); 5083 err = ext4_mark_inode_dirty(handle, inode);
4936 handle->h_sync = 1; 5084 ext4_handle_sync(handle);
4937 ext4_journal_stop(handle); 5085 ext4_journal_stop(handle);
4938 ext4_std_error(inode->i_sb, err); 5086 ext4_std_error(inode->i_sb, err);
4939 5087
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index dc99b4776d58..42dc83fb247a 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -99,7 +99,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
99 goto flags_out; 99 goto flags_out;
100 } 100 }
101 if (IS_SYNC(inode)) 101 if (IS_SYNC(inode))
102 handle->h_sync = 1; 102 ext4_handle_sync(handle);
103 err = ext4_reserve_inode_write(handle, inode, &iloc); 103 err = ext4_reserve_inode_write(handle, inode, &iloc);
104 if (err) 104 if (err)
105 goto flags_err; 105 goto flags_err;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 444ad998f72e..918aec0c8a11 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -100,7 +100,7 @@
100 * inode as: 100 * inode as:
101 * 101 *
102 * { page } 102 * { page }
103 * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... 103 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
104 * 104 *
105 * 105 *
106 * one block each for bitmap and buddy information. So for each group we 106 * one block each for bitmap and buddy information. So for each group we
@@ -330,6 +330,18 @@
330 * object 330 * object
331 * 331 *
332 */ 332 */
333static struct kmem_cache *ext4_pspace_cachep;
334static struct kmem_cache *ext4_ac_cachep;
335static struct kmem_cache *ext4_free_ext_cachep;
336static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
337 ext4_group_t group);
338static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
339 ext4_group_t group);
340static int ext4_mb_init_per_dev_proc(struct super_block *sb);
341static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
342static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
343
344
333 345
334static inline void *mb_correct_addr_and_bit(int *bit, void *addr) 346static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
335{ 347{
@@ -445,9 +457,9 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
445 blocknr += first + i; 457 blocknr += first + i;
446 blocknr += 458 blocknr +=
447 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 459 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
448 460 ext4_grp_locked_error(sb, e4b->bd_group,
449 ext4_error(sb, __func__, "double-free of inode" 461 __func__, "double-free of inode"
450 " %lu's block %llu(bit %u in group %lu)\n", 462 " %lu's block %llu(bit %u in group %u)",
451 inode ? inode->i_ino : 0, blocknr, 463 inode ? inode->i_ino : 0, blocknr,
452 first + i, e4b->bd_group); 464 first + i, e4b->bd_group);
453 } 465 }
@@ -477,7 +489,7 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
477 b2 = (unsigned char *) bitmap; 489 b2 = (unsigned char *) bitmap;
478 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { 490 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
479 if (b1[i] != b2[i]) { 491 if (b1[i] != b2[i]) {
480 printk(KERN_ERR "corruption in group %lu " 492 printk(KERN_ERR "corruption in group %u "
481 "at byte %u(%u): %x in copy != %x " 493 "at byte %u(%u): %x in copy != %x "
482 "on disk/prealloc\n", 494 "on disk/prealloc\n",
483 e4b->bd_group, i, i * 8, b1[i], b2[i]); 495 e4b->bd_group, i, i * 8, b1[i], b2[i]);
@@ -690,8 +702,8 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
690 grp->bb_fragments = fragments; 702 grp->bb_fragments = fragments;
691 703
692 if (free != grp->bb_free) { 704 if (free != grp->bb_free) {
693 ext4_error(sb, __func__, 705 ext4_grp_locked_error(sb, group, __func__,
694 "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n", 706 "EXT4-fs: group %u: %u blocks in bitmap, %u in gd",
695 group, free, grp->bb_free); 707 group, free, grp->bb_free);
696 /* 708 /*
697 * If we intend to continue, we consider the group descriptor 709 * If we intend to continue, we consider the group descriptor
@@ -716,7 +728,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
716 * stored in the inode as 728 * stored in the inode as
717 * 729 *
718 * { page } 730 * { page }
719 * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... 731 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
720 * 732 *
721 * 733 *
722 * one block each for bitmap and buddy information. 734 * one block each for bitmap and buddy information.
@@ -782,25 +794,45 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
782 if (bh[i] == NULL) 794 if (bh[i] == NULL)
783 goto out; 795 goto out;
784 796
785 if (buffer_uptodate(bh[i]) && 797 if (bitmap_uptodate(bh[i]))
786 !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
787 continue; 798 continue;
788 799
789 lock_buffer(bh[i]); 800 lock_buffer(bh[i]);
801 if (bitmap_uptodate(bh[i])) {
802 unlock_buffer(bh[i]);
803 continue;
804 }
790 spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 805 spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
791 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 806 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
792 ext4_init_block_bitmap(sb, bh[i], 807 ext4_init_block_bitmap(sb, bh[i],
793 first_group + i, desc); 808 first_group + i, desc);
809 set_bitmap_uptodate(bh[i]);
794 set_buffer_uptodate(bh[i]); 810 set_buffer_uptodate(bh[i]);
795 unlock_buffer(bh[i]);
796 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 811 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
812 unlock_buffer(bh[i]);
797 continue; 813 continue;
798 } 814 }
799 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); 815 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
816 if (buffer_uptodate(bh[i])) {
817 /*
818 * if the group is not uninit and bh is uptodate,
819 * the bitmap is also uptodate
820 */
821 set_bitmap_uptodate(bh[i]);
822 unlock_buffer(bh[i]);
823 continue;
824 }
800 get_bh(bh[i]); 825 get_bh(bh[i]);
826 /*
827 * submit the buffer_head for read. We can
828 * safely mark the bitmap as uptodate now.
829 * We do it here so the bitmap uptodate bit
830 * gets set with the buffer lock held.
831 */
832 set_bitmap_uptodate(bh[i]);
801 bh[i]->b_end_io = end_buffer_read_sync; 833 bh[i]->b_end_io = end_buffer_read_sync;
802 submit_bh(READ, bh[i]); 834 submit_bh(READ, bh[i]);
803 mb_debug("read bitmap for group %lu\n", first_group + i); 835 mb_debug("read bitmap for group %u\n", first_group + i);
804 } 836 }
805 837
806 /* wait for I/O completion */ 838 /* wait for I/O completion */
@@ -814,6 +846,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
814 846
815 err = 0; 847 err = 0;
816 first_block = page->index * blocks_per_page; 848 first_block = page->index * blocks_per_page;
849 /* init the page */
850 memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
817 for (i = 0; i < blocks_per_page; i++) { 851 for (i = 0; i < blocks_per_page; i++) {
818 int group; 852 int group;
819 struct ext4_group_info *grinfo; 853 struct ext4_group_info *grinfo;
@@ -840,7 +874,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
840 BUG_ON(incore == NULL); 874 BUG_ON(incore == NULL);
841 mb_debug("put buddy for group %u in page %lu/%x\n", 875 mb_debug("put buddy for group %u in page %lu/%x\n",
842 group, page->index, i * blocksize); 876 group, page->index, i * blocksize);
843 memset(data, 0xff, blocksize);
844 grinfo = ext4_get_group_info(sb, group); 877 grinfo = ext4_get_group_info(sb, group);
845 grinfo->bb_fragments = 0; 878 grinfo->bb_fragments = 0;
846 memset(grinfo->bb_counters, 0, 879 memset(grinfo->bb_counters, 0,
@@ -848,7 +881,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
848 /* 881 /*
849 * incore got set to the group block bitmap below 882 * incore got set to the group block bitmap below
850 */ 883 */
884 ext4_lock_group(sb, group);
851 ext4_mb_generate_buddy(sb, data, incore, group); 885 ext4_mb_generate_buddy(sb, data, incore, group);
886 ext4_unlock_group(sb, group);
852 incore = NULL; 887 incore = NULL;
853 } else { 888 } else {
854 /* this is block of bitmap */ 889 /* this is block of bitmap */
@@ -862,6 +897,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
862 897
863 /* mark all preallocated blks used in in-core bitmap */ 898 /* mark all preallocated blks used in in-core bitmap */
864 ext4_mb_generate_from_pa(sb, data, group); 899 ext4_mb_generate_from_pa(sb, data, group);
900 ext4_mb_generate_from_freelist(sb, data, group);
865 ext4_unlock_group(sb, group); 901 ext4_unlock_group(sb, group);
866 902
867 /* set incore so that the buddy information can be 903 /* set incore so that the buddy information can be
@@ -886,18 +922,20 @@ static noinline_for_stack int
886ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 922ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
887 struct ext4_buddy *e4b) 923 struct ext4_buddy *e4b)
888{ 924{
889 struct ext4_sb_info *sbi = EXT4_SB(sb);
890 struct inode *inode = sbi->s_buddy_cache;
891 int blocks_per_page; 925 int blocks_per_page;
892 int block; 926 int block;
893 int pnum; 927 int pnum;
894 int poff; 928 int poff;
895 struct page *page; 929 struct page *page;
896 int ret; 930 int ret;
931 struct ext4_group_info *grp;
932 struct ext4_sb_info *sbi = EXT4_SB(sb);
933 struct inode *inode = sbi->s_buddy_cache;
897 934
898 mb_debug("load group %lu\n", group); 935 mb_debug("load group %u\n", group);
899 936
900 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 937 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
938 grp = ext4_get_group_info(sb, group);
901 939
902 e4b->bd_blkbits = sb->s_blocksize_bits; 940 e4b->bd_blkbits = sb->s_blocksize_bits;
903 e4b->bd_info = ext4_get_group_info(sb, group); 941 e4b->bd_info = ext4_get_group_info(sb, group);
@@ -905,6 +943,15 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
905 e4b->bd_group = group; 943 e4b->bd_group = group;
906 e4b->bd_buddy_page = NULL; 944 e4b->bd_buddy_page = NULL;
907 e4b->bd_bitmap_page = NULL; 945 e4b->bd_bitmap_page = NULL;
946 e4b->alloc_semp = &grp->alloc_sem;
947
948 /* Take the read lock on the group alloc
949 * sem. This makes sure a parallel
950 * ext4_mb_init_group happening on other
951 * groups mapped by the page is blocked
952 * till we are done with the allocation
953 */
954 down_read(e4b->alloc_semp);
908 955
909 /* 956 /*
910 * the buddy cache inode stores the block bitmap 957 * the buddy cache inode stores the block bitmap
@@ -920,6 +967,14 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
920 page = find_get_page(inode->i_mapping, pnum); 967 page = find_get_page(inode->i_mapping, pnum);
921 if (page == NULL || !PageUptodate(page)) { 968 if (page == NULL || !PageUptodate(page)) {
922 if (page) 969 if (page)
970 /*
971 * drop the page reference and try
972 * to get the page with the lock held. If the
973 * page is not uptodate, that implies
974 * somebody just created the page but
975 * has not yet initialized it. So
976 * wait for it to be initialized.
977 */
923 page_cache_release(page); 978 page_cache_release(page);
924 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 979 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
925 if (page) { 980 if (page) {
@@ -985,6 +1040,9 @@ err:
985 page_cache_release(e4b->bd_buddy_page); 1040 page_cache_release(e4b->bd_buddy_page);
986 e4b->bd_buddy = NULL; 1041 e4b->bd_buddy = NULL;
987 e4b->bd_bitmap = NULL; 1042 e4b->bd_bitmap = NULL;
1043
1044 /* Done with the buddy cache */
1045 up_read(e4b->alloc_semp);
988 return ret; 1046 return ret;
989} 1047}
990 1048
@@ -994,6 +1052,9 @@ static void ext4_mb_release_desc(struct ext4_buddy *e4b)
994 page_cache_release(e4b->bd_bitmap_page); 1052 page_cache_release(e4b->bd_bitmap_page);
995 if (e4b->bd_buddy_page) 1053 if (e4b->bd_buddy_page)
996 page_cache_release(e4b->bd_buddy_page); 1054 page_cache_release(e4b->bd_buddy_page);
1055 /* Done with the buddy cache */
1056 if (e4b->alloc_semp)
1057 up_read(e4b->alloc_semp);
997} 1058}
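
With alloc_semp in play, ext4_mb_load_buddy() and ext4_mb_release_desc() now also bracket a read-side critical section on the group's alloc_sem, so every load must be paired with a release. A hypothetical caller showing the expected pairing:

	static void example_scan_group(struct super_block *sb, ext4_group_t group)
	{
		struct ext4_buddy e4b;

		if (ext4_mb_load_buddy(sb, group, &e4b)) /* down_read(alloc_sem) */
			return;
		/* ... inspect or modify the in-core bitmap/buddy ... */
		ext4_mb_release_desc(&e4b);		 /* up_read(alloc_sem) */
	}
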
998 1059
999 1060
@@ -1031,7 +1092,10 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
1031 cur += 32; 1092 cur += 32;
1032 continue; 1093 continue;
1033 } 1094 }
1034 mb_clear_bit_atomic(lock, cur, bm); 1095 if (lock)
1096 mb_clear_bit_atomic(lock, cur, bm);
1097 else
1098 mb_clear_bit(cur, bm);
1035 cur++; 1099 cur++;
1036 } 1100 }
1037} 1101}
@@ -1049,7 +1113,10 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
1049 cur += 32; 1113 cur += 32;
1050 continue; 1114 continue;
1051 } 1115 }
1052 mb_set_bit_atomic(lock, cur, bm); 1116 if (lock)
1117 mb_set_bit_atomic(lock, cur, bm);
1118 else
1119 mb_set_bit(cur, bm);
1053 cur++; 1120 cur++;
1054 } 1121 }
1055} 1122}
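
The NULL-lock variants let a caller that is already serialized (typically by ext4_lock_group()) use the cheaper non-atomic bit operations. A hypothetical snippet illustrating the two conventions:

	static void example_mark_range(struct super_block *sb, ext4_group_t group,
				       void *bitmap, int start, int len)
	{
		/* already serialized by the group lock: non-atomic path */
		ext4_lock_group(sb, group);
		mb_set_bits(NULL, bitmap, start, len);
		ext4_unlock_group(sb, group);
	}

	/* unserialized callers instead pass the per-group spinlock:
	 *	mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), bitmap, start, len);
	 */
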
@@ -1094,12 +1161,11 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1094 blocknr += block; 1161 blocknr += block;
1095 blocknr += 1162 blocknr +=
1096 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 1163 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
1097 ext4_unlock_group(sb, e4b->bd_group); 1164 ext4_grp_locked_error(sb, e4b->bd_group,
1098 ext4_error(sb, __func__, "double-free of inode" 1165 __func__, "double-free of inode"
1099 " %lu's block %llu(bit %u in group %lu)\n", 1166 " %lu's block %llu(bit %u in group %u)",
1100 inode ? inode->i_ino : 0, blocknr, block, 1167 inode ? inode->i_ino : 0, blocknr, block,
1101 e4b->bd_group); 1168 e4b->bd_group);
1102 ext4_lock_group(sb, e4b->bd_group);
1103 } 1169 }
1104 mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); 1170 mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
1105 e4b->bd_info->bb_counters[order]++; 1171 e4b->bd_info->bb_counters[order]++;
@@ -1296,13 +1362,20 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1296 ac->ac_tail = ret & 0xffff; 1362 ac->ac_tail = ret & 0xffff;
1297 ac->ac_buddy = ret >> 16; 1363 ac->ac_buddy = ret >> 16;
1298 1364
1299 /* XXXXXXX: SUCH A HORRIBLE **CK */ 1365 /*
1300 /*FIXME!! Why ? */ 1366 * take the page reference. We want the page to be pinned
1367 * so that we don't get an ext4_mb_init_cache call for this
1368 * group until we update the bitmap. Otherwise we
1369 * could double-allocate blocks. The reference is dropped
1370 * in ext4_mb_release_context
1371 */
1301 ac->ac_bitmap_page = e4b->bd_bitmap_page; 1372 ac->ac_bitmap_page = e4b->bd_bitmap_page;
1302 get_page(ac->ac_bitmap_page); 1373 get_page(ac->ac_bitmap_page);
1303 ac->ac_buddy_page = e4b->bd_buddy_page; 1374 ac->ac_buddy_page = e4b->bd_buddy_page;
1304 get_page(ac->ac_buddy_page); 1375 get_page(ac->ac_buddy_page);
1305 1376 /* on allocation we use ac to track the held semaphore */
1377 ac->alloc_semp = e4b->alloc_semp;
1378 e4b->alloc_semp = NULL;
1306 /* store last allocated for subsequent stream allocation */ 1379 /* store last allocated for subsequent stream allocation */
1307 if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { 1380 if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
1308 spin_lock(&sbi->s_md_lock); 1381 spin_lock(&sbi->s_md_lock);
@@ -1326,6 +1399,8 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
1326 struct ext4_free_extent ex; 1399 struct ext4_free_extent ex;
1327 int max; 1400 int max;
1328 1401
1402 if (ac->ac_status == AC_STATUS_FOUND)
1403 return;
1329 /* 1404 /*
1330 * We don't want to scan for a whole year 1405 * We don't want to scan for a whole year
1331 */ 1406 */
@@ -1575,8 +1650,9 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1575 * free blocks even though group info says we 1650 * free blocks even though group info says we
1576 * have free blocks 1651 * have free blocks
1577 */ 1652 */
1578 ext4_error(sb, __func__, "%d free blocks as per " 1653 ext4_grp_locked_error(sb, e4b->bd_group,
1579 "group info. But bitmap says 0\n", 1654 __func__, "%d free blocks as per "
1655 "group info. But bitmap says 0",
1580 free); 1656 free);
1581 break; 1657 break;
1582 } 1658 }
@@ -1584,8 +1660,9 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1584 mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); 1660 mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
1585 BUG_ON(ex.fe_len <= 0); 1661 BUG_ON(ex.fe_len <= 0);
1586 if (free < ex.fe_len) { 1662 if (free < ex.fe_len) {
1587 ext4_error(sb, __func__, "%d free blocks as per " 1663 ext4_grp_locked_error(sb, e4b->bd_group,
1588 "group info. But got %d blocks\n", 1664 __func__, "%d free blocks as per "
1665 "group info. But got %d blocks",
1589 free, ex.fe_len); 1666 free, ex.fe_len);
1590 /* 1667 /*
1591 * The number of free blocks differs. This mostly 1668 * The number of free blocks differs. This mostly
@@ -1692,6 +1769,173 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1692 return 0; 1769 return 0;
1693} 1770}
1694 1771
1772/*
1773 * lock the group_info alloc_sem of all the groups
1774 * belonging to the same buddy cache page. This
1775 * makes sure other parallel operations on the buddy
1776 * cache don't happen while holding the buddy cache
1777 * lock
1778 */
1779int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
1780{
1781 int i;
1782 int block, pnum;
1783 int blocks_per_page;
1784 int groups_per_page;
1785 ext4_group_t first_group;
1786 struct ext4_group_info *grp;
1787
1788 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1789 /*
1790 * the buddy cache inode stores the block bitmap
1791 * and buddy information in consecutive blocks.
1792 * So for each group we need two blocks.
1793 */
1794 block = group * 2;
1795 pnum = block / blocks_per_page;
1796 first_group = pnum * blocks_per_page / 2;
1797
1798 groups_per_page = blocks_per_page >> 1;
1799 if (groups_per_page == 0)
1800 groups_per_page = 1;
1801 /* read all groups the page covers into the cache */
1802 for (i = 0; i < groups_per_page; i++) {
1803
1804 if ((first_group + i) >= EXT4_SB(sb)->s_groups_count)
1805 break;
1806 grp = ext4_get_group_info(sb, first_group + i);
1807 /* take each group's write allocation
1808 * semaphore. This makes sure there is
1809 * no block allocation going on in any
1810 * of those groups
1811 */
1812 down_write_nested(&grp->alloc_sem, i);
1813 }
1814 return i;
1815}
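
A worked example of the block-to-page arithmetic above: each group owns two consecutive blocks in the buddy-cache inode (bitmap, then buddy). With 4K pages and 1K blocks, blocks_per_page = 4, so page pnum holds blocks 4*pnum through 4*pnum+3, i.e. the bitmap/buddy pairs of groups 2*pnum and 2*pnum+1 (first_group = pnum * blocks_per_page / 2 and groups_per_page = 2). With 4K blocks, blocks_per_page = 1, groups_per_page is clamped to 1, and a group's bitmap and buddy land on two consecutive pages.
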
1816
1817void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
1818 ext4_group_t group, int locked_group)
1819{
1820 int i;
1821 int block, pnum;
1822 int blocks_per_page;
1823 ext4_group_t first_group;
1824 struct ext4_group_info *grp;
1825
1826 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1827 /*
1828 * the buddy cache inode stores the block bitmap
1829 * and buddy information in consecutive blocks.
1830 * So for each group we need two blocks.
1831 */
1832 block = group * 2;
1833 pnum = block / blocks_per_page;
1834 first_group = pnum * blocks_per_page / 2;
1835 /* release locks on all the groups */
1836 for (i = 0; i < locked_group; i++) {
1837
1838 grp = ext4_get_group_info(sb, first_group + i);
1839 /* release each group's write allocation
1840 * semaphore taken in
1841 * ext4_mb_get_buddy_cache_lock, letting
1842 * block allocation in those groups resume
1843 */
1844 up_write(&grp->alloc_sem);
1845 }
1846
1847}
1848
1849static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1850{
1851
1852 int ret;
1853 void *bitmap;
1854 int blocks_per_page;
1855 int block, pnum, poff;
1856 int num_grp_locked = 0;
1857 struct ext4_group_info *this_grp;
1858 struct ext4_sb_info *sbi = EXT4_SB(sb);
1859 struct inode *inode = sbi->s_buddy_cache;
1860 struct page *page = NULL, *bitmap_page = NULL;
1861
1862 mb_debug("init group %u\n", group);
1863 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1864 this_grp = ext4_get_group_info(sb, group);
1865 /*
1866 * This ensures we don't add a group
1867 * to this buddy cache via resize
1868 */
1869 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
1870 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
1871 /*
1872 * somebody initialized the group
1873 * return without doing anything
1874 */
1875 ret = 0;
1876 goto err;
1877 }
1878 /*
1879 * the buddy cache inode stores the block bitmap
1880 * and buddy information in consecutive blocks.
1881 * So for each group we need two blocks.
1882 */
1883 block = group * 2;
1884 pnum = block / blocks_per_page;
1885 poff = block % blocks_per_page;
1886 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1887 if (page) {
1888 BUG_ON(page->mapping != inode->i_mapping);
1889 ret = ext4_mb_init_cache(page, NULL);
1890 if (ret) {
1891 unlock_page(page);
1892 goto err;
1893 }
1894 unlock_page(page);
1895 }
1896 if (page == NULL || !PageUptodate(page)) {
1897 ret = -EIO;
1898 goto err;
1899 }
1900 mark_page_accessed(page);
1901 bitmap_page = page;
1902 bitmap = page_address(page) + (poff * sb->s_blocksize);
1903
1904 /* init buddy cache */
1905 block++;
1906 pnum = block / blocks_per_page;
1907 poff = block % blocks_per_page;
1908 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1909 if (page == bitmap_page) {
1910 /*
1911 * If both the bitmap and buddy are in
1912 * the same page we don't need to force
1913 * init the buddy
1914 */
1915 unlock_page(page);
1916 } else if (page) {
1917 BUG_ON(page->mapping != inode->i_mapping);
1918 ret = ext4_mb_init_cache(page, bitmap);
1919 if (ret) {
1920 unlock_page(page);
1921 goto err;
1922 }
1923 unlock_page(page);
1924 }
1925 if (page == NULL || !PageUptodate(page)) {
1926 ret = -EIO;
1927 goto err;
1928 }
1929 mark_page_accessed(page);
1930err:
1931 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
1932 if (bitmap_page)
1933 page_cache_release(bitmap_page);
1934 if (page)
1935 page_cache_release(page);
1936 return ret;
1937}
1938
1695static noinline_for_stack int 1939static noinline_for_stack int
1696ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 1940ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1697{ 1941{
@@ -1775,7 +2019,7 @@ repeat:
1775 group = 0; 2019 group = 0;
1776 2020
1777 /* quick check to skip empty groups */ 2021 /* quick check to skip empty groups */
1778 grp = ext4_get_group_info(ac->ac_sb, group); 2022 grp = ext4_get_group_info(sb, group);
1779 if (grp->bb_free == 0) 2023 if (grp->bb_free == 0)
1780 continue; 2024 continue;
1781 2025
@@ -1788,10 +2032,9 @@ repeat:
1788 * we need full data about the group 2032 * we need full data about the group
1789 * to make a good selection 2033 * to make a good selection
1790 */ 2034 */
1791 err = ext4_mb_load_buddy(sb, group, &e4b); 2035 err = ext4_mb_init_group(sb, group);
1792 if (err) 2036 if (err)
1793 goto out; 2037 goto out;
1794 ext4_mb_release_desc(&e4b);
1795 } 2038 }
1796 2039
1797 /* 2040 /*
@@ -1932,13 +2175,13 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
1932 if (hs->op == EXT4_MB_HISTORY_ALLOC) { 2175 if (hs->op == EXT4_MB_HISTORY_ALLOC) {
1933 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " 2176 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
1934 "%-5u %-5s %-5u %-6u\n"; 2177 "%-5u %-5s %-5u %-6u\n";
1935 sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, 2178 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
1936 hs->result.fe_start, hs->result.fe_len, 2179 hs->result.fe_start, hs->result.fe_len,
1937 hs->result.fe_logical); 2180 hs->result.fe_logical);
1938 sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, 2181 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
1939 hs->orig.fe_start, hs->orig.fe_len, 2182 hs->orig.fe_start, hs->orig.fe_len,
1940 hs->orig.fe_logical); 2183 hs->orig.fe_logical);
1941 sprintf(buf3, "%lu/%d/%u@%u", hs->goal.fe_group, 2184 sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group,
1942 hs->goal.fe_start, hs->goal.fe_len, 2185 hs->goal.fe_start, hs->goal.fe_len,
1943 hs->goal.fe_logical); 2186 hs->goal.fe_logical);
1944 seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2, 2187 seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
@@ -1947,20 +2190,20 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
1947 hs->buddy ? 1 << hs->buddy : 0); 2190 hs->buddy ? 1 << hs->buddy : 0);
1948 } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) { 2191 } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
1949 fmt = "%-5u %-8u %-23s %-23s %-23s\n"; 2192 fmt = "%-5u %-8u %-23s %-23s %-23s\n";
1950 sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, 2193 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
1951 hs->result.fe_start, hs->result.fe_len, 2194 hs->result.fe_start, hs->result.fe_len,
1952 hs->result.fe_logical); 2195 hs->result.fe_logical);
1953 sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, 2196 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
1954 hs->orig.fe_start, hs->orig.fe_len, 2197 hs->orig.fe_start, hs->orig.fe_len,
1955 hs->orig.fe_logical); 2198 hs->orig.fe_logical);
1956 seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2); 2199 seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
1957 } else if (hs->op == EXT4_MB_HISTORY_DISCARD) { 2200 } else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
1958 sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, 2201 sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
1959 hs->result.fe_start, hs->result.fe_len); 2202 hs->result.fe_start, hs->result.fe_len);
1960 seq_printf(seq, "%-5u %-8u %-23s discard\n", 2203 seq_printf(seq, "%-5u %-8u %-23s discard\n",
1961 hs->pid, hs->ino, buf2); 2204 hs->pid, hs->ino, buf2);
1962 } else if (hs->op == EXT4_MB_HISTORY_FREE) { 2205 } else if (hs->op == EXT4_MB_HISTORY_FREE) {
1963 sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, 2206 sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
1964 hs->result.fe_start, hs->result.fe_len); 2207 hs->result.fe_start, hs->result.fe_len);
1965 seq_printf(seq, "%-5u %-8u %-23s free\n", 2208 seq_printf(seq, "%-5u %-8u %-23s free\n",
1966 hs->pid, hs->ino, buf2); 2209 hs->pid, hs->ino, buf2);
@@ -2073,7 +2316,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2073 return NULL; 2316 return NULL;
2074 2317
2075 group = *pos + 1; 2318 group = *pos + 1;
2076 return (void *) group; 2319 return (void *) ((unsigned long) group);
2077} 2320}
2078 2321
2079static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) 2322static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -2086,13 +2329,13 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
2086 if (*pos < 0 || *pos >= sbi->s_groups_count) 2329 if (*pos < 0 || *pos >= sbi->s_groups_count)
2087 return NULL; 2330 return NULL;
2088 group = *pos + 1; 2331 group = *pos + 1;
2089 return (void *) group;; 2332 return (void *) ((unsigned long) group);
2090} 2333}
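
The double cast here is deliberate: ext4_group_t is now a 32-bit unsigned int, and casting it straight to void * would trip GCC's integer-to-pointer size warning on 64-bit builds. Widening through unsigned long first keeps the round trip warning-free, and ext4_mb_seq_groups_show() narrows the cookie back the same way.
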
2091 2334
2092static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) 2335static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2093{ 2336{
2094 struct super_block *sb = seq->private; 2337 struct super_block *sb = seq->private;
2095 long group = (long) v; 2338 ext4_group_t group = (ext4_group_t) ((unsigned long) v);
2096 int i; 2339 int i;
2097 int err; 2340 int err;
2098 struct ext4_buddy e4b; 2341 struct ext4_buddy e4b;
@@ -2114,7 +2357,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2114 sizeof(struct ext4_group_info); 2357 sizeof(struct ext4_group_info);
2115 err = ext4_mb_load_buddy(sb, group, &e4b); 2358 err = ext4_mb_load_buddy(sb, group, &e4b);
2116 if (err) { 2359 if (err) {
2117 seq_printf(seq, "#%-5lu: I/O error\n", group); 2360 seq_printf(seq, "#%-5u: I/O error\n", group);
2118 return 0; 2361 return 0;
2119 } 2362 }
2120 ext4_lock_group(sb, group); 2363 ext4_lock_group(sb, group);
@@ -2122,7 +2365,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2122 ext4_unlock_group(sb, group); 2365 ext4_unlock_group(sb, group);
2123 ext4_mb_release_desc(&e4b); 2366 ext4_mb_release_desc(&e4b);
2124 2367
2125 seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, 2368 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
2126 sg.info.bb_fragments, sg.info.bb_first_free); 2369 sg.info.bb_fragments, sg.info.bb_first_free);
2127 for (i = 0; i <= 13; i++) 2370 for (i = 0; i <= 13; i++)
2128 seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 2371 seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
@@ -2296,10 +2539,11 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2296 ext4_free_blocks_after_init(sb, group, desc); 2539 ext4_free_blocks_after_init(sb, group, desc);
2297 } else { 2540 } else {
2298 meta_group_info[i]->bb_free = 2541 meta_group_info[i]->bb_free =
2299 le16_to_cpu(desc->bg_free_blocks_count); 2542 ext4_free_blks_count(sb, desc);
2300 } 2543 }
2301 2544
2302 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2545 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2546 init_rwsem(&meta_group_info[i]->alloc_sem);
2303 meta_group_info[i]->bb_free_root.rb_node = NULL; 2547 meta_group_info[i]->bb_free_root.rb_node = NULL;
2304 2548
2305#ifdef DOUBLE_CHECK 2549#ifdef DOUBLE_CHECK
@@ -2327,54 +2571,6 @@ exit_meta_group_info:
2327} /* ext4_mb_add_groupinfo */ 2571} /* ext4_mb_add_groupinfo */
2328 2572
2329/* 2573/*
2330 * Add a group to the existing groups.
2331 * This function is used for online resize
2332 */
2333int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
2334 struct ext4_group_desc *desc)
2335{
2336 struct ext4_sb_info *sbi = EXT4_SB(sb);
2337 struct inode *inode = sbi->s_buddy_cache;
2338 int blocks_per_page;
2339 int block;
2340 int pnum;
2341 struct page *page;
2342 int err;
2343
2344 /* Add group based on group descriptor*/
2345 err = ext4_mb_add_groupinfo(sb, group, desc);
2346 if (err)
2347 return err;
2348
2349 /*
2350 * Cache pages containing dynamic mb_alloc data (buddy and bitmap
2351 * data) are marked not up to date so that they will be re-initialized
2352 * during the next call to ext4_mb_load_buddy
2353 */
2354
2355 /* Set buddy page as not up to date */
2356 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
2357 block = group * 2;
2358 pnum = block / blocks_per_page;
2359 page = find_get_page(inode->i_mapping, pnum);
2360 if (page != NULL) {
2361 ClearPageUptodate(page);
2362 page_cache_release(page);
2363 }
2364
2365 /* Set bitmap page as not up to date */
2366 block++;
2367 pnum = block / blocks_per_page;
2368 page = find_get_page(inode->i_mapping, pnum);
2369 if (page != NULL) {
2370 ClearPageUptodate(page);
2371 page_cache_release(page);
2372 }
2373
2374 return 0;
2375}
2376
2377/*
2378 * Update an existing group. 2574 * Update an existing group.
2379 * This function is used for online resize 2575 * This function is used for online resize
2380 */ 2576 */
@@ -2457,7 +2653,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2457 desc = ext4_get_group_desc(sb, i, NULL); 2653 desc = ext4_get_group_desc(sb, i, NULL);
2458 if (desc == NULL) { 2654 if (desc == NULL) {
2459 printk(KERN_ERR 2655 printk(KERN_ERR
2460 "EXT4-fs: can't read descriptor %lu\n", i); 2656 "EXT4-fs: can't read descriptor %u\n", i);
2461 goto err_freebuddy; 2657 goto err_freebuddy;
2462 } 2658 }
2463 if (ext4_mb_add_groupinfo(sb, i, desc) != 0) 2659 if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
@@ -2493,6 +2689,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2493 if (sbi->s_mb_offsets == NULL) { 2689 if (sbi->s_mb_offsets == NULL) {
2494 return -ENOMEM; 2690 return -ENOMEM;
2495 } 2691 }
2692
2693 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
2496 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2694 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2497 if (sbi->s_mb_maxs == NULL) { 2695 if (sbi->s_mb_maxs == NULL) {
2498 kfree(sbi->s_mb_offsets); 2696 kfree(sbi->s_mb_offsets);
@@ -2551,7 +2749,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2551 ext4_mb_init_per_dev_proc(sb); 2749 ext4_mb_init_per_dev_proc(sb);
2552 ext4_mb_history_init(sb); 2750 ext4_mb_history_init(sb);
2553 2751
2554 sbi->s_journal->j_commit_callback = release_blocks_on_commit; 2752 if (sbi->s_journal)
2753 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
2555 2754
2556 printk(KERN_INFO "EXT4-fs: mballoc enabled\n"); 2755 printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
2557 return 0; 2756 return 0;
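
The new guard matters for no-journal mode, where sbi->s_journal is NULL and there is no commit to hook. For reference, jbd2 fires this callback at the end of transaction commit, approximately as follows (recalled from the jbd2 commit path, not part of this patch):

	/* tail of jbd2_journal_commit_transaction(), approximately */
	if (journal->j_commit_callback)
		journal->j_commit_callback(journal, commit_transaction);
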
@@ -2652,7 +2851,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2652 list_for_each_safe(l, ltmp, &txn->t_private_list) { 2851 list_for_each_safe(l, ltmp, &txn->t_private_list) {
2653 entry = list_entry(l, struct ext4_free_data, list); 2852 entry = list_entry(l, struct ext4_free_data, list);
2654 2853
2655 mb_debug("gonna free %u blocks in group %lu (0x%p):", 2854 mb_debug("gonna free %u blocks in group %u (0x%p):",
2656 entry->count, entry->group, entry); 2855 entry->count, entry->group, entry);
2657 2856
2658 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2857 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2679,8 +2878,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2679 discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) 2878 discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
2680 + entry->start_blk 2879 + entry->start_blk
2681 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 2880 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
2682 trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id, 2881 trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u",
2683 (unsigned long long) discard_block, entry->count); 2882 sb->s_id, (unsigned long long) discard_block,
2883 entry->count);
2684 sb_issue_discard(sb, discard_block, entry->count); 2884 sb_issue_discard(sb, discard_block, entry->count);
2685 2885
2686 kmem_cache_free(ext4_free_ext_cachep, entry); 2886 kmem_cache_free(ext4_free_ext_cachep, entry);
@@ -2791,7 +2991,7 @@ void exit_ext4_mballoc(void)
2791 */ 2991 */
2792static noinline_for_stack int 2992static noinline_for_stack int
2793ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, 2993ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2794 handle_t *handle, unsigned long reserv_blks) 2994 handle_t *handle, unsigned int reserv_blks)
2795{ 2995{
2796 struct buffer_head *bitmap_bh = NULL; 2996 struct buffer_head *bitmap_bh = NULL;
2797 struct ext4_super_block *es; 2997 struct ext4_super_block *es;
@@ -2824,7 +3024,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2824 if (!gdp) 3024 if (!gdp)
2825 goto out_err; 3025 goto out_err;
2826 3026
2827 ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group, 3027 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
2828 gdp->bg_free_blocks_count); 3028 gdp->bg_free_blocks_count);
2829 3029
2830 err = ext4_journal_get_write_access(handle, gdp_bh); 3030 err = ext4_journal_get_write_access(handle, gdp_bh);
@@ -2843,8 +3043,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2843 in_range(block + len - 1, ext4_inode_table(sb, gdp), 3043 in_range(block + len - 1, ext4_inode_table(sb, gdp),
2844 EXT4_SB(sb)->s_itb_per_group)) { 3044 EXT4_SB(sb)->s_itb_per_group)) {
2845 ext4_error(sb, __func__, 3045 ext4_error(sb, __func__,
2846 "Allocating block in system zone - block = %llu", 3046 "Allocating block %llu in system zone of %d group\n",
2847 block); 3047 block, ac->ac_b_ex.fe_group);
2848 /* File system mounted not to panic on error 3048 /* File system mounted not to panic on error
2849 * Fix the bitmap and repeat the block allocation 3049 * Fix the bitmap and repeat the block allocation
2850 * We leak some of the blocks here. 3050 * We leak some of the blocks here.
@@ -2852,7 +3052,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2852 mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), 3052 mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group),
2853 bitmap_bh->b_data, ac->ac_b_ex.fe_start, 3053 bitmap_bh->b_data, ac->ac_b_ex.fe_start,
2854 ac->ac_b_ex.fe_len); 3054 ac->ac_b_ex.fe_len);
2855 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 3055 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2856 if (!err) 3056 if (!err)
2857 err = -EAGAIN; 3057 err = -EAGAIN;
2858 goto out_err; 3058 goto out_err;
@@ -2866,18 +3066,17 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2866 } 3066 }
2867 } 3067 }
2868#endif 3068#endif
2869 mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
2870 ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
2871
2872 spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); 3069 spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
3070 mb_set_bits(NULL, bitmap_bh->b_data,
3071 ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
2873 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 3072 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2874 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 3073 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
2875 gdp->bg_free_blocks_count = 3074 ext4_free_blks_set(sb, gdp,
2876 cpu_to_le16(ext4_free_blocks_after_init(sb, 3075 ext4_free_blocks_after_init(sb,
2877 ac->ac_b_ex.fe_group, 3076 ac->ac_b_ex.fe_group, gdp));
2878 gdp));
2879 } 3077 }
2880 le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); 3078 len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
3079 ext4_free_blks_set(sb, gdp, len);
2881 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); 3080 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
2882 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); 3081 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
2883 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); 3082 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
@@ -2899,10 +3098,10 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2899 spin_unlock(sb_bgl_lock(sbi, flex_group)); 3098 spin_unlock(sb_bgl_lock(sbi, flex_group));
2900 } 3099 }
2901 3100
2902 err = ext4_journal_dirty_metadata(handle, bitmap_bh); 3101 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2903 if (err) 3102 if (err)
2904 goto out_err; 3103 goto out_err;
2905 err = ext4_journal_dirty_metadata(handle, gdp_bh); 3104 err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
2906 3105
2907out_err: 3106out_err:
2908 sb->s_dirt = 1; 3107 sb->s_dirt = 1;
@@ -3031,7 +3230,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3031 /* check we don't cross already preallocated blocks */ 3230 /* check we don't cross already preallocated blocks */
3032 rcu_read_lock(); 3231 rcu_read_lock();
3033 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3232 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3034 unsigned long pa_end; 3233 ext4_lblk_t pa_end;
3035 3234
3036 if (pa->pa_deleted) 3235 if (pa->pa_deleted)
3037 continue; 3236 continue;
@@ -3075,7 +3274,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3075 /* XXX: extra loop to check we really don't overlap preallocations */ 3274 /* XXX: extra loop to check we really don't overlap preallocations */
3076 rcu_read_lock(); 3275 rcu_read_lock();
3077 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3276 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3078 unsigned long pa_end; 3277 ext4_lblk_t pa_end;
3079 spin_lock(&pa->pa_lock); 3278 spin_lock(&pa->pa_lock);
3080 if (pa->pa_deleted == 0) { 3279 if (pa->pa_deleted == 0) {
3081 pa_end = pa->pa_lstart + pa->pa_len; 3280 pa_end = pa->pa_lstart + pa->pa_len;
@@ -3307,6 +3506,32 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3307} 3506}
3308 3507
3309/* 3508/*
3509 * the function goes through all blocks freed in the group
3510 * but not yet committed and marks them used in the in-core bitmap,
3511 * so the buddy must be generated from this bitmap.
3512 * Needs to be called with the ext4 group lock held (ext4_lock_group)
3513 */
3514static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3515 ext4_group_t group)
3516{
3517 struct rb_node *n;
3518 struct ext4_group_info *grp;
3519 struct ext4_free_data *entry;
3520
3521 grp = ext4_get_group_info(sb, group);
3522 n = rb_first(&(grp->bb_free_root));
3523
3524 while (n) {
3525 entry = rb_entry(n, struct ext4_free_data, node);
3526 mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
3527 bitmap, entry->start_blk,
3528 entry->count);
3529 n = rb_next(n);
3530 }
3531 return;
3532}
3533
3534/*
3310 * the function goes through all preallocations in this group and marks them 3535 * the function goes through all preallocations in this group and marks them
3311 * used in the in-core bitmap; the buddy must be generated from this bitmap 3536 * used in the in-core bitmap; the buddy must be generated from this bitmap
3312 * Needs to be called with the ext4 group lock held (ext4_lock_group) 3537 * Needs to be called with the ext4 group lock held (ext4_lock_group)
@@ -3346,7 +3571,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3346 preallocated += len; 3571 preallocated += len;
3347 count++; 3572 count++;
3348 } 3573 }
3349 mb_debug("preallocated %u for group %lu\n", preallocated, group); 3574 mb_debug("preallocated %u for group %u\n", preallocated, group);
3350} 3575}
3351 3576
3352static void ext4_mb_pa_callback(struct rcu_head *head) 3577static void ext4_mb_pa_callback(struct rcu_head *head)
@@ -3363,7 +3588,7 @@ static void ext4_mb_pa_callback(struct rcu_head *head)
3363static void ext4_mb_put_pa(struct ext4_allocation_context *ac, 3588static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3364 struct super_block *sb, struct ext4_prealloc_space *pa) 3589 struct super_block *sb, struct ext4_prealloc_space *pa)
3365{ 3590{
3366 unsigned long grp; 3591 ext4_group_t grp;
3367 3592
3368 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) 3593 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
3369 return; 3594 return;
@@ -3473,6 +3698,10 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3473 3698
3474 mb_debug("new inode pa %p: %llu/%u for %u\n", pa, 3699 mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
3475 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3700 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3701 trace_mark(ext4_mb_new_inode_pa,
3702 "dev %s ino %lu pstart %llu len %u lstart %u",
3703 sb->s_id, ac->ac_inode->i_ino,
3704 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3476 3705
3477 ext4_mb_use_inode_pa(ac, pa); 3706 ext4_mb_use_inode_pa(ac, pa);
3478 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3707 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3530,7 +3759,9 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3530 pa->pa_linear = 1; 3759 pa->pa_linear = 1;
3531 3760
3532 mb_debug("new group pa %p: %llu/%u for %u\n", pa, 3761 mb_debug("new group pa %p: %llu/%u for %u\n", pa,
3533 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3762 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3763 trace_mark(ext4_mb_new_group_pa, "dev %s pstart %llu len %u lstart %u",
3764 sb->s_id, pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3534 3765
3535 ext4_mb_use_group_pa(ac, pa); 3766 ext4_mb_use_group_pa(ac, pa);
3536 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3767 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3579,16 +3810,18 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3579{ 3810{
3580 struct super_block *sb = e4b->bd_sb; 3811 struct super_block *sb = e4b->bd_sb;
3581 struct ext4_sb_info *sbi = EXT4_SB(sb); 3812 struct ext4_sb_info *sbi = EXT4_SB(sb);
3582 unsigned long end; 3813 unsigned int end;
3583 unsigned long next; 3814 unsigned int next;
3584 ext4_group_t group; 3815 ext4_group_t group;
3585 ext4_grpblk_t bit; 3816 ext4_grpblk_t bit;
3817 unsigned long long grp_blk_start;
3586 sector_t start; 3818 sector_t start;
3587 int err = 0; 3819 int err = 0;
3588 int free = 0; 3820 int free = 0;
3589 3821
3590 BUG_ON(pa->pa_deleted == 0); 3822 BUG_ON(pa->pa_deleted == 0);
3591 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3823 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3824 grp_blk_start = pa->pa_pstart - bit;
3592 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3825 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3593 end = bit + pa->pa_len; 3826 end = bit + pa->pa_len;
3594 3827
@@ -3618,6 +3851,10 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3618 ext4_mb_store_history(ac); 3851 ext4_mb_store_history(ac);
3619 } 3852 }
3620 3853
3854 trace_mark(ext4_mb_release_inode_pa,
3855 "dev %s ino %lu block %llu count %u",
3856 sb->s_id, pa->pa_inode->i_ino, grp_blk_start + bit,
3857 next - bit);
3621 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); 3858 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3622 bit = next + 1; 3859 bit = next + 1;
3623 } 3860 }
@@ -3626,8 +3863,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3626 pa, (unsigned long) pa->pa_lstart, 3863 pa, (unsigned long) pa->pa_lstart,
3627 (unsigned long) pa->pa_pstart, 3864 (unsigned long) pa->pa_pstart,
3628 (unsigned long) pa->pa_len); 3865 (unsigned long) pa->pa_len);
3629 ext4_error(sb, __func__, "free %u, pa_free %u\n", 3866 ext4_grp_locked_error(sb, group,
3630 free, pa->pa_free); 3867 __func__, "free %u, pa_free %u",
3868 free, pa->pa_free);
3631 /* 3869 /*
3632 * pa is already deleted so we use the value obtained 3870 * pa is already deleted so we use the value obtained
3633 * from the bitmap and continue. 3871 * from the bitmap and continue.
@@ -3650,6 +3888,8 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3650 if (ac) 3888 if (ac)
3651 ac->ac_op = EXT4_MB_HISTORY_DISCARD; 3889 ac->ac_op = EXT4_MB_HISTORY_DISCARD;
3652 3890
3891 trace_mark(ext4_mb_release_group_pa, "dev %s pstart %llu len %d",
3892 sb->s_id, pa->pa_pstart, pa->pa_len);
3653 BUG_ON(pa->pa_deleted == 0); 3893 BUG_ON(pa->pa_deleted == 0);
3654 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3894 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3655 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3895 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
@@ -3692,7 +3932,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3692 int busy = 0; 3932 int busy = 0;
3693 int free = 0; 3933 int free = 0;
3694 3934
3695 mb_debug("discard preallocation for group %lu\n", group); 3935 mb_debug("discard preallocation for group %u\n", group);
3696 3936
3697 if (list_empty(&grp->bb_prealloc_list)) 3937 if (list_empty(&grp->bb_prealloc_list))
3698 return 0; 3938 return 0;
@@ -3700,14 +3940,14 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3700 bitmap_bh = ext4_read_block_bitmap(sb, group); 3940 bitmap_bh = ext4_read_block_bitmap(sb, group);
3701 if (bitmap_bh == NULL) { 3941 if (bitmap_bh == NULL) {
3702 ext4_error(sb, __func__, "Error in reading block " 3942 ext4_error(sb, __func__, "Error in reading block "
3703 "bitmap for %lu\n", group); 3943 "bitmap for %u", group);
3704 return 0; 3944 return 0;
3705 } 3945 }
3706 3946
3707 err = ext4_mb_load_buddy(sb, group, &e4b); 3947 err = ext4_mb_load_buddy(sb, group, &e4b);
3708 if (err) { 3948 if (err) {
3709 ext4_error(sb, __func__, "Error in loading buddy " 3949 ext4_error(sb, __func__, "Error in loading buddy "
3710 "information for %lu\n", group); 3950 "information for %u", group);
3711 put_bh(bitmap_bh); 3951 put_bh(bitmap_bh);
3712 return 0; 3952 return 0;
3713 } 3953 }
@@ -3815,6 +4055,8 @@ void ext4_discard_preallocations(struct inode *inode)
3815 } 4055 }
3816 4056
3817 mb_debug("discard preallocation for inode %lu\n", inode->i_ino); 4057 mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
4058 trace_mark(ext4_discard_preallocations, "dev %s ino %lu", sb->s_id,
4059 inode->i_ino);
3818 4060
3819 INIT_LIST_HEAD(&list); 4061 INIT_LIST_HEAD(&list);
3820 4062
@@ -3874,14 +4116,14 @@ repeat:
3874 err = ext4_mb_load_buddy(sb, group, &e4b); 4116 err = ext4_mb_load_buddy(sb, group, &e4b);
3875 if (err) { 4117 if (err) {
3876 ext4_error(sb, __func__, "Error in loading buddy " 4118 ext4_error(sb, __func__, "Error in loading buddy "
3877 "information for %lu\n", group); 4119 "information for %u", group);
3878 continue; 4120 continue;
3879 } 4121 }
3880 4122
3881 bitmap_bh = ext4_read_block_bitmap(sb, group); 4123 bitmap_bh = ext4_read_block_bitmap(sb, group);
3882 if (bitmap_bh == NULL) { 4124 if (bitmap_bh == NULL) {
3883 ext4_error(sb, __func__, "Error in reading block " 4125 ext4_error(sb, __func__, "Error in reading block "
3884 "bitmap for %lu\n", group); 4126 "bitmap for %u", group);
3885 ext4_mb_release_desc(&e4b); 4127 ext4_mb_release_desc(&e4b);
3886 continue; 4128 continue;
3887 } 4129 }
@@ -4024,8 +4266,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4024 struct ext4_sb_info *sbi = EXT4_SB(sb); 4266 struct ext4_sb_info *sbi = EXT4_SB(sb);
4025 struct ext4_super_block *es = sbi->s_es; 4267 struct ext4_super_block *es = sbi->s_es;
4026 ext4_group_t group; 4268 ext4_group_t group;
4027 unsigned long len; 4269 unsigned int len;
4028 unsigned long goal; 4270 ext4_fsblk_t goal;
4029 ext4_grpblk_t block; 4271 ext4_grpblk_t block;
4030 4272
4031 /* we can't allocate > group size */ 4273 /* we can't allocate > group size */
@@ -4068,6 +4310,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4068 ac->ac_pa = NULL; 4310 ac->ac_pa = NULL;
4069 ac->ac_bitmap_page = NULL; 4311 ac->ac_bitmap_page = NULL;
4070 ac->ac_buddy_page = NULL; 4312 ac->ac_buddy_page = NULL;
4313 ac->alloc_semp = NULL;
4071 ac->ac_lg = NULL; 4314 ac->ac_lg = NULL;
4072 4315
4073 /* we have to define context: will we work with a file or 4316 /* we have to define context: will we work with a file or
@@ -4146,7 +4389,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4146 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 4389 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
4147 if (ext4_mb_load_buddy(sb, group, &e4b)) { 4390 if (ext4_mb_load_buddy(sb, group, &e4b)) {
4148 ext4_error(sb, __func__, "Error in loading buddy " 4391 ext4_error(sb, __func__, "Error in loading buddy "
4149 "information for %lu\n", group); 4392 "information for %u", group);
4150 continue; 4393 continue;
4151 } 4394 }
4152 ext4_lock_group(sb, group); 4395 ext4_lock_group(sb, group);
@@ -4248,6 +4491,8 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4248 } 4491 }
4249 ext4_mb_put_pa(ac, ac->ac_sb, pa); 4492 ext4_mb_put_pa(ac, ac->ac_sb, pa);
4250 } 4493 }
4494 if (ac->alloc_semp)
4495 up_read(ac->alloc_semp);
4251 if (ac->ac_bitmap_page) 4496 if (ac->ac_bitmap_page)
4252 page_cache_release(ac->ac_bitmap_page); 4497 page_cache_release(ac->ac_bitmap_page);
4253 if (ac->ac_buddy_page) 4498 if (ac->ac_buddy_page)
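The new ac->alloc_semp field (declared in the mballoc.h hunk further down) records a rw_semaphore that is read-acquired while an allocation uses a buddy group and must be dropped exactly once when the context is released, as here. A minimal sketch of the pairing, with illustrative names:

#include <linux/rwsem.h>

struct alloc_ctx {
	struct rw_semaphore *alloc_semp;	/* NULL when not held */
};

/* read-acquired when the buddy group is loaded for this allocation ... */
static void alloc_ctx_pin(struct alloc_ctx *ac, struct rw_semaphore *sem)
{
	down_read(sem);
	ac->alloc_semp = sem;
}

/* ... and released exactly once when the context is torn down */
static void alloc_ctx_unpin(struct alloc_ctx *ac)
{
	if (ac->alloc_semp) {
		up_read(ac->alloc_semp);
		ac->alloc_semp = NULL;
	}
}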
@@ -4264,6 +4509,8 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4264 int ret; 4509 int ret;
4265 int freed = 0; 4510 int freed = 0;
4266 4511
4512 trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d",
4513 sb->s_id, needed);
4267 for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) { 4514 for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) {
4268 ret = ext4_mb_discard_group_preallocations(sb, i, needed); 4515 ret = ext4_mb_discard_group_preallocations(sb, i, needed);
4269 freed += ret; 4516 freed += ret;
@@ -4286,12 +4533,24 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4286 struct ext4_sb_info *sbi; 4533 struct ext4_sb_info *sbi;
4287 struct super_block *sb; 4534 struct super_block *sb;
4288 ext4_fsblk_t block = 0; 4535 ext4_fsblk_t block = 0;
4289 unsigned long inquota; 4536 unsigned int inquota;
4290 unsigned long reserv_blks = 0; 4537 unsigned int reserv_blks = 0;
4291 4538
4292 sb = ar->inode->i_sb; 4539 sb = ar->inode->i_sb;
4293 sbi = EXT4_SB(sb); 4540 sbi = EXT4_SB(sb);
4294 4541
4542 trace_mark(ext4_request_blocks, "dev %s flags %u len %u ino %lu "
4543 "lblk %llu goal %llu lleft %llu lright %llu "
4544 "pleft %llu pright %llu ",
4545 sb->s_id, ar->flags, ar->len,
4546 ar->inode ? ar->inode->i_ino : 0,
4547 (unsigned long long) ar->logical,
4548 (unsigned long long) ar->goal,
4549 (unsigned long long) ar->lleft,
4550 (unsigned long long) ar->lright,
4551 (unsigned long long) ar->pleft,
4552 (unsigned long long) ar->pright);
4553
4295 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { 4554 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
4296 /* 4555 /*
4297 * With delalloc we already reserved the blocks 4556 * With delalloc we already reserved the blocks
@@ -4313,7 +4572,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4313 } 4572 }
4314 if (ar->len == 0) { 4573 if (ar->len == 0) {
4315 *errp = -EDQUOT; 4574 *errp = -EDQUOT;
4316 return 0; 4575 goto out3;
4317 } 4576 }
4318 inquota = ar->len; 4577 inquota = ar->len;
4319 4578
@@ -4348,10 +4607,14 @@ repeat:
4348 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) 4607 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
4349 ext4_mb_new_preallocation(ac); 4608 ext4_mb_new_preallocation(ac);
4350 } 4609 }
4351
4352 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 4610 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4353 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); 4611 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
4354 if (*errp == -EAGAIN) { 4612 if (*errp == -EAGAIN) {
4613 /*
4614 * drop the reference that we took
4615 * in ext4_mb_use_best_found
4616 */
4617 ext4_mb_release_context(ac);
4355 ac->ac_b_ex.fe_group = 0; 4618 ac->ac_b_ex.fe_group = 0;
4356 ac->ac_b_ex.fe_start = 0; 4619 ac->ac_b_ex.fe_start = 0;
4357 ac->ac_b_ex.fe_len = 0; 4620 ac->ac_b_ex.fe_len = 0;
@@ -4382,6 +4645,26 @@ out2:
4382out1: 4645out1:
4383 if (ar->len < inquota) 4646 if (ar->len < inquota)
4384 DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); 4647 DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
4648out3:
4649 if (!ar->len) {
4650 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
4651 /* release all the reserved blocks if non delalloc */
4652 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
4653 reserv_blks);
4654 }
4655
4656 trace_mark(ext4_allocate_blocks,
4657 "dev %s block %llu flags %u len %u ino %lu "
4658 "logical %llu goal %llu lleft %llu lright %llu "
4659 "pleft %llu pright %llu ",
4660 sb->s_id, (unsigned long long) block,
4661 ar->flags, ar->len, ar->inode ? ar->inode->i_ino : 0,
4662 (unsigned long long) ar->logical,
4663 (unsigned long long) ar->goal,
4664 (unsigned long long) ar->lleft,
4665 (unsigned long long) ar->lright,
4666 (unsigned long long) ar->pleft,
4667 (unsigned long long) ar->pright);
4385 4668
4386 return block; 4669 return block;
4387} 4670}
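The new out3 label gives back the delayed-allocation reservation through sbi->s_dirtyblocks_counter, a percpu_counter: adds and subtracts touch only a per-CPU slot, and the global value is approximate unless explicitly summed. A sketch of that counter type's basic use, assuming only the <linux/percpu_counter.h> API:

#include <linux/percpu_counter.h>

static struct percpu_counter dirty_blocks;

static int __init demo_counters_init(void)
{
	return percpu_counter_init(&dirty_blocks, 0);	/* initial value 0 */
}

static void demo_reserve(unsigned int nblocks)
{
	percpu_counter_add(&dirty_blocks, nblocks);	/* cheap per-CPU add */
}

static void demo_unreserve(unsigned int nblocks)
{
	percpu_counter_sub(&dirty_blocks, nblocks);	/* as on the out3 path */
}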
@@ -4403,27 +4686,23 @@ static int can_merge(struct ext4_free_data *entry1,
4403 4686
4404static noinline_for_stack int 4687static noinline_for_stack int
4405ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, 4688ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4406 ext4_group_t group, ext4_grpblk_t block, int count) 4689 struct ext4_free_data *new_entry)
4407{ 4690{
4691 ext4_grpblk_t block;
4692 struct ext4_free_data *entry;
4408 struct ext4_group_info *db = e4b->bd_info; 4693 struct ext4_group_info *db = e4b->bd_info;
4409 struct super_block *sb = e4b->bd_sb; 4694 struct super_block *sb = e4b->bd_sb;
4410 struct ext4_sb_info *sbi = EXT4_SB(sb); 4695 struct ext4_sb_info *sbi = EXT4_SB(sb);
4411 struct ext4_free_data *entry, *new_entry;
4412 struct rb_node **n = &db->bb_free_root.rb_node, *node; 4696 struct rb_node **n = &db->bb_free_root.rb_node, *node;
4413 struct rb_node *parent = NULL, *new_node; 4697 struct rb_node *parent = NULL, *new_node;
4414 4698
4415 4699 BUG_ON(!ext4_handle_valid(handle));
4416 BUG_ON(e4b->bd_bitmap_page == NULL); 4700 BUG_ON(e4b->bd_bitmap_page == NULL);
4417 BUG_ON(e4b->bd_buddy_page == NULL); 4701 BUG_ON(e4b->bd_buddy_page == NULL);
4418 4702
4419 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
4420 new_entry->start_blk = block;
4421 new_entry->group = group;
4422 new_entry->count = count;
4423 new_entry->t_tid = handle->h_transaction->t_tid;
4424 new_node = &new_entry->node; 4703 new_node = &new_entry->node;
4704 block = new_entry->start_blk;
4425 4705
4426 ext4_lock_group(sb, group);
4427 if (!*n) { 4706 if (!*n) {
4428 /* first free block extent. We need to 4707 /* first free block extent. We need to
4429 protect buddy cache from being freed, 4708 protect buddy cache from being freed,
@@ -4441,10 +4720,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4441 else if (block >= (entry->start_blk + entry->count)) 4720 else if (block >= (entry->start_blk + entry->count))
4442 n = &(*n)->rb_right; 4721 n = &(*n)->rb_right;
4443 else { 4722 else {
4444 ext4_unlock_group(sb, group); 4723 ext4_grp_locked_error(sb, e4b->bd_group, __func__,
4445 ext4_error(sb, __func__, 4724 "Double free of blocks %d (%d %d)",
4446 "Double free of blocks %d (%d %d)\n", 4725 block, entry->start_blk, entry->count);
4447 block, entry->start_blk, entry->count);
4448 return 0; 4726 return 0;
4449 } 4727 }
4450 } 4728 }
@@ -4483,7 +4761,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4483 spin_lock(&sbi->s_md_lock); 4761 spin_lock(&sbi->s_md_lock);
4484 list_add(&new_entry->list, &handle->h_transaction->t_private_list); 4762 list_add(&new_entry->list, &handle->h_transaction->t_private_list);
4485 spin_unlock(&sbi->s_md_lock); 4763 spin_unlock(&sbi->s_md_lock);
4486 ext4_unlock_group(sb, group);
4487 return 0; 4764 return 0;
4488} 4765}
4489 4766
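ext4_mb_free_metadata() now takes a caller-built ext4_free_data and only performs the rb-tree insert, under a group lock the caller already holds. The insertion follows the standard <linux/rbtree.h> pattern; a self-contained sketch, keyed by start block as above (type and function names are illustrative):

#include <linux/rbtree.h>
#include <linux/errno.h>

struct free_extent {
	struct rb_node node;
	unsigned int start_blk;
	unsigned int count;
};

/* Insert a pre-allocated extent record, ordered by start block.
 * An overlap means the same blocks were freed twice. */
static int free_extent_insert(struct rb_root *root, struct free_extent *new)
{
	struct rb_node **n = &root->rb_node, *parent = NULL;
	struct free_extent *e;

	while (*n) {
		parent = *n;
		e = rb_entry(parent, struct free_extent, node);
		if (new->start_blk < e->start_blk)
			n = &(*n)->rb_left;
		else if (new->start_blk >= e->start_blk + e->count)
			n = &(*n)->rb_right;
		else
			return -EEXIST;	/* double free */
	}
	rb_link_node(&new->node, parent, n);
	rb_insert_color(&new->node, root);
	return 0;
}

Moving the kmem_cache_alloc() out to the caller matters because the allocation may sleep, which is not allowed once the group lock (a bit spinlock) is taken.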
@@ -4499,7 +4776,7 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4499 struct ext4_allocation_context *ac = NULL; 4776 struct ext4_allocation_context *ac = NULL;
4500 struct ext4_group_desc *gdp; 4777 struct ext4_group_desc *gdp;
4501 struct ext4_super_block *es; 4778 struct ext4_super_block *es;
4502 unsigned long overflow; 4779 unsigned int overflow;
4503 ext4_grpblk_t bit; 4780 ext4_grpblk_t bit;
4504 struct buffer_head *gd_bh; 4781 struct buffer_head *gd_bh;
4505 ext4_group_t block_group; 4782 ext4_group_t block_group;
@@ -4522,6 +4799,10 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4522 } 4799 }
4523 4800
4524 ext4_debug("freeing block %lu\n", block); 4801 ext4_debug("freeing block %lu\n", block);
4802 trace_mark(ext4_free_blocks,
4803 "dev %s block %llu count %lu metadata %d ino %lu",
4804 sb->s_id, (unsigned long long) block, count, metadata,
4805 inode ? inode->i_ino : 0);
4525 4806
4526 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4807 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4527 if (ac) { 4808 if (ac) {
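ext4_ac_cachep here (and ext4_free_ext_cachep in a later hunk) are slab caches, and GFP_NOFS keeps the allocator from re-entering the filesystem while fs locks may be held. A minimal sketch of the slab-cache lifecycle these allocations assume, with illustrative names:

#include <linux/slab.h>
#include <linux/errno.h>

struct demo_rec {
	unsigned int start, count;
};

static struct kmem_cache *demo_cachep;

static int __init demo_cache_init(void)
{
	demo_cachep = kmem_cache_create("demo_rec", sizeof(struct demo_rec),
					0, SLAB_RECLAIM_ACCOUNT, NULL);
	return demo_cachep ? 0 : -ENOMEM;
}

static struct demo_rec *demo_rec_get(unsigned int start, unsigned int count)
{
	/* GFP_NOFS: this path may already hold filesystem locks */
	struct demo_rec *r = kmem_cache_alloc(demo_cachep, GFP_NOFS);

	if (r) {
		r->start = start;
		r->count = count;
	}
	return r;
}

static void demo_rec_put(struct demo_rec *r)
{
	kmem_cache_free(demo_cachep, r);
}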
@@ -4581,11 +4862,6 @@ do_more:
4581 err = ext4_journal_get_write_access(handle, gd_bh); 4862 err = ext4_journal_get_write_access(handle, gd_bh);
4582 if (err) 4863 if (err)
4583 goto error_return; 4864 goto error_return;
4584
4585 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4586 if (err)
4587 goto error_return;
4588
4589#ifdef AGGRESSIVE_CHECK 4865#ifdef AGGRESSIVE_CHECK
4590 { 4866 {
4591 int i; 4867 int i;
@@ -4593,13 +4869,6 @@ do_more:
4593 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); 4869 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
4594 } 4870 }
4595#endif 4871#endif
4596 mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
4597 bit, count);
4598
4599 /* We dirtied the bitmap block */
4600 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4601 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
4602
4603 if (ac) { 4872 if (ac) {
4604 ac->ac_b_ex.fe_group = block_group; 4873 ac->ac_b_ex.fe_group = block_group;
4605 ac->ac_b_ex.fe_start = bit; 4874 ac->ac_b_ex.fe_start = bit;
@@ -4607,19 +4876,41 @@ do_more:
4607 ext4_mb_store_history(ac); 4876 ext4_mb_store_history(ac);
4608 } 4877 }
4609 4878
4610 if (metadata) { 4879 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4611 /* blocks being freed are metadata. these blocks shouldn't 4880 if (err)
4612 * be used until this transaction is committed */ 4881 goto error_return;
4613 ext4_mb_free_metadata(handle, &e4b, block_group, bit, count); 4882 if (metadata && ext4_handle_valid(handle)) {
4883 struct ext4_free_data *new_entry;
4884 /*
4885 * blocks being freed are metadata. these blocks shouldn't
4886 * be used until this transaction is committed
4887 */
4888 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
4889 new_entry->start_blk = bit;
4890 new_entry->group = block_group;
4891 new_entry->count = count;
4892 new_entry->t_tid = handle->h_transaction->t_tid;
4893 ext4_lock_group(sb, block_group);
4894 mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
4895 bit, count);
4896 ext4_mb_free_metadata(handle, &e4b, new_entry);
4897 ext4_unlock_group(sb, block_group);
4614 } else { 4898 } else {
4615 ext4_lock_group(sb, block_group); 4899 ext4_lock_group(sb, block_group);
4900 /* need to update group_info->bb_free and bitmap
4901 * with group lock held. generate_buddy looks at
4902 * them with the group lock held
4903 */
4904 mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
4905 bit, count);
4616 mb_free_blocks(inode, &e4b, bit, count); 4906 mb_free_blocks(inode, &e4b, bit, count);
4617 ext4_mb_return_to_preallocation(inode, &e4b, block, count); 4907 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
4618 ext4_unlock_group(sb, block_group); 4908 ext4_unlock_group(sb, block_group);
4619 } 4909 }
4620 4910
4621 spin_lock(sb_bgl_lock(sbi, block_group)); 4911 spin_lock(sb_bgl_lock(sbi, block_group));
4622 le16_add_cpu(&gdp->bg_free_blocks_count, count); 4912 ret = ext4_free_blks_count(sb, gdp) + count;
4913 ext4_free_blks_set(sb, gdp, ret);
4623 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); 4914 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
4624 spin_unlock(sb_bgl_lock(sbi, block_group)); 4915 spin_unlock(sb_bgl_lock(sbi, block_group));
4625 percpu_counter_add(&sbi->s_freeblocks_counter, count); 4916 percpu_counter_add(&sbi->s_freeblocks_counter, count);
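ext4_free_blks_count()/ext4_free_blks_set() replace the direct le16 arithmetic on bg_free_blocks_count because 64-bit-capable group descriptors split the counter into low and high 16-bit halves. A sketch of the general shape of such accessors; the field and helper names are illustrative, not the exact ext4 ones:

#include <linux/types.h>
#include <asm/byteorder.h>

struct demo_desc {
	__le16 free_blocks_lo;
	__le16 free_blocks_hi;	/* only present on large descriptors */
};

static inline __u32 demo_free_blks_count(int has_hi, struct demo_desc *d)
{
	__u32 n = le16_to_cpu(d->free_blocks_lo);

	if (has_hi)
		n |= (__u32)le16_to_cpu(d->free_blocks_hi) << 16;
	return n;
}

static inline void demo_free_blks_set(int has_hi, struct demo_desc *d,
				      __u32 n)
{
	d->free_blocks_lo = cpu_to_le16(n & 0xffff);
	if (has_hi)
		d->free_blocks_hi = cpu_to_le16(n >> 16);
}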
@@ -4635,9 +4926,13 @@ do_more:
4635 4926
4636 *freed += count; 4927 *freed += count;
4637 4928
4929 /* We dirtied the bitmap block */
4930 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4931 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4932
4638 /* And the group descriptor block */ 4933 /* And the group descriptor block */
4639 BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); 4934 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4640 ret = ext4_journal_dirty_metadata(handle, gd_bh); 4935 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
4641 if (!err) 4936 if (!err)
4642 err = ret; 4937 err = ret;
4643 4938
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index b5dff1fff1e5..10a2921baf14 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -20,6 +20,7 @@
20#include <linux/version.h> 20#include <linux/version.h>
21#include <linux/blkdev.h> 21#include <linux/blkdev.h>
22#include <linux/marker.h> 22#include <linux/marker.h>
23#include <linux/mutex.h>
23#include "ext4_jbd2.h" 24#include "ext4_jbd2.h"
24#include "ext4.h" 25#include "ext4.h"
25#include "group.h" 26#include "group.h"
@@ -98,9 +99,6 @@
98 */ 99 */
99#define MB_DEFAULT_GROUP_PREALLOC 512 100#define MB_DEFAULT_GROUP_PREALLOC 512
100 101
101static struct kmem_cache *ext4_pspace_cachep;
102static struct kmem_cache *ext4_ac_cachep;
103static struct kmem_cache *ext4_free_ext_cachep;
104 102
105struct ext4_free_data { 103struct ext4_free_data {
106 /* this links the free block information from group_info */ 104 /* this links the free block information from group_info */
@@ -120,26 +118,6 @@ struct ext4_free_data {
120 tid_t t_tid; 118 tid_t t_tid;
121}; 119};
122 120
123struct ext4_group_info {
124 unsigned long bb_state;
125 struct rb_root bb_free_root;
126 unsigned short bb_first_free;
127 unsigned short bb_free;
128 unsigned short bb_fragments;
129 struct list_head bb_prealloc_list;
130#ifdef DOUBLE_CHECK
131 void *bb_bitmap;
132#endif
133 unsigned short bb_counters[];
134};
135
136#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
137#define EXT4_GROUP_INFO_LOCKED_BIT 1
138
139#define EXT4_MB_GRP_NEED_INIT(grp) \
140 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
141
142
143struct ext4_prealloc_space { 121struct ext4_prealloc_space {
144 struct list_head pa_inode_list; 122 struct list_head pa_inode_list;
145 struct list_head pa_group_list; 123 struct list_head pa_group_list;
@@ -217,6 +195,11 @@ struct ext4_allocation_context {
217 __u8 ac_op; /* operation, for history only */ 195 __u8 ac_op; /* operation, for history only */
218 struct page *ac_bitmap_page; 196 struct page *ac_bitmap_page;
219 struct page *ac_buddy_page; 197 struct page *ac_buddy_page;
198 /*
199 * pointer to the held semaphore upon successful
200 * block allocation
201 */
202 struct rw_semaphore *alloc_semp;
220 struct ext4_prealloc_space *ac_pa; 203 struct ext4_prealloc_space *ac_pa;
221 struct ext4_locality_group *ac_lg; 204 struct ext4_locality_group *ac_lg;
222}; 205};
@@ -250,6 +233,7 @@ struct ext4_buddy {
250 struct super_block *bd_sb; 233 struct super_block *bd_sb;
251 __u16 bd_blkbits; 234 __u16 bd_blkbits;
252 ext4_group_t bd_group; 235 ext4_group_t bd_group;
236 struct rw_semaphore *alloc_semp;
253}; 237};
254#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) 238#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
255#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) 239#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
@@ -259,51 +243,12 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
259{ 243{
260 return; 244 return;
261} 245}
262#else
263static void ext4_mb_store_history(struct ext4_allocation_context *ac);
264#endif 246#endif
265 247
266#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 248#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
267 249
268struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); 250struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
269 251static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
270static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
271 ext4_group_t group);
272static void ext4_mb_return_to_preallocation(struct inode *inode,
273 struct ext4_buddy *e4b, sector_t block,
274 int count);
275static void ext4_mb_put_pa(struct ext4_allocation_context *,
276 struct super_block *, struct ext4_prealloc_space *pa);
277static int ext4_mb_init_per_dev_proc(struct super_block *sb);
278static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
279static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
280
281
282static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
283{
284 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
285
286 bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
287}
288
289static inline void ext4_unlock_group(struct super_block *sb,
290 ext4_group_t group)
291{
292 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
293
294 bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
295}
296
297static inline int ext4_is_group_locked(struct super_block *sb,
298 ext4_group_t group)
299{
300 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
301
302 return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
303 &(grinfo->bb_state));
304}
305
306static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
307 struct ext4_free_extent *fex) 252 struct ext4_free_extent *fex)
308{ 253{
309 ext4_fsblk_t block; 254 ext4_fsblk_t block;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index f2a9cf498ecd..734abca25e35 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -59,7 +59,8 @@ static int finish_range(handle_t *handle, struct inode *inode,
59 /* 59 /*
60 * Make sure the credits we accumulated are not really high 60 * Make sure the credits we accumulated are not really high
61 */ 61 */
62 if (needed && handle->h_buffer_credits >= EXT4_RESERVE_TRANS_BLOCKS) { 62 if (needed && ext4_handle_has_enough_credits(handle,
63 EXT4_RESERVE_TRANS_BLOCKS)) {
63 retval = ext4_journal_restart(handle, needed); 64 retval = ext4_journal_restart(handle, needed);
64 if (retval) 65 if (retval)
65 goto err_out; 66 goto err_out;
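ext4_handle_has_enough_credits() wraps the raw h_buffer_credits comparison so the test is also safe on the fake handles used by journal-less mounts. The usual pattern around it, sketched below; ext4_journal_extend()/ext4_journal_restart() are the real helpers, the wrapper function is illustrative:

/* sketch for a file under fs/ext4/, where this header lives */
#include "ext4_jbd2.h"

static int demo_ensure_credits(handle_t *handle, int thresh, int needed)
{
	if (ext4_handle_has_enough_credits(handle, thresh))
		return 0;
	/* try to extend the running handle in place ... */
	if (!ext4_journal_extend(handle, needed))
		return 0;
	/* ... else commit what we have and start a fresh handle */
	return ext4_journal_restart(handle, needed);
}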
@@ -229,7 +230,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
229{ 230{
230 int retval = 0, needed; 231 int retval = 0, needed;
231 232
232 if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS) 233 if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
233 return 0; 234 return 0;
234 /* 235 /*
235 * We are freeing blocks. During this we touch 236 * We are freeing blocks. During this we touch
@@ -458,13 +459,13 @@ int ext4_ext_migrate(struct inode *inode)
458 struct list_blocks_struct lb; 459 struct list_blocks_struct lb;
459 unsigned long max_entries; 460 unsigned long max_entries;
460 461
461 if (!test_opt(inode->i_sb, EXTENTS)) 462 /*
462 /* 463 * If the filesystem does not support extents, or the inode
463 * if mounted with noextents we don't allow the migrate 464 * already is extent-based, error out.
464 */ 465 */
465 return -EINVAL; 466 if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
466 467 EXT4_FEATURE_INCOMPAT_EXTENTS) ||
467 if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 468 (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
468 return -EINVAL; 469 return -EINVAL;
469 470
470 if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) 471 if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 63adcb792988..fec0b4c2f5f1 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -74,10 +74,6 @@ static struct buffer_head *ext4_append(handle_t *handle,
74#define assert(test) J_ASSERT(test) 74#define assert(test) J_ASSERT(test)
75#endif 75#endif
76 76
77#ifndef swap
78#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
79#endif
80
81#ifdef DX_DEBUG 77#ifdef DX_DEBUG
82#define dxtrace(command) command 78#define dxtrace(command) command
83#else 79#else
@@ -372,6 +368,8 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
372 goto fail; 368 goto fail;
373 } 369 }
374 hinfo->hash_version = root->info.hash_version; 370 hinfo->hash_version = root->info.hash_version;
371 if (hinfo->hash_version <= DX_HASH_TEA)
372 hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
375 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; 373 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
376 if (d_name) 374 if (d_name)
377 ext4fs_dirhash(d_name->name, d_name->len, hinfo); 375 ext4fs_dirhash(d_name->name, d_name->len, hinfo);
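The s_hash_unsigned adjustment exists because the legacy dirhash folded filename bytes through plain char, whose signedness differs between architectures (signed on x86, unsigned on ARM and PowerPC), so the same directory could hash differently across machines. A toy, runnable illustration of the divergence; the fold below is deliberately simplified and is not the real TEA/half-MD4 mix:

#include <stdio.h>

static unsigned int fold(const char *name, int len, int treat_signed)
{
	unsigned int h = 0x12a3fe2d;
	int i, c;

	for (i = 0; i < len; i++) {
		c = treat_signed ? (signed char)name[i]
				 : (unsigned char)name[i];
		h = h * 7 + c;	/* sign of c changes h for bytes >= 0x80 */
	}
	return h;
}

int main(void)
{
	const char name[] = "caf\xe9";	/* 0xe9 is -23 signed, 233 unsigned */

	printf("signed char fold:   %#010x\n", fold(name, 4, 1));
	printf("unsigned char fold: %#010x\n", fold(name, 4, 0));
	return 0;
}

A directory created with one variant must keep using it, which is why the stored hash_version is bumped per filesystem rather than the algorithm being changed silently.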
@@ -641,6 +639,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
641 dir = dir_file->f_path.dentry->d_inode; 639 dir = dir_file->f_path.dentry->d_inode;
642 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { 640 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
643 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 641 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
642 if (hinfo.hash_version <= DX_HASH_TEA)
643 hinfo.hash_version +=
644 EXT4_SB(dir->i_sb)->s_hash_unsigned;
644 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; 645 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
645 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, 646 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
646 start_hash, start_minor_hash); 647 start_hash, start_minor_hash);
@@ -806,7 +807,7 @@ static inline int ext4_match (int len, const char * const name,
806static inline int search_dirblock(struct buffer_head *bh, 807static inline int search_dirblock(struct buffer_head *bh,
807 struct inode *dir, 808 struct inode *dir,
808 const struct qstr *d_name, 809 const struct qstr *d_name,
809 unsigned long offset, 810 unsigned int offset,
810 struct ext4_dir_entry_2 ** res_dir) 811 struct ext4_dir_entry_2 ** res_dir)
811{ 812{
812 struct ext4_dir_entry_2 * de; 813 struct ext4_dir_entry_2 * de;
@@ -1043,11 +1044,11 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1043 bh = ext4_find_entry(dir, &dentry->d_name, &de); 1044 bh = ext4_find_entry(dir, &dentry->d_name, &de);
1044 inode = NULL; 1045 inode = NULL;
1045 if (bh) { 1046 if (bh) {
1046 unsigned long ino = le32_to_cpu(de->inode); 1047 __u32 ino = le32_to_cpu(de->inode);
1047 brelse(bh); 1048 brelse(bh);
1048 if (!ext4_valid_inum(dir->i_sb, ino)) { 1049 if (!ext4_valid_inum(dir->i_sb, ino)) {
1049 ext4_error(dir->i_sb, "ext4_lookup", 1050 ext4_error(dir->i_sb, "ext4_lookup",
1050 "bad inode number: %lu", ino); 1051 "bad inode number: %u", ino);
1051 return ERR_PTR(-EIO); 1052 return ERR_PTR(-EIO);
1052 } 1053 }
1053 inode = ext4_iget(dir->i_sb, ino); 1054 inode = ext4_iget(dir->i_sb, ino);
@@ -1060,7 +1061,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1060 1061
1061struct dentry *ext4_get_parent(struct dentry *child) 1062struct dentry *ext4_get_parent(struct dentry *child)
1062{ 1063{
1063 unsigned long ino; 1064 __u32 ino;
1064 struct inode *inode; 1065 struct inode *inode;
1065 static const struct qstr dotdot = { 1066 static const struct qstr dotdot = {
1066 .name = "..", 1067 .name = "..",
@@ -1078,7 +1079,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
1078 1079
1079 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { 1080 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
1080 ext4_error(child->d_inode->i_sb, "ext4_get_parent", 1081 ext4_error(child->d_inode->i_sb, "ext4_get_parent",
1081 "bad inode number: %lu", ino); 1082 "bad inode number: %u", ino);
1082 return ERR_PTR(-EIO); 1083 return ERR_PTR(-EIO);
1083 } 1084 }
1084 1085
@@ -1166,9 +1167,9 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1166 u32 hash2; 1167 u32 hash2;
1167 struct dx_map_entry *map; 1168 struct dx_map_entry *map;
1168 char *data1 = (*bh)->b_data, *data2; 1169 char *data1 = (*bh)->b_data, *data2;
1169 unsigned split, move, size, i; 1170 unsigned split, move, size;
1170 struct ext4_dir_entry_2 *de = NULL, *de2; 1171 struct ext4_dir_entry_2 *de = NULL, *de2;
1171 int err = 0; 1172 int err = 0, i;
1172 1173
1173 bh2 = ext4_append (handle, dir, &newblock, &err); 1174 bh2 = ext4_append (handle, dir, &newblock, &err);
1174 if (!(bh2)) { 1175 if (!(bh2)) {
@@ -1228,10 +1229,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1228 de = de2; 1229 de = de2;
1229 } 1230 }
1230 dx_insert_block(frame, hash2 + continued, newblock); 1231 dx_insert_block(frame, hash2 + continued, newblock);
1231 err = ext4_journal_dirty_metadata(handle, bh2); 1232 err = ext4_handle_dirty_metadata(handle, dir, bh2);
1232 if (err) 1233 if (err)
1233 goto journal_error; 1234 goto journal_error;
1234 err = ext4_journal_dirty_metadata(handle, frame->bh); 1235 err = ext4_handle_dirty_metadata(handle, dir, frame->bh);
1235 if (err) 1236 if (err)
1236 goto journal_error; 1237 goto journal_error;
1237 brelse(bh2); 1238 brelse(bh2);
@@ -1266,7 +1267,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1266 struct inode *dir = dentry->d_parent->d_inode; 1267 struct inode *dir = dentry->d_parent->d_inode;
1267 const char *name = dentry->d_name.name; 1268 const char *name = dentry->d_name.name;
1268 int namelen = dentry->d_name.len; 1269 int namelen = dentry->d_name.len;
1269 unsigned long offset = 0; 1270 unsigned int offset = 0;
1270 unsigned short reclen; 1271 unsigned short reclen;
1271 int nlen, rlen, err; 1272 int nlen, rlen, err;
1272 char *top; 1273 char *top;
@@ -1335,8 +1336,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1335 ext4_update_dx_flag(dir); 1336 ext4_update_dx_flag(dir);
1336 dir->i_version++; 1337 dir->i_version++;
1337 ext4_mark_inode_dirty(handle, dir); 1338 ext4_mark_inode_dirty(handle, dir);
1338 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 1339 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1339 err = ext4_journal_dirty_metadata(handle, bh); 1340 err = ext4_handle_dirty_metadata(handle, dir, bh);
1340 if (err) 1341 if (err)
1341 ext4_std_error(dir->i_sb, err); 1342 ext4_std_error(dir->i_sb, err);
1342 brelse(bh); 1343 brelse(bh);
@@ -1408,6 +1409,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1408 1409
1409 /* Initialize as for dx_probe */ 1410 /* Initialize as for dx_probe */
1410 hinfo.hash_version = root->info.hash_version; 1411 hinfo.hash_version = root->info.hash_version;
1412 if (hinfo.hash_version <= DX_HASH_TEA)
1413 hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
1411 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; 1414 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
1412 ext4fs_dirhash(name, namelen, &hinfo); 1415 ext4fs_dirhash(name, namelen, &hinfo);
1413 frame = frames; 1416 frame = frames;
@@ -1437,7 +1440,6 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1437 struct inode *inode) 1440 struct inode *inode)
1438{ 1441{
1439 struct inode *dir = dentry->d_parent->d_inode; 1442 struct inode *dir = dentry->d_parent->d_inode;
1440 unsigned long offset;
1441 struct buffer_head *bh; 1443 struct buffer_head *bh;
1442 struct ext4_dir_entry_2 *de; 1444 struct ext4_dir_entry_2 *de;
1443 struct super_block *sb; 1445 struct super_block *sb;
@@ -1459,7 +1461,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1459 ext4_mark_inode_dirty(handle, dir); 1461 ext4_mark_inode_dirty(handle, dir);
1460 } 1462 }
1461 blocks = dir->i_size >> sb->s_blocksize_bits; 1463 blocks = dir->i_size >> sb->s_blocksize_bits;
1462 for (block = 0, offset = 0; block < blocks; block++) { 1464 for (block = 0; block < blocks; block++) {
1463 bh = ext4_bread(handle, dir, block, 0, &retval); 1465 bh = ext4_bread(handle, dir, block, 0, &retval);
1464 if(!bh) 1466 if(!bh)
1465 return retval; 1467 return retval;
@@ -1574,7 +1576,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1574 dxtrace(dx_show_index("node", frames[1].entries)); 1576 dxtrace(dx_show_index("node", frames[1].entries));
1575 dxtrace(dx_show_index("node", 1577 dxtrace(dx_show_index("node",
1576 ((struct dx_node *) bh2->b_data)->entries)); 1578 ((struct dx_node *) bh2->b_data)->entries));
1577 err = ext4_journal_dirty_metadata(handle, bh2); 1579 err = ext4_handle_dirty_metadata(handle, inode, bh2);
1578 if (err) 1580 if (err)
1579 goto journal_error; 1581 goto journal_error;
1580 brelse (bh2); 1582 brelse (bh2);
@@ -1600,7 +1602,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1600 if (err) 1602 if (err)
1601 goto journal_error; 1603 goto journal_error;
1602 } 1604 }
1603 ext4_journal_dirty_metadata(handle, frames[0].bh); 1605 ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
1604 } 1606 }
1605 de = do_split(handle, dir, &bh, frame, &hinfo, &err); 1607 de = do_split(handle, dir, &bh, frame, &hinfo, &err);
1606 if (!de) 1608 if (!de)
@@ -1646,8 +1648,8 @@ static int ext4_delete_entry(handle_t *handle,
1646 else 1648 else
1647 de->inode = 0; 1649 de->inode = 0;
1648 dir->i_version++; 1650 dir->i_version++;
1649 BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); 1651 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1650 ext4_journal_dirty_metadata(handle, bh); 1652 ext4_handle_dirty_metadata(handle, dir, bh);
1651 return 0; 1653 return 0;
1652 } 1654 }
1653 i += ext4_rec_len_from_disk(de->rec_len); 1655 i += ext4_rec_len_from_disk(de->rec_len);
@@ -1693,9 +1695,11 @@ static int ext4_add_nondir(handle_t *handle,
1693 if (!err) { 1695 if (!err) {
1694 ext4_mark_inode_dirty(handle, inode); 1696 ext4_mark_inode_dirty(handle, inode);
1695 d_instantiate(dentry, inode); 1697 d_instantiate(dentry, inode);
1698 unlock_new_inode(inode);
1696 return 0; 1699 return 0;
1697 } 1700 }
1698 drop_nlink(inode); 1701 drop_nlink(inode);
1702 unlock_new_inode(inode);
1699 iput(inode); 1703 iput(inode);
1700 return err; 1704 return err;
1701} 1705}
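The unlock_new_inode() calls pair with inodes coming back from ext4_new_inode() in the locked I_NEW state; every exit path has to unlock exactly once, success or failure. A condensed sketch of the contract mirrored by ext4_add_nondir() above (the helper name is illustrative):

#include <linux/fs.h>
#include <linux/dcache.h>

static int demo_finish_new_inode(struct dentry *dentry,
				 struct inode *inode, int err)
{
	if (!err) {
		d_instantiate(dentry, inode);
		unlock_new_inode(inode);	/* success: visible and unlocked */
		return 0;
	}
	drop_nlink(inode);
	unlock_new_inode(inode);		/* error path still unlocks once */
	iput(inode);
	return err;
}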
@@ -1723,7 +1727,7 @@ retry:
1723 return PTR_ERR(handle); 1727 return PTR_ERR(handle);
1724 1728
1725 if (IS_DIRSYNC(dir)) 1729 if (IS_DIRSYNC(dir))
1726 handle->h_sync = 1; 1730 ext4_handle_sync(handle);
1727 1731
1728 inode = ext4_new_inode (handle, dir, mode); 1732 inode = ext4_new_inode (handle, dir, mode);
1729 err = PTR_ERR(inode); 1733 err = PTR_ERR(inode);
@@ -1757,7 +1761,7 @@ retry:
1757 return PTR_ERR(handle); 1761 return PTR_ERR(handle);
1758 1762
1759 if (IS_DIRSYNC(dir)) 1763 if (IS_DIRSYNC(dir))
1760 handle->h_sync = 1; 1764 ext4_handle_sync(handle);
1761 1765
1762 inode = ext4_new_inode(handle, dir, mode); 1766 inode = ext4_new_inode(handle, dir, mode);
1763 err = PTR_ERR(inode); 1767 err = PTR_ERR(inode);
@@ -1793,7 +1797,7 @@ retry:
1793 return PTR_ERR(handle); 1797 return PTR_ERR(handle);
1794 1798
1795 if (IS_DIRSYNC(dir)) 1799 if (IS_DIRSYNC(dir))
1796 handle->h_sync = 1; 1800 ext4_handle_sync(handle);
1797 1801
1798 inode = ext4_new_inode(handle, dir, S_IFDIR | mode); 1802 inode = ext4_new_inode(handle, dir, S_IFDIR | mode);
1799 err = PTR_ERR(inode); 1803 err = PTR_ERR(inode);
@@ -1822,14 +1826,15 @@ retry:
1822 strcpy(de->name, ".."); 1826 strcpy(de->name, "..");
1823 ext4_set_de_type(dir->i_sb, de, S_IFDIR); 1827 ext4_set_de_type(dir->i_sb, de, S_IFDIR);
1824 inode->i_nlink = 2; 1828 inode->i_nlink = 2;
1825 BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata"); 1829 BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
1826 ext4_journal_dirty_metadata(handle, dir_block); 1830 ext4_handle_dirty_metadata(handle, dir, dir_block);
1827 brelse(dir_block); 1831 brelse(dir_block);
1828 ext4_mark_inode_dirty(handle, inode); 1832 ext4_mark_inode_dirty(handle, inode);
1829 err = ext4_add_entry(handle, dentry, inode); 1833 err = ext4_add_entry(handle, dentry, inode);
1830 if (err) { 1834 if (err) {
1831out_clear_inode: 1835out_clear_inode:
1832 clear_nlink(inode); 1836 clear_nlink(inode);
1837 unlock_new_inode(inode);
1833 ext4_mark_inode_dirty(handle, inode); 1838 ext4_mark_inode_dirty(handle, inode);
1834 iput(inode); 1839 iput(inode);
1835 goto out_stop; 1840 goto out_stop;
@@ -1838,6 +1843,7 @@ out_clear_inode:
1838 ext4_update_dx_flag(dir); 1843 ext4_update_dx_flag(dir);
1839 ext4_mark_inode_dirty(handle, dir); 1844 ext4_mark_inode_dirty(handle, dir);
1840 d_instantiate(dentry, inode); 1845 d_instantiate(dentry, inode);
1846 unlock_new_inode(inode);
1841out_stop: 1847out_stop:
1842 ext4_journal_stop(handle); 1848 ext4_journal_stop(handle);
1843 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) 1849 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
@@ -1850,7 +1856,7 @@ out_stop:
1850 */ 1856 */
1851static int empty_dir(struct inode *inode) 1857static int empty_dir(struct inode *inode)
1852{ 1858{
1853 unsigned long offset; 1859 unsigned int offset;
1854 struct buffer_head *bh; 1860 struct buffer_head *bh;
1855 struct ext4_dir_entry_2 *de, *de1; 1861 struct ext4_dir_entry_2 *de, *de1;
1856 struct super_block *sb; 1862 struct super_block *sb;
@@ -1895,7 +1901,7 @@ static int empty_dir(struct inode *inode)
1895 if (err) 1901 if (err)
1896 ext4_error(sb, __func__, 1902 ext4_error(sb, __func__,
1897 "error %d reading directory" 1903 "error %d reading directory"
1898 " #%lu offset %lu", 1904 " #%lu offset %u",
1899 err, inode->i_ino, offset); 1905 err, inode->i_ino, offset);
1900 offset += sb->s_blocksize; 1906 offset += sb->s_blocksize;
1901 continue; 1907 continue;
@@ -1933,6 +1939,9 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
1933 struct ext4_iloc iloc; 1939 struct ext4_iloc iloc;
1934 int err = 0, rc; 1940 int err = 0, rc;
1935 1941
1942 if (!ext4_handle_valid(handle))
1943 return 0;
1944
1936 lock_super(sb); 1945 lock_super(sb);
1937 if (!list_empty(&EXT4_I(inode)->i_orphan)) 1946 if (!list_empty(&EXT4_I(inode)->i_orphan))
1938 goto out_unlock; 1947 goto out_unlock;
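ext4_handle_valid() distinguishes a real jbd2 handle from the magic cookie handed out on journal-less ext4 mounts; the orphan list is journal bookkeeping, so it is skipped entirely without a journal. Roughly, as a sketch (the cookie value follows what ext4 uses, but treat the names as illustrative):

#include <linux/jbd2.h>

/* never dereferenced; just a recognizable non-NULL token */
#define DEMO_NOJOURNAL_HANDLE	((handle_t *)0x1)

static inline int demo_handle_valid(handle_t *handle)
{
	return handle != DEMO_NOJOURNAL_HANDLE;
}

/* callers bail out early, as ext4_orphan_add() does above */
static int demo_orphan_add(handle_t *handle)
{
	if (!demo_handle_valid(handle))
		return 0;
	/* ... journalled on-disk orphan-list update ... */
	return 0;
}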
@@ -1961,7 +1970,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
1961 /* Insert this inode at the head of the on-disk orphan list... */ 1970 /* Insert this inode at the head of the on-disk orphan list... */
1962 NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); 1971 NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
1963 EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); 1972 EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
1964 err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); 1973 err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh);
1965 rc = ext4_mark_iloc_dirty(handle, inode, &iloc); 1974 rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
1966 if (!err) 1975 if (!err)
1967 err = rc; 1976 err = rc;
@@ -1995,10 +2004,13 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
1995 struct list_head *prev; 2004 struct list_head *prev;
1996 struct ext4_inode_info *ei = EXT4_I(inode); 2005 struct ext4_inode_info *ei = EXT4_I(inode);
1997 struct ext4_sb_info *sbi; 2006 struct ext4_sb_info *sbi;
1998 unsigned long ino_next; 2007 __u32 ino_next;
1999 struct ext4_iloc iloc; 2008 struct ext4_iloc iloc;
2000 int err = 0; 2009 int err = 0;
2001 2010
2011 if (!ext4_handle_valid(handle))
2012 return 0;
2013
2002 lock_super(inode->i_sb); 2014 lock_super(inode->i_sb);
2003 if (list_empty(&ei->i_orphan)) { 2015 if (list_empty(&ei->i_orphan)) {
2004 unlock_super(inode->i_sb); 2016 unlock_super(inode->i_sb);
@@ -2017,7 +2029,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2017 * transaction handle with which to update the orphan list on 2029 * transaction handle with which to update the orphan list on
2018 * disk, but we still need to remove the inode from the linked 2030 * disk, but we still need to remove the inode from the linked
2019 * list in memory. */ 2031 * list in memory. */
2020 if (!handle) 2032 if (sbi->s_journal && !handle)
2021 goto out; 2033 goto out;
2022 2034
2023 err = ext4_reserve_inode_write(handle, inode, &iloc); 2035 err = ext4_reserve_inode_write(handle, inode, &iloc);
@@ -2025,19 +2037,19 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2025 goto out_err; 2037 goto out_err;
2026 2038
2027 if (prev == &sbi->s_orphan) { 2039 if (prev == &sbi->s_orphan) {
2028 jbd_debug(4, "superblock will point to %lu\n", ino_next); 2040 jbd_debug(4, "superblock will point to %u\n", ino_next);
2029 BUFFER_TRACE(sbi->s_sbh, "get_write_access"); 2041 BUFFER_TRACE(sbi->s_sbh, "get_write_access");
2030 err = ext4_journal_get_write_access(handle, sbi->s_sbh); 2042 err = ext4_journal_get_write_access(handle, sbi->s_sbh);
2031 if (err) 2043 if (err)
2032 goto out_brelse; 2044 goto out_brelse;
2033 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); 2045 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
2034 err = ext4_journal_dirty_metadata(handle, sbi->s_sbh); 2046 err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh);
2035 } else { 2047 } else {
2036 struct ext4_iloc iloc2; 2048 struct ext4_iloc iloc2;
2037 struct inode *i_prev = 2049 struct inode *i_prev =
2038 &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; 2050 &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
2039 2051
2040 jbd_debug(4, "orphan inode %lu will point to %lu\n", 2052 jbd_debug(4, "orphan inode %lu will point to %u\n",
2041 i_prev->i_ino, ino_next); 2053 i_prev->i_ino, ino_next);
2042 err = ext4_reserve_inode_write(handle, i_prev, &iloc2); 2054 err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
2043 if (err) 2055 if (err)
@@ -2082,7 +2094,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2082 goto end_rmdir; 2094 goto end_rmdir;
2083 2095
2084 if (IS_DIRSYNC(dir)) 2096 if (IS_DIRSYNC(dir))
2085 handle->h_sync = 1; 2097 ext4_handle_sync(handle);
2086 2098
2087 inode = dentry->d_inode; 2099 inode = dentry->d_inode;
2088 2100
@@ -2136,7 +2148,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2136 return PTR_ERR(handle); 2148 return PTR_ERR(handle);
2137 2149
2138 if (IS_DIRSYNC(dir)) 2150 if (IS_DIRSYNC(dir))
2139 handle->h_sync = 1; 2151 ext4_handle_sync(handle);
2140 2152
2141 retval = -ENOENT; 2153 retval = -ENOENT;
2142 bh = ext4_find_entry(dir, &dentry->d_name, &de); 2154 bh = ext4_find_entry(dir, &dentry->d_name, &de);
@@ -2193,7 +2205,7 @@ retry:
2193 return PTR_ERR(handle); 2205 return PTR_ERR(handle);
2194 2206
2195 if (IS_DIRSYNC(dir)) 2207 if (IS_DIRSYNC(dir))
2196 handle->h_sync = 1; 2208 ext4_handle_sync(handle);
2197 2209
2198 inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO); 2210 inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO);
2199 err = PTR_ERR(inode); 2211 err = PTR_ERR(inode);
@@ -2208,10 +2220,10 @@ retry:
2208 * We have a transaction open. All is sweetness. It also sets 2220 * We have a transaction open. All is sweetness. It also sets
2209 * i_size in generic_commit_write(). 2221 * i_size in generic_commit_write().
2210 */ 2222 */
2211 err = __page_symlink(inode, symname, l, 2223 err = __page_symlink(inode, symname, l, 1);
2212 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
2213 if (err) { 2224 if (err) {
2214 clear_nlink(inode); 2225 clear_nlink(inode);
2226 unlock_new_inode(inode);
2215 ext4_mark_inode_dirty(handle, inode); 2227 ext4_mark_inode_dirty(handle, inode);
2216 iput(inode); 2228 iput(inode);
2217 goto out_stop; 2229 goto out_stop;
@@ -2256,13 +2268,20 @@ retry:
2256 return PTR_ERR(handle); 2268 return PTR_ERR(handle);
2257 2269
2258 if (IS_DIRSYNC(dir)) 2270 if (IS_DIRSYNC(dir))
2259 handle->h_sync = 1; 2271 ext4_handle_sync(handle);
2260 2272
2261 inode->i_ctime = ext4_current_time(inode); 2273 inode->i_ctime = ext4_current_time(inode);
2262 ext4_inc_count(handle, inode); 2274 ext4_inc_count(handle, inode);
2263 atomic_inc(&inode->i_count); 2275 atomic_inc(&inode->i_count);
2264 2276
2265 err = ext4_add_nondir(handle, dentry, inode); 2277 err = ext4_add_entry(handle, dentry, inode);
2278 if (!err) {
2279 ext4_mark_inode_dirty(handle, inode);
2280 d_instantiate(dentry, inode);
2281 } else {
2282 drop_nlink(inode);
2283 iput(inode);
2284 }
2266 ext4_journal_stop(handle); 2285 ext4_journal_stop(handle);
2267 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) 2286 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2268 goto retry; 2287 goto retry;
@@ -2298,7 +2317,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2298 return PTR_ERR(handle); 2317 return PTR_ERR(handle);
2299 2318
2300 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) 2319 if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
2301 handle->h_sync = 1; 2320 ext4_handle_sync(handle);
2302 2321
2303 old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); 2322 old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
2304 /* 2323 /*
@@ -2352,8 +2371,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2352 new_dir->i_ctime = new_dir->i_mtime = 2371 new_dir->i_ctime = new_dir->i_mtime =
2353 ext4_current_time(new_dir); 2372 ext4_current_time(new_dir);
2354 ext4_mark_inode_dirty(handle, new_dir); 2373 ext4_mark_inode_dirty(handle, new_dir);
2355 BUFFER_TRACE(new_bh, "call ext4_journal_dirty_metadata"); 2374 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
2356 ext4_journal_dirty_metadata(handle, new_bh); 2375 ext4_handle_dirty_metadata(handle, new_dir, new_bh);
2357 brelse(new_bh); 2376 brelse(new_bh);
2358 new_bh = NULL; 2377 new_bh = NULL;
2359 } 2378 }
@@ -2403,8 +2422,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2403 BUFFER_TRACE(dir_bh, "get_write_access"); 2422 BUFFER_TRACE(dir_bh, "get_write_access");
2404 ext4_journal_get_write_access(handle, dir_bh); 2423 ext4_journal_get_write_access(handle, dir_bh);
2405 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); 2424 PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
2406 BUFFER_TRACE(dir_bh, "call ext4_journal_dirty_metadata"); 2425 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
2407 ext4_journal_dirty_metadata(handle, dir_bh); 2426 ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
2408 ext4_dec_count(handle, old_dir); 2427 ext4_dec_count(handle, old_dir);
2409 if (new_inode) { 2428 if (new_inode) {
2410 /* checked empty_dir above, can't have another parent, 2429 /* checked empty_dir above, can't have another parent,
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b6ec1843a015..c328be5d6885 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -50,7 +50,7 @@ static int verify_group_input(struct super_block *sb,
50 ext4_get_group_no_and_offset(sb, start, NULL, &offset); 50 ext4_get_group_no_and_offset(sb, start, NULL, &offset);
51 if (group != sbi->s_groups_count) 51 if (group != sbi->s_groups_count)
52 ext4_warning(sb, __func__, 52 ext4_warning(sb, __func__,
53 "Cannot add at group %u (only %lu groups)", 53 "Cannot add at group %u (only %u groups)",
54 input->group, sbi->s_groups_count); 54 input->group, sbi->s_groups_count);
55 else if (offset != 0) 55 else if (offset != 0)
56 ext4_warning(sb, __func__, "Last group not full"); 56 ext4_warning(sb, __func__, "Last group not full");
@@ -149,7 +149,7 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh,
149{ 149{
150 int err; 150 int err;
151 151
152 if (handle->h_buffer_credits >= thresh) 152 if (ext4_handle_has_enough_credits(handle, thresh))
153 return 0; 153 return 0;
154 154
155 err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA); 155 err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA);
@@ -232,7 +232,7 @@ static int setup_new_group_blocks(struct super_block *sb,
232 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); 232 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
233 set_buffer_uptodate(gdb); 233 set_buffer_uptodate(gdb);
234 unlock_buffer(gdb); 234 unlock_buffer(gdb);
235 ext4_journal_dirty_metadata(handle, gdb); 235 ext4_handle_dirty_metadata(handle, NULL, gdb);
236 ext4_set_bit(bit, bh->b_data); 236 ext4_set_bit(bit, bh->b_data);
237 brelse(gdb); 237 brelse(gdb);
238 } 238 }
@@ -251,7 +251,7 @@ static int setup_new_group_blocks(struct super_block *sb,
251 err = PTR_ERR(bh); 251 err = PTR_ERR(bh);
252 goto exit_bh; 252 goto exit_bh;
253 } 253 }
254 ext4_journal_dirty_metadata(handle, gdb); 254 ext4_handle_dirty_metadata(handle, NULL, gdb);
255 ext4_set_bit(bit, bh->b_data); 255 ext4_set_bit(bit, bh->b_data);
256 brelse(gdb); 256 brelse(gdb);
257 } 257 }
@@ -276,7 +276,7 @@ static int setup_new_group_blocks(struct super_block *sb,
276 err = PTR_ERR(it); 276 err = PTR_ERR(it);
277 goto exit_bh; 277 goto exit_bh;
278 } 278 }
279 ext4_journal_dirty_metadata(handle, it); 279 ext4_handle_dirty_metadata(handle, NULL, it);
280 brelse(it); 280 brelse(it);
281 ext4_set_bit(bit, bh->b_data); 281 ext4_set_bit(bit, bh->b_data);
282 } 282 }
@@ -284,11 +284,9 @@ static int setup_new_group_blocks(struct super_block *sb,
284 if ((err = extend_or_restart_transaction(handle, 2, bh))) 284 if ((err = extend_or_restart_transaction(handle, 2, bh)))
285 goto exit_bh; 285 goto exit_bh;
286 286
287 mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb), 287 mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data);
288 bh->b_data); 288 ext4_handle_dirty_metadata(handle, NULL, bh);
289 ext4_journal_dirty_metadata(handle, bh);
290 brelse(bh); 289 brelse(bh);
291
292 /* Mark unused entries in inode bitmap used */ 290 /* Mark unused entries in inode bitmap used */
293 ext4_debug("clear inode bitmap %#04llx (+%llu)\n", 291 ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
294 input->inode_bitmap, input->inode_bitmap - start); 292 input->inode_bitmap, input->inode_bitmap - start);
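Both mark_bitmap_end() call sites now pad out to sb->s_blocksize * 8 bits (the full bitmap block) rather than EXT4_BLOCKS_PER_GROUP(sb), so the tail is marked in-use even when the group holds fewer items than the block has bits. The padding itself just sets every trailing bit; roughly (ext4's real helper also fills whole bytes with memset):

static void demo_mark_bitmap_end(int count, int nbits, unsigned char *bitmap)
{
	int i;

	/* everything past `count` is not a real block/inode: mark it used */
	for (i = count; i < nbits; i++)
		bitmap[i >> 3] |= 1 << (i & 7);
}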
@@ -297,9 +295,9 @@ static int setup_new_group_blocks(struct super_block *sb,
297 goto exit_journal; 295 goto exit_journal;
298 } 296 }
299 297
300 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb), 298 mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
301 bh->b_data); 299 bh->b_data);
302 ext4_journal_dirty_metadata(handle, bh); 300 ext4_handle_dirty_metadata(handle, NULL, bh);
303exit_bh: 301exit_bh:
304 brelse(bh); 302 brelse(bh);
305 303
@@ -486,12 +484,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
486 * reserved inode, and will become GDT blocks (primary and backup). 484 * reserved inode, and will become GDT blocks (primary and backup).
487 */ 485 */
488 data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; 486 data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
489 ext4_journal_dirty_metadata(handle, dind); 487 ext4_handle_dirty_metadata(handle, NULL, dind);
490 brelse(dind); 488 brelse(dind);
491 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; 489 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
492 ext4_mark_iloc_dirty(handle, inode, &iloc); 490 ext4_mark_iloc_dirty(handle, inode, &iloc);
493 memset((*primary)->b_data, 0, sb->s_blocksize); 491 memset((*primary)->b_data, 0, sb->s_blocksize);
494 ext4_journal_dirty_metadata(handle, *primary); 492 ext4_handle_dirty_metadata(handle, NULL, *primary);
495 493
496 o_group_desc = EXT4_SB(sb)->s_group_desc; 494 o_group_desc = EXT4_SB(sb)->s_group_desc;
497 memcpy(n_group_desc, o_group_desc, 495 memcpy(n_group_desc, o_group_desc,
@@ -502,7 +500,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
502 kfree(o_group_desc); 500 kfree(o_group_desc);
503 501
504 le16_add_cpu(&es->s_reserved_gdt_blocks, -1); 502 le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
505 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); 503 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
506 504
507 return 0; 505 return 0;
508 506
@@ -618,7 +616,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
618 primary[i]->b_blocknr, gdbackups, 616 primary[i]->b_blocknr, gdbackups,
619 blk + primary[i]->b_blocknr); */ 617 blk + primary[i]->b_blocknr); */
620 data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); 618 data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
621 err2 = ext4_journal_dirty_metadata(handle, primary[i]); 619 err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]);
622 if (!err) 620 if (!err)
623 err = err2; 621 err = err2;
624 } 622 }
@@ -676,7 +674,8 @@ static void update_backups(struct super_block *sb,
676 struct buffer_head *bh; 674 struct buffer_head *bh;
677 675
678 /* Out of journal space, and can't get more - abort - so sad */ 676 /* Out of journal space, and can't get more - abort - so sad */
679 if (handle->h_buffer_credits == 0 && 677 if (ext4_handle_valid(handle) &&
678 handle->h_buffer_credits == 0 &&
680 ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) && 679 ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) &&
681 (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) 680 (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
682 break; 681 break;
@@ -696,7 +695,7 @@ static void update_backups(struct super_block *sb,
696 memset(bh->b_data + size, 0, rest); 695 memset(bh->b_data + size, 0, rest);
697 set_buffer_uptodate(bh); 696 set_buffer_uptodate(bh);
698 unlock_buffer(bh); 697 unlock_buffer(bh);
699 ext4_journal_dirty_metadata(handle, bh); 698 ext4_handle_dirty_metadata(handle, NULL, bh);
700 brelse(bh); 699 brelse(bh);
701 } 700 }
702 if ((err2 = ext4_journal_stop(handle)) && !err) 701 if ((err2 = ext4_journal_stop(handle)) && !err)
@@ -715,7 +714,7 @@ static void update_backups(struct super_block *sb,
715exit_err: 714exit_err:
716 if (err) { 715 if (err) {
717 ext4_warning(sb, __func__, 716 ext4_warning(sb, __func__,
718 "can't update backup for group %lu (err %d), " 717 "can't update backup for group %u (err %d), "
719 "forcing fsck on next reboot", group, err); 718 "forcing fsck on next reboot", group, err);
720 sbi->s_mount_state &= ~EXT4_VALID_FS; 719 sbi->s_mount_state &= ~EXT4_VALID_FS;
721 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); 720 sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
@@ -747,6 +746,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
747 struct inode *inode = NULL; 746 struct inode *inode = NULL;
748 handle_t *handle; 747 handle_t *handle;
749 int gdb_off, gdb_num; 748 int gdb_off, gdb_num;
749 int num_grp_locked = 0;
750 int err, err2; 750 int err, err2;
751 751
752 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); 752 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
@@ -761,13 +761,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
761 761
762 if (ext4_blocks_count(es) + input->blocks_count < 762 if (ext4_blocks_count(es) + input->blocks_count <
763 ext4_blocks_count(es)) { 763 ext4_blocks_count(es)) {
764 ext4_warning(sb, __func__, "blocks_count overflow\n"); 764 ext4_warning(sb, __func__, "blocks_count overflow");
765 return -EINVAL; 765 return -EINVAL;
766 } 766 }
767 767
768 if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < 768 if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
769 le32_to_cpu(es->s_inodes_count)) { 769 le32_to_cpu(es->s_inodes_count)) {
770 ext4_warning(sb, __func__, "inodes_count overflow\n"); 770 ext4_warning(sb, __func__, "inodes_count overflow");
771 return -EINVAL; 771 return -EINVAL;
772 } 772 }
773 773
@@ -787,6 +787,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
787 } 787 }
788 } 788 }
789 789
790
790 if ((err = verify_group_input(sb, input))) 791 if ((err = verify_group_input(sb, input)))
791 goto exit_put; 792 goto exit_put;
792 793
@@ -855,6 +856,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
855 * using the new disk blocks. 856 * using the new disk blocks.
856 */ 857 */
857 858
859 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
858 /* Update group descriptor block for new group */ 860 /* Update group descriptor block for new group */
859 gdp = (struct ext4_group_desc *)((char *)primary->b_data + 861 gdp = (struct ext4_group_desc *)((char *)primary->b_data +
860 gdb_off * EXT4_DESC_SIZE(sb)); 862 gdb_off * EXT4_DESC_SIZE(sb));
@@ -862,17 +864,20 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
862 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ 864 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
863 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ 865 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
864 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ 866 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
865 gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count); 867 ext4_free_blks_set(sb, gdp, input->free_blocks_count);
866 gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb)); 868 ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
869 gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
867 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); 870 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
868 871
869 /* 872 /*
870 * We can allocate memory for mb_alloc based on the new group 873 * We can allocate memory for mb_alloc based on the new group
871 * descriptor 874 * descriptor
872 */ 875 */
873 err = ext4_mb_add_more_groupinfo(sb, input->group, gdp); 876 err = ext4_mb_add_groupinfo(sb, input->group, gdp);
874 if (err) 877 if (err) {
878 ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
875 goto exit_journal; 879 goto exit_journal;
880 }
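The get/put pair brackets the window in which the new group's descriptor and its mballoc state disagree, so buddy-cache pages for the affected groups cannot be loaded half-initialized; every exit from that window, including this error path, must release exactly the locks it took. Schematically (names as in the patch):

	num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
	/* ... set up gdp, register the group with mballoc ... */
	ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);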
876 881
877 /* 882 /*
878 * Make the new blocks and inodes valid next. We do this before 883 * Make the new blocks and inodes valid next. We do this before
@@ -914,8 +919,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
914 919
915 /* Update the global fs size fields */ 920 /* Update the global fs size fields */
916 sbi->s_groups_count++; 921 sbi->s_groups_count++;
922 ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
917 923
918 ext4_journal_dirty_metadata(handle, primary); 924 ext4_handle_dirty_metadata(handle, NULL, primary);
919 925
920 /* Update the reserved block counts only once the new group is 926 /* Update the reserved block counts only once the new group is
921 * active. */ 927 * active. */
@@ -937,7 +943,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
937 EXT4_INODES_PER_GROUP(sb); 943 EXT4_INODES_PER_GROUP(sb);
938 } 944 }
939 945
940 ext4_journal_dirty_metadata(handle, sbi->s_sbh); 946 ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
941 sb->s_dirt = 1; 947 sb->s_dirt = 1;
942 948
943exit_journal: 949exit_journal:
@@ -975,9 +981,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
975 struct buffer_head *bh; 981 struct buffer_head *bh;
976 handle_t *handle; 982 handle_t *handle;
977 int err; 983 int err;
978 unsigned long freed_blocks;
979 ext4_group_t group; 984 ext4_group_t group;
980 struct ext4_group_info *grp;
981 985
982 /* We don't need to worry about locking wrt other resizers just 986 /* We don't need to worry about locking wrt other resizers just
983 * yet: we're going to revalidate es->s_blocks_count after 987 * yet: we're going to revalidate es->s_blocks_count after
@@ -997,8 +1001,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
997 " too large to resize to %llu blocks safely\n", 1001 " too large to resize to %llu blocks safely\n",
998 sb->s_id, n_blocks_count); 1002 sb->s_id, n_blocks_count);
999 if (sizeof(sector_t) < 8) 1003 if (sizeof(sector_t) < 8)
1000 ext4_warning(sb, __func__, 1004 ext4_warning(sb, __func__, "CONFIG_LBD not enabled");
1001 "CONFIG_LBD not enabled\n");
1002 return -EINVAL; 1005 return -EINVAL;
1003 } 1006 }
1004 1007
@@ -1071,62 +1074,18 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1071 goto exit_put; 1074 goto exit_put;
1072 } 1075 }
1073 ext4_blocks_count_set(es, o_blocks_count + add); 1076 ext4_blocks_count_set(es, o_blocks_count + add);
1074 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); 1077 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
1075 sb->s_dirt = 1; 1078 sb->s_dirt = 1;
1076 unlock_super(sb); 1079 unlock_super(sb);
1077 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, 1080 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
1078 o_blocks_count + add); 1081 o_blocks_count + add);
1079 ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); 1082 /* We add the blocks to the bitmap and set the group need init bit */
1083 ext4_add_groupblocks(handle, sb, o_blocks_count, add);
1080 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, 1084 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
1081 o_blocks_count + add); 1085 o_blocks_count + add);
1082 if ((err = ext4_journal_stop(handle))) 1086 if ((err = ext4_journal_stop(handle)))
1083 goto exit_put; 1087 goto exit_put;
1084 1088
1085 /*
1086 * Mark mballoc pages as not up to date so that they will be updated
1087 * next time they are loaded by ext4_mb_load_buddy.
1088 *
1089 * XXX Bad, Bad, BAD!!! We should not be overloading the
1086 * Uptodate flag, particularly on the bitmap bh, as a way of
1091 * hinting to ext4_mb_load_buddy() that it needs to be
1092 * reloaded. A user could take an LVM snapshot, then do an
1093 * on-line fsck, and clear the uptodate flag, and this would
1094 * not be a bug in userspace, but a bug in the kernel. FIXME!!!
1095 */
1096 {
1097 struct ext4_sb_info *sbi = EXT4_SB(sb);
1098 struct inode *inode = sbi->s_buddy_cache;
1099 int blocks_per_page;
1100 int block;
1101 int pnum;
1102 struct page *page;
1103
1104 /* Set buddy page as not up to date */
1105 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1106 block = group * 2;
1107 pnum = block / blocks_per_page;
1108 page = find_get_page(inode->i_mapping, pnum);
1109 if (page != NULL) {
1110 ClearPageUptodate(page);
1111 page_cache_release(page);
1112 }
1113
1114 /* Set bitmap page as not up to date */
1115 block++;
1116 pnum = block / blocks_per_page;
1117 page = find_get_page(inode->i_mapping, pnum);
1118 if (page != NULL) {
1119 ClearPageUptodate(page);
1120 page_cache_release(page);
1121 }
1122
1123 /* Get the info on the last group */
1124 grp = ext4_get_group_info(sb, group);
1125
1126 /* Update free blocks in group info */
1127 ext4_mb_update_group_info(grp, add);
1128 }
1129
1130 if (test_opt(sb, DEBUG)) 1089 if (test_opt(sb, DEBUG))
1131 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", 1090 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
1132 ext4_blocks_count(es)); 1091 ext4_blocks_count(es));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e4a241c65dbe..e5f06a5f045e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -51,9 +51,7 @@ struct proc_dir_entry *ext4_proc_root;
51 51
52static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 52static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
53 unsigned long journal_devnum); 53 unsigned long journal_devnum);
54static int ext4_create_journal(struct super_block *, struct ext4_super_block *, 54static int ext4_commit_super(struct super_block *sb,
55 unsigned int);
56static void ext4_commit_super(struct super_block *sb,
57 struct ext4_super_block *es, int sync); 55 struct ext4_super_block *es, int sync);
58static void ext4_mark_recovery_complete(struct super_block *sb, 56static void ext4_mark_recovery_complete(struct super_block *sb,
59 struct ext4_super_block *es); 57 struct ext4_super_block *es);
@@ -64,9 +62,9 @@ static const char *ext4_decode_error(struct super_block *sb, int errno,
64 char nbuf[16]); 62 char nbuf[16]);
65static int ext4_remount(struct super_block *sb, int *flags, char *data); 63static int ext4_remount(struct super_block *sb, int *flags, char *data);
66static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); 64static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
67static void ext4_unlockfs(struct super_block *sb); 65static int ext4_unfreeze(struct super_block *sb);
68static void ext4_write_super(struct super_block *sb); 66static void ext4_write_super(struct super_block *sb);
69static void ext4_write_super_lockfs(struct super_block *sb); 67static int ext4_freeze(struct super_block *sb);
70 68
71 69
72ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, 70ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
@@ -93,6 +91,38 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
93 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); 91 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
94} 92}
95 93
94__u32 ext4_free_blks_count(struct super_block *sb,
95 struct ext4_group_desc *bg)
96{
97 return le16_to_cpu(bg->bg_free_blocks_count_lo) |
98 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
99 (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
100}
101
102__u32 ext4_free_inodes_count(struct super_block *sb,
103 struct ext4_group_desc *bg)
104{
105 return le16_to_cpu(bg->bg_free_inodes_count_lo) |
106 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
107 (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
108}
109
110__u32 ext4_used_dirs_count(struct super_block *sb,
111 struct ext4_group_desc *bg)
112{
113 return le16_to_cpu(bg->bg_used_dirs_count_lo) |
114 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
115 (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
116}
117
118__u32 ext4_itable_unused_count(struct super_block *sb,
119 struct ext4_group_desc *bg)
120{
121 return le16_to_cpu(bg->bg_itable_unused_lo) |
122 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
123 (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
124}
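These accessors splice a 32-bit count out of two little-endian 16-bit halves; the high half is only meaningful when the descriptor size indicates the 64-bit layout. A self-contained sketch of the arithmetic (the values are made up):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint16_t lo = 0x3400;   /* bg_..._lo, already converted to cpu order */
		uint16_t hi = 0x0012;   /* bg_..._hi, present only on 64-bit layouts */
		uint32_t count = (uint32_t)lo | ((uint32_t)hi << 16);

		printf("0x%x\n", count);        /* prints 0x123400 */
		return 0;
	}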
125
96void ext4_block_bitmap_set(struct super_block *sb, 126void ext4_block_bitmap_set(struct super_block *sb,
97 struct ext4_group_desc *bg, ext4_fsblk_t blk) 127 struct ext4_group_desc *bg, ext4_fsblk_t blk)
98{ 128{
@@ -117,6 +147,38 @@ void ext4_inode_table_set(struct super_block *sb,
117 bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); 147 bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
118} 148}
119 149
150void ext4_free_blks_set(struct super_block *sb,
151 struct ext4_group_desc *bg, __u32 count)
152{
153 bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
154 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
155 bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
156}
157
158void ext4_free_inodes_set(struct super_block *sb,
159 struct ext4_group_desc *bg, __u32 count)
160{
161 bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
162 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
163 bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
164}
165
166void ext4_used_dirs_set(struct super_block *sb,
167 struct ext4_group_desc *bg, __u32 count)
168{
169 bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);
170 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
171 bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
172}
173
174void ext4_itable_unused_set(struct super_block *sb,
175 struct ext4_group_desc *bg, __u32 count)
176{
177 bg->bg_itable_unused_lo = cpu_to_le16((__u16)count);
178 if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
179 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
180}
181
120/* 182/*
121 * Wrappers for jbd2_journal_start/end. 183 * Wrappers for jbd2_journal_start/end.
122 * 184 *
@@ -136,13 +198,19 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
136 * backs (eg. EIO in the commit thread), then we still need to 198 * backs (eg. EIO in the commit thread), then we still need to
137 * take the FS itself readonly cleanly. */ 199 * take the FS itself readonly cleanly. */
138 journal = EXT4_SB(sb)->s_journal; 200 journal = EXT4_SB(sb)->s_journal;
139 if (is_journal_aborted(journal)) { 201 if (journal) {
140 ext4_abort(sb, __func__, 202 if (is_journal_aborted(journal)) {
141 "Detected aborted journal"); 203 ext4_abort(sb, __func__,
142 return ERR_PTR(-EROFS); 204 "Detected aborted journal");
205 return ERR_PTR(-EROFS);
206 }
207 return jbd2_journal_start(journal, nblocks);
143 } 208 }
144 209 /*
145 return jbd2_journal_start(journal, nblocks); 210 * We're not journaling; return the appropriate indication.
211 */
212 current->journal_info = EXT4_NOJOURNAL_HANDLE;
213 return current->journal_info;
146} 214}
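EXT4_NOJOURNAL_HANDLE is a sentinel pointer parked in current->journal_info so that later ext4_journal_* calls can recognize a no-journal mount; ext4_handle_valid() is the corresponding test. The companion definitions in ext4_jbd2.h look roughly like this (a sketch; the exact sentinel value is an implementation detail of the series):

	#define EXT4_NOJOURNAL_HANDLE   ((handle_t *) 0x1)      /* never dereferenced */

	static inline int ext4_handle_valid(handle_t *handle)
	{
		if (handle == EXT4_NOJOURNAL_HANDLE)
			return 0;
		return 1;
	}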
147 215
148/* 216/*
@@ -157,6 +225,14 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
157 int err; 225 int err;
158 int rc; 226 int rc;
159 227
228 if (!ext4_handle_valid(handle)) {
229 /*
230 * Do this here since we don't call jbd2_journal_stop() in
231 * no-journal mode.
232 */
233 current->journal_info = NULL;
234 return 0;
235 }
160 sb = handle->h_transaction->t_journal->j_private; 236 sb = handle->h_transaction->t_journal->j_private;
161 err = handle->h_err; 237 err = handle->h_err;
162 rc = jbd2_journal_stop(handle); 238 rc = jbd2_journal_stop(handle);
@@ -174,6 +250,8 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
174 char nbuf[16]; 250 char nbuf[16];
175 const char *errstr = ext4_decode_error(NULL, err, nbuf); 251 const char *errstr = ext4_decode_error(NULL, err, nbuf);
176 252
253 BUG_ON(!ext4_handle_valid(handle));
254
177 if (bh) 255 if (bh)
178 BUFFER_TRACE(bh, "abort"); 256 BUFFER_TRACE(bh, "abort");
179 257
@@ -350,6 +428,44 @@ void ext4_warning(struct super_block *sb, const char *function,
350 va_end(args); 428 va_end(args);
351} 429}
352 430
431void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp,
432 const char *function, const char *fmt, ...)
433__releases(bitlock)
434__acquires(bitlock)
435{
436 va_list args;
437 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
438
439 va_start(args, fmt);
440 printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
441 vprintk(fmt, args);
442 printk("\n");
443 va_end(args);
444
445 if (test_opt(sb, ERRORS_CONT)) {
446 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
447 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
448 ext4_commit_super(sb, es, 0);
449 return;
450 }
451 ext4_unlock_group(sb, grp);
452 ext4_handle_error(sb);
453 /*
454 * We only get here in the ERRORS_RO case; relocking the group
455 * may be dangerous, but nothing bad will happen since the
456 * filesystem will have already been marked read-only and the
457 * journal has been aborted. We return 1 as a hint to callers
458 * who might want to use the return value from
459 * ext4_grp_locked_error() to distinguish between the
460 * ERRORS_CONT and ERRORS_RO cases, and perhaps return more
461 * aggressively from the ext4 function in question, with a
462 * more appropriate error code.
463 */
464 ext4_lock_group(sb, grp);
465 return;
466}
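The locking contract is that the caller already holds the group's bitlock, and in the ERRORS_RO path the helper drops and retakes it around ext4_handle_error(). A hypothetical call site (the consistency check itself is made up):

	ext4_lock_group(sb, group);
	if (free != expected)   /* hypothetical on-disk vs. computed mismatch */
		ext4_grp_locked_error(sb, group, __func__,
				      "free count %u != computed %u",
				      free, expected);
	ext4_unlock_group(sb, group);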
467
468
353void ext4_update_dynamic_rev(struct super_block *sb) 469void ext4_update_dynamic_rev(struct super_block *sb)
354{ 470{
355 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 471 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
@@ -389,7 +505,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev)
389 return bdev; 505 return bdev;
390 506
391fail: 507fail:
392 printk(KERN_ERR "EXT4: failed to open journal device %s: %ld\n", 508 printk(KERN_ERR "EXT4-fs: failed to open journal device %s: %ld\n",
393 __bdevname(dev, b), PTR_ERR(bdev)); 509 __bdevname(dev, b), PTR_ERR(bdev));
394 return NULL; 510 return NULL;
395} 511}
@@ -448,11 +564,13 @@ static void ext4_put_super(struct super_block *sb)
448 ext4_mb_release(sb); 564 ext4_mb_release(sb);
449 ext4_ext_release(sb); 565 ext4_ext_release(sb);
450 ext4_xattr_put_super(sb); 566 ext4_xattr_put_super(sb);
451 err = jbd2_journal_destroy(sbi->s_journal); 567 if (sbi->s_journal) {
452 sbi->s_journal = NULL; 568 err = jbd2_journal_destroy(sbi->s_journal);
453 if (err < 0) 569 sbi->s_journal = NULL;
454 ext4_abort(sb, __func__, "Couldn't clean up the journal"); 570 if (err < 0)
455 571 ext4_abort(sb, __func__,
572 "Couldn't clean up the journal");
573 }
456 if (!(sb->s_flags & MS_RDONLY)) { 574 if (!(sb->s_flags & MS_RDONLY)) {
457 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 575 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
458 es->s_state = cpu_to_le16(sbi->s_mount_state); 576 es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -522,6 +640,11 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
522 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); 640 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
523 INIT_LIST_HEAD(&ei->i_prealloc_list); 641 INIT_LIST_HEAD(&ei->i_prealloc_list);
524 spin_lock_init(&ei->i_prealloc_lock); 642 spin_lock_init(&ei->i_prealloc_lock);
643 /*
644 * Note: We can be called before EXT4_SB(sb)->s_journal is set,
645 * therefore it can be null here. Don't check it, just initialize
646 * jinode.
647 */
525 jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); 648 jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
526 ei->i_reserved_data_blocks = 0; 649 ei->i_reserved_data_blocks = 0;
527 ei->i_reserved_meta_blocks = 0; 650 ei->i_reserved_meta_blocks = 0;
@@ -588,7 +711,8 @@ static void ext4_clear_inode(struct inode *inode)
588 } 711 }
589#endif 712#endif
590 ext4_discard_preallocations(inode); 713 ext4_discard_preallocations(inode);
591 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, 714 if (EXT4_JOURNAL(inode))
715 jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
592 &EXT4_I(inode)->jinode); 716 &EXT4_I(inode)->jinode);
593} 717}
594 718
@@ -681,10 +805,19 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
681#endif 805#endif
682 if (!test_opt(sb, RESERVATION)) 806 if (!test_opt(sb, RESERVATION))
683 seq_puts(seq, ",noreservation"); 807 seq_puts(seq, ",noreservation");
684 if (sbi->s_commit_interval) { 808 if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
685 seq_printf(seq, ",commit=%u", 809 seq_printf(seq, ",commit=%u",
686 (unsigned) (sbi->s_commit_interval / HZ)); 810 (unsigned) (sbi->s_commit_interval / HZ));
687 } 811 }
812 if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) {
813 seq_printf(seq, ",min_batch_time=%u",
814 (unsigned) sbi->s_min_batch_time);
815 }
816 if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
817 seq_printf(seq, ",max_batch_time=%u",
818 (unsigned) sbi->s_max_batch_time);
819 }
820
688 /* 821 /*
689 * We're changing the default of barrier mount option, so 822 * We're changing the default of barrier mount option, so
690 * let's always display its mount state so it's clear what its 823 * let's always display its mount state so it's clear what its
@@ -696,8 +829,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
696 seq_puts(seq, ",journal_async_commit"); 829 seq_puts(seq, ",journal_async_commit");
697 if (test_opt(sb, NOBH)) 830 if (test_opt(sb, NOBH))
698 seq_puts(seq, ",nobh"); 831 seq_puts(seq, ",nobh");
699 if (!test_opt(sb, EXTENTS))
700 seq_puts(seq, ",noextents");
701 if (test_opt(sb, I_VERSION)) 832 if (test_opt(sb, I_VERSION))
702 seq_puts(seq, ",i_version"); 833 seq_puts(seq, ",i_version");
703 if (!test_opt(sb, DELALLOC)) 834 if (!test_opt(sb, DELALLOC))
@@ -772,6 +903,25 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
772 ext4_nfs_get_inode); 903 ext4_nfs_get_inode);
773} 904}
774 905
906/*
907 * Try to release metadata pages (indirect blocks, directories) which are
908 * mapped via the block device. Since these pages could have journal heads
909 * which would prevent try_to_free_buffers() from freeing them, we must use
910 * the jbd2 layer's try_to_free_buffers() function to release them.
911 */
912static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_t wait)
913{
914 journal_t *journal = EXT4_SB(sb)->s_journal;
915
916 WARN_ON(PageChecked(page));
917 if (!page_has_buffers(page))
918 return 0;
919 if (journal)
920 return jbd2_journal_try_to_free_buffers(journal, page,
921 wait & ~__GFP_WAIT);
922 return try_to_free_buffers(page);
923}
924
775#ifdef CONFIG_QUOTA 925#ifdef CONFIG_QUOTA
776#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group") 926#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
777#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) 927#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
@@ -803,7 +953,9 @@ static struct dquot_operations ext4_quota_operations = {
803 .acquire_dquot = ext4_acquire_dquot, 953 .acquire_dquot = ext4_acquire_dquot,
804 .release_dquot = ext4_release_dquot, 954 .release_dquot = ext4_release_dquot,
805 .mark_dirty = ext4_mark_dquot_dirty, 955 .mark_dirty = ext4_mark_dquot_dirty,
806 .write_info = ext4_write_info 956 .write_info = ext4_write_info,
957 .alloc_dquot = dquot_alloc,
958 .destroy_dquot = dquot_destroy,
807}; 959};
808 960
809static struct quotactl_ops ext4_qctl_operations = { 961static struct quotactl_ops ext4_qctl_operations = {
@@ -826,8 +978,8 @@ static const struct super_operations ext4_sops = {
826 .put_super = ext4_put_super, 978 .put_super = ext4_put_super,
827 .write_super = ext4_write_super, 979 .write_super = ext4_write_super,
828 .sync_fs = ext4_sync_fs, 980 .sync_fs = ext4_sync_fs,
829 .write_super_lockfs = ext4_write_super_lockfs, 981 .freeze_fs = ext4_freeze,
830 .unlockfs = ext4_unlockfs, 982 .unfreeze_fs = ext4_unfreeze,
831 .statfs = ext4_statfs, 983 .statfs = ext4_statfs,
832 .remount_fs = ext4_remount, 984 .remount_fs = ext4_remount,
833 .clear_inode = ext4_clear_inode, 985 .clear_inode = ext4_clear_inode,
@@ -836,6 +988,7 @@ static const struct super_operations ext4_sops = {
836 .quota_read = ext4_quota_read, 988 .quota_read = ext4_quota_read,
837 .quota_write = ext4_quota_write, 989 .quota_write = ext4_quota_write,
838#endif 990#endif
991 .bdev_try_to_free_page = bdev_try_to_free_page,
839}; 992};
840 993
841static const struct export_operations ext4_export_ops = { 994static const struct export_operations ext4_export_ops = {
@@ -850,16 +1003,17 @@ enum {
850 Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, 1003 Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
851 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, 1004 Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
852 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, 1005 Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
853 Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, 1006 Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
1007 Opt_journal_update, Opt_journal_dev,
854 Opt_journal_checksum, Opt_journal_async_commit, 1008 Opt_journal_checksum, Opt_journal_async_commit,
855 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 1009 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
856 Opt_data_err_abort, Opt_data_err_ignore, 1010 Opt_data_err_abort, Opt_data_err_ignore,
857 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1011 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
858 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 1012 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
859 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, 1013 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
860 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, 1014 Opt_grpquota, Opt_i_version,
861 Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1015 Opt_stripe, Opt_delalloc, Opt_nodelalloc,
862 Opt_inode_readahead_blks 1016 Opt_inode_readahead_blks, Opt_journal_ioprio
863}; 1017};
864 1018
865static const match_table_t tokens = { 1019static const match_table_t tokens = {
@@ -889,8 +1043,9 @@ static const match_table_t tokens = {
889 {Opt_nobh, "nobh"}, 1043 {Opt_nobh, "nobh"},
890 {Opt_bh, "bh"}, 1044 {Opt_bh, "bh"},
891 {Opt_commit, "commit=%u"}, 1045 {Opt_commit, "commit=%u"},
1046 {Opt_min_batch_time, "min_batch_time=%u"},
1047 {Opt_max_batch_time, "max_batch_time=%u"},
892 {Opt_journal_update, "journal=update"}, 1048 {Opt_journal_update, "journal=update"},
893 {Opt_journal_inum, "journal=%u"},
894 {Opt_journal_dev, "journal_dev=%u"}, 1049 {Opt_journal_dev, "journal_dev=%u"},
895 {Opt_journal_checksum, "journal_checksum"}, 1050 {Opt_journal_checksum, "journal_checksum"},
896 {Opt_journal_async_commit, "journal_async_commit"}, 1051 {Opt_journal_async_commit, "journal_async_commit"},
@@ -911,14 +1066,13 @@ static const match_table_t tokens = {
911 {Opt_quota, "quota"}, 1066 {Opt_quota, "quota"},
912 {Opt_usrquota, "usrquota"}, 1067 {Opt_usrquota, "usrquota"},
913 {Opt_barrier, "barrier=%u"}, 1068 {Opt_barrier, "barrier=%u"},
914 {Opt_extents, "extents"},
915 {Opt_noextents, "noextents"},
916 {Opt_i_version, "i_version"}, 1069 {Opt_i_version, "i_version"},
917 {Opt_stripe, "stripe=%u"}, 1070 {Opt_stripe, "stripe=%u"},
918 {Opt_resize, "resize"}, 1071 {Opt_resize, "resize"},
919 {Opt_delalloc, "delalloc"}, 1072 {Opt_delalloc, "delalloc"},
920 {Opt_nodelalloc, "nodelalloc"}, 1073 {Opt_nodelalloc, "nodelalloc"},
921 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, 1074 {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
1075 {Opt_journal_ioprio, "journal_ioprio=%u"},
922 {Opt_err, NULL}, 1076 {Opt_err, NULL},
923}; 1077};
924 1078
@@ -943,8 +1097,11 @@ static ext4_fsblk_t get_sb_block(void **data)
943 return sb_block; 1097 return sb_block;
944} 1098}
945 1099
1100#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
1101
946static int parse_options(char *options, struct super_block *sb, 1102static int parse_options(char *options, struct super_block *sb,
947 unsigned int *inum, unsigned long *journal_devnum, 1103 unsigned long *journal_devnum,
1104 unsigned int *journal_ioprio,
948 ext4_fsblk_t *n_blocks_count, int is_remount) 1105 ext4_fsblk_t *n_blocks_count, int is_remount)
949{ 1106{
950 struct ext4_sb_info *sbi = EXT4_SB(sb); 1107 struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -956,7 +1113,6 @@ static int parse_options(char *options, struct super_block *sb,
956 int qtype, qfmt; 1113 int qtype, qfmt;
957 char *qname; 1114 char *qname;
958#endif 1115#endif
959 ext4_fsblk_t last_block;
960 1116
961 if (!options) 1117 if (!options)
962 return 1; 1118 return 1;
@@ -1068,16 +1224,6 @@ static int parse_options(char *options, struct super_block *sb,
1068 } 1224 }
1069 set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); 1225 set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
1070 break; 1226 break;
1071 case Opt_journal_inum:
1072 if (is_remount) {
1073 printk(KERN_ERR "EXT4-fs: cannot specify "
1074 "journal on remount\n");
1075 return 0;
1076 }
1077 if (match_int(&args[0], &option))
1078 return 0;
1079 *inum = option;
1080 break;
1081 case Opt_journal_dev: 1227 case Opt_journal_dev:
1082 if (is_remount) { 1228 if (is_remount) {
1083 printk(KERN_ERR "EXT4-fs: cannot specify " 1229 printk(KERN_ERR "EXT4-fs: cannot specify "
@@ -1107,6 +1253,22 @@ static int parse_options(char *options, struct super_block *sb,
1107 option = JBD2_DEFAULT_MAX_COMMIT_AGE; 1253 option = JBD2_DEFAULT_MAX_COMMIT_AGE;
1108 sbi->s_commit_interval = HZ * option; 1254 sbi->s_commit_interval = HZ * option;
1109 break; 1255 break;
1256 case Opt_max_batch_time:
1257 if (match_int(&args[0], &option))
1258 return 0;
1259 if (option < 0)
1260 return 0;
1261 if (option == 0)
1262 option = EXT4_DEF_MAX_BATCH_TIME;
1263 sbi->s_max_batch_time = option;
1264 break;
1265 case Opt_min_batch_time:
1266 if (match_int(&args[0], &option))
1267 return 0;
1268 if (option < 0)
1269 return 0;
1270 sbi->s_min_batch_time = option;
1271 break;
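Both knobs are plain integers in microseconds and are handed through to jbd2's j_min_batch_time and j_max_batch_time in ext4_init_journal_params() further down. The matching defaults in ext4.h from this series are approximately:

	#define EXT4_DEF_MIN_BATCH_TIME 0
	#define EXT4_DEF_MAX_BATCH_TIME 15000   /* 15 ms; value assumed from the series */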
1110 case Opt_data_journal: 1272 case Opt_data_journal:
1111 data_opt = EXT4_MOUNT_JOURNAL_DATA; 1273 data_opt = EXT4_MOUNT_JOURNAL_DATA;
1112 goto datacheck; 1274 goto datacheck;
@@ -1142,8 +1304,7 @@ static int parse_options(char *options, struct super_block *sb,
1142 case Opt_grpjquota: 1304 case Opt_grpjquota:
1143 qtype = GRPQUOTA; 1305 qtype = GRPQUOTA;
1144set_qf_name: 1306set_qf_name:
1145 if ((sb_any_quota_enabled(sb) || 1307 if (sb_any_quota_loaded(sb) &&
1146 sb_any_quota_suspended(sb)) &&
1147 !sbi->s_qf_names[qtype]) { 1308 !sbi->s_qf_names[qtype]) {
1148 printk(KERN_ERR 1309 printk(KERN_ERR
1149 "EXT4-fs: Cannot change journaled " 1310 "EXT4-fs: Cannot change journaled "
@@ -1182,8 +1343,7 @@ set_qf_name:
1182 case Opt_offgrpjquota: 1343 case Opt_offgrpjquota:
1183 qtype = GRPQUOTA; 1344 qtype = GRPQUOTA;
1184clear_qf_name: 1345clear_qf_name:
1185 if ((sb_any_quota_enabled(sb) || 1346 if (sb_any_quota_loaded(sb) &&
1186 sb_any_quota_suspended(sb)) &&
1187 sbi->s_qf_names[qtype]) { 1347 sbi->s_qf_names[qtype]) {
1188 printk(KERN_ERR "EXT4-fs: Cannot change " 1348 printk(KERN_ERR "EXT4-fs: Cannot change "
1189 "journaled quota options when " 1349 "journaled quota options when "
@@ -1202,8 +1362,7 @@ clear_qf_name:
1202 case Opt_jqfmt_vfsv0: 1362 case Opt_jqfmt_vfsv0:
1203 qfmt = QFMT_VFS_V0; 1363 qfmt = QFMT_VFS_V0;
1204set_qf_format: 1364set_qf_format:
1205 if ((sb_any_quota_enabled(sb) || 1365 if (sb_any_quota_loaded(sb) &&
1206 sb_any_quota_suspended(sb)) &&
1207 sbi->s_jquota_fmt != qfmt) { 1366 sbi->s_jquota_fmt != qfmt) {
1208 printk(KERN_ERR "EXT4-fs: Cannot change " 1367 printk(KERN_ERR "EXT4-fs: Cannot change "
1209 "journaled quota options when " 1368 "journaled quota options when "
@@ -1222,7 +1381,7 @@ set_qf_format:
1222 set_opt(sbi->s_mount_opt, GRPQUOTA); 1381 set_opt(sbi->s_mount_opt, GRPQUOTA);
1223 break; 1382 break;
1224 case Opt_noquota: 1383 case Opt_noquota:
1225 if (sb_any_quota_enabled(sb)) { 1384 if (sb_any_quota_loaded(sb)) {
1226 printk(KERN_ERR "EXT4-fs: Cannot change quota " 1385 printk(KERN_ERR "EXT4-fs: Cannot change quota "
1227 "options when quota turned on.\n"); 1386 "options when quota turned on.\n");
1228 return 0; 1387 return 0;
@@ -1280,33 +1439,6 @@ set_qf_format:
1280 case Opt_bh: 1439 case Opt_bh:
1281 clear_opt(sbi->s_mount_opt, NOBH); 1440 clear_opt(sbi->s_mount_opt, NOBH);
1282 break; 1441 break;
1283 case Opt_extents:
1284 if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
1285 EXT4_FEATURE_INCOMPAT_EXTENTS)) {
1286 ext4_warning(sb, __func__,
1287 "extents feature not enabled "
1288 "on this filesystem, use tune2fs\n");
1289 return 0;
1290 }
1291 set_opt(sbi->s_mount_opt, EXTENTS);
1292 break;
1293 case Opt_noextents:
1294 /*
1295 * When e2fsprogs support resizing an already existing
1296 * ext3 file system to greater than 2**32 we need to
1297 * add support to block allocator to handle growing
1298 * already existing block mapped inode so that blocks
1299 * allocated for them fall within 2**32
1300 */
1301 last_block = ext4_blocks_count(sbi->s_es) - 1;
1302 if (last_block > 0xffffffffULL) {
1303 printk(KERN_ERR "EXT4-fs: Filesystem too "
1304 "large to mount with "
1305 "-o noextents options\n");
1306 return 0;
1307 }
1308 clear_opt(sbi->s_mount_opt, EXTENTS);
1309 break;
1310 case Opt_i_version: 1442 case Opt_i_version:
1311 set_opt(sbi->s_mount_opt, I_VERSION); 1443 set_opt(sbi->s_mount_opt, I_VERSION);
1312 sb->s_flags |= MS_I_VERSION; 1444 sb->s_flags |= MS_I_VERSION;
@@ -1331,6 +1463,14 @@ set_qf_format:
1331 return 0; 1463 return 0;
1332 sbi->s_inode_readahead_blks = option; 1464 sbi->s_inode_readahead_blks = option;
1333 break; 1465 break;
1466 case Opt_journal_ioprio:
1467 if (match_int(&args[0], &option))
1468 return 0;
1469 if (option < 0 || option > 7)
1470 break;
1471 *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
1472 option);
1473 break;
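journal_ioprio=N picks a best-effort I/O priority level (0..7) for the kjournald2 thread. IOPRIO_PRIO_VALUE() packs the scheduling class and level into one word; a sketch of the encoding as in the mainline ioprio header (the shift width is assumed):

	#define IOPRIO_CLASS_SHIFT      13      /* assumed, per mainline ioprio.h */
	#define IOPRIO_PRIO_VALUE(class, data)  (((class) << IOPRIO_CLASS_SHIFT) | (data))

	/* DEFAULT_JOURNAL_IOPRIO above is IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3) */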
1334 default: 1474 default:
1335 printk(KERN_ERR 1475 printk(KERN_ERR
1336 "EXT4-fs: Unrecognized mount option \"%s\" " 1476 "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1406,24 +1546,19 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1406 printk(KERN_WARNING 1546 printk(KERN_WARNING
1407 "EXT4-fs warning: checktime reached, " 1547 "EXT4-fs warning: checktime reached, "
1408 "running e2fsck is recommended\n"); 1548 "running e2fsck is recommended\n");
1409#if 0 1549 if (!sbi->s_journal)
1410 /* @@@ We _will_ want to clear the valid bit if we find 1550 es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
1411 * inconsistencies, to force a fsck at reboot. But for
1412 * a plain journaled filesystem we can keep it set as
1413 * valid forever! :)
1414 */
1415 es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
1416#endif
1417 if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) 1551 if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
1418 es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); 1552 es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
1419 le16_add_cpu(&es->s_mnt_count, 1); 1553 le16_add_cpu(&es->s_mnt_count, 1);
1420 es->s_mtime = cpu_to_le32(get_seconds()); 1554 es->s_mtime = cpu_to_le32(get_seconds());
1421 ext4_update_dynamic_rev(sb); 1555 ext4_update_dynamic_rev(sb);
1422 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 1556 if (sbi->s_journal)
1557 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
1423 1558
1424 ext4_commit_super(sb, es, 1); 1559 ext4_commit_super(sb, es, 1);
1425 if (test_opt(sb, DEBUG)) 1560 if (test_opt(sb, DEBUG))
1426 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%lu, " 1561 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
1427 "bpg=%lu, ipg=%lu, mo=%04lx]\n", 1562 "bpg=%lu, ipg=%lu, mo=%04lx]\n",
1428 sb->s_blocksize, 1563 sb->s_blocksize,
1429 sbi->s_groups_count, 1564 sbi->s_groups_count,
@@ -1431,9 +1566,13 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1431 EXT4_INODES_PER_GROUP(sb), 1566 EXT4_INODES_PER_GROUP(sb),
1432 sbi->s_mount_opt); 1567 sbi->s_mount_opt);
1433 1568
1434 printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n", 1569 if (EXT4_SB(sb)->s_journal) {
1435 sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" : 1570 printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
1436 "external", EXT4_SB(sb)->s_journal->j_devname); 1571 sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
1572 "external", EXT4_SB(sb)->s_journal->j_devname);
1573 } else {
1574 printk(KERN_INFO "EXT4 FS on %s, no journal\n", sb->s_id);
1575 }
1437 return res; 1576 return res;
1438} 1577}
1439 1578
@@ -1445,7 +1584,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
1445 ext4_group_t flex_group_count; 1584 ext4_group_t flex_group_count;
1446 ext4_group_t flex_group; 1585 ext4_group_t flex_group;
1447 int groups_per_flex = 0; 1586 int groups_per_flex = 0;
1448 __u64 block_bitmap = 0;
1449 int i; 1587 int i;
1450 1588
1451 if (!sbi->s_es->s_log_groups_per_flex) { 1589 if (!sbi->s_es->s_log_groups_per_flex) {
@@ -1464,21 +1602,18 @@ static int ext4_fill_flex_info(struct super_block *sb)
1464 sizeof(struct flex_groups), GFP_KERNEL); 1602 sizeof(struct flex_groups), GFP_KERNEL);
1465 if (sbi->s_flex_groups == NULL) { 1603 if (sbi->s_flex_groups == NULL) {
1466 printk(KERN_ERR "EXT4-fs: not enough memory for " 1604 printk(KERN_ERR "EXT4-fs: not enough memory for "
1467 "%lu flex groups\n", flex_group_count); 1605 "%u flex groups\n", flex_group_count);
1468 goto failed; 1606 goto failed;
1469 } 1607 }
1470 1608
1471 gdp = ext4_get_group_desc(sb, 1, &bh);
1472 block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
1473
1474 for (i = 0; i < sbi->s_groups_count; i++) { 1609 for (i = 0; i < sbi->s_groups_count; i++) {
1475 gdp = ext4_get_group_desc(sb, i, &bh); 1610 gdp = ext4_get_group_desc(sb, i, &bh);
1476 1611
1477 flex_group = ext4_flex_group(sbi, i); 1612 flex_group = ext4_flex_group(sbi, i);
1478 sbi->s_flex_groups[flex_group].free_inodes += 1613 sbi->s_flex_groups[flex_group].free_inodes +=
1479 le16_to_cpu(gdp->bg_free_inodes_count); 1614 ext4_free_inodes_count(sb, gdp);
1480 sbi->s_flex_groups[flex_group].free_blocks += 1615 sbi->s_flex_groups[flex_group].free_blocks +=
1481 le16_to_cpu(gdp->bg_free_blocks_count); 1616 ext4_free_blks_count(sb, gdp);
1482 } 1617 }
1483 1618
1484 return 1; 1619 return 1;
@@ -1552,14 +1687,14 @@ static int ext4_check_descriptors(struct super_block *sb)
1552 block_bitmap = ext4_block_bitmap(sb, gdp); 1687 block_bitmap = ext4_block_bitmap(sb, gdp);
1553 if (block_bitmap < first_block || block_bitmap > last_block) { 1688 if (block_bitmap < first_block || block_bitmap > last_block) {
1554 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1689 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
1555 "Block bitmap for group %lu not in group " 1690 "Block bitmap for group %u not in group "
1556 "(block %llu)!\n", i, block_bitmap); 1691 "(block %llu)!\n", i, block_bitmap);
1557 return 0; 1692 return 0;
1558 } 1693 }
1559 inode_bitmap = ext4_inode_bitmap(sb, gdp); 1694 inode_bitmap = ext4_inode_bitmap(sb, gdp);
1560 if (inode_bitmap < first_block || inode_bitmap > last_block) { 1695 if (inode_bitmap < first_block || inode_bitmap > last_block) {
1561 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1696 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
1562 "Inode bitmap for group %lu not in group " 1697 "Inode bitmap for group %u not in group "
1563 "(block %llu)!\n", i, inode_bitmap); 1698 "(block %llu)!\n", i, inode_bitmap);
1564 return 0; 1699 return 0;
1565 } 1700 }
@@ -1567,14 +1702,14 @@ static int ext4_check_descriptors(struct super_block *sb)
1567 if (inode_table < first_block || 1702 if (inode_table < first_block ||
1568 inode_table + sbi->s_itb_per_group - 1 > last_block) { 1703 inode_table + sbi->s_itb_per_group - 1 > last_block) {
1569 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1704 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
1570 "Inode table for group %lu not in group " 1705 "Inode table for group %u not in group "
1571 "(block %llu)!\n", i, inode_table); 1706 "(block %llu)!\n", i, inode_table);
1572 return 0; 1707 return 0;
1573 } 1708 }
1574 spin_lock(sb_bgl_lock(sbi, i)); 1709 spin_lock(sb_bgl_lock(sbi, i));
1575 if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { 1710 if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
1576 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " 1711 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
1577 "Checksum for group %lu failed (%u!=%u)\n", 1712 "Checksum for group %u failed (%u!=%u)\n",
1578 i, le16_to_cpu(ext4_group_desc_csum(sbi, i, 1713 i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
1579 gdp)), le16_to_cpu(gdp->bg_checksum)); 1714 gdp)), le16_to_cpu(gdp->bg_checksum));
1580 if (!(sb->s_flags & MS_RDONLY)) { 1715 if (!(sb->s_flags & MS_RDONLY)) {
@@ -1721,7 +1856,7 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files)
1721 /* small i_blocks in vfs inode? */ 1856 /* small i_blocks in vfs inode? */
1722 if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { 1857 if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
1723 /* 1858 /*
1724 * CONFIG_LSF is not enabled implies the inode 1859 * CONFIG_LBD is not enabled implies the inode
1725 * i_block represents total blocks in 512 bytes 1860 * i_block represents total blocks in 512 bytes
1726 * 32 == size of vfs inode i_blocks * 8 1861 * 32 == size of vfs inode i_blocks * 8
1727 */ 1862 */
@@ -1764,7 +1899,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
1764 1899
1765 if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { 1900 if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
1766 /* 1901 /*
1767 * !has_huge_files or CONFIG_LSF is not enabled 1902 * !has_huge_files or CONFIG_LBD is not enabled
1768 * implies the inode i_block represents total blocks in 1903 * implies the inode i_block represents total blocks in
1769 * 512 bytes 32 == size of vfs inode i_blocks * 8 1904 * 512 bytes 32 == size of vfs inode i_blocks * 8
1770 */ 1905 */
@@ -1866,19 +2001,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
1866 ext4_fsblk_t sb_block = get_sb_block(&data); 2001 ext4_fsblk_t sb_block = get_sb_block(&data);
1867 ext4_fsblk_t logical_sb_block; 2002 ext4_fsblk_t logical_sb_block;
1868 unsigned long offset = 0; 2003 unsigned long offset = 0;
1869 unsigned int journal_inum = 0;
1870 unsigned long journal_devnum = 0; 2004 unsigned long journal_devnum = 0;
1871 unsigned long def_mount_opts; 2005 unsigned long def_mount_opts;
1872 struct inode *root; 2006 struct inode *root;
1873 char *cp; 2007 char *cp;
2008 const char *descr;
1874 int ret = -EINVAL; 2009 int ret = -EINVAL;
1875 int blocksize; 2010 int blocksize;
1876 int db_count; 2011 unsigned int db_count;
1877 int i; 2012 unsigned int i;
1878 int needs_recovery, has_huge_files; 2013 int needs_recovery, has_huge_files;
1879 __le32 features; 2014 int features;
1880 __u64 blocks_count; 2015 __u64 blocks_count;
1881 int err; 2016 int err;
2017 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
1882 2018
1883 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 2019 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
1884 if (!sbi) 2020 if (!sbi)
@@ -1959,31 +2095,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
1959 2095
1960 sbi->s_resuid = le16_to_cpu(es->s_def_resuid); 2096 sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
1961 sbi->s_resgid = le16_to_cpu(es->s_def_resgid); 2097 sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
2098 sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
2099 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
2100 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
1962 2101
1963 set_opt(sbi->s_mount_opt, RESERVATION); 2102 set_opt(sbi->s_mount_opt, RESERVATION);
1964 set_opt(sbi->s_mount_opt, BARRIER); 2103 set_opt(sbi->s_mount_opt, BARRIER);
1965 2104
1966 /* 2105 /*
1967 * turn on extents feature by default in ext4 filesystem
1968 * only if feature flag already set by mkfs or tune2fs.
1969 * Use -o noextents to turn it off
1970 */
1971 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
1972 set_opt(sbi->s_mount_opt, EXTENTS);
1973 else
1974 ext4_warning(sb, __func__,
1975 "extents feature not enabled on this filesystem, "
1976 "use tune2fs.\n");
1977
1978 /*
1979 * enable delayed allocation by default 2106 * enable delayed allocation by default
1980 * Use -o nodelalloc to turn it off 2107 * Use -o nodelalloc to turn it off
1981 */ 2108 */
1982 set_opt(sbi->s_mount_opt, DELALLOC); 2109 set_opt(sbi->s_mount_opt, DELALLOC);
1983 2110
1984 2111
1985 if (!parse_options((char *) data, sb, &journal_inum, &journal_devnum, 2112 if (!parse_options((char *) data, sb, &journal_devnum,
1986 NULL, 0)) 2113 &journal_ioprio, NULL, 0))
1987 goto failed_mount; 2114 goto failed_mount;
1988 2115
1989 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 2116 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
@@ -2005,15 +2132,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2005 features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP); 2132 features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
2006 if (features) { 2133 if (features) {
2007 printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of " 2134 printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of "
2008 "unsupported optional features (%x).\n", 2135 "unsupported optional features (%x).\n", sb->s_id,
2009 sb->s_id, le32_to_cpu(features)); 2136 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
2137 ~EXT4_FEATURE_INCOMPAT_SUPP));
2010 goto failed_mount; 2138 goto failed_mount;
2011 } 2139 }
2012 features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP); 2140 features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
2013 if (!(sb->s_flags & MS_RDONLY) && features) { 2141 if (!(sb->s_flags & MS_RDONLY) && features) {
2014 printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of " 2142 printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of "
2015 "unsupported optional features (%x).\n", 2143 "unsupported optional features (%x).\n", sb->s_id,
2016 sb->s_id, le32_to_cpu(features)); 2144 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
2145 ~EXT4_FEATURE_RO_COMPAT_SUPP));
2017 goto failed_mount; 2146 goto failed_mount;
2018 } 2147 }
2019 has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, 2148 has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -2021,13 +2150,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2021 if (has_huge_files) { 2150 if (has_huge_files) {
2022 /* 2151 /*
2023 * Large file size enabled file system can only be 2152 * Large file size enabled file system can only be
2024 * mounted if the kernel is built with CONFIG_LSF 2153 * mounted if the kernel is built with CONFIG_LBD
2025 */ 2154 */
2026 if (sizeof(root->i_blocks) < sizeof(u64) && 2155 if (sizeof(root->i_blocks) < sizeof(u64) &&
2027 !(sb->s_flags & MS_RDONLY)) { 2156 !(sb->s_flags & MS_RDONLY)) {
2028 printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge " 2157 printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge "
2029 "files cannot be mounted read-write " 2158 "files cannot be mounted read-write "
2030 "without CONFIG_LSF.\n", sb->s_id); 2159 "without CONFIG_LBD.\n", sb->s_id);
2031 goto failed_mount; 2160 goto failed_mount;
2032 } 2161 }
2033 } 2162 }
@@ -2118,6 +2247,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2118 for (i = 0; i < 4; i++) 2247 for (i = 0; i < 4; i++)
2119 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); 2248 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
2120 sbi->s_def_hash_version = es->s_def_hash_version; 2249 sbi->s_def_hash_version = es->s_def_hash_version;
2250 i = le32_to_cpu(es->s_flags);
2251 if (i & EXT2_FLAGS_UNSIGNED_HASH)
2252 sbi->s_hash_unsigned = 3;
2253 else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
2254#ifdef __CHAR_UNSIGNED__
2255 es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
2256 sbi->s_hash_unsigned = 3;
2257#else
2258 es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
2259#endif
2260 sb->s_dirt = 1;
2261 }
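The flag matters because the legacy dx hash folds directory-name bytes through plain char, whose signedness is implementation-defined, so the same name can hash differently on x86 (signed char) and ARM or PowerPC (unsigned char). A minimal demonstration of the divergence (a toy fold step, not the real hash):

	#include <stdio.h>

	int main(void)
	{
		char c = '\xe9';        /* high-bit byte, e.g. from a Latin-1 name */
		int folded = c * 7 + 1; /* stand-in for one hash fold step */

		/* signed char:   c == -23, folded == -160
		 * unsigned char: c == 233, folded == 1632 */
		printf("%d\n", folded);
		return 0;
	}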
2121 2262
2122 if (sbi->s_blocks_per_group > blocksize * 8) { 2263 if (sbi->s_blocks_per_group > blocksize * 8) {
2123 printk(KERN_ERR 2264 printk(KERN_ERR
@@ -2145,20 +2286,30 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2145 if (EXT4_BLOCKS_PER_GROUP(sb) == 0) 2286 if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
2146 goto cantfind_ext4; 2287 goto cantfind_ext4;
2147 2288
2148 /* ensure blocks_count calculation below doesn't sign-extend */ 2289 /*
2149 if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) < 2290 * It makes no sense for the first data block to be beyond the end
2150 le32_to_cpu(es->s_first_data_block) + 1) { 2291 * of the filesystem.
2151 printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, " 2292 */
2152 "first data block %u, blocks per group %lu\n", 2293 if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
2153 ext4_blocks_count(es), 2294 printk(KERN_WARNING "EXT4-fs: bad geometry: first data "
2154 le32_to_cpu(es->s_first_data_block), 2295 "block %u is beyond end of filesystem (%llu)\n",
2155 EXT4_BLOCKS_PER_GROUP(sb)); 2296 le32_to_cpu(es->s_first_data_block),
2297 ext4_blocks_count(es));
2156 goto failed_mount; 2298 goto failed_mount;
2157 } 2299 }
2158 blocks_count = (ext4_blocks_count(es) - 2300 blocks_count = (ext4_blocks_count(es) -
2159 le32_to_cpu(es->s_first_data_block) + 2301 le32_to_cpu(es->s_first_data_block) +
2160 EXT4_BLOCKS_PER_GROUP(sb) - 1); 2302 EXT4_BLOCKS_PER_GROUP(sb) - 1);
2161 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); 2303 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
2304 if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
2305 printk(KERN_WARNING "EXT4-fs: groups count too large: %u "
2306 "(block count %llu, first data block %u, "
2307 "blocks per group %lu)\n", sbi->s_groups_count,
2308 ext4_blocks_count(es),
2309 le32_to_cpu(es->s_first_data_block),
2310 EXT4_BLOCKS_PER_GROUP(sb));
2311 goto failed_mount;
2312 }
2162 sbi->s_groups_count = blocks_count; 2313 sbi->s_groups_count = blocks_count;
2163 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / 2314 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
2164 EXT4_DESC_PER_BLOCK(sb); 2315 EXT4_DESC_PER_BLOCK(sb);
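A quick worked example of the two round-up divisions, assuming 4 KiB blocks (32768 blocks per group) and 32-byte descriptors (128 per block); the filesystem size is illustrative:

	/* hypothetical 1 TiB filesystem, first_data_block == 0 */
	uint64_t blocks   = 268435456ULL;       /* 1 TiB / 4 KiB                 */
	unsigned bpg      = 32768;              /* 8 bitmap bits * 4096 bytes    */
	unsigned per_blk  = 128;                /* 4096 / 32                     */

	uint64_t groups   = (blocks + bpg - 1) / bpg;           /* 8192 groups   */
	unsigned db_count = (groups + per_blk - 1) / per_blk;   /* 64 GDT blocks */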
@@ -2270,27 +2421,26 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2270 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; 2421 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
2271 es->s_state |= cpu_to_le16(EXT4_ERROR_FS); 2422 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
2272 ext4_commit_super(sb, es, 1); 2423 ext4_commit_super(sb, es, 1);
2273 printk(KERN_CRIT
2274 "EXT4-fs (device %s): mount failed\n",
2275 sb->s_id);
2276 goto failed_mount4; 2424 goto failed_mount4;
2277 } 2425 }
2278 } 2426 }
2279 } else if (journal_inum) { 2427 } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
2280 if (ext4_create_journal(sb, es, journal_inum)) 2428 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
2281 goto failed_mount3; 2429 printk(KERN_ERR "EXT4-fs: required journal recovery "
2430 "suppressed and not mounted read-only\n");
2431 goto failed_mount4;
2282 } else { 2432 } else {
2283 if (!silent) 2433 clear_opt(sbi->s_mount_opt, DATA_FLAGS);
2284 printk(KERN_ERR 2434 set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
2285 "ext4: No journal on filesystem on %s\n", 2435 sbi->s_journal = NULL;
2286 sb->s_id); 2436 needs_recovery = 0;
2287 goto failed_mount3; 2437 goto no_journal;
2288 } 2438 }
2289 2439
2290 if (ext4_blocks_count(es) > 0xffffffffULL && 2440 if (ext4_blocks_count(es) > 0xffffffffULL &&
2291 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, 2441 !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
2292 JBD2_FEATURE_INCOMPAT_64BIT)) { 2442 JBD2_FEATURE_INCOMPAT_64BIT)) {
2293 printk(KERN_ERR "ext4: Failed to set 64-bit journal feature\n"); 2443 printk(KERN_ERR "EXT4-fs: Failed to set 64-bit journal feature\n");
2294 goto failed_mount4; 2444 goto failed_mount4;
2295 } 2445 }
2296 2446
@@ -2335,6 +2485,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2335 default: 2485 default:
2336 break; 2486 break;
2337 } 2487 }
2488 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
2489
2490no_journal:
2338 2491
2339 if (test_opt(sb, NOBH)) { 2492 if (test_opt(sb, NOBH)) {
2340 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { 2493 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
@@ -2420,13 +2573,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2420 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; 2573 EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
2421 ext4_orphan_cleanup(sb, es); 2574 ext4_orphan_cleanup(sb, es);
2422 EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; 2575 EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
2423 if (needs_recovery) 2576 if (needs_recovery) {
2424 printk(KERN_INFO "EXT4-fs: recovery complete.\n"); 2577 printk(KERN_INFO "EXT4-fs: recovery complete.\n");
2425 ext4_mark_recovery_complete(sb, es); 2578 ext4_mark_recovery_complete(sb, es);
2426 printk(KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n", 2579 }
2427 test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal": 2580 if (EXT4_SB(sb)->s_journal) {
2428 test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": 2581 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
2429 "writeback"); 2582 descr = " journalled data mode";
2583 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
2584 descr = " ordered data mode";
2585 else
2586 descr = " writeback data mode";
2587 } else
2588 descr = "out journal";
2589
2590 printk(KERN_INFO "EXT4-fs: mounted filesystem %s with%s\n",
2591 sb->s_id, descr);
2430 2592
2431 lock_kernel(); 2593 lock_kernel();
2432 return 0; 2594 return 0;
@@ -2438,8 +2600,11 @@ cantfind_ext4:
2438 goto failed_mount; 2600 goto failed_mount;
2439 2601
2440failed_mount4: 2602failed_mount4:
2441 jbd2_journal_destroy(sbi->s_journal); 2603 printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id);
2442 sbi->s_journal = NULL; 2604 if (sbi->s_journal) {
2605 jbd2_journal_destroy(sbi->s_journal);
2606 sbi->s_journal = NULL;
2607 }
2443failed_mount3: 2608failed_mount3:
2444 percpu_counter_destroy(&sbi->s_freeblocks_counter); 2609 percpu_counter_destroy(&sbi->s_freeblocks_counter);
2445 percpu_counter_destroy(&sbi->s_freeinodes_counter); 2610 percpu_counter_destroy(&sbi->s_freeinodes_counter);
@@ -2476,11 +2641,9 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
2476{ 2641{
2477 struct ext4_sb_info *sbi = EXT4_SB(sb); 2642 struct ext4_sb_info *sbi = EXT4_SB(sb);
2478 2643
2479 if (sbi->s_commit_interval) 2644 journal->j_commit_interval = sbi->s_commit_interval;
2480 journal->j_commit_interval = sbi->s_commit_interval; 2645 journal->j_min_batch_time = sbi->s_min_batch_time;
2481 /* We could also set up an ext4-specific default for the commit 2646 journal->j_max_batch_time = sbi->s_max_batch_time;
2482 * interval here, but for now we'll just fall back to the jbd
2483 * default. */
2484 2647
2485 spin_lock(&journal->j_state_lock); 2648 spin_lock(&journal->j_state_lock);
2486 if (test_opt(sb, BARRIER)) 2649 if (test_opt(sb, BARRIER))
@@ -2500,6 +2663,8 @@ static journal_t *ext4_get_journal(struct super_block *sb,
2500 struct inode *journal_inode; 2663 struct inode *journal_inode;
2501 journal_t *journal; 2664 journal_t *journal;
2502 2665
2666 BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
2667
2503 /* First, test for the existence of a valid inode on disk. Bad 2668 /* First, test for the existence of a valid inode on disk. Bad
2504 * things happen if we iget() an unused inode, as the subsequent 2669 * things happen if we iget() an unused inode, as the subsequent
2505 * iput() will try to delete it. */ 2670 * iput() will try to delete it. */
@@ -2548,13 +2713,15 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
2548 struct ext4_super_block *es; 2713 struct ext4_super_block *es;
2549 struct block_device *bdev; 2714 struct block_device *bdev;
2550 2715
2716 BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
2717
2551 bdev = ext4_blkdev_get(j_dev); 2718 bdev = ext4_blkdev_get(j_dev);
2552 if (bdev == NULL) 2719 if (bdev == NULL)
2553 return NULL; 2720 return NULL;
2554 2721
2555 if (bd_claim(bdev, sb)) { 2722 if (bd_claim(bdev, sb)) {
2556 printk(KERN_ERR 2723 printk(KERN_ERR
2557 "EXT4: failed to claim external journal device.\n"); 2724 "EXT4-fs: failed to claim external journal device.\n");
2558 blkdev_put(bdev, FMODE_READ|FMODE_WRITE); 2725 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
2559 return NULL; 2726 return NULL;
2560 } 2727 }
@@ -2635,6 +2802,8 @@ static int ext4_load_journal(struct super_block *sb,
2635 int err = 0; 2802 int err = 0;
2636 int really_read_only; 2803 int really_read_only;
2637 2804
2805 BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
2806
2638 if (journal_devnum && 2807 if (journal_devnum &&
2639 journal_devnum != le32_to_cpu(es->s_journal_dev)) { 2808 journal_devnum != le32_to_cpu(es->s_journal_dev)) {
2640 printk(KERN_INFO "EXT4-fs: external journal device major/minor " 2809 printk(KERN_INFO "EXT4-fs: external journal device major/minor "
@@ -2719,55 +2888,14 @@ static int ext4_load_journal(struct super_block *sb,
2719 return 0; 2888 return 0;
2720} 2889}
2721 2890
2722static int ext4_create_journal(struct super_block *sb, 2891static int ext4_commit_super(struct super_block *sb,
2723 struct ext4_super_block *es,
2724 unsigned int journal_inum)
2725{
2726 journal_t *journal;
2727 int err;
2728
2729 if (sb->s_flags & MS_RDONLY) {
2730 printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to "
2731 "create journal.\n");
2732 return -EROFS;
2733 }
2734
2735 journal = ext4_get_journal(sb, journal_inum);
2736 if (!journal)
2737 return -EINVAL;
2738
2739 printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n",
2740 journal_inum);
2741
2742 err = jbd2_journal_create(journal);
2743 if (err) {
2744 printk(KERN_ERR "EXT4-fs: error creating journal.\n");
2745 jbd2_journal_destroy(journal);
2746 return -EIO;
2747 }
2748
2749 EXT4_SB(sb)->s_journal = journal;
2750
2751 ext4_update_dynamic_rev(sb);
2752 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2753 EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL);
2754
2755 es->s_journal_inum = cpu_to_le32(journal_inum);
2756 sb->s_dirt = 1;
2757
2758 /* Make sure we flush the recovery flag to disk. */
2759 ext4_commit_super(sb, es, 1);
2760
2761 return 0;
2762}
2763
2764static void ext4_commit_super(struct super_block *sb,
2765 struct ext4_super_block *es, int sync) 2892 struct ext4_super_block *es, int sync)
2766{ 2893{
2767 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; 2894 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
2895 int error = 0;
2768 2896
2769 if (!sbh) 2897 if (!sbh)
2770 return; 2898 return error;
2771 if (buffer_write_io_error(sbh)) { 2899 if (buffer_write_io_error(sbh)) {
2772 /* 2900 /*
2773 * Oh, dear. A previous attempt to write the 2901 * Oh, dear. A previous attempt to write the
@@ -2777,25 +2905,33 @@ static void ext4_commit_super(struct super_block *sb,
2777 * be remapped. Nothing we can do but to retry the 2905 * be remapped. Nothing we can do but to retry the
2778 * write and hope for the best. 2906 * write and hope for the best.
2779 */ 2907 */
2780 printk(KERN_ERR "ext4: previous I/O error to " 2908 printk(KERN_ERR "EXT4-fs: previous I/O error to "
2781 "superblock detected for %s.\n", sb->s_id); 2909 "superblock detected for %s.\n", sb->s_id);
2782 clear_buffer_write_io_error(sbh); 2910 clear_buffer_write_io_error(sbh);
2783 set_buffer_uptodate(sbh); 2911 set_buffer_uptodate(sbh);
2784 } 2912 }
2785 es->s_wtime = cpu_to_le32(get_seconds()); 2913 es->s_wtime = cpu_to_le32(get_seconds());
2786 ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb)); 2914 ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
2787 es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb)); 2915 &EXT4_SB(sb)->s_freeblocks_counter));
2916 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
2917 &EXT4_SB(sb)->s_freeinodes_counter));
2918
2788 BUFFER_TRACE(sbh, "marking dirty"); 2919 BUFFER_TRACE(sbh, "marking dirty");
2789 mark_buffer_dirty(sbh); 2920 mark_buffer_dirty(sbh);
2790 if (sync) { 2921 if (sync) {
2791 sync_dirty_buffer(sbh); 2922 error = sync_dirty_buffer(sbh);
2792 if (buffer_write_io_error(sbh)) { 2923 if (error)
2793 printk(KERN_ERR "ext4: I/O error while writing " 2924 return error;
2925
2926 error = buffer_write_io_error(sbh);
2927 if (error) {
2928 printk(KERN_ERR "EXT4-fs: I/O error while writing "
2794 "superblock for %s.\n", sb->s_id); 2929 "superblock for %s.\n", sb->s_id);
2795 clear_buffer_write_io_error(sbh); 2930 clear_buffer_write_io_error(sbh);
2796 set_buffer_uptodate(sbh); 2931 set_buffer_uptodate(sbh);
2797 } 2932 }
2798 } 2933 }
2934 return error;
2799} 2935}
2800 2936
2801 2937
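
The hunk above is the pivot for the rest of this series: ext4_commit_super() now returns int, so a failed superblock write (from sync_dirty_buffer() or a latched write-I/O error on the buffer) can propagate to the freeze, sync and remount paths instead of being silently dropped. It also snapshots the free-block and free-inode counts from the percpu counters rather than rescanning group descriptors. A minimal sketch of the resulting error flow, with the surrounding context condensed (the -EIO mapping is an illustration; the patch itself returns the raw buffer_write_io_error() value):

	/* Sketch: let superblock write failures reach the caller. */
	static int commit_super_sketch(struct super_block *sb, int sync)
	{
		struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
		int error = 0;

		if (!sbh)
			return 0;	/* superblock buffer not mapped yet */

		mark_buffer_dirty(sbh);
		if (sync) {
			error = sync_dirty_buffer(sbh);	/* submit and wait */
			if (!error && buffer_write_io_error(sbh))
				error = -EIO;	/* completion-time failure */
		}
		return error;
	}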
@@ -2809,6 +2945,10 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
2809{ 2945{
2810 journal_t *journal = EXT4_SB(sb)->s_journal; 2946 journal_t *journal = EXT4_SB(sb)->s_journal;
2811 2947
2948 if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
2949 BUG_ON(journal != NULL);
2950 return;
2951 }
2812 jbd2_journal_lock_updates(journal); 2952 jbd2_journal_lock_updates(journal);
2813 if (jbd2_journal_flush(journal) < 0) 2953 if (jbd2_journal_flush(journal) < 0)
2814 goto out; 2954 goto out;
@@ -2838,6 +2978,8 @@ static void ext4_clear_journal_err(struct super_block *sb,
2838 int j_errno; 2978 int j_errno;
2839 const char *errstr; 2979 const char *errstr;
2840 2980
2981 BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
2982
2841 journal = EXT4_SB(sb)->s_journal; 2983 journal = EXT4_SB(sb)->s_journal;
2842 2984
2843 /* 2985 /*
@@ -2870,14 +3012,17 @@ static void ext4_clear_journal_err(struct super_block *sb,
2870int ext4_force_commit(struct super_block *sb) 3012int ext4_force_commit(struct super_block *sb)
2871{ 3013{
2872 journal_t *journal; 3014 journal_t *journal;
2873 int ret; 3015 int ret = 0;
2874 3016
2875 if (sb->s_flags & MS_RDONLY) 3017 if (sb->s_flags & MS_RDONLY)
2876 return 0; 3018 return 0;
2877 3019
2878 journal = EXT4_SB(sb)->s_journal; 3020 journal = EXT4_SB(sb)->s_journal;
2879 sb->s_dirt = 0; 3021 if (journal) {
2880 ret = ext4_journal_force_commit(journal); 3022 sb->s_dirt = 0;
3023 ret = ext4_journal_force_commit(journal);
3024 }
3025
2881 return ret; 3026 return ret;
2882} 3027}
2883 3028
@@ -2889,9 +3034,13 @@ int ext4_force_commit(struct super_block *sb)
2889 */ 3034 */
2890static void ext4_write_super(struct super_block *sb) 3035static void ext4_write_super(struct super_block *sb)
2891{ 3036{
2892 if (mutex_trylock(&sb->s_lock) != 0) 3037 if (EXT4_SB(sb)->s_journal) {
2893 BUG(); 3038 if (mutex_trylock(&sb->s_lock) != 0)
2894 sb->s_dirt = 0; 3039 BUG();
3040 sb->s_dirt = 0;
3041 } else {
3042 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
3043 }
2895} 3044}
2896 3045
2897static int ext4_sync_fs(struct super_block *sb, int wait) 3046static int ext4_sync_fs(struct super_block *sb, int wait)
@@ -2900,10 +3049,14 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
2900 3049
2901 trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); 3050 trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
2902 sb->s_dirt = 0; 3051 sb->s_dirt = 0;
2903 if (wait) 3052 if (EXT4_SB(sb)->s_journal) {
2904 ret = ext4_force_commit(sb); 3053 if (wait)
2905 else 3054 ret = ext4_force_commit(sb);
2906 jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL); 3055 else
3056 jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
3057 } else {
3058 ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
3059 }
2907 return ret; 3060 return ret;
2908} 3061}
2909 3062
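
Both ext4_write_super() and ext4_sync_fs() now branch on EXT4_SB(sb)->s_journal, because after this series ext4 can be mounted without a journal and every jbd2 call would otherwise dereference NULL. A condensed sketch of the dispatch, restating the new right-hand side of the hunk:

	/* Sketch: one sync entry point for journalled and journal-less mounts. */
	static int sync_fs_sketch(struct super_block *sb, int wait)
	{
		int ret = 0;

		sb->s_dirt = 0;
		if (EXT4_SB(sb)->s_journal) {
			if (wait)	/* durable: wait for a full commit */
				ret = ext4_force_commit(sb);
			else		/* async: just kick the commit thread */
				jbd2_journal_start_commit(EXT4_SB(sb)->s_journal,
							  NULL);
		} else {
			/* no journal: write the superblock out directly */
			ret = ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
		}
		return ret;
	}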
@@ -2911,36 +3064,48 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
2911 * LVM calls this function before a (read-only) snapshot is created. This 3064 * LVM calls this function before a (read-only) snapshot is created. This
2912 * gives us a chance to flush the journal completely and mark the fs clean. 3065 * gives us a chance to flush the journal completely and mark the fs clean.
2913 */ 3066 */
2914static void ext4_write_super_lockfs(struct super_block *sb) 3067static int ext4_freeze(struct super_block *sb)
2915{ 3068{
3069 int error = 0;
3070 journal_t *journal;
2916 sb->s_dirt = 0; 3071 sb->s_dirt = 0;
2917 3072
2918 if (!(sb->s_flags & MS_RDONLY)) { 3073 if (!(sb->s_flags & MS_RDONLY)) {
2919 journal_t *journal = EXT4_SB(sb)->s_journal; 3074 journal = EXT4_SB(sb)->s_journal;
2920 3075
2921 /* Now we set up the journal barrier. */ 3076 if (journal) {
2922 jbd2_journal_lock_updates(journal); 3077 /* Now we set up the journal barrier. */
3078 jbd2_journal_lock_updates(journal);
2923 3079
2924 /* 3080 /*

2925 * We don't want to clear the needs_recovery flag when we 3081 * We don't want to clear the needs_recovery flag when we
2926 * failed to flush the journal. 3082 * failed to flush the journal.
2927 */ 3083 */
2928 if (jbd2_journal_flush(journal) < 0) 3084 error = jbd2_journal_flush(journal);
2929 return; 3085 if (error < 0)
3086 goto out;
3087 }
2930 3088
2931 /* Journal blocked and flushed, clear needs_recovery flag. */ 3089 /* Journal blocked and flushed, clear needs_recovery flag. */
2932 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3090 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
2933 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); 3091 ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
3092 error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
3093 if (error)
3094 goto out;
2934 } 3095 }
3096 return 0;
3097out:
3098 jbd2_journal_unlock_updates(journal);
3099 return error;
2935} 3100}
2936 3101
2937/* 3102/*
2938 * Called by LVM after the snapshot is done. We need to reset the RECOVER 3103 * Called by LVM after the snapshot is done. We need to reset the RECOVER
2939 * flag here, even though the filesystem is not technically dirty yet. 3104 * flag here, even though the filesystem is not technically dirty yet.
2940 */ 3105 */
2941static void ext4_unlockfs(struct super_block *sb) 3106static int ext4_unfreeze(struct super_block *sb)
2942{ 3107{
2943 if (!(sb->s_flags & MS_RDONLY)) { 3108 if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) {
2944 lock_super(sb); 3109 lock_super(sb);
2945 /* Reset the needs_recovery flag before the fs is unlocked. */ 3110 /* Reset the needs_recovery flag before the fs is unlocked. */
2946 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3111 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
@@ -2948,6 +3113,7 @@ static void ext4_unlockfs(struct super_block *sb)
2948 unlock_super(sb); 3113 unlock_super(sb);
2949 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 3114 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
2950 } 3115 }
3116 return 0;
2951} 3117}
2952 3118
2953static int ext4_remount(struct super_block *sb, int *flags, char *data) 3119static int ext4_remount(struct super_block *sb, int *flags, char *data)
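
The freeze conversion is more than a rename: ext4_write_super_lockfs() returned void, so a failed journal flush let LVM snapshot an inconsistent image with no error reported. ext4_freeze() keeps the jbd2 barrier held on success (ext4_unfreeze() drops it) and unwinds it on failure. A sketch of the ordering for a journalled mount; the journal-less case simply skips the jbd2 calls:

	/* Sketch: freeze = barrier, flush, mark clean on disk. */
	static int freeze_sketch(struct super_block *sb)
	{
		journal_t *journal = EXT4_SB(sb)->s_journal;
		int error;

		jbd2_journal_lock_updates(journal);	/* no new handles */
		error = jbd2_journal_flush(journal);	/* checkpoint all */
		if (error < 0)
			goto out;
		/* clean image: no replay needed if snapshotted now */
		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
		error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
	out:
		if (error)	/* failure: do not leave the fs frozen */
			jbd2_journal_unlock_updates(journal);
		return error;
	}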
@@ -2958,6 +3124,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
2958 unsigned long old_sb_flags; 3124 unsigned long old_sb_flags;
2959 struct ext4_mount_options old_opts; 3125 struct ext4_mount_options old_opts;
2960 ext4_group_t g; 3126 ext4_group_t g;
3127 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
2961 int err; 3128 int err;
2962#ifdef CONFIG_QUOTA 3129#ifdef CONFIG_QUOTA
2963 int i; 3130 int i;
@@ -2969,16 +3136,21 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
2969 old_opts.s_resuid = sbi->s_resuid; 3136 old_opts.s_resuid = sbi->s_resuid;
2970 old_opts.s_resgid = sbi->s_resgid; 3137 old_opts.s_resgid = sbi->s_resgid;
2971 old_opts.s_commit_interval = sbi->s_commit_interval; 3138 old_opts.s_commit_interval = sbi->s_commit_interval;
3139 old_opts.s_min_batch_time = sbi->s_min_batch_time;
3140 old_opts.s_max_batch_time = sbi->s_max_batch_time;
2972#ifdef CONFIG_QUOTA 3141#ifdef CONFIG_QUOTA
2973 old_opts.s_jquota_fmt = sbi->s_jquota_fmt; 3142 old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
2974 for (i = 0; i < MAXQUOTAS; i++) 3143 for (i = 0; i < MAXQUOTAS; i++)
2975 old_opts.s_qf_names[i] = sbi->s_qf_names[i]; 3144 old_opts.s_qf_names[i] = sbi->s_qf_names[i];
2976#endif 3145#endif
3146 if (sbi->s_journal && sbi->s_journal->j_task->io_context)
3147 journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
2977 3148
2978 /* 3149 /*
2979 * Allow the "check" option to be passed as a remount option. 3150 * Allow the "check" option to be passed as a remount option.
2980 */ 3151 */
2981 if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) { 3152 if (!parse_options(data, sb, NULL, &journal_ioprio,
3153 &n_blocks_count, 1)) {
2982 err = -EINVAL; 3154 err = -EINVAL;
2983 goto restore_opts; 3155 goto restore_opts;
2984 } 3156 }
@@ -2991,7 +3163,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
2991 3163
2992 es = sbi->s_es; 3164 es = sbi->s_es;
2993 3165
2994 ext4_init_journal_params(sb, sbi->s_journal); 3166 if (sbi->s_journal) {
3167 ext4_init_journal_params(sb, sbi->s_journal);
3168 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
3169 }
2995 3170
2996 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || 3171 if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
2997 n_blocks_count > ext4_blocks_count(es)) { 3172 n_blocks_count > ext4_blocks_count(es)) {
@@ -3020,17 +3195,20 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3020 * We have to unlock super so that we can wait for 3195 * We have to unlock super so that we can wait for
3021 * transactions. 3196 * transactions.
3022 */ 3197 */
3023 unlock_super(sb); 3198 if (sbi->s_journal) {
3024 ext4_mark_recovery_complete(sb, es); 3199 unlock_super(sb);
3025 lock_super(sb); 3200 ext4_mark_recovery_complete(sb, es);
3201 lock_super(sb);
3202 }
3026 } else { 3203 } else {
3027 __le32 ret; 3204 int ret;
3028 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, 3205 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
3029 ~EXT4_FEATURE_RO_COMPAT_SUPP))) { 3206 ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
3030 printk(KERN_WARNING "EXT4-fs: %s: couldn't " 3207 printk(KERN_WARNING "EXT4-fs: %s: couldn't "
3031 "remount RDWR because of unsupported " 3208 "remount RDWR because of unsupported "
3032 "optional features (%x).\n", 3209 "optional features (%x).\n", sb->s_id,
3033 sb->s_id, le32_to_cpu(ret)); 3210 (le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
3211 ~EXT4_FEATURE_RO_COMPAT_SUPP));
3034 err = -EROFS; 3212 err = -EROFS;
3035 goto restore_opts; 3213 goto restore_opts;
3036 } 3214 }
@@ -3047,7 +3225,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3047 if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { 3225 if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
3048 printk(KERN_ERR 3226 printk(KERN_ERR
3049 "EXT4-fs: ext4_remount: " 3227 "EXT4-fs: ext4_remount: "
3050 "Checksum for group %lu failed (%u!=%u)\n", 3228 "Checksum for group %u failed (%u!=%u)\n",
3051 g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), 3229 g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
3052 le16_to_cpu(gdp->bg_checksum)); 3230 le16_to_cpu(gdp->bg_checksum));
3053 err = -EINVAL; 3231 err = -EINVAL;
@@ -3076,7 +3254,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3076 * been changed by e2fsck since we originally mounted 3254 * been changed by e2fsck since we originally mounted
3077 * the partition.) 3255 * the partition.)
3078 */ 3256 */
3079 ext4_clear_journal_err(sb, es); 3257 if (sbi->s_journal)
3258 ext4_clear_journal_err(sb, es);
3080 sbi->s_mount_state = le16_to_cpu(es->s_state); 3259 sbi->s_mount_state = le16_to_cpu(es->s_state);
3081 if ((err = ext4_group_extend(sb, es, n_blocks_count))) 3260 if ((err = ext4_group_extend(sb, es, n_blocks_count)))
3082 goto restore_opts; 3261 goto restore_opts;
@@ -3084,6 +3263,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3084 sb->s_flags &= ~MS_RDONLY; 3263 sb->s_flags &= ~MS_RDONLY;
3085 } 3264 }
3086 } 3265 }
3266 if (sbi->s_journal == NULL)
3267 ext4_commit_super(sb, es, 1);
3268
3087#ifdef CONFIG_QUOTA 3269#ifdef CONFIG_QUOTA
3088 /* Release old quota file names */ 3270 /* Release old quota file names */
3089 for (i = 0; i < MAXQUOTAS; i++) 3271 for (i = 0; i < MAXQUOTAS; i++)
@@ -3098,6 +3280,8 @@ restore_opts:
3098 sbi->s_resuid = old_opts.s_resuid; 3280 sbi->s_resuid = old_opts.s_resuid;
3099 sbi->s_resgid = old_opts.s_resgid; 3281 sbi->s_resgid = old_opts.s_resgid;
3100 sbi->s_commit_interval = old_opts.s_commit_interval; 3282 sbi->s_commit_interval = old_opts.s_commit_interval;
3283 sbi->s_min_batch_time = old_opts.s_min_batch_time;
3284 sbi->s_max_batch_time = old_opts.s_max_batch_time;
3101#ifdef CONFIG_QUOTA 3285#ifdef CONFIG_QUOTA
3102 sbi->s_jquota_fmt = old_opts.s_jquota_fmt; 3286 sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
3103 for (i = 0; i < MAXQUOTAS; i++) { 3287 for (i = 0; i < MAXQUOTAS; i++) {
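
The restore_opts label is why the two new batch-time fields appear in both hunks: every tunable that parse_options() may touch has to be captured before parsing and put back if any later step fails, or an aborted remount would leave the mount options half-updated. A self-contained sketch of the idiom (the struct and field names here are illustrative, not the ext4 ones):

	/* Sketch: snapshot tunables, try the update, roll back on error. */
	struct tunables {
		unsigned long	commit_interval;
		unsigned int	min_batch_time;
		unsigned int	max_batch_time;
	};

	static int remount_tunables(struct tunables *cur, const char *data,
				    int (*parse)(struct tunables *, const char *))
	{
		struct tunables old = *cur;	/* capture every mutable field */
		int err = parse(cur, data);	/* may partially modify *cur */

		if (err)
			*cur = old;		/* failure: restore the lot */
		return err;
	}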
@@ -3360,7 +3544,8 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
3360 * When we journal data on quota file, we have to flush journal to see 3544 * When we journal data on quota file, we have to flush journal to see
3361 * all updates to the file when we bypass pagecache... 3545 * all updates to the file when we bypass pagecache...
3362 */ 3546 */
3363 if (ext4_should_journal_data(path.dentry->d_inode)) { 3547 if (EXT4_SB(sb)->s_journal &&
3548 ext4_should_journal_data(path.dentry->d_inode)) {
3364 /* 3549 /*
3365 * We don't need to lock updates but journal_flush() could 3550 * We don't need to lock updates but journal_flush() could
3366 * otherwise be livelocked... 3551 * otherwise be livelocked...
@@ -3434,7 +3619,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3434 struct buffer_head *bh; 3619 struct buffer_head *bh;
3435 handle_t *handle = journal_current_handle(); 3620 handle_t *handle = journal_current_handle();
3436 3621
3437 if (!handle) { 3622 if (EXT4_SB(sb)->s_journal && !handle) {
3438 printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)" 3623 printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)"
3439 " cancelled because transaction is not started.\n", 3624 " cancelled because transaction is not started.\n",
3440 (unsigned long long)off, (unsigned long long)len); 3625 (unsigned long long)off, (unsigned long long)len);
@@ -3459,7 +3644,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
3459 flush_dcache_page(bh->b_page); 3644 flush_dcache_page(bh->b_page);
3460 unlock_buffer(bh); 3645 unlock_buffer(bh);
3461 if (journal_quota) 3646 if (journal_quota)
3462 err = ext4_journal_dirty_metadata(handle, bh); 3647 err = ext4_handle_dirty_metadata(handle, NULL, bh);
3463 else { 3648 else {
3464 /* Always do at least ordered writes for quotas */ 3649 /* Always do at least ordered writes for quotas */
3465 err = ext4_jbd2_file_inode(handle, inode); 3650 err = ext4_jbd2_file_inode(handle, inode);
@@ -3513,18 +3698,15 @@ static int ext4_ui_proc_open(struct inode *inode, struct file *file)
3513static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf, 3698static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
3514 size_t cnt, loff_t *ppos) 3699 size_t cnt, loff_t *ppos)
3515{ 3700{
3516 unsigned int *p = PDE(file->f_path.dentry->d_inode)->data; 3701 unsigned long *p = PDE(file->f_path.dentry->d_inode)->data;
3517 char str[32]; 3702 char str[32];
3518 unsigned long value;
3519 3703
3520 if (cnt >= sizeof(str)) 3704 if (cnt >= sizeof(str))
3521 return -EINVAL; 3705 return -EINVAL;
3522 if (copy_from_user(str, buf, cnt)) 3706 if (copy_from_user(str, buf, cnt))
3523 return -EFAULT; 3707 return -EFAULT;
3524 value = simple_strtol(str, NULL, 0); 3708
3525 if (value < 0) 3709 *p = simple_strtoul(str, NULL, 0);
3526 return -ERANGE;
3527 *p = value;
3528 return cnt; 3710 return cnt;
3529} 3711}
3530 3712
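
The proc write fix is worth spelling out: the old code parsed into an unsigned long and then tested value < 0, which is always false for an unsigned type, so the -ERANGE branch was dead code and negative input silently wrapped. Switching to simple_strtoul() and dropping the check makes the (unchanged) behaviour explicit. A standalone userspace illustration of the dead test, using libc strtoul in place of the kernel helper:

	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		unsigned long value = strtoul("-1", NULL, 0);

		if (value < 0)			/* always false: unsigned */
			return 1;		/* never reached */
		printf("%lu\n", value);		/* prints ULONG_MAX */
		return 0;
	}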
@@ -3615,7 +3797,7 @@ static void __exit exit_ext4_fs(void)
3615} 3797}
3616 3798
3617MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 3799MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
3618MODULE_DESCRIPTION("Fourth Extended Filesystem with extents"); 3800MODULE_DESCRIPTION("Fourth Extended Filesystem");
3619MODULE_LICENSE("GPL"); 3801MODULE_LICENSE("GPL");
3620module_init(init_ext4_fs) 3802module_init(init_ext4_fs)
3621module_exit(exit_ext4_fs) 3803module_exit(exit_ext4_fs)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 80626d516fee..157ce6589c54 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -457,7 +457,7 @@ static void ext4_xattr_update_super_block(handle_t *handle,
457 if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { 457 if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
458 EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); 458 EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR);
459 sb->s_dirt = 1; 459 sb->s_dirt = 1;
460 ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); 460 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
461 } 461 }
462} 462}
463 463
@@ -487,9 +487,9 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
487 ext4_forget(handle, 1, inode, bh, bh->b_blocknr); 487 ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
488 } else { 488 } else {
489 le32_add_cpu(&BHDR(bh)->h_refcount, -1); 489 le32_add_cpu(&BHDR(bh)->h_refcount, -1);
490 error = ext4_journal_dirty_metadata(handle, bh); 490 error = ext4_handle_dirty_metadata(handle, inode, bh);
491 if (IS_SYNC(inode)) 491 if (IS_SYNC(inode))
492 handle->h_sync = 1; 492 ext4_handle_sync(handle);
493 DQUOT_FREE_BLOCK(inode, 1); 493 DQUOT_FREE_BLOCK(inode, 1);
494 ea_bdebug(bh, "refcount now=%d; releasing", 494 ea_bdebug(bh, "refcount now=%d; releasing",
495 le32_to_cpu(BHDR(bh)->h_refcount)); 495 le32_to_cpu(BHDR(bh)->h_refcount));
@@ -724,8 +724,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
724 if (error == -EIO) 724 if (error == -EIO)
725 goto bad_block; 725 goto bad_block;
726 if (!error) 726 if (!error)
727 error = ext4_journal_dirty_metadata(handle, 727 error = ext4_handle_dirty_metadata(handle,
728 bs->bh); 728 inode,
729 bs->bh);
729 if (error) 730 if (error)
730 goto cleanup; 731 goto cleanup;
731 goto inserted; 732 goto inserted;
@@ -794,8 +795,9 @@ inserted:
794 ea_bdebug(new_bh, "reusing; refcount now=%d", 795 ea_bdebug(new_bh, "reusing; refcount now=%d",
795 le32_to_cpu(BHDR(new_bh)->h_refcount)); 796 le32_to_cpu(BHDR(new_bh)->h_refcount));
796 unlock_buffer(new_bh); 797 unlock_buffer(new_bh);
797 error = ext4_journal_dirty_metadata(handle, 798 error = ext4_handle_dirty_metadata(handle,
798 new_bh); 799 inode,
800 new_bh);
799 if (error) 801 if (error)
800 goto cleanup_dquot; 802 goto cleanup_dquot;
801 } 803 }
@@ -810,8 +812,8 @@ inserted:
810 /* We need to allocate a new block */ 812 /* We need to allocate a new block */
811 ext4_fsblk_t goal = ext4_group_first_block_no(sb, 813 ext4_fsblk_t goal = ext4_group_first_block_no(sb,
812 EXT4_I(inode)->i_block_group); 814 EXT4_I(inode)->i_block_group);
813 ext4_fsblk_t block = ext4_new_meta_block(handle, inode, 815 ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode,
814 goal, &error); 816 goal, NULL, &error);
815 if (error) 817 if (error)
816 goto cleanup; 818 goto cleanup;
817 ea_idebug(inode, "creating block %d", block); 819 ea_idebug(inode, "creating block %d", block);
@@ -833,7 +835,8 @@ getblk_failed:
833 set_buffer_uptodate(new_bh); 835 set_buffer_uptodate(new_bh);
834 unlock_buffer(new_bh); 836 unlock_buffer(new_bh);
835 ext4_xattr_cache_insert(new_bh); 837 ext4_xattr_cache_insert(new_bh);
836 error = ext4_journal_dirty_metadata(handle, new_bh); 838 error = ext4_handle_dirty_metadata(handle,
839 inode, new_bh);
837 if (error) 840 if (error)
838 goto cleanup; 841 goto cleanup;
839 } 842 }
@@ -1040,7 +1043,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
1040 */ 1043 */
1041 is.iloc.bh = NULL; 1044 is.iloc.bh = NULL;
1042 if (IS_SYNC(inode)) 1045 if (IS_SYNC(inode))
1043 handle->h_sync = 1; 1046 ext4_handle_sync(handle);
1044 } 1047 }
1045 1048
1046cleanup: 1049cleanup:
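
Every ext4_journal_dirty_metadata(handle, bh) in this file becomes ext4_handle_dirty_metadata(handle, inode, bh), and direct handle->h_sync pokes become ext4_handle_sync(handle). The extra inode argument exists because the handle layer must now also work without a journal. The helper itself lives in ext4_jbd2.[ch], which this diff does not show, so the following shape is an assumption inferred from the call sites:

	/* Assumed shape of the wrapper the call sites migrate to. */
	static int handle_dirty_metadata_sketch(handle_t *handle,
						struct inode *inode,
						struct buffer_head *bh)
	{
		if (ext4_handle_valid(handle))	/* journalled mount */
			return jbd2_journal_dirty_metadata(handle, bh);

		/* no journal: plain dirtying, synchronous if the inode is */
		mark_buffer_dirty(bh);
		if (inode && IS_SYNC(inode))
			sync_dirty_buffer(bh);
		return 0;
	}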
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 67e058357098..3a7f603b6982 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -841,7 +841,6 @@ const struct file_operations fat_dir_operations = {
841 .compat_ioctl = fat_compat_dir_ioctl, 841 .compat_ioctl = fat_compat_dir_ioctl,
842#endif 842#endif
843 .fsync = file_fsync, 843 .fsync = file_fsync,
844 .llseek = generic_file_llseek,
845}; 844};
846 845
847static int fat_get_short_entry(struct inode *dir, loff_t *pos, 846static int fat_get_short_entry(struct inode *dir, loff_t *pos,
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index d937aaf77374..6b74d09adbe5 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -749,6 +749,8 @@ static struct dentry *fat_get_parent(struct dentry *child)
749 brelse(bh); 749 brelse(bh);
750 750
751 parent = d_obtain_alias(inode); 751 parent = d_obtain_alias(inode);
752 if (!IS_ERR(parent))
753 parent->d_op = sb->s_root->d_op;
752out: 754out:
753 unlock_super(sb); 755 unlock_super(sb);
754 756
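
d_obtain_alias() hands back an anonymous dentry for NFS reconnection, and those dentries start with NULL d_op. FAT compares and hashes names case-insensitively through the operations installed on sb->s_root, so the reconnected parent must inherit them or lookups under it would fall back to exact-match comparison. Condensed from the hunk above:

	/* Sketch: reconnected parents inherit FAT's dentry operations. */
	static struct dentry *get_parent_sketch(struct super_block *sb,
						struct inode *inode)
	{
		struct dentry *parent = d_obtain_alias(inode);

		if (!IS_ERR(parent))
			parent->d_op = sb->s_root->d_op;
		return parent;
	}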
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index bf326d4356a3..8ae32e37673c 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -78,7 +78,7 @@ static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
78 * for creation. 78 * for creation.
79 */ 79 */
80 if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) { 80 if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
81 if (nd->flags & LOOKUP_CREATE) 81 if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
82 return 0; 82 return 0;
83 } 83 }
84 84
diff --git a/fs/file_table.c b/fs/file_table.c
index 0fbcacc3ea75..bbeeac6efa1a 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -32,6 +32,9 @@ struct files_stat_struct files_stat = {
32/* public. Not pretty! */ 32/* public. Not pretty! */
33__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); 33__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
34 34
35/* SLAB cache for file structures */
36static struct kmem_cache *filp_cachep __read_mostly;
37
35static struct percpu_counter nr_files __cacheline_aligned_in_smp; 38static struct percpu_counter nr_files __cacheline_aligned_in_smp;
36 39
37static inline void file_free_rcu(struct rcu_head *head) 40static inline void file_free_rcu(struct rcu_head *head)
@@ -397,7 +400,12 @@ too_bad:
397void __init files_init(unsigned long mempages) 400void __init files_init(unsigned long mempages)
398{ 401{
399 int n; 402 int n;
400 /* One file with associated inode and dcache is very roughly 1K. 403
404 filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
405 SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
406
407 /*
408 * One file with associated inode and dcache is very roughly 1K.
401 * By default, don't use more than 10% of our memory for files. 409 * By default, don't use more than 10% of our memory for files.
402 */ 410 */
403 411
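
files_init() now creates the filp cache itself instead of relying on it being set up elsewhere; SLAB_PANIC makes a failure to create the cache fatal at boot, which is why no NULL check follows. Restated from the hunk:

	/* Sketch: a boot-time slab cache for struct file. */
	static struct kmem_cache *filp_cachep __read_mostly;

	static void __init files_init_sketch(void)
	{
		filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
						SLAB_HWCACHE_ALIGN | SLAB_PANIC,
						NULL);	/* no constructor */
	}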
diff --git a/fs/filesystems.c b/fs/filesystems.c
index d0e20ced62dd..d488dcd7f2bb 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -253,24 +253,27 @@ static int __init proc_filesystems_init(void)
253module_init(proc_filesystems_init); 253module_init(proc_filesystems_init);
254#endif 254#endif
255 255
256struct file_system_type *get_fs_type(const char *name) 256static struct file_system_type *__get_fs_type(const char *name, int len)
257{ 257{
258 struct file_system_type *fs; 258 struct file_system_type *fs;
259 const char *dot = strchr(name, '.');
260 unsigned len = dot ? dot - name : strlen(name);
261 259
262 read_lock(&file_systems_lock); 260 read_lock(&file_systems_lock);
263 fs = *(find_filesystem(name, len)); 261 fs = *(find_filesystem(name, len));
264 if (fs && !try_module_get(fs->owner)) 262 if (fs && !try_module_get(fs->owner))
265 fs = NULL; 263 fs = NULL;
266 read_unlock(&file_systems_lock); 264 read_unlock(&file_systems_lock);
267 if (!fs && (request_module("%.*s", len, name) == 0)) { 265 return fs;
268 read_lock(&file_systems_lock); 266}
269 fs = *(find_filesystem(name, len)); 267
270 if (fs && !try_module_get(fs->owner)) 268struct file_system_type *get_fs_type(const char *name)
271 fs = NULL; 269{
272 read_unlock(&file_systems_lock); 270 struct file_system_type *fs;
273 } 271 const char *dot = strchr(name, '.');
272 int len = dot ? dot - name : strlen(name);
273
274 fs = __get_fs_type(name, len);
275 if (!fs && (request_module("%.*s", len, name) == 0))
276 fs = __get_fs_type(name, len);
274 277
275 if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) { 278 if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
276 put_filesystem(fs); 279 put_filesystem(fs);
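
The refactor pulls the locked table lookup into __get_fs_type() so the autoload logic reads linearly: try the registered list, ask kmod to load a module named after the filesystem, then try once more. The "%.*s" format truncates at the first dot so that subtyped names such as "fuse.sshfs" request the base module. Restated:

	/* Sketch: lookup, modprobe on miss, then retry the lookup. */
	static struct file_system_type *get_fs_type_sketch(const char *name)
	{
		const char *dot = strchr(name, '.');
		int len = dot ? dot - name : strlen(name);
		struct file_system_type *fs;

		fs = __get_fs_type(name, len);
		if (!fs && request_module("%.*s", len, name) == 0)
			fs = __get_fs_type(name, len);	/* module loaded? */
		return fs;
	}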
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 9f3f2ceb73f0..03a6ea5e99f7 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -325,8 +325,10 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
325 if (!VXFS_ISIMMED(vip)) { 325 if (!VXFS_ISIMMED(vip)) {
326 ip->i_op = &page_symlink_inode_operations; 326 ip->i_op = &page_symlink_inode_operations;
327 ip->i_mapping->a_ops = &vxfs_aops; 327 ip->i_mapping->a_ops = &vxfs_aops;
328 } else 328 } else {
329 ip->i_op = &vxfs_immed_symlink_iops; 329 ip->i_op = &vxfs_immed_symlink_iops;
330 vip->vii_immed.vi_immed[ip->i_size] = '\0';
331 }
330 } else 332 } else
331 init_special_inode(ip, ip->i_mode, old_decode_dev(vip->vii_rdev)); 333 init_special_inode(ip, ip->i_mode, old_decode_dev(vip->vii_rdev));
332 334
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d0ff0b8cf309..e5eaa62fd17f 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -421,9 +421,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
421 * If we're a pdflush thread, then implement pdflush collision avoidance 421 * If we're a pdflush thread, then implement pdflush collision avoidance
422 * against the entire list. 422 * against the entire list.
423 * 423 *
424 * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
425 * that it can be located for waiting on in __writeback_single_inode().
426 *
427 * If `bdi' is non-zero then we're being asked to writeback a specific queue. 424 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
428 * This function assumes that the blockdev superblock's inodes are backed by 425 * This function assumes that the blockdev superblock's inodes are backed by
429 * a variety of queues, so all inodes are searched. For other superblocks, 426 * a variety of queues, so all inodes are searched. For other superblocks,
@@ -443,6 +440,7 @@ void generic_sync_sb_inodes(struct super_block *sb,
443 struct writeback_control *wbc) 440 struct writeback_control *wbc)
444{ 441{
445 const unsigned long start = jiffies; /* livelock avoidance */ 442 const unsigned long start = jiffies; /* livelock avoidance */
443 int sync = wbc->sync_mode == WB_SYNC_ALL;
446 444
447 spin_lock(&inode_lock); 445 spin_lock(&inode_lock);
448 if (!wbc->for_kupdate || list_empty(&sb->s_io)) 446 if (!wbc->for_kupdate || list_empty(&sb->s_io))
@@ -499,10 +497,6 @@ void generic_sync_sb_inodes(struct super_block *sb,
499 __iget(inode); 497 __iget(inode);
500 pages_skipped = wbc->pages_skipped; 498 pages_skipped = wbc->pages_skipped;
501 __writeback_single_inode(inode, wbc); 499 __writeback_single_inode(inode, wbc);
502 if (wbc->sync_mode == WB_SYNC_HOLD) {
503 inode->dirtied_when = jiffies;
504 list_move(&inode->i_list, &sb->s_dirty);
505 }
506 if (current_is_pdflush()) 500 if (current_is_pdflush())
507 writeback_release(bdi); 501 writeback_release(bdi);
508 if (wbc->pages_skipped != pages_skipped) { 502 if (wbc->pages_skipped != pages_skipped) {
@@ -523,7 +517,49 @@ void generic_sync_sb_inodes(struct super_block *sb,
523 if (!list_empty(&sb->s_more_io)) 517 if (!list_empty(&sb->s_more_io))
524 wbc->more_io = 1; 518 wbc->more_io = 1;
525 } 519 }
526 spin_unlock(&inode_lock); 520
521 if (sync) {
522 struct inode *inode, *old_inode = NULL;
523
524 /*
525 * Data integrity sync. Must wait for all pages under writeback,
 526 * because pages dirtied before our sync call may already
 527 * have had writeout started; in that case the inode may no
 528 * longer be on the dirty list, but we still have to wait
 529 * for that writeout.
530 */
531 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
532 struct address_space *mapping;
533
534 if (inode->i_state & (I_FREEING|I_WILL_FREE))
535 continue;
536 mapping = inode->i_mapping;
537 if (mapping->nrpages == 0)
538 continue;
539 __iget(inode);
540 spin_unlock(&inode_lock);
541 /*
542 * We hold a reference to 'inode' so it couldn't have
 543 * been removed from the s_inodes list while we dropped
 544 * the inode_lock. We cannot iput the inode now, as we
 545 * may be holding the last reference, and we cannot iput it
546 * under inode_lock. So we keep the reference and iput
547 * it later.
548 */
549 iput(old_inode);
550 old_inode = inode;
551
552 filemap_fdatawait(mapping);
553
554 cond_resched();
555
556 spin_lock(&inode_lock);
557 }
558 spin_unlock(&inode_lock);
559 iput(old_inode);
560 } else
561 spin_unlock(&inode_lock);
562
527 return; /* Leave any unwritten inodes on s_io */ 563 return; /* Leave any unwritten inodes on s_io */
528} 564}
529EXPORT_SYMBOL_GPL(generic_sync_sb_inodes); 565EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
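
The new wait loop uses a standard trick for walking an inode list while sleeping: pin the current inode with __iget() before dropping inode_lock, and defer the matching iput() of the previous inode until the lock is no longer held, since iput() can sleep (and may even delete the inode) and must never run under inode_lock. Extracted from the hunk into one function:

	/* Sketch: pin-walk-defer-iput over sb->s_inodes. */
	static void wait_sb_inodes_sketch(struct super_block *sb)
	{
		struct inode *inode, *old_inode = NULL;

		spin_lock(&inode_lock);
		list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
			if (inode->i_state & (I_FREEING | I_WILL_FREE))
				continue;
			if (inode->i_mapping->nrpages == 0)
				continue;
			__iget(inode);			/* pin across unlock */
			spin_unlock(&inode_lock);

			iput(old_inode);		/* safe: lock dropped */
			old_inode = inode;
			filemap_fdatawait(inode->i_mapping);
			cond_resched();

			spin_lock(&inode_lock);
		}
		spin_unlock(&inode_lock);
		iput(old_inode);			/* release the last pin */
	}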
@@ -588,8 +624,7 @@ restart:
588 624
589/* 625/*
590 * writeback and wait upon the filesystem's dirty inodes. The caller will 626 * writeback and wait upon the filesystem's dirty inodes. The caller will
591 * do this in two passes - one to write, and one to wait. WB_SYNC_HOLD is 627 * do this in two passes - one to write, and one to wait.
592 * used to park the written inodes on sb->s_dirty for the wait pass.
593 * 628 *
594 * A finite limit is set on the number of pages which will be written. 629 * A finite limit is set on the number of pages which will be written.
595 * To prevent infinite livelock of sys_sync(). 630 * To prevent infinite livelock of sys_sync().
@@ -600,30 +635,21 @@ restart:
600void sync_inodes_sb(struct super_block *sb, int wait) 635void sync_inodes_sb(struct super_block *sb, int wait)
601{ 636{
602 struct writeback_control wbc = { 637 struct writeback_control wbc = {
603 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD, 638 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
604 .range_start = 0, 639 .range_start = 0,
605 .range_end = LLONG_MAX, 640 .range_end = LLONG_MAX,
606 }; 641 };
607 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
608 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
609 642
610 wbc.nr_to_write = nr_dirty + nr_unstable + 643 if (!wait) {
611 (inodes_stat.nr_inodes - inodes_stat.nr_unused) + 644 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
612 nr_dirty + nr_unstable; 645 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
613 wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */
614 sync_sb_inodes(sb, &wbc);
615}
616 646
617/* 647 wbc.nr_to_write = nr_dirty + nr_unstable +
618 * Rather lame livelock avoidance. 648 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
619 */ 649 } else
620static void set_sb_syncing(int val) 650 wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */
621{ 651
622 struct super_block *sb; 652 sync_sb_inodes(sb, &wbc);
623 spin_lock(&sb_lock);
624 list_for_each_entry_reverse(sb, &super_blocks, s_list)
625 sb->s_syncing = val;
626 spin_unlock(&sb_lock);
627} 653}
628 654
629/** 655/**
@@ -652,9 +678,6 @@ static void __sync_inodes(int wait)
652 spin_lock(&sb_lock); 678 spin_lock(&sb_lock);
653restart: 679restart:
654 list_for_each_entry(sb, &super_blocks, s_list) { 680 list_for_each_entry(sb, &super_blocks, s_list) {
655 if (sb->s_syncing)
656 continue;
657 sb->s_syncing = 1;
658 sb->s_count++; 681 sb->s_count++;
659 spin_unlock(&sb_lock); 682 spin_unlock(&sb_lock);
660 down_read(&sb->s_umount); 683 down_read(&sb->s_umount);
@@ -672,13 +695,10 @@ restart:
672 695
673void sync_inodes(int wait) 696void sync_inodes(int wait)
674{ 697{
675 set_sb_syncing(0);
676 __sync_inodes(0); 698 __sync_inodes(0);
677 699
678 if (wait) { 700 if (wait)
679 set_sb_syncing(0);
680 __sync_inodes(1); 701 __sync_inodes(1);
681 }
682} 702}
683 703
684/** 704/**
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 4f3cab321415..99c99dfb0373 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -1,6 +1,6 @@
1/* 1/*
2 FUSE: Filesystem in Userspace 2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> 3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
4 4
5 This program can be distributed under the terms of the GNU GPL. 5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING. 6 See the file COPYING.
@@ -48,11 +48,13 @@ static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
48 size_t size; 48 size_t size;
49 49
50 if (!*ppos) { 50 if (!*ppos) {
51 long value;
51 struct fuse_conn *fc = fuse_ctl_file_conn_get(file); 52 struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
52 if (!fc) 53 if (!fc)
53 return 0; 54 return 0;
54 55
55 file->private_data=(void *)(long)atomic_read(&fc->num_waiting); 56 value = atomic_read(&fc->num_waiting);
57 file->private_data = (void *)value;
56 fuse_conn_put(fc); 58 fuse_conn_put(fc);
57 } 59 }
58 size = sprintf(tmp, "%ld\n", (long)file->private_data); 60 size = sprintf(tmp, "%ld\n", (long)file->private_data);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index fba571648a8e..e0c7ada08a1f 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1,6 +1,6 @@
1/* 1/*
2 FUSE: Filesystem in Userspace 2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> 3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
4 4
5 This program can be distributed under the terms of the GNU GPL. 5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING. 6 See the file COPYING.
@@ -269,7 +269,7 @@ static void flush_bg_queue(struct fuse_conn *fc)
269 * Called with fc->lock, unlocks it 269 * Called with fc->lock, unlocks it
270 */ 270 */
271static void request_end(struct fuse_conn *fc, struct fuse_req *req) 271static void request_end(struct fuse_conn *fc, struct fuse_req *req)
272 __releases(fc->lock) 272__releases(&fc->lock)
273{ 273{
274 void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; 274 void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
275 req->end = NULL; 275 req->end = NULL;
@@ -293,13 +293,13 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
293 wake_up(&req->waitq); 293 wake_up(&req->waitq);
294 if (end) 294 if (end)
295 end(fc, req); 295 end(fc, req);
296 else 296 fuse_put_request(fc, req);
297 fuse_put_request(fc, req);
298} 297}
299 298
300static void wait_answer_interruptible(struct fuse_conn *fc, 299static void wait_answer_interruptible(struct fuse_conn *fc,
301 struct fuse_req *req) 300 struct fuse_req *req)
302 __releases(fc->lock) __acquires(fc->lock) 301__releases(&fc->lock)
302__acquires(&fc->lock)
303{ 303{
304 if (signal_pending(current)) 304 if (signal_pending(current))
305 return; 305 return;
@@ -317,7 +317,8 @@ static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
317} 317}
318 318
319static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) 319static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
320 __releases(fc->lock) __acquires(fc->lock) 320__releases(&fc->lock)
321__acquires(&fc->lock)
321{ 322{
322 if (!fc->no_interrupt) { 323 if (!fc->no_interrupt) {
323 /* Any signal may interrupt this */ 324 /* Any signal may interrupt this */
@@ -380,7 +381,7 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
380 } 381 }
381} 382}
382 383
383void request_send(struct fuse_conn *fc, struct fuse_req *req) 384void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
384{ 385{
385 req->isreply = 1; 386 req->isreply = 1;
386 spin_lock(&fc->lock); 387 spin_lock(&fc->lock);
@@ -399,8 +400,8 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req)
399 spin_unlock(&fc->lock); 400 spin_unlock(&fc->lock);
400} 401}
401 402
402static void request_send_nowait_locked(struct fuse_conn *fc, 403static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
403 struct fuse_req *req) 404 struct fuse_req *req)
404{ 405{
405 req->background = 1; 406 req->background = 1;
406 fc->num_background++; 407 fc->num_background++;
@@ -414,11 +415,11 @@ static void request_send_nowait_locked(struct fuse_conn *fc,
414 flush_bg_queue(fc); 415 flush_bg_queue(fc);
415} 416}
416 417
417static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) 418static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
418{ 419{
419 spin_lock(&fc->lock); 420 spin_lock(&fc->lock);
420 if (fc->connected) { 421 if (fc->connected) {
421 request_send_nowait_locked(fc, req); 422 fuse_request_send_nowait_locked(fc, req);
422 spin_unlock(&fc->lock); 423 spin_unlock(&fc->lock);
423 } else { 424 } else {
424 req->out.h.error = -ENOTCONN; 425 req->out.h.error = -ENOTCONN;
@@ -426,16 +427,16 @@ static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
426 } 427 }
427} 428}
428 429
429void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req) 430void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
430{ 431{
431 req->isreply = 0; 432 req->isreply = 0;
432 request_send_nowait(fc, req); 433 fuse_request_send_nowait(fc, req);
433} 434}
434 435
435void request_send_background(struct fuse_conn *fc, struct fuse_req *req) 436void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
436{ 437{
437 req->isreply = 1; 438 req->isreply = 1;
438 request_send_nowait(fc, req); 439 fuse_request_send_nowait(fc, req);
439} 440}
440 441
441/* 442/*
@@ -443,10 +444,11 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
443 * 444 *
444 * fc->connected must have been checked previously 445 * fc->connected must have been checked previously
445 */ 446 */
446void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req) 447void fuse_request_send_background_locked(struct fuse_conn *fc,
448 struct fuse_req *req)
447{ 449{
448 req->isreply = 1; 450 req->isreply = 1;
449 request_send_nowait_locked(fc, req); 451 fuse_request_send_nowait_locked(fc, req);
450} 452}
451 453
452/* 454/*
@@ -539,8 +541,8 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
539 BUG_ON(!cs->nr_segs); 541 BUG_ON(!cs->nr_segs);
540 cs->seglen = cs->iov[0].iov_len; 542 cs->seglen = cs->iov[0].iov_len;
541 cs->addr = (unsigned long) cs->iov[0].iov_base; 543 cs->addr = (unsigned long) cs->iov[0].iov_base;
542 cs->iov ++; 544 cs->iov++;
543 cs->nr_segs --; 545 cs->nr_segs--;
544 } 546 }
545 down_read(&current->mm->mmap_sem); 547 down_read(&current->mm->mmap_sem);
546 err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0, 548 err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
@@ -589,9 +591,11 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
589 kunmap_atomic(mapaddr, KM_USER1); 591 kunmap_atomic(mapaddr, KM_USER1);
590 } 592 }
591 while (count) { 593 while (count) {
592 int err; 594 if (!cs->len) {
593 if (!cs->len && (err = fuse_copy_fill(cs))) 595 int err = fuse_copy_fill(cs);
594 return err; 596 if (err)
597 return err;
598 }
595 if (page) { 599 if (page) {
596 void *mapaddr = kmap_atomic(page, KM_USER1); 600 void *mapaddr = kmap_atomic(page, KM_USER1);
597 void *buf = mapaddr + offset; 601 void *buf = mapaddr + offset;
@@ -631,9 +635,11 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
631static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size) 635static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
632{ 636{
633 while (size) { 637 while (size) {
634 int err; 638 if (!cs->len) {
635 if (!cs->len && (err = fuse_copy_fill(cs))) 639 int err = fuse_copy_fill(cs);
636 return err; 640 if (err)
641 return err;
642 }
637 fuse_copy_do(cs, &val, &size); 643 fuse_copy_do(cs, &val, &size);
638 } 644 }
639 return 0; 645 return 0;
@@ -664,6 +670,8 @@ static int request_pending(struct fuse_conn *fc)
664 670
665/* Wait until a request is available on the pending list */ 671/* Wait until a request is available on the pending list */
666static void request_wait(struct fuse_conn *fc) 672static void request_wait(struct fuse_conn *fc)
673__releases(&fc->lock)
674__acquires(&fc->lock)
667{ 675{
668 DECLARE_WAITQUEUE(wait, current); 676 DECLARE_WAITQUEUE(wait, current);
669 677
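
The annotation churn through this file is for sparse: __releases()/__acquires() declare a function's lock context so sparse's context checker can verify callers. The patch also standardizes on passing the lock's address and placing the annotations unindented between the prototype and the body. A minimal sketch of the convention:

	/* Sketch: annotate a helper that is entered with the lock held
	 * and returns having dropped it, for sparse context checking. */
	static void drops_fc_lock_sketch(struct fuse_conn *fc)
	__releases(&fc->lock)
	{
		spin_unlock(&fc->lock);
	}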
@@ -691,7 +699,7 @@ static void request_wait(struct fuse_conn *fc)
691 */ 699 */
692static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req, 700static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req,
693 const struct iovec *iov, unsigned long nr_segs) 701 const struct iovec *iov, unsigned long nr_segs)
694 __releases(fc->lock) 702__releases(&fc->lock)
695{ 703{
696 struct fuse_copy_state cs; 704 struct fuse_copy_state cs;
697 struct fuse_in_header ih; 705 struct fuse_in_header ih;
@@ -813,6 +821,34 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
813 return err; 821 return err;
814} 822}
815 823
824static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
825 struct fuse_copy_state *cs)
826{
827 struct fuse_notify_poll_wakeup_out outarg;
828 int err;
829
830 if (size != sizeof(outarg))
831 return -EINVAL;
832
833 err = fuse_copy_one(cs, &outarg, sizeof(outarg));
834 if (err)
835 return err;
836
837 return fuse_notify_poll_wakeup(fc, &outarg);
838}
839
840static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
841 unsigned int size, struct fuse_copy_state *cs)
842{
843 switch (code) {
844 case FUSE_NOTIFY_POLL:
845 return fuse_notify_poll(fc, size, cs);
846
847 default:
848 return -EINVAL;
849 }
850}
851
816/* Look up request on processing list by unique ID */ 852/* Look up request on processing list by unique ID */
817static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique) 853static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
818{ 854{
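
fuse_notify() gives the device protocol a second message class: a write whose header has unique == 0 is not a reply to any request; the error field is reinterpreted as a notification code and the rest of the buffer is the payload (dispatched from fuse_dev_write() below). Each handler validates its fixed-size payload before acting, as fuse_notify_poll() does above; a handler for any such code would follow the same shape:

	/* Sketch: a notification handler (mirrors fuse_notify_poll above). */
	static int notify_sketch(struct fuse_conn *fc, unsigned int size,
				 struct fuse_copy_state *cs)
	{
		struct fuse_notify_poll_wakeup_out outarg;
		int err;

		if (size != sizeof(outarg))	/* exact-size payload only */
			return -EINVAL;
		err = fuse_copy_one(cs, &outarg, sizeof(outarg));
		if (err)
			return err;
		return fuse_notify_poll_wakeup(fc, &outarg);
	}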
@@ -876,9 +912,23 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
876 err = fuse_copy_one(&cs, &oh, sizeof(oh)); 912 err = fuse_copy_one(&cs, &oh, sizeof(oh));
877 if (err) 913 if (err)
878 goto err_finish; 914 goto err_finish;
915
916 err = -EINVAL;
917 if (oh.len != nbytes)
918 goto err_finish;
919
920 /*
921 * Zero oh.unique indicates unsolicited notification message
922 * and error contains notification code.
923 */
924 if (!oh.unique) {
925 err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs);
926 fuse_copy_finish(&cs);
927 return err ? err : nbytes;
928 }
929
879 err = -EINVAL; 930 err = -EINVAL;
880 if (!oh.unique || oh.error <= -1000 || oh.error > 0 || 931 if (oh.error <= -1000 || oh.error > 0)
881 oh.len != nbytes)
882 goto err_finish; 932 goto err_finish;
883 933
884 spin_lock(&fc->lock); 934 spin_lock(&fc->lock);
@@ -966,6 +1016,8 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
966 * This function releases and reacquires fc->lock 1016 * This function releases and reacquires fc->lock
967 */ 1017 */
968static void end_requests(struct fuse_conn *fc, struct list_head *head) 1018static void end_requests(struct fuse_conn *fc, struct list_head *head)
1019__releases(&fc->lock)
1020__acquires(&fc->lock)
969{ 1021{
970 while (!list_empty(head)) { 1022 while (!list_empty(head)) {
971 struct fuse_req *req; 1023 struct fuse_req *req;
@@ -988,7 +1040,8 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head)
988 * locked). 1040 * locked).
989 */ 1041 */
990static void end_io_requests(struct fuse_conn *fc) 1042static void end_io_requests(struct fuse_conn *fc)
991 __releases(fc->lock) __acquires(fc->lock) 1043__releases(&fc->lock)
1044__acquires(&fc->lock)
992{ 1045{
993 while (!list_empty(&fc->io)) { 1046 while (!list_empty(&fc->io)) {
994 struct fuse_req *req = 1047 struct fuse_req *req =
@@ -1002,11 +1055,11 @@ static void end_io_requests(struct fuse_conn *fc)
1002 wake_up(&req->waitq); 1055 wake_up(&req->waitq);
1003 if (end) { 1056 if (end) {
1004 req->end = NULL; 1057 req->end = NULL;
1005 /* The end function will consume this reference */
1006 __fuse_get_request(req); 1058 __fuse_get_request(req);
1007 spin_unlock(&fc->lock); 1059 spin_unlock(&fc->lock);
1008 wait_event(req->waitq, !req->locked); 1060 wait_event(req->waitq, !req->locked);
1009 end(fc, req); 1061 end(fc, req);
1062 fuse_put_request(fc, req);
1010 spin_lock(&fc->lock); 1063 spin_lock(&fc->lock);
1011 } 1064 }
1012 } 1065 }
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 95bc22bdd060..fdff346e96fd 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1,6 +1,6 @@
1/* 1/*
2 FUSE: Filesystem in Userspace 2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> 3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
4 4
5 This program can be distributed under the terms of the GNU GPL. 5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING. 6 See the file COPYING.
@@ -189,7 +189,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
189 parent = dget_parent(entry); 189 parent = dget_parent(entry);
190 fuse_lookup_init(fc, req, get_node_id(parent->d_inode), 190 fuse_lookup_init(fc, req, get_node_id(parent->d_inode),
191 &entry->d_name, &outarg); 191 &entry->d_name, &outarg);
192 request_send(fc, req); 192 fuse_request_send(fc, req);
193 dput(parent); 193 dput(parent);
194 err = req->out.h.error; 194 err = req->out.h.error;
195 fuse_put_request(fc, req); 195 fuse_put_request(fc, req);
@@ -204,7 +204,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
204 return 0; 204 return 0;
205 } 205 }
206 spin_lock(&fc->lock); 206 spin_lock(&fc->lock);
207 fi->nlookup ++; 207 fi->nlookup++;
208 spin_unlock(&fc->lock); 208 spin_unlock(&fc->lock);
209 } 209 }
210 fuse_put_request(fc, forget_req); 210 fuse_put_request(fc, forget_req);
@@ -283,7 +283,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
283 attr_version = fuse_get_attr_version(fc); 283 attr_version = fuse_get_attr_version(fc);
284 284
285 fuse_lookup_init(fc, req, nodeid, name, outarg); 285 fuse_lookup_init(fc, req, nodeid, name, outarg);
286 request_send(fc, req); 286 fuse_request_send(fc, req);
287 err = req->out.h.error; 287 err = req->out.h.error;
288 fuse_put_request(fc, req); 288 fuse_put_request(fc, req);
289 /* Zero nodeid is same as -ENOENT, but with valid timeout */ 289 /* Zero nodeid is same as -ENOENT, but with valid timeout */
@@ -369,7 +369,7 @@ static void fuse_sync_release(struct fuse_conn *fc, struct fuse_file *ff,
369{ 369{
370 fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE); 370 fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE);
371 ff->reserved_req->force = 1; 371 ff->reserved_req->force = 1;
372 request_send(fc, ff->reserved_req); 372 fuse_request_send(fc, ff->reserved_req);
373 fuse_put_request(fc, ff->reserved_req); 373 fuse_put_request(fc, ff->reserved_req);
374 kfree(ff); 374 kfree(ff);
375} 375}
@@ -408,7 +408,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
408 goto out_put_forget_req; 408 goto out_put_forget_req;
409 409
410 err = -ENOMEM; 410 err = -ENOMEM;
411 ff = fuse_file_alloc(); 411 ff = fuse_file_alloc(fc);
412 if (!ff) 412 if (!ff)
413 goto out_put_request; 413 goto out_put_request;
414 414
@@ -432,7 +432,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
432 req->out.args[0].value = &outentry; 432 req->out.args[0].value = &outentry;
433 req->out.args[1].size = sizeof(outopen); 433 req->out.args[1].size = sizeof(outopen);
434 req->out.args[1].value = &outopen; 434 req->out.args[1].value = &outopen;
435 request_send(fc, req); 435 fuse_request_send(fc, req);
436 err = req->out.h.error; 436 err = req->out.h.error;
437 if (err) { 437 if (err) {
438 if (err == -ENOSYS) 438 if (err == -ENOSYS)
@@ -502,7 +502,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
502 else 502 else
503 req->out.args[0].size = sizeof(outarg); 503 req->out.args[0].size = sizeof(outarg);
504 req->out.args[0].value = &outarg; 504 req->out.args[0].value = &outarg;
505 request_send(fc, req); 505 fuse_request_send(fc, req);
506 err = req->out.h.error; 506 err = req->out.h.error;
507 fuse_put_request(fc, req); 507 fuse_put_request(fc, req);
508 if (err) 508 if (err)
@@ -631,15 +631,17 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
631 req->in.numargs = 1; 631 req->in.numargs = 1;
632 req->in.args[0].size = entry->d_name.len + 1; 632 req->in.args[0].size = entry->d_name.len + 1;
633 req->in.args[0].value = entry->d_name.name; 633 req->in.args[0].value = entry->d_name.name;
634 request_send(fc, req); 634 fuse_request_send(fc, req);
635 err = req->out.h.error; 635 err = req->out.h.error;
636 fuse_put_request(fc, req); 636 fuse_put_request(fc, req);
637 if (!err) { 637 if (!err) {
638 struct inode *inode = entry->d_inode; 638 struct inode *inode = entry->d_inode;
639 639
640 /* Set nlink to zero so the inode can be cleared, if 640 /*
641 the inode does have more links this will be 641 * Set nlink to zero so the inode can be cleared; if the inode
642 discovered at the next lookup/getattr */ 642 * does have more links, this will be discovered at the next
643 * lookup/getattr.
644 */
643 clear_nlink(inode); 645 clear_nlink(inode);
644 fuse_invalidate_attr(inode); 646 fuse_invalidate_attr(inode);
645 fuse_invalidate_attr(dir); 647 fuse_invalidate_attr(dir);
@@ -662,7 +664,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
662 req->in.numargs = 1; 664 req->in.numargs = 1;
663 req->in.args[0].size = entry->d_name.len + 1; 665 req->in.args[0].size = entry->d_name.len + 1;
664 req->in.args[0].value = entry->d_name.name; 666 req->in.args[0].value = entry->d_name.name;
665 request_send(fc, req); 667 fuse_request_send(fc, req);
666 err = req->out.h.error; 668 err = req->out.h.error;
667 fuse_put_request(fc, req); 669 fuse_put_request(fc, req);
668 if (!err) { 670 if (!err) {
@@ -695,7 +697,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
695 req->in.args[1].value = oldent->d_name.name; 697 req->in.args[1].value = oldent->d_name.name;
696 req->in.args[2].size = newent->d_name.len + 1; 698 req->in.args[2].size = newent->d_name.len + 1;
697 req->in.args[2].value = newent->d_name.name; 699 req->in.args[2].value = newent->d_name.name;
698 request_send(fc, req); 700 fuse_request_send(fc, req);
699 err = req->out.h.error; 701 err = req->out.h.error;
700 fuse_put_request(fc, req); 702 fuse_put_request(fc, req);
701 if (!err) { 703 if (!err) {
@@ -811,7 +813,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
811 else 813 else
812 req->out.args[0].size = sizeof(outarg); 814 req->out.args[0].size = sizeof(outarg);
813 req->out.args[0].value = &outarg; 815 req->out.args[0].value = &outarg;
814 request_send(fc, req); 816 fuse_request_send(fc, req);
815 err = req->out.h.error; 817 err = req->out.h.error;
816 fuse_put_request(fc, req); 818 fuse_put_request(fc, req);
817 if (!err) { 819 if (!err) {
@@ -911,7 +913,7 @@ static int fuse_access(struct inode *inode, int mask)
911 req->in.numargs = 1; 913 req->in.numargs = 1;
912 req->in.args[0].size = sizeof(inarg); 914 req->in.args[0].size = sizeof(inarg);
913 req->in.args[0].value = &inarg; 915 req->in.args[0].value = &inarg;
914 request_send(fc, req); 916 fuse_request_send(fc, req);
915 err = req->out.h.error; 917 err = req->out.h.error;
916 fuse_put_request(fc, req); 918 fuse_put_request(fc, req);
917 if (err == -ENOSYS) { 919 if (err == -ENOSYS) {
@@ -1033,7 +1035,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
1033 req->num_pages = 1; 1035 req->num_pages = 1;
1034 req->pages[0] = page; 1036 req->pages[0] = page;
1035 fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR); 1037 fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR);
1036 request_send(fc, req); 1038 fuse_request_send(fc, req);
1037 nbytes = req->out.args[0].size; 1039 nbytes = req->out.args[0].size;
1038 err = req->out.h.error; 1040 err = req->out.h.error;
1039 fuse_put_request(fc, req); 1041 fuse_put_request(fc, req);
@@ -1067,7 +1069,7 @@ static char *read_link(struct dentry *dentry)
1067 req->out.numargs = 1; 1069 req->out.numargs = 1;
1068 req->out.args[0].size = PAGE_SIZE - 1; 1070 req->out.args[0].size = PAGE_SIZE - 1;
1069 req->out.args[0].value = link; 1071 req->out.args[0].value = link;
1070 request_send(fc, req); 1072 fuse_request_send(fc, req);
1071 if (req->out.h.error) { 1073 if (req->out.h.error) {
1072 free_page((unsigned long) link); 1074 free_page((unsigned long) link);
1073 link = ERR_PTR(req->out.h.error); 1075 link = ERR_PTR(req->out.h.error);
@@ -1273,7 +1275,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
1273 else 1275 else
1274 req->out.args[0].size = sizeof(outarg); 1276 req->out.args[0].size = sizeof(outarg);
1275 req->out.args[0].value = &outarg; 1277 req->out.args[0].value = &outarg;
1276 request_send(fc, req); 1278 fuse_request_send(fc, req);
1277 err = req->out.h.error; 1279 err = req->out.h.error;
1278 fuse_put_request(fc, req); 1280 fuse_put_request(fc, req);
1279 if (err) { 1281 if (err) {
@@ -1367,7 +1369,7 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
1367 req->in.args[1].value = name; 1369 req->in.args[1].value = name;
1368 req->in.args[2].size = size; 1370 req->in.args[2].size = size;
1369 req->in.args[2].value = value; 1371 req->in.args[2].value = value;
1370 request_send(fc, req); 1372 fuse_request_send(fc, req);
1371 err = req->out.h.error; 1373 err = req->out.h.error;
1372 fuse_put_request(fc, req); 1374 fuse_put_request(fc, req);
1373 if (err == -ENOSYS) { 1375 if (err == -ENOSYS) {
@@ -1413,7 +1415,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
1413 req->out.args[0].size = sizeof(outarg); 1415 req->out.args[0].size = sizeof(outarg);
1414 req->out.args[0].value = &outarg; 1416 req->out.args[0].value = &outarg;
1415 } 1417 }
1416 request_send(fc, req); 1418 fuse_request_send(fc, req);
1417 ret = req->out.h.error; 1419 ret = req->out.h.error;
1418 if (!ret) 1420 if (!ret)
1419 ret = size ? req->out.args[0].size : outarg.size; 1421 ret = size ? req->out.args[0].size : outarg.size;
@@ -1463,7 +1465,7 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
1463 req->out.args[0].size = sizeof(outarg); 1465 req->out.args[0].size = sizeof(outarg);
1464 req->out.args[0].value = &outarg; 1466 req->out.args[0].value = &outarg;
1465 } 1467 }
1466 request_send(fc, req); 1468 fuse_request_send(fc, req);
1467 ret = req->out.h.error; 1469 ret = req->out.h.error;
1468 if (!ret) 1470 if (!ret)
1469 ret = size ? req->out.args[0].size : outarg.size; 1471 ret = size ? req->out.args[0].size : outarg.size;
@@ -1496,7 +1498,7 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
1496 req->in.numargs = 1; 1498 req->in.numargs = 1;
1497 req->in.args[0].size = strlen(name) + 1; 1499 req->in.args[0].size = strlen(name) + 1;
1498 req->in.args[0].value = name; 1500 req->in.args[0].value = name;
1499 request_send(fc, req); 1501 fuse_request_send(fc, req);
1500 err = req->out.h.error; 1502 err = req->out.h.error;
1501 fuse_put_request(fc, req); 1503 fuse_put_request(fc, req);
1502 if (err == -ENOSYS) { 1504 if (err == -ENOSYS) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 34930a964b82..e8162646a9b5 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1,6 +1,6 @@
1/* 1/*
2 FUSE: Filesystem in Userspace 2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> 3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
4 4
5 This program can be distributed under the terms of the GNU GPL. 5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING. 6 See the file COPYING.
@@ -39,14 +39,14 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
39 req->out.numargs = 1; 39 req->out.numargs = 1;
40 req->out.args[0].size = sizeof(*outargp); 40 req->out.args[0].size = sizeof(*outargp);
41 req->out.args[0].value = outargp; 41 req->out.args[0].value = outargp;
42 request_send(fc, req); 42 fuse_request_send(fc, req);
43 err = req->out.h.error; 43 err = req->out.h.error;
44 fuse_put_request(fc, req); 44 fuse_put_request(fc, req);
45 45
46 return err; 46 return err;
47} 47}
48 48
49struct fuse_file *fuse_file_alloc(void) 49struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
50{ 50{
51 struct fuse_file *ff; 51 struct fuse_file *ff;
52 ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); 52 ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
@@ -58,7 +58,12 @@ struct fuse_file *fuse_file_alloc(void)
58 } else { 58 } else {
59 INIT_LIST_HEAD(&ff->write_entry); 59 INIT_LIST_HEAD(&ff->write_entry);
60 atomic_set(&ff->count, 0); 60 atomic_set(&ff->count, 0);
61 spin_lock(&fc->lock);
62 ff->kh = ++fc->khctr;
63 spin_unlock(&fc->lock);
61 } 64 }
65 RB_CLEAR_NODE(&ff->polled_node);
66 init_waitqueue_head(&ff->poll_wait);
62 } 67 }
63 return ff; 68 return ff;
64} 69}
@@ -79,7 +84,6 @@ static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
79{ 84{
80 dput(req->misc.release.dentry); 85 dput(req->misc.release.dentry);
81 mntput(req->misc.release.vfsmount); 86 mntput(req->misc.release.vfsmount);
82 fuse_put_request(fc, req);
83} 87}
84 88
85static void fuse_file_put(struct fuse_file *ff) 89static void fuse_file_put(struct fuse_file *ff)
@@ -89,7 +93,7 @@ static void fuse_file_put(struct fuse_file *ff)
89 struct inode *inode = req->misc.release.dentry->d_inode; 93 struct inode *inode = req->misc.release.dentry->d_inode;
90 struct fuse_conn *fc = get_fuse_conn(inode); 94 struct fuse_conn *fc = get_fuse_conn(inode);
91 req->end = fuse_release_end; 95 req->end = fuse_release_end;
92 request_send_background(fc, req); 96 fuse_request_send_background(fc, req);
93 kfree(ff); 97 kfree(ff);
94 } 98 }
95} 99}
@@ -109,6 +113,7 @@ void fuse_finish_open(struct inode *inode, struct file *file,
109 113
110int fuse_open_common(struct inode *inode, struct file *file, int isdir) 114int fuse_open_common(struct inode *inode, struct file *file, int isdir)
111{ 115{
116 struct fuse_conn *fc = get_fuse_conn(inode);
112 struct fuse_open_out outarg; 117 struct fuse_open_out outarg;
113 struct fuse_file *ff; 118 struct fuse_file *ff;
114 int err; 119 int err;
@@ -121,7 +126,7 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir)
121 if (err) 126 if (err)
122 return err; 127 return err;
123 128
124 ff = fuse_file_alloc(); 129 ff = fuse_file_alloc(fc);
125 if (!ff) 130 if (!ff)
126 return -ENOMEM; 131 return -ENOMEM;
127 132
@@ -167,7 +172,11 @@ int fuse_release_common(struct inode *inode, struct file *file, int isdir)
167 172
168 spin_lock(&fc->lock); 173 spin_lock(&fc->lock);
169 list_del(&ff->write_entry); 174 list_del(&ff->write_entry);
175 if (!RB_EMPTY_NODE(&ff->polled_node))
176 rb_erase(&ff->polled_node, &fc->polled_files);
170 spin_unlock(&fc->lock); 177 spin_unlock(&fc->lock);
178
179 wake_up_interruptible_sync(&ff->poll_wait);
171 /* 180 /*
172 * Normally this will send the RELEASE request, 181 * Normally this will send the RELEASE request,
173 * however if some asynchronous READ or WRITE requests 182 * however if some asynchronous READ or WRITE requests
@@ -280,7 +289,7 @@ static int fuse_flush(struct file *file, fl_owner_t id)
280 req->in.args[0].size = sizeof(inarg); 289 req->in.args[0].size = sizeof(inarg);
281 req->in.args[0].value = &inarg; 290 req->in.args[0].value = &inarg;
282 req->force = 1; 291 req->force = 1;
283 request_send(fc, req); 292 fuse_request_send(fc, req);
284 err = req->out.h.error; 293 err = req->out.h.error;
285 fuse_put_request(fc, req); 294 fuse_put_request(fc, req);
286 if (err == -ENOSYS) { 295 if (err == -ENOSYS) {
@@ -344,7 +353,7 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
344 req->in.numargs = 1; 353 req->in.numargs = 1;
345 req->in.args[0].size = sizeof(inarg); 354 req->in.args[0].size = sizeof(inarg);
346 req->in.args[0].value = &inarg; 355 req->in.args[0].value = &inarg;
347 request_send(fc, req); 356 fuse_request_send(fc, req);
348 err = req->out.h.error; 357 err = req->out.h.error;
349 fuse_put_request(fc, req); 358 fuse_put_request(fc, req);
350 if (err == -ENOSYS) { 359 if (err == -ENOSYS) {
@@ -396,7 +405,7 @@ static size_t fuse_send_read(struct fuse_req *req, struct file *file,
396 inarg->read_flags |= FUSE_READ_LOCKOWNER; 405 inarg->read_flags |= FUSE_READ_LOCKOWNER;
397 inarg->lock_owner = fuse_lock_owner_id(fc, owner); 406 inarg->lock_owner = fuse_lock_owner_id(fc, owner);
398 } 407 }
399 request_send(fc, req); 408 fuse_request_send(fc, req);
400 return req->out.args[0].size; 409 return req->out.args[0].size;
401} 410}
402 411
@@ -493,7 +502,6 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
493 } 502 }
494 if (req->ff) 503 if (req->ff)
495 fuse_file_put(req->ff); 504 fuse_file_put(req->ff);
496 fuse_put_request(fc, req);
497} 505}
498 506
499static void fuse_send_readpages(struct fuse_req *req, struct file *file, 507static void fuse_send_readpages(struct fuse_req *req, struct file *file,
@@ -509,10 +517,11 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file,
509 struct fuse_file *ff = file->private_data; 517 struct fuse_file *ff = file->private_data;
510 req->ff = fuse_file_get(ff); 518 req->ff = fuse_file_get(ff);
511 req->end = fuse_readpages_end; 519 req->end = fuse_readpages_end;
512 request_send_background(fc, req); 520 fuse_request_send_background(fc, req);
513 } else { 521 } else {
514 request_send(fc, req); 522 fuse_request_send(fc, req);
515 fuse_readpages_end(fc, req); 523 fuse_readpages_end(fc, req);
524 fuse_put_request(fc, req);
516 } 525 }
517} 526}
518 527
@@ -543,7 +552,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
543 } 552 }
544 } 553 }
545 req->pages[req->num_pages] = page; 554 req->pages[req->num_pages] = page;
546 req->num_pages ++; 555 req->num_pages++;
547 return 0; 556 return 0;
548} 557}
549 558
@@ -636,7 +645,7 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file,
636 inarg->write_flags |= FUSE_WRITE_LOCKOWNER; 645 inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
637 inarg->lock_owner = fuse_lock_owner_id(fc, owner); 646 inarg->lock_owner = fuse_lock_owner_id(fc, owner);
638 } 647 }
639 request_send(fc, req); 648 fuse_request_send(fc, req);
640 return req->misc.write.out.size; 649 return req->misc.write.out.size;
641} 650}
642 651
@@ -646,7 +655,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
646{ 655{
647 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 656 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
648 657
649 *pagep = __grab_cache_page(mapping, index); 658 *pagep = grab_cache_page_write_begin(mapping, index, flags);
650 if (!*pagep) 659 if (!*pagep)
651 return -ENOMEM; 660 return -ENOMEM;
652 return 0; 661 return 0;
@@ -779,7 +788,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
779 break; 788 break;
780 789
781 err = -ENOMEM; 790 err = -ENOMEM;
782 page = __grab_cache_page(mapping, index); 791 page = grab_cache_page_write_begin(mapping, index, 0);
783 if (!page) 792 if (!page)
784 break; 793 break;
785 794
@@ -1042,7 +1051,6 @@ static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
1042{ 1051{
1043 __free_page(req->pages[0]); 1052 __free_page(req->pages[0]);
1044 fuse_file_put(req->ff); 1053 fuse_file_put(req->ff);
1045 fuse_put_request(fc, req);
1046} 1054}
1047 1055
1048static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) 1056static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
@@ -1060,6 +1068,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
1060 1068
1061/* Called under fc->lock, may release and reacquire it */ 1069/* Called under fc->lock, may release and reacquire it */
1062static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) 1070static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
1071__releases(&fc->lock)
1072__acquires(&fc->lock)
1063{ 1073{
1064 struct fuse_inode *fi = get_fuse_inode(req->inode); 1074 struct fuse_inode *fi = get_fuse_inode(req->inode);
1065 loff_t size = i_size_read(req->inode); 1075 loff_t size = i_size_read(req->inode);
@@ -1079,13 +1089,14 @@ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
1079 1089
1080 req->in.args[1].size = inarg->size; 1090 req->in.args[1].size = inarg->size;
1081 fi->writectr++; 1091 fi->writectr++;
1082 request_send_background_locked(fc, req); 1092 fuse_request_send_background_locked(fc, req);
1083 return; 1093 return;
1084 1094
1085 out_free: 1095 out_free:
1086 fuse_writepage_finish(fc, req); 1096 fuse_writepage_finish(fc, req);
1087 spin_unlock(&fc->lock); 1097 spin_unlock(&fc->lock);
1088 fuse_writepage_free(fc, req); 1098 fuse_writepage_free(fc, req);
1099 fuse_put_request(fc, req);
1089 spin_lock(&fc->lock); 1100 spin_lock(&fc->lock);
1090} 1101}
1091 1102
@@ -1096,6 +1107,8 @@ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
1096 * Called with fc->lock 1107 * Called with fc->lock
1097 */ 1108 */
1098void fuse_flush_writepages(struct inode *inode) 1109void fuse_flush_writepages(struct inode *inode)
1110__releases(&fc->lock)
1111__acquires(&fc->lock)
1099{ 1112{
1100 struct fuse_conn *fc = get_fuse_conn(inode); 1113 struct fuse_conn *fc = get_fuse_conn(inode);
1101 struct fuse_inode *fi = get_fuse_inode(inode); 1114 struct fuse_inode *fi = get_fuse_inode(inode);
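The __releases()/__acquires() annotations added to fuse_send_writepage() and fuse_flush_writepages() above are hints for the sparse static checker: they document that the function is entered with fc->lock held, drops it internally, and retakes it before returning. A minimal sketch of the pattern; the fallback macro definitions are assumed to match the kernel's <linux/compiler.h>, where they expand to nothing outside a sparse build:

    #ifdef __CHECKER__
    # define __releases(x)  __attribute__((context(x, 1, 0)))
    # define __acquires(x)  __attribute__((context(x, 0, 1)))
    #else
    # define __releases(x)
    # define __acquires(x)
    #endif

    /* Kernel-context sketch: entered with the lock held; drops and
     * retakes it, like fuse_send_writepage() does with fc->lock. */
    static void do_unlocked_work(struct fuse_conn *fc)
    __releases(&fc->lock)
    __acquires(&fc->lock)
    {
            spin_unlock(&fc->lock);
            /* ... work that must not run under the lock ... */
            spin_lock(&fc->lock);
    }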
@@ -1325,7 +1338,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl)
1325 req->out.numargs = 1; 1338 req->out.numargs = 1;
1326 req->out.args[0].size = sizeof(outarg); 1339 req->out.args[0].size = sizeof(outarg);
1327 req->out.args[0].value = &outarg; 1340 req->out.args[0].value = &outarg;
1328 request_send(fc, req); 1341 fuse_request_send(fc, req);
1329 err = req->out.h.error; 1342 err = req->out.h.error;
1330 fuse_put_request(fc, req); 1343 fuse_put_request(fc, req);
1331 if (!err) 1344 if (!err)
@@ -1357,7 +1370,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
1357 return PTR_ERR(req); 1370 return PTR_ERR(req);
1358 1371
1359 fuse_lk_fill(req, file, fl, opcode, pid, flock); 1372 fuse_lk_fill(req, file, fl, opcode, pid, flock);
1360 request_send(fc, req); 1373 fuse_request_send(fc, req);
1361 err = req->out.h.error; 1374 err = req->out.h.error;
1362 /* locking is restartable */ 1375 /* locking is restartable */
1363 if (err == -EINTR) 1376 if (err == -EINTR)
@@ -1433,7 +1446,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
1433 req->out.numargs = 1; 1446 req->out.numargs = 1;
1434 req->out.args[0].size = sizeof(outarg); 1447 req->out.args[0].size = sizeof(outarg);
1435 req->out.args[0].value = &outarg; 1448 req->out.args[0].value = &outarg;
1436 request_send(fc, req); 1449 fuse_request_send(fc, req);
1437 err = req->out.h.error; 1450 err = req->out.h.error;
1438 fuse_put_request(fc, req); 1451 fuse_put_request(fc, req);
1439 if (err == -ENOSYS) 1452 if (err == -ENOSYS)
@@ -1470,6 +1483,406 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
1470 return retval; 1483 return retval;
1471} 1484}
1472 1485
1486static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
1487 unsigned int nr_segs, size_t bytes, bool to_user)
1488{
1489 struct iov_iter ii;
1490 int page_idx = 0;
1491
1492 if (!bytes)
1493 return 0;
1494
1495 iov_iter_init(&ii, iov, nr_segs, bytes, 0);
1496
1497 while (iov_iter_count(&ii)) {
1498 struct page *page = pages[page_idx++];
1499 size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii));
1500 void *kaddr, *map;
1501
1502 kaddr = map = kmap(page);
1503
1504 while (todo) {
1505 char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
1506 size_t iov_len = ii.iov->iov_len - ii.iov_offset;
1507 size_t copy = min(todo, iov_len);
1508 size_t left;
1509
1510 if (!to_user)
1511 left = copy_from_user(kaddr, uaddr, copy);
1512 else
1513 left = copy_to_user(uaddr, kaddr, copy);
1514
1515 if (unlikely(left))
1516 return -EFAULT;
1517
1518 iov_iter_advance(&ii, copy);
1519 todo -= copy;
1520 kaddr += copy;
1521 }
1522
1523 kunmap(map);
1524 }
1525
1526 return 0;
1527}
1528
1529/*
1530 * For ioctls, there is no generic way to determine how much memory
1531 * needs to be read and/or written. Furthermore, ioctls are allowed
1532 * to dereference the passed pointer, so the parameter requires deep
1533 * copying but FUSE has no idea whatsoever about what to copy in or
1534 * out.
1535 *
1536 * This is solved by allowing FUSE server to retry ioctl with
1537 * necessary in/out iovecs. Let's assume the ioctl implementation
1538 * needs to read in the following structure.
1539 *
1540 * struct a {
1541 * char *buf;
1542 * size_t buflen;
1543 * }
1544 *
1545 * On the first callout to the FUSE server, inarg->in_size and
1546 * inarg->out_size will be zero; then, the server completes the ioctl
1547 * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and
1548 * the actual iov array to
1549 *
1550 * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } }
1551 *
1552 * which tells FUSE to copy in the requested area and retry the ioctl.
1553 * On the second round, the server has access to the structure and
1554 * from that it can tell what to look for next, so on the invocation,
1555 * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to
1556 *
1557 * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) },
1558 * { .iov_base = a.buf, .iov_len = a.buflen } }
1559 *
1560 * FUSE will copy both struct a and the pointed buffer from the
1561 * process doing the ioctl and retry ioctl with both struct a and the
1562 * buffer.
1563 *
1564 * This time, the FUSE server has everything it needs and completes the
1565 * ioctl without FUSE_IOCTL_RETRY, which finishes the ioctl call.
1566 *
1567 * Copying data out works the same way.
1568 *
1569 * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel
1570 * automatically initializes in and out iovs by decoding @cmd with
1571 * _IOC_* macros and the server is not allowed to request RETRY. This
1572 * limits ioctl data transfers to well-formed ioctls and is the forced
1573 * behavior for all FUSE servers.
1574 */
1575static long fuse_file_do_ioctl(struct file *file, unsigned int cmd,
1576 unsigned long arg, unsigned int flags)
1577{
1578 struct inode *inode = file->f_dentry->d_inode;
1579 struct fuse_file *ff = file->private_data;
1580 struct fuse_conn *fc = get_fuse_conn(inode);
1581 struct fuse_ioctl_in inarg = {
1582 .fh = ff->fh,
1583 .cmd = cmd,
1584 .arg = arg,
1585 .flags = flags
1586 };
1587 struct fuse_ioctl_out outarg;
1588 struct fuse_req *req = NULL;
1589 struct page **pages = NULL;
1590 struct page *iov_page = NULL;
1591 struct iovec *in_iov = NULL, *out_iov = NULL;
1592 unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages;
1593 size_t in_size, out_size, transferred;
1594 int err;
1595
1596 /* assume all the iovs returned by the client always fit in a page */
1597 BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
1598
1599 if (!fuse_allow_task(fc, current))
1600 return -EACCES;
1601
1602 err = -EIO;
1603 if (is_bad_inode(inode))
1604 goto out;
1605
1606 err = -ENOMEM;
1607 pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL);
1608 iov_page = alloc_page(GFP_KERNEL);
1609 if (!pages || !iov_page)
1610 goto out;
1611
1612 /*
1613 * If restricted, initialize IO parameters as encoded in @cmd.
1614 * RETRY from server is not allowed.
1615 */
1616 if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
1617 struct iovec *iov = page_address(iov_page);
1618
1619 iov->iov_base = (void __user *)arg;
1620 iov->iov_len = _IOC_SIZE(cmd);
1621
1622 if (_IOC_DIR(cmd) & _IOC_WRITE) {
1623 in_iov = iov;
1624 in_iovs = 1;
1625 }
1626
1627 if (_IOC_DIR(cmd) & _IOC_READ) {
1628 out_iov = iov;
1629 out_iovs = 1;
1630 }
1631 }
1632
1633 retry:
1634 inarg.in_size = in_size = iov_length(in_iov, in_iovs);
1635 inarg.out_size = out_size = iov_length(out_iov, out_iovs);
1636
1637 /*
1638 * Out data can be used either for actual out data or iovs,
1639 * make sure there always is at least one page.
1640 */
1641 out_size = max_t(size_t, out_size, PAGE_SIZE);
1642 max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE);
1643
1644 /* make sure there are enough buffer pages and init request with them */
1645 err = -ENOMEM;
1646 if (max_pages > FUSE_MAX_PAGES_PER_REQ)
1647 goto out;
1648 while (num_pages < max_pages) {
1649 pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
1650 if (!pages[num_pages])
1651 goto out;
1652 num_pages++;
1653 }
1654
1655 req = fuse_get_req(fc);
1656 if (IS_ERR(req)) {
1657 err = PTR_ERR(req);
1658 req = NULL;
1659 goto out;
1660 }
1661 memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages);
1662 req->num_pages = num_pages;
1663
1664 /* okay, let's send it to the client */
1665 req->in.h.opcode = FUSE_IOCTL;
1666 req->in.h.nodeid = get_node_id(inode);
1667 req->in.numargs = 1;
1668 req->in.args[0].size = sizeof(inarg);
1669 req->in.args[0].value = &inarg;
1670 if (in_size) {
1671 req->in.numargs++;
1672 req->in.args[1].size = in_size;
1673 req->in.argpages = 1;
1674
1675 err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size,
1676 false);
1677 if (err)
1678 goto out;
1679 }
1680
1681 req->out.numargs = 2;
1682 req->out.args[0].size = sizeof(outarg);
1683 req->out.args[0].value = &outarg;
1684 req->out.args[1].size = out_size;
1685 req->out.argpages = 1;
1686 req->out.argvar = 1;
1687
1688 fuse_request_send(fc, req);
1689 err = req->out.h.error;
1690 transferred = req->out.args[1].size;
1691 fuse_put_request(fc, req);
1692 req = NULL;
1693 if (err)
1694 goto out;
1695
1696 /* did it ask for retry? */
1697 if (outarg.flags & FUSE_IOCTL_RETRY) {
1698 char *vaddr;
1699
1700 /* no retry if in restricted mode */
1701 err = -EIO;
1702 if (!(flags & FUSE_IOCTL_UNRESTRICTED))
1703 goto out;
1704
1705 in_iovs = outarg.in_iovs;
1706 out_iovs = outarg.out_iovs;
1707
1708 /*
1709 * Make sure things are in boundary, separate checks
1710 * are to protect against overflow.
1711 */
1712 err = -ENOMEM;
1713 if (in_iovs > FUSE_IOCTL_MAX_IOV ||
1714 out_iovs > FUSE_IOCTL_MAX_IOV ||
1715 in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
1716 goto out;
1717
1718 err = -EIO;
1719 if ((in_iovs + out_iovs) * sizeof(struct iovec) != transferred)
1720 goto out;
1721
1722 /* okay, copy in iovs and retry */
1723 vaddr = kmap_atomic(pages[0], KM_USER0);
1724 memcpy(page_address(iov_page), vaddr, transferred);
1725 kunmap_atomic(vaddr, KM_USER0);
1726
1727 in_iov = page_address(iov_page);
1728 out_iov = in_iov + in_iovs;
1729
1730 goto retry;
1731 }
1732
1733 err = -EIO;
1734 if (transferred > inarg.out_size)
1735 goto out;
1736
1737 err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true);
1738 out:
1739 if (req)
1740 fuse_put_request(fc, req);
1741 if (iov_page)
1742 __free_page(iov_page);
1743 while (num_pages)
1744 __free_page(pages[--num_pages]);
1745 kfree(pages);
1746
1747 return err ? err : outarg.result;
1748}
1749
1750static long fuse_file_ioctl(struct file *file, unsigned int cmd,
1751 unsigned long arg)
1752{
1753 return fuse_file_do_ioctl(file, cmd, arg, 0);
1754}
1755
1756static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
1757 unsigned long arg)
1758{
1759 return fuse_file_do_ioctl(file, cmd, arg, FUSE_IOCTL_COMPAT);
1760}
1761
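To make the retry protocol described in the comment above fuse_file_do_ioctl() concrete, here is a sketch of the matching server-side logic for the "struct a" example from that comment. The reply_retry()/reply_done() helpers and the opaque req handle are hypothetical placeholders, not a real FUSE library API; only FUSE_IOCTL_RETRY and the iovec layout come from the protocol itself:

    #include <stddef.h>
    #include <sys/uio.h>

    /* Hypothetical reply helpers -- placeholders for whatever the
     * server framework provides, NOT a real library API. */
    extern void reply_retry(void *req,
                            const struct iovec *in_iov, int in_iovs,
                            const struct iovec *out_iov, int out_iovs);
    extern void reply_done(void *req, int result);

    struct a {
            char   *buf;
            size_t  buflen;
    };

    static void handle_ioctl(void *req, unsigned long arg,
                             const void *in_buf, size_t in_size)
    {
            if (in_size == 0) {
                    /* Round 1: nothing copied in yet; ask the kernel
                     * to copy in struct a and retry. */
                    struct iovec iov = {
                            .iov_base = (void *)arg,
                            .iov_len  = sizeof(struct a),
                    };
                    reply_retry(req, &iov, 1, NULL, 0);
            } else if (in_size == sizeof(struct a)) {
                    /* Round 2: struct a arrived; also request the
                     * buffer it points to. */
                    const struct a *a = in_buf;
                    struct iovec iov[2] = {
                            { .iov_base = (void *)arg, .iov_len = sizeof(*a) },
                            { .iov_base = a->buf,      .iov_len = a->buflen },
                    };
                    reply_retry(req, iov, 2, NULL, 0);
            } else {
                    /* Round 3: struct a and its buffer are both in
                     * in_buf; do the work and complete without
                     * FUSE_IOCTL_RETRY, which ends the ioctl. */
                    reply_done(req, 0);
            }
    }

After the final round, the kernel copies any out iovecs back to the calling process (fuse_ioctl_copy_user() with to_user set) and the ioctl returns outarg.result.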
1762/*
1763 * All files which have been polled are linked to RB tree
1764 * fuse_conn->polled_files which is indexed by kh. Walk the tree and
1765 * find the matching one.
1766 */
1767static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh,
1768 struct rb_node **parent_out)
1769{
1770 struct rb_node **link = &fc->polled_files.rb_node;
1771 struct rb_node *last = NULL;
1772
1773 while (*link) {
1774 struct fuse_file *ff;
1775
1776 last = *link;
1777 ff = rb_entry(last, struct fuse_file, polled_node);
1778
1779 if (kh < ff->kh)
1780 link = &last->rb_left;
1781 else if (kh > ff->kh)
1782 link = &last->rb_right;
1783 else
1784 return link;
1785 }
1786
1787 if (parent_out)
1788 *parent_out = last;
1789 return link;
1790}
1791
1792/*
1793 * The file is about to be polled. Make sure it's on the polled_files
1794 * RB tree. Note that files once added to the polled_files tree are
1795 * not removed before the file is released. This is because a file
1796 * polled once is likely to be polled again.
1797 */
1798static void fuse_register_polled_file(struct fuse_conn *fc,
1799 struct fuse_file *ff)
1800{
1801 spin_lock(&fc->lock);
1802 if (RB_EMPTY_NODE(&ff->polled_node)) {
1803 struct rb_node **link, *parent;
1804
1805 link = fuse_find_polled_node(fc, ff->kh, &parent);
1806 BUG_ON(*link);
1807 rb_link_node(&ff->polled_node, parent, link);
1808 rb_insert_color(&ff->polled_node, &fc->polled_files);
1809 }
1810 spin_unlock(&fc->lock);
1811}
1812
1813static unsigned fuse_file_poll(struct file *file, poll_table *wait)
1814{
1815 struct inode *inode = file->f_dentry->d_inode;
1816 struct fuse_file *ff = file->private_data;
1817 struct fuse_conn *fc = get_fuse_conn(inode);
1818 struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
1819 struct fuse_poll_out outarg;
1820 struct fuse_req *req;
1821 int err;
1822
1823 if (fc->no_poll)
1824 return DEFAULT_POLLMASK;
1825
1826 poll_wait(file, &ff->poll_wait, wait);
1827
1828 /*
1829 * Ask for notification iff there's someone waiting for it.
1830 * The client may ignore the flag and always notify.
1831 */
1832 if (waitqueue_active(&ff->poll_wait)) {
1833 inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
1834 fuse_register_polled_file(fc, ff);
1835 }
1836
1837 req = fuse_get_req(fc);
1838 if (IS_ERR(req))
1839 return PTR_ERR(req);
1840
1841 req->in.h.opcode = FUSE_POLL;
1842 req->in.h.nodeid = get_node_id(inode);
1843 req->in.numargs = 1;
1844 req->in.args[0].size = sizeof(inarg);
1845 req->in.args[0].value = &inarg;
1846 req->out.numargs = 1;
1847 req->out.args[0].size = sizeof(outarg);
1848 req->out.args[0].value = &outarg;
1849 fuse_request_send(fc, req);
1850 err = req->out.h.error;
1851 fuse_put_request(fc, req);
1852
1853 if (!err)
1854 return outarg.revents;
1855 if (err == -ENOSYS) {
1856 fc->no_poll = 1;
1857 return DEFAULT_POLLMASK;
1858 }
1859 return POLLERR;
1860}
1861
1862/*
1863 * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
1864 * wakes up the poll waiters.
1865 */
1866int fuse_notify_poll_wakeup(struct fuse_conn *fc,
1867 struct fuse_notify_poll_wakeup_out *outarg)
1868{
1869 u64 kh = outarg->kh;
1870 struct rb_node **link;
1871
1872 spin_lock(&fc->lock);
1873
1874 link = fuse_find_polled_node(fc, kh, NULL);
1875 if (*link) {
1876 struct fuse_file *ff;
1877
1878 ff = rb_entry(*link, struct fuse_file, polled_node);
1879 wake_up_interruptible_sync(&ff->poll_wait);
1880 }
1881
1882 spin_unlock(&fc->lock);
1883 return 0;
1884}
1885
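fuse_notify_poll_wakeup() above is the receiving end of a notification the server writes to /dev/fuse. A sketch of that server side, assuming the FUSE device convention that a message written with unique == 0 is a notification whose code travels in the error field (the parsing lives in fs/fuse/dev.c, outside this hunk); kh is the kernel handle saved from an earlier FUSE_POLL request that had FUSE_POLL_SCHEDULE_NOTIFY set:

    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>
    #include <linux/fuse.h>

    static int notify_poll_wakeup(int fuse_fd, uint64_t kh)
    {
            struct fuse_out_header oh;
            struct fuse_notify_poll_wakeup_out arg;
            char buf[sizeof(oh) + sizeof(arg)];

            memset(&oh, 0, sizeof(oh));
            memset(&arg, 0, sizeof(arg));
            oh.len    = sizeof(buf);
            oh.unique = 0;                 /* unique == 0: a notification */
            oh.error  = FUSE_NOTIFY_POLL;  /* notify code, not an errno */
            arg.kh    = kh;

            memcpy(buf, &oh, sizeof(oh));
            memcpy(buf + sizeof(oh), &arg, sizeof(arg));
            return write(fuse_fd, buf, sizeof(buf)) == (ssize_t)sizeof(buf)
                    ? 0 : -1;
    }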
1473static const struct file_operations fuse_file_operations = { 1886static const struct file_operations fuse_file_operations = {
1474 .llseek = fuse_file_llseek, 1887 .llseek = fuse_file_llseek,
1475 .read = do_sync_read, 1888 .read = do_sync_read,
@@ -1484,6 +1897,9 @@ static const struct file_operations fuse_file_operations = {
1484 .lock = fuse_file_lock, 1897 .lock = fuse_file_lock,
1485 .flock = fuse_file_flock, 1898 .flock = fuse_file_flock,
1486 .splice_read = generic_file_splice_read, 1899 .splice_read = generic_file_splice_read,
1900 .unlocked_ioctl = fuse_file_ioctl,
1901 .compat_ioctl = fuse_file_compat_ioctl,
1902 .poll = fuse_file_poll,
1487}; 1903};
1488 1904
1489static const struct file_operations fuse_direct_io_file_operations = { 1905static const struct file_operations fuse_direct_io_file_operations = {
@@ -1496,6 +1912,9 @@ static const struct file_operations fuse_direct_io_file_operations = {
1496 .fsync = fuse_fsync, 1912 .fsync = fuse_fsync,
1497 .lock = fuse_file_lock, 1913 .lock = fuse_file_lock,
1498 .flock = fuse_file_flock, 1914 .flock = fuse_file_flock,
1915 .unlocked_ioctl = fuse_file_ioctl,
1916 .compat_ioctl = fuse_file_compat_ioctl,
1917 .poll = fuse_file_poll,
1499 /* no mmap and splice_read */ 1918 /* no mmap and splice_read */
1500}; 1919};
1501 1920
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 35accfdd747f..5e64b815a5a1 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1,6 +1,6 @@
1/* 1/*
2 FUSE: Filesystem in Userspace 2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> 3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
4 4
5 This program can be distributed under the terms of the GNU GPL. 5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING. 6 See the file COPYING.
@@ -19,6 +19,8 @@
19#include <linux/backing-dev.h> 19#include <linux/backing-dev.h>
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/rwsem.h> 21#include <linux/rwsem.h>
22#include <linux/rbtree.h>
23#include <linux/poll.h>
22 24
23/** Max number of pages that can be used in a single read request */ 25/** Max number of pages that can be used in a single read request */
24#define FUSE_MAX_PAGES_PER_REQ 32 26#define FUSE_MAX_PAGES_PER_REQ 32
@@ -100,6 +102,9 @@ struct fuse_file {
100 /** Request reserved for flush and release */ 102 /** Request reserved for flush and release */
101 struct fuse_req *reserved_req; 103 struct fuse_req *reserved_req;
102 104
105 /** Kernel file handle guaranteed to be unique */
106 u64 kh;
107
103 /** File handle used by userspace */ 108 /** File handle used by userspace */
104 u64 fh; 109 u64 fh;
105 110
@@ -108,6 +113,12 @@ struct fuse_file {
108 113
109 /** Entry on inode's write_files list */ 114 /** Entry on inode's write_files list */
110 struct list_head write_entry; 115 struct list_head write_entry;
116
117 /** RB node to be linked on fuse_conn->polled_files */
118 struct rb_node polled_node;
119
120 /** Wait queue head for poll */
121 wait_queue_head_t poll_wait;
111}; 122};
112 123
113/** One input argument of a request */ 124/** One input argument of a request */
@@ -322,6 +333,12 @@ struct fuse_conn {
322 /** The list of requests under I/O */ 333 /** The list of requests under I/O */
323 struct list_head io; 334 struct list_head io;
324 335
336 /** The next unique kernel file handle */
337 u64 khctr;
338
339 /** rbtree of fuse_files waiting for poll events indexed by kh */
340 struct rb_root polled_files;
341
325 /** Number of requests currently in the background */ 342 /** Number of requests currently in the background */
326 unsigned num_background; 343 unsigned num_background;
327 344
@@ -355,19 +372,19 @@ struct fuse_conn {
355 /** Connection failed (version mismatch). Cannot race with 372 /** Connection failed (version mismatch). Cannot race with
356 setting other bitfields since it is only set once in INIT 373 setting other bitfields since it is only set once in INIT
357 reply, before any other request, and never cleared */ 374 reply, before any other request, and never cleared */
358 unsigned conn_error : 1; 375 unsigned conn_error:1;
359 376
360 /** Connection successful. Only set in INIT */ 377 /** Connection successful. Only set in INIT */
361 unsigned conn_init : 1; 378 unsigned conn_init:1;
362 379
363 /** Do readpages asynchronously? Only set in INIT */ 380 /** Do readpages asynchronously? Only set in INIT */
364 unsigned async_read : 1; 381 unsigned async_read:1;
365 382
366 /** Do not send separate SETATTR request before open(O_TRUNC) */ 383 /** Do not send separate SETATTR request before open(O_TRUNC) */
367 unsigned atomic_o_trunc : 1; 384 unsigned atomic_o_trunc:1;
368 385
369 /** Filesystem supports NFS exporting. Only set in INIT */ 386 /** Filesystem supports NFS exporting. Only set in INIT */
370 unsigned export_support : 1; 387 unsigned export_support:1;
371 388
372 /* 389 /*
373 * The following bitfields are only for optimization purposes 390 * The following bitfields are only for optimization purposes
@@ -375,43 +392,46 @@ struct fuse_conn {
375 */ 392 */
376 393
377 /** Is fsync not implemented by fs? */ 394 /** Is fsync not implemented by fs? */
378 unsigned no_fsync : 1; 395 unsigned no_fsync:1;
379 396
380 /** Is fsyncdir not implemented by fs? */ 397 /** Is fsyncdir not implemented by fs? */
381 unsigned no_fsyncdir : 1; 398 unsigned no_fsyncdir:1;
382 399
383 /** Is flush not implemented by fs? */ 400 /** Is flush not implemented by fs? */
384 unsigned no_flush : 1; 401 unsigned no_flush:1;
385 402
386 /** Is setxattr not implemented by fs? */ 403 /** Is setxattr not implemented by fs? */
387 unsigned no_setxattr : 1; 404 unsigned no_setxattr:1;
388 405
389 /** Is getxattr not implemented by fs? */ 406 /** Is getxattr not implemented by fs? */
390 unsigned no_getxattr : 1; 407 unsigned no_getxattr:1;
391 408
392 /** Is listxattr not implemented by fs? */ 409 /** Is listxattr not implemented by fs? */
393 unsigned no_listxattr : 1; 410 unsigned no_listxattr:1;
394 411
395 /** Is removexattr not implemented by fs? */ 412 /** Is removexattr not implemented by fs? */
396 unsigned no_removexattr : 1; 413 unsigned no_removexattr:1;
397 414
398 /** Are file locking primitives not implemented by fs? */ 415 /** Are file locking primitives not implemented by fs? */
399 unsigned no_lock : 1; 416 unsigned no_lock:1;
400 417
401 /** Is access not implemented by fs? */ 418 /** Is access not implemented by fs? */
402 unsigned no_access : 1; 419 unsigned no_access:1;
403 420
404 /** Is create not implemented by fs? */ 421 /** Is create not implemented by fs? */
405 unsigned no_create : 1; 422 unsigned no_create:1;
406 423
407 /** Is interrupt not implemented by fs? */ 424 /** Is interrupt not implemented by fs? */
408 unsigned no_interrupt : 1; 425 unsigned no_interrupt:1;
409 426
410 /** Is bmap not implemented by fs? */ 427 /** Is bmap not implemented by fs? */
411 unsigned no_bmap : 1; 428 unsigned no_bmap:1;
429
430 /** Is poll not implemented by fs? */
431 unsigned no_poll:1;
412 432
413 /** Do multi-page cached writes */ 433 /** Do multi-page cached writes */
414 unsigned big_writes : 1; 434 unsigned big_writes:1;
415 435
416 /** The number of requests waiting for completion */ 436 /** The number of requests waiting for completion */
417 atomic_t num_waiting; 437 atomic_t num_waiting;
@@ -445,6 +465,9 @@ struct fuse_conn {
445 465
446 /** Version counter for attribute changes */ 466 /** Version counter for attribute changes */
447 u64 attr_version; 467 u64 attr_version;
468
469 /** Called on final put */
470 void (*release)(struct fuse_conn *);
448}; 471};
449 472
450static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) 473static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
@@ -499,7 +522,7 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
499 */ 522 */
500int fuse_open_common(struct inode *inode, struct file *file, int isdir); 523int fuse_open_common(struct inode *inode, struct file *file, int isdir);
501 524
502struct fuse_file *fuse_file_alloc(void); 525struct fuse_file *fuse_file_alloc(struct fuse_conn *fc);
503void fuse_file_free(struct fuse_file *ff); 526void fuse_file_free(struct fuse_file *ff);
504void fuse_finish_open(struct inode *inode, struct file *file, 527void fuse_finish_open(struct inode *inode, struct file *file,
505 struct fuse_file *ff, struct fuse_open_out *outarg); 528 struct fuse_file *ff, struct fuse_open_out *outarg);
@@ -519,6 +542,12 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
519 int isdir); 542 int isdir);
520 543
521/** 544/**
545 * Notify poll wakeup
546 */
547int fuse_notify_poll_wakeup(struct fuse_conn *fc,
548 struct fuse_notify_poll_wakeup_out *outarg);
549
550/**
522 * Initialize file operations on a regular file 551 * Initialize file operations on a regular file
523 */ 552 */
524void fuse_init_file_inode(struct inode *inode); 553void fuse_init_file_inode(struct inode *inode);
@@ -593,19 +622,20 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
593/** 622/**
594 * Send a request (synchronous) 623 * Send a request (synchronous)
595 */ 624 */
596void request_send(struct fuse_conn *fc, struct fuse_req *req); 625void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req);
597 626
598/** 627/**
599 * Send a request with no reply 628 * Send a request with no reply
600 */ 629 */
601void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req); 630void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
602 631
603/** 632/**
604 * Send a request in the background 633 * Send a request in the background
605 */ 634 */
606void request_send_background(struct fuse_conn *fc, struct fuse_req *req); 635void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
607 636
608void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req); 637void fuse_request_send_background_locked(struct fuse_conn *fc,
638 struct fuse_req *req);
609 639
610/* Abort all requests */ 640/* Abort all requests */
611void fuse_abort_conn(struct fuse_conn *fc); 641void fuse_abort_conn(struct fuse_conn *fc);
@@ -623,6 +653,11 @@ void fuse_invalidate_entry_cache(struct dentry *entry);
623struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); 653struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
624 654
625/** 655/**
656 * Initialize fuse_conn
657 */
658int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb);
659
660/**
626 * Release reference to fuse_conn 661 * Release reference to fuse_conn
627 */ 662 */
628void fuse_conn_put(struct fuse_conn *fc); 663void fuse_conn_put(struct fuse_conn *fc);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2e99f34b4435..47c96fdca1ac 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1,6 +1,6 @@
1/* 1/*
2 FUSE: Filesystem in Userspace 2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> 3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
4 4
5 This program can be distributed under the terms of the GNU GPL. 5 This program can be distributed under the terms of the GNU GPL.
6 See the file COPYING. 6 See the file COPYING.
@@ -37,10 +37,10 @@ struct fuse_mount_data {
37 unsigned rootmode; 37 unsigned rootmode;
38 unsigned user_id; 38 unsigned user_id;
39 unsigned group_id; 39 unsigned group_id;
40 unsigned fd_present : 1; 40 unsigned fd_present:1;
41 unsigned rootmode_present : 1; 41 unsigned rootmode_present:1;
42 unsigned user_id_present : 1; 42 unsigned user_id_present:1;
43 unsigned group_id_present : 1; 43 unsigned group_id_present:1;
44 unsigned flags; 44 unsigned flags;
45 unsigned max_read; 45 unsigned max_read;
46 unsigned blksize; 46 unsigned blksize;
@@ -94,7 +94,7 @@ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
94 req->in.numargs = 1; 94 req->in.numargs = 1;
95 req->in.args[0].size = sizeof(struct fuse_forget_in); 95 req->in.args[0].size = sizeof(struct fuse_forget_in);
96 req->in.args[0].value = inarg; 96 req->in.args[0].value = inarg;
97 request_send_noreply(fc, req); 97 fuse_request_send_noreply(fc, req);
98} 98}
99 99
100static void fuse_clear_inode(struct inode *inode) 100static void fuse_clear_inode(struct inode *inode)
@@ -250,7 +250,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
250 250
251 fi = get_fuse_inode(inode); 251 fi = get_fuse_inode(inode);
252 spin_lock(&fc->lock); 252 spin_lock(&fc->lock);
253 fi->nlookup ++; 253 fi->nlookup++;
254 spin_unlock(&fc->lock); 254 spin_unlock(&fc->lock);
255 fuse_change_attributes(inode, attr, attr_valid, attr_version); 255 fuse_change_attributes(inode, attr, attr_valid, attr_version);
256 256
@@ -269,7 +269,7 @@ static void fuse_send_destroy(struct fuse_conn *fc)
269 fc->destroy_req = NULL; 269 fc->destroy_req = NULL;
270 req->in.h.opcode = FUSE_DESTROY; 270 req->in.h.opcode = FUSE_DESTROY;
271 req->force = 1; 271 req->force = 1;
272 request_send(fc, req); 272 fuse_request_send(fc, req);
273 fuse_put_request(fc, req); 273 fuse_put_request(fc, req);
274 } 274 }
275} 275}
@@ -334,7 +334,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
334 req->out.args[0].size = 334 req->out.args[0].size =
335 fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg); 335 fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg);
336 req->out.args[0].value = &outarg; 336 req->out.args[0].value = &outarg;
337 request_send(fc, req); 337 fuse_request_send(fc, req);
338 err = req->out.h.error; 338 err = req->out.h.error;
339 if (!err) 339 if (!err)
340 convert_fuse_statfs(buf, &outarg.st); 340 convert_fuse_statfs(buf, &outarg.st);
@@ -462,68 +462,69 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
462 return 0; 462 return 0;
463} 463}
464 464
465static struct fuse_conn *new_conn(struct super_block *sb) 465int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb)
466{ 466{
467 struct fuse_conn *fc;
468 int err; 467 int err;
469 468
470 fc = kzalloc(sizeof(*fc), GFP_KERNEL); 469 memset(fc, 0, sizeof(*fc));
471 if (fc) { 470 spin_lock_init(&fc->lock);
472 spin_lock_init(&fc->lock); 471 mutex_init(&fc->inst_mutex);
473 mutex_init(&fc->inst_mutex); 472 atomic_set(&fc->count, 1);
474 atomic_set(&fc->count, 1); 473 init_waitqueue_head(&fc->waitq);
475 init_waitqueue_head(&fc->waitq); 474 init_waitqueue_head(&fc->blocked_waitq);
476 init_waitqueue_head(&fc->blocked_waitq); 475 init_waitqueue_head(&fc->reserved_req_waitq);
477 init_waitqueue_head(&fc->reserved_req_waitq); 476 INIT_LIST_HEAD(&fc->pending);
478 INIT_LIST_HEAD(&fc->pending); 477 INIT_LIST_HEAD(&fc->processing);
479 INIT_LIST_HEAD(&fc->processing); 478 INIT_LIST_HEAD(&fc->io);
480 INIT_LIST_HEAD(&fc->io); 479 INIT_LIST_HEAD(&fc->interrupts);
481 INIT_LIST_HEAD(&fc->interrupts); 480 INIT_LIST_HEAD(&fc->bg_queue);
482 INIT_LIST_HEAD(&fc->bg_queue); 481 INIT_LIST_HEAD(&fc->entry);
483 atomic_set(&fc->num_waiting, 0); 482 atomic_set(&fc->num_waiting, 0);
484 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 483 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
485 fc->bdi.unplug_io_fn = default_unplug_io_fn; 484 fc->bdi.unplug_io_fn = default_unplug_io_fn;
486 /* fuse does its own writeback accounting */ 485 /* fuse does its own writeback accounting */
487 fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; 486 fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
488 fc->dev = sb->s_dev; 487 fc->khctr = 0;
489 err = bdi_init(&fc->bdi); 488 fc->polled_files = RB_ROOT;
490 if (err) 489 fc->dev = sb->s_dev;
491 goto error_kfree; 490 err = bdi_init(&fc->bdi);
492 if (sb->s_bdev) { 491 if (err)
493 err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk", 492 goto error_mutex_destroy;
494 MAJOR(fc->dev), MINOR(fc->dev)); 493 if (sb->s_bdev) {
495 } else { 494 err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
496 err = bdi_register_dev(&fc->bdi, fc->dev); 495 MAJOR(fc->dev), MINOR(fc->dev));
497 } 496 } else {
498 if (err) 497 err = bdi_register_dev(&fc->bdi, fc->dev);
499 goto error_bdi_destroy;
500 /*
501 * For a single fuse filesystem use max 1% of dirty +
502 * writeback threshold.
503 *
504 * This gives about 1M of write buffer for memory maps on a
505 * machine with 1G and 10% dirty_ratio, which should be more
506 * than enough.
507 *
508 * Privileged users can raise it by writing to
509 *
510 * /sys/class/bdi/<bdi>/max_ratio
511 */
512 bdi_set_max_ratio(&fc->bdi, 1);
513 fc->reqctr = 0;
514 fc->blocked = 1;
515 fc->attr_version = 1;
516 get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
517 } 498 }
518 return fc; 499 if (err)
500 goto error_bdi_destroy;
501 /*
502 * For a single fuse filesystem use max 1% of dirty +
503 * writeback threshold.
504 *
505 * This gives about 1M of write buffer for memory maps on a
506 * machine with 1G and 10% dirty_ratio, which should be more
507 * than enough.
508 *
509 * Privileged users can raise it by writing to
510 *
511 * /sys/class/bdi/<bdi>/max_ratio
512 */
513 bdi_set_max_ratio(&fc->bdi, 1);
514 fc->reqctr = 0;
515 fc->blocked = 1;
516 fc->attr_version = 1;
517 get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
519 518
520error_bdi_destroy: 519 return 0;
520
521 error_bdi_destroy:
521 bdi_destroy(&fc->bdi); 522 bdi_destroy(&fc->bdi);
522error_kfree: 523 error_mutex_destroy:
523 mutex_destroy(&fc->inst_mutex); 524 mutex_destroy(&fc->inst_mutex);
524 kfree(fc); 525 return err;
525 return NULL;
526} 526}
527EXPORT_SYMBOL_GPL(fuse_conn_init);
527 528
528void fuse_conn_put(struct fuse_conn *fc) 529void fuse_conn_put(struct fuse_conn *fc)
529{ 530{
@@ -532,7 +533,7 @@ void fuse_conn_put(struct fuse_conn *fc)
532 fuse_request_free(fc->destroy_req); 533 fuse_request_free(fc->destroy_req);
533 mutex_destroy(&fc->inst_mutex); 534 mutex_destroy(&fc->inst_mutex);
534 bdi_destroy(&fc->bdi); 535 bdi_destroy(&fc->bdi);
535 kfree(fc); 536 fc->release(fc);
536 } 537 }
537} 538}
538 539
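Together with the EXPORT_SYMBOL_GPL() below, the fuse_conn_init()/->release() split lets a caller embed struct fuse_conn in a larger object and free the whole thing on the final put, where the old new_conn() could only kzalloc()/kfree() a bare connection. A minimal sketch of such an embedding; struct my_conn and its helpers are hypothetical:

    struct my_conn {
            struct fuse_conn fc;    /* must be freed as part of my_conn */
            /* ... caller-private state ... */
    };

    static void my_conn_release(struct fuse_conn *fc)
    {
            /* Called from fuse_conn_put() on the final reference. */
            kfree(container_of(fc, struct my_conn, fc));
    }

    static struct my_conn *my_conn_alloc(struct super_block *sb)
    {
            struct my_conn *mc = kzalloc(sizeof(*mc), GFP_KERNEL);

            if (!mc)
                    return NULL;
            if (fuse_conn_init(&mc->fc, sb)) {
                    kfree(mc);
                    return NULL;
            }
            mc->fc.release = my_conn_release;
            return mc;
    }

fuse_fill_super() below is the degenerate case of the same pattern: it kmallocs a bare fuse_conn and sets fc->release to fuse_free_conn(), which is plain kfree().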
@@ -542,7 +543,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
542 return fc; 543 return fc;
543} 544}
544 545
545static struct inode *get_root_inode(struct super_block *sb, unsigned mode) 546static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
546{ 547{
547 struct fuse_attr attr; 548 struct fuse_attr attr;
548 memset(&attr, 0, sizeof(attr)); 549 memset(&attr, 0, sizeof(attr));
@@ -553,8 +554,7 @@ static struct inode *get_root_inode(struct super_block *sb, unsigned mode)
553 return fuse_iget(sb, 1, 0, &attr, 0, 0); 554 return fuse_iget(sb, 1, 0, &attr, 0, 0);
554} 555}
555 556
556struct fuse_inode_handle 557struct fuse_inode_handle {
557{
558 u64 nodeid; 558 u64 nodeid;
559 u32 generation; 559 u32 generation;
560}; 560};
@@ -761,7 +761,6 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
761 fc->max_write = max_t(unsigned, 4096, fc->max_write); 761 fc->max_write = max_t(unsigned, 4096, fc->max_write);
762 fc->conn_init = 1; 762 fc->conn_init = 1;
763 } 763 }
764 fuse_put_request(fc, req);
765 fc->blocked = 0; 764 fc->blocked = 0;
766 wake_up_all(&fc->blocked_waitq); 765 wake_up_all(&fc->blocked_waitq);
767} 766}
@@ -787,7 +786,12 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
787 req->out.args[0].size = sizeof(struct fuse_init_out); 786 req->out.args[0].size = sizeof(struct fuse_init_out);
788 req->out.args[0].value = &req->misc.init_out; 787 req->out.args[0].value = &req->misc.init_out;
789 req->end = process_init_reply; 788 req->end = process_init_reply;
790 request_send_background(fc, req); 789 fuse_request_send_background(fc, req);
790}
791
792static void fuse_free_conn(struct fuse_conn *fc)
793{
794 kfree(fc);
791} 795}
792 796
793static int fuse_fill_super(struct super_block *sb, void *data, int silent) 797static int fuse_fill_super(struct super_block *sb, void *data, int silent)
@@ -828,10 +832,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
828 if (file->f_op != &fuse_dev_operations) 832 if (file->f_op != &fuse_dev_operations)
829 return -EINVAL; 833 return -EINVAL;
830 834
831 fc = new_conn(sb); 835 fc = kmalloc(sizeof(*fc), GFP_KERNEL);
832 if (!fc) 836 if (!fc)
833 return -ENOMEM; 837 return -ENOMEM;
834 838
839 err = fuse_conn_init(fc, sb);
840 if (err) {
841 kfree(fc);
842 return err;
843 }
844
845 fc->release = fuse_free_conn;
835 fc->flags = d.flags; 846 fc->flags = d.flags;
836 fc->user_id = d.user_id; 847 fc->user_id = d.user_id;
837 fc->group_id = d.group_id; 848 fc->group_id = d.group_id;
@@ -841,7 +852,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
841 sb->s_fs_info = fc; 852 sb->s_fs_info = fc;
842 853
843 err = -ENOMEM; 854 err = -ENOMEM;
844 root = get_root_inode(sb, d.rootmode); 855 root = fuse_get_root_inode(sb, d.rootmode);
845 if (!root) 856 if (!root)
846 goto err; 857 goto err;
847 858
@@ -952,7 +963,7 @@ static inline void unregister_fuseblk(void)
952 963
953static void fuse_inode_init_once(void *foo) 964static void fuse_inode_init_once(void *foo)
954{ 965{
955 struct inode * inode = foo; 966 struct inode *inode = foo;
956 967
957 inode_init_once(inode); 968 inode_init_once(inode);
958} 969}
@@ -1031,7 +1042,7 @@ static int __init fuse_init(void)
1031{ 1042{
1032 int res; 1043 int res;
1033 1044
1034 printk("fuse init (API version %i.%i)\n", 1045 printk(KERN_INFO "fuse init (API version %i.%i)\n",
1035 FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); 1046 FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);
1036 1047
1037 INIT_LIST_HEAD(&fuse_conn_list); 1048 INIT_LIST_HEAD(&fuse_conn_list);
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index ab2f57e3fb87..e563a6449811 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,6 @@
1config GFS2_FS 1config GFS2_FS
2 tristate "GFS2 file system support" 2 tristate "GFS2 file system support"
3 depends on EXPERIMENTAL && (64BIT || (LSF && LBD)) 3 depends on EXPERIMENTAL && (64BIT || LBD)
4 select FS_POSIX_ACL 4 select FS_POSIX_ACL
5 select CRC32 5 select CRC32
6 help 6 help
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index ec65851ec80a..c1b4ec6a9650 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,5 +1,5 @@
1obj-$(CONFIG_GFS2_FS) += gfs2.o 1obj-$(CONFIG_GFS2_FS) += gfs2.o
2gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \ 2gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \
3 glops.o inode.o log.o lops.o locking.o main.o meta_io.o \ 3 glops.o inode.o log.o lops.o locking.o main.o meta_io.o \
4 mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ 4 mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
5 ops_fstype.o ops_inode.o ops_super.o quota.o \ 5 ops_fstype.o ops_inode.o ops_super.o quota.o \
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 3e9bd46f27e3..e335dceb6a4f 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -91,7 +91,7 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
91 struct gfs2_ea_location el_this; 91 struct gfs2_ea_location el_this;
92 int error; 92 int error;
93 93
94 if (!ip->i_di.di_eattr) 94 if (!ip->i_eattr)
95 return 0; 95 return 0;
96 96
97 memset(&er, 0, sizeof(struct gfs2_ea_request)); 97 memset(&er, 0, sizeof(struct gfs2_ea_request));
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index bec76b1c2bb0..11ffc56f1f81 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -75,9 +75,9 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
75 void *kaddr = kmap(page); 75 void *kaddr = kmap(page);
76 76
77 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), 77 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
78 ip->i_di.di_size); 78 ip->i_disksize);
79 memset(kaddr + ip->i_di.di_size, 0, 79 memset(kaddr + ip->i_disksize, 0,
80 PAGE_CACHE_SIZE - ip->i_di.di_size); 80 PAGE_CACHE_SIZE - ip->i_disksize);
81 kunmap(page); 81 kunmap(page);
82 82
83 SetPageUptodate(page); 83 SetPageUptodate(page);
@@ -132,7 +132,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
132 if (error) 132 if (error)
133 goto out; 133 goto out;
134 134
135 if (ip->i_di.di_size) { 135 if (ip->i_disksize) {
136 /* Get a free block, fill it with the stuffed data, 136 /* Get a free block, fill it with the stuffed data,
137 and write it out to disk */ 137 and write it out to disk */
138 138
@@ -159,7 +159,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
159 di = (struct gfs2_dinode *)dibh->b_data; 159 di = (struct gfs2_dinode *)dibh->b_data;
160 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 160 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
161 161
162 if (ip->i_di.di_size) { 162 if (ip->i_disksize) {
163 *(__be64 *)(di + 1) = cpu_to_be64(block); 163 *(__be64 *)(di + 1) = cpu_to_be64(block);
164 gfs2_add_inode_blocks(&ip->i_inode, 1); 164 gfs2_add_inode_blocks(&ip->i_inode, 1);
165 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); 165 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
@@ -926,7 +926,7 @@ static int do_grow(struct gfs2_inode *ip, u64 size)
926 } 926 }
927 } 927 }
928 928
929 ip->i_di.di_size = size; 929 ip->i_disksize = size;
930 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 930 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
931 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 931 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
932 gfs2_dinode_out(ip, dibh->b_data); 932 gfs2_dinode_out(ip, dibh->b_data);
@@ -1033,7 +1033,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
1033 goto out; 1033 goto out;
1034 1034
1035 if (gfs2_is_stuffed(ip)) { 1035 if (gfs2_is_stuffed(ip)) {
1036 ip->i_di.di_size = size; 1036 ip->i_disksize = size;
1037 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1037 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1038 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1038 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1039 gfs2_dinode_out(ip, dibh->b_data); 1039 gfs2_dinode_out(ip, dibh->b_data);
@@ -1045,9 +1045,9 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
1045 error = gfs2_block_truncate_page(ip->i_inode.i_mapping); 1045 error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
1046 1046
1047 if (!error) { 1047 if (!error) {
1048 ip->i_di.di_size = size; 1048 ip->i_disksize = size;
1049 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1049 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1050 ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG; 1050 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1051 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1051 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1052 gfs2_dinode_out(ip, dibh->b_data); 1052 gfs2_dinode_out(ip, dibh->b_data);
1053 } 1053 }
@@ -1114,13 +1114,13 @@ static int trunc_end(struct gfs2_inode *ip)
1114 if (error) 1114 if (error)
1115 goto out; 1115 goto out;
1116 1116
1117 if (!ip->i_di.di_size) { 1117 if (!ip->i_disksize) {
1118 ip->i_height = 0; 1118 ip->i_height = 0;
1119 ip->i_goal = ip->i_no_addr; 1119 ip->i_goal = ip->i_no_addr;
1120 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 1120 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1121 } 1121 }
1122 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1122 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1123 ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG; 1123 ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
1124 1124
1125 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1125 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1126 gfs2_dinode_out(ip, dibh->b_data); 1126 gfs2_dinode_out(ip, dibh->b_data);
@@ -1205,9 +1205,9 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
1205 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode))) 1205 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode)))
1206 return -EINVAL; 1206 return -EINVAL;
1207 1207
1208 if (size > ip->i_di.di_size) 1208 if (size > ip->i_disksize)
1209 error = do_grow(ip, size); 1209 error = do_grow(ip, size);
1210 else if (size < ip->i_di.di_size) 1210 else if (size < ip->i_disksize)
1211 error = do_shrink(ip, size); 1211 error = do_shrink(ip, size);
1212 else 1212 else
1213 /* update time stamps */ 1213 /* update time stamps */
@@ -1219,7 +1219,7 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
1219int gfs2_truncatei_resume(struct gfs2_inode *ip) 1219int gfs2_truncatei_resume(struct gfs2_inode *ip)
1220{ 1220{
1221 int error; 1221 int error;
1222 error = trunc_dealloc(ip, ip->i_di.di_size); 1222 error = trunc_dealloc(ip, ip->i_disksize);
1223 if (!error) 1223 if (!error)
1224 error = trunc_end(ip); 1224 error = trunc_end(ip);
1225 return error; 1225 return error;
@@ -1231,35 +1231,6 @@ int gfs2_file_dealloc(struct gfs2_inode *ip)
1231} 1231}
1232 1232
1233/** 1233/**
1234 * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
1235 * @ip: the file
1236 * @len: the number of bytes to be written to the file
1237 * @data_blocks: returns the number of data blocks required
1238 * @ind_blocks: returns the number of indirect blocks required
1239 *
1240 */
1241
1242void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
1243 unsigned int *data_blocks, unsigned int *ind_blocks)
1244{
1245 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1246 unsigned int tmp;
1247
1248 if (gfs2_is_dir(ip)) {
1249 *data_blocks = DIV_ROUND_UP(len, sdp->sd_jbsize) + 2;
1250 *ind_blocks = 3 * (sdp->sd_max_jheight - 1);
1251 } else {
1252 *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
1253 *ind_blocks = 3 * (sdp->sd_max_height - 1);
1254 }
1255
1256 for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
1257 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
1258 *ind_blocks += tmp;
1259 }
1260}
1261
1262/**
1263 * gfs2_write_alloc_required - figure out if a write will require an allocation 1234 * gfs2_write_alloc_required - figure out if a write will require an allocation
1264 * @ip: the file being written to 1235 * @ip: the file being written to
1265 * @offset: the offset to write to 1236 * @offset: the offset to write to
@@ -1276,6 +1247,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1276 struct buffer_head bh; 1247 struct buffer_head bh;
1277 unsigned int shift; 1248 unsigned int shift;
1278 u64 lblock, lblock_stop, size; 1249 u64 lblock, lblock_stop, size;
1250 u64 end_of_file;
1279 1251
1280 *alloc_required = 0; 1252 *alloc_required = 0;
1281 1253
@@ -1291,19 +1263,12 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1291 1263
1292 *alloc_required = 1; 1264 *alloc_required = 1;
1293 shift = sdp->sd_sb.sb_bsize_shift; 1265 shift = sdp->sd_sb.sb_bsize_shift;
1294 if (gfs2_is_dir(ip)) { 1266 BUG_ON(gfs2_is_dir(ip));
1295 unsigned int bsize = sdp->sd_jbsize; 1267 end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift;
1296 lblock = offset; 1268 lblock = offset >> shift;
1297 do_div(lblock, bsize); 1269 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1298 lblock_stop = offset + len + bsize - 1; 1270 if (lblock_stop > end_of_file)
1299 do_div(lblock_stop, bsize); 1271 return 0;
1300 } else {
1301 u64 end_of_file = (ip->i_di.di_size + sdp->sd_sb.sb_bsize - 1) >> shift;
1302 lblock = offset >> shift;
1303 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1304 if (lblock_stop > end_of_file)
1305 return 0;
1306 }
1307 1272
1308 size = (lblock_stop - lblock) << shift; 1273 size = (lblock_stop - lblock) << shift;
1309 do { 1274 do {
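A worked example of the simplified, regular-file-only range check (all numbers illustrative): with a 4 KiB block size (sb_bsize_shift = 12), a 5000-byte write at offset 10000 into a file whose i_disksize is 1 MiB gives

    end_of_file = (1048576 + 4096 - 1) >> 12      = 256
    lblock      = 10000 >> 12                     = 2
    lblock_stop = (10000 + 5000 + 4096 - 1) >> 12 = 4

Since lblock_stop does not exceed end_of_file, the function falls through to the block-mapping loop over blocks 2..4; a write lying entirely beyond end_of_file returns early with *alloc_required already set to 1.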
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index 4e6cde2943bd..c983177e05ac 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -10,10 +10,40 @@
10#ifndef __BMAP_DOT_H__ 10#ifndef __BMAP_DOT_H__
11#define __BMAP_DOT_H__ 11#define __BMAP_DOT_H__
12 12
13#include "inode.h"
14
13struct inode; 15struct inode;
14struct gfs2_inode; 16struct gfs2_inode;
15struct page; 17struct page;
16 18
19
20/**
21 * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
22 * @ip: the file
23 * @len: the number of bytes to be written to the file
24 * @data_blocks: returns the number of data blocks required
25 * @ind_blocks: returns the number of indirect blocks required
26 *
27 */
28
29static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
30 unsigned int len,
31 unsigned int *data_blocks,
32 unsigned int *ind_blocks)
33{
34 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
35 unsigned int tmp;
36
37 BUG_ON(gfs2_is_dir(ip));
38 *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
39 *ind_blocks = 3 * (sdp->sd_max_height - 1);
40
41 for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
42 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
43 *ind_blocks += tmp;
44 }
45}
46
17int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); 47int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
18int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create); 48int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create);
19int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); 49int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
@@ -21,10 +51,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
21int gfs2_truncatei(struct gfs2_inode *ip, u64 size); 51int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
22int gfs2_truncatei_resume(struct gfs2_inode *ip); 52int gfs2_truncatei_resume(struct gfs2_inode *ip);
23int gfs2_file_dealloc(struct gfs2_inode *ip); 53int gfs2_file_dealloc(struct gfs2_inode *ip);
24
25void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
26 unsigned int *data_blocks,
27 unsigned int *ind_blocks);
28int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, 54int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
29 unsigned int len, int *alloc_required); 55 unsigned int len, int *alloc_required);
30 56
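The inline gfs2_write_calc_reserv() above sizes a worst-case reservation: one data block per block-sized chunk of the write plus three for boundaries, a fixed three indirect blocks per tree level, and then the loop climbs the metadata tree, adding one pointer block per sd_inptrs entries until everything fits in the dinode's sd_diptrs direct slots. A worked userspace example (the pointer counts are assumed figures, not real superblock values):

/* Userspace model of the reservation math moved inline into bmap.h. */
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int bsize_shift = 12;      /* 4096-byte blocks (assumed) */
	unsigned int sd_diptrs = 483;       /* direct pointers in a dinode (assumed) */
	unsigned int sd_inptrs = 509;       /* pointers per indirect block (assumed) */
	unsigned int sd_max_height = 10;    /* metadata tree height limit (assumed) */
	unsigned int len = 8 * 1024 * 1024; /* bytes to be written */

	unsigned int data_blocks = (len >> bsize_shift) + 3; /* +3 covers boundary blocks */
	unsigned int ind_blocks = 3 * (sd_max_height - 1);
	unsigned int tmp;

	for (tmp = data_blocks; tmp > sd_diptrs;) {
		tmp = DIV_ROUND_UP(tmp, sd_inptrs); /* pointer blocks one level up */
		ind_blocks += tmp;
	}
	printf("len=%u -> data_blocks=%u ind_blocks=%u\n", len, data_blocks, ind_blocks);
	return 0;
}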
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
deleted file mode 100644
index e51991947d2c..000000000000
--- a/fs/gfs2/daemon.c
+++ /dev/null
@@ -1,136 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/sched.h>
11#include <linux/slab.h>
12#include <linux/spinlock.h>
13#include <linux/completion.h>
14#include <linux/buffer_head.h>
15#include <linux/kthread.h>
16#include <linux/delay.h>
17#include <linux/gfs2_ondisk.h>
18#include <linux/lm_interface.h>
19#include <linux/freezer.h>
20
21#include "gfs2.h"
22#include "incore.h"
23#include "daemon.h"
24#include "glock.h"
25#include "log.h"
26#include "quota.h"
27#include "recovery.h"
28#include "super.h"
29#include "util.h"
30
31/* This uses schedule_timeout() instead of msleep() because it's good for
32 the daemons to wake up more often than the timeout when unmounting so
33 the user's unmount doesn't sit there forever.
34
35 The kthread functions used to start these daemons block and flush signals. */
36
37/**
38 * gfs2_glockd - Reclaim unused glock structures
39 * @sdp: Pointer to GFS2 superblock
40 *
41 * One or more of these daemons run, reclaiming glocks on sd_reclaim_list.
42 * Number of daemons can be set by user, with num_glockd mount option.
43 */
44
45int gfs2_glockd(void *data)
46{
47 struct gfs2_sbd *sdp = data;
48
49 while (!kthread_should_stop()) {
50 while (atomic_read(&sdp->sd_reclaim_count))
51 gfs2_reclaim_glock(sdp);
52
53 wait_event_interruptible(sdp->sd_reclaim_wq,
54 (atomic_read(&sdp->sd_reclaim_count) ||
55 kthread_should_stop()));
56 if (freezing(current))
57 refrigerator();
58 }
59
60 return 0;
61}
62
63/**
64 * gfs2_recoverd - Recover dead machine's journals
65 * @sdp: Pointer to GFS2 superblock
66 *
67 */
68
69int gfs2_recoverd(void *data)
70{
71 struct gfs2_sbd *sdp = data;
72 unsigned long t;
73
74 while (!kthread_should_stop()) {
75 gfs2_check_journals(sdp);
76 t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ;
77 if (freezing(current))
78 refrigerator();
79 schedule_timeout_interruptible(t);
80 }
81
82 return 0;
83}
84
85/**
86 * gfs2_quotad - Write cached quota changes into the quota file
87 * @sdp: Pointer to GFS2 superblock
88 *
89 */
90
91int gfs2_quotad(void *data)
92{
93 struct gfs2_sbd *sdp = data;
94 unsigned long t;
95 int error;
96
97 while (!kthread_should_stop()) {
98 /* Update the master statfs file */
99
100 t = sdp->sd_statfs_sync_time +
101 gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
102
103 if (time_after_eq(jiffies, t)) {
104 error = gfs2_statfs_sync(sdp);
105 if (error &&
106 error != -EROFS &&
107 !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
108 fs_err(sdp, "quotad: (1) error=%d\n", error);
109 sdp->sd_statfs_sync_time = jiffies;
110 }
111
112 /* Update quota file */
113
114 t = sdp->sd_quota_sync_time +
115 gfs2_tune_get(sdp, gt_quota_quantum) * HZ;
116
117 if (time_after_eq(jiffies, t)) {
118 error = gfs2_quota_sync(sdp);
119 if (error &&
120 error != -EROFS &&
121 !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
122 fs_err(sdp, "quotad: (2) error=%d\n", error);
123 sdp->sd_quota_sync_time = jiffies;
124 }
125
126 gfs2_quota_scan(sdp);
127
128 t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ;
129 if (freezing(current))
130 refrigerator();
131 schedule_timeout_interruptible(t);
132 }
133
134 return 0;
135}
136
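The deleted gfs2_quotad() shows the deadline pattern these daemons shared: keep the jiffies stamp of the last sync, recompute the next deadline as stamp + quantum * HZ, and fire when time_after_eq() says the deadline has passed; the signed subtraction inside time_after_eq() keeps the comparison correct across jiffies wraparound. A compact userspace model of that scheduling (HZ and the quanta are assumed example values):

/* Userspace model of the deadline pattern in the deleted gfs2_quotad(). */
#include <stdio.h>

#define HZ 100 /* assumed tick rate */
/* Wraparound-safe "a is at or past b" for free-running tick counters. */
#define time_after_eq(a, b) ((long)((a) - (b)) >= 0)

int main(void)
{
	unsigned long statfs_sync_time = 0, quota_sync_time = 0;
	const unsigned long statfs_quantum = 30, quota_quantum = 60; /* secs, assumed */
	unsigned long jiffies;

	for (jiffies = 0; jiffies <= 90 * HZ; jiffies += 5 * HZ) {
		if (time_after_eq(jiffies, statfs_sync_time + statfs_quantum * HZ)) {
			printf("t=%2lus: sync master statfs file\n", jiffies / HZ);
			statfs_sync_time = jiffies;
		}
		if (time_after_eq(jiffies, quota_sync_time + quota_quantum * HZ)) {
			printf("t=%2lus: sync quota file\n", jiffies / HZ);
			quota_sync_time = jiffies;
		}
	}
	return 0;
}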
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
deleted file mode 100644
index 4be084fb6a62..000000000000
--- a/fs/gfs2/daemon.h
+++ /dev/null
@@ -1,17 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __DAEMON_DOT_H__
11#define __DAEMON_DOT_H__
12
13int gfs2_glockd(void *data);
14int gfs2_recoverd(void *data);
15int gfs2_quotad(void *data);
16
17#endif /* __DAEMON_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index eed040d8ba3a..b7c8e5c70791 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -36,7 +36,7 @@
36 * the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the 36 * the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the
37 * beginning of the leaf block. The dirents reside in leaves when 37 * beginning of the leaf block. The dirents reside in leaves when
38 * 38 *
39 * dip->i_di.di_flags & GFS2_DIF_EXHASH is true 39 * dip->i_diskflags & GFS2_DIF_EXHASH is true
40 * 40 *
41 * Otherwise, the dirents are "linear", within a single stuffed dinode block. 41 * Otherwise, the dirents are "linear", within a single stuffed dinode block.
42 * 42 *
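The comment above describes the two on-disk directory layouts this file switches between: once GFS2_DIF_EXHASH is set, dirents live in leaf blocks reached through a hash table; otherwise they sit linearly inside the stuffed dinode block. A toy dispatch mirroring gfs2_dir_read()'s top-level branch (types pared down; the flag value is an assumption standing in for the real definition in gfs2_ondisk.h):

/* Toy dispatch mirroring gfs2_dir_read()'s top-level branch. */
#include <stdio.h>

#define GFS2_DIF_EXHASH 0x00000002 /* assumed to match gfs2_ondisk.h */

struct toy_dir { unsigned int i_diskflags; unsigned int i_entries; };

static const char *dir_layout(const struct toy_dir *dip)
{
	if (!dip->i_entries)
		return "empty: nothing to read";
	if (dip->i_diskflags & GFS2_DIF_EXHASH)
		return "exhash: walk hash table -> leaf blocks";
	return "linear: dirents inside the stuffed dinode";
}

int main(void)
{
	struct toy_dir small = { 0, 12 };
	struct toy_dir big = { GFS2_DIF_EXHASH, 5000 };

	printf("%s\n%s\n", dir_layout(&small), dir_layout(&big));
	return 0;
}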
@@ -128,8 +128,8 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
128 128
129 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 129 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
130 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); 130 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
131 if (ip->i_di.di_size < offset + size) 131 if (ip->i_disksize < offset + size)
132 ip->i_di.di_size = offset + size; 132 ip->i_disksize = offset + size;
133 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 133 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
134 gfs2_dinode_out(ip, dibh->b_data); 134 gfs2_dinode_out(ip, dibh->b_data);
135 135
@@ -226,8 +226,8 @@ out:
226 if (error) 226 if (error)
227 return error; 227 return error;
228 228
229 if (ip->i_di.di_size < offset + copied) 229 if (ip->i_disksize < offset + copied)
230 ip->i_di.di_size = offset + copied; 230 ip->i_disksize = offset + copied;
231 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 231 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
232 232
233 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 233 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
@@ -277,11 +277,11 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
277 int copied = 0; 277 int copied = 0;
278 int error = 0; 278 int error = 0;
279 279
280 if (offset >= ip->i_di.di_size) 280 if (offset >= ip->i_disksize)
281 return 0; 281 return 0;
282 282
283 if (offset + size > ip->i_di.di_size) 283 if (offset + size > ip->i_disksize)
284 size = ip->i_di.di_size - offset; 284 size = ip->i_disksize - offset;
285 285
286 if (!size) 286 if (!size)
287 return 0; 287 return 0;
@@ -755,12 +755,12 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
755 struct gfs2_inode *ip = GFS2_I(inode); 755 struct gfs2_inode *ip = GFS2_I(inode);
756 int error; 756 int error;
757 757
758 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { 758 if (ip->i_diskflags & GFS2_DIF_EXHASH) {
759 struct gfs2_leaf *leaf; 759 struct gfs2_leaf *leaf;
760 unsigned hsize = 1 << ip->i_depth; 760 unsigned hsize = 1 << ip->i_depth;
761 unsigned index; 761 unsigned index;
762 u64 ln; 762 u64 ln;
763 if (hsize * sizeof(u64) != ip->i_di.di_size) { 763 if (hsize * sizeof(u64) != ip->i_disksize) {
764 gfs2_consist_inode(ip); 764 gfs2_consist_inode(ip);
765 return ERR_PTR(-EIO); 765 return ERR_PTR(-EIO);
766 } 766 }
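The repeated consistency check hsize * sizeof(u64) != i_disksize works because an exhash directory's data is exactly its hash table of 64-bit leaf pointers, so the table size implied by i_depth must match the on-disk size byte for byte. A small userspace illustration with assumed example values:

/* Why hsize * sizeof(u64) must equal i_disksize for an exhash directory. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned int i_depth = 10;          /* assumed example depth */
	uint64_t i_disksize = 8192;         /* bytes of hash table on disk */
	unsigned int hsize = 1u << i_depth; /* 1024 hash buckets */

	if ((uint64_t)hsize * sizeof(uint64_t) != i_disksize)
		printf("mismatch: gfs2_consist_inode(), return -EIO\n");
	else
		printf("%u leaf pointers * 8 bytes = %llu bytes: consistent\n",
		       hsize, (unsigned long long)i_disksize);
	return 0;
}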
@@ -858,8 +858,8 @@ static int dir_make_exhash(struct inode *inode)
858 return -ENOSPC; 858 return -ENOSPC;
859 bn = bh->b_blocknr; 859 bn = bh->b_blocknr;
860 860
861 gfs2_assert(sdp, dip->i_di.di_entries < (1 << 16)); 861 gfs2_assert(sdp, dip->i_entries < (1 << 16));
862 leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries); 862 leaf->lf_entries = cpu_to_be16(dip->i_entries);
863 863
864 /* Copy dirents */ 864 /* Copy dirents */
865 865
@@ -905,9 +905,9 @@ static int dir_make_exhash(struct inode *inode)
905 for (x = sdp->sd_hash_ptrs; x--; lp++) 905 for (x = sdp->sd_hash_ptrs; x--; lp++)
906 *lp = cpu_to_be64(bn); 906 *lp = cpu_to_be64(bn);
907 907
908 dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2; 908 dip->i_disksize = sdp->sd_sb.sb_bsize / 2;
909 gfs2_add_inode_blocks(&dip->i_inode, 1); 909 gfs2_add_inode_blocks(&dip->i_inode, 1);
910 dip->i_di.di_flags |= GFS2_DIF_EXHASH; 910 dip->i_diskflags |= GFS2_DIF_EXHASH;
911 911
912 for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ; 912 for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
913 dip->i_depth = y; 913 dip->i_depth = y;
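The terse loop at the end of dir_make_exhash() is an ilog2 idiom: shifting x right until it reaches zero counts its highest set bit, which becomes the directory's initial hash-table depth. A userspace check with an assumed sd_hash_ptrs (a 4k block keeps its hash pointers in half the block, giving 256 u64 slots, consistent with i_disksize being set to half the block size just above):

/* The ilog2 idiom at the end of dir_make_exhash(). */
#include <stdio.h>

int main(void)
{
	unsigned int sd_hash_ptrs = 256; /* assumed: (4096 / 2) / sizeof(u64) */
	unsigned int x, y;

	for (x = sd_hash_ptrs, y = -1; x; x >>= 1, y++)
		;                        /* counts the highest set bit */
	printf("sd_hash_ptrs=%u -> initial i_depth=%u\n", sd_hash_ptrs, y);
	return 0;
}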
@@ -1082,7 +1082,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1082 int error = 0; 1082 int error = 0;
1083 1083
1084 hsize = 1 << dip->i_depth; 1084 hsize = 1 << dip->i_depth;
1085 if (hsize * sizeof(u64) != dip->i_di.di_size) { 1085 if (hsize * sizeof(u64) != dip->i_disksize) {
1086 gfs2_consist_inode(dip); 1086 gfs2_consist_inode(dip);
1087 return -EIO; 1087 return -EIO;
1088 } 1088 }
@@ -1091,7 +1091,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1091 1091
1092 buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL); 1092 buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL);
1093 1093
1094 for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) { 1094 for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) {
1095 error = gfs2_dir_read_data(dip, (char *)buf, 1095 error = gfs2_dir_read_data(dip, (char *)buf,
1096 block * sdp->sd_hash_bsize, 1096 block * sdp->sd_hash_bsize,
1097 sdp->sd_hash_bsize, 1); 1097 sdp->sd_hash_bsize, 1);
@@ -1370,7 +1370,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
1370 unsigned depth = 0; 1370 unsigned depth = 0;
1371 1371
1372 hsize = 1 << dip->i_depth; 1372 hsize = 1 << dip->i_depth;
1373 if (hsize * sizeof(u64) != dip->i_di.di_size) { 1373 if (hsize * sizeof(u64) != dip->i_disksize) {
1374 gfs2_consist_inode(dip); 1374 gfs2_consist_inode(dip);
1375 return -EIO; 1375 return -EIO;
1376 } 1376 }
@@ -1426,10 +1426,10 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
1426 int copied = 0; 1426 int copied = 0;
1427 int error; 1427 int error;
1428 1428
1429 if (!dip->i_di.di_entries) 1429 if (!dip->i_entries)
1430 return 0; 1430 return 0;
1431 1431
1432 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) 1432 if (dip->i_diskflags & GFS2_DIF_EXHASH)
1433 return dir_e_read(inode, offset, opaque, filldir); 1433 return dir_e_read(inode, offset, opaque, filldir);
1434 1434
1435 if (!gfs2_is_stuffed(dip)) { 1435 if (!gfs2_is_stuffed(dip)) {
@@ -1453,17 +1453,17 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
1453 error = PTR_ERR(dent); 1453 error = PTR_ERR(dent);
1454 goto out; 1454 goto out;
1455 } 1455 }
1456 if (dip->i_di.di_entries != g.offset) { 1456 if (dip->i_entries != g.offset) {
1457 fs_warn(sdp, "Number of entries corrupt in dir %llu, " 1457 fs_warn(sdp, "Number of entries corrupt in dir %llu, "
1458 "ip->i_di.di_entries (%u) != g.offset (%u)\n", 1458 "ip->i_entries (%u) != g.offset (%u)\n",
1459 (unsigned long long)dip->i_no_addr, 1459 (unsigned long long)dip->i_no_addr,
1460 dip->i_di.di_entries, 1460 dip->i_entries,
1461 g.offset); 1461 g.offset);
1462 error = -EIO; 1462 error = -EIO;
1463 goto out; 1463 goto out;
1464 } 1464 }
1465 error = do_filldir_main(dip, offset, opaque, filldir, darr, 1465 error = do_filldir_main(dip, offset, opaque, filldir, darr,
1466 dip->i_di.di_entries, &copied); 1466 dip->i_entries, &copied);
1467out: 1467out:
1468 kfree(darr); 1468 kfree(darr);
1469 } 1469 }
@@ -1612,7 +1612,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1612 dent = gfs2_init_dirent(inode, dent, name, bh); 1612 dent = gfs2_init_dirent(inode, dent, name, bh);
1613 gfs2_inum_out(nip, dent); 1613 gfs2_inum_out(nip, dent);
1614 dent->de_type = cpu_to_be16(type); 1614 dent->de_type = cpu_to_be16(type);
1615 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { 1615 if (ip->i_diskflags & GFS2_DIF_EXHASH) {
1616 leaf = (struct gfs2_leaf *)bh->b_data; 1616 leaf = (struct gfs2_leaf *)bh->b_data;
1617 be16_add_cpu(&leaf->lf_entries, 1); 1617 be16_add_cpu(&leaf->lf_entries, 1);
1618 } 1618 }
@@ -1621,14 +1621,14 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1621 if (error) 1621 if (error)
1622 break; 1622 break;
1623 gfs2_trans_add_bh(ip->i_gl, bh, 1); 1623 gfs2_trans_add_bh(ip->i_gl, bh, 1);
1624 ip->i_di.di_entries++; 1624 ip->i_entries++;
1625 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1625 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1626 gfs2_dinode_out(ip, bh->b_data); 1626 gfs2_dinode_out(ip, bh->b_data);
1627 brelse(bh); 1627 brelse(bh);
1628 error = 0; 1628 error = 0;
1629 break; 1629 break;
1630 } 1630 }
1631 if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) { 1631 if (!(ip->i_diskflags & GFS2_DIF_EXHASH)) {
1632 error = dir_make_exhash(inode); 1632 error = dir_make_exhash(inode);
1633 if (error) 1633 if (error)
1634 break; 1634 break;
@@ -1691,7 +1691,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
1691 } 1691 }
1692 1692
1693 dirent_del(dip, bh, prev, dent); 1693 dirent_del(dip, bh, prev, dent);
1694 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) { 1694 if (dip->i_diskflags & GFS2_DIF_EXHASH) {
1695 struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data; 1695 struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
1696 u16 entries = be16_to_cpu(leaf->lf_entries); 1696 u16 entries = be16_to_cpu(leaf->lf_entries);
1697 if (!entries) 1697 if (!entries)
@@ -1704,10 +1704,10 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
1704 if (error) 1704 if (error)
1705 return error; 1705 return error;
1706 1706
1707 if (!dip->i_di.di_entries) 1707 if (!dip->i_entries)
1708 gfs2_consist_inode(dip); 1708 gfs2_consist_inode(dip);
1709 gfs2_trans_add_bh(dip->i_gl, bh, 1); 1709 gfs2_trans_add_bh(dip->i_gl, bh, 1);
1710 dip->i_di.di_entries--; 1710 dip->i_entries--;
1711 dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; 1711 dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME;
1712 gfs2_dinode_out(dip, bh->b_data); 1712 gfs2_dinode_out(dip, bh->b_data);
1713 brelse(bh); 1713 brelse(bh);
@@ -1748,7 +1748,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
1748 gfs2_inum_out(nip, dent); 1748 gfs2_inum_out(nip, dent);
1749 dent->de_type = cpu_to_be16(new_type); 1749 dent->de_type = cpu_to_be16(new_type);
1750 1750
1751 if (dip->i_di.di_flags & GFS2_DIF_EXHASH) { 1751 if (dip->i_diskflags & GFS2_DIF_EXHASH) {
1752 brelse(bh); 1752 brelse(bh);
1753 error = gfs2_meta_inode_buffer(dip, &bh); 1753 error = gfs2_meta_inode_buffer(dip, &bh);
1754 if (error) 1754 if (error)
@@ -1784,7 +1784,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
1784 int error = 0; 1784 int error = 0;
1785 1785
1786 hsize = 1 << dip->i_depth; 1786 hsize = 1 << dip->i_depth;
1787 if (hsize * sizeof(u64) != dip->i_di.di_size) { 1787 if (hsize * sizeof(u64) != dip->i_disksize) {
1788 gfs2_consist_inode(dip); 1788 gfs2_consist_inode(dip);
1789 return -EIO; 1789 return -EIO;
1790 } 1790 }
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 8a468cac9328..4f919440c3be 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -11,6 +11,7 @@
11#define __DIR_DOT_H__ 11#define __DIR_DOT_H__
12 12
13#include <linux/dcache.h> 13#include <linux/dcache.h>
14#include <linux/crc32.h>
14 15
15struct inode; 16struct inode;
16struct gfs2_inode; 17struct gfs2_inode;
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index e3f76f451b0a..0d1c76d906ae 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -114,11 +114,11 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
114 __be64 *eablk, *end; 114 __be64 *eablk, *end;
115 int error; 115 int error;
116 116
117 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &bh); 117 error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh);
118 if (error) 118 if (error)
119 return error; 119 return error;
120 120
121 if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) { 121 if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) {
122 error = ea_foreach_i(ip, bh, ea_call, data); 122 error = ea_foreach_i(ip, bh, ea_call, data);
123 goto out; 123 goto out;
124 } 124 }
@@ -414,7 +414,7 @@ int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
414 if (error) 414 if (error)
415 return error; 415 return error;
416 416
417 if (ip->i_di.di_eattr) { 417 if (ip->i_eattr) {
418 struct ea_list ei = { .ei_er = er, .ei_size = 0 }; 418 struct ea_list ei = { .ei_er = er, .ei_size = 0 };
419 419
420 error = ea_foreach(ip, ea_list_i, &ei); 420 error = ea_foreach(ip, ea_list_i, &ei);
@@ -514,7 +514,7 @@ int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
514 struct gfs2_ea_location el; 514 struct gfs2_ea_location el;
515 int error; 515 int error;
516 516
517 if (!ip->i_di.di_eattr) 517 if (!ip->i_eattr)
518 return -ENODATA; 518 return -ENODATA;
519 519
520 error = gfs2_ea_find(ip, er, &el); 520 error = gfs2_ea_find(ip, er, &el);
@@ -741,7 +741,7 @@ static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
741 if (error) 741 if (error)
742 return error; 742 return error;
743 743
744 ip->i_di.di_eattr = bh->b_blocknr; 744 ip->i_eattr = bh->b_blocknr;
745 error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er); 745 error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er);
746 746
747 brelse(bh); 747 brelse(bh);
@@ -935,10 +935,10 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
935 int error; 935 int error;
936 int mh_size = sizeof(struct gfs2_meta_header); 936 int mh_size = sizeof(struct gfs2_meta_header);
937 937
938 if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) { 938 if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
939 __be64 *end; 939 __be64 *end;
940 940
941 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, 941 error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT,
942 &indbh); 942 &indbh);
943 if (error) 943 if (error)
944 return error; 944 return error;
@@ -972,9 +972,9 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
972 gfs2_buffer_clear_tail(indbh, mh_size); 972 gfs2_buffer_clear_tail(indbh, mh_size);
973 973
974 eablk = (__be64 *)(indbh->b_data + mh_size); 974 eablk = (__be64 *)(indbh->b_data + mh_size);
975 *eablk = cpu_to_be64(ip->i_di.di_eattr); 975 *eablk = cpu_to_be64(ip->i_eattr);
976 ip->i_di.di_eattr = blk; 976 ip->i_eattr = blk;
977 ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT; 977 ip->i_diskflags |= GFS2_DIF_EA_INDIRECT;
978 gfs2_add_inode_blocks(&ip->i_inode, 1); 978 gfs2_add_inode_blocks(&ip->i_inode, 1);
979 979
980 eablk++; 980 eablk++;
@@ -1015,7 +1015,7 @@ static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
1015 if (error) 1015 if (error)
1016 return error; 1016 return error;
1017 1017
1018 if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) 1018 if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT))
1019 blks++; 1019 blks++;
1020 if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize) 1020 if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
1021 blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize); 1021 blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
@@ -1040,7 +1040,7 @@ int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1040 struct gfs2_ea_location el; 1040 struct gfs2_ea_location el;
1041 int error; 1041 int error;
1042 1042
1043 if (!ip->i_di.di_eattr) { 1043 if (!ip->i_eattr) {
1044 if (er->er_flags & XATTR_REPLACE) 1044 if (er->er_flags & XATTR_REPLACE)
1045 return -ENODATA; 1045 return -ENODATA;
1046 return ea_init(ip, er); 1046 return ea_init(ip, er);
@@ -1051,7 +1051,7 @@ int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1051 return error; 1051 return error;
1052 1052
1053 if (el.el_ea) { 1053 if (el.el_ea) {
1054 if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) { 1054 if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
1055 brelse(el.el_bh); 1055 brelse(el.el_bh);
1056 return -EPERM; 1056 return -EPERM;
1057 } 1057 }
@@ -1145,7 +1145,7 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1145 struct gfs2_ea_location el; 1145 struct gfs2_ea_location el;
1146 int error; 1146 int error;
1147 1147
1148 if (!ip->i_di.di_eattr) 1148 if (!ip->i_eattr)
1149 return -ENODATA; 1149 return -ENODATA;
1150 1150
1151 error = gfs2_ea_find(ip, er, &el); 1151 error = gfs2_ea_find(ip, er, &el);
@@ -1309,7 +1309,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
1309 1309
1310 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); 1310 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
1311 1311
1312 error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &indbh); 1312 error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh);
1313 if (error) 1313 if (error)
1314 return error; 1314 return error;
1315 1315
@@ -1388,7 +1388,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
1388 if (bstart) 1388 if (bstart)
1389 gfs2_free_meta(ip, bstart, blen); 1389 gfs2_free_meta(ip, bstart, blen);
1390 1390
1391 ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT; 1391 ip->i_diskflags &= ~GFS2_DIF_EA_INDIRECT;
1392 1392
1393 error = gfs2_meta_inode_buffer(ip, &dibh); 1393 error = gfs2_meta_inode_buffer(ip, &dibh);
1394 if (!error) { 1394 if (!error) {
@@ -1416,7 +1416,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
1416 struct buffer_head *dibh; 1416 struct buffer_head *dibh;
1417 int error; 1417 int error;
1418 1418
1419 rgd = gfs2_blk2rgrpd(sdp, ip->i_di.di_eattr); 1419 rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr);
1420 if (!rgd) { 1420 if (!rgd) {
1421 gfs2_consist_inode(ip); 1421 gfs2_consist_inode(ip);
1422 return -EIO; 1422 return -EIO;
@@ -1432,9 +1432,9 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
1432 if (error) 1432 if (error)
1433 goto out_gunlock; 1433 goto out_gunlock;
1434 1434
1435 gfs2_free_meta(ip, ip->i_di.di_eattr, 1); 1435 gfs2_free_meta(ip, ip->i_eattr, 1);
1436 1436
1437 ip->i_di.di_eattr = 0; 1437 ip->i_eattr = 0;
1438 gfs2_add_inode_blocks(&ip->i_inode, -1); 1438 gfs2_add_inode_blocks(&ip->i_inode, -1);
1439 1439
1440 error = gfs2_meta_inode_buffer(ip, &dibh); 1440 error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -1479,7 +1479,7 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
1479 if (error) 1479 if (error)
1480 goto out_rindex; 1480 goto out_rindex;
1481 1481
1482 if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) { 1482 if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
1483 error = ea_dealloc_indirect(ip); 1483 error = ea_dealloc_indirect(ip);
1484 if (error) 1484 if (error)
1485 goto out_rindex; 1485 goto out_rindex;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index c962283d4e7f..6b983aef785d 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -40,6 +40,7 @@
40#include "quota.h" 40#include "quota.h"
41#include "super.h" 41#include "super.h"
42#include "util.h" 42#include "util.h"
43#include "bmap.h"
43 44
44struct gfs2_gl_hash_bucket { 45struct gfs2_gl_hash_bucket {
45 struct hlist_head hb_list; 46 struct hlist_head hb_list;
@@ -61,9 +62,10 @@ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int
61 62
62static DECLARE_RWSEM(gfs2_umount_flush_sem); 63static DECLARE_RWSEM(gfs2_umount_flush_sem);
63static struct dentry *gfs2_root; 64static struct dentry *gfs2_root;
64static struct task_struct *scand_process;
65static unsigned int scand_secs = 5;
66static struct workqueue_struct *glock_workqueue; 65static struct workqueue_struct *glock_workqueue;
66static LIST_HEAD(lru_list);
67static atomic_t lru_count = ATOMIC_INIT(0);
68static DEFINE_SPINLOCK(lru_lock);
67 69
68#define GFS2_GL_HASH_SHIFT 15 70#define GFS2_GL_HASH_SHIFT 15
69#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) 71#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
@@ -174,6 +176,22 @@ static void gfs2_glock_hold(struct gfs2_glock *gl)
174} 176}
175 177
176/** 178/**
179 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
180 * @gl: the glock
181 *
182 */
183
184static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
185{
186 spin_lock(&lru_lock);
187 if (list_empty(&gl->gl_lru) && gl->gl_state != LM_ST_UNLOCKED) {
188 list_add_tail(&gl->gl_lru, &lru_list);
189 atomic_inc(&lru_count);
190 }
191 spin_unlock(&lru_lock);
192}
193
194/**
177 * gfs2_glock_put() - Decrement reference count on glock 195 * gfs2_glock_put() - Decrement reference count on glock
178 * @gl: The glock to put 196 * @gl: The glock to put
179 * 197 *
@@ -187,14 +205,23 @@ int gfs2_glock_put(struct gfs2_glock *gl)
187 if (atomic_dec_and_test(&gl->gl_ref)) { 205 if (atomic_dec_and_test(&gl->gl_ref)) {
188 hlist_del(&gl->gl_list); 206 hlist_del(&gl->gl_list);
189 write_unlock(gl_lock_addr(gl->gl_hash)); 207 write_unlock(gl_lock_addr(gl->gl_hash));
208 spin_lock(&lru_lock);
209 if (!list_empty(&gl->gl_lru)) {
210 list_del_init(&gl->gl_lru);
211 atomic_dec(&lru_count);
212 }
213 spin_unlock(&lru_lock);
190 GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED); 214 GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED);
191 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_reclaim)); 215 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_lru));
192 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); 216 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
193 glock_free(gl); 217 glock_free(gl);
194 rv = 1; 218 rv = 1;
195 goto out; 219 goto out;
196 } 220 }
197 write_unlock(gl_lock_addr(gl->gl_hash)); 221 write_unlock(gl_lock_addr(gl->gl_hash));
222 /* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */
223 if (atomic_read(&gl->gl_ref) == 2)
224 gfs2_glock_schedule_for_reclaim(gl);
198out: 225out:
199 return rv; 226 return rv;
200} 227}
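This hunk replaces the explicit reclaim list with lazy LRU maintenance: gfs2_glock_put() unhooks a dying glock from the LRU before freeing it, and otherwise, when only the two structural references remain (one for being hashed, one for holding a state other than unlocked), the glock is idle and gets scheduled for reclaim. A toy model of that policy (plain ints stand in for the kernel's atomics and lists):

/* Toy model of the lazy LRU policy in gfs2_glock_put(). */
#include <stdio.h>

struct toy_glock { int ref; int on_lru; };

static void toy_put(struct toy_glock *gl)
{
	if (--gl->ref == 0) {
		gl->on_lru = 0;          /* unlink from LRU before freeing */
		printf("ref=0: free the glock\n");
		return;
	}
	/* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */
	if (gl->ref == 2 && !gl->on_lru) {
		gl->on_lru = 1;
		printf("ref=2: idle, schedule for reclaim\n");
	}
}

int main(void)
{
	struct toy_glock gl = { .ref = 3, .on_lru = 0 }; /* one active holder */
	toy_put(&gl); /* the holder drops its reference: goes onto the LRU */
	return 0;
}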
@@ -289,10 +316,13 @@ static void gfs2_holder_wake(struct gfs2_holder *gh)
289 * do_promote - promote as many requests as possible on the current queue 316 * do_promote - promote as many requests as possible on the current queue
290 * @gl: The glock 317 * @gl: The glock
291 * 318 *
292 * Returns: true if there is a blocked holder at the head of the list 319 * Returns: 1 if there is a blocked holder at the head of the list, or 2
320 * if a type specific operation is underway.
293 */ 321 */
294 322
295static int do_promote(struct gfs2_glock *gl) 323static int do_promote(struct gfs2_glock *gl)
324__releases(&gl->gl_spin)
325__acquires(&gl->gl_spin)
296{ 326{
297 const struct gfs2_glock_operations *glops = gl->gl_ops; 327 const struct gfs2_glock_operations *glops = gl->gl_ops;
298 struct gfs2_holder *gh, *tmp; 328 struct gfs2_holder *gh, *tmp;
@@ -310,6 +340,8 @@ restart:
310 ret = glops->go_lock(gh); 340 ret = glops->go_lock(gh);
311 spin_lock(&gl->gl_spin); 341 spin_lock(&gl->gl_spin);
312 if (ret) { 342 if (ret) {
343 if (ret == 1)
344 return 2;
313 gh->gh_error = ret; 345 gh->gh_error = ret;
314 list_del_init(&gh->gh_list); 346 list_del_init(&gh->gh_list);
315 gfs2_holder_wake(gh); 347 gfs2_holder_wake(gh);
@@ -414,6 +446,7 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
414 const struct gfs2_glock_operations *glops = gl->gl_ops; 446 const struct gfs2_glock_operations *glops = gl->gl_ops;
415 struct gfs2_holder *gh; 447 struct gfs2_holder *gh;
416 unsigned state = ret & LM_OUT_ST_MASK; 448 unsigned state = ret & LM_OUT_ST_MASK;
449 int rv;
417 450
418 spin_lock(&gl->gl_spin); 451 spin_lock(&gl->gl_spin);
419 state_change(gl, state); 452 state_change(gl, state);
@@ -468,7 +501,6 @@ retry:
468 gfs2_demote_wake(gl); 501 gfs2_demote_wake(gl);
469 if (state != LM_ST_UNLOCKED) { 502 if (state != LM_ST_UNLOCKED) {
470 if (glops->go_xmote_bh) { 503 if (glops->go_xmote_bh) {
471 int rv;
472 spin_unlock(&gl->gl_spin); 504 spin_unlock(&gl->gl_spin);
473 rv = glops->go_xmote_bh(gl, gh); 505 rv = glops->go_xmote_bh(gl, gh);
474 if (rv == -EAGAIN) 506 if (rv == -EAGAIN)
@@ -479,10 +511,13 @@ retry:
479 goto out; 511 goto out;
480 } 512 }
481 } 513 }
482 do_promote(gl); 514 rv = do_promote(gl);
515 if (rv == 2)
516 goto out_locked;
483 } 517 }
484out: 518out:
485 clear_bit(GLF_LOCK, &gl->gl_flags); 519 clear_bit(GLF_LOCK, &gl->gl_flags);
520out_locked:
486 spin_unlock(&gl->gl_spin); 521 spin_unlock(&gl->gl_spin);
487 gfs2_glock_put(gl); 522 gfs2_glock_put(gl);
488} 523}
@@ -511,6 +546,8 @@ static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
511 */ 546 */
512 547
513static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target) 548static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target)
549__releases(&gl->gl_spin)
550__acquires(&gl->gl_spin)
514{ 551{
515 const struct gfs2_glock_operations *glops = gl->gl_ops; 552 const struct gfs2_glock_operations *glops = gl->gl_ops;
516 struct gfs2_sbd *sdp = gl->gl_sbd; 553 struct gfs2_sbd *sdp = gl->gl_sbd;
@@ -576,8 +613,11 @@ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
576 */ 613 */
577 614
578static void run_queue(struct gfs2_glock *gl, const int nonblock) 615static void run_queue(struct gfs2_glock *gl, const int nonblock)
616__releases(&gl->gl_spin)
617__acquires(&gl->gl_spin)
579{ 618{
580 struct gfs2_holder *gh = NULL; 619 struct gfs2_holder *gh = NULL;
620 int ret;
581 621
582 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) 622 if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
583 return; 623 return;
@@ -596,8 +636,11 @@ static void run_queue(struct gfs2_glock *gl, const int nonblock)
596 } else { 636 } else {
597 if (test_bit(GLF_DEMOTE, &gl->gl_flags)) 637 if (test_bit(GLF_DEMOTE, &gl->gl_flags))
598 gfs2_demote_wake(gl); 638 gfs2_demote_wake(gl);
599 if (do_promote(gl) == 0) 639 ret = do_promote(gl);
640 if (ret == 0)
600 goto out; 641 goto out;
642 if (ret == 2)
643 return;
601 gh = find_first_waiter(gl); 644 gh = find_first_waiter(gl);
602 gl->gl_target = gh->gh_state; 645 gl->gl_target = gh->gh_state;
603 if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) 646 if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
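run_queue() now has to distinguish the three do_promote() outcomes: 0 means every grantable holder was promoted, 1 means a blocked holder heads the queue and a state change is needed, and 2 means a go_lock() callback reported a type-specific operation in progress, so the function returns with GLF_LOCK still set and the pending operation re-runs the queue later. A sketch of that dispatch (the enum names are illustrative, not kernel API):

/* Illustrative dispatch on the do_promote() return value. */
#include <stdio.h>

enum promote_result {
	PROMOTE_DONE = 0,    /* holders granted, nothing blocked */
	PROMOTE_BLOCKED = 1, /* blocked holder at head: start a state change */
	PROMOTE_DEFERRED = 2 /* go_lock() returned 1: type-specific op pending */
};

static void toy_run_queue(enum promote_result ret)
{
	switch (ret) {
	case PROMOTE_DONE:
		printf("done: clear GLF_LOCK and return\n");
		break;
	case PROMOTE_DEFERRED:
		printf("deferred: return with GLF_LOCK still set;\n"
		       "          the pending operation re-runs the queue later\n");
		break;
	case PROMOTE_BLOCKED:
		printf("blocked: pick the first waiter and change lock state\n");
		break;
	}
}

int main(void)
{
	toy_run_queue(PROMOTE_DEFERRED);
	return 0;
}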
@@ -820,7 +863,7 @@ static void wait_on_demote(struct gfs2_glock *gl)
820 */ 863 */
821 864
822static void handle_callback(struct gfs2_glock *gl, unsigned int state, 865static void handle_callback(struct gfs2_glock *gl, unsigned int state,
823 int remote, unsigned long delay) 866 unsigned long delay)
824{ 867{
825 int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE; 868 int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE;
826 869
@@ -828,9 +871,6 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
828 if (gl->gl_demote_state == LM_ST_EXCLUSIVE) { 871 if (gl->gl_demote_state == LM_ST_EXCLUSIVE) {
829 gl->gl_demote_state = state; 872 gl->gl_demote_state = state;
830 gl->gl_demote_time = jiffies; 873 gl->gl_demote_time = jiffies;
831 if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN &&
832 gl->gl_object)
833 gfs2_glock_schedule_for_reclaim(gl);
834 } else if (gl->gl_demote_state != LM_ST_UNLOCKED && 874 } else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
835 gl->gl_demote_state != state) { 875 gl->gl_demote_state != state) {
836 gl->gl_demote_state = LM_ST_UNLOCKED; 876 gl->gl_demote_state = LM_ST_UNLOCKED;
@@ -877,6 +917,8 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
877 */ 917 */
878 918
879static inline void add_to_queue(struct gfs2_holder *gh) 919static inline void add_to_queue(struct gfs2_holder *gh)
920__releases(&gl->gl_spin)
921__acquires(&gl->gl_spin)
880{ 922{
881 struct gfs2_glock *gl = gh->gh_gl; 923 struct gfs2_glock *gl = gh->gh_gl;
882 struct gfs2_sbd *sdp = gl->gl_sbd; 924 struct gfs2_sbd *sdp = gl->gl_sbd;
@@ -998,7 +1040,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
998 1040
999 spin_lock(&gl->gl_spin); 1041 spin_lock(&gl->gl_spin);
1000 if (gh->gh_flags & GL_NOCACHE) 1042 if (gh->gh_flags & GL_NOCACHE)
1001 handle_callback(gl, LM_ST_UNLOCKED, 0, 0); 1043 handle_callback(gl, LM_ST_UNLOCKED, 0);
1002 1044
1003 list_del_init(&gh->gh_list); 1045 list_del_init(&gh->gh_list);
1004 if (find_first_holder(gl) == NULL) { 1046 if (find_first_holder(gl) == NULL) {
@@ -1269,12 +1311,26 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
1269 delay = gl->gl_ops->go_min_hold_time; 1311 delay = gl->gl_ops->go_min_hold_time;
1270 1312
1271 spin_lock(&gl->gl_spin); 1313 spin_lock(&gl->gl_spin);
1272 handle_callback(gl, state, 1, delay); 1314 handle_callback(gl, state, delay);
1273 spin_unlock(&gl->gl_spin); 1315 spin_unlock(&gl->gl_spin);
1274 if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) 1316 if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
1275 gfs2_glock_put(gl); 1317 gfs2_glock_put(gl);
1276} 1318}
1277 1319
1320static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
1321{
1322 struct gfs2_jdesc *jd;
1323
1324 spin_lock(&sdp->sd_jindex_spin);
1325 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
1326 if (jd->jd_jid != jid)
1327 continue;
1328 jd->jd_dirty = 1;
1329 break;
1330 }
1331 spin_unlock(&sdp->sd_jindex_spin);
1332}
1333
1278/** 1334/**
1279 * gfs2_glock_cb - Callback used by locking module 1335 * gfs2_glock_cb - Callback used by locking module
1280 * @sdp: Pointer to the superblock 1336 * @sdp: Pointer to the superblock
@@ -1338,80 +1394,83 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
1338 * Returns: 1 if it's ok 1394 * Returns: 1 if it's ok
1339 */ 1395 */
1340 1396
1341static int demote_ok(struct gfs2_glock *gl) 1397static int demote_ok(const struct gfs2_glock *gl)
1342{ 1398{
1343 const struct gfs2_glock_operations *glops = gl->gl_ops; 1399 const struct gfs2_glock_operations *glops = gl->gl_ops;
1344 int demote = 1;
1345
1346 if (test_bit(GLF_STICKY, &gl->gl_flags))
1347 demote = 0;
1348 else if (glops->go_demote_ok)
1349 demote = glops->go_demote_ok(gl);
1350
1351 return demote;
1352}
1353
1354/**
1355 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
1356 * @gl: the glock
1357 *
1358 */
1359
1360void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
1361{
1362 struct gfs2_sbd *sdp = gl->gl_sbd;
1363 1400
1364 spin_lock(&sdp->sd_reclaim_lock); 1401 if (gl->gl_state == LM_ST_UNLOCKED)
1365 if (list_empty(&gl->gl_reclaim)) { 1402 return 0;
1366 gfs2_glock_hold(gl); 1403 if (!list_empty(&gl->gl_holders))
1367 list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list); 1404 return 0;
1368 atomic_inc(&sdp->sd_reclaim_count); 1405 if (glops->go_demote_ok)
1369 spin_unlock(&sdp->sd_reclaim_lock); 1406 return glops->go_demote_ok(gl);
1370 wake_up(&sdp->sd_reclaim_wq); 1407 return 1;
1371 } else
1372 spin_unlock(&sdp->sd_reclaim_lock);
1373} 1408}
1374 1409
1375/**
1376 * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list
1377 * @sdp: the filesystem
1378 *
1379 * Called from gfs2_glockd() glock reclaim daemon, or when promoting a
1380 * different glock and we notice that there are a lot of glocks in the
1381 * reclaim list.
1382 *
1383 */
1384 1410
1385void gfs2_reclaim_glock(struct gfs2_sbd *sdp) 1411static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
1386{ 1412{
1387 struct gfs2_glock *gl; 1413 struct gfs2_glock *gl;
1388 int done_callback = 0; 1414 int may_demote;
1415 int nr_skipped = 0;
1416 int got_ref = 0;
1417 LIST_HEAD(skipped);
1389 1418
1390 spin_lock(&sdp->sd_reclaim_lock); 1419 if (nr == 0)
1391 if (list_empty(&sdp->sd_reclaim_list)) { 1420 goto out;
1392 spin_unlock(&sdp->sd_reclaim_lock);
1393 return;
1394 }
1395 gl = list_entry(sdp->sd_reclaim_list.next,
1396 struct gfs2_glock, gl_reclaim);
1397 list_del_init(&gl->gl_reclaim);
1398 spin_unlock(&sdp->sd_reclaim_lock);
1399 1421
1400 atomic_dec(&sdp->sd_reclaim_count); 1422 if (!(gfp_mask & __GFP_FS))
1401 atomic_inc(&sdp->sd_reclaimed); 1423 return -1;
1402 1424
1403 spin_lock(&gl->gl_spin); 1425 spin_lock(&lru_lock);
1404 if (find_first_holder(gl) == NULL && 1426 while(nr && !list_empty(&lru_list)) {
1405 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) { 1427 gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru);
1406 handle_callback(gl, LM_ST_UNLOCKED, 0, 0); 1428 list_del_init(&gl->gl_lru);
1407 done_callback = 1; 1429 atomic_dec(&lru_count);
1430
1431 /* Test for being demotable */
1432 if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
1433 gfs2_glock_hold(gl);
1434 got_ref = 1;
1435 spin_unlock(&lru_lock);
1436 spin_lock(&gl->gl_spin);
1437 may_demote = demote_ok(gl);
1438 spin_unlock(&gl->gl_spin);
1439 clear_bit(GLF_LOCK, &gl->gl_flags);
1440 if (may_demote) {
1441 handle_callback(gl, LM_ST_UNLOCKED, 0);
1442 nr--;
1443 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1444 gfs2_glock_put(gl);
1445 }
1446 spin_lock(&lru_lock);
1447 if (may_demote)
1448 continue;
1449 }
1450 if (list_empty(&gl->gl_lru) &&
1451 (atomic_read(&gl->gl_ref) <= (2 + got_ref))) {
1452 nr_skipped++;
1453 list_add(&gl->gl_lru, &skipped);
1454 }
1455 if (got_ref) {
1456 spin_unlock(&lru_lock);
1457 gfs2_glock_put(gl);
1458 spin_lock(&lru_lock);
1459 got_ref = 0;
1460 }
1408 } 1461 }
1409 spin_unlock(&gl->gl_spin); 1462 list_splice(&skipped, &lru_list);
1410 if (!done_callback || 1463 atomic_add(nr_skipped, &lru_count);
1411 queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 1464 spin_unlock(&lru_lock);
1412 gfs2_glock_put(gl); 1465out:
1466 return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure;
1413} 1467}
1414 1468
1469static struct shrinker glock_shrinker = {
1470 .shrink = gfs2_shrink_glock_memory,
1471 .seeks = DEFAULT_SEEKS,
1472};
1473
1415/** 1474/**
1416 * examine_bucket - Call a function for glock in a hash bucket 1475 * examine_bucket - Call a function for glock in a hash bucket
1417 * @examiner: the function 1476 * @examiner: the function
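gfs2_shrink_glock_memory() above follows the shrinker contract: nr == 0 is a pure query, allocations without __GFP_FS must not recurse into the filesystem (hence the -1), and otherwise up to nr LRU entries are scanned, demoting those the new lockless demote_ok() permits while busy entries are parked on a local skipped list and spliced back afterwards. The return value is the remaining LRU population scaled by vfs_cache_pressure. A userspace model of the scan (values are illustrative; with only a handful of entries the integer scaling rounds to zero, which real systems with thousands of glocks do not hit):

/* Userspace model of the LRU scan in gfs2_shrink_glock_memory().
 * 1 = demote_ok, 0 = busy (skipped and spliced back), -1 = reclaimed. */
#include <stdio.h>

#define N 6
static int lru[N] = { 1, 0, 1, 1, 0, 1 };
static int lru_count = N;

static int toy_shrink(int nr)
{
	const int vfs_cache_pressure = 100; /* sysctl default */
	int i, skipped = 0;

	for (i = 0; i < N && nr > 0; i++) {
		if (lru[i] < 0)
			continue;        /* slot already reclaimed */
		if (lru[i] == 1) {
			lru[i] = -1;     /* demotable: drop it from the LRU */
			lru_count--;
			nr--;
		} else {
			skipped++;       /* busy: parked, spliced back later */
		}
	}
	if (skipped)
		printf("spliced %d busy entries back onto the LRU\n", skipped);
	/* same shape as the kernel's return: population scaled by pressure */
	return (lru_count / 100) * vfs_cache_pressure;
}

int main(void)
{
	printf("query score: %d\n", toy_shrink(0));     /* nr == 0: no scan */
	printf("post-scan score: %d\n", toy_shrink(3)); /* demote up to 3 */
	return 0;
}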
@@ -1457,26 +1516,6 @@ out:
1457} 1516}
1458 1517
1459/** 1518/**
1460 * scan_glock - look at a glock and see if we can reclaim it
1461 * @gl: the glock to look at
1462 *
1463 */
1464
1465static void scan_glock(struct gfs2_glock *gl)
1466{
1467 if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object)
1468 return;
1469 if (test_bit(GLF_LOCK, &gl->gl_flags))
1470 return;
1471
1472 spin_lock(&gl->gl_spin);
1473 if (find_first_holder(gl) == NULL &&
1474 gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
1475 gfs2_glock_schedule_for_reclaim(gl);
1476 spin_unlock(&gl->gl_spin);
1477}
1478
1479/**
1480 * clear_glock - look at a glock and see if we can free it from glock cache 1519 * clear_glock - look at a glock and see if we can free it from glock cache
1481 * @gl: the glock to look at 1520 * @gl: the glock to look at
1482 * 1521 *
@@ -1484,23 +1523,16 @@ static void scan_glock(struct gfs2_glock *gl)
1484 1523
1485static void clear_glock(struct gfs2_glock *gl) 1524static void clear_glock(struct gfs2_glock *gl)
1486{ 1525{
1487 struct gfs2_sbd *sdp = gl->gl_sbd; 1526 spin_lock(&lru_lock);
1488 int released; 1527 if (!list_empty(&gl->gl_lru)) {
1489 1528 list_del_init(&gl->gl_lru);
1490 spin_lock(&sdp->sd_reclaim_lock); 1529 atomic_dec(&lru_count);
1491 if (!list_empty(&gl->gl_reclaim)) {
1492 list_del_init(&gl->gl_reclaim);
1493 atomic_dec(&sdp->sd_reclaim_count);
1494 spin_unlock(&sdp->sd_reclaim_lock);
1495 released = gfs2_glock_put(gl);
1496 gfs2_assert(sdp, !released);
1497 } else {
1498 spin_unlock(&sdp->sd_reclaim_lock);
1499 } 1530 }
1531 spin_unlock(&lru_lock);
1500 1532
1501 spin_lock(&gl->gl_spin); 1533 spin_lock(&gl->gl_spin);
1502 if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED) 1534 if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED)
1503 handle_callback(gl, LM_ST_UNLOCKED, 0, 0); 1535 handle_callback(gl, LM_ST_UNLOCKED, 0);
1504 spin_unlock(&gl->gl_spin); 1536 spin_unlock(&gl->gl_spin);
1505 gfs2_glock_hold(gl); 1537 gfs2_glock_hold(gl);
1506 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 1538 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
@@ -1548,6 +1580,20 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
1548 } 1580 }
1549} 1581}
1550 1582
1583void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
1584{
1585 struct gfs2_glock *gl = ip->i_gl;
1586 int ret;
1587
1588 ret = gfs2_truncatei_resume(ip);
1589 gfs2_assert_withdraw(gl->gl_sbd, ret == 0);
1590
1591 spin_lock(&gl->gl_spin);
1592 clear_bit(GLF_LOCK, &gl->gl_flags);
1593 run_queue(gl, 1);
1594 spin_unlock(&gl->gl_spin);
1595}
1596
1551static const char *state2str(unsigned state) 1597static const char *state2str(unsigned state)
1552{ 1598{
1553 switch(state) { 1599 switch(state) {
@@ -1623,8 +1669,6 @@ static const char *gflags2str(char *buf, const unsigned long *gflags)
1623 char *p = buf; 1669 char *p = buf;
1624 if (test_bit(GLF_LOCK, gflags)) 1670 if (test_bit(GLF_LOCK, gflags))
1625 *p++ = 'l'; 1671 *p++ = 'l';
1626 if (test_bit(GLF_STICKY, gflags))
1627 *p++ = 's';
1628 if (test_bit(GLF_DEMOTE, gflags)) 1672 if (test_bit(GLF_DEMOTE, gflags))
1629 *p++ = 'D'; 1673 *p++ = 'D';
1630 if (test_bit(GLF_PENDING_DEMOTE, gflags)) 1674 if (test_bit(GLF_PENDING_DEMOTE, gflags))
@@ -1743,34 +1787,6 @@ static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
1743 return error; 1787 return error;
1744} 1788}
1745 1789
1746/**
1747 * gfs2_scand - Look for cached glocks and inodes to toss from memory
1748 * @sdp: Pointer to GFS2 superblock
1749 *
1750 * One of these daemons runs, finding candidates to add to sd_reclaim_list.
1751 * See gfs2_glockd()
1752 */
1753
1754static int gfs2_scand(void *data)
1755{
1756 unsigned x;
1757 unsigned delay;
1758
1759 while (!kthread_should_stop()) {
1760 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
1761 examine_bucket(scan_glock, NULL, x);
1762 if (freezing(current))
1763 refrigerator();
1764 delay = scand_secs;
1765 if (delay < 1)
1766 delay = 1;
1767 schedule_timeout_interruptible(delay * HZ);
1768 }
1769
1770 return 0;
1771}
1772
1773
1774 1790
1775int __init gfs2_glock_init(void) 1791int __init gfs2_glock_init(void)
1776{ 1792{
@@ -1784,28 +1800,21 @@ int __init gfs2_glock_init(void)
1784 } 1800 }
1785#endif 1801#endif
1786 1802
1787 scand_process = kthread_run(gfs2_scand, NULL, "gfs2_scand");
1788 if (IS_ERR(scand_process))
1789 return PTR_ERR(scand_process);
1790
1791 glock_workqueue = create_workqueue("glock_workqueue"); 1803 glock_workqueue = create_workqueue("glock_workqueue");
1792 if (IS_ERR(glock_workqueue)) { 1804 if (IS_ERR(glock_workqueue))
1793 kthread_stop(scand_process);
1794 return PTR_ERR(glock_workqueue); 1805 return PTR_ERR(glock_workqueue);
1795 } 1806
1807 register_shrinker(&glock_shrinker);
1796 1808
1797 return 0; 1809 return 0;
1798} 1810}
1799 1811
1800void gfs2_glock_exit(void) 1812void gfs2_glock_exit(void)
1801{ 1813{
1814 unregister_shrinker(&glock_shrinker);
1802 destroy_workqueue(glock_workqueue); 1815 destroy_workqueue(glock_workqueue);
1803 kthread_stop(scand_process);
1804} 1816}
1805 1817
1806module_param(scand_secs, uint, S_IRUGO|S_IWUSR);
1807MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs");
1808
1809static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) 1818static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
1810{ 1819{
1811 struct gfs2_glock *gl; 1820 struct gfs2_glock *gl;
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 695c6b193611..543ec7ecfbda 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -129,9 +129,9 @@ int gfs2_lvb_hold(struct gfs2_glock *gl);
129void gfs2_lvb_unhold(struct gfs2_glock *gl); 129void gfs2_lvb_unhold(struct gfs2_glock *gl);
130 130
131void gfs2_glock_cb(void *cb_data, unsigned int type, void *data); 131void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
132void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
133void gfs2_reclaim_glock(struct gfs2_sbd *sdp); 132void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
134void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); 133void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
134void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
135 135
136int __init gfs2_glock_init(void); 136int __init gfs2_glock_init(void);
137void gfs2_glock_exit(void); 137void gfs2_glock_exit(void);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index c6c318c2a0f6..8522d3aa64fc 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -201,19 +201,12 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
201 * Returns: 1 if it's ok 201 * Returns: 1 if it's ok
202 */ 202 */
203 203
204static int inode_go_demote_ok(struct gfs2_glock *gl) 204static int inode_go_demote_ok(const struct gfs2_glock *gl)
205{ 205{
206 struct gfs2_sbd *sdp = gl->gl_sbd; 206 struct gfs2_sbd *sdp = gl->gl_sbd;
207 int demote = 0; 207 if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object)
208 208 return 0;
209 if (!gl->gl_object && !gl->gl_aspace->i_mapping->nrpages) 209 return 1;
210 demote = 1;
211 else if (!sdp->sd_args.ar_localcaching &&
212 time_after_eq(jiffies, gl->gl_stamp +
213 gfs2_tune_get(sdp, gt_demote_secs) * HZ))
214 demote = 1;
215
216 return demote;
217} 210}
218 211
219/** 212/**
@@ -227,6 +220,7 @@ static int inode_go_demote_ok(struct gfs2_glock *gl)
227static int inode_go_lock(struct gfs2_holder *gh) 220static int inode_go_lock(struct gfs2_holder *gh)
228{ 221{
229 struct gfs2_glock *gl = gh->gh_gl; 222 struct gfs2_glock *gl = gh->gh_gl;
223 struct gfs2_sbd *sdp = gl->gl_sbd;
230 struct gfs2_inode *ip = gl->gl_object; 224 struct gfs2_inode *ip = gl->gl_object;
231 int error = 0; 225 int error = 0;
232 226
@@ -239,10 +233,16 @@ static int inode_go_lock(struct gfs2_holder *gh)
239 return error; 233 return error;
240 } 234 }
241 235
242 if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) && 236 if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) &&
243 (gl->gl_state == LM_ST_EXCLUSIVE) && 237 (gl->gl_state == LM_ST_EXCLUSIVE) &&
244 (gh->gh_state == LM_ST_EXCLUSIVE)) 238 (gh->gh_state == LM_ST_EXCLUSIVE)) {
245 error = gfs2_truncatei_resume(ip); 239 spin_lock(&sdp->sd_trunc_lock);
240 if (list_empty(&ip->i_trunc_list))
241 list_add(&sdp->sd_trunc_list, &ip->i_trunc_list);
242 spin_unlock(&sdp->sd_trunc_lock);
243 wake_up(&sdp->sd_quota_wait);
244 return 1;
245 }
246 246
247 return error; 247 return error;
248} 248}
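inode_go_lock() no longer resumes an interrupted truncate synchronously. It parks the inode on sd_trunc_list under sd_trunc_lock, wakes sd_quota_wait, and returns 1, which do_promote() surfaces as the deferred result seen earlier; a daemon later calls gfs2_glock_finish_truncate() to run gfs2_truncatei_resume(), clear GLF_LOCK and re-run the queue. A single-threaded toy of that hand-off (the array stands in for the kernel list):

/* Toy shape of the deferred-truncate hand-off introduced here. */
#include <stdio.h>

#define MAX_TRUNC 8
static int trunc_list[MAX_TRUNC], trunc_count;

static int toy_inode_go_lock(int ino, int trunc_in_prog)
{
	if (!trunc_in_prog)
		return 0;                /* normal grant */
	trunc_list[trunc_count++] = ino; /* sd_trunc_list under sd_trunc_lock */
	printf("inode %d: truncate pending, defer grant (return 1)\n", ino);
	return 1;                        /* do_promote maps this to "deferred" */
}

static void toy_finish_truncate(void)
{
	while (trunc_count) {
		int ino = trunc_list[--trunc_count];
		printf("daemon: resume truncate of inode %d, re-run queue\n", ino);
	}
}

int main(void)
{
	toy_inode_go_lock(42, 1); /* holder blocked behind unfinished truncate */
	toy_finish_truncate();    /* daemon-side completion */
	return 0;
}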
@@ -260,10 +260,13 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
260 const struct gfs2_inode *ip = gl->gl_object; 260 const struct gfs2_inode *ip = gl->gl_object;
261 if (ip == NULL) 261 if (ip == NULL)
262 return 0; 262 return 0;
263 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%08lx\n", 263 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu/%llu\n",
264 (unsigned long long)ip->i_no_formal_ino, 264 (unsigned long long)ip->i_no_formal_ino,
265 (unsigned long long)ip->i_no_addr, 265 (unsigned long long)ip->i_no_addr,
266 IF2DT(ip->i_inode.i_mode), ip->i_flags); 266 IF2DT(ip->i_inode.i_mode), ip->i_flags,
267 (unsigned int)ip->i_diskflags,
268 (unsigned long long)ip->i_inode.i_size,
269 (unsigned long long)ip->i_disksize);
267 return 0; 270 return 0;
268} 271}
269 272
@@ -274,7 +277,7 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
274 * Returns: 1 if it's ok 277 * Returns: 1 if it's ok
275 */ 278 */
276 279
277static int rgrp_go_demote_ok(struct gfs2_glock *gl) 280static int rgrp_go_demote_ok(const struct gfs2_glock *gl)
278{ 281{
279 return !gl->gl_aspace->i_mapping->nrpages; 282 return !gl->gl_aspace->i_mapping->nrpages;
280} 283}
@@ -318,7 +321,9 @@ static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
318 const struct gfs2_rgrpd *rgd = gl->gl_object; 321 const struct gfs2_rgrpd *rgd = gl->gl_object;
319 if (rgd == NULL) 322 if (rgd == NULL)
320 return 0; 323 return 0;
321 gfs2_print_dbg(seq, " R: n:%llu\n", (unsigned long long)rgd->rd_addr); 324 gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
325 (unsigned long long)rgd->rd_addr, rgd->rd_flags,
326 rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
322 return 0; 327 return 0;
323} 328}
324 329
@@ -374,13 +379,25 @@ static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
374} 379}
375 380
376/** 381/**
382 * trans_go_demote_ok
383 * @gl: the glock
384 *
385 * Always returns 0
386 */
387
388static int trans_go_demote_ok(const struct gfs2_glock *gl)
389{
390 return 0;
391}
392
393/**
377 * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock 394 * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
378 * @gl: the glock 395 * @gl: the glock
379 * 396 *
380 * Returns: 1 if it's ok 397 * Returns: 1 if it's ok
381 */ 398 */
382 399
383static int quota_go_demote_ok(struct gfs2_glock *gl) 400static int quota_go_demote_ok(const struct gfs2_glock *gl)
384{ 401{
385 return !atomic_read(&gl->gl_lvb_count); 402 return !atomic_read(&gl->gl_lvb_count);
386} 403}
@@ -414,6 +431,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
414const struct gfs2_glock_operations gfs2_trans_glops = { 431const struct gfs2_glock_operations gfs2_trans_glops = {
415 .go_xmote_th = trans_go_sync, 432 .go_xmote_th = trans_go_sync,
416 .go_xmote_bh = trans_go_xmote_bh, 433 .go_xmote_bh = trans_go_xmote_bh,
434 .go_demote_ok = trans_go_demote_ok,
417 .go_type = LM_TYPE_NONDISK, 435 .go_type = LM_TYPE_NONDISK,
418}; 436};
419 437
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index f566ec1b4e8e..608849d00021 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -68,12 +68,6 @@ struct gfs2_bitmap {
68 u32 bi_len; 68 u32 bi_len;
69}; 69};
70 70
71struct gfs2_rgrp_host {
72 u32 rg_free;
73 u32 rg_dinodes;
74 u64 rg_igeneration;
75};
76
77struct gfs2_rgrpd { 71struct gfs2_rgrpd {
78 struct list_head rd_list; /* Link with superblock */ 72 struct list_head rd_list; /* Link with superblock */
79 struct list_head rd_list_mru; 73 struct list_head rd_list_mru;
@@ -83,14 +77,16 @@ struct gfs2_rgrpd {
83 u32 rd_length; /* length of rgrp header in fs blocks */ 77 u32 rd_length; /* length of rgrp header in fs blocks */
84 u32 rd_data; /* num of data blocks in rgrp */ 78 u32 rd_data; /* num of data blocks in rgrp */
85 u32 rd_bitbytes; /* number of bytes in data bitmaps */ 79 u32 rd_bitbytes; /* number of bytes in data bitmaps */
86 struct gfs2_rgrp_host rd_rg; 80 u32 rd_free;
81 u32 rd_free_clone;
82 u32 rd_dinodes;
83 u64 rd_igeneration;
87 struct gfs2_bitmap *rd_bits; 84 struct gfs2_bitmap *rd_bits;
88 unsigned int rd_bh_count;
89 struct mutex rd_mutex; 85 struct mutex rd_mutex;
90 u32 rd_free_clone;
91 struct gfs2_log_element rd_le; 86 struct gfs2_log_element rd_le;
92 u32 rd_last_alloc;
93 struct gfs2_sbd *rd_sbd; 87 struct gfs2_sbd *rd_sbd;
88 unsigned int rd_bh_count;
89 u32 rd_last_alloc;
94 unsigned char rd_flags; 90 unsigned char rd_flags;
95#define GFS2_RDF_CHECK 0x01 /* Need to check for unlinked inodes */ 91#define GFS2_RDF_CHECK 0x01 /* Need to check for unlinked inodes */
96#define GFS2_RDF_NOALLOC 0x02 /* rg prohibits allocation */ 92#define GFS2_RDF_NOALLOC 0x02 /* rg prohibits allocation */
@@ -129,7 +125,7 @@ struct gfs2_glock_operations {
129 void (*go_xmote_th) (struct gfs2_glock *gl); 125 void (*go_xmote_th) (struct gfs2_glock *gl);
130 int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh); 126 int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh);
131 void (*go_inval) (struct gfs2_glock *gl, int flags); 127 void (*go_inval) (struct gfs2_glock *gl, int flags);
132 int (*go_demote_ok) (struct gfs2_glock *gl); 128 int (*go_demote_ok) (const struct gfs2_glock *gl);
133 int (*go_lock) (struct gfs2_holder *gh); 129 int (*go_lock) (struct gfs2_holder *gh);
134 void (*go_unlock) (struct gfs2_holder *gh); 130 void (*go_unlock) (struct gfs2_holder *gh);
135 int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); 131 int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
@@ -159,7 +155,6 @@ struct gfs2_holder {
159 155
160enum { 156enum {
161 GLF_LOCK = 1, 157 GLF_LOCK = 1,
162 GLF_STICKY = 2,
163 GLF_DEMOTE = 3, 158 GLF_DEMOTE = 3,
164 GLF_PENDING_DEMOTE = 4, 159 GLF_PENDING_DEMOTE = 4,
165 GLF_DEMOTE_IN_PROGRESS = 5, 160 GLF_DEMOTE_IN_PROGRESS = 5,
@@ -194,7 +189,7 @@ struct gfs2_glock {
194 unsigned long gl_tchange; 189 unsigned long gl_tchange;
195 void *gl_object; 190 void *gl_object;
196 191
197 struct list_head gl_reclaim; 192 struct list_head gl_lru;
198 193
199 struct gfs2_sbd *gl_sbd; 194 struct gfs2_sbd *gl_sbd;
200 195
@@ -233,29 +228,24 @@ enum {
233 GIF_USER = 4, /* user inode, not metadata addr space */ 228 GIF_USER = 4, /* user inode, not metadata addr space */
234}; 229};
235 230
236struct gfs2_dinode_host {
237 u64 di_size; /* number of bytes in file */
238 u64 di_generation; /* generation number for NFS */
239 u32 di_flags; /* GFS2_DIF_... */
240 /* These only apply to directories */
241 u32 di_entries; /* The number of entries in the directory */
242 u64 di_eattr; /* extended attribute block number */
243};
244 231
245struct gfs2_inode { 232struct gfs2_inode {
246 struct inode i_inode; 233 struct inode i_inode;
247 u64 i_no_addr; 234 u64 i_no_addr;
248 u64 i_no_formal_ino; 235 u64 i_no_formal_ino;
236 u64 i_generation;
237 u64 i_eattr;
238 loff_t i_disksize;
249 unsigned long i_flags; /* GIF_... */ 239 unsigned long i_flags; /* GIF_... */
250
251 struct gfs2_dinode_host i_di; /* To be replaced by ref to block */
252
253 struct gfs2_glock *i_gl; /* Move into i_gh? */ 240 struct gfs2_glock *i_gl; /* Move into i_gh? */
254 struct gfs2_holder i_iopen_gh; 241 struct gfs2_holder i_iopen_gh;
255 struct gfs2_holder i_gh; /* for prepare/commit_write only */ 242 struct gfs2_holder i_gh; /* for prepare/commit_write only */
256 struct gfs2_alloc *i_alloc; 243 struct gfs2_alloc *i_alloc;
257 u64 i_goal; /* goal block for allocations */ 244 u64 i_goal; /* goal block for allocations */
258 struct rw_semaphore i_rw_mutex; 245 struct rw_semaphore i_rw_mutex;
246 struct list_head i_trunc_list;
247 u32 i_entries;
248 u32 i_diskflags;
259 u8 i_height; 249 u8 i_height;
260 u8 i_depth; 250 u8 i_depth;
261}; 251};
@@ -406,13 +396,11 @@ struct gfs2_args {
406struct gfs2_tune { 396struct gfs2_tune {
407 spinlock_t gt_spin; 397 spinlock_t gt_spin;
408 398
409 unsigned int gt_demote_secs; /* Cache retention for unheld glock */
410 unsigned int gt_incore_log_blocks; 399 unsigned int gt_incore_log_blocks;
411 unsigned int gt_log_flush_secs; 400 unsigned int gt_log_flush_secs;
412 401
413 unsigned int gt_recoverd_secs; 402 unsigned int gt_recoverd_secs;
414 unsigned int gt_logd_secs; 403 unsigned int gt_logd_secs;
415 unsigned int gt_quotad_secs;
416 404
417 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ 405 unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
418 unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */ 406 unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */
@@ -488,10 +476,6 @@ struct gfs2_sbd {
488 /* Lock Stuff */ 476 /* Lock Stuff */
489 477
490 struct lm_lockstruct sd_lockstruct; 478 struct lm_lockstruct sd_lockstruct;
491 struct list_head sd_reclaim_list;
492 spinlock_t sd_reclaim_lock;
493 wait_queue_head_t sd_reclaim_wq;
494 atomic_t sd_reclaim_count;
495 struct gfs2_holder sd_live_gh; 479 struct gfs2_holder sd_live_gh;
496 struct gfs2_glock *sd_rename_gl; 480 struct gfs2_glock *sd_rename_gl;
497 struct gfs2_glock *sd_trans_gl; 481 struct gfs2_glock *sd_trans_gl;
@@ -519,7 +503,6 @@ struct gfs2_sbd {
519 spinlock_t sd_statfs_spin; 503 spinlock_t sd_statfs_spin;
520 struct gfs2_statfs_change_host sd_statfs_master; 504 struct gfs2_statfs_change_host sd_statfs_master;
521 struct gfs2_statfs_change_host sd_statfs_local; 505 struct gfs2_statfs_change_host sd_statfs_local;
522 unsigned long sd_statfs_sync_time;
523 506
524 /* Resource group stuff */ 507 /* Resource group stuff */
525 508
@@ -552,8 +535,6 @@ struct gfs2_sbd {
552 struct task_struct *sd_recoverd_process; 535 struct task_struct *sd_recoverd_process;
553 struct task_struct *sd_logd_process; 536 struct task_struct *sd_logd_process;
554 struct task_struct *sd_quotad_process; 537 struct task_struct *sd_quotad_process;
555 struct task_struct *sd_glockd_process[GFS2_GLOCKD_MAX];
556 unsigned int sd_glockd_num;
557 538
558 /* Quota stuff */ 539 /* Quota stuff */
559 540
@@ -561,13 +542,15 @@ struct gfs2_sbd {
561 atomic_t sd_quota_count; 542 atomic_t sd_quota_count;
562 spinlock_t sd_quota_spin; 543 spinlock_t sd_quota_spin;
563 struct mutex sd_quota_mutex; 544 struct mutex sd_quota_mutex;
545 wait_queue_head_t sd_quota_wait;
546 struct list_head sd_trunc_list;
547 spinlock_t sd_trunc_lock;
564 548
565 unsigned int sd_quota_slots; 549 unsigned int sd_quota_slots;
566 unsigned int sd_quota_chunks; 550 unsigned int sd_quota_chunks;
567 unsigned char **sd_quota_bitmap; 551 unsigned char **sd_quota_bitmap;
568 552
569 u64 sd_quota_sync_gen; 553 u64 sd_quota_sync_gen;
570 unsigned long sd_quota_sync_time;
571 554
572 /* Log stuff */ 555 /* Log stuff */
573 556
@@ -624,10 +607,6 @@ struct gfs2_sbd {
624 struct mutex sd_freeze_lock; 607 struct mutex sd_freeze_lock;
625 unsigned int sd_freeze_count; 608 unsigned int sd_freeze_count;
626 609
627 /* Counters */
628
629 atomic_t sd_reclaimed;
630
631 char sd_fsname[GFS2_FSNAME_LEN]; 610 char sd_fsname[GFS2_FSNAME_LEN];
632 char sd_table_name[GFS2_FSNAME_LEN]; 611 char sd_table_name[GFS2_FSNAME_LEN];
633 char sd_proto_name[GFS2_FSNAME_LEN]; 612 char sd_proto_name[GFS2_FSNAME_LEN];
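
The incore.h hunks above flatten struct gfs2_dinode_host into struct gfs2_inode itself: di_size, di_generation, di_eattr, di_flags and di_entries reappear as i_disksize, i_generation, i_eattr, i_diskflags and i_entries, and the intermediate i_di member ("To be replaced by ref to block") disappears. A minimal sketch of the access pattern this rename implies, assuming only the field names shown above (both helper functions are hypothetical):

static u64 size_before(const struct gfs2_inode *ip)
{
	return ip->i_di.di_size;	/* one extra hop through i_di */
}

static u64 size_after(const struct gfs2_inode *ip)
{
	return ip->i_disksize;		/* on-disk size held directly */
}

Every hunk below that touches ip->i_di.* is a mechanical instance of this substitution.
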
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index d57616840e89..3b87c188da41 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -32,7 +32,6 @@
32#include "log.h" 32#include "log.h"
33#include "meta_io.h" 33#include "meta_io.h"
34#include "ops_address.h" 34#include "ops_address.h"
35#include "ops_inode.h"
36#include "quota.h" 35#include "quota.h"
37#include "rgrp.h" 36#include "rgrp.h"
38#include "trans.h" 37#include "trans.h"
@@ -248,7 +247,6 @@ fail:
248 247
249static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) 248static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
250{ 249{
251 struct gfs2_dinode_host *di = &ip->i_di;
252 const struct gfs2_dinode *str = buf; 250 const struct gfs2_dinode *str = buf;
253 struct timespec atime; 251 struct timespec atime;
254 u16 height, depth; 252 u16 height, depth;
@@ -274,8 +272,8 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
274 * to do that. 272 * to do that.
275 */ 273 */
276 ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink); 274 ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
277 di->di_size = be64_to_cpu(str->di_size); 275 ip->i_disksize = be64_to_cpu(str->di_size);
278 i_size_write(&ip->i_inode, di->di_size); 276 i_size_write(&ip->i_inode, ip->i_disksize);
279 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); 277 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
280 atime.tv_sec = be64_to_cpu(str->di_atime); 278 atime.tv_sec = be64_to_cpu(str->di_atime);
281 atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); 279 atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
@@ -287,9 +285,9 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
287 ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec); 285 ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
288 286
289 ip->i_goal = be64_to_cpu(str->di_goal_meta); 287 ip->i_goal = be64_to_cpu(str->di_goal_meta);
290 di->di_generation = be64_to_cpu(str->di_generation); 288 ip->i_generation = be64_to_cpu(str->di_generation);
291 289
292 di->di_flags = be32_to_cpu(str->di_flags); 290 ip->i_diskflags = be32_to_cpu(str->di_flags);
293 gfs2_set_inode_flags(&ip->i_inode); 291 gfs2_set_inode_flags(&ip->i_inode);
294 height = be16_to_cpu(str->di_height); 292 height = be16_to_cpu(str->di_height);
295 if (unlikely(height > GFS2_MAX_META_HEIGHT)) 293 if (unlikely(height > GFS2_MAX_META_HEIGHT))
@@ -300,9 +298,9 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
300 if (unlikely(depth > GFS2_DIR_MAX_DEPTH)) 298 if (unlikely(depth > GFS2_DIR_MAX_DEPTH))
301 goto corrupt; 299 goto corrupt;
302 ip->i_depth = (u8)depth; 300 ip->i_depth = (u8)depth;
303 di->di_entries = be32_to_cpu(str->di_entries); 301 ip->i_entries = be32_to_cpu(str->di_entries);
304 302
305 di->di_eattr = be64_to_cpu(str->di_eattr); 303 ip->i_eattr = be64_to_cpu(str->di_eattr);
306 if (S_ISREG(ip->i_inode.i_mode)) 304 if (S_ISREG(ip->i_inode.i_mode))
307 gfs2_set_aops(&ip->i_inode); 305 gfs2_set_aops(&ip->i_inode);
308 306
@@ -388,7 +386,6 @@ int gfs2_dinode_dealloc(struct gfs2_inode *ip)
388 gfs2_free_di(rgd, ip); 386 gfs2_free_di(rgd, ip);
389 387
390 gfs2_trans_end(sdp); 388 gfs2_trans_end(sdp);
391 clear_bit(GLF_STICKY, &ip->i_gl->gl_flags);
392 389
393out_rg_gunlock: 390out_rg_gunlock:
394 gfs2_glock_dq_uninit(&al->al_rgd_gh); 391 gfs2_glock_dq_uninit(&al->al_rgd_gh);
@@ -690,7 +687,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
690 return error; 687 return error;
691 } 688 }
692 689
693 if (dip->i_di.di_entries == (u32)-1) 690 if (dip->i_entries == (u32)-1)
694 return -EFBIG; 691 return -EFBIG;
695 if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1) 692 if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1)
696 return -EMLINK; 693 return -EMLINK;
@@ -790,11 +787,11 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
790 di->di_flags = 0; 787 di->di_flags = 0;
791 788
792 if (S_ISREG(mode)) { 789 if (S_ISREG(mode)) {
793 if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) || 790 if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) ||
794 gfs2_tune_get(sdp, gt_new_files_jdata)) 791 gfs2_tune_get(sdp, gt_new_files_jdata))
795 di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); 792 di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
796 } else if (S_ISDIR(mode)) { 793 } else if (S_ISDIR(mode)) {
797 di->di_flags |= cpu_to_be32(dip->i_di.di_flags & 794 di->di_flags |= cpu_to_be32(dip->i_diskflags &
798 GFS2_DIF_INHERIT_JDATA); 795 GFS2_DIF_INHERIT_JDATA);
799 } 796 }
800 797
@@ -1068,7 +1065,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
1068 struct qstr dotname; 1065 struct qstr dotname;
1069 int error; 1066 int error;
1070 1067
1071 if (ip->i_di.di_entries != 2) { 1068 if (ip->i_entries != 2) {
1072 if (gfs2_consist_inode(ip)) 1069 if (gfs2_consist_inode(ip))
1073 gfs2_dinode_print(ip); 1070 gfs2_dinode_print(ip);
1074 return -EIO; 1071 return -EIO;
@@ -1168,7 +1165,7 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
1168 return error; 1165 return error;
1169 } 1166 }
1170 1167
1171 if (!ip->i_di.di_size) { 1168 if (!ip->i_disksize) {
1172 gfs2_consist_inode(ip); 1169 gfs2_consist_inode(ip);
1173 error = -EIO; 1170 error = -EIO;
1174 goto out; 1171 goto out;
@@ -1178,7 +1175,7 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
1178 if (error) 1175 if (error)
1179 goto out; 1176 goto out;
1180 1177
1181 x = ip->i_di.di_size + 1; 1178 x = ip->i_disksize + 1;
1182 if (x > *len) { 1179 if (x > *len) {
1183 *buf = kmalloc(x, GFP_NOFS); 1180 *buf = kmalloc(x, GFP_NOFS);
1184 if (!*buf) { 1181 if (!*buf) {
@@ -1242,7 +1239,6 @@ int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
1242 1239
1243void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) 1240void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
1244{ 1241{
1245 const struct gfs2_dinode_host *di = &ip->i_di;
1246 struct gfs2_dinode *str = buf; 1242 struct gfs2_dinode *str = buf;
1247 1243
1248 str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC); 1244 str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
@@ -1256,7 +1252,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
1256 str->di_uid = cpu_to_be32(ip->i_inode.i_uid); 1252 str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
1257 str->di_gid = cpu_to_be32(ip->i_inode.i_gid); 1253 str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
1258 str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); 1254 str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
1259 str->di_size = cpu_to_be64(di->di_size); 1255 str->di_size = cpu_to_be64(ip->i_disksize);
1260 str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); 1256 str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
1261 str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); 1257 str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
1262 str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); 1258 str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
@@ -1264,17 +1260,17 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
1264 1260
1265 str->di_goal_meta = cpu_to_be64(ip->i_goal); 1261 str->di_goal_meta = cpu_to_be64(ip->i_goal);
1266 str->di_goal_data = cpu_to_be64(ip->i_goal); 1262 str->di_goal_data = cpu_to_be64(ip->i_goal);
1267 str->di_generation = cpu_to_be64(di->di_generation); 1263 str->di_generation = cpu_to_be64(ip->i_generation);
1268 1264
1269 str->di_flags = cpu_to_be32(di->di_flags); 1265 str->di_flags = cpu_to_be32(ip->i_diskflags);
1270 str->di_height = cpu_to_be16(ip->i_height); 1266 str->di_height = cpu_to_be16(ip->i_height);
1271 str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) && 1267 str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) &&
1272 !(ip->i_di.di_flags & GFS2_DIF_EXHASH) ? 1268 !(ip->i_diskflags & GFS2_DIF_EXHASH) ?
1273 GFS2_FORMAT_DE : 0); 1269 GFS2_FORMAT_DE : 0);
1274 str->di_depth = cpu_to_be16(ip->i_depth); 1270 str->di_depth = cpu_to_be16(ip->i_depth);
1275 str->di_entries = cpu_to_be32(di->di_entries); 1271 str->di_entries = cpu_to_be32(ip->i_entries);
1276 1272
1277 str->di_eattr = cpu_to_be64(di->di_eattr); 1273 str->di_eattr = cpu_to_be64(ip->i_eattr);
1278 str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec); 1274 str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec);
1279 str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec); 1275 str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec);
1280 str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec); 1276 str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec);
@@ -1282,22 +1278,21 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
1282 1278
1283void gfs2_dinode_print(const struct gfs2_inode *ip) 1279void gfs2_dinode_print(const struct gfs2_inode *ip)
1284{ 1280{
1285 const struct gfs2_dinode_host *di = &ip->i_di;
1286
1287 printk(KERN_INFO " no_formal_ino = %llu\n", 1281 printk(KERN_INFO " no_formal_ino = %llu\n",
1288 (unsigned long long)ip->i_no_formal_ino); 1282 (unsigned long long)ip->i_no_formal_ino);
1289 printk(KERN_INFO " no_addr = %llu\n", 1283 printk(KERN_INFO " no_addr = %llu\n",
1290 (unsigned long long)ip->i_no_addr); 1284 (unsigned long long)ip->i_no_addr);
1291 printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size); 1285 printk(KERN_INFO " i_disksize = %llu\n",
1286 (unsigned long long)ip->i_disksize);
1292 printk(KERN_INFO " blocks = %llu\n", 1287 printk(KERN_INFO " blocks = %llu\n",
1293 (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode)); 1288 (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode));
1294 printk(KERN_INFO " i_goal = %llu\n", 1289 printk(KERN_INFO " i_goal = %llu\n",
1295 (unsigned long long)ip->i_goal); 1290 (unsigned long long)ip->i_goal);
1296 printk(KERN_INFO " di_flags = 0x%.8X\n", di->di_flags); 1291 printk(KERN_INFO " i_diskflags = 0x%.8X\n", ip->i_diskflags);
1297 printk(KERN_INFO " i_height = %u\n", ip->i_height); 1292 printk(KERN_INFO " i_height = %u\n", ip->i_height);
1298 printk(KERN_INFO " i_depth = %u\n", ip->i_depth); 1293 printk(KERN_INFO " i_depth = %u\n", ip->i_depth);
1299 printk(KERN_INFO " di_entries = %u\n", di->di_entries); 1294 printk(KERN_INFO " i_entries = %u\n", ip->i_entries);
1300 printk(KERN_INFO " di_eattr = %llu\n", 1295 printk(KERN_INFO " i_eattr = %llu\n",
1301 (unsigned long long)di->di_eattr); 1296 (unsigned long long)ip->i_eattr);
1302} 1297}
1303 1298
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 2d43f69610a0..d5329364cdff 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -10,6 +10,7 @@
10#ifndef __INODE_DOT_H__ 10#ifndef __INODE_DOT_H__
11#define __INODE_DOT_H__ 11#define __INODE_DOT_H__
12 12
13#include <linux/fs.h>
13#include "util.h" 14#include "util.h"
14 15
15static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) 16static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
@@ -19,7 +20,7 @@ static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
19 20
20static inline int gfs2_is_jdata(const struct gfs2_inode *ip) 21static inline int gfs2_is_jdata(const struct gfs2_inode *ip)
21{ 22{
22 return ip->i_di.di_flags & GFS2_DIF_JDATA; 23 return ip->i_diskflags & GFS2_DIF_JDATA;
23} 24}
24 25
25static inline int gfs2_is_writeback(const struct gfs2_inode *ip) 26static inline int gfs2_is_writeback(const struct gfs2_inode *ip)
@@ -97,5 +98,15 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
97void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); 98void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
98void gfs2_dinode_print(const struct gfs2_inode *ip); 99void gfs2_dinode_print(const struct gfs2_inode *ip);
99 100
101extern const struct inode_operations gfs2_file_iops;
102extern const struct inode_operations gfs2_dir_iops;
103extern const struct inode_operations gfs2_symlink_iops;
104extern const struct file_operations gfs2_file_fops;
105extern const struct file_operations gfs2_dir_fops;
106extern const struct file_operations gfs2_file_fops_nolock;
107extern const struct file_operations gfs2_dir_fops_nolock;
108
109extern void gfs2_set_inode_flags(struct inode *inode);
110
100#endif /* __INODE_DOT_H__ */ 111#endif /* __INODE_DOT_H__ */
101 112
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 0c4cbe6c8285..1aa7eb6a0226 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -194,17 +194,25 @@ out:
194static void gdlm_recovery_done(void *lockspace, unsigned int jid, 194static void gdlm_recovery_done(void *lockspace, unsigned int jid,
195 unsigned int message) 195 unsigned int message)
196{ 196{
197 char env_jid[20];
198 char env_status[20];
199 char *envp[] = { env_jid, env_status, NULL };
197 struct gdlm_ls *ls = lockspace; 200 struct gdlm_ls *ls = lockspace;
198 ls->recover_jid_done = jid; 201 ls->recover_jid_done = jid;
199 ls->recover_jid_status = message; 202 ls->recover_jid_status = message;
200 kobject_uevent(&ls->kobj, KOBJ_CHANGE); 203 sprintf(env_jid, "JID=%d", jid);
204 sprintf(env_status, "RECOVERY=%s",
205 message == LM_RD_SUCCESS ? "Done" : "Failed");
206 kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp);
201} 207}
202 208
203static void gdlm_others_may_mount(void *lockspace) 209static void gdlm_others_may_mount(void *lockspace)
204{ 210{
211 char *message = "FIRSTMOUNT=Done";
212 char *envp[] = { message, NULL };
205 struct gdlm_ls *ls = lockspace; 213 struct gdlm_ls *ls = lockspace;
206 ls->first_done = 1; 214 ls->first_done = 1;
207 kobject_uevent(&ls->kobj, KOBJ_CHANGE); 215 kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp);
208} 216}
209 217
210/* Userspace gets the offline uevent, blocks new gfs locks on 218/* Userspace gets the offline uevent, blocks new gfs locks on
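
The mount.c hunk above moves lock_dlm from bare kobject_uevent() to environment-carrying notifications: kobject_uevent_env() attaches per-event JID/RECOVERY/FIRSTMOUNT variables so userspace (udev rules, gfs_controld) can match on them instead of re-reading sysfs. A hedged sketch of the per-event pattern, with illustrative names outside any GFS2 specifics:

#include <linux/kobject.h>

static void notify_recovery(struct kobject *kobj, unsigned int jid, bool ok)
{
	char env_jid[20];
	char env_status[24];
	char *envp[] = { env_jid, env_status, NULL };	/* NULL-terminated */

	snprintf(env_jid, sizeof(env_jid), "JID=%u", jid);
	snprintf(env_status, sizeof(env_status),
		 "RECOVERY=%s", ok ? "Done" : "Failed");
	kobject_uevent_env(kobj, KOBJ_CHANGE, envp);
}

The sysfs.c hunk below handles the kset-wide half: a struct kset_uevent_ops whose .uevent callback calls add_uevent_var() stamps LOCKTABLE and LOCKPROTO onto every uevent from the kset, so individual call sites never have to repeat them.
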
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index 4ec571c3d8a9..9b7edcf7bd49 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -195,9 +195,23 @@ void gdlm_kobject_release(struct gdlm_ls *ls)
195 kobject_put(&ls->kobj); 195 kobject_put(&ls->kobj);
196} 196}
197 197
198static int gdlm_uevent(struct kset *kset, struct kobject *kobj,
199 struct kobj_uevent_env *env)
200{
201 struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
202 add_uevent_var(env, "LOCKTABLE=%s:%s", ls->clustername, ls->fsname);
203 add_uevent_var(env, "LOCKPROTO=lock_dlm");
204 return 0;
205}
206
207static struct kset_uevent_ops gdlm_uevent_ops = {
208 .uevent = gdlm_uevent,
209};
210
211
198int gdlm_sysfs_init(void) 212int gdlm_sysfs_init(void)
199{ 213{
200 gdlm_kset = kset_create_and_add("lock_dlm", NULL, kernel_kobj); 214 gdlm_kset = kset_create_and_add("lock_dlm", &gdlm_uevent_ops, kernel_kobj);
201 if (!gdlm_kset) { 215 if (!gdlm_kset) {
202 printk(KERN_WARNING "%s: can not create kset\n", __func__); 216 printk(KERN_WARNING "%s: can not create kset\n", __func__);
203 return -ENOMEM; 217 return -ENOMEM;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index bb2cc303ac29..7cacfde32194 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -19,7 +19,7 @@
19 19
20#include "gfs2.h" 20#include "gfs2.h"
21#include "incore.h" 21#include "incore.h"
22#include "ops_fstype.h" 22#include "super.h"
23#include "sys.h" 23#include "sys.h"
24#include "util.h" 24#include "util.h"
25#include "glock.h" 25#include "glock.h"
@@ -30,6 +30,7 @@ static void gfs2_init_inode_once(void *foo)
30 30
31 inode_init_once(&ip->i_inode); 31 inode_init_once(&ip->i_inode);
32 init_rwsem(&ip->i_rw_mutex); 32 init_rwsem(&ip->i_rw_mutex);
33 INIT_LIST_HEAD(&ip->i_trunc_list);
33 ip->i_alloc = NULL; 34 ip->i_alloc = NULL;
34} 35}
35 36
@@ -42,7 +43,7 @@ static void gfs2_init_glock_once(void *foo)
42 INIT_LIST_HEAD(&gl->gl_holders); 43 INIT_LIST_HEAD(&gl->gl_holders);
43 gl->gl_lvb = NULL; 44 gl->gl_lvb = NULL;
44 atomic_set(&gl->gl_lvb_count, 0); 45 atomic_set(&gl->gl_lvb_count, 0);
45 INIT_LIST_HEAD(&gl->gl_reclaim); 46 INIT_LIST_HEAD(&gl->gl_lru);
46 INIT_LIST_HEAD(&gl->gl_ail_list); 47 INIT_LIST_HEAD(&gl->gl_ail_list);
47 atomic_set(&gl->gl_ail_count, 0); 48 atomic_set(&gl->gl_ail_count, 0);
48} 49}
@@ -93,6 +94,12 @@ static int __init init_gfs2_fs(void)
93 if (!gfs2_rgrpd_cachep) 94 if (!gfs2_rgrpd_cachep)
94 goto fail; 95 goto fail;
95 96
97 gfs2_quotad_cachep = kmem_cache_create("gfs2_quotad",
98 sizeof(struct gfs2_quota_data),
99 0, 0, NULL);
100 if (!gfs2_quotad_cachep)
101 goto fail;
102
96 error = register_filesystem(&gfs2_fs_type); 103 error = register_filesystem(&gfs2_fs_type);
97 if (error) 104 if (error)
98 goto fail; 105 goto fail;
@@ -112,6 +119,9 @@ fail_unregister:
112fail: 119fail:
113 gfs2_glock_exit(); 120 gfs2_glock_exit();
114 121
122 if (gfs2_quotad_cachep)
123 kmem_cache_destroy(gfs2_quotad_cachep);
124
115 if (gfs2_rgrpd_cachep) 125 if (gfs2_rgrpd_cachep)
116 kmem_cache_destroy(gfs2_rgrpd_cachep); 126 kmem_cache_destroy(gfs2_rgrpd_cachep);
117 127
@@ -140,6 +150,7 @@ static void __exit exit_gfs2_fs(void)
140 unregister_filesystem(&gfs2_fs_type); 150 unregister_filesystem(&gfs2_fs_type);
141 unregister_filesystem(&gfs2meta_fs_type); 151 unregister_filesystem(&gfs2meta_fs_type);
142 152
153 kmem_cache_destroy(gfs2_quotad_cachep);
143 kmem_cache_destroy(gfs2_rgrpd_cachep); 154 kmem_cache_destroy(gfs2_rgrpd_cachep);
144 kmem_cache_destroy(gfs2_bufdata_cachep); 155 kmem_cache_destroy(gfs2_bufdata_cachep);
145 kmem_cache_destroy(gfs2_inode_cachep); 156 kmem_cache_destroy(gfs2_inode_cachep);
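
main.c now gives struct gfs2_quota_data a dedicated slab cache, created during module init and destroyed both on the init failure path and at module exit. The lifecycle is the standard kmem_cache idiom; a self-contained sketch with stand-in names:

#include <linux/module.h>
#include <linux/slab.h>

struct qd_example {		/* stand-in for gfs2_quota_data */
	int dummy;
};

static struct kmem_cache *qd_cachep;

static int __init qd_example_init(void)
{
	qd_cachep = kmem_cache_create("qd_example",
				      sizeof(struct qd_example),
				      0, 0, NULL);	/* no constructor */
	return qd_cachep ? 0 : -ENOMEM;
}

static void __exit qd_example_exit(void)
{
	/* Legal only once every object has been returned to the cache. */
	kmem_cache_destroy(qd_cachep);
}

module_init(qd_example_init);
module_exit(qd_example_exit);

Note that the failure path in the hunk guards kmem_cache_destroy() with an if: on kernels of this era the function was not NULL-safe, so caches that may never have been created must be checked first.
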
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index f96eb90a2cfa..3cb0a44ba023 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -32,7 +32,6 @@ enum {
32 Opt_debug, 32 Opt_debug,
33 Opt_nodebug, 33 Opt_nodebug,
34 Opt_upgrade, 34 Opt_upgrade,
35 Opt_num_glockd,
36 Opt_acl, 35 Opt_acl,
37 Opt_noacl, 36 Opt_noacl,
38 Opt_quota_off, 37 Opt_quota_off,
@@ -57,7 +56,6 @@ static const match_table_t tokens = {
57 {Opt_debug, "debug"}, 56 {Opt_debug, "debug"},
58 {Opt_nodebug, "nodebug"}, 57 {Opt_nodebug, "nodebug"},
59 {Opt_upgrade, "upgrade"}, 58 {Opt_upgrade, "upgrade"},
60 {Opt_num_glockd, "num_glockd=%d"},
61 {Opt_acl, "acl"}, 59 {Opt_acl, "acl"},
62 {Opt_noacl, "noacl"}, 60 {Opt_noacl, "noacl"},
63 {Opt_quota_off, "quota=off"}, 61 {Opt_quota_off, "quota=off"},
@@ -87,16 +85,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
87 int error = 0; 85 int error = 0;
88 86
89 if (!remount) { 87 if (!remount) {
90 /* If someone preloaded options, use those instead */
91 spin_lock(&gfs2_sys_margs_lock);
92 if (gfs2_sys_margs) {
93 data = gfs2_sys_margs;
94 gfs2_sys_margs = NULL;
95 }
96 spin_unlock(&gfs2_sys_margs_lock);
97
98 /* Set some defaults */ 88 /* Set some defaults */
99 args->ar_num_glockd = GFS2_GLOCKD_DEFAULT;
100 args->ar_quota = GFS2_QUOTA_DEFAULT; 89 args->ar_quota = GFS2_QUOTA_DEFAULT;
101 args->ar_data = GFS2_DATA_DEFAULT; 90 args->ar_data = GFS2_DATA_DEFAULT;
102 } 91 }
@@ -105,7 +94,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
105 process them */ 94 process them */
106 95
107 for (options = data; (o = strsep(&options, ",")); ) { 96 for (options = data; (o = strsep(&options, ",")); ) {
108 int token, option; 97 int token;
109 substring_t tmp[MAX_OPT_ARGS]; 98 substring_t tmp[MAX_OPT_ARGS];
110 99
111 if (!*o) 100 if (!*o)
@@ -196,22 +185,6 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
196 goto cant_remount; 185 goto cant_remount;
197 args->ar_upgrade = 1; 186 args->ar_upgrade = 1;
198 break; 187 break;
199 case Opt_num_glockd:
200 if ((error = match_int(&tmp[0], &option))) {
201 fs_info(sdp, "problem getting num_glockd\n");
202 goto out_error;
203 }
204
205 if (remount && option != args->ar_num_glockd)
206 goto cant_remount;
207 if (!option || option > GFS2_GLOCKD_MAX) {
208 fs_info(sdp, "0 < num_glockd <= %u (not %u)\n",
209 GFS2_GLOCKD_MAX, option);
210 error = -EINVAL;
211 goto out_error;
212 }
213 args->ar_num_glockd = option;
214 break;
215 case Opt_acl: 188 case Opt_acl:
216 args->ar_posix_acl = 1; 189 args->ar_posix_acl = 1;
217 sdp->sd_vfs->s_flags |= MS_POSIXACL; 190 sdp->sd_vfs->s_flags |= MS_POSIXACL;
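
With num_glockd removed, the token table and the parse loop both shrink (the now-unused `option` int goes away too), but the parsing idiom is unchanged: strsep() walks the comma-separated mount string and match_token() classifies each fragment against the match_table_t. A minimal sketch of that idiom, with illustrative tokens:

#include <linux/parser.h>
#include <linux/string.h>

enum { Opt_debug, Opt_err };

static const match_table_t example_tokens = {
	{Opt_debug, "debug"},
	{Opt_err, NULL}		/* sentinel: anything unmatched */
};

static int parse_example(char *data, int *debug)
{
	substring_t args[MAX_OPT_ARGS];
	char *o;

	while ((o = strsep(&data, ",")) != NULL) {
		if (!*o)
			continue;	/* skip empty fragments */
		switch (match_token(o, example_tokens, args)) {
		case Opt_debug:
			*debug = 1;
			break;
		default:
			return -EINVAL;
		}
	}
	return 0;
}
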
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 27563816e1c5..4ddab67867eb 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -210,25 +210,23 @@ static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc
210{ 210{
211 struct inode *inode = page->mapping->host; 211 struct inode *inode = page->mapping->host;
212 struct gfs2_sbd *sdp = GFS2_SB(inode); 212 struct gfs2_sbd *sdp = GFS2_SB(inode);
213 int error; 213 int ret;
214 int done_trans = 0; 214 int done_trans = 0;
215 215
216 error = gfs2_writepage_common(page, wbc);
217 if (error <= 0)
218 return error;
219
220 if (PageChecked(page)) { 216 if (PageChecked(page)) {
221 if (wbc->sync_mode != WB_SYNC_ALL) 217 if (wbc->sync_mode != WB_SYNC_ALL)
222 goto out_ignore; 218 goto out_ignore;
223 error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0); 219 ret = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
224 if (error) 220 if (ret)
225 goto out_ignore; 221 goto out_ignore;
226 done_trans = 1; 222 done_trans = 1;
227 } 223 }
228 error = __gfs2_jdata_writepage(page, wbc); 224 ret = gfs2_writepage_common(page, wbc);
225 if (ret > 0)
226 ret = __gfs2_jdata_writepage(page, wbc);
229 if (done_trans) 227 if (done_trans)
230 gfs2_trans_end(sdp); 228 gfs2_trans_end(sdp);
231 return error; 229 return ret;
232 230
233out_ignore: 231out_ignore:
234 redirty_page_for_writepage(wbc, page); 232 redirty_page_for_writepage(wbc, page);
@@ -453,8 +451,8 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
453 451
454 kaddr = kmap_atomic(page, KM_USER0); 452 kaddr = kmap_atomic(page, KM_USER0);
455 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), 453 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
456 ip->i_di.di_size); 454 ip->i_disksize);
457 memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size); 455 memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize);
458 kunmap_atomic(kaddr, KM_USER0); 456 kunmap_atomic(kaddr, KM_USER0);
459 flush_dcache_page(page); 457 flush_dcache_page(page);
460 brelse(dibh); 458 brelse(dibh);
@@ -627,7 +625,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
627{ 625{
628 struct gfs2_inode *ip = GFS2_I(mapping->host); 626 struct gfs2_inode *ip = GFS2_I(mapping->host);
629 struct gfs2_sbd *sdp = GFS2_SB(mapping->host); 627 struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
630 unsigned int data_blocks, ind_blocks, rblocks; 628 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
631 int alloc_required; 629 int alloc_required;
632 int error = 0; 630 int error = 0;
633 struct gfs2_alloc *al; 631 struct gfs2_alloc *al;
@@ -641,11 +639,13 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
641 if (unlikely(error)) 639 if (unlikely(error))
642 goto out_uninit; 640 goto out_uninit;
643 641
644 gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
645 error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); 642 error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
646 if (error) 643 if (error)
647 goto out_unlock; 644 goto out_unlock;
648 645
646 if (alloc_required || gfs2_is_jdata(ip))
647 gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
648
649 if (alloc_required) { 649 if (alloc_required) {
650 al = gfs2_alloc_get(ip); 650 al = gfs2_alloc_get(ip);
651 if (!al) { 651 if (!al) {
@@ -675,7 +675,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
675 goto out_trans_fail; 675 goto out_trans_fail;
676 676
677 error = -ENOMEM; 677 error = -ENOMEM;
678 page = __grab_cache_page(mapping, index); 678 flags |= AOP_FLAG_NOFS;
679 page = grab_cache_page_write_begin(mapping, index, flags);
679 *pagep = page; 680 *pagep = page;
680 if (unlikely(!page)) 681 if (unlikely(!page))
681 goto out_endtrans; 682 goto out_endtrans;
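
Two related changes in this hunk: __grab_cache_page() is gone in 2.6.29, replaced by grab_cache_page_write_begin(), which accepts the write_begin flags; and OR-ing AOP_FLAG_NOFS into those flags makes the page allocation use GFP_NOFS, so it cannot recurse into the filesystem while the transaction opened just above is still pending. The call shape, as a sketch:

#include <linux/fs.h>
#include <linux/pagemap.h>

static struct page *get_write_page(struct address_space *mapping,
				   pgoff_t index, unsigned int flags)
{
	/* NOFS: the caller holds an open transaction; the allocation
	 * must not re-enter the filesystem and deadlock. */
	flags |= AOP_FLAG_NOFS;
	return grab_cache_page_write_begin(mapping, index, flags);
}
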
@@ -782,7 +783,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
782 783
783 if (inode->i_size < to) { 784 if (inode->i_size < to) {
784 i_size_write(inode, to); 785 i_size_write(inode, to);
785 ip->i_di.di_size = inode->i_size; 786 ip->i_disksize = inode->i_size;
786 di->di_size = cpu_to_be64(inode->i_size); 787 di->di_size = cpu_to_be64(inode->i_size);
787 mark_inode_dirty(inode); 788 mark_inode_dirty(inode);
788 } 789 }
@@ -847,9 +848,9 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
847 848
848 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); 849 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
849 850
850 if (likely(ret >= 0) && (inode->i_size > ip->i_di.di_size)) { 851 if (likely(ret >= 0) && (inode->i_size > ip->i_disksize)) {
851 di = (struct gfs2_dinode *)dibh->b_data; 852 di = (struct gfs2_dinode *)dibh->b_data;
852 ip->i_di.di_size = inode->i_size; 853 ip->i_disksize = inode->i_size;
853 di->di_size = cpu_to_be64(inode->i_size); 854 di->di_size = cpu_to_be64(inode->i_size);
854 mark_inode_dirty(inode); 855 mark_inode_dirty(inode);
855 } 856 }
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
index 4a5e676b4420..c2ad36330ca3 100644
--- a/fs/gfs2/ops_dentry.c
+++ b/fs/gfs2/ops_dentry.c
@@ -19,7 +19,7 @@
19#include "incore.h" 19#include "incore.h"
20#include "dir.h" 20#include "dir.h"
21#include "glock.h" 21#include "glock.h"
22#include "ops_dentry.h" 22#include "super.h"
23#include "util.h" 23#include "util.h"
24#include "inode.h" 24#include "inode.h"
25 25
diff --git a/fs/gfs2/ops_dentry.h b/fs/gfs2/ops_dentry.h
deleted file mode 100644
index 5caa3db4d3f5..000000000000
--- a/fs/gfs2/ops_dentry.h
+++ /dev/null
@@ -1,17 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_DENTRY_DOT_H__
11#define __OPS_DENTRY_DOT_H__
12
13#include <linux/dcache.h>
14
15extern struct dentry_operations gfs2_dops;
16
17#endif /* __OPS_DENTRY_DOT_H__ */
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index bbb8c36403a9..7fdeb14ddd1a 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -22,8 +22,7 @@
22#include "glock.h" 22#include "glock.h"
23#include "glops.h" 23#include "glops.h"
24#include "inode.h" 24#include "inode.h"
25#include "ops_dentry.h" 25#include "super.h"
26#include "ops_fstype.h"
27#include "rgrp.h" 26#include "rgrp.h"
28#include "util.h" 27#include "util.h"
29 28
@@ -214,7 +213,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
214 } 213 }
215 214
216 error = -EIO; 215 error = -EIO;
217 if (GFS2_I(inode)->i_di.di_flags & GFS2_DIF_SYSTEM) { 216 if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) {
218 iput(inode); 217 iput(inode);
219 goto fail; 218 goto fail;
220 } 219 }
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 3a747f8e2188..93fe41b67f97 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -39,7 +39,6 @@
39#include "util.h" 39#include "util.h"
40#include "eaops.h" 40#include "eaops.h"
41#include "ops_address.h" 41#include "ops_address.h"
42#include "ops_inode.h"
43 42
44/** 43/**
45 * gfs2_llseek - seek to a location in a file 44 * gfs2_llseek - seek to a location in a file
@@ -158,8 +157,8 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
158 if (error) 157 if (error)
159 return error; 158 return error;
160 159
161 fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags); 160 fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags);
162 if (!S_ISDIR(inode->i_mode) && ip->i_di.di_flags & GFS2_DIF_JDATA) 161 if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA)
163 fsflags |= FS_JOURNAL_DATA_FL; 162 fsflags |= FS_JOURNAL_DATA_FL;
164 if (put_user(fsflags, ptr)) 163 if (put_user(fsflags, ptr))
165 error = -EFAULT; 164 error = -EFAULT;
@@ -172,17 +171,16 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
172void gfs2_set_inode_flags(struct inode *inode) 171void gfs2_set_inode_flags(struct inode *inode)
173{ 172{
174 struct gfs2_inode *ip = GFS2_I(inode); 173 struct gfs2_inode *ip = GFS2_I(inode);
175 struct gfs2_dinode_host *di = &ip->i_di;
176 unsigned int flags = inode->i_flags; 174 unsigned int flags = inode->i_flags;
177 175
178 flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); 176 flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
179 if (di->di_flags & GFS2_DIF_IMMUTABLE) 177 if (ip->i_diskflags & GFS2_DIF_IMMUTABLE)
180 flags |= S_IMMUTABLE; 178 flags |= S_IMMUTABLE;
181 if (di->di_flags & GFS2_DIF_APPENDONLY) 179 if (ip->i_diskflags & GFS2_DIF_APPENDONLY)
182 flags |= S_APPEND; 180 flags |= S_APPEND;
183 if (di->di_flags & GFS2_DIF_NOATIME) 181 if (ip->i_diskflags & GFS2_DIF_NOATIME)
184 flags |= S_NOATIME; 182 flags |= S_NOATIME;
185 if (di->di_flags & GFS2_DIF_SYNC) 183 if (ip->i_diskflags & GFS2_DIF_SYNC)
186 flags |= S_SYNC; 184 flags |= S_SYNC;
187 inode->i_flags = flags; 185 inode->i_flags = flags;
188} 186}
@@ -221,7 +219,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
221 if (error) 219 if (error)
222 goto out_drop_write; 220 goto out_drop_write;
223 221
224 flags = ip->i_di.di_flags; 222 flags = ip->i_diskflags;
225 new_flags = (flags & ~mask) | (reqflags & mask); 223 new_flags = (flags & ~mask) | (reqflags & mask);
226 if ((new_flags ^ flags) == 0) 224 if ((new_flags ^ flags) == 0)
227 goto out; 225 goto out;
@@ -260,7 +258,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
260 if (error) 258 if (error)
261 goto out_trans_end; 259 goto out_trans_end;
262 gfs2_trans_add_bh(ip->i_gl, bh, 1); 260 gfs2_trans_add_bh(ip->i_gl, bh, 1);
263 ip->i_di.di_flags = new_flags; 261 ip->i_diskflags = new_flags;
264 gfs2_dinode_out(ip, bh->b_data); 262 gfs2_dinode_out(ip, bh->b_data);
265 brelse(bh); 263 brelse(bh);
266 gfs2_set_inode_flags(inode); 264 gfs2_set_inode_flags(inode);
@@ -344,7 +342,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
344 struct gfs2_inode *ip = GFS2_I(inode); 342 struct gfs2_inode *ip = GFS2_I(inode);
345 struct gfs2_sbd *sdp = GFS2_SB(inode); 343 struct gfs2_sbd *sdp = GFS2_SB(inode);
346 unsigned long last_index; 344 unsigned long last_index;
347 u64 pos = page->index << (PAGE_CACHE_SIZE - inode->i_blkbits); 345 u64 pos = page->index << PAGE_CACHE_SHIFT;
348 unsigned int data_blocks, ind_blocks, rblocks; 346 unsigned int data_blocks, ind_blocks, rblocks;
349 int alloc_required = 0; 347 int alloc_required = 0;
350 struct gfs2_holder gh; 348 struct gfs2_holder gh;
@@ -357,7 +355,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
357 goto out; 355 goto out;
358 356
359 set_bit(GIF_SW_PAGED, &ip->i_flags); 357 set_bit(GIF_SW_PAGED, &ip->i_flags);
360 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
361 ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required); 358 ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required);
362 if (ret || !alloc_required) 359 if (ret || !alloc_required)
363 goto out_unlock; 360 goto out_unlock;
@@ -369,6 +366,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
369 ret = gfs2_quota_lock_check(ip); 366 ret = gfs2_quota_lock_check(ip);
370 if (ret) 367 if (ret)
371 goto out_alloc_put; 368 goto out_alloc_put;
369 gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
372 al->al_requested = data_blocks + ind_blocks; 370 al->al_requested = data_blocks + ind_blocks;
373 ret = gfs2_inplace_reserve(ip); 371 ret = gfs2_inplace_reserve(ip);
374 if (ret) 372 if (ret)
@@ -479,7 +477,7 @@ static int gfs2_open(struct inode *inode, struct file *file)
479 goto fail; 477 goto fail;
480 478
481 if (!(file->f_flags & O_LARGEFILE) && 479 if (!(file->f_flags & O_LARGEFILE) &&
482 ip->i_di.di_size > MAX_NON_LFS) { 480 ip->i_disksize > MAX_NON_LFS) {
483 error = -EOVERFLOW; 481 error = -EOVERFLOW;
484 goto fail_gunlock; 482 goto fail_gunlock;
485 } 483 }
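
Among the ops_file.c hunks, note the gfs2_page_mkwrite() fix: the old byte offset `page->index << (PAGE_CACHE_SIZE - inode->i_blkbits)` used the page size (4096) where a shift count was needed, producing an absurd, undefined shift; the replacement shifts by PAGE_CACHE_SHIFT. A worked check of the corrected formula (values assume 4 KiB pages):

#include <linux/pagemap.h>
#include <linux/types.h>

static u64 page_byte_offset(pgoff_t index)
{
	/* PAGE_CACHE_SHIFT == 12 for 4 KiB pages,
	 * so index 3 maps to byte offset 3 << 12 == 12288. */
	return (u64)index << PAGE_CACHE_SHIFT;
}

The related reorder moves gfs2_write_calc_reserv() until after alloc_required is known, so the reservation arithmetic only runs when blocks will actually be allocated.
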
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b117fcf2c4f5..f91eebdde581 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -22,20 +22,18 @@
22#include "gfs2.h" 22#include "gfs2.h"
23#include "incore.h" 23#include "incore.h"
24#include "bmap.h" 24#include "bmap.h"
25#include "daemon.h"
26#include "glock.h" 25#include "glock.h"
27#include "glops.h" 26#include "glops.h"
28#include "inode.h" 27#include "inode.h"
29#include "mount.h" 28#include "mount.h"
30#include "ops_fstype.h"
31#include "ops_dentry.h"
32#include "ops_super.h"
33#include "recovery.h" 29#include "recovery.h"
34#include "rgrp.h" 30#include "rgrp.h"
35#include "super.h" 31#include "super.h"
36#include "sys.h" 32#include "sys.h"
37#include "util.h" 33#include "util.h"
38#include "log.h" 34#include "log.h"
35#include "quota.h"
36#include "dir.h"
39 37
40#define DO 0 38#define DO 0
41#define UNDO 1 39#define UNDO 1
@@ -58,12 +56,10 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
58{ 56{
59 spin_lock_init(&gt->gt_spin); 57 spin_lock_init(&gt->gt_spin);
60 58
61 gt->gt_demote_secs = 300;
62 gt->gt_incore_log_blocks = 1024; 59 gt->gt_incore_log_blocks = 1024;
63 gt->gt_log_flush_secs = 60; 60 gt->gt_log_flush_secs = 60;
64 gt->gt_recoverd_secs = 60; 61 gt->gt_recoverd_secs = 60;
65 gt->gt_logd_secs = 1; 62 gt->gt_logd_secs = 1;
66 gt->gt_quotad_secs = 5;
67 gt->gt_quota_simul_sync = 64; 63 gt->gt_quota_simul_sync = 64;
68 gt->gt_quota_warn_period = 10; 64 gt->gt_quota_warn_period = 10;
69 gt->gt_quota_scale_num = 1; 65 gt->gt_quota_scale_num = 1;
@@ -91,10 +87,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
91 87
92 gfs2_tune_init(&sdp->sd_tune); 88 gfs2_tune_init(&sdp->sd_tune);
93 89
94 INIT_LIST_HEAD(&sdp->sd_reclaim_list);
95 spin_lock_init(&sdp->sd_reclaim_lock);
96 init_waitqueue_head(&sdp->sd_reclaim_wq);
97
98 mutex_init(&sdp->sd_inum_mutex); 90 mutex_init(&sdp->sd_inum_mutex);
99 spin_lock_init(&sdp->sd_statfs_spin); 91 spin_lock_init(&sdp->sd_statfs_spin);
100 92
@@ -110,6 +102,9 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
110 INIT_LIST_HEAD(&sdp->sd_quota_list); 102 INIT_LIST_HEAD(&sdp->sd_quota_list);
111 spin_lock_init(&sdp->sd_quota_spin); 103 spin_lock_init(&sdp->sd_quota_spin);
112 mutex_init(&sdp->sd_quota_mutex); 104 mutex_init(&sdp->sd_quota_mutex);
105 init_waitqueue_head(&sdp->sd_quota_wait);
106 INIT_LIST_HEAD(&sdp->sd_trunc_list);
107 spin_lock_init(&sdp->sd_trunc_lock);
113 108
114 spin_lock_init(&sdp->sd_log_lock); 109 spin_lock_init(&sdp->sd_log_lock);
115 110
@@ -443,24 +438,11 @@ out:
443static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, 438static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
444 int undo) 439 int undo)
445{ 440{
446 struct task_struct *p;
447 int error = 0; 441 int error = 0;
448 442
449 if (undo) 443 if (undo)
450 goto fail_trans; 444 goto fail_trans;
451 445
452 for (sdp->sd_glockd_num = 0;
453 sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd;
454 sdp->sd_glockd_num++) {
455 p = kthread_run(gfs2_glockd, sdp, "gfs2_glockd");
456 error = IS_ERR(p);
457 if (error) {
458 fs_err(sdp, "can't start glockd thread: %d\n", error);
459 goto fail;
460 }
461 sdp->sd_glockd_process[sdp->sd_glockd_num] = p;
462 }
463
464 error = gfs2_glock_nq_num(sdp, 446 error = gfs2_glock_nq_num(sdp,
465 GFS2_MOUNT_LOCK, &gfs2_nondisk_glops, 447 GFS2_MOUNT_LOCK, &gfs2_nondisk_glops,
466 LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE, 448 LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE,
@@ -493,7 +475,6 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
493 fs_err(sdp, "can't create transaction glock: %d\n", error); 475 fs_err(sdp, "can't create transaction glock: %d\n", error);
494 goto fail_rename; 476 goto fail_rename;
495 } 477 }
496 set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags);
497 478
498 return 0; 479 return 0;
499 480
@@ -506,9 +487,6 @@ fail_live:
506fail_mount: 487fail_mount:
507 gfs2_glock_dq_uninit(mount_gh); 488 gfs2_glock_dq_uninit(mount_gh);
508fail: 489fail:
509 while (sdp->sd_glockd_num--)
510 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
511
512 return error; 490 return error;
513} 491}
514 492
@@ -620,7 +598,7 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
620 598
621 prev_db = 0; 599 prev_db = 0;
622 600
623 for (lb = 0; lb < ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; lb++) { 601 for (lb = 0; lb < ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; lb++) {
624 bh.b_state = 0; 602 bh.b_state = 0;
625 bh.b_blocknr = 0; 603 bh.b_blocknr = 0;
626 bh.b_size = 1 << ip->i_inode.i_blkbits; 604 bh.b_size = 1 << ip->i_inode.i_blkbits;
@@ -661,6 +639,72 @@ static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
661 sdp->sd_lockstruct.ls_lockspace); 639 sdp->sd_lockstruct.ls_lockspace);
662} 640}
663 641
642/**
643 * gfs2_jindex_hold - Grab a lock on the jindex
644 * @sdp: The GFS2 superblock
645 * @ji_gh: the holder for the jindex glock
646 *
647 * Returns: errno
648 */
649
650static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
651{
652 struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex);
653 struct qstr name;
654 char buf[20];
655 struct gfs2_jdesc *jd;
656 int error;
657
658 name.name = buf;
659
660 mutex_lock(&sdp->sd_jindex_mutex);
661
662 for (;;) {
663 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh);
664 if (error)
665 break;
666
667 name.len = sprintf(buf, "journal%u", sdp->sd_journals);
668 name.hash = gfs2_disk_hash(name.name, name.len);
669
670 error = gfs2_dir_check(sdp->sd_jindex, &name, NULL);
671 if (error == -ENOENT) {
672 error = 0;
673 break;
674 }
675
676 gfs2_glock_dq_uninit(ji_gh);
677
678 if (error)
679 break;
680
681 error = -ENOMEM;
682 jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
683 if (!jd)
684 break;
685
686 INIT_LIST_HEAD(&jd->extent_list);
687 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
688 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
689 if (!jd->jd_inode)
690 error = -ENOENT;
691 else
692 error = PTR_ERR(jd->jd_inode);
693 kfree(jd);
694 break;
695 }
696
697 spin_lock(&sdp->sd_jindex_spin);
698 jd->jd_jid = sdp->sd_journals++;
699 list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
700 spin_unlock(&sdp->sd_jindex_spin);
701 }
702
703 mutex_unlock(&sdp->sd_jindex_mutex);
704
705 return error;
706}
707
664static int init_journal(struct gfs2_sbd *sdp, int undo) 708static int init_journal(struct gfs2_sbd *sdp, int undo)
665{ 709{
666 struct inode *master = sdp->sd_master_dir->d_inode; 710 struct inode *master = sdp->sd_master_dir->d_inode;
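
The newly added (here static) gfs2_jindex_hold() probes directory entries named journal0, journal1, ... under the jindex glock until gfs2_dir_check() returns -ENOENT, allocating a gfs2_jdesc for each journal found. One detail worth calling out is the lookup normalization: gfs2_lookupi() can yield either NULL or an ERR_PTR, and both must fold into a plain errno. That idiom, as a sketch:

#include <linux/err.h>
#include <linux/fs.h>

/* Fold a lookup result that may be NULL or ERR_PTR into an errno. */
static int lookup_errno(struct inode *inode)
{
	if (!inode)
		return -ENOENT;
	if (IS_ERR(inode))
		return PTR_ERR(inode);
	return 0;
}
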
@@ -681,7 +725,6 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
681 return PTR_ERR(sdp->sd_jindex); 725 return PTR_ERR(sdp->sd_jindex);
682 } 726 }
683 ip = GFS2_I(sdp->sd_jindex); 727 ip = GFS2_I(sdp->sd_jindex);
684 set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
685 728
686 /* Load in the journal index special file */ 729 /* Load in the journal index special file */
687 730
@@ -832,7 +875,6 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
832 goto fail_statfs; 875 goto fail_statfs;
833 } 876 }
834 ip = GFS2_I(sdp->sd_rindex); 877 ip = GFS2_I(sdp->sd_rindex);
835 set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
836 sdp->sd_rindex_uptodate = 0; 878 sdp->sd_rindex_uptodate = 0;
837 879
838 /* Read in the quota inode */ 880 /* Read in the quota inode */
@@ -973,9 +1015,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
973 } 1015 }
974 sdp->sd_logd_process = p; 1016 sdp->sd_logd_process = p;
975 1017
976 sdp->sd_statfs_sync_time = jiffies;
977 sdp->sd_quota_sync_time = jiffies;
978
979 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); 1018 p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
980 error = IS_ERR(p); 1019 error = IS_ERR(p);
981 if (error) { 1020 if (error) {
@@ -1224,17 +1263,21 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
1224static void gfs2_kill_sb(struct super_block *sb) 1263static void gfs2_kill_sb(struct super_block *sb)
1225{ 1264{
1226 struct gfs2_sbd *sdp = sb->s_fs_info; 1265 struct gfs2_sbd *sdp = sb->s_fs_info;
1227 if (sdp) { 1266
1228 gfs2_meta_syncfs(sdp); 1267 if (sdp == NULL) {
1229 dput(sdp->sd_root_dir); 1268 kill_block_super(sb);
1230 dput(sdp->sd_master_dir); 1269 return;
1231 sdp->sd_root_dir = NULL;
1232 sdp->sd_master_dir = NULL;
1233 } 1270 }
1271
1272 gfs2_meta_syncfs(sdp);
1273 dput(sdp->sd_root_dir);
1274 dput(sdp->sd_master_dir);
1275 sdp->sd_root_dir = NULL;
1276 sdp->sd_master_dir = NULL;
1234 shrink_dcache_sb(sb); 1277 shrink_dcache_sb(sb);
1235 kill_block_super(sb); 1278 kill_block_super(sb);
1236 if (sdp) 1279 gfs2_delete_debugfs_file(sdp);
1237 gfs2_delete_debugfs_file(sdp); 1280 kfree(sdp);
1238} 1281}
1239 1282
1240struct file_system_type gfs2_fs_type = { 1283struct file_system_type gfs2_fs_type = {
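
gfs2_kill_sb() is restructured around an early return for the never-fully-mounted case, and, paired with the ops_super.c hunk below that drops kfree(sdp) from gfs2_put_super(), the private superblock data is now freed here, after kill_block_super() and the debugfs teardown have finished using it. The shape, as a generic sketch (the struct name is a stand-in):

#include <linux/fs.h>
#include <linux/slab.h>

struct example_sbd { int dummy; };

static void example_kill_sb(struct super_block *sb)
{
	struct example_sbd *sdp = sb->s_fs_info;

	if (sdp == NULL) {	/* mount failed before s_fs_info was set */
		kill_block_super(sb);
		return;
	}
	/* ...sync and drop fs-private dentry references first... */
	kill_block_super(sb);	/* generic teardown, invokes ->put_super() */
	kfree(sdp);		/* freed last, after all users are done */
}
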
diff --git a/fs/gfs2/ops_fstype.h b/fs/gfs2/ops_fstype.h
deleted file mode 100644
index da8490511836..000000000000
--- a/fs/gfs2/ops_fstype.h
+++ /dev/null
@@ -1,19 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_FSTYPE_DOT_H__
11#define __OPS_FSTYPE_DOT_H__
12
13#include <linux/fs.h>
14
15extern struct file_system_type gfs2_fs_type;
16extern struct file_system_type gfs2meta_fs_type;
17extern const struct export_operations gfs2_export_ops;
18
19#endif /* __OPS_FSTYPE_DOT_H__ */
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index d232991b9046..49877546beb9 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -19,6 +19,7 @@
19#include <linux/gfs2_ondisk.h> 19#include <linux/gfs2_ondisk.h>
20#include <linux/crc32.h> 20#include <linux/crc32.h>
21#include <linux/lm_interface.h> 21#include <linux/lm_interface.h>
22#include <linux/fiemap.h>
22#include <asm/uaccess.h> 23#include <asm/uaccess.h>
23 24
24#include "gfs2.h" 25#include "gfs2.h"
@@ -31,12 +32,11 @@
31#include "glock.h" 32#include "glock.h"
32#include "inode.h" 33#include "inode.h"
33#include "meta_io.h" 34#include "meta_io.h"
34#include "ops_dentry.h"
35#include "ops_inode.h"
36#include "quota.h" 35#include "quota.h"
37#include "rgrp.h" 36#include "rgrp.h"
38#include "trans.h" 37#include "trans.h"
39#include "util.h" 38#include "util.h"
39#include "super.h"
40 40
41/** 41/**
42 * gfs2_create - Create a file 42 * gfs2_create - Create a file
@@ -185,7 +185,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
185 if (!dip->i_inode.i_nlink) 185 if (!dip->i_inode.i_nlink)
186 goto out_gunlock; 186 goto out_gunlock;
187 error = -EFBIG; 187 error = -EFBIG;
188 if (dip->i_di.di_entries == (u32)-1) 188 if (dip->i_entries == (u32)-1)
189 goto out_gunlock; 189 goto out_gunlock;
190 error = -EPERM; 190 error = -EPERM;
191 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 191 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
@@ -371,7 +371,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
371 371
372 ip = ghs[1].gh_gl->gl_object; 372 ip = ghs[1].gh_gl->gl_object;
373 373
374 ip->i_di.di_size = size; 374 ip->i_disksize = size;
375 375
376 error = gfs2_meta_inode_buffer(ip, &dibh); 376 error = gfs2_meta_inode_buffer(ip, &dibh);
377 377
@@ -425,9 +425,9 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
425 ip = ghs[1].gh_gl->gl_object; 425 ip = ghs[1].gh_gl->gl_object;
426 426
427 ip->i_inode.i_nlink = 2; 427 ip->i_inode.i_nlink = 2;
428 ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); 428 ip->i_disksize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
429 ip->i_di.di_flags |= GFS2_DIF_JDATA; 429 ip->i_diskflags |= GFS2_DIF_JDATA;
430 ip->i_di.di_entries = 2; 430 ip->i_entries = 2;
431 431
432 error = gfs2_meta_inode_buffer(ip, &dibh); 432 error = gfs2_meta_inode_buffer(ip, &dibh);
433 433
@@ -517,13 +517,13 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
517 if (error) 517 if (error)
518 goto out_gunlock; 518 goto out_gunlock;
519 519
520 if (ip->i_di.di_entries < 2) { 520 if (ip->i_entries < 2) {
521 if (gfs2_consist_inode(ip)) 521 if (gfs2_consist_inode(ip))
522 gfs2_dinode_print(ip); 522 gfs2_dinode_print(ip);
523 error = -EIO; 523 error = -EIO;
524 goto out_gunlock; 524 goto out_gunlock;
525 } 525 }
526 if (ip->i_di.di_entries > 2) { 526 if (ip->i_entries > 2) {
527 error = -ENOTEMPTY; 527 error = -ENOTEMPTY;
528 goto out_gunlock; 528 goto out_gunlock;
529 } 529 }
@@ -726,13 +726,13 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
726 goto out_gunlock; 726 goto out_gunlock;
727 727
728 if (S_ISDIR(nip->i_inode.i_mode)) { 728 if (S_ISDIR(nip->i_inode.i_mode)) {
729 if (nip->i_di.di_entries < 2) { 729 if (nip->i_entries < 2) {
730 if (gfs2_consist_inode(nip)) 730 if (gfs2_consist_inode(nip))
731 gfs2_dinode_print(nip); 731 gfs2_dinode_print(nip);
732 error = -EIO; 732 error = -EIO;
733 goto out_gunlock; 733 goto out_gunlock;
734 } 734 }
735 if (nip->i_di.di_entries > 2) { 735 if (nip->i_entries > 2) {
736 error = -ENOTEMPTY; 736 error = -ENOTEMPTY;
737 goto out_gunlock; 737 goto out_gunlock;
738 } 738 }
@@ -758,7 +758,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
758 error = -EINVAL; 758 error = -EINVAL;
759 goto out_gunlock; 759 goto out_gunlock;
760 } 760 }
761 if (ndip->i_di.di_entries == (u32)-1) { 761 if (ndip->i_entries == (u32)-1) {
762 error = -EFBIG; 762 error = -EFBIG;
763 goto out_gunlock; 763 goto out_gunlock;
764 } 764 }
@@ -990,7 +990,7 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
990 struct gfs2_sbd *sdp = GFS2_SB(inode); 990 struct gfs2_sbd *sdp = GFS2_SB(inode);
991 int error; 991 int error;
992 992
993 if (attr->ia_size != ip->i_di.di_size) { 993 if (attr->ia_size != ip->i_disksize) {
994 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); 994 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
995 if (error) 995 if (error)
996 return error; 996 return error;
@@ -1001,8 +1001,8 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
1001 } 1001 }
1002 1002
1003 error = gfs2_truncatei(ip, attr->ia_size); 1003 error = gfs2_truncatei(ip, attr->ia_size);
1004 if (error && (inode->i_size != ip->i_di.di_size)) 1004 if (error && (inode->i_size != ip->i_disksize))
1005 i_size_write(inode, ip->i_di.di_size); 1005 i_size_write(inode, ip->i_disksize);
1006 1006
1007 return error; 1007 return error;
1008} 1008}
@@ -1212,6 +1212,48 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
1212 return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er); 1212 return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er);
1213} 1213}
1214 1214
1215static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1216 u64 start, u64 len)
1217{
1218 struct gfs2_inode *ip = GFS2_I(inode);
1219 struct gfs2_holder gh;
1220 int ret;
1221
1222 ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
1223 if (ret)
1224 return ret;
1225
1226 mutex_lock(&inode->i_mutex);
1227
1228 ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
1229 if (ret)
1230 goto out;
1231
1232 if (gfs2_is_stuffed(ip)) {
1233 u64 phys = ip->i_no_addr << inode->i_blkbits;
1234 u64 size = i_size_read(inode);
1235 u32 flags = FIEMAP_EXTENT_LAST|FIEMAP_EXTENT_NOT_ALIGNED|
1236 FIEMAP_EXTENT_DATA_INLINE;
1237 phys += sizeof(struct gfs2_dinode);
1238 phys += start;
1239 if (start + len > size)
1240 len = size - start;
1241 if (start < size)
1242 ret = fiemap_fill_next_extent(fieinfo, start, phys,
1243 len, flags);
1244 if (ret == 1)
1245 ret = 0;
1246 } else {
1247 ret = __generic_block_fiemap(inode, fieinfo, start, len,
1248 gfs2_block_map);
1249 }
1250
1251 gfs2_glock_dq_uninit(&gh);
1252out:
1253 mutex_unlock(&inode->i_mutex);
1254 return ret;
1255}
1256
1215const struct inode_operations gfs2_file_iops = { 1257const struct inode_operations gfs2_file_iops = {
1216 .permission = gfs2_permission, 1258 .permission = gfs2_permission,
1217 .setattr = gfs2_setattr, 1259 .setattr = gfs2_setattr,
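
The new gfs2_fiemap() backs the FIEMAP ioctl: under a shared glock, a stuffed (inline) inode is reported as a single extent flagged DATA_INLINE | NOT_ALIGNED | LAST whose physical address is the dinode block plus the header size, while unstuffed files are handed to __generic_block_fiemap() over gfs2_block_map(). From user space the interface looks like this; a hedged sketch with error handling trimmed:

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	int fd = open(argv[1], O_RDONLY);
	struct fiemap *fm = calloc(1, sizeof(*fm) +
				      sizeof(struct fiemap_extent));

	fm->fm_start = 0;
	fm->fm_length = ~0ULL;			/* whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC;	/* the one flag gfs2 accepts */
	fm->fm_extent_count = 1;
	if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0 && fm->fm_mapped_extents)
		printf("extent: logical %llu phys %llu flags %#x\n",
		       (unsigned long long)fm->fm_extents[0].fe_logical,
		       (unsigned long long)fm->fm_extents[0].fe_physical,
		       fm->fm_extents[0].fe_flags);
	free(fm);
	return 0;
}
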
@@ -1220,6 +1262,7 @@ const struct inode_operations gfs2_file_iops = {
1220 .getxattr = gfs2_getxattr, 1262 .getxattr = gfs2_getxattr,
1221 .listxattr = gfs2_listxattr, 1263 .listxattr = gfs2_listxattr,
1222 .removexattr = gfs2_removexattr, 1264 .removexattr = gfs2_removexattr,
1265 .fiemap = gfs2_fiemap,
1223}; 1266};
1224 1267
1225const struct inode_operations gfs2_dir_iops = { 1268const struct inode_operations gfs2_dir_iops = {
@@ -1239,6 +1282,7 @@ const struct inode_operations gfs2_dir_iops = {
1239 .getxattr = gfs2_getxattr, 1282 .getxattr = gfs2_getxattr,
1240 .listxattr = gfs2_listxattr, 1283 .listxattr = gfs2_listxattr,
1241 .removexattr = gfs2_removexattr, 1284 .removexattr = gfs2_removexattr,
1285 .fiemap = gfs2_fiemap,
1242}; 1286};
1243 1287
1244const struct inode_operations gfs2_symlink_iops = { 1288const struct inode_operations gfs2_symlink_iops = {
@@ -1251,5 +1295,6 @@ const struct inode_operations gfs2_symlink_iops = {
1251 .getxattr = gfs2_getxattr, 1295 .getxattr = gfs2_getxattr,
1252 .listxattr = gfs2_listxattr, 1296 .listxattr = gfs2_listxattr,
1253 .removexattr = gfs2_removexattr, 1297 .removexattr = gfs2_removexattr,
1298 .fiemap = gfs2_fiemap,
1254}; 1299};
1255 1300
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
deleted file mode 100644
index 14b4b797622a..000000000000
--- a/fs/gfs2/ops_inode.h
+++ /dev/null
@@ -1,25 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_INODE_DOT_H__
11#define __OPS_INODE_DOT_H__
12
13#include <linux/fs.h>
14
15extern const struct inode_operations gfs2_file_iops;
16extern const struct inode_operations gfs2_dir_iops;
17extern const struct inode_operations gfs2_symlink_iops;
18extern const struct file_operations gfs2_file_fops;
19extern const struct file_operations gfs2_dir_fops;
20extern const struct file_operations gfs2_file_fops_nolock;
21extern const struct file_operations gfs2_dir_fops_nolock;
22
23extern void gfs2_set_inode_flags(struct inode *inode);
24
25#endif /* __OPS_INODE_DOT_H__ */
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index d5355d9b5926..320323d03479 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -28,7 +28,6 @@
28#include "inode.h" 28#include "inode.h"
29#include "log.h" 29#include "log.h"
30#include "mount.h" 30#include "mount.h"
31#include "ops_super.h"
32#include "quota.h" 31#include "quota.h"
33#include "recovery.h" 32#include "recovery.h"
34#include "rgrp.h" 33#include "rgrp.h"
@@ -143,8 +142,6 @@ static void gfs2_put_super(struct super_block *sb)
143 kthread_stop(sdp->sd_quotad_process); 142 kthread_stop(sdp->sd_quotad_process);
144 kthread_stop(sdp->sd_logd_process); 143 kthread_stop(sdp->sd_logd_process);
145 kthread_stop(sdp->sd_recoverd_process); 144 kthread_stop(sdp->sd_recoverd_process);
146 while (sdp->sd_glockd_num--)
147 kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
148 145
149 if (!(sb->s_flags & MS_RDONLY)) { 146 if (!(sb->s_flags & MS_RDONLY)) {
150 error = gfs2_make_fs_ro(sdp); 147 error = gfs2_make_fs_ro(sdp);
@@ -185,7 +182,6 @@ static void gfs2_put_super(struct super_block *sb)
185 182
186 /* At this point, we're through participating in the lockspace */ 183 /* At this point, we're through participating in the lockspace */
187 gfs2_sys_fs_del(sdp); 184 gfs2_sys_fs_del(sdp);
188 kfree(sdp);
189} 185}
190 186
191/** 187/**
@@ -215,18 +211,18 @@ static int gfs2_sync_fs(struct super_block *sb, int wait)
215} 211}
216 212
217/** 213/**
218 * gfs2_write_super_lockfs - prevent further writes to the filesystem 214 * gfs2_freeze - prevent further writes to the filesystem
219 * @sb: the VFS structure for the filesystem 215 * @sb: the VFS structure for the filesystem
220 * 216 *
221 */ 217 */
222 218
223static void gfs2_write_super_lockfs(struct super_block *sb) 219static int gfs2_freeze(struct super_block *sb)
224{ 220{
225 struct gfs2_sbd *sdp = sb->s_fs_info; 221 struct gfs2_sbd *sdp = sb->s_fs_info;
226 int error; 222 int error;
227 223
228 if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) 224 if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
229 return; 225 return -EINVAL;
230 226
231 for (;;) { 227 for (;;) {
232 error = gfs2_freeze_fs(sdp); 228 error = gfs2_freeze_fs(sdp);
@@ -246,17 +242,150 @@ static void gfs2_write_super_lockfs(struct super_block *sb)
246 fs_err(sdp, "retrying...\n"); 242 fs_err(sdp, "retrying...\n");
247 msleep(1000); 243 msleep(1000);
248 } 244 }
245 return 0;
249} 246}
250 247
251/** 248/**
252 * gfs2_unlockfs - reallow writes to the filesystem 249 * gfs2_unfreeze - reallow writes to the filesystem
253 * @sb: the VFS structure for the filesystem 250 * @sb: the VFS structure for the filesystem
254 * 251 *
255 */ 252 */
256 253
257static void gfs2_unlockfs(struct super_block *sb) 254static int gfs2_unfreeze(struct super_block *sb)
258{ 255{
259 gfs2_unfreeze_fs(sb->s_fs_info); 256 gfs2_unfreeze_fs(sb->s_fs_info);
257 return 0;
258}
259
260/**
261 * statfs_slow_fill - fill in the sc for a given RG
262 * @rgd: the RG
263 * @sc: the sc structure
264 *
265 * Returns: 0 on success, -ESTALE if the LVB is invalid
266 */
267
268static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
269 struct gfs2_statfs_change_host *sc)
270{
271 gfs2_rgrp_verify(rgd);
272 sc->sc_total += rgd->rd_data;
273 sc->sc_free += rgd->rd_free;
274 sc->sc_dinodes += rgd->rd_dinodes;
275 return 0;
276}
277
278/**
279 * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
280 * @sdp: the filesystem
281 * @sc: the sc info that will be returned
282 *
283 * Any error (other than a signal) will cause this routine to fall back
284 * to the synchronous version.
285 *
286 * FIXME: This really shouldn't busy wait like this.
287 *
288 * Returns: errno
289 */
290
291static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
292{
293 struct gfs2_holder ri_gh;
294 struct gfs2_rgrpd *rgd_next;
295 struct gfs2_holder *gha, *gh;
296 unsigned int slots = 64;
297 unsigned int x;
298 int done;
299 int error = 0, err;
300
301 memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
302 gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
303 if (!gha)
304 return -ENOMEM;
305
306 error = gfs2_rindex_hold(sdp, &ri_gh);
307 if (error)
308 goto out;
309
310 rgd_next = gfs2_rgrpd_get_first(sdp);
311
312 for (;;) {
313 done = 1;
314
315 for (x = 0; x < slots; x++) {
316 gh = gha + x;
317
318 if (gh->gh_gl && gfs2_glock_poll(gh)) {
319 err = gfs2_glock_wait(gh);
320 if (err) {
321 gfs2_holder_uninit(gh);
322 error = err;
323 } else {
324 if (!error)
325 error = statfs_slow_fill(
326 gh->gh_gl->gl_object, sc);
327 gfs2_glock_dq_uninit(gh);
328 }
329 }
330
331 if (gh->gh_gl)
332 done = 0;
333 else if (rgd_next && !error) {
334 error = gfs2_glock_nq_init(rgd_next->rd_gl,
335 LM_ST_SHARED,
336 GL_ASYNC,
337 gh);
338 rgd_next = gfs2_rgrpd_get_next(rgd_next);
339 done = 0;
340 }
341
342 if (signal_pending(current))
343 error = -ERESTARTSYS;
344 }
345
346 if (done)
347 break;
348
349 yield();
350 }
351
352 gfs2_glock_dq_uninit(&ri_gh);
353
354out:
355 kfree(gha);
356 return error;
357}
358
359/**
360 * gfs2_statfs_i - Do a statfs
361 * @sdp: the filesystem
362 * @sg: the sg structure
363 *
364 * Returns: errno
365 */
366
367static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
368{
369 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
370 struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
371
372 spin_lock(&sdp->sd_statfs_spin);
373
374 *sc = *m_sc;
375 sc->sc_total += l_sc->sc_total;
376 sc->sc_free += l_sc->sc_free;
377 sc->sc_dinodes += l_sc->sc_dinodes;
378
379 spin_unlock(&sdp->sd_statfs_spin);
380
381 if (sc->sc_free < 0)
382 sc->sc_free = 0;
383 if (sc->sc_free > sc->sc_total)
384 sc->sc_free = sc->sc_total;
385 if (sc->sc_dinodes < 0)
386 sc->sc_dinodes = 0;
387
388 return 0;
260} 389}
261 390
262/** 391/**
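Editor's note: gfs2_statfs_i(), newly moved into this file above, follows a common pattern: snapshot the master counters and fold in the per-node local deltas under sd_statfs_spin, and only then clamp values that can go transiently negative. A hedged standalone sketch of that aggregation (all myfs_* names are invented for illustration, not real symbols):

#include <linux/types.h>
#include <linux/spinlock.h>

struct myfs_counters {
	s64 total;
	s64 free;
	s64 dinodes;
};

/* Take a consistent snapshot of master + local under the lock,
 * then clamp transiently impossible values outside it. */
static void myfs_statfs_snapshot(spinlock_t *lock,
				 const struct myfs_counters *master,
				 const struct myfs_counters *local,
				 struct myfs_counters *out)
{
	spin_lock(lock);
	*out = *master;
	out->total += local->total;
	out->free += local->free;
	out->dinodes += local->dinodes;
	spin_unlock(lock);

	if (out->free < 0)
		out->free = 0;
	if (out->free > out->total)
		out->free = out->total;
	if (out->dinodes < 0)
		out->dinodes = 0;
}

The clamping is done outside the lock on the private copy, so the lock is held only for the struct copy and three additions.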
@@ -370,7 +499,6 @@ static void gfs2_clear_inode(struct inode *inode)
370 */ 499 */
371 if (test_bit(GIF_USER, &ip->i_flags)) { 500 if (test_bit(GIF_USER, &ip->i_flags)) {
372 ip->i_gl->gl_object = NULL; 501 ip->i_gl->gl_object = NULL;
373 gfs2_glock_schedule_for_reclaim(ip->i_gl);
374 gfs2_glock_put(ip->i_gl); 502 gfs2_glock_put(ip->i_gl);
375 ip->i_gl = NULL; 503 ip->i_gl = NULL;
376 if (ip->i_iopen_gh.gh_gl) { 504 if (ip->i_iopen_gh.gh_gl) {
@@ -423,8 +551,6 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
423 seq_printf(s, ",debug"); 551 seq_printf(s, ",debug");
424 if (args->ar_upgrade) 552 if (args->ar_upgrade)
425 seq_printf(s, ",upgrade"); 553 seq_printf(s, ",upgrade");
426 if (args->ar_num_glockd != GFS2_GLOCKD_DEFAULT)
427 seq_printf(s, ",num_glockd=%u", args->ar_num_glockd);
428 if (args->ar_posix_acl) 554 if (args->ar_posix_acl)
429 seq_printf(s, ",acl"); 555 seq_printf(s, ",acl");
430 if (args->ar_quota != GFS2_QUOTA_DEFAULT) { 556 if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
@@ -494,16 +620,16 @@ static void gfs2_delete_inode(struct inode *inode)
494 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); 620 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
495 error = gfs2_glock_nq(&ip->i_iopen_gh); 621 error = gfs2_glock_nq(&ip->i_iopen_gh);
496 if (error) 622 if (error)
497 goto out_uninit; 623 goto out_truncate;
498 624
499 if (S_ISDIR(inode->i_mode) && 625 if (S_ISDIR(inode->i_mode) &&
500 (ip->i_di.di_flags & GFS2_DIF_EXHASH)) { 626 (ip->i_diskflags & GFS2_DIF_EXHASH)) {
501 error = gfs2_dir_exhash_dealloc(ip); 627 error = gfs2_dir_exhash_dealloc(ip);
502 if (error) 628 if (error)
503 goto out_unlock; 629 goto out_unlock;
504 } 630 }
505 631
506 if (ip->i_di.di_eattr) { 632 if (ip->i_eattr) {
507 error = gfs2_ea_dealloc(ip); 633 error = gfs2_ea_dealloc(ip);
508 if (error) 634 if (error)
509 goto out_unlock; 635 goto out_unlock;
@@ -519,6 +645,7 @@ static void gfs2_delete_inode(struct inode *inode)
519 if (error) 645 if (error)
520 goto out_unlock; 646 goto out_unlock;
521 647
648out_truncate:
522 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); 649 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
523 if (error) 650 if (error)
524 goto out_unlock; 651 goto out_unlock;
@@ -527,8 +654,8 @@ static void gfs2_delete_inode(struct inode *inode)
527 gfs2_trans_end(sdp); 654 gfs2_trans_end(sdp);
528 655
529out_unlock: 656out_unlock:
530 gfs2_glock_dq(&ip->i_iopen_gh); 657 if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
531out_uninit: 658 gfs2_glock_dq(&ip->i_iopen_gh);
532 gfs2_holder_uninit(&ip->i_iopen_gh); 659 gfs2_holder_uninit(&ip->i_iopen_gh);
533 gfs2_glock_dq_uninit(&gh); 660 gfs2_glock_dq_uninit(&gh);
534 if (error && error != GLR_TRYFAILED) 661 if (error && error != GLR_TRYFAILED)
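Editor's note: the gfs2_delete_inode() hunks above rework the kernel's goto-unwind idiom: a failed try-lock now jumps to the new out_truncate label so truncation still happens, and the final dequeue runs only if the iopen holder is actually held (the HIF_HOLDER test). The idiom in miniature, self-contained with hypothetical helpers:

static int take_lock_a(void) { return 0; }	/* stand-ins for glock ops */
static int take_lock_b(void) { return 0; }
static void release_lock_b(void) { }
static void release_lock_a(void) { }

static int myfs_do_op(void)
{
	int error;

	error = take_lock_a();
	if (error)
		goto out;
	error = take_lock_b();
	if (error)
		goto out_a;	/* undo only what was actually taken */

	/* ... real work ... */

	release_lock_b();
out_a:
	release_lock_a();
out:
	return error;
}

Each label releases strictly less state than the one before it, which is why re-pointing an error path at a later label (as the hunk does) changes behaviour without touching the cleanup code itself.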
@@ -563,8 +690,8 @@ const struct super_operations gfs2_super_ops = {
563 .put_super = gfs2_put_super, 690 .put_super = gfs2_put_super,
564 .write_super = gfs2_write_super, 691 .write_super = gfs2_write_super,
565 .sync_fs = gfs2_sync_fs, 692 .sync_fs = gfs2_sync_fs,
566 .write_super_lockfs = gfs2_write_super_lockfs, 693 .freeze_fs = gfs2_freeze,
567 .unlockfs = gfs2_unlockfs, 694 .unfreeze_fs = gfs2_unfreeze,
568 .statfs = gfs2_statfs, 695 .statfs = gfs2_statfs,
569 .remount_fs = gfs2_remount_fs, 696 .remount_fs = gfs2_remount_fs,
570 .clear_inode = gfs2_clear_inode, 697 .clear_inode = gfs2_clear_inode,
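Editor's note: the super_operations change just above is the point of this file's diff. The old .write_super_lockfs/.unlockfs hooks returned void, so a withdrawn filesystem could only retry or return silently; the new .freeze_fs/.unfreeze_fs hooks return int, which lets gfs2_freeze() report -EINVAL instead. A minimal sketch of a filesystem adopting the 2.6.29 hooks (the myfs_* names are hypothetical):

#include <linux/fs.h>

static int myfs_freeze(struct super_block *sb)
{
	/* quiesce writes; a real fs would flush its journal here */
	return 0;			/* or -errno on failure */
}

static int myfs_unfreeze(struct super_block *sb)
{
	/* reallow writes */
	return 0;
}

static const struct super_operations myfs_super_ops = {
	.freeze_fs	= myfs_freeze,		/* was .write_super_lockfs */
	.unfreeze_fs	= myfs_unfreeze,	/* was .unlockfs */
};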
diff --git a/fs/gfs2/ops_super.h b/fs/gfs2/ops_super.h
deleted file mode 100644
index 442a274c6272..000000000000
--- a/fs/gfs2/ops_super.h
+++ /dev/null
@@ -1,17 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __OPS_SUPER_DOT_H__
11#define __OPS_SUPER_DOT_H__
12
13#include <linux/fs.h>
14
15extern const struct super_operations gfs2_super_ops;
16
17#endif /* __OPS_SUPER_DOT_H__ */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 3e073f5144fa..b08d09696b3e 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -46,6 +46,8 @@
46#include <linux/bio.h> 46#include <linux/bio.h>
47#include <linux/gfs2_ondisk.h> 47#include <linux/gfs2_ondisk.h>
48#include <linux/lm_interface.h> 48#include <linux/lm_interface.h>
49#include <linux/kthread.h>
50#include <linux/freezer.h>
49 51
50#include "gfs2.h" 52#include "gfs2.h"
51#include "incore.h" 53#include "incore.h"
@@ -94,7 +96,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
94 struct gfs2_quota_data *qd; 96 struct gfs2_quota_data *qd;
95 int error; 97 int error;
96 98
97 qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_NOFS); 99 qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS);
98 if (!qd) 100 if (!qd)
99 return -ENOMEM; 101 return -ENOMEM;
100 102
@@ -119,7 +121,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
119 return 0; 121 return 0;
120 122
121fail: 123fail:
122 kfree(qd); 124 kmem_cache_free(gfs2_quotad_cachep, qd);
123 return error; 125 return error;
124} 126}
125 127
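Editor's note: this hunk, together with the kmem_cache_free() conversions further down, swaps a generic kzalloc()/kfree() pair for a dedicated slab cache (gfs2_quotad_cachep), the usual choice once many identically sized objects churn. The pattern in isolation, with an invented object type:

#include <linux/errno.h>
#include <linux/slab.h>

struct myfs_obj {
	int id;
};

static struct kmem_cache *myfs_obj_cachep;

/* one-time setup, typically from module_init() */
static int myfs_cache_init(void)
{
	myfs_obj_cachep = kmem_cache_create("myfs_obj",
					    sizeof(struct myfs_obj),
					    0, 0, NULL);
	return myfs_obj_cachep ? 0 : -ENOMEM;
}

/* zeroed allocation, replacing kzalloc(..., GFP_NOFS) */
static struct myfs_obj *myfs_obj_alloc(void)
{
	return kmem_cache_zalloc(myfs_obj_cachep, GFP_NOFS);
}

static void myfs_obj_free(struct myfs_obj *obj)
{
	kmem_cache_free(myfs_obj_cachep, obj);
}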
@@ -158,7 +160,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
158 if (qd || !create) { 160 if (qd || !create) {
159 if (new_qd) { 161 if (new_qd) {
160 gfs2_lvb_unhold(new_qd->qd_gl); 162 gfs2_lvb_unhold(new_qd->qd_gl);
161 kfree(new_qd); 163 kmem_cache_free(gfs2_quotad_cachep, new_qd);
162 } 164 }
163 *qdp = qd; 165 *qdp = qd;
164 return 0; 166 return 0;
@@ -1013,7 +1015,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
1013 1015
1014 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change)) 1016 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change))
1015 return; 1017 return;
1016 if (ip->i_di.di_flags & GFS2_DIF_SYSTEM) 1018 if (ip->i_diskflags & GFS2_DIF_SYSTEM)
1017 return; 1019 return;
1018 1020
1019 for (x = 0; x < al->al_qd_num; x++) { 1021 for (x = 0; x < al->al_qd_num; x++) {
@@ -1100,15 +1102,15 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *
1100int gfs2_quota_init(struct gfs2_sbd *sdp) 1102int gfs2_quota_init(struct gfs2_sbd *sdp)
1101{ 1103{
1102 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); 1104 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
1103 unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; 1105 unsigned int blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
1104 unsigned int x, slot = 0; 1106 unsigned int x, slot = 0;
1105 unsigned int found = 0; 1107 unsigned int found = 0;
1106 u64 dblock; 1108 u64 dblock;
1107 u32 extlen = 0; 1109 u32 extlen = 0;
1108 int error; 1110 int error;
1109 1111
1110 if (!ip->i_di.di_size || ip->i_di.di_size > (64 << 20) || 1112 if (!ip->i_disksize || ip->i_disksize > (64 << 20) ||
1111 ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) { 1113 ip->i_disksize & (sdp->sd_sb.sb_bsize - 1)) {
1112 gfs2_consist_inode(ip); 1114 gfs2_consist_inode(ip);
1113 return -EIO; 1115 return -EIO;
1114 } 1116 }
@@ -1195,7 +1197,7 @@ fail:
1195 return error; 1197 return error;
1196} 1198}
1197 1199
1198void gfs2_quota_scan(struct gfs2_sbd *sdp) 1200static void gfs2_quota_scan(struct gfs2_sbd *sdp)
1199{ 1201{
1200 struct gfs2_quota_data *qd, *safe; 1202 struct gfs2_quota_data *qd, *safe;
1201 LIST_HEAD(dead); 1203 LIST_HEAD(dead);
@@ -1222,7 +1224,7 @@ void gfs2_quota_scan(struct gfs2_sbd *sdp)
1222 gfs2_assert_warn(sdp, !qd->qd_bh_count); 1224 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1223 1225
1224 gfs2_lvb_unhold(qd->qd_gl); 1226 gfs2_lvb_unhold(qd->qd_gl);
1225 kfree(qd); 1227 kmem_cache_free(gfs2_quotad_cachep, qd);
1226 } 1228 }
1227} 1229}
1228 1230
@@ -1257,7 +1259,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
1257 gfs2_assert_warn(sdp, !qd->qd_bh_count); 1259 gfs2_assert_warn(sdp, !qd->qd_bh_count);
1258 1260
1259 gfs2_lvb_unhold(qd->qd_gl); 1261 gfs2_lvb_unhold(qd->qd_gl);
1260 kfree(qd); 1262 kmem_cache_free(gfs2_quotad_cachep, qd);
1261 1263
1262 spin_lock(&sdp->sd_quota_spin); 1264 spin_lock(&sdp->sd_quota_spin);
1263 } 1265 }
@@ -1272,3 +1274,94 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
1272 } 1274 }
1273} 1275}
1274 1276
1277static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error)
1278{
1279 if (error == 0 || error == -EROFS)
1280 return;
1281 if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
1282 fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error);
1283}
1284
1285static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg,
1286 int (*fxn)(struct gfs2_sbd *sdp),
1287 unsigned long t, unsigned long *timeo,
1288 unsigned int *new_timeo)
1289{
1290 if (t >= *timeo) {
1291 int error = fxn(sdp);
1292 quotad_error(sdp, msg, error);
1293 *timeo = gfs2_tune_get_i(&sdp->sd_tune, new_timeo) * HZ;
1294 } else {
1295 *timeo -= t;
1296 }
1297}
1298
1299static void quotad_check_trunc_list(struct gfs2_sbd *sdp)
1300{
1301 struct gfs2_inode *ip;
1302
1303 while(1) {
1304 ip = NULL;
1305 spin_lock(&sdp->sd_trunc_lock);
1306 if (!list_empty(&sdp->sd_trunc_list)) {
1307 ip = list_entry(sdp->sd_trunc_list.next,
1308 struct gfs2_inode, i_trunc_list);
1309 list_del_init(&ip->i_trunc_list);
1310 }
1311 spin_unlock(&sdp->sd_trunc_lock);
1312 if (ip == NULL)
1313 return;
1314 gfs2_glock_finish_truncate(ip);
1315 }
1316}
1317
1318/**
1319 * gfs2_quotad - Write cached quota changes into the quota file
1320 * @sdp: Pointer to GFS2 superblock
1321 *
1322 */
1323
1324int gfs2_quotad(void *data)
1325{
1326 struct gfs2_sbd *sdp = data;
1327 struct gfs2_tune *tune = &sdp->sd_tune;
1328 unsigned long statfs_timeo = 0;
1329 unsigned long quotad_timeo = 0;
1330 unsigned long t = 0;
1331 DEFINE_WAIT(wait);
1332 int empty;
1333
1334 while (!kthread_should_stop()) {
1335
1336 /* Update the master statfs file */
1337 quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t,
1338 &statfs_timeo, &tune->gt_statfs_quantum);
1339
1340 /* Update quota file */
1341 quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
1342 &quotad_timeo, &tune->gt_quota_quantum);
1343
1344 /* FIXME: This should be turned into a shrinker */
1345 gfs2_quota_scan(sdp);
1346
1347 /* Check for & recover partially truncated inodes */
1348 quotad_check_trunc_list(sdp);
1349
1350 if (freezing(current))
1351 refrigerator();
1352 t = min(quotad_timeo, statfs_timeo);
1353
1354 prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_UNINTERRUPTIBLE);
1355 spin_lock(&sdp->sd_trunc_lock);
1356 empty = list_empty(&sdp->sd_trunc_list);
1357 spin_unlock(&sdp->sd_trunc_lock);
1358 if (empty)
1359 t -= schedule_timeout(t);
1360 else
1361 t = 0;
1362 finish_wait(&sdp->sd_quota_wait, &wait);
1363 }
1364
1365 return 0;
1366}
1367
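Editor's note: gfs2_quotad() above multiplexes several periodic jobs (statfs sync, quota sync, scan, truncate recovery) onto one kernel thread. Stripped of the gfs2 specifics it is the standard freezable-kthread loop; a hedged sketch, with myfs_daemon as an illustrative name:

#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/sched.h>

static int myfs_daemon(void *data)
{
	while (!kthread_should_stop()) {
		/* ... periodic work goes here ... */

		if (freezing(current))
			refrigerator();	/* park across suspend/resume */

		schedule_timeout_interruptible(5 * HZ);
	}
	return 0;
}

Such a thread is created with kthread_run(myfs_daemon, data, "myfs_daemon") and torn down with kthread_stop(), which is what makes kthread_should_stop() return true and the loop exit.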
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 3b7f4b0e5dfe..cec9032be97d 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -15,22 +15,22 @@ struct gfs2_sbd;
15 15
16#define NO_QUOTA_CHANGE ((u32)-1) 16#define NO_QUOTA_CHANGE ((u32)-1)
17 17
18int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid); 18extern int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid);
19void gfs2_quota_unhold(struct gfs2_inode *ip); 19extern void gfs2_quota_unhold(struct gfs2_inode *ip);
20 20
21int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid); 21extern int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid);
22void gfs2_quota_unlock(struct gfs2_inode *ip); 22extern void gfs2_quota_unlock(struct gfs2_inode *ip);
23 23
24int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid); 24extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
25void gfs2_quota_change(struct gfs2_inode *ip, s64 change, 25extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
26 u32 uid, u32 gid); 26 u32 uid, u32 gid);
27 27
28int gfs2_quota_sync(struct gfs2_sbd *sdp); 28extern int gfs2_quota_sync(struct gfs2_sbd *sdp);
29int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id); 29extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
30 30
31int gfs2_quota_init(struct gfs2_sbd *sdp); 31extern int gfs2_quota_init(struct gfs2_sbd *sdp);
32void gfs2_quota_scan(struct gfs2_sbd *sdp); 32extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
33void gfs2_quota_cleanup(struct gfs2_sbd *sdp); 33extern int gfs2_quotad(void *data);
34 34
35static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) 35static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
36{ 36{
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index d5e91f4f6a0b..efd09c3d2b26 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -14,6 +14,8 @@
14#include <linux/gfs2_ondisk.h> 14#include <linux/gfs2_ondisk.h>
15#include <linux/crc32.h> 15#include <linux/crc32.h>
16#include <linux/lm_interface.h> 16#include <linux/lm_interface.h>
17#include <linux/kthread.h>
18#include <linux/freezer.h>
17 19
18#include "gfs2.h" 20#include "gfs2.h"
19#include "incore.h" 21#include "incore.h"
@@ -583,13 +585,35 @@ fail:
583 return error; 585 return error;
584} 586}
585 587
588static struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
589{
590 struct gfs2_jdesc *jd;
591 int found = 0;
592
593 spin_lock(&sdp->sd_jindex_spin);
594
595 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
596 if (jd->jd_dirty) {
597 jd->jd_dirty = 0;
598 found = 1;
599 break;
600 }
601 }
602 spin_unlock(&sdp->sd_jindex_spin);
603
604 if (!found)
605 jd = NULL;
606
607 return jd;
608}
609
586/** 610/**
587 * gfs2_check_journals - Recover any dirty journals 611 * gfs2_check_journals - Recover any dirty journals
588 * @sdp: the filesystem 612 * @sdp: the filesystem
589 * 613 *
590 */ 614 */
591 615
592void gfs2_check_journals(struct gfs2_sbd *sdp) 616static void gfs2_check_journals(struct gfs2_sbd *sdp)
593{ 617{
594 struct gfs2_jdesc *jd; 618 struct gfs2_jdesc *jd;
595 619
@@ -603,3 +627,25 @@ void gfs2_check_journals(struct gfs2_sbd *sdp)
603 } 627 }
604} 628}
605 629
630/**
631 * gfs2_recoverd - Recover dead machine's journals
632 * @sdp: Pointer to GFS2 superblock
633 *
634 */
635
636int gfs2_recoverd(void *data)
637{
638 struct gfs2_sbd *sdp = data;
639 unsigned long t;
640
641 while (!kthread_should_stop()) {
642 gfs2_check_journals(sdp);
643 t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ;
644 if (freezing(current))
645 refrigerator();
646 schedule_timeout_interruptible(t);
647 }
648
649 return 0;
650}
651
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index f7235e61c723..a8218ea15b57 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -18,17 +18,17 @@ static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
18 *blk = 0; 18 *blk = 0;
19} 19}
20 20
21int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, 21extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
22 struct buffer_head **bh); 22 struct buffer_head **bh);
23 23
24int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); 24extern int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
25int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); 25extern int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
26void gfs2_revoke_clean(struct gfs2_sbd *sdp); 26extern void gfs2_revoke_clean(struct gfs2_sbd *sdp);
27 27
28int gfs2_find_jhead(struct gfs2_jdesc *jd, 28extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
29 struct gfs2_log_header_host *head); 29 struct gfs2_log_header_host *head);
30int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd); 30extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
31void gfs2_check_journals(struct gfs2_sbd *sdp); 31extern int gfs2_recoverd(void *data);
32 32
33#endif /* __RECOVERY_DOT_H__ */ 33#endif /* __RECOVERY_DOT_H__ */
34 34
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 2d90fb253505..8b01c635d925 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -269,16 +269,14 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
269 bi->bi_len, x); 269 bi->bi_len, x);
270 } 270 }
271 271
272 if (count[0] != rgd->rd_rg.rg_free) { 272 if (count[0] != rgd->rd_free) {
273 if (gfs2_consist_rgrpd(rgd)) 273 if (gfs2_consist_rgrpd(rgd))
274 fs_err(sdp, "free data mismatch: %u != %u\n", 274 fs_err(sdp, "free data mismatch: %u != %u\n",
275 count[0], rgd->rd_rg.rg_free); 275 count[0], rgd->rd_free);
276 return; 276 return;
277 } 277 }
278 278
279 tmp = rgd->rd_data - 279 tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
280 rgd->rd_rg.rg_free -
281 rgd->rd_rg.rg_dinodes;
282 if (count[1] + count[2] != tmp) { 280 if (count[1] + count[2] != tmp) {
283 if (gfs2_consist_rgrpd(rgd)) 281 if (gfs2_consist_rgrpd(rgd))
284 fs_err(sdp, "used data mismatch: %u != %u\n", 282 fs_err(sdp, "used data mismatch: %u != %u\n",
@@ -286,10 +284,10 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
286 return; 284 return;
287 } 285 }
288 286
289 if (count[3] != rgd->rd_rg.rg_dinodes) { 287 if (count[3] != rgd->rd_dinodes) {
290 if (gfs2_consist_rgrpd(rgd)) 288 if (gfs2_consist_rgrpd(rgd))
291 fs_err(sdp, "used metadata mismatch: %u != %u\n", 289 fs_err(sdp, "used metadata mismatch: %u != %u\n",
292 count[3], rgd->rd_rg.rg_dinodes); 290 count[3], rgd->rd_dinodes);
293 return; 291 return;
294 } 292 }
295 293
@@ -501,7 +499,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
501 for (rgrps = 0;; rgrps++) { 499 for (rgrps = 0;; rgrps++) {
502 loff_t pos = rgrps * sizeof(struct gfs2_rindex); 500 loff_t pos = rgrps * sizeof(struct gfs2_rindex);
503 501
504 if (pos + sizeof(struct gfs2_rindex) >= ip->i_di.di_size) 502 if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize)
505 break; 503 break;
506 error = gfs2_internal_read(ip, &ra_state, buf, &pos, 504 error = gfs2_internal_read(ip, &ra_state, buf, &pos,
507 sizeof(struct gfs2_rindex)); 505 sizeof(struct gfs2_rindex));
@@ -590,7 +588,7 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
590 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 588 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
591 struct inode *inode = &ip->i_inode; 589 struct inode *inode = &ip->i_inode;
592 struct file_ra_state ra_state; 590 struct file_ra_state ra_state;
593 u64 rgrp_count = ip->i_di.di_size; 591 u64 rgrp_count = ip->i_disksize;
594 int error; 592 int error;
595 593
596 if (do_div(rgrp_count, sizeof(struct gfs2_rindex))) { 594 if (do_div(rgrp_count, sizeof(struct gfs2_rindex))) {
@@ -634,7 +632,7 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
634 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) { 632 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
635 /* Ignore partials */ 633 /* Ignore partials */
636 if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) > 634 if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
637 ip->i_di.di_size) 635 ip->i_disksize)
638 break; 636 break;
639 error = read_rindex_entry(ip, &ra_state); 637 error = read_rindex_entry(ip, &ra_state);
640 if (error) { 638 if (error) {
@@ -692,7 +690,6 @@ int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
692static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) 690static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
693{ 691{
694 const struct gfs2_rgrp *str = buf; 692 const struct gfs2_rgrp *str = buf;
695 struct gfs2_rgrp_host *rg = &rgd->rd_rg;
696 u32 rg_flags; 693 u32 rg_flags;
697 694
698 rg_flags = be32_to_cpu(str->rg_flags); 695 rg_flags = be32_to_cpu(str->rg_flags);
@@ -700,24 +697,23 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
700 rgd->rd_flags |= GFS2_RDF_NOALLOC; 697 rgd->rd_flags |= GFS2_RDF_NOALLOC;
701 else 698 else
702 rgd->rd_flags &= ~GFS2_RDF_NOALLOC; 699 rgd->rd_flags &= ~GFS2_RDF_NOALLOC;
703 rg->rg_free = be32_to_cpu(str->rg_free); 700 rgd->rd_free = be32_to_cpu(str->rg_free);
704 rg->rg_dinodes = be32_to_cpu(str->rg_dinodes); 701 rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
705 rg->rg_igeneration = be64_to_cpu(str->rg_igeneration); 702 rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
706} 703}
707 704
708static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) 705static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
709{ 706{
710 struct gfs2_rgrp *str = buf; 707 struct gfs2_rgrp *str = buf;
711 struct gfs2_rgrp_host *rg = &rgd->rd_rg;
712 u32 rg_flags = 0; 708 u32 rg_flags = 0;
713 709
714 if (rgd->rd_flags & GFS2_RDF_NOALLOC) 710 if (rgd->rd_flags & GFS2_RDF_NOALLOC)
715 rg_flags |= GFS2_RGF_NOALLOC; 711 rg_flags |= GFS2_RGF_NOALLOC;
716 str->rg_flags = cpu_to_be32(rg_flags); 712 str->rg_flags = cpu_to_be32(rg_flags);
717 str->rg_free = cpu_to_be32(rg->rg_free); 713 str->rg_free = cpu_to_be32(rgd->rd_free);
718 str->rg_dinodes = cpu_to_be32(rg->rg_dinodes); 714 str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
719 str->__pad = cpu_to_be32(0); 715 str->__pad = cpu_to_be32(0);
720 str->rg_igeneration = cpu_to_be64(rg->rg_igeneration); 716 str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
721 memset(&str->rg_reserved, 0, sizeof(str->rg_reserved)); 717 memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
722} 718}
723 719
@@ -776,7 +772,7 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
776 } 772 }
777 773
778 spin_lock(&sdp->sd_rindex_spin); 774 spin_lock(&sdp->sd_rindex_spin);
779 rgd->rd_free_clone = rgd->rd_rg.rg_free; 775 rgd->rd_free_clone = rgd->rd_free;
780 rgd->rd_bh_count++; 776 rgd->rd_bh_count++;
781 spin_unlock(&sdp->sd_rindex_spin); 777 spin_unlock(&sdp->sd_rindex_spin);
782 778
@@ -850,7 +846,7 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
850 } 846 }
851 847
852 spin_lock(&sdp->sd_rindex_spin); 848 spin_lock(&sdp->sd_rindex_spin);
853 rgd->rd_free_clone = rgd->rd_rg.rg_free; 849 rgd->rd_free_clone = rgd->rd_free;
854 spin_unlock(&sdp->sd_rindex_spin); 850 spin_unlock(&sdp->sd_rindex_spin);
855} 851}
856 852
@@ -1403,8 +1399,8 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
1403 block = rgd->rd_data0 + blk; 1399 block = rgd->rd_data0 + blk;
1404 ip->i_goal = block; 1400 ip->i_goal = block;
1405 1401
1406 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free >= *n); 1402 gfs2_assert_withdraw(sdp, rgd->rd_free >= *n);
1407 rgd->rd_rg.rg_free -= *n; 1403 rgd->rd_free -= *n;
1408 1404
1409 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1405 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1410 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1406 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1445,10 +1441,10 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
1445 1441
1446 block = rgd->rd_data0 + blk; 1442 block = rgd->rd_data0 + blk;
1447 1443
1448 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free); 1444 gfs2_assert_withdraw(sdp, rgd->rd_free);
1449 rgd->rd_rg.rg_free--; 1445 rgd->rd_free--;
1450 rgd->rd_rg.rg_dinodes++; 1446 rgd->rd_dinodes++;
1451 *generation = rgd->rd_rg.rg_igeneration++; 1447 *generation = rgd->rd_igeneration++;
1452 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1448 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1453 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1449 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1454 1450
@@ -1481,7 +1477,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1481 if (!rgd) 1477 if (!rgd)
1482 return; 1478 return;
1483 1479
1484 rgd->rd_rg.rg_free += blen; 1480 rgd->rd_free += blen;
1485 1481
1486 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1482 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1487 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1483 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1509,7 +1505,7 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
1509 if (!rgd) 1505 if (!rgd)
1510 return; 1506 return;
1511 1507
1512 rgd->rd_rg.rg_free += blen; 1508 rgd->rd_free += blen;
1513 1509
1514 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1510 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1515 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1511 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1546,10 +1542,10 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
1546 return; 1542 return;
1547 gfs2_assert_withdraw(sdp, rgd == tmp_rgd); 1543 gfs2_assert_withdraw(sdp, rgd == tmp_rgd);
1548 1544
1549 if (!rgd->rd_rg.rg_dinodes) 1545 if (!rgd->rd_dinodes)
1550 gfs2_consist_rgrpd(rgd); 1546 gfs2_consist_rgrpd(rgd);
1551 rgd->rd_rg.rg_dinodes--; 1547 rgd->rd_dinodes--;
1552 rgd->rd_rg.rg_free++; 1548 rgd->rd_free++;
1553 1549
1554 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1550 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1555 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1551 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
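Editor's note: much of the rgrp.c churn above comes from deleting the intermediate gfs2_rgrp_host copy; gfs2_rgrp_in()/gfs2_rgrp_out() now convert between the big-endian on-disk record and native-endian in-core fields directly. The underlying idiom, sketched with invented types:

#include <linux/types.h>
#include <asm/byteorder.h>

/* on-disk record: fixed layout, big-endian fields */
struct myfs_rgrp_ondisk {
	__be32 rg_free;
	__be32 rg_dinodes;
	__be64 rg_igeneration;
};

/* in-core copy in native byte order */
struct myfs_rgrp {
	u32 rd_free;
	u32 rd_dinodes;
	u64 rd_igeneration;
};

static void myfs_rgrp_in(struct myfs_rgrp *rgd, const void *buf)
{
	const struct myfs_rgrp_ondisk *str = buf;

	rgd->rd_free = be32_to_cpu(str->rg_free);
	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
}

static void myfs_rgrp_out(const struct myfs_rgrp *rgd, void *buf)
{
	struct myfs_rgrp_ondisk *str = buf;

	str->rg_free = cpu_to_be32(rgd->rd_free);
	str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
	str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
}

Using __be32/__be64 types keeps sparse able to flag any direct assignment that skips the conversion helpers.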
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index c3ba3d9d0aac..141b781f2fcc 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -34,76 +34,6 @@
34#include "util.h" 34#include "util.h"
35 35
36/** 36/**
37 * gfs2_jindex_hold - Grab a lock on the jindex
38 * @sdp: The GFS2 superblock
39 * @ji_gh: the holder for the jindex glock
40 *
41 * This is very similar to the gfs2_rindex_hold() function, except that
42 * in general we hold the jindex lock for longer periods of time and
43 * we grab it far less frequently (in general) than the rgrp lock.
44 *
45 * Returns: errno
46 */
47
48int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
49{
50 struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex);
51 struct qstr name;
52 char buf[20];
53 struct gfs2_jdesc *jd;
54 int error;
55
56 name.name = buf;
57
58 mutex_lock(&sdp->sd_jindex_mutex);
59
60 for (;;) {
61 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh);
62 if (error)
63 break;
64
65 name.len = sprintf(buf, "journal%u", sdp->sd_journals);
66 name.hash = gfs2_disk_hash(name.name, name.len);
67
68 error = gfs2_dir_check(sdp->sd_jindex, &name, NULL);
69 if (error == -ENOENT) {
70 error = 0;
71 break;
72 }
73
74 gfs2_glock_dq_uninit(ji_gh);
75
76 if (error)
77 break;
78
79 error = -ENOMEM;
80 jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
81 if (!jd)
82 break;
83
84 INIT_LIST_HEAD(&jd->extent_list);
85 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
86 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
87 if (!jd->jd_inode)
88 error = -ENOENT;
89 else
90 error = PTR_ERR(jd->jd_inode);
91 kfree(jd);
92 break;
93 }
94
95 spin_lock(&sdp->sd_jindex_spin);
96 jd->jd_jid = sdp->sd_journals++;
97 list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
98 spin_unlock(&sdp->sd_jindex_spin);
99 }
100
101 mutex_unlock(&sdp->sd_jindex_mutex);
102
103 return error;
104}
105
106/**
107 * gfs2_jindex_free - Clear all the journal index information 37 * gfs2_jindex_free - Clear all the journal index information
108 * @sdp: The GFS2 superblock 38 * @sdp: The GFS2 superblock
109 * 39 *
@@ -166,39 +96,6 @@ struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid)
166 return jd; 96 return jd;
167} 97}
168 98
169void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
170{
171 struct gfs2_jdesc *jd;
172
173 spin_lock(&sdp->sd_jindex_spin);
174 jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
175 if (jd)
176 jd->jd_dirty = 1;
177 spin_unlock(&sdp->sd_jindex_spin);
178}
179
180struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
181{
182 struct gfs2_jdesc *jd;
183 int found = 0;
184
185 spin_lock(&sdp->sd_jindex_spin);
186
187 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
188 if (jd->jd_dirty) {
189 jd->jd_dirty = 0;
190 found = 1;
191 break;
192 }
193 }
194 spin_unlock(&sdp->sd_jindex_spin);
195
196 if (!found)
197 jd = NULL;
198
199 return jd;
200}
201
202int gfs2_jdesc_check(struct gfs2_jdesc *jd) 99int gfs2_jdesc_check(struct gfs2_jdesc *jd)
203{ 100{
204 struct gfs2_inode *ip = GFS2_I(jd->jd_inode); 101 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
@@ -206,14 +103,14 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
206 int ar; 103 int ar;
207 int error; 104 int error;
208 105
209 if (ip->i_di.di_size < (8 << 20) || ip->i_di.di_size > (1 << 30) || 106 if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) ||
210 (ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1))) { 107 (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) {
211 gfs2_consist_inode(ip); 108 gfs2_consist_inode(ip);
212 return -EIO; 109 return -EIO;
213 } 110 }
214 jd->jd_blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; 111 jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
215 112
216 error = gfs2_write_alloc_required(ip, 0, ip->i_di.di_size, &ar); 113 error = gfs2_write_alloc_required(ip, 0, ip->i_disksize, &ar);
217 if (!error && ar) { 114 if (!error && ar) {
218 gfs2_consist_inode(ip); 115 gfs2_consist_inode(ip);
219 error = -EIO; 116 error = -EIO;
@@ -423,137 +320,6 @@ out:
423 return error; 320 return error;
424} 321}
425 322
426/**
427 * gfs2_statfs_i - Do a statfs
428 * @sdp: the filesystem
429 * @sg: the sg structure
430 *
431 * Returns: errno
432 */
433
434int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
435{
436 struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
437 struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
438
439 spin_lock(&sdp->sd_statfs_spin);
440
441 *sc = *m_sc;
442 sc->sc_total += l_sc->sc_total;
443 sc->sc_free += l_sc->sc_free;
444 sc->sc_dinodes += l_sc->sc_dinodes;
445
446 spin_unlock(&sdp->sd_statfs_spin);
447
448 if (sc->sc_free < 0)
449 sc->sc_free = 0;
450 if (sc->sc_free > sc->sc_total)
451 sc->sc_free = sc->sc_total;
452 if (sc->sc_dinodes < 0)
453 sc->sc_dinodes = 0;
454
455 return 0;
456}
457
458/**
459 * statfs_fill - fill in the sg for a given RG
460 * @rgd: the RG
461 * @sc: the sc structure
462 *
463 * Returns: 0 on success, -ESTALE if the LVB is invalid
464 */
465
466static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
467 struct gfs2_statfs_change_host *sc)
468{
469 gfs2_rgrp_verify(rgd);
470 sc->sc_total += rgd->rd_data;
471 sc->sc_free += rgd->rd_rg.rg_free;
472 sc->sc_dinodes += rgd->rd_rg.rg_dinodes;
473 return 0;
474}
475
476/**
477 * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
478 * @sdp: the filesystem
479 * @sc: the sc info that will be returned
480 *
481 * Any error (other than a signal) will cause this routine to fall back
482 * to the synchronous version.
483 *
484 * FIXME: This really shouldn't busy wait like this.
485 *
486 * Returns: errno
487 */
488
489int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
490{
491 struct gfs2_holder ri_gh;
492 struct gfs2_rgrpd *rgd_next;
493 struct gfs2_holder *gha, *gh;
494 unsigned int slots = 64;
495 unsigned int x;
496 int done;
497 int error = 0, err;
498
499 memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
500 gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
501 if (!gha)
502 return -ENOMEM;
503
504 error = gfs2_rindex_hold(sdp, &ri_gh);
505 if (error)
506 goto out;
507
508 rgd_next = gfs2_rgrpd_get_first(sdp);
509
510 for (;;) {
511 done = 1;
512
513 for (x = 0; x < slots; x++) {
514 gh = gha + x;
515
516 if (gh->gh_gl && gfs2_glock_poll(gh)) {
517 err = gfs2_glock_wait(gh);
518 if (err) {
519 gfs2_holder_uninit(gh);
520 error = err;
521 } else {
522 if (!error)
523 error = statfs_slow_fill(
524 gh->gh_gl->gl_object, sc);
525 gfs2_glock_dq_uninit(gh);
526 }
527 }
528
529 if (gh->gh_gl)
530 done = 0;
531 else if (rgd_next && !error) {
532 error = gfs2_glock_nq_init(rgd_next->rd_gl,
533 LM_ST_SHARED,
534 GL_ASYNC,
535 gh);
536 rgd_next = gfs2_rgrpd_get_next(rgd_next);
537 done = 0;
538 }
539
540 if (signal_pending(current))
541 error = -ERESTARTSYS;
542 }
543
544 if (done)
545 break;
546
547 yield();
548 }
549
550 gfs2_glock_dq_uninit(&ri_gh);
551
552out:
553 kfree(gha);
554 return error;
555}
556
557struct lfcc { 323struct lfcc {
558 struct list_head list; 324 struct list_head list;
559 struct gfs2_holder gh; 325 struct gfs2_holder gh;
@@ -580,10 +346,6 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
580 struct gfs2_log_header_host lh; 346 struct gfs2_log_header_host lh;
581 int error; 347 int error;
582 348
583 error = gfs2_jindex_hold(sdp, &ji_gh);
584 if (error)
585 return error;
586
587 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { 349 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
588 lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL); 350 lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL);
589 if (!lfcc) { 351 if (!lfcc) {
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 50a4c9b1215e..f6b8b00ad881 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -10,6 +10,8 @@
10#ifndef __SUPER_DOT_H__ 10#ifndef __SUPER_DOT_H__
11#define __SUPER_DOT_H__ 11#define __SUPER_DOT_H__
12 12
13#include <linux/fs.h>
14#include <linux/dcache.h>
13#include "incore.h" 15#include "incore.h"
14 16
15void gfs2_lm_unmount(struct gfs2_sbd *sdp); 17void gfs2_lm_unmount(struct gfs2_sbd *sdp);
@@ -23,12 +25,9 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
23 return x; 25 return x;
24} 26}
25 27
26int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh);
27void gfs2_jindex_free(struct gfs2_sbd *sdp); 28void gfs2_jindex_free(struct gfs2_sbd *sdp);
28 29
29struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); 30struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
30void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid);
31struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp);
32int gfs2_jdesc_check(struct gfs2_jdesc *jd); 31int gfs2_jdesc_check(struct gfs2_jdesc *jd);
33 32
34int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, 33int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
@@ -40,11 +39,15 @@ int gfs2_statfs_init(struct gfs2_sbd *sdp);
40void gfs2_statfs_change(struct gfs2_sbd *sdp, 39void gfs2_statfs_change(struct gfs2_sbd *sdp,
41 s64 total, s64 free, s64 dinodes); 40 s64 total, s64 free, s64 dinodes);
42int gfs2_statfs_sync(struct gfs2_sbd *sdp); 41int gfs2_statfs_sync(struct gfs2_sbd *sdp);
43int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc);
44int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc);
45 42
46int gfs2_freeze_fs(struct gfs2_sbd *sdp); 43int gfs2_freeze_fs(struct gfs2_sbd *sdp);
47void gfs2_unfreeze_fs(struct gfs2_sbd *sdp); 44void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
48 45
46extern struct file_system_type gfs2_fs_type;
47extern struct file_system_type gfs2meta_fs_type;
48extern const struct export_operations gfs2_export_ops;
49extern const struct super_operations gfs2_super_ops;
50extern struct dentry_operations gfs2_dops;
51
49#endif /* __SUPER_DOT_H__ */ 52#endif /* __SUPER_DOT_H__ */
50 53
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 7e1879f1a02c..26c1fa777a95 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -26,9 +26,6 @@
26#include "quota.h" 26#include "quota.h"
27#include "util.h" 27#include "util.h"
28 28
29char *gfs2_sys_margs;
30spinlock_t gfs2_sys_margs_lock;
31
32static ssize_t id_show(struct gfs2_sbd *sdp, char *buf) 29static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
33{ 30{
34 return snprintf(buf, PAGE_SIZE, "%u:%u\n", 31 return snprintf(buf, PAGE_SIZE, "%u:%u\n",
@@ -263,7 +260,6 @@ ARGS_ATTR(localcaching, "%d\n");
263ARGS_ATTR(localflocks, "%d\n"); 260ARGS_ATTR(localflocks, "%d\n");
264ARGS_ATTR(debug, "%d\n"); 261ARGS_ATTR(debug, "%d\n");
265ARGS_ATTR(upgrade, "%d\n"); 262ARGS_ATTR(upgrade, "%d\n");
266ARGS_ATTR(num_glockd, "%u\n");
267ARGS_ATTR(posix_acl, "%d\n"); 263ARGS_ATTR(posix_acl, "%d\n");
268ARGS_ATTR(quota, "%u\n"); 264ARGS_ATTR(quota, "%u\n");
269ARGS_ATTR(suiddir, "%d\n"); 265ARGS_ATTR(suiddir, "%d\n");
@@ -279,7 +275,6 @@ static struct attribute *args_attrs[] = {
279 &args_attr_localflocks.attr, 275 &args_attr_localflocks.attr,
280 &args_attr_debug.attr, 276 &args_attr_debug.attr,
281 &args_attr_upgrade.attr, 277 &args_attr_upgrade.attr,
282 &args_attr_num_glockd.attr,
283 &args_attr_posix_acl.attr, 278 &args_attr_posix_acl.attr,
284 &args_attr_quota.attr, 279 &args_attr_quota.attr,
285 &args_attr_suiddir.attr, 280 &args_attr_suiddir.attr,
@@ -288,30 +283,6 @@ static struct attribute *args_attrs[] = {
288}; 283};
289 284
290/* 285/*
291 * display counters from superblock
292 */
293
294struct counters_attr {
295 struct attribute attr;
296 ssize_t (*show)(struct gfs2_sbd *, char *);
297};
298
299#define COUNTERS_ATTR(name, fmt) \
300static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
301{ \
302 return snprintf(buf, PAGE_SIZE, fmt, \
303 (unsigned int)atomic_read(&sdp->sd_##name)); \
304} \
305static struct counters_attr counters_attr_##name = __ATTR_RO(name)
306
307COUNTERS_ATTR(reclaimed, "%u\n");
308
309static struct attribute *counters_attrs[] = {
310 &counters_attr_reclaimed.attr,
311 NULL,
312};
313
314/*
315 * get and set struct gfs2_tune fields 286 * get and set struct gfs2_tune fields
316 */ 287 */
317 288
@@ -393,7 +364,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
393} \ 364} \
394TUNE_ATTR_2(name, name##_store) 365TUNE_ATTR_2(name, name##_store)
395 366
396TUNE_ATTR(demote_secs, 0);
397TUNE_ATTR(incore_log_blocks, 0); 367TUNE_ATTR(incore_log_blocks, 0);
398TUNE_ATTR(log_flush_secs, 0); 368TUNE_ATTR(log_flush_secs, 0);
399TUNE_ATTR(quota_warn_period, 0); 369TUNE_ATTR(quota_warn_period, 0);
@@ -408,11 +378,9 @@ TUNE_ATTR(stall_secs, 1);
408TUNE_ATTR(statfs_quantum, 1); 378TUNE_ATTR(statfs_quantum, 1);
409TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process); 379TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
410TUNE_ATTR_DAEMON(logd_secs, logd_process); 380TUNE_ATTR_DAEMON(logd_secs, logd_process);
411TUNE_ATTR_DAEMON(quotad_secs, quotad_process);
412TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); 381TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
413 382
414static struct attribute *tune_attrs[] = { 383static struct attribute *tune_attrs[] = {
415 &tune_attr_demote_secs.attr,
416 &tune_attr_incore_log_blocks.attr, 384 &tune_attr_incore_log_blocks.attr,
417 &tune_attr_log_flush_secs.attr, 385 &tune_attr_log_flush_secs.attr,
418 &tune_attr_quota_warn_period.attr, 386 &tune_attr_quota_warn_period.attr,
@@ -426,7 +394,6 @@ static struct attribute *tune_attrs[] = {
426 &tune_attr_statfs_quantum.attr, 394 &tune_attr_statfs_quantum.attr,
427 &tune_attr_recoverd_secs.attr, 395 &tune_attr_recoverd_secs.attr,
428 &tune_attr_logd_secs.attr, 396 &tune_attr_logd_secs.attr,
429 &tune_attr_quotad_secs.attr,
430 &tune_attr_quota_scale.attr, 397 &tune_attr_quota_scale.attr,
431 &tune_attr_new_files_jdata.attr, 398 &tune_attr_new_files_jdata.attr,
432 NULL, 399 NULL,
@@ -437,11 +404,6 @@ static struct attribute_group lockstruct_group = {
437 .attrs = lockstruct_attrs, 404 .attrs = lockstruct_attrs,
438}; 405};
439 406
440static struct attribute_group counters_group = {
441 .name = "counters",
442 .attrs = counters_attrs,
443};
444
445static struct attribute_group args_group = { 407static struct attribute_group args_group = {
446 .name = "args", 408 .name = "args",
447 .attrs = args_attrs, 409 .attrs = args_attrs,
@@ -466,13 +428,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
466 if (error) 428 if (error)
467 goto fail_reg; 429 goto fail_reg;
468 430
469 error = sysfs_create_group(&sdp->sd_kobj, &counters_group);
470 if (error)
471 goto fail_lockstruct;
472
473 error = sysfs_create_group(&sdp->sd_kobj, &args_group); 431 error = sysfs_create_group(&sdp->sd_kobj, &args_group);
474 if (error) 432 if (error)
475 goto fail_counters; 433 goto fail_lockstruct;
476 434
477 error = sysfs_create_group(&sdp->sd_kobj, &tune_group); 435 error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
478 if (error) 436 if (error)
@@ -483,8 +441,6 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
483 441
484fail_args: 442fail_args:
485 sysfs_remove_group(&sdp->sd_kobj, &args_group); 443 sysfs_remove_group(&sdp->sd_kobj, &args_group);
486fail_counters:
487 sysfs_remove_group(&sdp->sd_kobj, &counters_group);
488fail_lockstruct: 444fail_lockstruct:
489 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); 445 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
490fail_reg: 446fail_reg:
@@ -498,16 +454,27 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
498{ 454{
499 sysfs_remove_group(&sdp->sd_kobj, &tune_group); 455 sysfs_remove_group(&sdp->sd_kobj, &tune_group);
500 sysfs_remove_group(&sdp->sd_kobj, &args_group); 456 sysfs_remove_group(&sdp->sd_kobj, &args_group);
501 sysfs_remove_group(&sdp->sd_kobj, &counters_group);
502 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); 457 sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
503 kobject_put(&sdp->sd_kobj); 458 kobject_put(&sdp->sd_kobj);
504} 459}
505 460
461static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
462 struct kobj_uevent_env *env)
463{
464 struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
465 add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
466 add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
467 return 0;
468}
469
470static struct kset_uevent_ops gfs2_uevent_ops = {
471 .uevent = gfs2_uevent,
472};
473
474
506int gfs2_sys_init(void) 475int gfs2_sys_init(void)
507{ 476{
508 gfs2_sys_margs = NULL; 477 gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj);
509 spin_lock_init(&gfs2_sys_margs_lock);
510 gfs2_kset = kset_create_and_add("gfs2", NULL, fs_kobj);
511 if (!gfs2_kset) 478 if (!gfs2_kset)
512 return -ENOMEM; 479 return -ENOMEM;
513 return 0; 480 return 0;
@@ -515,7 +482,6 @@ int gfs2_sys_init(void)
515 482
516void gfs2_sys_uninit(void) 483void gfs2_sys_uninit(void)
517{ 484{
518 kfree(gfs2_sys_margs);
519 kset_unregister(gfs2_kset); 485 kset_unregister(gfs2_kset);
520} 486}
521 487
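Editor's note: the sys.c changes above do two things: drop the old counters group and mount-args globals, and register a kset-wide uevent callback, so every uevent emitted for a gfs2 superblock kobject carries LOCKTABLE and LOCKPROTO environment variables that userspace tooling can key on. The registration shape against the 2.6.29 kobject API, with hypothetical names:

#include <linux/errno.h>
#include <linux/kobject.h>
#include <linux/fs.h>	/* for fs_kobj */

static int myfs_uevent(struct kset *kset, struct kobject *kobj,
		       struct kobj_uevent_env *env)
{
	/* variables added here ride along with every event in the kset */
	add_uevent_var(env, "MYFS_FLAVOUR=%s", "example");
	return 0;
}

static struct kset_uevent_ops myfs_uevent_ops = {
	.uevent = myfs_uevent,
};

static struct kset *myfs_kset;

static int myfs_sys_init(void)
{
	myfs_kset = kset_create_and_add("myfs", &myfs_uevent_ops, fs_kobj);
	return myfs_kset ? 0 : -ENOMEM;
}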
diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h
index 1ca8cdac5304..e94560e836d7 100644
--- a/fs/gfs2/sys.h
+++ b/fs/gfs2/sys.h
@@ -13,10 +13,6 @@
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14struct gfs2_sbd; 14struct gfs2_sbd;
15 15
16/* Allow args to be passed to GFS2 when using an initial ram disk */
17extern char *gfs2_sys_margs;
18extern spinlock_t gfs2_sys_margs_lock;
19
20int gfs2_sys_fs_add(struct gfs2_sbd *sdp); 16int gfs2_sys_fs_add(struct gfs2_sbd *sdp);
21void gfs2_sys_fs_del(struct gfs2_sbd *sdp); 17void gfs2_sys_fs_del(struct gfs2_sbd *sdp);
22 18
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index d31e355c61fb..374f50e95496 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -25,6 +25,7 @@ struct kmem_cache *gfs2_glock_cachep __read_mostly;
25struct kmem_cache *gfs2_inode_cachep __read_mostly; 25struct kmem_cache *gfs2_inode_cachep __read_mostly;
26struct kmem_cache *gfs2_bufdata_cachep __read_mostly; 26struct kmem_cache *gfs2_bufdata_cachep __read_mostly;
27struct kmem_cache *gfs2_rgrpd_cachep __read_mostly; 27struct kmem_cache *gfs2_rgrpd_cachep __read_mostly;
28struct kmem_cache *gfs2_quotad_cachep __read_mostly;
28 29
29void gfs2_assert_i(struct gfs2_sbd *sdp) 30void gfs2_assert_i(struct gfs2_sbd *sdp)
30{ 31{
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 7f48576289c9..33e96b0ce9ab 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -148,6 +148,7 @@ extern struct kmem_cache *gfs2_glock_cachep;
148extern struct kmem_cache *gfs2_inode_cachep; 148extern struct kmem_cache *gfs2_inode_cachep;
149extern struct kmem_cache *gfs2_bufdata_cachep; 149extern struct kmem_cache *gfs2_bufdata_cachep;
150extern struct kmem_cache *gfs2_rgrpd_cachep; 150extern struct kmem_cache *gfs2_rgrpd_cachep;
151extern struct kmem_cache *gfs2_quotad_cachep;
151 152
152static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, 153static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
153 unsigned int *p) 154 unsigned int *p)
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 3a31451ac170..5c538e0ec14b 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -501,7 +501,7 @@ int hostfs_write_begin(struct file *file, struct address_space *mapping,
501{ 501{
502 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 502 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
503 503
504 *pagep = __grab_cache_page(mapping, index); 504 *pagep = grab_cache_page_write_begin(mapping, index, flags);
505 if (!*pagep) 505 if (!*pagep)
506 return -ENOMEM; 506 return -ENOMEM;
507 return 0; 507 return 0;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 7d479ce3aceb..6903d37af037 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -252,6 +252,7 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
252 for (;;) { 252 for (;;) {
253 struct page *page; 253 struct page *page;
254 unsigned long nr, ret; 254 unsigned long nr, ret;
255 int ra;
255 256
256 /* nr is the maximum number of bytes to copy from this page */ 257 /* nr is the maximum number of bytes to copy from this page */
257 nr = huge_page_size(h); 258 nr = huge_page_size(h);
@@ -274,16 +275,19 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
274 */ 275 */
275 ret = len < nr ? len : nr; 276 ret = len < nr ? len : nr;
276 if (clear_user(buf, ret)) 277 if (clear_user(buf, ret))
277 ret = -EFAULT; 278 ra = -EFAULT;
279 else
280 ra = 0;
278 } else { 281 } else {
279 /* 282 /*
280 * We have the page, copy it to user space buffer. 283 * We have the page, copy it to user space buffer.
281 */ 284 */
282 ret = hugetlbfs_read_actor(page, offset, buf, len, nr); 285 ra = hugetlbfs_read_actor(page, offset, buf, len, nr);
286 ret = ra;
283 } 287 }
284 if (ret < 0) { 288 if (ra < 0) {
285 if (retval == 0) 289 if (retval == 0)
286 retval = ret; 290 retval = ra;
287 if (page) 291 if (page)
288 page_cache_release(page); 292 page_cache_release(page);
289 goto out; 293 goto out;
@@ -506,7 +510,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
506 inode->i_mode = mode; 510 inode->i_mode = mode;
507 inode->i_uid = uid; 511 inode->i_uid = uid;
508 inode->i_gid = gid; 512 inode->i_gid = gid;
509 inode->i_blocks = 0;
510 inode->i_mapping->a_ops = &hugetlbfs_aops; 513 inode->i_mapping->a_ops = &hugetlbfs_aops;
511 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; 514 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
512 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 515 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
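Editor's note: the hugetlbfs_read() hunk above is a signedness fix. `ret` is unsigned long, so the old `if (ret < 0)` could never be true and an -EFAULT from clear_user() was silently treated as a byte count; the patch routes the error through a signed int (`ra`) instead. The bug in miniature, as a plain userspace C sketch (copy_helper is a stand-in):

#include <errno.h>
#include <stdio.h>

static int copy_helper(void)
{
	return -EFAULT;		/* stand-in for a failing clear_user() */
}

int main(void)
{
	unsigned long ret;	/* byte count: unsigned, like the original */
	int ra;			/* signed carrier for the error code */

	ra = copy_helper();
	ret = ra;		/* -EFAULT wraps to a huge positive value */

	if (ret < 0)		/* the original bug: always false
				   (compilers typically warn here) */
		puts("unreachable");
	if (ra < 0)		/* the fix: test the signed variable */
		printf("error %d\n", ra);
	return 0;
}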
diff --git a/fs/inode.c b/fs/inode.c
index 0487ddba1397..913ab2d9a5d1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -22,6 +22,7 @@
22#include <linux/bootmem.h> 22#include <linux/bootmem.h>
23#include <linux/inotify.h> 23#include <linux/inotify.h>
24#include <linux/mount.h> 24#include <linux/mount.h>
25#include <linux/async.h>
25 26
26/* 27/*
27 * This is needed for the following functions: 28 * This is needed for the following functions:
@@ -108,84 +109,102 @@ static void wake_up_inode(struct inode *inode)
108 wake_up_bit(&inode->i_state, __I_LOCK); 109 wake_up_bit(&inode->i_state, __I_LOCK);
109} 110}
110 111
111static struct inode *alloc_inode(struct super_block *sb) 112/**
113 * inode_init_always - perform inode structure initialisation
114 * @sb: superblock inode belongs to
115 * @inode: inode to initialise
116 *
117 * These are initializations that need to be done on every inode
118 * allocation as the fields are not initialised by slab allocation.
119 */
120struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
112{ 121{
113 static const struct address_space_operations empty_aops; 122 static const struct address_space_operations empty_aops;
114 static struct inode_operations empty_iops; 123 static struct inode_operations empty_iops;
115 static const struct file_operations empty_fops; 124 static const struct file_operations empty_fops;
116 struct inode *inode;
117 125
118 if (sb->s_op->alloc_inode) 126 struct address_space * const mapping = &inode->i_data;
119 inode = sb->s_op->alloc_inode(sb); 127
120 else 128 inode->i_sb = sb;
121 inode = (struct inode *) kmem_cache_alloc(inode_cachep, GFP_KERNEL); 129 inode->i_blkbits = sb->s_blocksize_bits;
122 130 inode->i_flags = 0;
123 if (inode) { 131 atomic_set(&inode->i_count, 1);
124 struct address_space * const mapping = &inode->i_data; 132 inode->i_op = &empty_iops;
125 133 inode->i_fop = &empty_fops;
126 inode->i_sb = sb; 134 inode->i_nlink = 1;
127 inode->i_blkbits = sb->s_blocksize_bits; 135 inode->i_uid = 0;
128 inode->i_flags = 0; 136 inode->i_gid = 0;
129 atomic_set(&inode->i_count, 1); 137 atomic_set(&inode->i_writecount, 0);
130 inode->i_op = &empty_iops; 138 inode->i_size = 0;
131 inode->i_fop = &empty_fops; 139 inode->i_blocks = 0;
132 inode->i_nlink = 1; 140 inode->i_bytes = 0;
133 atomic_set(&inode->i_writecount, 0); 141 inode->i_generation = 0;
134 inode->i_size = 0;
135 inode->i_blocks = 0;
136 inode->i_bytes = 0;
137 inode->i_generation = 0;
138#ifdef CONFIG_QUOTA 142#ifdef CONFIG_QUOTA
139 memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); 143 memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
140#endif 144#endif
141 inode->i_pipe = NULL; 145 inode->i_pipe = NULL;
142 inode->i_bdev = NULL; 146 inode->i_bdev = NULL;
143 inode->i_cdev = NULL; 147 inode->i_cdev = NULL;
144 inode->i_rdev = 0; 148 inode->i_rdev = 0;
145 inode->dirtied_when = 0; 149 inode->dirtied_when = 0;
146 if (security_inode_alloc(inode)) { 150 if (security_inode_alloc(inode)) {
147 if (inode->i_sb->s_op->destroy_inode) 151 if (inode->i_sb->s_op->destroy_inode)
148 inode->i_sb->s_op->destroy_inode(inode); 152 inode->i_sb->s_op->destroy_inode(inode);
149 else 153 else
150 kmem_cache_free(inode_cachep, (inode)); 154 kmem_cache_free(inode_cachep, (inode));
151 return NULL; 155 return NULL;
152 } 156 }
153 157
154 spin_lock_init(&inode->i_lock); 158 spin_lock_init(&inode->i_lock);
155 lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); 159 lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
156 160
157 mutex_init(&inode->i_mutex); 161 mutex_init(&inode->i_mutex);
158 lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); 162 lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
159 163
160 init_rwsem(&inode->i_alloc_sem); 164 init_rwsem(&inode->i_alloc_sem);
161 lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key); 165 lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key);
162 166
163 mapping->a_ops = &empty_aops; 167 mapping->a_ops = &empty_aops;
164 mapping->host = inode; 168 mapping->host = inode;
165 mapping->flags = 0; 169 mapping->flags = 0;
166 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE); 170 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
167 mapping->assoc_mapping = NULL; 171 mapping->assoc_mapping = NULL;
168 mapping->backing_dev_info = &default_backing_dev_info; 172 mapping->backing_dev_info = &default_backing_dev_info;
169 mapping->writeback_index = 0; 173 mapping->writeback_index = 0;
170 174
171 /* 175 /*
172 * If the block_device provides a backing_dev_info for client 176 * inodes then use that. Otherwise the inode shares the bdev's
173 * inodes then use that. Otherwise the inode shares the bdev's 177 * backing_dev_info.
174 * backing_dev_info. 178 * backing_dev_info.
175 */ 179 */
176 if (sb->s_bdev) { 180 if (sb->s_bdev) {
177 struct backing_dev_info *bdi; 181 struct backing_dev_info *bdi;
178 182
179 bdi = sb->s_bdev->bd_inode_backing_dev_info; 183 bdi = sb->s_bdev->bd_inode_backing_dev_info;
180 if (!bdi) 184 if (!bdi)
181 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; 185 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
182 mapping->backing_dev_info = bdi; 186 mapping->backing_dev_info = bdi;
183 }
184 inode->i_private = NULL;
185 inode->i_mapping = mapping;
186 } 187 }
188 inode->i_private = NULL;
189 inode->i_mapping = mapping;
190
187 return inode; 191 return inode;
188} 192}
193EXPORT_SYMBOL(inode_init_always);
194
195static struct inode *alloc_inode(struct super_block *sb)
196{
197 struct inode *inode;
198
199 if (sb->s_op->alloc_inode)
200 inode = sb->s_op->alloc_inode(sb);
201 else
202 inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
203
204 if (inode)
205 return inode_init_always(sb, inode);
206 return NULL;
207}
189 208
190void destroy_inode(struct inode *inode) 209void destroy_inode(struct inode *inode)
191{ 210{
@@ -196,6 +215,7 @@ void destroy_inode(struct inode *inode)
196 else 215 else
197 kmem_cache_free(inode_cachep, (inode)); 216 kmem_cache_free(inode_cachep, (inode));
198} 217}
218EXPORT_SYMBOL(destroy_inode);
199 219
200 220
201/* 221/*
@@ -534,12 +554,55 @@ repeat:
534 return node ? inode : NULL; 554 return node ? inode : NULL;
535} 555}
536 556
557static unsigned long hash(struct super_block *sb, unsigned long hashval)
558{
559 unsigned long tmp;
560
561 tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
562 L1_CACHE_BYTES;
563 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
564 return tmp & I_HASHMASK;
565}
566
567static inline void
568__inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
569 struct inode *inode)
570{
571 inodes_stat.nr_inodes++;
572 list_add(&inode->i_list, &inode_in_use);
573 list_add(&inode->i_sb_list, &sb->s_inodes);
574 if (head)
575 hlist_add_head(&inode->i_hash, head);
576}
577
578/**
579 * inode_add_to_lists - add a new inode to relevant lists
580 * @sb: superblock inode belongs to
581 * @inode: inode to mark in use
582 *
583 * When an inode is allocated it needs to be accounted for, added to the in use
584 * list, the owning superblock and the inode hash. This needs to be done under
585 * the inode_lock, so export a function to do this rather than exporting the
586 * inode lock itself. The hash list is calculated internally, which requires
587 * the caller to have already set up the inode number in the inode being
588 * added.
589 */
590void inode_add_to_lists(struct super_block *sb, struct inode *inode)
591{
592 struct hlist_head *head = inode_hashtable + hash(sb, inode->i_ino);
593
594 spin_lock(&inode_lock);
595 __inode_add_to_lists(sb, head, inode);
596 spin_unlock(&inode_lock);
597}
598EXPORT_SYMBOL_GPL(inode_add_to_lists);
599
537/** 600/**
538 * new_inode - obtain an inode 601 * new_inode - obtain an inode
539 * @sb: superblock 602 * @sb: superblock
540 * 603 *
541 * Allocates a new inode for given superblock. The default gfp_mask 604 * Allocates a new inode for given superblock. The default gfp_mask
542 * for allocations related to inode->i_mapping is GFP_HIGHUSER_PAGECACHE. 605 * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
543 * If HIGHMEM pages are unsuitable or it is known that pages allocated 606 * If HIGHMEM pages are unsuitable or it is known that pages allocated
544 * for the page cache are not reclaimable or migratable, 607 * for the page cache are not reclaimable or migratable,
545 * mapping_set_gfp_mask() must be called with suitable flags on the 608 * mapping_set_gfp_mask() must be called with suitable flags on the
@@ -561,9 +624,7 @@ struct inode *new_inode(struct super_block *sb)
561 inode = alloc_inode(sb); 624 inode = alloc_inode(sb);
562 if (inode) { 625 if (inode) {
563 spin_lock(&inode_lock); 626 spin_lock(&inode_lock);
564 inodes_stat.nr_inodes++; 627 __inode_add_to_lists(sb, NULL, inode);
565 list_add(&inode->i_list, &inode_in_use);
566 list_add(&inode->i_sb_list, &sb->s_inodes);
567 inode->i_ino = ++last_ino; 628 inode->i_ino = ++last_ino;
568 inode->i_state = 0; 629 inode->i_state = 0;
569 spin_unlock(&inode_lock); 630 spin_unlock(&inode_lock);
@@ -622,10 +683,7 @@ static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *h
622 if (set(inode, data)) 683 if (set(inode, data))
623 goto set_failed; 684 goto set_failed;
624 685
625 inodes_stat.nr_inodes++; 686 __inode_add_to_lists(sb, head, inode);
626 list_add(&inode->i_list, &inode_in_use);
627 list_add(&inode->i_sb_list, &sb->s_inodes);
628 hlist_add_head(&inode->i_hash, head);
629 inode->i_state = I_LOCK|I_NEW; 687 inode->i_state = I_LOCK|I_NEW;
630 spin_unlock(&inode_lock); 688 spin_unlock(&inode_lock);
631 689
@@ -671,10 +729,7 @@ static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_he
671 old = find_inode_fast(sb, head, ino); 729 old = find_inode_fast(sb, head, ino);
672 if (!old) { 730 if (!old) {
673 inode->i_ino = ino; 731 inode->i_ino = ino;
674 inodes_stat.nr_inodes++; 732 __inode_add_to_lists(sb, head, inode);
675 list_add(&inode->i_list, &inode_in_use);
676 list_add(&inode->i_sb_list, &sb->s_inodes);
677 hlist_add_head(&inode->i_hash, head);
678 inode->i_state = I_LOCK|I_NEW; 733 inode->i_state = I_LOCK|I_NEW;
679 spin_unlock(&inode_lock); 734 spin_unlock(&inode_lock);
680 735
@@ -698,16 +753,6 @@ static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_he
698 return inode; 753 return inode;
699} 754}
700 755
701static unsigned long hash(struct super_block *sb, unsigned long hashval)
702{
703 unsigned long tmp;
704
705 tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
706 L1_CACHE_BYTES;
707 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
708 return tmp & I_HASHMASK;
709}
710
711/** 756/**
712 * iunique - get a unique inode number 757 * iunique - get a unique inode number
713 * @sb: superblock 758 * @sb: superblock
@@ -990,6 +1035,65 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
990 1035
991EXPORT_SYMBOL(iget_locked); 1036EXPORT_SYMBOL(iget_locked);
992 1037
1038int insert_inode_locked(struct inode *inode)
1039{
1040 struct super_block *sb = inode->i_sb;
1041 ino_t ino = inode->i_ino;
1042 struct hlist_head *head = inode_hashtable + hash(sb, ino);
1043 struct inode *old;
1044
1045 inode->i_state |= I_LOCK|I_NEW;
1046 while (1) {
1047 spin_lock(&inode_lock);
1048 old = find_inode_fast(sb, head, ino);
1049 if (likely(!old)) {
1050 hlist_add_head(&inode->i_hash, head);
1051 spin_unlock(&inode_lock);
1052 return 0;
1053 }
1054 __iget(old);
1055 spin_unlock(&inode_lock);
1056 wait_on_inode(old);
1057 if (unlikely(!hlist_unhashed(&old->i_hash))) {
1058 iput(old);
1059 return -EBUSY;
1060 }
1061 iput(old);
1062 }
1063}
1064
1065EXPORT_SYMBOL(insert_inode_locked);
1066
1067int insert_inode_locked4(struct inode *inode, unsigned long hashval,
1068 int (*test)(struct inode *, void *), void *data)
1069{
1070 struct super_block *sb = inode->i_sb;
1071 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1072 struct inode *old;
1073
1074 inode->i_state |= I_LOCK|I_NEW;
1075
1076 while (1) {
1077 spin_lock(&inode_lock);
1078 old = find_inode(sb, head, test, data);
1079 if (likely(!old)) {
1080 hlist_add_head(&inode->i_hash, head);
1081 spin_unlock(&inode_lock);
1082 return 0;
1083 }
1084 __iget(old);
1085 spin_unlock(&inode_lock);
1086 wait_on_inode(old);
1087 if (unlikely(!hlist_unhashed(&old->i_hash))) {
1088 iput(old);
1089 return -EBUSY;
1090 }
1091 iput(old);
1092 }
1093}
1094
1095EXPORT_SYMBOL(insert_inode_locked4);
1096
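The intended calling pattern for a create path, sketched (the surrounding allocation and error handling are illustrative, not from this patch): allocate the inode, pick the inumber, and let insert_inode_locked() either hash it or report that a live inode already owns that number.

	inode = new_inode(sb);
	if (!inode)
		return -ENOMEM;
	inode->i_ino = ino;			/* fs-chosen inode number */
	if (insert_inode_locked(inode) < 0) {
		/* a live inode with this number beat us to the hash */
		iput(inode);
		return -EBUSY;
	}
	/* ... write the on-disk inode ... */
	unlock_new_inode(inode);	/* clears I_LOCK|I_NEW, wakes waiters */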
993/** 1097/**
994 * __insert_inode_hash - hash an inode 1098 * __insert_inode_hash - hash an inode
995 * @inode: unhashed inode 1099 * @inode: unhashed inode
@@ -1292,6 +1396,7 @@ int inode_wait(void *word)
1292 schedule(); 1396 schedule();
1293 return 0; 1397 return 0;
1294} 1398}
1399EXPORT_SYMBOL(inode_wait);
1295 1400
1296/* 1401/*
1297 * If we try to find an inode in the inode hash while it is being 1402 * If we try to find an inode in the inode hash while it is being
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 43e8b2c0664b..20b0a8a24c6b 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -231,7 +231,8 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
231#define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits) 231#define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits)
232#define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits); 232#define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits);
233 233
234/* 234/**
235 * __generic_block_fiemap - FIEMAP for block based inodes (no locking)
235 * @inode - the inode to map 236 * @inode - the inode to map
236 * @arg - the pointer to userspace where we copy everything to 237 * @arg - the pointer to userspace where we copy everything to
237 * @get_block - the fs's get_block function 238 * @get_block - the fs's get_block function
@@ -242,11 +243,15 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
242 * 243 *
243 * If it is possible to have data blocks beyond a hole past @inode->i_size, then 244 * If it is possible to have data blocks beyond a hole past @inode->i_size, then
244 * please do not use this function, it will stop at the first unmapped block 245 * please do not use this function, it will stop at the first unmapped block
245 * beyond i_size 246 * beyond i_size.
247 *
248 * If you use this function directly, you need to do your own locking. Use
249 * generic_block_fiemap if you want the locking done for you.
246 */ 250 */
247int generic_block_fiemap(struct inode *inode, 251
248 struct fiemap_extent_info *fieinfo, u64 start, 252int __generic_block_fiemap(struct inode *inode,
249 u64 len, get_block_t *get_block) 253 struct fiemap_extent_info *fieinfo, u64 start,
254 u64 len, get_block_t *get_block)
250{ 255{
251 struct buffer_head tmp; 256 struct buffer_head tmp;
252 unsigned int start_blk; 257 unsigned int start_blk;
@@ -260,9 +265,6 @@ int generic_block_fiemap(struct inode *inode,
260 265
261 start_blk = logical_to_blk(inode, start); 266 start_blk = logical_to_blk(inode, start);
262 267
263 /* guard against change */
264 mutex_lock(&inode->i_mutex);
265
266 length = (long long)min_t(u64, len, i_size_read(inode)); 268 length = (long long)min_t(u64, len, i_size_read(inode));
267 map_len = length; 269 map_len = length;
268 270
@@ -334,14 +336,36 @@ int generic_block_fiemap(struct inode *inode,
334 cond_resched(); 336 cond_resched();
335 } while (1); 337 } while (1);
336 338
337 mutex_unlock(&inode->i_mutex);
338
339 /* if ret is 1 then we just hit the end of the extent array */ 339 /* if ret is 1 then we just hit the end of the extent array */
340 if (ret == 1) 340 if (ret == 1)
341 ret = 0; 341 ret = 0;
342 342
343 return ret; 343 return ret;
344} 344}
345EXPORT_SYMBOL(__generic_block_fiemap);
346
347/**
348 * generic_block_fiemap - FIEMAP for block based inodes
349 * @inode: The inode to map
350 * @fieinfo: The mapping information
351 * @start: The initial block to map
 352 * @len: The length of the extent to attempt to map
353 * @get_block: The block mapping function for the fs
354 *
355 * Calls __generic_block_fiemap to map the inode, after taking
356 * the inode's mutex lock.
357 */
358
359int generic_block_fiemap(struct inode *inode,
360 struct fiemap_extent_info *fieinfo, u64 start,
361 u64 len, get_block_t *get_block)
362{
363 int ret;
364 mutex_lock(&inode->i_mutex);
365 ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block);
366 mutex_unlock(&inode->i_mutex);
367 return ret;
368}
345EXPORT_SYMBOL(generic_block_fiemap); 369EXPORT_SYMBOL(generic_block_fiemap);
346 370
347#endif /* CONFIG_BLOCK */ 371#endif /* CONFIG_BLOCK */
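For most block-based filesystems the locked wrapper is all that is needed; per the new kerneldoc, only callers that do their own locking should drop to __generic_block_fiemap. A sketch of the common wiring, with examplefs_get_block standing in for the filesystem's real get_block_t:

static int examplefs_fiemap(struct inode *inode,
			    struct fiemap_extent_info *fieinfo,
			    u64 start, u64 len)
{
	return generic_block_fiemap(inode, fieinfo, start, len,
				    examplefs_get_block);
}

static const struct inode_operations examplefs_file_iops = {
	.fiemap		= examplefs_fiemap,
	/* ... */
};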
@@ -415,6 +439,43 @@ static int ioctl_fioasync(unsigned int fd, struct file *filp,
415 return error; 439 return error;
416} 440}
417 441
442static int ioctl_fsfreeze(struct file *filp)
443{
444 struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
445
446 if (!capable(CAP_SYS_ADMIN))
447 return -EPERM;
448
 449 /* If the filesystem doesn't support the freeze feature, return. */
450 if (sb->s_op->freeze_fs == NULL)
451 return -EOPNOTSUPP;
452
 453 /* If the filesystem isn't backed by a block device, return. */
454 if (sb->s_bdev == NULL)
455 return -EINVAL;
456
457 /* Freeze */
458 sb = freeze_bdev(sb->s_bdev);
459 if (IS_ERR(sb))
460 return PTR_ERR(sb);
461 return 0;
462}
463
464static int ioctl_fsthaw(struct file *filp)
465{
466 struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
467
468 if (!capable(CAP_SYS_ADMIN))
469 return -EPERM;
470
 471 /* If the filesystem isn't backed by a block device, return EINVAL. */
472 if (sb->s_bdev == NULL)
473 return -EINVAL;
474
475 /* Thaw */
476 return thaw_bdev(sb->s_bdev, sb);
477}
478
418/* 479/*
419 * When you add any new common ioctls to the switches above and below 480 * When you add any new common ioctls to the switches above and below
420 * please update compat_sys_ioctl() too. 481 * please update compat_sys_ioctl() too.
@@ -462,6 +523,15 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
462 } else 523 } else
463 error = -ENOTTY; 524 error = -ENOTTY;
464 break; 525 break;
526
527 case FIFREEZE:
528 error = ioctl_fsfreeze(filp);
529 break;
530
531 case FITHAW:
532 error = ioctl_fsthaw(filp);
533 break;
534
465 default: 535 default:
466 if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) 536 if (S_ISREG(filp->f_path.dentry->d_inode->i_mode))
467 error = file_ioctl(filp, cmd, arg); 537 error = file_ioctl(filp, cmd, arg);
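From userspace the two new ioctls can be issued against any open file or directory on the filesystem, and the caller needs CAP_SYS_ADMIN. A minimal sketch of a freeze/thaw wrapper around a snapshot, assuming the matching FIFREEZE/FITHAW definitions from linux/fs.h (the ioctl argument is ignored by this implementation):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FIFREEZE, FITHAW */

int main(int argc, char **argv)
{
	int fd = open(argv[1], O_RDONLY);	/* any file on the fs */

	if (fd < 0 || ioctl(fd, FIFREEZE, 0) < 0) {
		perror("freeze");
		return 1;
	}
	/* ... take the block-level snapshot here ... */
	if (ioctl(fd, FITHAW, 0) < 0) {
		perror("thaw");
		return 1;
	}
	return close(fd);
}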
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 3569e0ad86a2..1a39ac370942 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -27,7 +27,7 @@
27#include <linux/security.h> 27#include <linux/security.h>
28#include <linux/pid_namespace.h> 28#include <linux/pid_namespace.h>
29 29
30static int set_task_ioprio(struct task_struct *task, int ioprio) 30int set_task_ioprio(struct task_struct *task, int ioprio)
31{ 31{
32 int err; 32 int err;
33 struct io_context *ioc; 33 struct io_context *ioc;
@@ -70,6 +70,7 @@ static int set_task_ioprio(struct task_struct *task, int ioprio)
70 task_unlock(task); 70 task_unlock(task);
71 return err; 71 return err;
72} 72}
73EXPORT_SYMBOL_GPL(set_task_ioprio);
73 74
74asmlinkage long sys_ioprio_set(int which, int who, int ioprio) 75asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
75{ 76{
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 3f8af0f1505b..6147ec3643a0 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -855,10 +855,6 @@ root_found:
855 } 855 }
856 sbi->s_joliet_level = joliet_level; 856 sbi->s_joliet_level = joliet_level;
857 857
858 /* check the root inode */
859 if (!inode->i_op)
860 goto out_bad_root;
861
862 /* Make sure the root inode is a directory */ 858 /* Make sure the root inode is a directory */
863 if (!S_ISDIR(inode->i_mode)) { 859 if (!S_ISDIR(inode->i_mode)) {
864 printk(KERN_WARNING 860 printk(KERN_WARNING
@@ -886,8 +882,6 @@ root_found:
886 /* 882 /*
887 * Display error messages and free resources. 883 * Display error messages and free resources.
888 */ 884 */
889out_bad_root:
890 printk(KERN_WARNING "%s: root inode not initialized\n", __func__);
891out_iput: 885out_iput:
892 iput(inode); 886 iput(inode);
893 goto out_no_inode; 887 goto out_no_inode;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 25719d902c51..3fbffb1ea714 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -306,6 +306,8 @@ void journal_commit_transaction(journal_t *journal)
306 int flags; 306 int flags;
307 int err; 307 int err;
308 unsigned long blocknr; 308 unsigned long blocknr;
309 ktime_t start_time;
310 u64 commit_time;
309 char *tagp = NULL; 311 char *tagp = NULL;
310 journal_header_t *header; 312 journal_header_t *header;
311 journal_block_tag_t *tag = NULL; 313 journal_block_tag_t *tag = NULL;
@@ -418,6 +420,7 @@ void journal_commit_transaction(journal_t *journal)
418 commit_transaction->t_state = T_FLUSH; 420 commit_transaction->t_state = T_FLUSH;
419 journal->j_committing_transaction = commit_transaction; 421 journal->j_committing_transaction = commit_transaction;
420 journal->j_running_transaction = NULL; 422 journal->j_running_transaction = NULL;
423 start_time = ktime_get();
421 commit_transaction->t_log_start = journal->j_head; 424 commit_transaction->t_log_start = journal->j_head;
422 wake_up(&journal->j_wait_transaction_locked); 425 wake_up(&journal->j_wait_transaction_locked);
423 spin_unlock(&journal->j_state_lock); 426 spin_unlock(&journal->j_state_lock);
@@ -913,6 +916,18 @@ restart_loop:
913 J_ASSERT(commit_transaction == journal->j_committing_transaction); 916 J_ASSERT(commit_transaction == journal->j_committing_transaction);
914 journal->j_commit_sequence = commit_transaction->t_tid; 917 journal->j_commit_sequence = commit_transaction->t_tid;
915 journal->j_committing_transaction = NULL; 918 journal->j_committing_transaction = NULL;
919 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
920
921 /*
922 * weight the commit time higher than the average time so we don't
923 * react too strongly to vast changes in commit time
924 */
925 if (likely(journal->j_average_commit_time))
926 journal->j_average_commit_time = (commit_time*3 +
927 journal->j_average_commit_time) / 4;
928 else
929 journal->j_average_commit_time = commit_time;
930
916 spin_unlock(&journal->j_state_lock); 931 spin_unlock(&journal->j_state_lock);
917 932
918 if (commit_transaction->t_checkpoint_list == NULL && 933 if (commit_transaction->t_checkpoint_list == NULL &&
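The average is an exponentially weighted moving average in nanoseconds, seeded with the first sample and thereafter weighting the newest commit 3:1 (note the jbd2 copy of this change, further below, weights the old average 3:1 instead). The same update step, factored out for illustration:

/* One update of the running average used above (jbd weighting). */
static inline u64 example_avg_commit_time(u64 avg, u64 sample_ns)
{
	return avg ? (sample_ns * 3 + avg) / 4 : sample_ns;
}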
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 60d4c32c8808..e6a117431277 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -25,6 +25,7 @@
25#include <linux/timer.h> 25#include <linux/timer.h>
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/hrtimer.h>
28 29
29static void __journal_temp_unlink_buffer(struct journal_head *jh); 30static void __journal_temp_unlink_buffer(struct journal_head *jh);
30 31
@@ -49,6 +50,7 @@ get_transaction(journal_t *journal, transaction_t *transaction)
49{ 50{
50 transaction->t_journal = journal; 51 transaction->t_journal = journal;
51 transaction->t_state = T_RUNNING; 52 transaction->t_state = T_RUNNING;
53 transaction->t_start_time = ktime_get();
52 transaction->t_tid = journal->j_transaction_sequence++; 54 transaction->t_tid = journal->j_transaction_sequence++;
53 transaction->t_expires = jiffies + journal->j_commit_interval; 55 transaction->t_expires = jiffies + journal->j_commit_interval;
54 spin_lock_init(&transaction->t_handle_lock); 56 spin_lock_init(&transaction->t_handle_lock);
@@ -752,7 +754,6 @@ out:
752 * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. 754 * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
753 * @handle: transaction to add buffer modifications to 755 * @handle: transaction to add buffer modifications to
754 * @bh: bh to be used for metadata writes 756 * @bh: bh to be used for metadata writes
755 * @credits: variable that will receive credits for the buffer
756 * 757 *
757 * Returns an error code or 0 on success. 758 * Returns an error code or 0 on success.
758 * 759 *
@@ -1370,7 +1371,7 @@ int journal_stop(handle_t *handle)
1370{ 1371{
1371 transaction_t *transaction = handle->h_transaction; 1372 transaction_t *transaction = handle->h_transaction;
1372 journal_t *journal = transaction->t_journal; 1373 journal_t *journal = transaction->t_journal;
1373 int old_handle_count, err; 1374 int err;
1374 pid_t pid; 1375 pid_t pid;
1375 1376
1376 J_ASSERT(journal_current_handle() == handle); 1377 J_ASSERT(journal_current_handle() == handle);
@@ -1399,6 +1400,17 @@ int journal_stop(handle_t *handle)
1399 * on IO anyway. Speeds up many-threaded, many-dir operations 1400 * on IO anyway. Speeds up many-threaded, many-dir operations
1400 * by 30x or more... 1401 * by 30x or more...
1401 * 1402 *
1403 * We try and optimize the sleep time against what the underlying disk
 1404 * can do, instead of having a static sleep time. This is useful for
1405 * the case where our storage is so fast that it is more optimal to go
1406 * ahead and force a flush and wait for the transaction to be committed
1407 * than it is to wait for an arbitrary amount of time for new writers to
 1408 * join the transaction. We achieve this by measuring how long it takes
1409 * to commit a transaction, and compare it with how long this
1410 * transaction has been running, and if run time < commit time then we
1411 * sleep for the delta and commit. This greatly helps super fast disks
1412 * that would see slowdowns as more threads started doing fsyncs.
1413 *
1402 * But don't do this if this process was the most recent one to 1414 * But don't do this if this process was the most recent one to
1403 * perform a synchronous write. We do this to detect the case where a 1415 * perform a synchronous write. We do this to detect the case where a
1404 * single process is doing a stream of sync writes. No point in waiting 1416 * single process is doing a stream of sync writes. No point in waiting
@@ -1406,11 +1418,26 @@ int journal_stop(handle_t *handle)
1406 */ 1418 */
1407 pid = current->pid; 1419 pid = current->pid;
1408 if (handle->h_sync && journal->j_last_sync_writer != pid) { 1420 if (handle->h_sync && journal->j_last_sync_writer != pid) {
1421 u64 commit_time, trans_time;
1422
1409 journal->j_last_sync_writer = pid; 1423 journal->j_last_sync_writer = pid;
1410 do { 1424
1411 old_handle_count = transaction->t_handle_count; 1425 spin_lock(&journal->j_state_lock);
1412 schedule_timeout_uninterruptible(1); 1426 commit_time = journal->j_average_commit_time;
1413 } while (old_handle_count != transaction->t_handle_count); 1427 spin_unlock(&journal->j_state_lock);
1428
1429 trans_time = ktime_to_ns(ktime_sub(ktime_get(),
1430 transaction->t_start_time));
1431
1432 commit_time = min_t(u64, commit_time,
1433 1000*jiffies_to_usecs(1));
1434
1435 if (trans_time < commit_time) {
1436 ktime_t expires = ktime_add_ns(ktime_get(),
1437 commit_time);
1438 set_current_state(TASK_UNINTERRUPTIBLE);
1439 schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
1440 }
1414 } 1441 }
1415 1442
1416 current->journal_info = NULL; 1443 current->journal_info = NULL;
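The old loop slept a whole jiffy at a time for as long as new handles kept arriving; the new code sleeps at most once, for the time a typical commit still needs, capped at one jiffy. For example, with j_average_commit_time at 200us and a transaction that has run for 50us, the handle sleeps roughly 150us instead of 1-10ms. The sleep itself is the standard hrtimer pattern, sketched standalone:

#include <linux/hrtimer.h>
#include <linux/sched.h>

/* Sleep for roughly delta_ns with hrtimer resolution, uninterruptibly. */
static void example_sleep_ns(u64 delta_ns)
{
	ktime_t expires = ktime_add_ns(ktime_get(), delta_ns);

	set_current_state(TASK_UNINTERRUPTIBLE);
	schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
}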
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 9497718fe920..17159cacbd9e 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -249,16 +249,14 @@ restart:
249 return ret; 249 return ret;
250} 250}
251 251
252#define NR_BATCH 64
253
254static void 252static void
255__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) 253__flush_batch(journal_t *journal, int *batch_count)
256{ 254{
257 int i; 255 int i;
258 256
259 ll_rw_block(SWRITE, *batch_count, bhs); 257 ll_rw_block(SWRITE, *batch_count, journal->j_chkpt_bhs);
260 for (i = 0; i < *batch_count; i++) { 258 for (i = 0; i < *batch_count; i++) {
261 struct buffer_head *bh = bhs[i]; 259 struct buffer_head *bh = journal->j_chkpt_bhs[i];
262 clear_buffer_jwrite(bh); 260 clear_buffer_jwrite(bh);
263 BUFFER_TRACE(bh, "brelse"); 261 BUFFER_TRACE(bh, "brelse");
264 __brelse(bh); 262 __brelse(bh);
@@ -277,8 +275,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
277 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it 275 * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
278 */ 276 */
279static int __process_buffer(journal_t *journal, struct journal_head *jh, 277static int __process_buffer(journal_t *journal, struct journal_head *jh,
280 struct buffer_head **bhs, int *batch_count, 278 int *batch_count, transaction_t *transaction)
281 transaction_t *transaction)
282{ 279{
283 struct buffer_head *bh = jh2bh(jh); 280 struct buffer_head *bh = jh2bh(jh);
284 int ret = 0; 281 int ret = 0;
@@ -325,14 +322,14 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
325 get_bh(bh); 322 get_bh(bh);
326 J_ASSERT_BH(bh, !buffer_jwrite(bh)); 323 J_ASSERT_BH(bh, !buffer_jwrite(bh));
327 set_buffer_jwrite(bh); 324 set_buffer_jwrite(bh);
328 bhs[*batch_count] = bh; 325 journal->j_chkpt_bhs[*batch_count] = bh;
329 __buffer_relink_io(jh); 326 __buffer_relink_io(jh);
330 jbd_unlock_bh_state(bh); 327 jbd_unlock_bh_state(bh);
331 transaction->t_chp_stats.cs_written++; 328 transaction->t_chp_stats.cs_written++;
332 (*batch_count)++; 329 (*batch_count)++;
333 if (*batch_count == NR_BATCH) { 330 if (*batch_count == JBD2_NR_BATCH) {
334 spin_unlock(&journal->j_list_lock); 331 spin_unlock(&journal->j_list_lock);
335 __flush_batch(journal, bhs, batch_count); 332 __flush_batch(journal, batch_count);
336 ret = 1; 333 ret = 1;
337 } 334 }
338 } 335 }
@@ -388,7 +385,6 @@ restart:
388 if (journal->j_checkpoint_transactions == transaction && 385 if (journal->j_checkpoint_transactions == transaction &&
389 transaction->t_tid == this_tid) { 386 transaction->t_tid == this_tid) {
390 int batch_count = 0; 387 int batch_count = 0;
391 struct buffer_head *bhs[NR_BATCH];
392 struct journal_head *jh; 388 struct journal_head *jh;
393 int retry = 0, err; 389 int retry = 0, err;
394 390
@@ -402,7 +398,7 @@ restart:
402 retry = 1; 398 retry = 1;
403 break; 399 break;
404 } 400 }
405 retry = __process_buffer(journal, jh, bhs, &batch_count, 401 retry = __process_buffer(journal, jh, &batch_count,
406 transaction); 402 transaction);
407 if (retry < 0 && !result) 403 if (retry < 0 && !result)
408 result = retry; 404 result = retry;
@@ -419,7 +415,7 @@ restart:
419 spin_unlock(&journal->j_list_lock); 415 spin_unlock(&journal->j_list_lock);
420 retry = 1; 416 retry = 1;
421 } 417 }
422 __flush_batch(journal, bhs, &batch_count); 418 __flush_batch(journal, &batch_count);
423 } 419 }
424 420
425 if (retry) { 421 if (retry) {
@@ -686,6 +682,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
686 safely remove this transaction from the log */ 682 safely remove this transaction from the log */
687 683
688 __jbd2_journal_drop_transaction(journal, transaction); 684 __jbd2_journal_drop_transaction(journal, transaction);
685 kfree(transaction);
689 686
690 /* Just in case anybody was waiting for more transactions to be 687 /* Just in case anybody was waiting for more transactions to be
691 checkpointed... */ 688 checkpointed... */
@@ -760,5 +757,4 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
760 J_ASSERT(journal->j_running_transaction != transaction); 757 J_ASSERT(journal->j_running_transaction != transaction);
761 758
762 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); 759 jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
763 kfree(transaction);
764} 760}
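Moving the batch array off the checkpoint stack frame into journal_t (j_chkpt_bhs, JBD2_NR_BATCH entries) drops roughly half a kilobyte of stack on 64-bit. It is safe only because checkpointing is single-threaded per journal, which is exactly what the j_checkpoint_mutex hunk in jbd2_journal_destroy (below) restores for the one previously unlocked caller. The invariant, sketched:

	/* j_chkpt_bhs is per-journal scratch: only ever touched with
	 * j_checkpoint_mutex held, so one batch is in flight at a time. */
	mutex_lock(&journal->j_checkpoint_mutex);
	jbd2_log_do_checkpoint(journal);	/* fills/flushes j_chkpt_bhs */
	mutex_unlock(&journal->j_checkpoint_mutex);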
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index ebc667bc54a8..62804e57a44c 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -25,6 +25,7 @@
25#include <linux/crc32.h> 25#include <linux/crc32.h>
26#include <linux/writeback.h> 26#include <linux/writeback.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/bio.h>
28 29
29/* 30/*
30 * Default IO end handler for temporary BJ_IO buffer_heads. 31 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -137,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal,
137 set_buffer_ordered(bh); 138 set_buffer_ordered(bh);
138 barrier_done = 1; 139 barrier_done = 1;
139 } 140 }
140 ret = submit_bh(WRITE, bh); 141 ret = submit_bh(WRITE_SYNC, bh);
141 if (barrier_done) 142 if (barrier_done)
142 clear_buffer_ordered(bh); 143 clear_buffer_ordered(bh);
143 144
@@ -158,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal,
158 lock_buffer(bh); 159 lock_buffer(bh);
159 set_buffer_uptodate(bh); 160 set_buffer_uptodate(bh);
160 clear_buffer_dirty(bh); 161 clear_buffer_dirty(bh);
161 ret = submit_bh(WRITE, bh); 162 ret = submit_bh(WRITE_SYNC, bh);
162 } 163 }
163 *cbh = bh; 164 *cbh = bh;
164 return ret; 165 return ret;
@@ -168,12 +169,34 @@ static int journal_submit_commit_record(journal_t *journal,
168 * This function along with journal_submit_commit_record 169 * This function along with journal_submit_commit_record
169 * allows to write the commit record asynchronously. 170 * allows to write the commit record asynchronously.
170 */ 171 */
171static int journal_wait_on_commit_record(struct buffer_head *bh) 172static int journal_wait_on_commit_record(journal_t *journal,
173 struct buffer_head *bh)
172{ 174{
173 int ret = 0; 175 int ret = 0;
174 176
177retry:
175 clear_buffer_dirty(bh); 178 clear_buffer_dirty(bh);
176 wait_on_buffer(bh); 179 wait_on_buffer(bh);
180 if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
181 printk(KERN_WARNING
182 "JBD2: wait_on_commit_record: sync failed on %s - "
183 "disabling barriers\n", journal->j_devname);
184 spin_lock(&journal->j_state_lock);
185 journal->j_flags &= ~JBD2_BARRIER;
186 spin_unlock(&journal->j_state_lock);
187
188 lock_buffer(bh);
189 clear_buffer_dirty(bh);
190 set_buffer_uptodate(bh);
191 bh->b_end_io = journal_end_buffer_io_sync;
192
193 ret = submit_bh(WRITE_SYNC, bh);
194 if (ret) {
195 unlock_buffer(bh);
196 return ret;
197 }
198 goto retry;
199 }
177 200
178 if (unlikely(!buffer_uptodate(bh))) 201 if (unlikely(!buffer_uptodate(bh)))
179 ret = -EIO; 202 ret = -EIO;
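Some devices only report missing barrier support at IO completion, so the wait path rather than the submit path has to handle it. Condensed, the recovery is: drop JBD2_BARRIER so later commits never retry this, then reissue the very same commit block as a plain synchronous write:

	wait_on_buffer(bh);
	if (buffer_eopnotsupp(bh)) {
		journal->j_flags &= ~JBD2_BARRIER;	/* under j_state_lock */
		lock_buffer(bh);
		clear_buffer_dirty(bh);
		set_buffer_uptodate(bh);
		bh->b_end_io = journal_end_buffer_io_sync;
		if (!submit_bh(WRITE_SYNC, bh))
			goto retry;	/* wait again, barrier-free */
		/* on failure: unlock_buffer(bh) and return the error */
	}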
@@ -332,13 +355,15 @@ void jbd2_journal_commit_transaction(journal_t *journal)
332 int flags; 355 int flags;
333 int err; 356 int err;
334 unsigned long long blocknr; 357 unsigned long long blocknr;
358 ktime_t start_time;
359 u64 commit_time;
335 char *tagp = NULL; 360 char *tagp = NULL;
336 journal_header_t *header; 361 journal_header_t *header;
337 journal_block_tag_t *tag = NULL; 362 journal_block_tag_t *tag = NULL;
338 int space_left = 0; 363 int space_left = 0;
339 int first_tag = 0; 364 int first_tag = 0;
340 int tag_flag; 365 int tag_flag;
341 int i; 366 int i, to_free = 0;
342 int tag_bytes = journal_tag_bytes(journal); 367 int tag_bytes = journal_tag_bytes(journal);
343 struct buffer_head *cbh = NULL; /* For transactional checksums */ 368 struct buffer_head *cbh = NULL; /* For transactional checksums */
344 __u32 crc32_sum = ~0; 369 __u32 crc32_sum = ~0;
@@ -458,6 +483,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
458 commit_transaction->t_state = T_FLUSH; 483 commit_transaction->t_state = T_FLUSH;
459 journal->j_committing_transaction = commit_transaction; 484 journal->j_committing_transaction = commit_transaction;
460 journal->j_running_transaction = NULL; 485 journal->j_running_transaction = NULL;
486 start_time = ktime_get();
461 commit_transaction->t_log_start = journal->j_head; 487 commit_transaction->t_log_start = journal->j_head;
462 wake_up(&journal->j_wait_transaction_locked); 488 wake_up(&journal->j_wait_transaction_locked);
463 spin_unlock(&journal->j_state_lock); 489 spin_unlock(&journal->j_state_lock);
@@ -509,6 +535,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
509 if (is_journal_aborted(journal)) { 535 if (is_journal_aborted(journal)) {
510 clear_buffer_jbddirty(jh2bh(jh)); 536 clear_buffer_jbddirty(jh2bh(jh));
511 JBUFFER_TRACE(jh, "journal is aborting: refile"); 537 JBUFFER_TRACE(jh, "journal is aborting: refile");
538 jbd2_buffer_abort_trigger(jh,
539 jh->b_frozen_data ?
540 jh->b_frozen_triggers :
541 jh->b_triggers);
512 jbd2_journal_refile_buffer(journal, jh); 542 jbd2_journal_refile_buffer(journal, jh);
513 /* If that was the last one, we need to clean up 543 /* If that was the last one, we need to clean up
514 * any descriptor buffers which may have been 544 * any descriptor buffers which may have been
@@ -799,7 +829,7 @@ wait_for_iobuf:
799 __jbd2_journal_abort_hard(journal); 829 __jbd2_journal_abort_hard(journal);
800 } 830 }
801 if (!err && !is_journal_aborted(journal)) 831 if (!err && !is_journal_aborted(journal))
802 err = journal_wait_on_commit_record(cbh); 832 err = journal_wait_on_commit_record(journal, cbh);
803 833
804 if (err) 834 if (err)
805 jbd2_journal_abort(journal, err); 835 jbd2_journal_abort(journal, err);
@@ -844,6 +874,9 @@ restart_loop:
844 * data. 874 * data.
845 * 875 *
846 * Otherwise, we can just throw away the frozen data now. 876 * Otherwise, we can just throw away the frozen data now.
877 *
878 * We also know that the frozen data has already fired
879 * its triggers if they exist, so we can clear that too.
847 */ 880 */
848 if (jh->b_committed_data) { 881 if (jh->b_committed_data) {
849 jbd2_free(jh->b_committed_data, bh->b_size); 882 jbd2_free(jh->b_committed_data, bh->b_size);
@@ -851,10 +884,12 @@ restart_loop:
851 if (jh->b_frozen_data) { 884 if (jh->b_frozen_data) {
852 jh->b_committed_data = jh->b_frozen_data; 885 jh->b_committed_data = jh->b_frozen_data;
853 jh->b_frozen_data = NULL; 886 jh->b_frozen_data = NULL;
887 jh->b_frozen_triggers = NULL;
854 } 888 }
855 } else if (jh->b_frozen_data) { 889 } else if (jh->b_frozen_data) {
856 jbd2_free(jh->b_frozen_data, bh->b_size); 890 jbd2_free(jh->b_frozen_data, bh->b_size);
857 jh->b_frozen_data = NULL; 891 jh->b_frozen_data = NULL;
892 jh->b_frozen_triggers = NULL;
858 } 893 }
859 894
860 spin_lock(&journal->j_list_lock); 895 spin_lock(&journal->j_list_lock);
@@ -972,14 +1007,23 @@ restart_loop:
972 J_ASSERT(commit_transaction == journal->j_committing_transaction); 1007 J_ASSERT(commit_transaction == journal->j_committing_transaction);
973 journal->j_commit_sequence = commit_transaction->t_tid; 1008 journal->j_commit_sequence = commit_transaction->t_tid;
974 journal->j_committing_transaction = NULL; 1009 journal->j_committing_transaction = NULL;
975 spin_unlock(&journal->j_state_lock); 1010 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
976 1011
977 if (journal->j_commit_callback) 1012 /*
978 journal->j_commit_callback(journal, commit_transaction); 1013 * weight the commit time higher than the average time so we don't
1014 * react too strongly to vast changes in the commit time
1015 */
1016 if (likely(journal->j_average_commit_time))
1017 journal->j_average_commit_time = (commit_time +
1018 journal->j_average_commit_time*3) / 4;
1019 else
1020 journal->j_average_commit_time = commit_time;
1021 spin_unlock(&journal->j_state_lock);
979 1022
980 if (commit_transaction->t_checkpoint_list == NULL && 1023 if (commit_transaction->t_checkpoint_list == NULL &&
981 commit_transaction->t_checkpoint_io_list == NULL) { 1024 commit_transaction->t_checkpoint_io_list == NULL) {
982 __jbd2_journal_drop_transaction(journal, commit_transaction); 1025 __jbd2_journal_drop_transaction(journal, commit_transaction);
1026 to_free = 1;
983 } else { 1027 } else {
984 if (journal->j_checkpoint_transactions == NULL) { 1028 if (journal->j_checkpoint_transactions == NULL) {
985 journal->j_checkpoint_transactions = commit_transaction; 1029 journal->j_checkpoint_transactions = commit_transaction;
@@ -998,11 +1042,16 @@ restart_loop:
998 } 1042 }
999 spin_unlock(&journal->j_list_lock); 1043 spin_unlock(&journal->j_list_lock);
1000 1044
1045 if (journal->j_commit_callback)
1046 journal->j_commit_callback(journal, commit_transaction);
1047
1001 trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", 1048 trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
1002 journal->j_devname, journal->j_commit_sequence, 1049 journal->j_devname, commit_transaction->t_tid,
1003 journal->j_tail_sequence); 1050 journal->j_tail_sequence);
1004 jbd_debug(1, "JBD: commit %d complete, head %d\n", 1051 jbd_debug(1, "JBD: commit %d complete, head %d\n",
1005 journal->j_commit_sequence, journal->j_tail_sequence); 1052 journal->j_commit_sequence, journal->j_tail_sequence);
1053 if (to_free)
1054 kfree(commit_transaction);
1006 1055
1007 wake_up(&journal->j_wait_done_commit); 1056 wake_up(&journal->j_wait_done_commit);
1008} 1057}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e70d657a19f8..56675306ed81 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -40,6 +40,7 @@
40 40
41#include <asm/uaccess.h> 41#include <asm/uaccess.h>
42#include <asm/page.h> 42#include <asm/page.h>
43#include <asm/div64.h>
43 44
44EXPORT_SYMBOL(jbd2_journal_start); 45EXPORT_SYMBOL(jbd2_journal_start);
45EXPORT_SYMBOL(jbd2_journal_restart); 46EXPORT_SYMBOL(jbd2_journal_restart);
@@ -50,6 +51,7 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
50EXPORT_SYMBOL(jbd2_journal_get_write_access); 51EXPORT_SYMBOL(jbd2_journal_get_write_access);
51EXPORT_SYMBOL(jbd2_journal_get_create_access); 52EXPORT_SYMBOL(jbd2_journal_get_create_access);
52EXPORT_SYMBOL(jbd2_journal_get_undo_access); 53EXPORT_SYMBOL(jbd2_journal_get_undo_access);
54EXPORT_SYMBOL(jbd2_journal_set_triggers);
53EXPORT_SYMBOL(jbd2_journal_dirty_metadata); 55EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
54EXPORT_SYMBOL(jbd2_journal_release_buffer); 56EXPORT_SYMBOL(jbd2_journal_release_buffer);
55EXPORT_SYMBOL(jbd2_journal_forget); 57EXPORT_SYMBOL(jbd2_journal_forget);
@@ -65,7 +67,6 @@ EXPORT_SYMBOL(jbd2_journal_update_format);
65EXPORT_SYMBOL(jbd2_journal_check_used_features); 67EXPORT_SYMBOL(jbd2_journal_check_used_features);
66EXPORT_SYMBOL(jbd2_journal_check_available_features); 68EXPORT_SYMBOL(jbd2_journal_check_available_features);
67EXPORT_SYMBOL(jbd2_journal_set_features); 69EXPORT_SYMBOL(jbd2_journal_set_features);
68EXPORT_SYMBOL(jbd2_journal_create);
69EXPORT_SYMBOL(jbd2_journal_load); 70EXPORT_SYMBOL(jbd2_journal_load);
70EXPORT_SYMBOL(jbd2_journal_destroy); 71EXPORT_SYMBOL(jbd2_journal_destroy);
71EXPORT_SYMBOL(jbd2_journal_abort); 72EXPORT_SYMBOL(jbd2_journal_abort);
@@ -131,8 +132,9 @@ static int kjournald2(void *arg)
131 journal->j_task = current; 132 journal->j_task = current;
132 wake_up(&journal->j_wait_done_commit); 133 wake_up(&journal->j_wait_done_commit);
133 134
134 printk(KERN_INFO "kjournald2 starting. Commit interval %ld seconds\n", 135 printk(KERN_INFO "kjournald2 starting: pid %d, dev %s, "
135 journal->j_commit_interval / HZ); 136 "commit interval %ld seconds\n", current->pid,
137 journal->j_devname, journal->j_commit_interval / HZ);
136 138
137 /* 139 /*
138 * And now, wait forever for commit wakeup events. 140 * And now, wait forever for commit wakeup events.
@@ -290,6 +292,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
290 struct page *new_page; 292 struct page *new_page;
291 unsigned int new_offset; 293 unsigned int new_offset;
292 struct buffer_head *bh_in = jh2bh(jh_in); 294 struct buffer_head *bh_in = jh2bh(jh_in);
295 struct jbd2_buffer_trigger_type *triggers;
293 296
294 /* 297 /*
295 * The buffer really shouldn't be locked: only the current committing 298 * The buffer really shouldn't be locked: only the current committing
@@ -314,13 +317,23 @@ repeat:
314 done_copy_out = 1; 317 done_copy_out = 1;
315 new_page = virt_to_page(jh_in->b_frozen_data); 318 new_page = virt_to_page(jh_in->b_frozen_data);
316 new_offset = offset_in_page(jh_in->b_frozen_data); 319 new_offset = offset_in_page(jh_in->b_frozen_data);
320 triggers = jh_in->b_frozen_triggers;
317 } else { 321 } else {
318 new_page = jh2bh(jh_in)->b_page; 322 new_page = jh2bh(jh_in)->b_page;
319 new_offset = offset_in_page(jh2bh(jh_in)->b_data); 323 new_offset = offset_in_page(jh2bh(jh_in)->b_data);
324 triggers = jh_in->b_triggers;
320 } 325 }
321 326
322 mapped_data = kmap_atomic(new_page, KM_USER0); 327 mapped_data = kmap_atomic(new_page, KM_USER0);
323 /* 328 /*
329 * Fire any commit trigger. Do this before checking for escaping,
330 * as the trigger may modify the magic offset. If a copy-out
331 * happens afterwards, it will have the correct data in the buffer.
332 */
333 jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset,
334 triggers);
335
336 /*
324 * Check for escaping 337 * Check for escaping
325 */ 338 */
326 if (*((__be32 *)(mapped_data + new_offset)) == 339 if (*((__be32 *)(mapped_data + new_offset)) ==
@@ -352,6 +365,13 @@ repeat:
352 new_page = virt_to_page(tmp); 365 new_page = virt_to_page(tmp);
353 new_offset = offset_in_page(tmp); 366 new_offset = offset_in_page(tmp);
354 done_copy_out = 1; 367 done_copy_out = 1;
368
369 /*
370 * This isn't strictly necessary, as we're using frozen
371 * data for the escaping, but it keeps consistency with
372 * b_frozen_data usage.
373 */
374 jh_in->b_frozen_triggers = jh_in->b_triggers;
355 } 375 }
356 376
357 /* 377 /*
@@ -631,6 +651,8 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
631 return NULL; 651 return NULL;
632 652
633 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 653 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
654 if (!bh)
655 return NULL;
634 lock_buffer(bh); 656 lock_buffer(bh);
635 memset(bh->b_data, 0, journal->j_blocksize); 657 memset(bh->b_data, 0, journal->j_blocksize);
636 set_buffer_uptodate(bh); 658 set_buffer_uptodate(bh);
@@ -824,6 +846,8 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)
824 jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid)); 846 jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid));
825 seq_printf(seq, " %ums logging transaction\n", 847 seq_printf(seq, " %ums logging transaction\n",
826 jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid)); 848 jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid));
849 seq_printf(seq, " %luus average transaction commit time\n",
850 do_div(s->journal->j_average_commit_time, 1000));
827 seq_printf(seq, " %lu handles per transaction\n", 851 seq_printf(seq, " %lu handles per transaction\n",
828 s->stats->u.run.rs_handle_count / s->stats->ts_tid); 852 s->stats->u.run.rs_handle_count / s->stats->ts_tid);
829 seq_printf(seq, " %lu blocks per transaction\n", 853 seq_printf(seq, " %lu blocks per transaction\n",
@@ -961,6 +985,8 @@ static journal_t * journal_init_common (void)
961 spin_lock_init(&journal->j_state_lock); 985 spin_lock_init(&journal->j_state_lock);
962 986
963 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); 987 journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
988 journal->j_min_batch_time = 0;
989 journal->j_max_batch_time = 15000; /* 15ms */
964 990
965 /* The journal is marked for error until we succeed with recovery! */ 991 /* The journal is marked for error until we succeed with recovery! */
966 journal->j_flags = JBD2_ABORT; 992 journal->j_flags = JBD2_ABORT;
@@ -1016,15 +1042,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
1016 1042
1017 /* journal descriptor can store up to n blocks -bzzz */ 1043 /* journal descriptor can store up to n blocks -bzzz */
1018 journal->j_blocksize = blocksize; 1044 journal->j_blocksize = blocksize;
1045 jbd2_stats_proc_init(journal);
1019 n = journal->j_blocksize / sizeof(journal_block_tag_t); 1046 n = journal->j_blocksize / sizeof(journal_block_tag_t);
1020 journal->j_wbufsize = n; 1047 journal->j_wbufsize = n;
1021 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); 1048 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
1022 if (!journal->j_wbuf) { 1049 if (!journal->j_wbuf) {
1023 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 1050 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
1024 __func__); 1051 __func__);
1025 kfree(journal); 1052 goto out_err;
1026 journal = NULL;
1027 goto out;
1028 } 1053 }
1029 journal->j_dev = bdev; 1054 journal->j_dev = bdev;
1030 journal->j_fs_dev = fs_dev; 1055 journal->j_fs_dev = fs_dev;
@@ -1034,14 +1059,22 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
1034 p = journal->j_devname; 1059 p = journal->j_devname;
1035 while ((p = strchr(p, '/'))) 1060 while ((p = strchr(p, '/')))
1036 *p = '!'; 1061 *p = '!';
1037 jbd2_stats_proc_init(journal);
1038 1062
1039 bh = __getblk(journal->j_dev, start, journal->j_blocksize); 1063 bh = __getblk(journal->j_dev, start, journal->j_blocksize);
1040 J_ASSERT(bh != NULL); 1064 if (!bh) {
1065 printk(KERN_ERR
1066 "%s: Cannot get buffer for journal superblock\n",
1067 __func__);
1068 goto out_err;
1069 }
1041 journal->j_sb_buffer = bh; 1070 journal->j_sb_buffer = bh;
1042 journal->j_superblock = (journal_superblock_t *)bh->b_data; 1071 journal->j_superblock = (journal_superblock_t *)bh->b_data;
1043out: 1072
1044 return journal; 1073 return journal;
1074out_err:
1075 jbd2_stats_proc_exit(journal);
1076 kfree(journal);
1077 return NULL;
1045} 1078}
1046 1079
1047/** 1080/**
@@ -1089,9 +1122,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1089 if (!journal->j_wbuf) { 1122 if (!journal->j_wbuf) {
1090 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 1123 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
1091 __func__); 1124 __func__);
1092 jbd2_stats_proc_exit(journal); 1125 goto out_err;
1093 kfree(journal);
1094 return NULL;
1095 } 1126 }
1096 1127
1097 err = jbd2_journal_bmap(journal, 0, &blocknr); 1128 err = jbd2_journal_bmap(journal, 0, &blocknr);
@@ -1099,17 +1130,24 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1099 if (err) { 1130 if (err) {
1100 printk(KERN_ERR "%s: Cannnot locate journal superblock\n", 1131 printk(KERN_ERR "%s: Cannnot locate journal superblock\n",
1101 __func__); 1132 __func__);
1102 jbd2_stats_proc_exit(journal); 1133 goto out_err;
1103 kfree(journal);
1104 return NULL;
1105 } 1134 }
1106 1135
1107 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 1136 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
1108 J_ASSERT(bh != NULL); 1137 if (!bh) {
1138 printk(KERN_ERR
1139 "%s: Cannot get buffer for journal superblock\n",
1140 __func__);
1141 goto out_err;
1142 }
1109 journal->j_sb_buffer = bh; 1143 journal->j_sb_buffer = bh;
1110 journal->j_superblock = (journal_superblock_t *)bh->b_data; 1144 journal->j_superblock = (journal_superblock_t *)bh->b_data;
1111 1145
1112 return journal; 1146 return journal;
1147out_err:
1148 jbd2_stats_proc_exit(journal);
1149 kfree(journal);
1150 return NULL;
1113} 1151}
1114 1152
1115/* 1153/*
@@ -1158,77 +1196,6 @@ static int journal_reset(journal_t *journal)
1158} 1196}
1159 1197
1160/** 1198/**
1161 * int jbd2_journal_create() - Initialise the new journal file
1162 * @journal: Journal to create. This structure must have been initialised
1163 *
1164 * Given a journal_t structure which tells us which disk blocks we can
1165 * use, create a new journal superblock and initialise all of the
1166 * journal fields from scratch.
1167 **/
1168int jbd2_journal_create(journal_t *journal)
1169{
1170 unsigned long long blocknr;
1171 struct buffer_head *bh;
1172 journal_superblock_t *sb;
1173 int i, err;
1174
1175 if (journal->j_maxlen < JBD2_MIN_JOURNAL_BLOCKS) {
1176 printk (KERN_ERR "Journal length (%d blocks) too short.\n",
1177 journal->j_maxlen);
1178 journal_fail_superblock(journal);
1179 return -EINVAL;
1180 }
1181
1182 if (journal->j_inode == NULL) {
1183 /*
1184 * We don't know what block to start at!
1185 */
1186 printk(KERN_EMERG
1187 "%s: creation of journal on external device!\n",
1188 __func__);
1189 BUG();
1190 }
1191
1192 /* Zero out the entire journal on disk. We cannot afford to
1193 have any blocks on disk beginning with JBD2_MAGIC_NUMBER. */
1194 jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
1195 for (i = 0; i < journal->j_maxlen; i++) {
1196 err = jbd2_journal_bmap(journal, i, &blocknr);
1197 if (err)
1198 return err;
1199 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
1200 lock_buffer(bh);
1201 memset (bh->b_data, 0, journal->j_blocksize);
1202 BUFFER_TRACE(bh, "marking dirty");
1203 mark_buffer_dirty(bh);
1204 BUFFER_TRACE(bh, "marking uptodate");
1205 set_buffer_uptodate(bh);
1206 unlock_buffer(bh);
1207 __brelse(bh);
1208 }
1209
1210 sync_blockdev(journal->j_dev);
1211 jbd_debug(1, "JBD: journal cleared.\n");
1212
1213 /* OK, fill in the initial static fields in the new superblock */
1214 sb = journal->j_superblock;
1215
1216 sb->s_header.h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
1217 sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
1218
1219 sb->s_blocksize = cpu_to_be32(journal->j_blocksize);
1220 sb->s_maxlen = cpu_to_be32(journal->j_maxlen);
1221 sb->s_first = cpu_to_be32(1);
1222
1223 journal->j_transaction_sequence = 1;
1224
1225 journal->j_flags &= ~JBD2_ABORT;
1226 journal->j_format_version = 2;
1227
1228 return journal_reset(journal);
1229}
1230
1231/**
1232 * void jbd2_journal_update_superblock() - Update journal sb on disk. 1199 * void jbd2_journal_update_superblock() - Update journal sb on disk.
1233 * @journal: The journal to update. 1200 * @journal: The journal to update.
1234 * @wait: Set to '0' if you don't want to wait for IO completion. 1201 * @wait: Set to '0' if you don't want to wait for IO completion.
@@ -1472,7 +1439,9 @@ int jbd2_journal_destroy(journal_t *journal)
1472 spin_lock(&journal->j_list_lock); 1439 spin_lock(&journal->j_list_lock);
1473 while (journal->j_checkpoint_transactions != NULL) { 1440 while (journal->j_checkpoint_transactions != NULL) {
1474 spin_unlock(&journal->j_list_lock); 1441 spin_unlock(&journal->j_list_lock);
1442 mutex_lock(&journal->j_checkpoint_mutex);
1475 jbd2_log_do_checkpoint(journal); 1443 jbd2_log_do_checkpoint(journal);
1444 mutex_unlock(&journal->j_checkpoint_mutex);
1476 spin_lock(&journal->j_list_lock); 1445 spin_lock(&journal->j_list_lock);
1477 } 1446 }
1478 1447
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 39b7805a599a..46b4e347ed7d 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -25,6 +25,7 @@
25#include <linux/timer.h> 25#include <linux/timer.h>
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/hrtimer.h>
28 29
29static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); 30static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
30 31
@@ -48,6 +49,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
48{ 49{
49 transaction->t_journal = journal; 50 transaction->t_journal = journal;
50 transaction->t_state = T_RUNNING; 51 transaction->t_state = T_RUNNING;
52 transaction->t_start_time = ktime_get();
51 transaction->t_tid = journal->j_transaction_sequence++; 53 transaction->t_tid = journal->j_transaction_sequence++;
52 transaction->t_expires = jiffies + journal->j_commit_interval; 54 transaction->t_expires = jiffies + journal->j_commit_interval;
53 spin_lock_init(&transaction->t_handle_lock); 55 spin_lock_init(&transaction->t_handle_lock);
@@ -741,6 +743,12 @@ done:
741 source = kmap_atomic(page, KM_USER0); 743 source = kmap_atomic(page, KM_USER0);
742 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); 744 memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
743 kunmap_atomic(source, KM_USER0); 745 kunmap_atomic(source, KM_USER0);
746
747 /*
748 * Now that the frozen data is saved off, we need to store
749 * any matching triggers.
750 */
751 jh->b_frozen_triggers = jh->b_triggers;
744 } 752 }
745 jbd_unlock_bh_state(bh); 753 jbd_unlock_bh_state(bh);
746 754
@@ -944,6 +952,47 @@ out:
944} 952}
945 953
946/** 954/**
955 * void jbd2_journal_set_triggers() - Add triggers for commit writeout
956 * @bh: buffer to trigger on
957 * @type: struct jbd2_buffer_trigger_type containing the trigger(s).
958 *
959 * Set any triggers on this journal_head. This is always safe, because
960 * triggers for a committing buffer will be saved off, and triggers for
961 * a running transaction will match the buffer in that transaction.
962 *
963 * Call with NULL to clear the triggers.
964 */
965void jbd2_journal_set_triggers(struct buffer_head *bh,
966 struct jbd2_buffer_trigger_type *type)
967{
968 struct journal_head *jh = bh2jh(bh);
969
970 jh->b_triggers = type;
971}
972
973void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data,
974 struct jbd2_buffer_trigger_type *triggers)
975{
976 struct buffer_head *bh = jh2bh(jh);
977
978 if (!triggers || !triggers->t_commit)
979 return;
980
981 triggers->t_commit(triggers, bh, mapped_data, bh->b_size);
982}
983
984void jbd2_buffer_abort_trigger(struct journal_head *jh,
985 struct jbd2_buffer_trigger_type *triggers)
986{
987 if (!triggers || !triggers->t_abort)
988 return;
989
990 triggers->t_abort(triggers, jh2bh(jh));
991}
992
993
994
995/**
947 * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata 996 * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
948 * @handle: transaction to add buffer to. 997 * @handle: transaction to add buffer to.
949 * @bh: buffer to mark 998 * @bh: buffer to mark
@@ -1193,7 +1242,7 @@ int jbd2_journal_stop(handle_t *handle)
1193{ 1242{
1194 transaction_t *transaction = handle->h_transaction; 1243 transaction_t *transaction = handle->h_transaction;
1195 journal_t *journal = transaction->t_journal; 1244 journal_t *journal = transaction->t_journal;
1196 int old_handle_count, err; 1245 int err;
1197 pid_t pid; 1246 pid_t pid;
1198 1247
1199 J_ASSERT(journal_current_handle() == handle); 1248 J_ASSERT(journal_current_handle() == handle);
@@ -1216,24 +1265,54 @@ int jbd2_journal_stop(handle_t *handle)
1216 /* 1265 /*
1217 * Implement synchronous transaction batching. If the handle 1266 * Implement synchronous transaction batching. If the handle
1218 * was synchronous, don't force a commit immediately. Let's 1267 * was synchronous, don't force a commit immediately. Let's
1219 * yield and let another thread piggyback onto this transaction. 1268 * yield and let another thread piggyback onto this
1220 * Keep doing that while new threads continue to arrive. 1269 * transaction. Keep doing that while new threads continue to
1221 * It doesn't cost much - we're about to run a commit and sleep 1270 * arrive. It doesn't cost much - we're about to run a commit
1222 * on IO anyway. Speeds up many-threaded, many-dir operations 1271 * and sleep on IO anyway. Speeds up many-threaded, many-dir
1223 * by 30x or more... 1272 * operations by 30x or more...
1224 * 1273 *
1225 * But don't do this if this process was the most recent one to 1274 * We try and optimize the sleep time against what the
1226 * perform a synchronous write. We do this to detect the case where a 1275 * underlying disk can do, instead of having a static sleep
1227 * single process is doing a stream of sync writes. No point in waiting 1276 * time. This is useful for the case where our storage is so
1228 * for joiners in that case. 1277 * fast that it is more optimal to go ahead and force a flush
1278 * and wait for the transaction to be committed than it is to
1279 * wait for an arbitrary amount of time for new writers to
1280 * join the transaction. We achieve this by measuring how
1281 * long it takes to commit a transaction, and compare it with
1282 * how long this transaction has been running, and if run time
1283 * < commit time then we sleep for the delta and commit. This
1284 * greatly helps super fast disks that would see slowdowns as
1285 * more threads started doing fsyncs.
1286 *
1287 * But don't do this if this process was the most recent one
1288 * to perform a synchronous write. We do this to detect the
1289 * case where a single process is doing a stream of sync
1290 * writes. No point in waiting for joiners in that case.
1229 */ 1291 */
1230 pid = current->pid; 1292 pid = current->pid;
1231 if (handle->h_sync && journal->j_last_sync_writer != pid) { 1293 if (handle->h_sync && journal->j_last_sync_writer != pid) {
1294 u64 commit_time, trans_time;
1295
1232 journal->j_last_sync_writer = pid; 1296 journal->j_last_sync_writer = pid;
1233 do { 1297
1234 old_handle_count = transaction->t_handle_count; 1298 spin_lock(&journal->j_state_lock);
1235 schedule_timeout_uninterruptible(1); 1299 commit_time = journal->j_average_commit_time;
1236 } while (old_handle_count != transaction->t_handle_count); 1300 spin_unlock(&journal->j_state_lock);
1301
1302 trans_time = ktime_to_ns(ktime_sub(ktime_get(),
1303 transaction->t_start_time));
1304
1305 commit_time = max_t(u64, commit_time,
1306 1000*journal->j_min_batch_time);
1307 commit_time = min_t(u64, commit_time,
1308 1000*journal->j_max_batch_time);
1309
1310 if (trans_time < commit_time) {
1311 ktime_t expires = ktime_add_ns(ktime_get(),
1312 commit_time);
1313 set_current_state(TASK_UNINTERRUPTIBLE);
1314 schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
1315 }
1237 } 1316 }
1238 1317
1239 current->journal_info = NULL; 1318 current->journal_info = NULL;
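Unlike jbd, which caps the wait at one jiffy, jbd2 clamps it between two new per-journal tunables in microseconds: j_min_batch_time (default 0) and j_max_batch_time (default 15000, i.e. 15ms, both set in journal_init_common above). So an average commit of 200us yields a ~200us wait, while a 40ms average is capped at 15ms. A sketch of how a client filesystem might tune the window (ext4 later exposes these as mount options):

	/* Illustrative tuning: never force a wait, cap batching at 10ms. */
	journal->j_min_batch_time = 0;
	journal->j_max_batch_time = 10000;	/* microseconds */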
diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c
index c73fa89b5f8a..170d289ac785 100644
--- a/fs/jffs2/compr_rubin.c
+++ b/fs/jffs2/compr_rubin.c
@@ -22,9 +22,7 @@
22 22
23 23
24#define BIT_DIVIDER_MIPS 1043 24#define BIT_DIVIDER_MIPS 1043
25static int bits_mips[8] = { 277,249,290,267,229,341,212,241}; /* mips32 */ 25static int bits_mips[8] = { 277, 249, 290, 267, 229, 341, 212, 241};
26
27#include <linux/errno.h>
28 26
29struct pushpull { 27struct pushpull {
30 unsigned char *buf; 28 unsigned char *buf;
@@ -43,7 +41,9 @@ struct rubin_state {
43 int bits[8]; 41 int bits[8];
44}; 42};
45 43
46static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen, unsigned ofs, unsigned reserve) 44static inline void init_pushpull(struct pushpull *pp, char *buf,
45 unsigned buflen, unsigned ofs,
46 unsigned reserve)
47{ 47{
48 pp->buf = buf; 48 pp->buf = buf;
49 pp->buflen = buflen; 49 pp->buflen = buflen;
@@ -53,16 +53,14 @@ static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen
53 53
54static inline int pushbit(struct pushpull *pp, int bit, int use_reserved) 54static inline int pushbit(struct pushpull *pp, int bit, int use_reserved)
55{ 55{
56 if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve)) { 56 if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve))
57 return -ENOSPC; 57 return -ENOSPC;
58 }
59 58
60 if (bit) { 59 if (bit)
61 pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs &7))); 60 pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs & 7)));
62 } 61 else
63 else { 62 pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs & 7)));
64 pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs &7))); 63
65 }
66 pp->ofs++; 64 pp->ofs++;
67 65
68 return 0; 66 return 0;
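pushbit() packs bits most-significant-first into a byte buffer; buflen and reserve are measured in bits, and the reserved tail can only be consumed once use_reserved is set, which lets the encoder guarantee space to flush its final register. Usage, sketched with illustrative sizes:

	struct pushpull pp;
	unsigned char out[32];

	init_pushpull(&pp, (char *)out, sizeof(out) * 8, 0, 32);
	if (pushbit(&pp, 1, 0) == -ENOSPC) {
		/* only the 32 reserved bits remain; flush the coder
		 * state with use_reserved = 1 */
	}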
@@ -97,6 +95,7 @@ static void init_rubin(struct rubin_state *rs, int div, int *bits)
97 rs->p = (long) (2 * UPPER_BIT_RUBIN); 95 rs->p = (long) (2 * UPPER_BIT_RUBIN);
98 rs->bit_number = (long) 0; 96 rs->bit_number = (long) 0;
99 rs->bit_divider = div; 97 rs->bit_divider = div;
98
100 for (c=0; c<8; c++) 99 for (c=0; c<8; c++)
101 rs->bits[c] = bits[c]; 100 rs->bits[c] = bits[c];
102} 101}
@@ -108,7 +107,8 @@ static int encode(struct rubin_state *rs, long A, long B, int symbol)
108 long i0, i1; 107 long i0, i1;
109 int ret; 108 int ret;
110 109
111 while ((rs->q >= UPPER_BIT_RUBIN) || ((rs->p + rs->q) <= UPPER_BIT_RUBIN)) { 110 while ((rs->q >= UPPER_BIT_RUBIN) ||
111 ((rs->p + rs->q) <= UPPER_BIT_RUBIN)) {
112 rs->bit_number++; 112 rs->bit_number++;
113 113
114 ret = pushbit(&rs->pp, (rs->q & UPPER_BIT_RUBIN) ? 1 : 0, 0); 114 ret = pushbit(&rs->pp, (rs->q & UPPER_BIT_RUBIN) ? 1 : 0, 0);
@@ -119,12 +119,12 @@ static int encode(struct rubin_state *rs, long A, long B, int symbol)
119 rs->p <<= 1; 119 rs->p <<= 1;
120 } 120 }
121 i0 = A * rs->p / (A + B); 121 i0 = A * rs->p / (A + B);
122 if (i0 <= 0) { 122 if (i0 <= 0)
123 i0 = 1; 123 i0 = 1;
124 } 124
125 if (i0 >= rs->p) { 125 if (i0 >= rs->p)
126 i0 = rs->p - 1; 126 i0 = rs->p - 1;
127 } 127
128 i1 = rs->p - i0; 128 i1 = rs->p - i0;
129 129
130 if (symbol == 0) 130 if (symbol == 0)
@@ -157,11 +157,13 @@ static void init_decode(struct rubin_state *rs, int div, int *bits)
157 /* behalve lower */ 157 /* behalve lower */
158 rs->rec_q = 0; 158 rs->rec_q = 0;
159 159
160 for (rs->bit_number = 0; rs->bit_number++ < RUBIN_REG_SIZE; rs->rec_q = rs->rec_q * 2 + (long) (pullbit(&rs->pp))) 160 for (rs->bit_number = 0; rs->bit_number++ < RUBIN_REG_SIZE;
161 rs->rec_q = rs->rec_q * 2 + (long) (pullbit(&rs->pp)))
161 ; 162 ;
162} 163}
163 164
164static void __do_decode(struct rubin_state *rs, unsigned long p, unsigned long q) 165static void __do_decode(struct rubin_state *rs, unsigned long p,
166 unsigned long q)
165{ 167{
166 register unsigned long lower_bits_rubin = LOWER_BITS_RUBIN; 168 register unsigned long lower_bits_rubin = LOWER_BITS_RUBIN;
167 unsigned long rec_q; 169 unsigned long rec_q;
@@ -207,12 +209,11 @@ static int decode(struct rubin_state *rs, long A, long B)
207 __do_decode(rs, p, q); 209 __do_decode(rs, p, q);
208 210
209 i0 = A * rs->p / (A + B); 211 i0 = A * rs->p / (A + B);
210 if (i0 <= 0) { 212 if (i0 <= 0)
211 i0 = 1; 213 i0 = 1;
212 } 214
213 if (i0 >= rs->p) { 215 if (i0 >= rs->p)
214 i0 = rs->p - 1; 216 i0 = rs->p - 1;
215 }
216 217
217 threshold = rs->q + i0; 218 threshold = rs->q + i0;
218 symbol = rs->rec_q >= threshold; 219 symbol = rs->rec_q >= threshold;
@@ -234,14 +235,15 @@ static int out_byte(struct rubin_state *rs, unsigned char byte)
234 struct rubin_state rs_copy; 235 struct rubin_state rs_copy;
235 rs_copy = *rs; 236 rs_copy = *rs;
236 237
237 for (i=0;i<8;i++) { 238 for (i=0; i<8; i++) {
238 ret = encode(rs, rs->bit_divider-rs->bits[i],rs->bits[i],byte&1); 239 ret = encode(rs, rs->bit_divider-rs->bits[i],
240 rs->bits[i], byte & 1);
239 if (ret) { 241 if (ret) {
240 /* Failed. Restore old state */ 242 /* Failed. Restore old state */
241 *rs = rs_copy; 243 *rs = rs_copy;
242 return ret; 244 return ret;
243 } 245 }
 244 byte=byte>>1; 246 byte >>= 1;
245 } 247 }
246 return 0; 248 return 0;
247} 249}
@@ -251,7 +253,8 @@ static int in_byte(struct rubin_state *rs)
251 int i, result = 0, bit_divider = rs->bit_divider; 253 int i, result = 0, bit_divider = rs->bit_divider;
252 254
253 for (i = 0; i < 8; i++) 255 for (i = 0; i < 8; i++)
254 result |= decode(rs, bit_divider - rs->bits[i], rs->bits[i]) << i; 256 result |= decode(rs, bit_divider - rs->bits[i],
257 rs->bits[i]) << i;
255 258
256 return result; 259 return result;
257} 260}
@@ -259,7 +262,8 @@ static int in_byte(struct rubin_state *rs)
259 262
260 263
261static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in, 264static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in,
262 unsigned char *cpage_out, uint32_t *sourcelen, uint32_t *dstlen) 265 unsigned char *cpage_out, uint32_t *sourcelen,
266 uint32_t *dstlen)
263 { 267 {
264 int outpos = 0; 268 int outpos = 0;
265 int pos=0; 269 int pos=0;
@@ -295,7 +299,8 @@ static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in,
295int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out, 299int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out,
296 uint32_t *sourcelen, uint32_t *dstlen, void *model) 300 uint32_t *sourcelen, uint32_t *dstlen, void *model)
297{ 301{
298 return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in, cpage_out, sourcelen, dstlen); 302 return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in,
303 cpage_out, sourcelen, dstlen);
299} 304}
300#endif 305#endif
301static int jffs2_dynrubin_compress(unsigned char *data_in, 306static int jffs2_dynrubin_compress(unsigned char *data_in,
@@ -316,9 +321,8 @@ static int jffs2_dynrubin_compress(unsigned char *data_in,
316 return -1; 321 return -1;
317 322
318 memset(histo, 0, 256); 323 memset(histo, 0, 256);
319 for (i=0; i<mysrclen; i++) { 324 for (i=0; i<mysrclen; i++)
320 histo[data_in[i]]++; 325 histo[data_in[i]]++;
321 }
322 memset(bits, 0, sizeof(int)*8); 326 memset(bits, 0, sizeof(int)*8);
323 for (i=0; i<256; i++) { 327 for (i=0; i<256; i++) {
324 if (i&128) 328 if (i&128)
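The loop above builds a byte-value histogram that jffs2_dynrubin_compress then folds into eight per-bit-position counts via the unrolled if (i&128) ... chain that follows in the full source. A compact sketch of the same derivation, with invented names and a generic bit-to-index mapping (the kernel's actual ordering may differ):

static void count_bit_ones(const unsigned char *data, int len, int bits[8])
{
	int histo[256] = { 0 };
	int i, k;

	for (k = 0; k < 8; k++)
		bits[k] = 0;
	for (i = 0; i < len; i++)
		histo[data[i]]++;		/* byte frequency table */
	for (i = 0; i < 256; i++)
		for (k = 0; k < 8; k++)
			if (i & (1 << k))
				bits[k] += histo[i];	/* how often bit k is set */
}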
@@ -346,7 +350,8 @@ static int jffs2_dynrubin_compress(unsigned char *data_in,
346 cpage_out[i] = bits[i]; 350 cpage_out[i] = bits[i];
347 } 351 }
348 352
349 ret = rubin_do_compress(256, bits, data_in, cpage_out+8, &mysrclen, &mydstlen); 353 ret = rubin_do_compress(256, bits, data_in, cpage_out+8, &mysrclen,
354 &mydstlen);
350 if (ret) 355 if (ret)
351 return ret; 356 return ret;
352 357
@@ -363,8 +368,10 @@ static int jffs2_dynrubin_compress(unsigned char *data_in,
363 return 0; 368 return 0;
364} 369}
365 370
366static void rubin_do_decompress(int bit_divider, int *bits, unsigned char *cdata_in, 371static void rubin_do_decompress(int bit_divider, int *bits,
367 unsigned char *page_out, uint32_t srclen, uint32_t destlen) 372 unsigned char *cdata_in,
373 unsigned char *page_out, uint32_t srclen,
374 uint32_t destlen)
368{ 375{
369 int outpos = 0; 376 int outpos = 0;
370 struct rubin_state rs; 377 struct rubin_state rs;
@@ -372,9 +379,8 @@ static void rubin_do_decompress(int bit_divider, int *bits, unsigned char *cdata
372 init_pushpull(&rs.pp, cdata_in, srclen, 0, 0); 379 init_pushpull(&rs.pp, cdata_in, srclen, 0, 0);
373 init_decode(&rs, bit_divider, bits); 380 init_decode(&rs, bit_divider, bits);
374 381
375 while (outpos < destlen) { 382 while (outpos < destlen)
376 page_out[outpos++] = in_byte(&rs); 383 page_out[outpos++] = in_byte(&rs);
377 }
378} 384}
379 385
380 386
@@ -383,7 +389,8 @@ static int jffs2_rubinmips_decompress(unsigned char *data_in,
383 uint32_t sourcelen, uint32_t dstlen, 389 uint32_t sourcelen, uint32_t dstlen,
384 void *model) 390 void *model)
385{ 391{
386 rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in, cpage_out, sourcelen, dstlen); 392 rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in,
393 cpage_out, sourcelen, dstlen);
387 return 0; 394 return 0;
388} 395}
389 396
@@ -398,52 +405,53 @@ static int jffs2_dynrubin_decompress(unsigned char *data_in,
398 for (c=0; c<8; c++) 405 for (c=0; c<8; c++)
399 bits[c] = data_in[c]; 406 bits[c] = data_in[c];
400 407
401 rubin_do_decompress(256, bits, data_in+8, cpage_out, sourcelen-8, dstlen); 408 rubin_do_decompress(256, bits, data_in+8, cpage_out, sourcelen-8,
409 dstlen);
402 return 0; 410 return 0;
403} 411}
404 412
405static struct jffs2_compressor jffs2_rubinmips_comp = { 413static struct jffs2_compressor jffs2_rubinmips_comp = {
406 .priority = JFFS2_RUBINMIPS_PRIORITY, 414 .priority = JFFS2_RUBINMIPS_PRIORITY,
407 .name = "rubinmips", 415 .name = "rubinmips",
408 .compr = JFFS2_COMPR_DYNRUBIN, 416 .compr = JFFS2_COMPR_DYNRUBIN,
409 .compress = NULL, /*&jffs2_rubinmips_compress,*/ 417 .compress = NULL, /*&jffs2_rubinmips_compress,*/
410 .decompress = &jffs2_rubinmips_decompress, 418 .decompress = &jffs2_rubinmips_decompress,
411#ifdef JFFS2_RUBINMIPS_DISABLED 419#ifdef JFFS2_RUBINMIPS_DISABLED
412 .disabled = 1, 420 .disabled = 1,
413#else 421#else
414 .disabled = 0, 422 .disabled = 0,
415#endif 423#endif
416}; 424};
417 425
418int jffs2_rubinmips_init(void) 426int jffs2_rubinmips_init(void)
419{ 427{
420 return jffs2_register_compressor(&jffs2_rubinmips_comp); 428 return jffs2_register_compressor(&jffs2_rubinmips_comp);
421} 429}
422 430
423void jffs2_rubinmips_exit(void) 431void jffs2_rubinmips_exit(void)
424{ 432{
425 jffs2_unregister_compressor(&jffs2_rubinmips_comp); 433 jffs2_unregister_compressor(&jffs2_rubinmips_comp);
426} 434}
427 435
428static struct jffs2_compressor jffs2_dynrubin_comp = { 436static struct jffs2_compressor jffs2_dynrubin_comp = {
429 .priority = JFFS2_DYNRUBIN_PRIORITY, 437 .priority = JFFS2_DYNRUBIN_PRIORITY,
430 .name = "dynrubin", 438 .name = "dynrubin",
431 .compr = JFFS2_COMPR_RUBINMIPS, 439 .compr = JFFS2_COMPR_RUBINMIPS,
432 .compress = jffs2_dynrubin_compress, 440 .compress = jffs2_dynrubin_compress,
433 .decompress = &jffs2_dynrubin_decompress, 441 .decompress = &jffs2_dynrubin_decompress,
434#ifdef JFFS2_DYNRUBIN_DISABLED 442#ifdef JFFS2_DYNRUBIN_DISABLED
435 .disabled = 1, 443 .disabled = 1,
436#else 444#else
437 .disabled = 0, 445 .disabled = 0,
438#endif 446#endif
439}; 447};
440 448
441int jffs2_dynrubin_init(void) 449int jffs2_dynrubin_init(void)
442{ 450{
443 return jffs2_register_compressor(&jffs2_dynrubin_comp); 451 return jffs2_register_compressor(&jffs2_dynrubin_comp);
444} 452}
445 453
446void jffs2_dynrubin_exit(void) 454void jffs2_dynrubin_exit(void)
447{ 455{
448 jffs2_unregister_compressor(&jffs2_dynrubin_comp); 456 jffs2_unregister_compressor(&jffs2_dynrubin_comp);
449} 457}
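For context, both structs above follow JFFS2's compressor-plugin pattern: a module fills in one jffs2_compressor with its priority, name, on-flash format id, and callbacks, then registers it with the core, which selects among enabled compressors by priority. A sketch with invented names; the callback signatures mirror the ones in this file, and the .compr id is reused purely for illustration:

static int example_compress(unsigned char *data_in, unsigned char *cpage_out,
			    uint32_t *sourcelen, uint32_t *dstlen, void *model);
static int example_decompress(unsigned char *data_in, unsigned char *cpage_out,
			      uint32_t sourcelen, uint32_t dstlen, void *model);

static struct jffs2_compressor example_comp = {
	.priority   = 10,		/* illustrative value */
	.name       = "example",
	.compr      = JFFS2_COMPR_DYNRUBIN,	/* on-flash id (illustration only) */
	.compress   = example_compress,
	.decompress = example_decompress,
};

int example_init(void)
{
	return jffs2_register_compressor(&example_comp);
}

void example_exit(void)
{
	jffs2_unregister_compressor(&example_comp);
}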
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index 259461b910af..c32b4a1ad6cf 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -175,7 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock
175{ 175{
176 /* For NAND, if the failure did not occur at the device level for a 176 /* For NAND, if the failure did not occur at the device level for a
177 specific physical page, don't bother updating the bad block table. */ 177 specific physical page, don't bother updating the bad block table. */
178 if (jffs2_cleanmarker_oob(c) && (bad_offset != MTD_FAIL_ADDR_UNKNOWN)) { 178 if (jffs2_cleanmarker_oob(c) && (bad_offset != (uint32_t)MTD_FAIL_ADDR_UNKNOWN)) {
179 /* We had a device-level failure to erase. Let's see if we've 179 /* We had a device-level failure to erase. Let's see if we've
180 failed too many times. */ 180 failed too many times. */
181 if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) { 181 if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) {
@@ -209,7 +209,8 @@ static void jffs2_erase_callback(struct erase_info *instr)
209 struct erase_priv_struct *priv = (void *)instr->priv; 209 struct erase_priv_struct *priv = (void *)instr->priv;
210 210
211 if(instr->state != MTD_ERASE_DONE) { 211 if(instr->state != MTD_ERASE_DONE) {
212 printk(KERN_WARNING "Erase at 0x%08x finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n", instr->addr, instr->state); 212 printk(KERN_WARNING "Erase at 0x%08llx finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n",
213 (unsigned long long)instr->addr, instr->state);
213 jffs2_erase_failed(priv->c, priv->jeb, instr->fail_addr); 214 jffs2_erase_failed(priv->c, priv->jeb, instr->fail_addr);
214 } else { 215 } else {
215 jffs2_erase_succeeded(priv->c, priv->jeb); 216 jffs2_erase_succeeded(priv->c, priv->jeb);
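The first erase.c hunk adds a cast because MTD fail addresses grew to 64 bits in this release; the second fixes the matching printk format. The portable idiom, isolated in a small sketch (assuming the 2.6.29-era struct erase_info):

static void report_erase_state(struct erase_info *instr)
{
	/* the cast keeps %llx correct whether addr is 32 or 64 bits wide */
	printk(KERN_WARNING "Erase at 0x%08llx finished in state 0x%x\n",
	       (unsigned long long)instr->addr, instr->state);
}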
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 5a98aa87c853..5edc2bf20581 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -132,7 +132,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
132 uint32_t pageofs = index << PAGE_CACHE_SHIFT; 132 uint32_t pageofs = index << PAGE_CACHE_SHIFT;
133 int ret = 0; 133 int ret = 0;
134 134
135 pg = __grab_cache_page(mapping, index); 135 pg = grab_cache_page_write_begin(mapping, index, flags);
136 if (!pg) 136 if (!pg)
137 return -ENOMEM; 137 return -ENOMEM;
138 *pagep = pg; 138 *pagep = pg;
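This one-line change tracks a VFS API rename: __grab_cache_page() became grab_cache_page_write_begin(), which additionally takes the write_begin flags so callers can pass hints such as AOP_FLAG_NOFS down to the page allocator. A sketch of the updated idiom in a hypothetical write_begin (names invented, signature as used in this tree):

static int example_write_begin(struct file *file, struct address_space *mapping,
			       loff_t pos, unsigned len, unsigned flags,
			       struct page **pagep, void **fsdata)
{
	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
	struct page *pg = grab_cache_page_write_begin(mapping, index, flags);

	if (!pg)
		return -ENOMEM;
	*pagep = pg;
	return 0;	/* a real implementation would also prepare the page here */
}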
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 1750445556c3..507ed6ec1847 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -366,9 +366,6 @@ void jffs2_free_ino_caches(struct jffs2_sb_info *c);
366void jffs2_free_raw_node_refs(struct jffs2_sb_info *c); 366void jffs2_free_raw_node_refs(struct jffs2_sb_info *c);
367struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_t offset); 367struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_t offset);
368void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c_delete); 368void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c_delete);
369struct rb_node *rb_next(struct rb_node *);
370struct rb_node *rb_prev(struct rb_node *);
371void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root);
372int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn); 369int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn);
373uint32_t jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size); 370uint32_t jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size);
374struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c, 371struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c,
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 210339784b56..b00ee9f05a06 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -59,8 +59,14 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino)
59 if (inode->i_size >= IDATASIZE) { 59 if (inode->i_size >= IDATASIZE) {
60 inode->i_op = &page_symlink_inode_operations; 60 inode->i_op = &page_symlink_inode_operations;
61 inode->i_mapping->a_ops = &jfs_aops; 61 inode->i_mapping->a_ops = &jfs_aops;
62 } else 62 } else {
63 inode->i_op = &jfs_symlink_inode_operations; 63 inode->i_op = &jfs_symlink_inode_operations;
64 /*
65 * The inline data should be null-terminated, but
66 * don't let on-disk corruption crash the kernel
67 */
68 JFS_IP(inode)->i_inline[inode->i_size] = '\0';
69 }
64 } else { 70 } else {
65 inode->i_op = &jfs_file_inode_operations; 71 inode->i_op = &jfs_file_inode_operations;
66 init_special_inode(inode, inode->i_mode, inode->i_rdev); 72 init_special_inode(inode, inode->i_mode, inode->i_rdev);
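The new else-branch comment states the rule being enforced: never trust on-disk data to be NUL-terminated; terminate it yourself before any string consumer walks it. The same defensive idiom in isolation (illustrative names; dst must have room for size + 1 bytes):

static void copy_inline_symlink(char *dst, const char *disk, size_t size)
{
	memcpy(dst, disk, size);
	dst[size] = '\0';	/* corrupt media must not run readers off the end */
}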
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index d6363d8309d0..0f94381ca6d0 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -58,9 +58,9 @@
58 58
59/* 59/*
60 * __mark_inode_dirty expects inodes to be hashed. Since we don't want 60 * __mark_inode_dirty expects inodes to be hashed. Since we don't want
61 * special inodes in the fileset inode space, we hash them to a dummy head 61 * special inodes in the fileset inode space, we make them appear hashed,
62 * but do not put on any lists.
62 */ 63 */
63static HLIST_HEAD(aggregate_hash);
64 64
65/* 65/*
66 * imap locks 66 * imap locks
@@ -496,7 +496,11 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
496 /* release the page */ 496 /* release the page */
497 release_metapage(mp); 497 release_metapage(mp);
498 498
499 hlist_add_head(&ip->i_hash, &aggregate_hash); 499 /*
500 * that will look hashed, but won't be on any list; hlist_del()
501 * will work fine and require no locking.
502 */
503 ip->i_hash.pprev = &ip->i_hash.next;
500 504
501 return (ip); 505 return (ip);
502} 506}
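The replacement for the dummy hash head is a self-pointing hlist_node: with pprev aimed at its own next field, the inode passes "is hashed" checks yet sits on no chain, so a later hlist_del() touches only the node itself and needs no locking. A minimal sketch of the trick as a helper (name invented):

static inline void hlist_fake_hash(struct hlist_node *n)
{
	n->next = NULL;
	n->pprev = &n->next;	/* looks hashed, but belongs to no list */
}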
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 70022fd1c539..d4d142c2edd4 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -79,7 +79,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
79 inode = new_inode(sb); 79 inode = new_inode(sb);
80 if (!inode) { 80 if (!inode) {
81 jfs_warn("ialloc: new_inode returned NULL!"); 81 jfs_warn("ialloc: new_inode returned NULL!");
82 return ERR_PTR(-ENOMEM); 82 rc = -ENOMEM;
83 goto fail;
83 } 84 }
84 85
85 jfs_inode = JFS_IP(inode); 86 jfs_inode = JFS_IP(inode);
@@ -89,8 +90,12 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
89 jfs_warn("ialloc: diAlloc returned %d!", rc); 90 jfs_warn("ialloc: diAlloc returned %d!", rc);
90 if (rc == -EIO) 91 if (rc == -EIO)
91 make_bad_inode(inode); 92 make_bad_inode(inode);
92 iput(inode); 93 goto fail_put;
93 return ERR_PTR(rc); 94 }
95
96 if (insert_inode_locked(inode) < 0) {
97 rc = -EINVAL;
98 goto fail_unlock;
94 } 99 }
95 100
96 inode->i_uid = current_fsuid(); 101 inode->i_uid = current_fsuid();
@@ -112,11 +117,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
112 * Allocate inode to quota. 117 * Allocate inode to quota.
113 */ 118 */
114 if (DQUOT_ALLOC_INODE(inode)) { 119 if (DQUOT_ALLOC_INODE(inode)) {
115 DQUOT_DROP(inode); 120 rc = -EDQUOT;
116 inode->i_flags |= S_NOQUOTA; 121 goto fail_drop;
117 inode->i_nlink = 0;
118 iput(inode);
119 return ERR_PTR(-EDQUOT);
120 } 122 }
121 123
122 inode->i_mode = mode; 124 inode->i_mode = mode;
@@ -158,4 +160,15 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
158 jfs_info("ialloc returns inode = 0x%p\n", inode); 160 jfs_info("ialloc returns inode = 0x%p\n", inode);
159 161
160 return inode; 162 return inode;
163
164fail_drop:
165 DQUOT_DROP(inode);
166 inode->i_flags |= S_NOQUOTA;
167fail_unlock:
168 inode->i_nlink = 0;
169 unlock_new_inode(inode);
170fail_put:
171 iput(inode);
172fail:
173 return ERR_PTR(rc);
161} 174}
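The rewritten ialloc() is a textbook layered-goto unwind: each failure jumps to the label that undoes exactly the steps completed so far, so every cleanup action appears once, in reverse order of acquisition. The skeleton of the pattern, with invented helper names:

struct inode *example_ialloc(struct super_block *sb)
{
	struct inode *inode;
	int rc;

	inode = new_inode(sb);
	if (!inode) {
		rc = -ENOMEM;
		goto fail;
	}

	rc = example_reserve_disk_inode(inode);	/* assumed helper */
	if (rc)
		goto fail_put;

	if (insert_inode_locked(inode) < 0) {
		rc = -EINVAL;
		goto fail_release;
	}

	return inode;

fail_release:
	example_release_disk_inode(inode);	/* assumed helper */
fail_put:
	inode->i_nlink = 0;	/* tell iput() to discard the half-built inode */
	iput(inode);
fail:
	return ERR_PTR(rc);
}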
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index cc3cedffbfa1..b4de56b851e4 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -155,7 +155,6 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
155 ip->i_fop = &jfs_file_operations; 155 ip->i_fop = &jfs_file_operations;
156 ip->i_mapping->a_ops = &jfs_aops; 156 ip->i_mapping->a_ops = &jfs_aops;
157 157
158 insert_inode_hash(ip);
159 mark_inode_dirty(ip); 158 mark_inode_dirty(ip);
160 159
161 dip->i_ctime = dip->i_mtime = CURRENT_TIME; 160 dip->i_ctime = dip->i_mtime = CURRENT_TIME;
@@ -171,9 +170,12 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
171 if (rc) { 170 if (rc) {
172 free_ea_wmap(ip); 171 free_ea_wmap(ip);
173 ip->i_nlink = 0; 172 ip->i_nlink = 0;
173 unlock_new_inode(ip);
174 iput(ip); 174 iput(ip);
175 } else 175 } else {
176 d_instantiate(dentry, ip); 176 d_instantiate(dentry, ip);
177 unlock_new_inode(ip);
178 }
177 179
178 out2: 180 out2:
179 free_UCSname(&dname); 181 free_UCSname(&dname);
@@ -289,7 +291,6 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
289 ip->i_op = &jfs_dir_inode_operations; 291 ip->i_op = &jfs_dir_inode_operations;
290 ip->i_fop = &jfs_dir_operations; 292 ip->i_fop = &jfs_dir_operations;
291 293
292 insert_inode_hash(ip);
293 mark_inode_dirty(ip); 294 mark_inode_dirty(ip);
294 295
295 /* update parent directory inode */ 296 /* update parent directory inode */
@@ -306,9 +307,12 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
306 if (rc) { 307 if (rc) {
307 free_ea_wmap(ip); 308 free_ea_wmap(ip);
308 ip->i_nlink = 0; 309 ip->i_nlink = 0;
310 unlock_new_inode(ip);
309 iput(ip); 311 iput(ip);
310 } else 312 } else {
311 d_instantiate(dentry, ip); 313 d_instantiate(dentry, ip);
314 unlock_new_inode(ip);
315 }
312 316
313 out2: 317 out2:
314 free_UCSname(&dname); 318 free_UCSname(&dname);
@@ -1019,7 +1023,6 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
1019 goto out3; 1023 goto out3;
1020 } 1024 }
1021 1025
1022 insert_inode_hash(ip);
1023 mark_inode_dirty(ip); 1026 mark_inode_dirty(ip);
1024 1027
1025 dip->i_ctime = dip->i_mtime = CURRENT_TIME; 1028 dip->i_ctime = dip->i_mtime = CURRENT_TIME;
@@ -1039,9 +1042,12 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
1039 if (rc) { 1042 if (rc) {
1040 free_ea_wmap(ip); 1043 free_ea_wmap(ip);
1041 ip->i_nlink = 0; 1044 ip->i_nlink = 0;
1045 unlock_new_inode(ip);
1042 iput(ip); 1046 iput(ip);
1043 } else 1047 } else {
1044 d_instantiate(dentry, ip); 1048 d_instantiate(dentry, ip);
1049 unlock_new_inode(ip);
1050 }
1045 1051
1046 out2: 1052 out2:
1047 free_UCSname(&dname); 1053 free_UCSname(&dname);
@@ -1399,7 +1405,6 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
1399 jfs_ip->dev = new_encode_dev(rdev); 1405 jfs_ip->dev = new_encode_dev(rdev);
1400 init_special_inode(ip, ip->i_mode, rdev); 1406 init_special_inode(ip, ip->i_mode, rdev);
1401 1407
1402 insert_inode_hash(ip);
1403 mark_inode_dirty(ip); 1408 mark_inode_dirty(ip);
1404 1409
1405 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1410 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
@@ -1417,9 +1422,12 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
1417 if (rc) { 1422 if (rc) {
1418 free_ea_wmap(ip); 1423 free_ea_wmap(ip);
1419 ip->i_nlink = 0; 1424 ip->i_nlink = 0;
1425 unlock_new_inode(ip);
1420 iput(ip); 1426 iput(ip);
1421 } else 1427 } else {
1422 d_instantiate(dentry, ip); 1428 d_instantiate(dentry, ip);
1429 unlock_new_inode(ip);
1430 }
1423 1431
1424 out1: 1432 out1:
1425 free_UCSname(&dname); 1433 free_UCSname(&dname);
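All four namei.c hunks apply the same exit discipline: an inode obtained through insert_inode_locked() carries I_NEW and must be unlocked on every path out, after d_instantiate() on success and before the final iput() on failure. Consolidated, the shape is:

	if (rc) {
		free_ea_wmap(ip);
		ip->i_nlink = 0;	/* discard the half-built inode */
		unlock_new_inode(ip);
		iput(ip);
	} else {
		d_instantiate(dentry, ip);
		unlock_new_inode(ip);	/* wakes any I_NEW waiters */
	}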
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 0dae345e481b..b37d1f78b854 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -543,7 +543,7 @@ out_kfree:
543 return ret; 543 return ret;
544} 544}
545 545
546static void jfs_write_super_lockfs(struct super_block *sb) 546static int jfs_freeze(struct super_block *sb)
547{ 547{
548 struct jfs_sb_info *sbi = JFS_SBI(sb); 548 struct jfs_sb_info *sbi = JFS_SBI(sb);
549 struct jfs_log *log = sbi->log; 549 struct jfs_log *log = sbi->log;
@@ -553,9 +553,10 @@ static void jfs_write_super_lockfs(struct super_block *sb)
553 lmLogShutdown(log); 553 lmLogShutdown(log);
554 updateSuper(sb, FM_CLEAN); 554 updateSuper(sb, FM_CLEAN);
555 } 555 }
556 return 0;
556} 557}
557 558
558static void jfs_unlockfs(struct super_block *sb) 559static int jfs_unfreeze(struct super_block *sb)
559{ 560{
560 struct jfs_sb_info *sbi = JFS_SBI(sb); 561 struct jfs_sb_info *sbi = JFS_SBI(sb);
561 struct jfs_log *log = sbi->log; 562 struct jfs_log *log = sbi->log;
@@ -568,6 +569,7 @@ static void jfs_unlockfs(struct super_block *sb)
568 else 569 else
569 txResume(sb); 570 txResume(sb);
570 } 571 }
572 return 0;
571} 573}
572 574
573static int jfs_get_sb(struct file_system_type *fs_type, 575static int jfs_get_sb(struct file_system_type *fs_type,
@@ -735,8 +737,8 @@ static const struct super_operations jfs_super_operations = {
735 .delete_inode = jfs_delete_inode, 737 .delete_inode = jfs_delete_inode,
736 .put_super = jfs_put_super, 738 .put_super = jfs_put_super,
737 .sync_fs = jfs_sync_fs, 739 .sync_fs = jfs_sync_fs,
738 .write_super_lockfs = jfs_write_super_lockfs, 740 .freeze_fs = jfs_freeze,
739 .unlockfs = jfs_unlockfs, 741 .unfreeze_fs = jfs_unfreeze,
740 .statfs = jfs_statfs, 742 .statfs = jfs_statfs,
741 .remount_fs = jfs_remount, 743 .remount_fs = jfs_remount,
742 .show_options = jfs_show_options, 744 .show_options = jfs_show_options,
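The super.c hunks track a VFS interface change in this release: write_super_lockfs/unlockfs became freeze_fs/unfreeze_fs, and both now return an int so a filesystem can fail a freeze. A skeleton of the wiring, with invented names:

static int example_freeze(struct super_block *sb)
{
	/* quiesce transactions and flush the journal here */
	return 0;	/* a nonzero return would fail the freeze */
}

static int example_unfreeze(struct super_block *sb)
{
	/* restart the journal and resume transactions here */
	return 0;
}

static const struct super_operations example_sops = {
	.freeze_fs   = example_freeze,
	.unfreeze_fs = example_unfreeze,
};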
diff --git a/fs/libfs.c b/fs/libfs.c
index e960a8321902..49b44099dabb 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -231,7 +231,6 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
231 */ 231 */
232 root->i_ino = 1; 232 root->i_ino = 1;
233 root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; 233 root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
234 root->i_uid = root->i_gid = 0;
235 root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME; 234 root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
236 dentry = d_alloc(NULL, &d_name); 235 dentry = d_alloc(NULL, &d_name);
237 if (!dentry) { 236 if (!dentry) {
@@ -360,7 +359,7 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
360 index = pos >> PAGE_CACHE_SHIFT; 359 index = pos >> PAGE_CACHE_SHIFT;
361 from = pos & (PAGE_CACHE_SIZE - 1); 360 from = pos & (PAGE_CACHE_SIZE - 1);
362 361
363 page = __grab_cache_page(mapping, index); 362 page = grab_cache_page_write_begin(mapping, index, flags);
364 if (!page) 363 if (!page)
365 return -ENOMEM; 364 return -ENOMEM;
366 365
@@ -436,8 +435,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
436 */ 435 */
437 inode->i_ino = 1; 436 inode->i_ino = 1;
438 inode->i_mode = S_IFDIR | 0755; 437 inode->i_mode = S_IFDIR | 0755;
439 inode->i_uid = inode->i_gid = 0;
440 inode->i_blocks = 0;
441 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 438 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
442 inode->i_op = &simple_dir_inode_operations; 439 inode->i_op = &simple_dir_inode_operations;
443 inode->i_fop = &simple_dir_operations; 440 inode->i_fop = &simple_dir_operations;
@@ -464,8 +461,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
464 if (!inode) 461 if (!inode)
465 goto out; 462 goto out;
466 inode->i_mode = S_IFREG | files->mode; 463 inode->i_mode = S_IFREG | files->mode;
467 inode->i_uid = inode->i_gid = 0;
468 inode->i_blocks = 0;
469 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 464 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
470 inode->i_fop = files->ops; 465 inode->i_fop = files->ops;
471 inode->i_ino = i; 466 inode->i_ino = i;
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 8307dd64bf46..1f3b0fc0d351 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -14,6 +14,7 @@
14#include <linux/sunrpc/svc.h> 14#include <linux/sunrpc/svc.h>
15#include <linux/lockd/lockd.h> 15#include <linux/lockd/lockd.h>
16#include <linux/smp_lock.h> 16#include <linux/smp_lock.h>
17#include <linux/kthread.h>
17 18
18#define NLMDBG_FACILITY NLMDBG_CLIENT 19#define NLMDBG_FACILITY NLMDBG_CLIENT
19 20
@@ -60,7 +61,7 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
60 61
61 host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen, 62 host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen,
62 nlm_init->protocol, nlm_version, 63 nlm_init->protocol, nlm_version,
63 nlm_init->hostname); 64 nlm_init->hostname, nlm_init->noresvport);
64 if (host == NULL) { 65 if (host == NULL) {
65 lockd_down(); 66 lockd_down();
66 return ERR_PTR(-ENOLCK); 67 return ERR_PTR(-ENOLCK);
@@ -191,11 +192,15 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
191void 192void
192nlmclnt_recovery(struct nlm_host *host) 193nlmclnt_recovery(struct nlm_host *host)
193{ 194{
195 struct task_struct *task;
196
194 if (!host->h_reclaiming++) { 197 if (!host->h_reclaiming++) {
195 nlm_get_host(host); 198 nlm_get_host(host);
196 __module_get(THIS_MODULE); 199 task = kthread_run(reclaimer, host, "%s-reclaim", host->h_name);
197 if (kernel_thread(reclaimer, host, CLONE_FS | CLONE_FILES) < 0) 200 if (IS_ERR(task))
198 module_put(THIS_MODULE); 201 printk(KERN_ERR "lockd: unable to spawn reclaimer "
202 "thread. Locks for %s won't be reclaimed! "
203 "(%ld)\n", host->h_name, PTR_ERR(task));
199 } 204 }
200} 205}
201 206
@@ -207,7 +212,6 @@ reclaimer(void *ptr)
207 struct file_lock *fl, *next; 212 struct file_lock *fl, *next;
208 u32 nsmstate; 213 u32 nsmstate;
209 214
210 daemonize("%s-reclaim", host->h_name);
211 allow_signal(SIGKILL); 215 allow_signal(SIGKILL);
212 216
213 down_write(&host->h_rwsem); 217 down_write(&host->h_rwsem);
@@ -233,7 +237,12 @@ restart:
233 list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) { 237 list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) {
234 list_del_init(&fl->fl_u.nfs_fl.list); 238 list_del_init(&fl->fl_u.nfs_fl.list);
235 239
236 /* Why are we leaking memory here? --okir */ 240 /*
241 * sending this thread a SIGKILL will result in any unreclaimed
242 * locks being removed from the h_granted list. This means that
243 * the kernel will not attempt to reclaim them again if a new
244 * reclaimer thread is spawned for this host.
245 */
237 if (signalled()) 246 if (signalled())
238 continue; 247 continue;
239 if (nlmclnt_reclaim(host, fl) != 0) 248 if (nlmclnt_reclaim(host, fl) != 0)
@@ -261,5 +270,5 @@ restart:
261 nlm_release_host(host); 270 nlm_release_host(host);
262 lockd_down(); 271 lockd_down();
263 unlock_kernel(); 272 unlock_kernel();
264 module_put_and_exit(0); 273 return 0;
265} 274}
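The clntlock.c change converts a raw kernel_thread()/daemonize() pair to the kthread API: kthread_run() returns a fully set-up task_struct (or an ERR_PTR), no daemonize() call is needed, and the thread function simply returns instead of calling module_put_and_exit(). The call site, reduced to a sketch:

#include <linux/kthread.h>

static int reclaimer(void *ptr);	/* thread body, as in the file above */

static void spawn_reclaimer_sketch(struct nlm_host *host)
{
	struct task_struct *task;

	task = kthread_run(reclaimer, host, "%s-reclaim", host->h_name);
	if (IS_ERR(task))
		printk(KERN_ERR "lockd: cannot spawn reclaimer for %s (%ld)\n",
		       host->h_name, PTR_ERR(task));
}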
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 31668b690e03..dd7957064a8c 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -16,7 +16,6 @@
16#include <linux/sunrpc/clnt.h> 16#include <linux/sunrpc/clnt.h>
17#include <linux/sunrpc/svc.h> 17#include <linux/sunrpc/svc.h>
18#include <linux/lockd/lockd.h> 18#include <linux/lockd/lockd.h>
19#include <linux/lockd/sm_inter.h>
20 19
21#define NLMDBG_FACILITY NLMDBG_CLIENT 20#define NLMDBG_FACILITY NLMDBG_CLIENT
22#define NLMCLNT_GRACE_WAIT (5*HZ) 21#define NLMCLNT_GRACE_WAIT (5*HZ)
@@ -518,11 +517,9 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
518 unsigned char fl_type; 517 unsigned char fl_type;
519 int status = -ENOLCK; 518 int status = -ENOLCK;
520 519
521 if (nsm_monitor(host) < 0) { 520 if (nsm_monitor(host) < 0)
522 printk(KERN_NOTICE "lockd: failed to monitor %s\n",
523 host->h_name);
524 goto out; 521 goto out;
525 } 522
526 fl->fl_flags |= FL_ACCESS; 523 fl->fl_flags |= FL_ACCESS;
527 status = do_vfs_lock(fl); 524 status = do_vfs_lock(fl);
528 fl->fl_flags = fl_flags; 525 fl->fl_flags = fl_flags;
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index e05d04416037..99d737bd4325 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -15,7 +15,6 @@
15#include <linux/sunrpc/clnt.h> 15#include <linux/sunrpc/clnt.h>
16#include <linux/sunrpc/svc.h> 16#include <linux/sunrpc/svc.h>
17#include <linux/lockd/lockd.h> 17#include <linux/lockd/lockd.h>
18#include <linux/lockd/sm_inter.h>
19#include <linux/mutex.h> 18#include <linux/mutex.h>
20 19
21#include <net/ipv6.h> 20#include <net/ipv6.h>
@@ -32,11 +31,6 @@ static int nrhosts;
32static DEFINE_MUTEX(nlm_host_mutex); 31static DEFINE_MUTEX(nlm_host_mutex);
33 32
34static void nlm_gc_hosts(void); 33static void nlm_gc_hosts(void);
35static struct nsm_handle *nsm_find(const struct sockaddr *sap,
36 const size_t salen,
37 const char *hostname,
38 const size_t hostname_len,
39 const int create);
40 34
41struct nlm_lookup_host_info { 35struct nlm_lookup_host_info {
42 const int server; /* search for server|client */ 36 const int server; /* search for server|client */
@@ -48,6 +42,7 @@ struct nlm_lookup_host_info {
48 const size_t hostname_len; /* its length */ 42 const size_t hostname_len; /* its length */
49 const struct sockaddr *src_sap; /* our address (optional) */ 43 const struct sockaddr *src_sap; /* our address (optional) */
50 const size_t src_len; /* its length */ 44 const size_t src_len; /* its length */
45 const int noresvport; /* use non-priv port */
51}; 46};
52 47
53/* 48/*
@@ -104,32 +99,6 @@ static void nlm_clear_port(struct sockaddr *sap)
104 } 99 }
105} 100}
106 101
107static void nlm_display_address(const struct sockaddr *sap,
108 char *buf, const size_t len)
109{
110 const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
111 const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
112
113 switch (sap->sa_family) {
114 case AF_UNSPEC:
115 snprintf(buf, len, "unspecified");
116 break;
117 case AF_INET:
118 snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
119 break;
120 case AF_INET6:
121 if (ipv6_addr_v4mapped(&sin6->sin6_addr))
122 snprintf(buf, len, "%pI4",
123 &sin6->sin6_addr.s6_addr32[3]);
124 else
125 snprintf(buf, len, "%pI6", &sin6->sin6_addr);
126 break;
127 default:
128 snprintf(buf, len, "unsupported address family");
129 break;
130 }
131}
132
133/* 102/*
134 * Common host lookup routine for server & client 103 * Common host lookup routine for server & client
135 */ 104 */
@@ -189,8 +158,8 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
189 atomic_inc(&nsm->sm_count); 158 atomic_inc(&nsm->sm_count);
190 else { 159 else {
191 host = NULL; 160 host = NULL;
192 nsm = nsm_find(ni->sap, ni->salen, 161 nsm = nsm_get_handle(ni->sap, ni->salen,
193 ni->hostname, ni->hostname_len, 1); 162 ni->hostname, ni->hostname_len);
194 if (!nsm) { 163 if (!nsm) {
195 dprintk("lockd: nlm_lookup_host failed; " 164 dprintk("lockd: nlm_lookup_host failed; "
196 "no nsm handle\n"); 165 "no nsm handle\n");
@@ -205,6 +174,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
205 goto out; 174 goto out;
206 } 175 }
207 host->h_name = nsm->sm_name; 176 host->h_name = nsm->sm_name;
177 host->h_addrbuf = nsm->sm_addrbuf;
208 memcpy(nlm_addr(host), ni->sap, ni->salen); 178 memcpy(nlm_addr(host), ni->sap, ni->salen);
209 host->h_addrlen = ni->salen; 179 host->h_addrlen = ni->salen;
210 nlm_clear_port(nlm_addr(host)); 180 nlm_clear_port(nlm_addr(host));
@@ -222,6 +192,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
222 host->h_nsmstate = 0; /* real NSM state */ 192 host->h_nsmstate = 0; /* real NSM state */
223 host->h_nsmhandle = nsm; 193 host->h_nsmhandle = nsm;
224 host->h_server = ni->server; 194 host->h_server = ni->server;
195 host->h_noresvport = ni->noresvport;
225 hlist_add_head(&host->h_hash, chain); 196 hlist_add_head(&host->h_hash, chain);
226 INIT_LIST_HEAD(&host->h_lockowners); 197 INIT_LIST_HEAD(&host->h_lockowners);
227 spin_lock_init(&host->h_lock); 198 spin_lock_init(&host->h_lock);
@@ -230,11 +201,6 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
230 201
231 nrhosts++; 202 nrhosts++;
232 203
233 nlm_display_address((struct sockaddr *)&host->h_addr,
234 host->h_addrbuf, sizeof(host->h_addrbuf));
235 nlm_display_address((struct sockaddr *)&host->h_srcaddr,
236 host->h_srcaddrbuf, sizeof(host->h_srcaddrbuf));
237
238 dprintk("lockd: nlm_lookup_host created host %s\n", 204 dprintk("lockd: nlm_lookup_host created host %s\n",
239 host->h_name); 205 host->h_name);
240 206
@@ -254,10 +220,8 @@ nlm_destroy_host(struct nlm_host *host)
254 BUG_ON(!list_empty(&host->h_lockowners)); 220 BUG_ON(!list_empty(&host->h_lockowners));
255 BUG_ON(atomic_read(&host->h_count)); 221 BUG_ON(atomic_read(&host->h_count));
256 222
257 /*
258 * Release NSM handle and unmonitor host.
259 */
260 nsm_unmonitor(host); 223 nsm_unmonitor(host);
224 nsm_release(host->h_nsmhandle);
261 225
262 clnt = host->h_rpcclnt; 226 clnt = host->h_rpcclnt;
263 if (clnt != NULL) 227 if (clnt != NULL)
@@ -272,6 +236,7 @@ nlm_destroy_host(struct nlm_host *host)
272 * @protocol: transport protocol to use 236 * @protocol: transport protocol to use
273 * @version: NLM protocol version 237 * @version: NLM protocol version
274 * @hostname: '\0'-terminated hostname of server 238 * @hostname: '\0'-terminated hostname of server
239 * @noresvport: 1 if non-privileged port should be used
275 * 240 *
276 * Returns an nlm_host structure that matches the passed-in 241 * Returns an nlm_host structure that matches the passed-in
277 * [server address, transport protocol, NLM version, server hostname]. 242 * [server address, transport protocol, NLM version, server hostname].
@@ -281,7 +246,9 @@ nlm_destroy_host(struct nlm_host *host)
281struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, 246struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
282 const size_t salen, 247 const size_t salen,
283 const unsigned short protocol, 248 const unsigned short protocol,
284 const u32 version, const char *hostname) 249 const u32 version,
250 const char *hostname,
251 int noresvport)
285{ 252{
286 const struct sockaddr source = { 253 const struct sockaddr source = {
287 .sa_family = AF_UNSPEC, 254 .sa_family = AF_UNSPEC,
@@ -296,6 +263,7 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
296 .hostname_len = strlen(hostname), 263 .hostname_len = strlen(hostname),
297 .src_sap = &source, 264 .src_sap = &source,
298 .src_len = sizeof(source), 265 .src_len = sizeof(source),
266 .noresvport = noresvport,
299 }; 267 };
300 268
301 dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__, 269 dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__,
@@ -372,8 +340,8 @@ nlm_bind_host(struct nlm_host *host)
372{ 340{
373 struct rpc_clnt *clnt; 341 struct rpc_clnt *clnt;
374 342
375 dprintk("lockd: nlm_bind_host %s (%s), my addr=%s\n", 343 dprintk("lockd: nlm_bind_host %s (%s)\n",
376 host->h_name, host->h_addrbuf, host->h_srcaddrbuf); 344 host->h_name, host->h_addrbuf);
377 345
378 /* Lock host handle */ 346 /* Lock host handle */
379 mutex_lock(&host->h_mutex); 347 mutex_lock(&host->h_mutex);
@@ -417,6 +385,8 @@ nlm_bind_host(struct nlm_host *host)
417 */ 385 */
418 if (!host->h_server) 386 if (!host->h_server)
419 args.flags |= RPC_CLNT_CREATE_HARDRTRY; 387 args.flags |= RPC_CLNT_CREATE_HARDRTRY;
388 if (host->h_noresvport)
389 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
420 390
421 clnt = rpc_create(&args); 391 clnt = rpc_create(&args);
422 if (!IS_ERR(clnt)) 392 if (!IS_ERR(clnt))
@@ -473,35 +443,23 @@ void nlm_release_host(struct nlm_host *host)
473 } 443 }
474} 444}
475 445
476/* 446/**
477 * We were notified that the host indicated by address &sin 447 * nlm_host_rebooted - Release all resources held by rebooted host
478 * has rebooted. 448 * @info: pointer to decoded results of NLM_SM_NOTIFY call
479 * Release all resources held by that peer. 449 *
450 * We were notified that the specified host has rebooted. Release
451 * all resources held by that peer.
480 */ 452 */
481void nlm_host_rebooted(const struct sockaddr_in *sin, 453void nlm_host_rebooted(const struct nlm_reboot *info)
482 const char *hostname,
483 unsigned int hostname_len,
484 u32 new_state)
485{ 454{
486 struct hlist_head *chain; 455 struct hlist_head *chain;
487 struct hlist_node *pos; 456 struct hlist_node *pos;
488 struct nsm_handle *nsm; 457 struct nsm_handle *nsm;
489 struct nlm_host *host; 458 struct nlm_host *host;
490 459
491 nsm = nsm_find((struct sockaddr *)sin, sizeof(*sin), 460 nsm = nsm_reboot_lookup(info);
492 hostname, hostname_len, 0); 461 if (unlikely(nsm == NULL))
493 if (nsm == NULL) {
494 dprintk("lockd: never saw rebooted peer '%.*s' before\n",
495 hostname_len, hostname);
496 return; 462 return;
497 }
498
499 dprintk("lockd: nlm_host_rebooted(%.*s, %s)\n",
500 hostname_len, hostname, nsm->sm_addrbuf);
501
502 /* When reclaiming locks on this peer, make sure that
503 * we set up a new notification */
504 nsm->sm_monitored = 0;
505 463
506 /* Mark all hosts tied to this NSM state as having rebooted. 464 /* Mark all hosts tied to this NSM state as having rebooted.
507 * We run the loop repeatedly, because we drop the host table 465 * We run the loop repeatedly, because we drop the host table
@@ -512,8 +470,8 @@ again: mutex_lock(&nlm_host_mutex);
512 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { 470 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
513 hlist_for_each_entry(host, pos, chain, h_hash) { 471 hlist_for_each_entry(host, pos, chain, h_hash) {
514 if (host->h_nsmhandle == nsm 472 if (host->h_nsmhandle == nsm
515 && host->h_nsmstate != new_state) { 473 && host->h_nsmstate != info->state) {
516 host->h_nsmstate = new_state; 474 host->h_nsmstate = info->state;
517 host->h_state++; 475 host->h_state++;
518 476
519 nlm_get_host(host); 477 nlm_get_host(host);
@@ -621,89 +579,3 @@ nlm_gc_hosts(void)
621 579
622 next_gc = jiffies + NLM_HOST_COLLECT; 580 next_gc = jiffies + NLM_HOST_COLLECT;
623} 581}
624
625
626/*
627 * Manage NSM handles
628 */
629static LIST_HEAD(nsm_handles);
630static DEFINE_SPINLOCK(nsm_lock);
631
632static struct nsm_handle *nsm_find(const struct sockaddr *sap,
633 const size_t salen,
634 const char *hostname,
635 const size_t hostname_len,
636 const int create)
637{
638 struct nsm_handle *nsm = NULL;
639 struct nsm_handle *pos;
640
641 if (!sap)
642 return NULL;
643
644 if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
645 if (printk_ratelimit()) {
646 printk(KERN_WARNING "Invalid hostname \"%.*s\" "
647 "in NFS lock request\n",
648 (int)hostname_len, hostname);
649 }
650 return NULL;
651 }
652
653retry:
654 spin_lock(&nsm_lock);
655 list_for_each_entry(pos, &nsm_handles, sm_link) {
656
657 if (hostname && nsm_use_hostnames) {
658 if (strlen(pos->sm_name) != hostname_len
659 || memcmp(pos->sm_name, hostname, hostname_len))
660 continue;
661 } else if (!nlm_cmp_addr(nsm_addr(pos), sap))
662 continue;
663 atomic_inc(&pos->sm_count);
664 kfree(nsm);
665 nsm = pos;
666 goto found;
667 }
668 if (nsm) {
669 list_add(&nsm->sm_link, &nsm_handles);
670 goto found;
671 }
672 spin_unlock(&nsm_lock);
673
674 if (!create)
675 return NULL;
676
677 nsm = kzalloc(sizeof(*nsm) + hostname_len + 1, GFP_KERNEL);
678 if (nsm == NULL)
679 return NULL;
680
681 memcpy(nsm_addr(nsm), sap, salen);
682 nsm->sm_addrlen = salen;
683 nsm->sm_name = (char *) (nsm + 1);
684 memcpy(nsm->sm_name, hostname, hostname_len);
685 nsm->sm_name[hostname_len] = '\0';
686 nlm_display_address((struct sockaddr *)&nsm->sm_addr,
687 nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf));
688 atomic_set(&nsm->sm_count, 1);
689 goto retry;
690
691found:
692 spin_unlock(&nsm_lock);
693 return nsm;
694}
695
696/*
697 * Release an NSM handle
698 */
699void
700nsm_release(struct nsm_handle *nsm)
701{
702 if (!nsm)
703 return;
704 if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) {
705 list_del(&nsm->sm_link);
706 spin_unlock(&nsm_lock);
707 kfree(nsm);
708 }
709}
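Besides moving the NSM handle cache out of host.c, these hunks plumb the new noresvport option through nlm_lookup_host_info into the nlm_host and, ultimately, into the RPC transport. The final step is a flag selection in nlm_bind_host(), sketched here with an invented wrapper name:

static struct rpc_clnt *bind_host_sketch(struct nlm_host *host,
					 struct rpc_create_args *args)
{
	if (!host->h_server)
		args->flags |= RPC_CLNT_CREATE_HARDRTRY;
	if (host->h_noresvport)
		args->flags |= RPC_CLNT_CREATE_NONPRIVPORT;	/* non-privileged source port */
	return rpc_create(args);
}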
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index ffd3461f75ef..5e2c4d5ac827 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -9,35 +9,123 @@
9#include <linux/types.h> 9#include <linux/types.h>
10#include <linux/utsname.h> 10#include <linux/utsname.h>
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/ktime.h>
13
12#include <linux/sunrpc/clnt.h> 14#include <linux/sunrpc/clnt.h>
13#include <linux/sunrpc/xprtsock.h> 15#include <linux/sunrpc/xprtsock.h>
14#include <linux/sunrpc/svc.h> 16#include <linux/sunrpc/svc.h>
15#include <linux/lockd/lockd.h> 17#include <linux/lockd/lockd.h>
16#include <linux/lockd/sm_inter.h>
17
18 18
19#define NLMDBG_FACILITY NLMDBG_MONITOR 19#define NLMDBG_FACILITY NLMDBG_MONITOR
20#define NSM_PROGRAM 100024
21#define NSM_VERSION 1
22
23enum {
24 NSMPROC_NULL,
25 NSMPROC_STAT,
26 NSMPROC_MON,
27 NSMPROC_UNMON,
28 NSMPROC_UNMON_ALL,
29 NSMPROC_SIMU_CRASH,
30 NSMPROC_NOTIFY,
31};
32
33struct nsm_args {
34 struct nsm_private *priv;
35 u32 prog; /* RPC callback info */
36 u32 vers;
37 u32 proc;
20 38
21#define XDR_ADDRBUF_LEN (20) 39 char *mon_name;
40};
22 41
23static struct rpc_clnt * nsm_create(void); 42struct nsm_res {
43 u32 status;
44 u32 state;
45};
24 46
25static struct rpc_program nsm_program; 47static struct rpc_program nsm_program;
48static LIST_HEAD(nsm_handles);
49static DEFINE_SPINLOCK(nsm_lock);
26 50
27/* 51/*
28 * Local NSM state 52 * Local NSM state
29 */ 53 */
30int nsm_local_state; 54int __read_mostly nsm_local_state;
55int __read_mostly nsm_use_hostnames;
31 56
32/* 57static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
33 * Common procedure for SM_MON/SM_UNMON calls 58{
34 */ 59 return (struct sockaddr *)&nsm->sm_addr;
35static int 60}
36nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) 61
62static void nsm_display_ipv4_address(const struct sockaddr *sap, char *buf,
63 const size_t len)
64{
65 const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
66 snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
67}
68
69static void nsm_display_ipv6_address(const struct sockaddr *sap, char *buf,
70 const size_t len)
71{
72 const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
73
74 if (ipv6_addr_v4mapped(&sin6->sin6_addr))
75 snprintf(buf, len, "%pI4", &sin6->sin6_addr.s6_addr32[3]);
76 else if (sin6->sin6_scope_id != 0)
77 snprintf(buf, len, "%pI6%%%u", &sin6->sin6_addr,
78 sin6->sin6_scope_id);
79 else
80 snprintf(buf, len, "%pI6", &sin6->sin6_addr);
81}
82
83static void nsm_display_address(const struct sockaddr *sap,
84 char *buf, const size_t len)
85{
86 switch (sap->sa_family) {
87 case AF_INET:
88 nsm_display_ipv4_address(sap, buf, len);
89 break;
90 case AF_INET6:
91 nsm_display_ipv6_address(sap, buf, len);
92 break;
93 default:
94 snprintf(buf, len, "unsupported address family");
95 break;
96 }
97}
98
99static struct rpc_clnt *nsm_create(void)
100{
101 struct sockaddr_in sin = {
102 .sin_family = AF_INET,
103 .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
104 };
105 struct rpc_create_args args = {
106 .protocol = XPRT_TRANSPORT_UDP,
107 .address = (struct sockaddr *)&sin,
108 .addrsize = sizeof(sin),
109 .servername = "rpc.statd",
110 .program = &nsm_program,
111 .version = NSM_VERSION,
112 .authflavor = RPC_AUTH_NULL,
113 };
114
115 return rpc_create(&args);
116}
117
118static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
37{ 119{
38 struct rpc_clnt *clnt; 120 struct rpc_clnt *clnt;
39 int status; 121 int status;
40 struct nsm_args args; 122 struct nsm_args args = {
123 .priv = &nsm->sm_priv,
124 .prog = NLM_PROGRAM,
125 .vers = 3,
126 .proc = NLMPROC_NSM_NOTIFY,
127 .mon_name = nsm->sm_mon_name,
128 };
41 struct rpc_message msg = { 129 struct rpc_message msg = {
42 .rpc_argp = &args, 130 .rpc_argp = &args,
43 .rpc_resp = res, 131 .rpc_resp = res,
@@ -46,22 +134,18 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
46 clnt = nsm_create(); 134 clnt = nsm_create();
47 if (IS_ERR(clnt)) { 135 if (IS_ERR(clnt)) {
48 status = PTR_ERR(clnt); 136 status = PTR_ERR(clnt);
137 dprintk("lockd: failed to create NSM upcall transport, "
138 "status=%d\n", status);
49 goto out; 139 goto out;
50 } 140 }
51 141
52 memset(&args, 0, sizeof(args));
53 args.mon_name = nsm->sm_name;
54 args.addr = nsm_addr_in(nsm)->sin_addr.s_addr;
55 args.prog = NLM_PROGRAM;
56 args.vers = 3;
57 args.proc = NLMPROC_NSM_NOTIFY;
58 memset(res, 0, sizeof(*res)); 142 memset(res, 0, sizeof(*res));
59 143
60 msg.rpc_proc = &clnt->cl_procinfo[proc]; 144 msg.rpc_proc = &clnt->cl_procinfo[proc];
61 status = rpc_call_sync(clnt, &msg, 0); 145 status = rpc_call_sync(clnt, &msg, 0);
62 if (status < 0) 146 if (status < 0)
63 printk(KERN_DEBUG "nsm_mon_unmon: rpc failed, status=%d\n", 147 dprintk("lockd: NSM upcall RPC failed, status=%d\n",
64 status); 148 status);
65 else 149 else
66 status = 0; 150 status = 0;
67 rpc_shutdown_client(clnt); 151 rpc_shutdown_client(clnt);
@@ -69,82 +153,272 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
69 return status; 153 return status;
70} 154}
71 155
72/* 156/**
73 * Set up monitoring of a remote host 157 * nsm_monitor - Notify a peer in case we reboot
158 * @host: pointer to nlm_host of peer to notify
159 *
160 * If this peer is not already monitored, this function sends an
161 * upcall to the local rpc.statd to record the name/address of
162 * the peer to notify in case we reboot.
163 *
164 * Returns zero if the peer is monitored by the local rpc.statd;
165 * otherwise a negative errno value is returned.
74 */ 166 */
75int 167int nsm_monitor(const struct nlm_host *host)
76nsm_monitor(struct nlm_host *host)
77{ 168{
78 struct nsm_handle *nsm = host->h_nsmhandle; 169 struct nsm_handle *nsm = host->h_nsmhandle;
79 struct nsm_res res; 170 struct nsm_res res;
80 int status; 171 int status;
81 172
82 dprintk("lockd: nsm_monitor(%s)\n", host->h_name); 173 dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name);
83 BUG_ON(nsm == NULL);
84 174
85 if (nsm->sm_monitored) 175 if (nsm->sm_monitored)
86 return 0; 176 return 0;
87 177
88 status = nsm_mon_unmon(nsm, SM_MON, &res); 178 /*
179 * Choose whether to record the caller_name or IP address of
180 * this peer in the local rpc.statd's database.
181 */
182 nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
89 183
90 if (status < 0 || res.status != 0) 184 status = nsm_mon_unmon(nsm, NSMPROC_MON, &res);
91 printk(KERN_NOTICE "lockd: cannot monitor %s\n", host->h_name); 185 if (res.status != 0)
186 status = -EIO;
187 if (status < 0)
188 printk(KERN_NOTICE "lockd: cannot monitor %s\n", nsm->sm_name);
92 else 189 else
93 nsm->sm_monitored = 1; 190 nsm->sm_monitored = 1;
94 return status; 191 return status;
95} 192}
96 193
97/* 194/**
98 * Cease to monitor remote host 195 * nsm_unmonitor - Unregister peer notification
196 * @host: pointer to nlm_host of peer to stop monitoring
197 *
198 * If this peer is monitored, this function sends an upcall to
199 * tell the local rpc.statd not to send this peer a notification
200 * when we reboot.
99 */ 201 */
100int 202void nsm_unmonitor(const struct nlm_host *host)
101nsm_unmonitor(struct nlm_host *host)
102{ 203{
103 struct nsm_handle *nsm = host->h_nsmhandle; 204 struct nsm_handle *nsm = host->h_nsmhandle;
104 struct nsm_res res; 205 struct nsm_res res;
105 int status = 0; 206 int status;
106
107 if (nsm == NULL)
108 return 0;
109 host->h_nsmhandle = NULL;
110 207
111 if (atomic_read(&nsm->sm_count) == 1 208 if (atomic_read(&nsm->sm_count) == 1
112 && nsm->sm_monitored && !nsm->sm_sticky) { 209 && nsm->sm_monitored && !nsm->sm_sticky) {
113 dprintk("lockd: nsm_unmonitor(%s)\n", host->h_name); 210 dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name);
114 211
115 status = nsm_mon_unmon(nsm, SM_UNMON, &res); 212 status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res);
213 if (res.status != 0)
214 status = -EIO;
116 if (status < 0) 215 if (status < 0)
117 printk(KERN_NOTICE "lockd: cannot unmonitor %s\n", 216 printk(KERN_NOTICE "lockd: cannot unmonitor %s\n",
118 host->h_name); 217 nsm->sm_name);
119 else 218 else
120 nsm->sm_monitored = 0; 219 nsm->sm_monitored = 0;
121 } 220 }
122 nsm_release(nsm); 221}
123 return status; 222
223static struct nsm_handle *nsm_lookup_hostname(const char *hostname,
224 const size_t len)
225{
226 struct nsm_handle *nsm;
227
228 list_for_each_entry(nsm, &nsm_handles, sm_link)
229 if (strlen(nsm->sm_name) == len &&
230 memcmp(nsm->sm_name, hostname, len) == 0)
231 return nsm;
232 return NULL;
233}
234
235static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap)
236{
237 struct nsm_handle *nsm;
238
239 list_for_each_entry(nsm, &nsm_handles, sm_link)
240 if (nlm_cmp_addr(nsm_addr(nsm), sap))
241 return nsm;
242 return NULL;
243}
244
245static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv)
246{
247 struct nsm_handle *nsm;
248
249 list_for_each_entry(nsm, &nsm_handles, sm_link)
250 if (memcmp(nsm->sm_priv.data, priv->data,
251 sizeof(priv->data)) == 0)
252 return nsm;
253 return NULL;
124} 254}
125 255
126/* 256/*
127 * Create NSM client for the local host 257 * Construct a unique cookie to match this nsm_handle to this monitored
258 * host. It is passed to the local rpc.statd via NSMPROC_MON, and
259 * returned via NLMPROC_SM_NOTIFY, in the "priv" field of these
260 * requests.
261 *
262 * The NSM protocol requires that these cookies be unique while the
263 * system is running. We prefer a stronger requirement of making them
264 * unique across reboots. If user space bugs cause a stale cookie to
265 * be sent to the kernel, it could cause the wrong host to lose its
266 * lock state if cookies were not unique across reboots.
267 *
268 * The cookies are exposed only to local user space via loopback. They
269 * do not appear on the physical network. If we want greater security
270 * for some reason, nsm_init_private() could perform a one-way hash to
271 * obscure the contents of the cookie.
128 */ 272 */
129static struct rpc_clnt * 273static void nsm_init_private(struct nsm_handle *nsm)
130nsm_create(void)
131{ 274{
132 struct sockaddr_in sin = { 275 u64 *p = (u64 *)&nsm->sm_priv.data;
133 .sin_family = AF_INET, 276 struct timespec ts;
134 .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
135 .sin_port = 0,
136 };
137 struct rpc_create_args args = {
138 .protocol = XPRT_TRANSPORT_UDP,
139 .address = (struct sockaddr *)&sin,
140 .addrsize = sizeof(sin),
141 .servername = "localhost",
142 .program = &nsm_program,
143 .version = SM_VERSION,
144 .authflavor = RPC_AUTH_NULL,
145 };
146 277
147 return rpc_create(&args); 278 ktime_get_ts(&ts);
279 *p++ = timespec_to_ns(&ts);
280 *p = (unsigned long)nsm;
281}
282
283static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
284 const size_t salen,
285 const char *hostname,
286 const size_t hostname_len)
287{
288 struct nsm_handle *new;
289
290 new = kzalloc(sizeof(*new) + hostname_len + 1, GFP_KERNEL);
291 if (unlikely(new == NULL))
292 return NULL;
293
294 atomic_set(&new->sm_count, 1);
295 new->sm_name = (char *)(new + 1);
296 memcpy(nsm_addr(new), sap, salen);
297 new->sm_addrlen = salen;
298 nsm_init_private(new);
299 nsm_display_address((const struct sockaddr *)&new->sm_addr,
300 new->sm_addrbuf, sizeof(new->sm_addrbuf));
301 memcpy(new->sm_name, hostname, hostname_len);
302 new->sm_name[hostname_len] = '\0';
303
304 return new;
305}
306
307/**
308 * nsm_get_handle - Find or create a cached nsm_handle
309 * @sap: pointer to socket address of handle to find
310 * @salen: length of socket address
311 * @hostname: pointer to C string containing hostname to find
312 * @hostname_len: length of C string
313 *
314 * Behavior is modulated by the global nsm_use_hostnames variable.
315 *
316 * Returns a cached nsm_handle after bumping its ref count, or
317 * returns a fresh nsm_handle if a handle that matches @sap and/or
318 * @hostname cannot be found in the handle cache. Returns NULL if
319 * an error occurs.
320 */
321struct nsm_handle *nsm_get_handle(const struct sockaddr *sap,
322 const size_t salen, const char *hostname,
323 const size_t hostname_len)
324{
325 struct nsm_handle *cached, *new = NULL;
326
327 if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
328 if (printk_ratelimit()) {
329 printk(KERN_WARNING "Invalid hostname \"%.*s\" "
330 "in NFS lock request\n",
331 (int)hostname_len, hostname);
332 }
333 return NULL;
334 }
335
336retry:
337 spin_lock(&nsm_lock);
338
339 if (nsm_use_hostnames && hostname != NULL)
340 cached = nsm_lookup_hostname(hostname, hostname_len);
341 else
342 cached = nsm_lookup_addr(sap);
343
344 if (cached != NULL) {
345 atomic_inc(&cached->sm_count);
346 spin_unlock(&nsm_lock);
347 kfree(new);
348 dprintk("lockd: found nsm_handle for %s (%s), "
349 "cnt %d\n", cached->sm_name,
350 cached->sm_addrbuf,
351 atomic_read(&cached->sm_count));
352 return cached;
353 }
354
355 if (new != NULL) {
356 list_add(&new->sm_link, &nsm_handles);
357 spin_unlock(&nsm_lock);
358 dprintk("lockd: created nsm_handle for %s (%s)\n",
359 new->sm_name, new->sm_addrbuf);
360 return new;
361 }
362
363 spin_unlock(&nsm_lock);
364
365 new = nsm_create_handle(sap, salen, hostname, hostname_len);
366 if (unlikely(new == NULL))
367 return NULL;
368 goto retry;
369}
370
371/**
372 * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle
373 * @info: pointer to NLMPROC_SM_NOTIFY arguments
374 *
375 * Returns a matching nsm_handle if found in the nsm cache; the returned
376 * nsm_handle's reference count is bumped and sm_monitored is cleared.
377 * Otherwise returns NULL if some error occurred.
378 */
379struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info)
380{
381 struct nsm_handle *cached;
382
383 spin_lock(&nsm_lock);
384
385 cached = nsm_lookup_priv(&info->priv);
386 if (unlikely(cached == NULL)) {
387 spin_unlock(&nsm_lock);
388 dprintk("lockd: never saw rebooted peer '%.*s' before\n",
389 info->len, info->mon);
390 return cached;
391 }
392
393 atomic_inc(&cached->sm_count);
394 spin_unlock(&nsm_lock);
395
396 /*
397 * During subsequent lock activity, force a fresh
398 * notification to be set up for this host.
399 */
400 cached->sm_monitored = 0;
401
402 dprintk("lockd: host %s (%s) rebooted, cnt %d\n",
403 cached->sm_name, cached->sm_addrbuf,
404 atomic_read(&cached->sm_count));
405 return cached;
406}
407
408/**
409 * nsm_release - Release an NSM handle
410 * @nsm: pointer to handle to be released
411 *
412 */
413void nsm_release(struct nsm_handle *nsm)
414{
415 if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) {
416 list_del(&nsm->sm_link);
417 spin_unlock(&nsm_lock);
418 dprintk("lockd: destroyed nsm_handle for %s (%s)\n",
419 nsm->sm_name, nsm->sm_addrbuf);
420 kfree(nsm);
421 }
148} 422}
149 423
150/* 424/*
@@ -154,127 +428,132 @@ nsm_create(void)
154 * Status Monitor wire protocol. 428 * Status Monitor wire protocol.
155 */ 429 */
156 430
157static __be32 *xdr_encode_nsm_string(__be32 *p, char *string) 431static int encode_nsm_string(struct xdr_stream *xdr, const char *string)
158{ 432{
159 size_t len = strlen(string); 433 const u32 len = strlen(string);
160 434 __be32 *p;
161 if (len > SM_MAXSTRLEN) 435
162 len = SM_MAXSTRLEN; 436 if (unlikely(len > SM_MAXSTRLEN))
163 return xdr_encode_opaque(p, string, len); 437 return -EIO;
438 p = xdr_reserve_space(xdr, sizeof(u32) + len);
439 if (unlikely(p == NULL))
440 return -EIO;
441 xdr_encode_opaque(p, string, len);
442 return 0;
164} 443}
165 444
166/* 445/*
167 * "mon_name" specifies the host to be monitored. 446 * "mon_name" specifies the host to be monitored.
168 *
169 * Linux uses a text version of the IP address of the remote
170 * host as the host identifier (the "mon_name" argument).
171 *
172 * Linux statd always looks up the canonical hostname first for
173 * whatever remote hostname it receives, so this works alright.
174 */ 447 */
175static __be32 *xdr_encode_mon_name(__be32 *p, struct nsm_args *argp) 448static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
176{ 449{
177 char buffer[XDR_ADDRBUF_LEN + 1]; 450 return encode_nsm_string(xdr, argp->mon_name);
178 char *name = argp->mon_name;
179
180 if (!nsm_use_hostnames) {
181 snprintf(buffer, XDR_ADDRBUF_LEN,
182 "%pI4", &argp->addr);
183 name = buffer;
184 }
185
186 return xdr_encode_nsm_string(p, name);
187} 451}
188 452
189/* 453/*
190 * The "my_id" argument specifies the hostname and RPC procedure 454 * The "my_id" argument specifies the hostname and RPC procedure
191 * to be called when the status manager receives notification 455 * to be called when the status manager receives notification
192 * (via the SM_NOTIFY call) that the state of host "mon_name" 456 * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name"
193 * has changed. 457 * has changed.
194 */ 458 */
195static __be32 *xdr_encode_my_id(__be32 *p, struct nsm_args *argp) 459static int encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp)
196{ 460{
197 p = xdr_encode_nsm_string(p, utsname()->nodename); 461 int status;
198 if (!p) 462 __be32 *p;
199 return ERR_PTR(-EIO); 463
200 464 status = encode_nsm_string(xdr, utsname()->nodename);
465 if (unlikely(status != 0))
466 return status;
467 p = xdr_reserve_space(xdr, 3 * sizeof(u32));
468 if (unlikely(p == NULL))
469 return -EIO;
201 *p++ = htonl(argp->prog); 470 *p++ = htonl(argp->prog);
202 *p++ = htonl(argp->vers); 471 *p++ = htonl(argp->vers);
203 *p++ = htonl(argp->proc); 472 *p++ = htonl(argp->proc);
204 473 return 0;
205 return p;
206} 474}
207 475
208/* 476/*
209 * The "mon_id" argument specifies the non-private arguments 477 * The "mon_id" argument specifies the non-private arguments
210 * of an SM_MON or SM_UNMON call. 478 * of an NSMPROC_MON or NSMPROC_UNMON call.
211 */ 479 */
212static __be32 *xdr_encode_mon_id(__be32 *p, struct nsm_args *argp) 480static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
213{ 481{
214 p = xdr_encode_mon_name(p, argp); 482 int status;
215 if (!p)
216 return ERR_PTR(-EIO);
217 483
218 return xdr_encode_my_id(p, argp); 484 status = encode_mon_name(xdr, argp);
485 if (unlikely(status != 0))
486 return status;
487 return encode_my_id(xdr, argp);
219} 488}
220 489
221/* 490/*
222 * The "priv" argument may contain private information required 491 * The "priv" argument may contain private information required
223 * by the SM_MON call. This information will be supplied in the 492 * by the NSMPROC_MON call. This information will be supplied in the
224 * SM_NOTIFY call. 493 * NLMPROC_SM_NOTIFY call.
225 *
226 * Linux provides the raw IP address of the monitored host,
227 * left in network byte order.
228 */ 494 */
229static __be32 *xdr_encode_priv(__be32 *p, struct nsm_args *argp) 495static int encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp)
230{ 496{
231 *p++ = argp->addr; 497 __be32 *p;
232 *p++ = 0;
233 *p++ = 0;
234 *p++ = 0;
235 498
236 return p; 499 p = xdr_reserve_space(xdr, SM_PRIV_SIZE);
500 if (unlikely(p == NULL))
501 return -EIO;
502 xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE);
503 return 0;
237} 504}
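
Unlike mon_name, the priv cookie is a fixed-size opaque (SM_PRIV_SIZE, 16 bytes in the NSM protocol), so no length word goes on the wire: xdr_reserve_space() claims exactly SM_PRIV_SIZE bytes and xdr_encode_opaque_fixed() copies the cookie verbatim. A rough user-space equivalent of the fixed-opaque copy, assuming that 16-byte size:

#include <stdint.h>
#include <string.h>

#define SM_PRIV_SIZE 16   /* fixed NSM cookie size, already 4-byte aligned */

/* Copy a fixed-size opaque into the output buffer; the peer knows the
 * length from the protocol, so no length word precedes the data. */
static int put_priv(uint8_t *out, size_t avail,
                    const uint8_t priv[SM_PRIV_SIZE])
{
        if (avail < SM_PRIV_SIZE)
                return -1;            /* reserve failed: -EIO in the kernel */
        memcpy(out, priv, SM_PRIV_SIZE);
        return SM_PRIV_SIZE;
}

int main(void)
{
        uint8_t out[32], cookie[SM_PRIV_SIZE] = { 0xde, 0xad };

        return put_priv(out, sizeof(out), cookie) == SM_PRIV_SIZE ? 0 : 1;
}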
238 505
239static int 506static int xdr_enc_mon(struct rpc_rqst *req, __be32 *p,
240xdr_encode_mon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) 507 const struct nsm_args *argp)
241{ 508{
242 p = xdr_encode_mon_id(p, argp); 509 struct xdr_stream xdr;
243 if (IS_ERR(p)) 510 int status;
244 return PTR_ERR(p); 511
245 512 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
246 p = xdr_encode_priv(p, argp); 513 status = encode_mon_id(&xdr, argp);
247 if (IS_ERR(p)) 514 if (unlikely(status))
248 return PTR_ERR(p); 515 return status;
249 516 return encode_priv(&xdr, argp);
250 rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p);
251 return 0;
252} 517}
253 518
254static int 519static int xdr_enc_unmon(struct rpc_rqst *req, __be32 *p,
255xdr_encode_unmon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) 520 const struct nsm_args *argp)
256{ 521{
257 p = xdr_encode_mon_id(p, argp); 522 struct xdr_stream xdr;
258 if (IS_ERR(p)) 523
259 return PTR_ERR(p); 524 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
260 rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p); 525 return encode_mon_id(&xdr, argp);
261 return 0;
262} 526}
263 527
264static int 528static int xdr_dec_stat_res(struct rpc_rqst *rqstp, __be32 *p,
265xdr_decode_stat_res(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp) 529 struct nsm_res *resp)
266{ 530{
531 struct xdr_stream xdr;
532
533 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
534 p = xdr_inline_decode(&xdr, 2 * sizeof(u32));
535 if (unlikely(p == NULL))
536 return -EIO;
267 resp->status = ntohl(*p++); 537 resp->status = ntohl(*p++);
268 resp->state = ntohl(*p++); 538 resp->state = ntohl(*p);
269 dprintk("nsm: xdr_decode_stat_res status %d state %d\n", 539
540 dprintk("lockd: xdr_dec_stat_res status %d state %d\n",
270 resp->status, resp->state); 541 resp->status, resp->state);
271 return 0; 542 return 0;
272} 543}
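
On the decode side, xdr_inline_decode() hands back a pointer only when the full 2 * sizeof(u32) is actually present in the receive buffer, which is what makes the subsequent dereferences safe. The same bounds-check-then-read discipline in plain C:

#include <arpa/inet.h>   /* ntohl() */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Pull two big-endian 32-bit words (status, state) out of a reply,
 * refusing to read past the end of the buffer. */
static int dec_stat_res(const uint8_t *buf, size_t len,
                        uint32_t *status, uint32_t *state)
{
        uint32_t w[2];

        if (len < sizeof(w))
                return -1;            /* short reply: -EIO in the kernel */
        memcpy(w, buf, sizeof(w));    /* avoids an unaligned dereference */
        *status = ntohl(w[0]);
        *state  = ntohl(w[1]);
        return 0;
}

int main(void)
{
        uint8_t reply[8] = { 0, 0, 0, 0, 0, 0, 0, 3 };  /* status 0, state 3 */
        uint32_t status, state;

        if (dec_stat_res(reply, sizeof(reply), &status, &state) == 0)
                printf("status %u state %u\n", status, state);
        return 0;
}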
273 544
274static int 545static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p,
275xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp) 546 struct nsm_res *resp)
276{ 547{
277 resp->state = ntohl(*p++); 548 struct xdr_stream xdr;
549
550 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
551 p = xdr_inline_decode(&xdr, sizeof(u32));
552 if (unlikely(p == NULL))
553 return -EIO;
554 resp->state = ntohl(*p);
555
556 dprintk("lockd: xdr_dec_stat state %d\n", resp->state);
278 return 0; 557 return 0;
279} 558}
280 559
@@ -288,22 +567,22 @@ xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
288#define SM_unmonres_sz 1 567#define SM_unmonres_sz 1
289 568
290static struct rpc_procinfo nsm_procedures[] = { 569static struct rpc_procinfo nsm_procedures[] = {
291[SM_MON] = { 570[NSMPROC_MON] = {
292 .p_proc = SM_MON, 571 .p_proc = NSMPROC_MON,
293 .p_encode = (kxdrproc_t) xdr_encode_mon, 572 .p_encode = (kxdrproc_t)xdr_enc_mon,
294 .p_decode = (kxdrproc_t) xdr_decode_stat_res, 573 .p_decode = (kxdrproc_t)xdr_dec_stat_res,
295 .p_arglen = SM_mon_sz, 574 .p_arglen = SM_mon_sz,
296 .p_replen = SM_monres_sz, 575 .p_replen = SM_monres_sz,
297 .p_statidx = SM_MON, 576 .p_statidx = NSMPROC_MON,
298 .p_name = "MONITOR", 577 .p_name = "MONITOR",
299 }, 578 },
300[SM_UNMON] = { 579[NSMPROC_UNMON] = {
301 .p_proc = SM_UNMON, 580 .p_proc = NSMPROC_UNMON,
302 .p_encode = (kxdrproc_t) xdr_encode_unmon, 581 .p_encode = (kxdrproc_t)xdr_enc_unmon,
303 .p_decode = (kxdrproc_t) xdr_decode_stat, 582 .p_decode = (kxdrproc_t)xdr_dec_stat,
304 .p_arglen = SM_mon_id_sz, 583 .p_arglen = SM_mon_id_sz,
305 .p_replen = SM_unmonres_sz, 584 .p_replen = SM_unmonres_sz,
306 .p_statidx = SM_UNMON, 585 .p_statidx = NSMPROC_UNMON,
307 .p_name = "UNMONITOR", 586 .p_name = "UNMONITOR",
308 }, 587 },
309}; 588};
@@ -322,7 +601,7 @@ static struct rpc_stat nsm_stats;
322 601
323static struct rpc_program nsm_program = { 602static struct rpc_program nsm_program = {
324 .name = "statd", 603 .name = "statd",
325 .number = SM_PROGRAM, 604 .number = NSM_PROGRAM,
326 .nrvers = ARRAY_SIZE(nsm_version), 605 .nrvers = ARRAY_SIZE(nsm_version),
327 .version = nsm_version, 606 .version = nsm_version,
328 .stats = &nsm_stats 607 .stats = &nsm_stats
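
nsm_procedures[] is the usual sunrpc pattern: the procedure number indexes an array whose entries carry the encode/decode callbacks and wire-size hints for that call. A stripped-down user-space model of the same table-driven dispatch (the names and stub bodies here are illustrative, not the kernel's):

#include <stdio.h>

typedef int (*encode_fn)(const void *args);

struct proc_info {
        unsigned int proc;        /* procedure number on the wire */
        encode_fn    encode;      /* argument marshaller */
        const char  *name;        /* for stats/debug output */
};

static int enc_mon(const void *args)   { (void)args; return 0; }
static int enc_unmon(const void *args) { (void)args; return 0; }

enum { PROC_MON = 2, PROC_UNMON = 3 }; /* NSM procedure numbers */

static const struct proc_info procs[] = {
        [PROC_MON]   = { PROC_MON,   enc_mon,   "MONITOR"   },
        [PROC_UNMON] = { PROC_UNMON, enc_unmon, "UNMONITOR" },
};

int main(void)
{
        const struct proc_info *p = &procs[PROC_MON];

        printf("calling %s (proc %u)\n", p->name, p->proc);
        return p->encode(NULL);
}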
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 56b076736b56..64f1c31b5853 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -35,7 +35,6 @@
35#include <linux/sunrpc/svcsock.h> 35#include <linux/sunrpc/svcsock.h>
36#include <net/ip.h> 36#include <net/ip.h>
37#include <linux/lockd/lockd.h> 37#include <linux/lockd/lockd.h>
38#include <linux/lockd/sm_inter.h>
39#include <linux/nfs.h> 38#include <linux/nfs.h>
40 39
41#define NLMDBG_FACILITY NLMDBG_SVC 40#define NLMDBG_FACILITY NLMDBG_SVC
@@ -45,7 +44,7 @@
45static struct svc_program nlmsvc_program; 44static struct svc_program nlmsvc_program;
46 45
47struct nlmsvc_binding * nlmsvc_ops; 46struct nlmsvc_binding * nlmsvc_ops;
48EXPORT_SYMBOL(nlmsvc_ops); 47EXPORT_SYMBOL_GPL(nlmsvc_ops);
49 48
50static DEFINE_MUTEX(nlmsvc_mutex); 49static DEFINE_MUTEX(nlmsvc_mutex);
51static unsigned int nlmsvc_users; 50static unsigned int nlmsvc_users;
@@ -54,13 +53,26 @@ static struct svc_rqst *nlmsvc_rqst;
54unsigned long nlmsvc_timeout; 53unsigned long nlmsvc_timeout;
55 54
56/* 55/*
56 * If the kernel has IPv6 support available, always listen for
57 * both AF_INET and AF_INET6 requests.
58 */
59#if (defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) && \
60 defined(CONFIG_SUNRPC_REGISTER_V4)
61static const sa_family_t nlmsvc_family = AF_INET6;
62#else /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
63static const sa_family_t nlmsvc_family = AF_INET;
64#endif /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
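
Declaring the service family as AF_INET6 when IPv6 is available gives one listener for both protocols, because IPv4 peers appear as v4-mapped IPv6 addresses on an AF_INET6 socket (as long as IPV6_V6ONLY is off). A user-space illustration of that dual-stack choice:

#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        int off = 0;
        struct sockaddr_in6 sa;
        int fd = socket(AF_INET6, SOCK_STREAM, 0);

        if (fd < 0)
                return 1;
        /* Explicitly allow v4-mapped peers; some systems default to v6-only. */
        setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &off, sizeof(off));

        memset(&sa, 0, sizeof(sa));
        sa.sin6_family = AF_INET6;
        sa.sin6_addr   = in6addr_any;  /* wildcard: both AF_INET and AF_INET6 */
        sa.sin6_port   = 0;            /* any port, for the demo */

        if (bind(fd, (struct sockaddr *)&sa, sizeof(sa)) == 0)
                puts("one AF_INET6 listener serves both address families");
        close(fd);
        return 0;
}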
65
66/*
57 * These can be set at insmod time (useful for NFS as root filesystem), 67 * These can be set at insmod time (useful for NFS as root filesystem),
58 * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 68 * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003
59 */ 69 */
60static unsigned long nlm_grace_period; 70static unsigned long nlm_grace_period;
61static unsigned long nlm_timeout = LOCKD_DFLT_TIMEO; 71static unsigned long nlm_timeout = LOCKD_DFLT_TIMEO;
62static int nlm_udpport, nlm_tcpport; 72static int nlm_udpport, nlm_tcpport;
63int nsm_use_hostnames = 0; 73
74/* RLIM_NOFILE defaults to 1024. That seems like a reasonable default here. */
75static unsigned int nlm_max_connections = 1024;
64 76
65/* 77/*
66 * Constants needed for the sysctl interface. 78 * Constants needed for the sysctl interface.
@@ -143,6 +155,9 @@ lockd(void *vrqstp)
143 long timeout = MAX_SCHEDULE_TIMEOUT; 155 long timeout = MAX_SCHEDULE_TIMEOUT;
144 RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); 156 RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
145 157
158 /* update sv_maxconn if it has changed */
159 rqstp->rq_server->sv_maxconn = nlm_max_connections;
160
146 if (signalled()) { 161 if (signalled()) {
147 flush_signals(current); 162 flush_signals(current);
148 if (nlmsvc_ops) { 163 if (nlmsvc_ops) {
@@ -189,6 +204,19 @@ lockd(void *vrqstp)
189 return 0; 204 return 0;
190} 205}
191 206
207static int create_lockd_listener(struct svc_serv *serv, char *name,
208 unsigned short port)
209{
210 struct svc_xprt *xprt;
211
212 xprt = svc_find_xprt(serv, name, 0, 0);
213 if (xprt == NULL)
214 return svc_create_xprt(serv, name, port, SVC_SOCK_DEFAULTS);
215
216 svc_xprt_put(xprt);
217 return 0;
218}
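
create_lockd_listener() factors out an idempotent "look up, else create" step: if a transport of that name already exists, the reference taken by svc_find_xprt() is simply dropped and nothing is created twice. The shape of that pattern, modelled in user space with a tiny refcounted registry (purely illustrative):

#include <stdio.h>
#include <string.h>

struct listener { const char *name; int refs; };

static struct listener table[4];

static struct listener *find(const char *name)
{
        for (int i = 0; i < 4; i++)
                if (table[i].name && strcmp(table[i].name, name) == 0) {
                        table[i].refs++;          /* like svc_find_xprt() */
                        return &table[i];
                }
        return NULL;
}

/* Create the listener only if it does not exist yet; either way the
 * caller ends up holding no extra reference. */
static int ensure_listener(const char *name)
{
        struct listener *l = find(name);

        if (l == NULL) {
                table[0] = (struct listener){ name, 1 };  /* "create" */
                return 0;
        }
        l->refs--;                    /* like svc_xprt_put() */
        return 0;
}

int main(void)
{
        ensure_listener("udp");
        ensure_listener("udp");       /* second call is a no-op */
        printf("udp refs: %d\n", table[0].refs);
        return 0;
}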
219
192/* 220/*
193 * Ensure there are active UDP and TCP listeners for lockd. 221 * Ensure there are active UDP and TCP listeners for lockd.
194 * 222 *
@@ -202,29 +230,23 @@ lockd(void *vrqstp)
202static int make_socks(struct svc_serv *serv) 230static int make_socks(struct svc_serv *serv)
203{ 231{
204 static int warned; 232 static int warned;
205 struct svc_xprt *xprt; 233 int err;
206 int err = 0;
207 234
208 xprt = svc_find_xprt(serv, "udp", 0, 0); 235 err = create_lockd_listener(serv, "udp", nlm_udpport);
209 if (!xprt) 236 if (err < 0)
210 err = svc_create_xprt(serv, "udp", nlm_udpport, 237 goto out_err;
211 SVC_SOCK_DEFAULTS); 238
212 else 239 err = create_lockd_listener(serv, "tcp", nlm_tcpport);
213 svc_xprt_put(xprt); 240 if (err < 0)
214 if (err >= 0) { 241 goto out_err;
215 xprt = svc_find_xprt(serv, "tcp", 0, 0); 242
216 if (!xprt) 243 warned = 0;
217 err = svc_create_xprt(serv, "tcp", nlm_tcpport, 244 return 0;
218 SVC_SOCK_DEFAULTS); 245
219 else 246out_err:
220 svc_xprt_put(xprt); 247 if (warned++ == 0)
221 }
222 if (err >= 0) {
223 warned = 0;
224 err = 0;
225 } else if (warned++ == 0)
226 printk(KERN_WARNING 248 printk(KERN_WARNING
227 "lockd_up: makesock failed, error=%d\n", err); 249 "lockd_up: makesock failed, error=%d\n", err);
228 return err; 250 return err;
229} 251}
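
The rewritten make_socks() collapses the nested error handling into one goto label and keeps the classic "warn once" ratelimit: the static counter is reset on every success, so only the first failure after a success prints. In miniature:

#include <stdio.h>

static int try_listen(int fail) { return fail ? -5 : 0; }

static int make_socks_demo(int fail)
{
        static int warned;
        int err;

        err = try_listen(fail);
        if (err < 0)
                goto out_err;
        warned = 0;                   /* success re-arms the warning */
        return 0;

out_err:
        if (warned++ == 0)            /* print only the first failure */
                fprintf(stderr, "makesock failed, error=%d\n", err);
        return err;
}

int main(void)
{
        make_socks_demo(1);   /* warns */
        make_socks_demo(1);   /* silent */
        make_socks_demo(0);   /* succeeds, re-arms */
        make_socks_demo(1);   /* warns again */
        return 0;
}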
230 252
@@ -252,7 +274,7 @@ int lockd_up(void)
252 "lockd_up: no pid, %d users??\n", nlmsvc_users); 274 "lockd_up: no pid, %d users??\n", nlmsvc_users);
253 275
254 error = -ENOMEM; 276 error = -ENOMEM;
255 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, AF_INET, NULL); 277 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, nlmsvc_family, NULL);
256 if (!serv) { 278 if (!serv) {
257 printk(KERN_WARNING "lockd_up: create service failed\n"); 279 printk(KERN_WARNING "lockd_up: create service failed\n");
258 goto out; 280 goto out;
@@ -276,6 +298,7 @@ int lockd_up(void)
276 } 298 }
277 299
278 svc_sock_update_bufs(serv); 300 svc_sock_update_bufs(serv);
301 serv->sv_maxconn = nlm_max_connections;
279 302
280 nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name); 303 nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name);
281 if (IS_ERR(nlmsvc_task)) { 304 if (IS_ERR(nlmsvc_task)) {
@@ -300,7 +323,7 @@ out:
300 mutex_unlock(&nlmsvc_mutex); 323 mutex_unlock(&nlmsvc_mutex);
301 return error; 324 return error;
302} 325}
303EXPORT_SYMBOL(lockd_up); 326EXPORT_SYMBOL_GPL(lockd_up);
304 327
305/* 328/*
306 * Decrement the user count and bring down lockd if we're the last. 329 * Decrement the user count and bring down lockd if we're the last.
@@ -329,7 +352,7 @@ lockd_down(void)
329out: 352out:
330 mutex_unlock(&nlmsvc_mutex); 353 mutex_unlock(&nlmsvc_mutex);
331} 354}
332EXPORT_SYMBOL(lockd_down); 355EXPORT_SYMBOL_GPL(lockd_down);
333 356
334#ifdef CONFIG_SYSCTL 357#ifdef CONFIG_SYSCTL
335 358
@@ -485,6 +508,7 @@ module_param_call(nlm_udpport, param_set_port, param_get_int,
485module_param_call(nlm_tcpport, param_set_port, param_get_int, 508module_param_call(nlm_tcpport, param_set_port, param_get_int,
486 &nlm_tcpport, 0644); 509 &nlm_tcpport, 0644);
487module_param(nsm_use_hostnames, bool, 0644); 510module_param(nsm_use_hostnames, bool, 0644);
511module_param(nlm_max_connections, uint, 0644);
488 512
489/* 513/*
490 * Initialising and terminating the module. 514 * Initialising and terminating the module.
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 4dfdcbc6bf68..1725037374c5 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -16,8 +16,6 @@
16#include <linux/nfsd/nfsd.h> 16#include <linux/nfsd/nfsd.h>
17#include <linux/lockd/lockd.h> 17#include <linux/lockd/lockd.h>
18#include <linux/lockd/share.h> 18#include <linux/lockd/share.h>
19#include <linux/lockd/sm_inter.h>
20
21 19
22#define NLMDBG_FACILITY NLMDBG_CLIENT 20#define NLMDBG_FACILITY NLMDBG_CLIENT
23 21
@@ -419,8 +417,6 @@ static __be32
419nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, 417nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
420 void *resp) 418 void *resp)
421{ 419{
422 struct sockaddr_in saddr;
423
424 dprintk("lockd: SM_NOTIFY called\n"); 420 dprintk("lockd: SM_NOTIFY called\n");
425 421
426 if (!nlm_privileged_requester(rqstp)) { 422 if (!nlm_privileged_requester(rqstp)) {
@@ -430,14 +426,7 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
430 return rpc_system_err; 426 return rpc_system_err;
431 } 427 }
432 428
433 /* Obtain the host pointer for this NFS server and try to 429 nlm_host_rebooted(argp);
434 * reclaim all locks we hold on this server.
435 */
436 memset(&saddr, 0, sizeof(saddr));
437 saddr.sin_family = AF_INET;
438 saddr.sin_addr.s_addr = argp->addr;
439 nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
440
441 return rpc_success; 430 return rpc_success;
442} 431}
443 432
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 3ca89e2a9381..3688e55901fc 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -16,8 +16,6 @@
16#include <linux/nfsd/nfsd.h> 16#include <linux/nfsd/nfsd.h>
17#include <linux/lockd/lockd.h> 17#include <linux/lockd/lockd.h>
18#include <linux/lockd/share.h> 18#include <linux/lockd/share.h>
19#include <linux/lockd/sm_inter.h>
20
21 19
22#define NLMDBG_FACILITY NLMDBG_CLIENT 20#define NLMDBG_FACILITY NLMDBG_CLIENT
23 21
@@ -451,8 +449,6 @@ static __be32
451nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, 449nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
452 void *resp) 450 void *resp)
453{ 451{
454 struct sockaddr_in saddr;
455
456 dprintk("lockd: SM_NOTIFY called\n"); 452 dprintk("lockd: SM_NOTIFY called\n");
457 453
458 if (!nlm_privileged_requester(rqstp)) { 454 if (!nlm_privileged_requester(rqstp)) {
@@ -462,14 +458,7 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
462 return rpc_system_err; 458 return rpc_system_err;
463 } 459 }
464 460
465 /* Obtain the host pointer for this NFS server and try to 461 nlm_host_rebooted(argp);
466 * reclaim all locks we hold on this server.
467 */
468 memset(&saddr, 0, sizeof(saddr));
469 saddr.sin_family = AF_INET;
470 saddr.sin_addr.s_addr = argp->addr;
471 nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
472
473 return rpc_success; 462 return rpc_success;
474} 463}
475 464
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 34c2766e27c7..9e4d6aab611b 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -17,7 +17,6 @@
17#include <linux/nfsd/export.h> 17#include <linux/nfsd/export.h>
18#include <linux/lockd/lockd.h> 18#include <linux/lockd/lockd.h>
19#include <linux/lockd/share.h> 19#include <linux/lockd/share.h>
20#include <linux/lockd/sm_inter.h>
21#include <linux/module.h> 20#include <linux/module.h>
22#include <linux/mount.h> 21#include <linux/mount.h>
23 22
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 1f226290c67c..0336f2beacde 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -16,7 +16,6 @@
16#include <linux/sunrpc/svc.h> 16#include <linux/sunrpc/svc.h>
17#include <linux/sunrpc/stats.h> 17#include <linux/sunrpc/stats.h>
18#include <linux/lockd/lockd.h> 18#include <linux/lockd/lockd.h>
19#include <linux/lockd/sm_inter.h>
20 19
21#define NLMDBG_FACILITY NLMDBG_XDR 20#define NLMDBG_FACILITY NLMDBG_XDR
22 21
@@ -349,8 +348,8 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp)
349 if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) 348 if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
350 return 0; 349 return 0;
351 argp->state = ntohl(*p++); 350 argp->state = ntohl(*p++);
352 /* Preserve the address in network byte order */ 351 memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
353 argp->addr = *p++; 352 p += XDR_QUADLEN(SM_PRIV_SIZE);
354 return xdr_argsize_check(rqstp, p); 353 return xdr_argsize_check(rqstp, p);
355} 354}
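
The reboot decoder now copies the whole opaque cookie instead of just the first word, then advances by XDR_QUADLEN(SM_PRIV_SIZE) 32-bit words. XDR_QUADLEN rounds a byte count up to whole four-byte words; with a 16-byte cookie that is four words. A sketch of the arithmetic:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SM_PRIV_SIZE    16                       /* NSM cookie size */
#define XDR_QUADLEN(l)  (((l) + 3) >> 2)         /* bytes -> u32 words */

int main(void)
{
        uint32_t stream[8] = { 0 };              /* pretend receive buffer */
        uint8_t priv[SM_PRIV_SIZE];
        const uint32_t *p = stream;

        memcpy(priv, p, sizeof(priv));           /* whole cookie, not *p++ */
        p += XDR_QUADLEN(SM_PRIV_SIZE);          /* skip 4 words */
        printf("advanced %td words\n", p - stream);
        return 0;
}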
356 355
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 50c493a8ad8e..e1d528653192 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -17,7 +17,6 @@
17#include <linux/sunrpc/svc.h> 17#include <linux/sunrpc/svc.h>
18#include <linux/sunrpc/stats.h> 18#include <linux/sunrpc/stats.h>
19#include <linux/lockd/lockd.h> 19#include <linux/lockd/lockd.h>
20#include <linux/lockd/sm_inter.h>
21 20
22#define NLMDBG_FACILITY NLMDBG_XDR 21#define NLMDBG_FACILITY NLMDBG_XDR
23 22
@@ -356,8 +355,8 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp
356 if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) 355 if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
357 return 0; 356 return 0;
358 argp->state = ntohl(*p++); 357 argp->state = ntohl(*p++);
359 /* Preserve the address in network byte order */ 358 memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
360 argp->addr = *p++; 359 p += XDR_QUADLEN(SM_PRIV_SIZE);
361 return xdr_argsize_check(rqstp, p); 360 return xdr_argsize_check(rqstp, p);
362} 361}
363 362
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index f70433816a38..d4946c4c90e2 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -280,7 +280,7 @@ int minix_add_link(struct dentry *dentry, struct inode *inode)
280 return -EINVAL; 280 return -EINVAL;
281 281
282got_it: 282got_it:
283 pos = (page->index >> PAGE_CACHE_SHIFT) + p - (char*)page_address(page); 283 pos = page_offset(page) + p - (char *)page_address(page);
284 err = __minix_write_begin(NULL, page->mapping, pos, sbi->s_dirsize, 284 err = __minix_write_begin(NULL, page->mapping, pos, sbi->s_dirsize,
285 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); 285 AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
286 if (err) 286 if (err)
diff --git a/fs/mpage.c b/fs/mpage.c
index 552b80b3facc..16c3ef37eae3 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -241,7 +241,6 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
241 first_hole = page_block; 241 first_hole = page_block;
242 page_block++; 242 page_block++;
243 block_in_file++; 243 block_in_file++;
244 clear_buffer_mapped(map_bh);
245 continue; 244 continue;
246 } 245 }
247 246
@@ -308,7 +307,10 @@ alloc_new:
308 goto alloc_new; 307 goto alloc_new;
309 } 308 }
310 309
311 if (buffer_boundary(map_bh) || (first_hole != blocks_per_page)) 310 relative_block = block_in_file - *first_logical_block;
311 nblocks = map_bh->b_size >> blkbits;
312 if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
313 (first_hole != blocks_per_page))
312 bio = mpage_bio_submit(READ, bio); 314 bio = mpage_bio_submit(READ, bio);
313 else 315 else
314 *last_block_in_bio = blocks[blocks_per_page - 1]; 316 *last_block_in_bio = blocks[blocks_per_page - 1];
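
The extra condition fixes over-eager BIO submission: a buffer_boundary() extent should force a submit only once the page has consumed the extent's last block, so the code recomputes which block of the mapping it just used (relative_block) and compares it against the extent length in blocks (nblocks). The arithmetic, roughly, with made-up values:

#include <stdio.h>

int main(void)
{
        unsigned blkbits = 12;                 /* 4096-byte blocks, say */
        unsigned long long b_size = 4 << 12;   /* extent maps 4 blocks */
        unsigned long first_logical_block = 100;
        unsigned long block_in_file = 104;     /* one past the last block placed */
        int boundary = 1;                      /* buffer_boundary() said so */

        unsigned long relative_block = block_in_file - first_logical_block;
        unsigned long nblocks = (unsigned long)(b_size >> blkbits);

        /* Submit only at the true end of the boundary extent. */
        if (boundary && relative_block == nblocks)
                puts("submit bio");
        else
                puts("keep accumulating");
        return 0;
}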
diff --git a/fs/namei.c b/fs/namei.c
index af3783fff1de..f05bed242422 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -226,6 +226,16 @@ int generic_permission(struct inode *inode, int mask,
226 return -EACCES; 226 return -EACCES;
227} 227}
228 228
229/**
230 * inode_permission - check for access rights to a given inode
231 * @inode: inode to check permission on
232 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
233 *
234 * Used to check for read/write/execute permissions on an inode.
235 * We use "fsuid" for this, letting us set arbitrary permissions
236 * for filesystem access without changing the "normal" uids which
237 * are used for other things.
238 */
229int inode_permission(struct inode *inode, int mask) 239int inode_permission(struct inode *inode, int mask)
230{ 240{
231 int retval; 241 int retval;
@@ -247,8 +257,7 @@ int inode_permission(struct inode *inode, int mask)
247 return -EACCES; 257 return -EACCES;
248 } 258 }
249 259
250 /* Ordinary permission routines do not understand MAY_APPEND. */ 260 if (inode->i_op->permission)
251 if (inode->i_op && inode->i_op->permission)
252 retval = inode->i_op->permission(inode, mask); 261 retval = inode->i_op->permission(inode, mask);
253 else 262 else
254 retval = generic_permission(inode, mask, NULL); 263 retval = generic_permission(inode, mask, NULL);
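
With every inode now guaranteed a non-NULL i_op (a prerequisite this series establishes by dropping the "!inode->i_op" checks below), permission checking reduces to the common "use the override if the object provides one, else the generic default" dispatch. The pattern, with hypothetical names:

#include <stdio.h>

struct ops  { int (*permission)(int mask); };
struct node { const struct ops *ops; };

static int generic_permission(int mask) { (void)mask; return 0; }

/* ops itself is assumed non-NULL; only the method slot is optional. */
static int node_permission(const struct node *n, int mask)
{
        if (n->ops->permission)
                return n->ops->permission(mask);
        return generic_permission(mask);
}

int main(void)
{
        static const struct ops plain = { 0 };  /* no override installed */
        struct node n = { &plain };

        printf("result: %d\n", node_permission(&n, 4));
        return 0;
}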
@@ -265,21 +274,6 @@ int inode_permission(struct inode *inode, int mask)
265} 274}
266 275
267/** 276/**
268 * vfs_permission - check for access rights to a given path
269 * @nd: lookup result that describes the path
270 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
271 *
272 * Used to check for read/write/execute permissions on a path.
273 * We use "fsuid" for this, letting us set arbitrary permissions
274 * for filesystem access without changing the "normal" uids which
275 * are used for other things.
276 */
277int vfs_permission(struct nameidata *nd, int mask)
278{
279 return inode_permission(nd->path.dentry->d_inode, mask);
280}
281
282/**
283 * file_permission - check for additional access rights to a given file 277 * file_permission - check for additional access rights to a given file
284 * @file: file to check access rights for 278 * @file: file to check access rights for
285 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) 279 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
@@ -289,7 +283,7 @@ int vfs_permission(struct nameidata *nd, int mask)
289 * 283 *
290 * Note: 284 * Note:
291 * Do not use this function in new code. All access checks should 285 * Do not use this function in new code. All access checks should
292 * be done using vfs_permission(). 286 * be done using inode_permission().
293 */ 287 */
294int file_permission(struct file *file, int mask) 288int file_permission(struct file *file, int mask)
295{ 289{
@@ -438,7 +432,7 @@ static int exec_permission_lite(struct inode *inode)
438{ 432{
439 umode_t mode = inode->i_mode; 433 umode_t mode = inode->i_mode;
440 434
441 if (inode->i_op && inode->i_op->permission) 435 if (inode->i_op->permission)
442 return -EAGAIN; 436 return -EAGAIN;
443 437
444 if (current_fsuid() == inode->i_uid) 438 if (current_fsuid() == inode->i_uid)
@@ -527,18 +521,6 @@ out_unlock:
527 return result; 521 return result;
528} 522}
529 523
530/* SMP-safe */
531static __always_inline void
532walk_init_root(const char *name, struct nameidata *nd)
533{
534 struct fs_struct *fs = current->fs;
535
536 read_lock(&fs->lock);
537 nd->path = fs->root;
538 path_get(&fs->root);
539 read_unlock(&fs->lock);
540}
541
542/* 524/*
543 * Wrapper to retry pathname resolution whenever the underlying 525 * Wrapper to retry pathname resolution whenever the underlying
544 * file system returns an ESTALE. 526 * file system returns an ESTALE.
@@ -576,9 +558,16 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
576 goto fail; 558 goto fail;
577 559
578 if (*link == '/') { 560 if (*link == '/') {
561 struct fs_struct *fs = current->fs;
562
579 path_put(&nd->path); 563 path_put(&nd->path);
580 walk_init_root(link, nd); 564
565 read_lock(&fs->lock);
566 nd->path = fs->root;
567 path_get(&fs->root);
568 read_unlock(&fs->lock);
581 } 569 }
570
582 res = link_path_walk(link, nd); 571 res = link_path_walk(link, nd);
583 if (nd->depth || res || nd->last_type!=LAST_NORM) 572 if (nd->depth || res || nd->last_type!=LAST_NORM)
584 return res; 573 return res;
@@ -859,7 +848,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
859 nd->flags |= LOOKUP_CONTINUE; 848 nd->flags |= LOOKUP_CONTINUE;
860 err = exec_permission_lite(inode); 849 err = exec_permission_lite(inode);
861 if (err == -EAGAIN) 850 if (err == -EAGAIN)
862 err = vfs_permission(nd, MAY_EXEC); 851 err = inode_permission(nd->path.dentry->d_inode,
852 MAY_EXEC);
863 if (err) 853 if (err)
864 break; 854 break;
865 855
@@ -918,9 +908,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
918 inode = next.dentry->d_inode; 908 inode = next.dentry->d_inode;
919 if (!inode) 909 if (!inode)
920 goto out_dput; 910 goto out_dput;
921 err = -ENOTDIR;
922 if (!inode->i_op)
923 goto out_dput;
924 911
925 if (inode->i_op->follow_link) { 912 if (inode->i_op->follow_link) {
926 err = do_follow_link(&next, nd); 913 err = do_follow_link(&next, nd);
@@ -930,9 +917,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
930 inode = nd->path.dentry->d_inode; 917 inode = nd->path.dentry->d_inode;
931 if (!inode) 918 if (!inode)
932 break; 919 break;
933 err = -ENOTDIR;
934 if (!inode->i_op)
935 break;
936 } else 920 } else
937 path_to_nameidata(&next, nd); 921 path_to_nameidata(&next, nd);
938 err = -ENOTDIR; 922 err = -ENOTDIR;
@@ -971,7 +955,7 @@ last_component:
971 break; 955 break;
972 inode = next.dentry->d_inode; 956 inode = next.dentry->d_inode;
973 if ((lookup_flags & LOOKUP_FOLLOW) 957 if ((lookup_flags & LOOKUP_FOLLOW)
974 && inode && inode->i_op && inode->i_op->follow_link) { 958 && inode && inode->i_op->follow_link) {
975 err = do_follow_link(&next, nd); 959 err = do_follow_link(&next, nd);
976 if (err) 960 if (err)
977 goto return_err; 961 goto return_err;
@@ -983,7 +967,7 @@ last_component:
983 break; 967 break;
984 if (lookup_flags & LOOKUP_DIRECTORY) { 968 if (lookup_flags & LOOKUP_DIRECTORY) {
985 err = -ENOTDIR; 969 err = -ENOTDIR;
986 if (!inode->i_op || !inode->i_op->lookup) 970 if (!inode->i_op->lookup)
987 break; 971 break;
988 } 972 }
989 goto return_base; 973 goto return_base;
@@ -1479,7 +1463,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
1479 if (error) 1463 if (error)
1480 return error; 1464 return error;
1481 1465
1482 if (!dir->i_op || !dir->i_op->create) 1466 if (!dir->i_op->create)
1483 return -EACCES; /* shouldn't it be ENOSYS? */ 1467 return -EACCES; /* shouldn't it be ENOSYS? */
1484 mode &= S_IALLUGO; 1468 mode &= S_IALLUGO;
1485 mode |= S_IFREG; 1469 mode |= S_IFREG;
@@ -1493,9 +1477,9 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
1493 return error; 1477 return error;
1494} 1478}
1495 1479
1496int may_open(struct nameidata *nd, int acc_mode, int flag) 1480int may_open(struct path *path, int acc_mode, int flag)
1497{ 1481{
1498 struct dentry *dentry = nd->path.dentry; 1482 struct dentry *dentry = path->dentry;
1499 struct inode *inode = dentry->d_inode; 1483 struct inode *inode = dentry->d_inode;
1500 int error; 1484 int error;
1501 1485
@@ -1516,13 +1500,13 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
1516 if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { 1500 if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
1517 flag &= ~O_TRUNC; 1501 flag &= ~O_TRUNC;
1518 } else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) { 1502 } else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
1519 if (nd->path.mnt->mnt_flags & MNT_NODEV) 1503 if (path->mnt->mnt_flags & MNT_NODEV)
1520 return -EACCES; 1504 return -EACCES;
1521 1505
1522 flag &= ~O_TRUNC; 1506 flag &= ~O_TRUNC;
1523 } 1507 }
1524 1508
1525 error = vfs_permission(nd, acc_mode); 1509 error = inode_permission(inode, acc_mode);
1526 if (error) 1510 if (error)
1527 return error; 1511 return error;
1528 /* 1512 /*
@@ -1556,6 +1540,9 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
1556 * Refuse to truncate files with mandatory locks held on them. 1540 * Refuse to truncate files with mandatory locks held on them.
1557 */ 1541 */
1558 error = locks_verify_locked(inode); 1542 error = locks_verify_locked(inode);
1543 if (!error)
1544 error = security_path_truncate(path, 0,
1545 ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
1559 if (!error) { 1546 if (!error) {
1560 DQUOT_INIT(inode); 1547 DQUOT_INIT(inode);
1561 1548
@@ -1586,14 +1573,18 @@ static int __open_namei_create(struct nameidata *nd, struct path *path,
1586 1573
1587 if (!IS_POSIXACL(dir->d_inode)) 1574 if (!IS_POSIXACL(dir->d_inode))
1588 mode &= ~current->fs->umask; 1575 mode &= ~current->fs->umask;
1576 error = security_path_mknod(&nd->path, path->dentry, mode, 0);
1577 if (error)
1578 goto out_unlock;
1589 error = vfs_create(dir->d_inode, path->dentry, mode, nd); 1579 error = vfs_create(dir->d_inode, path->dentry, mode, nd);
1580out_unlock:
1590 mutex_unlock(&dir->d_inode->i_mutex); 1581 mutex_unlock(&dir->d_inode->i_mutex);
1591 dput(nd->path.dentry); 1582 dput(nd->path.dentry);
1592 nd->path.dentry = path->dentry; 1583 nd->path.dentry = path->dentry;
1593 if (error) 1584 if (error)
1594 return error; 1585 return error;
1595 /* Don't check for write permission, don't truncate */ 1586 /* Don't check for write permission, don't truncate */
1596 return may_open(nd, 0, flag & ~O_TRUNC); 1587 return may_open(&nd->path, 0, flag & ~O_TRUNC);
1597} 1588}
1598 1589
1599/* 1590/*
@@ -1755,7 +1746,7 @@ do_last:
1755 error = -ENOENT; 1746 error = -ENOENT;
1756 if (!path.dentry->d_inode) 1747 if (!path.dentry->d_inode)
1757 goto exit_dput; 1748 goto exit_dput;
1758 if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link) 1749 if (path.dentry->d_inode->i_op->follow_link)
1759 goto do_link; 1750 goto do_link;
1760 1751
1761 path_to_nameidata(&path, &nd); 1752 path_to_nameidata(&path, &nd);
@@ -1779,7 +1770,7 @@ ok:
1779 if (error) 1770 if (error)
1780 goto exit; 1771 goto exit;
1781 } 1772 }
1782 error = may_open(&nd, acc_mode, flag); 1773 error = may_open(&nd.path, acc_mode, flag);
1783 if (error) { 1774 if (error) {
1784 if (will_write) 1775 if (will_write)
1785 mnt_drop_write(nd.path.mnt); 1776 mnt_drop_write(nd.path.mnt);
@@ -1936,7 +1927,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1936 if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) 1927 if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
1937 return -EPERM; 1928 return -EPERM;
1938 1929
1939 if (!dir->i_op || !dir->i_op->mknod) 1930 if (!dir->i_op->mknod)
1940 return -EPERM; 1931 return -EPERM;
1941 1932
1942 error = devcgroup_inode_mknod(mode, dev); 1933 error = devcgroup_inode_mknod(mode, dev);
@@ -1999,6 +1990,9 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
1999 error = mnt_want_write(nd.path.mnt); 1990 error = mnt_want_write(nd.path.mnt);
2000 if (error) 1991 if (error)
2001 goto out_dput; 1992 goto out_dput;
1993 error = security_path_mknod(&nd.path, dentry, mode, dev);
1994 if (error)
1995 goto out_drop_write;
2002 switch (mode & S_IFMT) { 1996 switch (mode & S_IFMT) {
2003 case 0: case S_IFREG: 1997 case 0: case S_IFREG:
2004 error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd); 1998 error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd);
@@ -2011,6 +2005,7 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
2011 error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0); 2005 error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0);
2012 break; 2006 break;
2013 } 2007 }
2008out_drop_write:
2014 mnt_drop_write(nd.path.mnt); 2009 mnt_drop_write(nd.path.mnt);
2015out_dput: 2010out_dput:
2016 dput(dentry); 2011 dput(dentry);
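
This sys_mknodat() hunk shows the shape that every one of the following namei hunks repeats (mkdir, rmdir, unlink, symlink, link, rename): take the write reference, ask the LSM path hook, only then call the VFS operation, and unwind through labels in reverse order. A skeleton of that ordering, with stubbed-out helpers:

#include <stdio.h>

static int  mnt_want_write(void)     { return 0; }
static void mnt_drop_write(void)     { }
static int  security_path_hook(void) { return 0; }  /* LSM veto point */
static int  vfs_operation(void)      { return 0; }

static int do_namei_op(void)
{
        int error = mnt_want_write();

        if (error)
                goto out;
        error = security_path_hook();   /* may deny before any change */
        if (error)
                goto out_drop_write;
        error = vfs_operation();        /* the actual filesystem change */
out_drop_write:
        mnt_drop_write();               /* always paired with _want_ */
out:
        return error;
}

int main(void) { return do_namei_op(); }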
@@ -2034,7 +2029,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2034 if (error) 2029 if (error)
2035 return error; 2030 return error;
2036 2031
2037 if (!dir->i_op || !dir->i_op->mkdir) 2032 if (!dir->i_op->mkdir)
2038 return -EPERM; 2033 return -EPERM;
2039 2034
2040 mode &= (S_IRWXUGO|S_ISVTX); 2035 mode &= (S_IRWXUGO|S_ISVTX);
@@ -2070,7 +2065,11 @@ asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode)
2070 error = mnt_want_write(nd.path.mnt); 2065 error = mnt_want_write(nd.path.mnt);
2071 if (error) 2066 if (error)
2072 goto out_dput; 2067 goto out_dput;
2068 error = security_path_mkdir(&nd.path, dentry, mode);
2069 if (error)
2070 goto out_drop_write;
2073 error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode); 2071 error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode);
2072out_drop_write:
2074 mnt_drop_write(nd.path.mnt); 2073 mnt_drop_write(nd.path.mnt);
2075out_dput: 2074out_dput:
2076 dput(dentry); 2075 dput(dentry);
@@ -2121,7 +2120,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2121 if (error) 2120 if (error)
2122 return error; 2121 return error;
2123 2122
2124 if (!dir->i_op || !dir->i_op->rmdir) 2123 if (!dir->i_op->rmdir)
2125 return -EPERM; 2124 return -EPERM;
2126 2125
2127 DQUOT_INIT(dir); 2126 DQUOT_INIT(dir);
@@ -2180,7 +2179,11 @@ static long do_rmdir(int dfd, const char __user *pathname)
2180 error = mnt_want_write(nd.path.mnt); 2179 error = mnt_want_write(nd.path.mnt);
2181 if (error) 2180 if (error)
2182 goto exit3; 2181 goto exit3;
2182 error = security_path_rmdir(&nd.path, dentry);
2183 if (error)
2184 goto exit4;
2183 error = vfs_rmdir(nd.path.dentry->d_inode, dentry); 2185 error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
2186exit4:
2184 mnt_drop_write(nd.path.mnt); 2187 mnt_drop_write(nd.path.mnt);
2185exit3: 2188exit3:
2186 dput(dentry); 2189 dput(dentry);
@@ -2204,7 +2207,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
2204 if (error) 2207 if (error)
2205 return error; 2208 return error;
2206 2209
2207 if (!dir->i_op || !dir->i_op->unlink) 2210 if (!dir->i_op->unlink)
2208 return -EPERM; 2211 return -EPERM;
2209 2212
2210 DQUOT_INIT(dir); 2213 DQUOT_INIT(dir);
@@ -2265,7 +2268,11 @@ static long do_unlinkat(int dfd, const char __user *pathname)
2265 error = mnt_want_write(nd.path.mnt); 2268 error = mnt_want_write(nd.path.mnt);
2266 if (error) 2269 if (error)
2267 goto exit2; 2270 goto exit2;
2271 error = security_path_unlink(&nd.path, dentry);
2272 if (error)
2273 goto exit3;
2268 error = vfs_unlink(nd.path.dentry->d_inode, dentry); 2274 error = vfs_unlink(nd.path.dentry->d_inode, dentry);
2275exit3:
2269 mnt_drop_write(nd.path.mnt); 2276 mnt_drop_write(nd.path.mnt);
2270 exit2: 2277 exit2:
2271 dput(dentry); 2278 dput(dentry);
@@ -2307,7 +2314,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
2307 if (error) 2314 if (error)
2308 return error; 2315 return error;
2309 2316
2310 if (!dir->i_op || !dir->i_op->symlink) 2317 if (!dir->i_op->symlink)
2311 return -EPERM; 2318 return -EPERM;
2312 2319
2313 error = security_inode_symlink(dir, dentry, oldname); 2320 error = security_inode_symlink(dir, dentry, oldname);
@@ -2346,7 +2353,11 @@ asmlinkage long sys_symlinkat(const char __user *oldname,
2346 error = mnt_want_write(nd.path.mnt); 2353 error = mnt_want_write(nd.path.mnt);
2347 if (error) 2354 if (error)
2348 goto out_dput; 2355 goto out_dput;
2356 error = security_path_symlink(&nd.path, dentry, from);
2357 if (error)
2358 goto out_drop_write;
2349 error = vfs_symlink(nd.path.dentry->d_inode, dentry, from); 2359 error = vfs_symlink(nd.path.dentry->d_inode, dentry, from);
2360out_drop_write:
2350 mnt_drop_write(nd.path.mnt); 2361 mnt_drop_write(nd.path.mnt);
2351out_dput: 2362out_dput:
2352 dput(dentry); 2363 dput(dentry);
@@ -2384,7 +2395,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
2384 */ 2395 */
2385 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 2396 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2386 return -EPERM; 2397 return -EPERM;
2387 if (!dir->i_op || !dir->i_op->link) 2398 if (!dir->i_op->link)
2388 return -EPERM; 2399 return -EPERM;
2389 if (S_ISDIR(inode->i_mode)) 2400 if (S_ISDIR(inode->i_mode))
2390 return -EPERM; 2401 return -EPERM;
@@ -2443,7 +2454,11 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
2443 error = mnt_want_write(nd.path.mnt); 2454 error = mnt_want_write(nd.path.mnt);
2444 if (error) 2455 if (error)
2445 goto out_dput; 2456 goto out_dput;
2457 error = security_path_link(old_path.dentry, &nd.path, new_dentry);
2458 if (error)
2459 goto out_drop_write;
2446 error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry); 2460 error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry);
2461out_drop_write:
2447 mnt_drop_write(nd.path.mnt); 2462 mnt_drop_write(nd.path.mnt);
2448out_dput: 2463out_dput:
2449 dput(new_dentry); 2464 dput(new_dentry);
@@ -2587,7 +2602,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
2587 if (error) 2602 if (error)
2588 return error; 2603 return error;
2589 2604
2590 if (!old_dir->i_op || !old_dir->i_op->rename) 2605 if (!old_dir->i_op->rename)
2591 return -EPERM; 2606 return -EPERM;
2592 2607
2593 DQUOT_INIT(old_dir); 2608 DQUOT_INIT(old_dir);
@@ -2679,8 +2694,13 @@ asmlinkage long sys_renameat(int olddfd, const char __user *oldname,
2679 error = mnt_want_write(oldnd.path.mnt); 2694 error = mnt_want_write(oldnd.path.mnt);
2680 if (error) 2695 if (error)
2681 goto exit5; 2696 goto exit5;
2697 error = security_path_rename(&oldnd.path, old_dentry,
2698 &newnd.path, new_dentry);
2699 if (error)
2700 goto exit6;
2682 error = vfs_rename(old_dir->d_inode, old_dentry, 2701 error = vfs_rename(old_dir->d_inode, old_dentry,
2683 new_dir->d_inode, new_dentry); 2702 new_dir->d_inode, new_dentry);
2703exit6:
2684 mnt_drop_write(oldnd.path.mnt); 2704 mnt_drop_write(oldnd.path.mnt);
2685exit5: 2705exit5:
2686 dput(new_dentry); 2706 dput(new_dentry);
@@ -2750,13 +2770,16 @@ int vfs_follow_link(struct nameidata *nd, const char *link)
2750/* get the link contents into pagecache */ 2770/* get the link contents into pagecache */
2751static char *page_getlink(struct dentry * dentry, struct page **ppage) 2771static char *page_getlink(struct dentry * dentry, struct page **ppage)
2752{ 2772{
2753 struct page * page; 2773 char *kaddr;
2774 struct page *page;
2754 struct address_space *mapping = dentry->d_inode->i_mapping; 2775 struct address_space *mapping = dentry->d_inode->i_mapping;
2755 page = read_mapping_page(mapping, 0, NULL); 2776 page = read_mapping_page(mapping, 0, NULL);
2756 if (IS_ERR(page)) 2777 if (IS_ERR(page))
2757 return (char*)page; 2778 return (char*)page;
2758 *ppage = page; 2779 *ppage = page;
2759 return kmap(page); 2780 kaddr = kmap(page);
2781 nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1);
2782 return kaddr;
2760} 2783}
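
page_getlink() now NUL-terminates the pagecache copy of the symlink body before handing it to string-based callers; nd_terminate_link() writes the terminator at min(link length, buffer max), so a corrupted on-disk length can never send strlen() off the end of the page. Its effect, in plain C:

#include <stdio.h>
#include <string.h>

/* Terminate at whichever comes first: the recorded link length or
 * the last byte of the buffer (PAGE_SIZE - 1 in the kernel). */
static void terminate_link(char *buf, size_t len, size_t maxlen)
{
        buf[len < maxlen ? len : maxlen] = '\0';
}

int main(void)
{
        char page[16];

        memset(page, 'A', sizeof(page));            /* no terminator "on disk" */
        terminate_link(page, 64, sizeof(page) - 1); /* bogus length: clamped */
        printf("safe strlen: %zu\n", strlen(page));
        return 0;
}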
2761 2784
2762int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) 2785int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
@@ -2788,18 +2811,23 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
2788 } 2811 }
2789} 2812}
2790 2813
2791int __page_symlink(struct inode *inode, const char *symname, int len, 2814/*
2792 gfp_t gfp_mask) 2815 * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
2816 */
2817int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
2793{ 2818{
2794 struct address_space *mapping = inode->i_mapping; 2819 struct address_space *mapping = inode->i_mapping;
2795 struct page *page; 2820 struct page *page;
2796 void *fsdata; 2821 void *fsdata;
2797 int err; 2822 int err;
2798 char *kaddr; 2823 char *kaddr;
2824 unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
2825 if (nofs)
2826 flags |= AOP_FLAG_NOFS;
2799 2827
2800retry: 2828retry:
2801 err = pagecache_write_begin(NULL, mapping, 0, len-1, 2829 err = pagecache_write_begin(NULL, mapping, 0, len-1,
2802 AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); 2830 flags, &page, &fsdata);
2803 if (err) 2831 if (err)
2804 goto fail; 2832 goto fail;
2805 2833
@@ -2823,7 +2851,7 @@ fail:
2823int page_symlink(struct inode *inode, const char *symname, int len) 2851int page_symlink(struct inode *inode, const char *symname, int len)
2824{ 2852{
2825 return __page_symlink(inode, symname, len, 2853 return __page_symlink(inode, symname, len,
2826 mapping_gfp_mask(inode->i_mapping)); 2854 !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
2827} 2855}
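
page_symlink() derives the new nofs argument from the mapping's allocation mask: if __GFP_FS is already masked out (a filesystem avoiding re-entry into itself during reclaim), pagecache_write_begin() must be told to allocate with AOP_FLAG_NOFS. The derivation is just a bit test; the flag values below are illustrative, not the kernel's:

#include <stdio.h>

#define GFP_FS         0x80u   /* illustrative bit, not the kernel's value */
#define AOP_FLAG_NOFS  0x2u

static unsigned int symlink_flags(unsigned int gfp_mask)
{
        unsigned int flags = 0;

        if (!(gfp_mask & GFP_FS))   /* FS re-entry forbidden -> NOFS I/O */
                flags |= AOP_FLAG_NOFS;
        return flags;
}

int main(void)
{
        printf("%#x %#x\n", symlink_flags(GFP_FS), symlink_flags(0));
        return 0;
}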
2828 2856
2829const struct inode_operations page_symlink_inode_operations = { 2857const struct inode_operations page_symlink_inode_operations = {
@@ -2849,7 +2877,6 @@ EXPORT_SYMBOL(path_lookup);
2849EXPORT_SYMBOL(kern_path); 2877EXPORT_SYMBOL(kern_path);
2850EXPORT_SYMBOL(vfs_path_lookup); 2878EXPORT_SYMBOL(vfs_path_lookup);
2851EXPORT_SYMBOL(inode_permission); 2879EXPORT_SYMBOL(inode_permission);
2852EXPORT_SYMBOL(vfs_permission);
2853EXPORT_SYMBOL(file_permission); 2880EXPORT_SYMBOL(file_permission);
2854EXPORT_SYMBOL(unlock_rename); 2881EXPORT_SYMBOL(unlock_rename);
2855EXPORT_SYMBOL(vfs_create); 2882EXPORT_SYMBOL(vfs_create);
@@ -2865,3 +2892,10 @@ EXPORT_SYMBOL(vfs_symlink);
2865EXPORT_SYMBOL(vfs_unlink); 2892EXPORT_SYMBOL(vfs_unlink);
2866EXPORT_SYMBOL(dentry_unhash); 2893EXPORT_SYMBOL(dentry_unhash);
2867EXPORT_SYMBOL(generic_readlink); 2894EXPORT_SYMBOL(generic_readlink);
2895
2896/* to be mentioned only in INIT_TASK */
2897struct fs_struct init_fs = {
2898 .count = ATOMIC_INIT(1),
2899 .lock = __RW_LOCK_UNLOCKED(init_fs.lock),
2900 .umask = 0022,
2901};
diff --git a/fs/namespace.c b/fs/namespace.c
index 1c09cab8f7cf..a40685d800a8 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1990,7 +1990,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
1990 if (!new_ns->root) { 1990 if (!new_ns->root) {
1991 up_write(&namespace_sem); 1991 up_write(&namespace_sem);
1992 kfree(new_ns); 1992 kfree(new_ns);
1993 return ERR_PTR(-ENOMEM);; 1993 return ERR_PTR(-ENOMEM);
1994 } 1994 }
1995 spin_lock(&vfsmount_lock); 1995 spin_lock(&vfsmount_lock);
1996 list_add_tail(&new_ns->list, &new_ns->root->mnt_list); 1996 list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c
index 335b003dddf9..0af3349de851 100644
--- a/fs/ncpfs/getopt.c
+++ b/fs/ncpfs/getopt.c
@@ -16,7 +16,6 @@
16 * @opts: an array of &struct option entries controlling parser operations 16 * @opts: an array of &struct option entries controlling parser operations
17 * @optopt: output; will contain the current option 17 * @optopt: output; will contain the current option
18 * @optarg: output; will contain the value (if one exists) 18 * @optarg: output; will contain the value (if one exists)
19 * @flag: output; may be NULL; should point to a long for or'ing flags
20 * @value: output; may be NULL; will be overwritten with the integer value 19 * @value: output; may be NULL; will be overwritten with the integer value
21 * of the current argument. 20 * of the current argument.
22 * 21 *
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 6d04e050c74e..f54360f50a9c 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -98,7 +98,7 @@ struct compat_ncp_objectname_ioctl
98{ 98{
99 s32 auth_type; 99 s32 auth_type;
100 u32 object_name_len; 100 u32 object_name_len;
101 compat_caddr_t object_name; /* an userspace data, in most cases user name */ 101 compat_caddr_t object_name; /* a userspace data, in most cases user name */
102}; 102};
103 103
104struct compat_ncp_fs_info_v2 { 104struct compat_ncp_fs_info_v2 {
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index c2e9cfd9e5a4..3e634f2a1083 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -16,6 +16,7 @@
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/freezer.h> 17#include <linux/freezer.h>
18#include <linux/kthread.h> 18#include <linux/kthread.h>
19#include <linux/sunrpc/svcauth_gss.h>
19 20
20#include <net/inet_sock.h> 21#include <net/inet_sock.h>
21 22
@@ -182,10 +183,34 @@ void nfs_callback_down(void)
182 mutex_unlock(&nfs_callback_mutex); 183 mutex_unlock(&nfs_callback_mutex);
183} 184}
184 185
186static int check_gss_callback_principal(struct nfs_client *clp,
187 struct svc_rqst *rqstp)
188{
189 struct rpc_clnt *r = clp->cl_rpcclient;
190 char *p = svc_gss_principal(rqstp);
191
192 /*
193 * It might just be a normal user principal, in which case
194 * userspace won't bother to tell us the name at all.
195 */
196 if (p == NULL)
197 return SVC_DENIED;
198
199 /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */
200
201 if (memcmp(p, "nfs@", 4) != 0)
202 return SVC_DENIED;
203 p += 4;
204 if (strcmp(p, r->cl_server) != 0)
205 return SVC_DENIED;
206 return SVC_OK;
207}
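
check_gss_callback_principal() accepts a callback only when the GSS principal is the host-based name "nfs@<server>" for the very server this client mounted. The string test itself is simple; a user-space version with the same two comparisons (strncmp stands in for the kernel's memcmp, and the host names are made up):

#include <stdio.h>
#include <string.h>

enum { SVC_OK = 0, SVC_DENIED = 1 };

static int check_principal(const char *principal, const char *cl_server)
{
        if (principal == NULL)             /* plain user principal: no name */
                return SVC_DENIED;
        if (strncmp(principal, "nfs@", 4) != 0)  /* host-based service name */
                return SVC_DENIED;
        if (strcmp(principal + 4, cl_server) != 0)  /* must be our server */
                return SVC_DENIED;
        return SVC_OK;
}

int main(void)
{
        printf("%d %d\n",
               check_principal("nfs@server.example.com", "server.example.com"),
               check_principal("nfs@evil.example.com", "server.example.com"));
        return 0;
}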
208
185static int nfs_callback_authenticate(struct svc_rqst *rqstp) 209static int nfs_callback_authenticate(struct svc_rqst *rqstp)
186{ 210{
187 struct nfs_client *clp; 211 struct nfs_client *clp;
188 RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); 212 RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
213 int ret = SVC_OK;
189 214
190 /* Don't talk to strangers */ 215 /* Don't talk to strangers */
191 clp = nfs_find_client(svc_addr(rqstp), 4); 216 clp = nfs_find_client(svc_addr(rqstp), 4);
@@ -194,21 +219,22 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
194 219
195 dprintk("%s: %s NFSv4 callback!\n", __func__, 220 dprintk("%s: %s NFSv4 callback!\n", __func__,
196 svc_print_addr(rqstp, buf, sizeof(buf))); 221 svc_print_addr(rqstp, buf, sizeof(buf)));
197 nfs_put_client(clp);
198 222
199 switch (rqstp->rq_authop->flavour) { 223 switch (rqstp->rq_authop->flavour) {
200 case RPC_AUTH_NULL: 224 case RPC_AUTH_NULL:
201 if (rqstp->rq_proc != CB_NULL) 225 if (rqstp->rq_proc != CB_NULL)
202 return SVC_DENIED; 226 ret = SVC_DENIED;
203 break; 227 break;
204 case RPC_AUTH_UNIX: 228 case RPC_AUTH_UNIX:
205 break; 229 break;
206 case RPC_AUTH_GSS: 230 case RPC_AUTH_GSS:
207 /* FIXME: RPCSEC_GSS handling? */ 231 ret = check_gss_callback_principal(clp, rqstp);
232 break;
208 default: 233 default:
209 return SVC_DENIED; 234 ret = SVC_DENIED;
210 } 235 }
211 return SVC_OK; 236 nfs_put_client(clp);
237 return ret;
212} 238}
213 239
214/* 240/*
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 7547600b6174..9b728f3565a1 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -143,7 +143,6 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
143 clp->cl_proto = cl_init->proto; 143 clp->cl_proto = cl_init->proto;
144 144
145#ifdef CONFIG_NFS_V4 145#ifdef CONFIG_NFS_V4
146 init_rwsem(&clp->cl_sem);
147 INIT_LIST_HEAD(&clp->cl_delegations); 146 INIT_LIST_HEAD(&clp->cl_delegations);
148 spin_lock_init(&clp->cl_lock); 147 spin_lock_init(&clp->cl_lock);
149 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state); 148 INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
@@ -224,31 +223,54 @@ void nfs_put_client(struct nfs_client *clp)
224 } 223 }
225} 224}
226 225
227static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1, 226#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
228 const struct sockaddr_in *sa2) 227static const struct in6_addr *nfs_map_ipv4_addr(const struct sockaddr *sa, struct in6_addr *addr_mapped)
229{ 228{
230 return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr; 229 switch (sa->sa_family) {
230 default:
231 return NULL;
232 case AF_INET6:
233 return &((const struct sockaddr_in6 *)sa)->sin6_addr;
234 break;
235 case AF_INET:
236 ipv6_addr_set_v4mapped(((const struct sockaddr_in *)sa)->sin_addr.s_addr,
237 addr_mapped);
238 return addr_mapped;
239 }
231} 240}
232 241
233static int nfs_sockaddr_match_ipaddr6(const struct sockaddr_in6 *sa1, 242static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
234 const struct sockaddr_in6 *sa2) 243 const struct sockaddr *sa2)
244{
245 const struct in6_addr *addr1;
246 const struct in6_addr *addr2;
247 struct in6_addr addr1_mapped;
248 struct in6_addr addr2_mapped;
249
250 addr1 = nfs_map_ipv4_addr(sa1, &addr1_mapped);
251 if (likely(addr1 != NULL)) {
252 addr2 = nfs_map_ipv4_addr(sa2, &addr2_mapped);
253 if (likely(addr2 != NULL))
254 return ipv6_addr_equal(addr1, addr2);
255 }
256 return 0;
257}
258#else
259static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1,
260 const struct sockaddr_in *sa2)
235{ 261{
236 return ipv6_addr_equal(&sa1->sin6_addr, &sa2->sin6_addr); 262 return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr;
237} 263}
238 264
239static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, 265static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
240 const struct sockaddr *sa2) 266 const struct sockaddr *sa2)
241{ 267{
242 switch (sa1->sa_family) { 268 if (unlikely(sa1->sa_family != AF_INET || sa2->sa_family != AF_INET))
243 case AF_INET: 269 return 0;
244 return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1, 270 return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1,
245 (const struct sockaddr_in *)sa2); 271 (const struct sockaddr_in *)sa2);
246 case AF_INET6:
247 return nfs_sockaddr_match_ipaddr6((const struct sockaddr_in6 *)sa1,
248 (const struct sockaddr_in6 *)sa2);
249 }
250 BUG();
251} 272}
273#endif
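
The IPv6 build of nfs_sockaddr_match_ipaddr() canonicalizes both addresses into IPv6 form first: a real AF_INET6 address passes through, while an AF_INET address is rewritten as the v4-mapped ::ffff:a.b.c.d equivalent, after which one ipv6_addr_equal() covers every mixed-family pairing. A user-space rendering with POSIX types:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>

/* Map an IPv4 address into ::ffff:a.b.c.d form. */
static void map_v4(struct in_addr v4, struct in6_addr *out)
{
        memset(out, 0, sizeof(*out));
        out->s6_addr[10] = 0xff;
        out->s6_addr[11] = 0xff;
        memcpy(&out->s6_addr[12], &v4.s_addr, 4);
}

int main(void)
{
        struct in_addr v4;
        struct in6_addr a, b;

        inet_pton(AF_INET, "192.0.2.7", &v4);
        map_v4(v4, &a);                               /* AF_INET, canonicalized */
        inet_pton(AF_INET6, "::ffff:192.0.2.7", &b);  /* AF_INET6 peer */

        /* One comparison now works across families. */
        printf("match: %d\n", memcmp(&a, &b, sizeof(a)) == 0);
        return 0;
}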
252 274
253/* 275/*
254 * Find a client by IP address and protocol version 276 * Find a client by IP address and protocol version
@@ -270,8 +292,6 @@ struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion)
270 if (clp->rpc_ops->version != nfsversion) 292 if (clp->rpc_ops->version != nfsversion)
271 continue; 293 continue;
272 294
273 if (addr->sa_family != clap->sa_family)
274 continue;
275 /* Match only the IP address, not the port number */ 295 /* Match only the IP address, not the port number */
276 if (!nfs_sockaddr_match_ipaddr(addr, clap)) 296 if (!nfs_sockaddr_match_ipaddr(addr, clap))
277 continue; 297 continue;
@@ -305,8 +325,6 @@ struct nfs_client *nfs_find_client_next(struct nfs_client *clp)
305 if (clp->rpc_ops->version != nfsvers) 325 if (clp->rpc_ops->version != nfsvers)
306 continue; 326 continue;
307 327
308 if (sap->sa_family != clap->sa_family)
309 continue;
310 /* Match only the IP address, not the port number */ 328 /* Match only the IP address, not the port number */
311 if (!nfs_sockaddr_match_ipaddr(sap, clap)) 329 if (!nfs_sockaddr_match_ipaddr(sap, clap))
312 continue; 330 continue;
@@ -470,7 +488,7 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
470static int nfs_create_rpc_client(struct nfs_client *clp, 488static int nfs_create_rpc_client(struct nfs_client *clp,
471 const struct rpc_timeout *timeparms, 489 const struct rpc_timeout *timeparms,
472 rpc_authflavor_t flavor, 490 rpc_authflavor_t flavor,
473 int flags) 491 int discrtry, int noresvport)
474{ 492{
475 struct rpc_clnt *clnt = NULL; 493 struct rpc_clnt *clnt = NULL;
476 struct rpc_create_args args = { 494 struct rpc_create_args args = {
@@ -482,9 +500,13 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
482 .program = &nfs_program, 500 .program = &nfs_program,
483 .version = clp->rpc_ops->version, 501 .version = clp->rpc_ops->version,
484 .authflavor = flavor, 502 .authflavor = flavor,
485 .flags = flags,
486 }; 503 };
487 504
505 if (discrtry)
506 args.flags |= RPC_CLNT_CREATE_DISCRTRY;
507 if (noresvport)
508 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
509
488 if (!IS_ERR(clp->cl_rpcclient)) 510 if (!IS_ERR(clp->cl_rpcclient))
489 return 0; 511 return 0;
490 512
@@ -522,6 +544,8 @@ static int nfs_start_lockd(struct nfs_server *server)
522 .protocol = server->flags & NFS_MOUNT_TCP ? 544 .protocol = server->flags & NFS_MOUNT_TCP ?
523 IPPROTO_TCP : IPPROTO_UDP, 545 IPPROTO_TCP : IPPROTO_UDP,
524 .nfs_version = clp->rpc_ops->version, 546 .nfs_version = clp->rpc_ops->version,
547 .noresvport = server->flags & NFS_MOUNT_NORESVPORT ?
548 1 : 0,
525 }; 549 };
526 550
527 if (nlm_init.nfs_version > 3) 551 if (nlm_init.nfs_version > 3)
@@ -623,7 +647,8 @@ static int nfs_init_client(struct nfs_client *clp,
623 * Create a client RPC handle for doing FSSTAT with UNIX auth only 647 * Create a client RPC handle for doing FSSTAT with UNIX auth only
624 * - RFC 2623, sec 2.3.2 648 * - RFC 2623, sec 2.3.2
625 */ 649 */
626 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX, 0); 650 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX,
651 0, data->flags & NFS_MOUNT_NORESVPORT);
627 if (error < 0) 652 if (error < 0)
628 goto error; 653 goto error;
629 nfs_mark_client_ready(clp, NFS_CS_READY); 654 nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -965,7 +990,8 @@ error:
965static int nfs4_init_client(struct nfs_client *clp, 990static int nfs4_init_client(struct nfs_client *clp,
966 const struct rpc_timeout *timeparms, 991 const struct rpc_timeout *timeparms,
967 const char *ip_addr, 992 const char *ip_addr,
968 rpc_authflavor_t authflavour) 993 rpc_authflavor_t authflavour,
994 int flags)
969{ 995{
970 int error; 996 int error;
971 997
@@ -979,7 +1005,7 @@ static int nfs4_init_client(struct nfs_client *clp,
979 clp->rpc_ops = &nfs_v4_clientops; 1005 clp->rpc_ops = &nfs_v4_clientops;
980 1006
981 error = nfs_create_rpc_client(clp, timeparms, authflavour, 1007 error = nfs_create_rpc_client(clp, timeparms, authflavour,
982 RPC_CLNT_CREATE_DISCRTRY); 1008 1, flags & NFS_MOUNT_NORESVPORT);
983 if (error < 0) 1009 if (error < 0)
984 goto error; 1010 goto error;
985 memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); 1011 memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
@@ -1030,7 +1056,8 @@ static int nfs4_set_client(struct nfs_server *server,
1030 error = PTR_ERR(clp); 1056 error = PTR_ERR(clp);
1031 goto error; 1057 goto error;
1032 } 1058 }
1033 error = nfs4_init_client(clp, timeparms, ip_addr, authflavour); 1059 error = nfs4_init_client(clp, timeparms, ip_addr, authflavour,
1060 server->flags);
1034 if (error < 0) 1061 if (error < 0)
1035 goto error_put; 1062 goto error_put;
1036 1063
@@ -1059,6 +1086,10 @@ static int nfs4_init_server(struct nfs_server *server,
1059 nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, 1086 nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
1060 data->timeo, data->retrans); 1087 data->timeo, data->retrans);
1061 1088
1089 /* Initialise the client representation from the mount data */
1090 server->flags = data->flags;
1091 server->caps |= NFS_CAP_ATOMIC_OPEN;
1092
1062 /* Get a client record */ 1093 /* Get a client record */
1063 error = nfs4_set_client(server, 1094 error = nfs4_set_client(server,
1064 data->nfs_server.hostname, 1095 data->nfs_server.hostname,
@@ -1071,10 +1102,6 @@ static int nfs4_init_server(struct nfs_server *server,
1071 if (error < 0) 1102 if (error < 0)
1072 goto error; 1103 goto error;
1073 1104
1074 /* Initialise the client representation from the mount data */
1075 server->flags = data->flags;
1076 server->caps |= NFS_CAP_ATOMIC_OPEN;
1077
1078 if (data->rsize) 1105 if (data->rsize)
1079 server->rsize = nfs_block_size(data->rsize, NULL); 1106 server->rsize = nfs_block_size(data->rsize, NULL);
1080 if (data->wsize) 1107 if (data->wsize)
@@ -1177,6 +1204,10 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1177 parent_server = NFS_SB(data->sb); 1204 parent_server = NFS_SB(data->sb);
1178 parent_client = parent_server->nfs_client; 1205 parent_client = parent_server->nfs_client;
1179 1206
1207 /* Initialise the client representation from the parent server */
1208 nfs_server_copy_userdata(server, parent_server);
1209 server->caps |= NFS_CAP_ATOMIC_OPEN;
1210
1180 /* Get a client representation. 1211 /* Get a client representation.
1181 * Note: NFSv4 always uses TCP, */ 1212 * Note: NFSv4 always uses TCP, */
1182 error = nfs4_set_client(server, data->hostname, 1213 error = nfs4_set_client(server, data->hostname,
@@ -1189,10 +1220,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1189 if (error < 0) 1220 if (error < 0)
1190 goto error; 1221 goto error;
1191 1222
1192 /* Initialise the client representation from the parent server */
1193 nfs_server_copy_userdata(server, parent_server);
1194 server->caps |= NFS_CAP_ATOMIC_OPEN;
1195
1196 error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor); 1223 error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor);
1197 if (error < 0) 1224 if (error < 0)
1198 goto error; 1225 goto error;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index cc563cfa6940..968225a88015 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -43,6 +43,27 @@ static void nfs_free_delegation(struct nfs_delegation *delegation)
43 put_rpccred(cred); 43 put_rpccred(cred);
44} 44}
45 45
46void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
47{
48 set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
49}
50
51int nfs_have_delegation(struct inode *inode, fmode_t flags)
52{
53 struct nfs_delegation *delegation;
54 int ret = 0;
55
56 flags &= FMODE_READ|FMODE_WRITE;
57 rcu_read_lock();
58 delegation = rcu_dereference(NFS_I(inode)->delegation);
59 if (delegation != NULL && (delegation->type & flags) == flags) {
60 nfs_mark_delegation_referenced(delegation);
61 ret = 1;
62 }
63 rcu_read_unlock();
64 return ret;
65}
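
nfs_have_delegation() answers "does the cached delegation cover all of the requested open modes?" by masking the request down to FMODE_READ|FMODE_WRITE and testing that those bits form a subset of the delegation's type, all under rcu_read_lock(). The subset test on its own (flag values defined locally for the demo):

#include <stdio.h>

#define FMODE_READ   0x1u
#define FMODE_WRITE  0x2u

/* A delegation of type 'have' satisfies a request 'want' only if every
 * requested mode bit is also present in the delegation. */
static int covers(unsigned int have, unsigned int want)
{
        want &= FMODE_READ | FMODE_WRITE;
        return (have & want) == want;
}

int main(void)
{
        printf("%d %d\n",
               covers(FMODE_READ | FMODE_WRITE, FMODE_READ),   /* 1 */
               covers(FMODE_READ, FMODE_READ | FMODE_WRITE));  /* 0 */
        return 0;
}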
66
46static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state) 67static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state)
47{ 68{
48 struct inode *inode = state->inode; 69 struct inode *inode = state->inode;
@@ -119,7 +140,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, st
119 delegation->maxsize = res->maxsize; 140 delegation->maxsize = res->maxsize;
120 oldcred = delegation->cred; 141 oldcred = delegation->cred;
121 delegation->cred = get_rpccred(cred); 142 delegation->cred = get_rpccred(cred);
122 delegation->flags &= ~NFS_DELEGATION_NEED_RECLAIM; 143 clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
123 NFS_I(inode)->delegation_state = delegation->type; 144 NFS_I(inode)->delegation_state = delegation->type;
124 smp_wmb(); 145 smp_wmb();
125 put_rpccred(oldcred); 146 put_rpccred(oldcred);
@@ -134,19 +155,35 @@ static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *
 	return res;
 }
 
+static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation)
+{
+	struct inode *inode = NULL;
+
+	spin_lock(&delegation->lock);
+	if (delegation->inode != NULL)
+		inode = igrab(delegation->inode);
+	spin_unlock(&delegation->lock);
+	return inode;
+}
+
 static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, const nfs4_stateid *stateid)
 {
 	struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation);
 
 	if (delegation == NULL)
 		goto nomatch;
+	spin_lock(&delegation->lock);
 	if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data,
 				sizeof(delegation->stateid.data)) != 0)
-		goto nomatch;
+		goto nomatch_unlock;
 	list_del_rcu(&delegation->super_list);
+	delegation->inode = NULL;
 	nfsi->delegation_state = 0;
 	rcu_assign_pointer(nfsi->delegation, NULL);
+	spin_unlock(&delegation->lock);
 	return delegation;
+nomatch_unlock:
+	spin_unlock(&delegation->lock);
 nomatch:
 	return NULL;
 }
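The nfs_delegation_grab_inode() helper added above is a weak-backpointer idiom: delegation->inode can be cleared (by nfs_detach_delegation_locked()) under delegation->lock, so readers must take that same lock and promote the pointer to a counted reference with igrab() before using it. A compilable userspace sketch of the pattern, with a pthread mutex standing in for the spinlock and a hypothetical refcounted object standing in for the inode:

#include <pthread.h>
#include <stddef.h>

struct obj {				/* stand-in for struct inode */
	int refs;
};

struct link {				/* stand-in for struct nfs_delegation */
	pthread_mutex_t lock;		/* stand-in for delegation->lock */
	struct obj *target;		/* weak pointer; detach sets it to NULL */
};

/* igrab() analogue; refcounting kept trivially non-atomic for the sketch */
static struct obj *obj_get(struct obj *o)
{
	o->refs++;
	return o;
}

/* nfs_delegation_grab_inode() analogue: the weak pointer may only be
 * read, and promoted to a counted reference, while holding the lock
 * that the detach path also takes before clearing it. */
static struct obj *link_grab(struct link *l)
{
	struct obj *o = NULL;

	pthread_mutex_lock(&l->lock);
	if (l->target != NULL)
		o = obj_get(l->target);
	pthread_mutex_unlock(&l->lock);
	return o;
}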
@@ -172,6 +209,8 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
 	delegation->change_attr = nfsi->change_attr;
 	delegation->cred = get_rpccred(cred);
 	delegation->inode = inode;
+	delegation->flags = 1<<NFS_DELEGATION_REFERENCED;
+	spin_lock_init(&delegation->lock);
 
 	spin_lock(&clp->cl_lock);
 	if (rcu_dereference(nfsi->delegation) != NULL) {
@@ -226,22 +265,47 @@ static void nfs_msync_inode(struct inode *inode)
  */
 static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation)
 {
-	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
 	struct nfs_inode *nfsi = NFS_I(inode);
 
 	nfs_msync_inode(inode);
-	down_read(&clp->cl_sem);
 	/* Guard against new delegated open calls */
 	down_write(&nfsi->rwsem);
 	nfs_delegation_claim_opens(inode, &delegation->stateid);
 	up_write(&nfsi->rwsem);
-	up_read(&clp->cl_sem);
 	nfs_msync_inode(inode);
 
 	return nfs_do_return_delegation(inode, delegation, 1);
 }
 
 /*
+ * Return all delegations that have been marked for return
+ */
+void nfs_client_return_marked_delegations(struct nfs_client *clp)
+{
+	struct nfs_delegation *delegation;
+	struct inode *inode;
+
+restart:
+	rcu_read_lock();
+	list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
+		if (!test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags))
+			continue;
+		inode = nfs_delegation_grab_inode(delegation);
+		if (inode == NULL)
+			continue;
+		spin_lock(&clp->cl_lock);
+		delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
+		spin_unlock(&clp->cl_lock);
+		rcu_read_unlock();
+		if (delegation != NULL)
+			__nfs_inode_return_delegation(inode, delegation);
+		iput(inode);
+		goto restart;
+	}
+	rcu_read_unlock();
+}
+
+/*
  * This function returns the delegation without reclaiming opens
  * or protecting against delegation reclaims.
  * It is therefore really only safe to be called from
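nfs_client_return_marked_delegations() in the hunk above is a textbook restart-scan over an RCU list: entries are examined locklessly, but as soon as one needs blocking work (the DELEGRETURN RPC) the reader drops the RCU lock, does the work, and rescans from the head because the list may have changed in the meantime. A self-contained userspace sketch of the same control flow, with a mutex standing in for rcu_read_lock() (the kernel read side is lock-free):

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct item {
	struct item *next;
	bool marked;		/* analogue of NFS_DELEGATION_RETURN */
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *head;

static void do_blocking_work(struct item *it)
{
	(void)it;		/* e.g. an RPC; must not hold list_lock */
}

/* Scan under the lock, but drop it for the blocking call and restart
 * from the head, since the list may have changed while it was released.
 * Items are assumed never freed in this sketch; the kernel pins the
 * inode with igrab()/iput() instead. */
static void return_marked(void)
{
restart:
	pthread_mutex_lock(&list_lock);
	for (struct item *it = head; it != NULL; it = it->next) {
		if (!it->marked)
			continue;
		it->marked = false;
		pthread_mutex_unlock(&list_lock);
		do_blocking_work(it);
		goto restart;
	}
	pthread_mutex_unlock(&list_lock);
}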
@@ -279,83 +343,55 @@ int nfs_inode_return_delegation(struct inode *inode)
 	return err;
 }
 
+static void nfs_mark_return_delegation(struct nfs_client *clp, struct nfs_delegation *delegation)
+{
+	set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
+	set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
+}
+
 /*
  * Return all delegations associated to a super block
  */
-void nfs_return_all_delegations(struct super_block *sb)
+void nfs_super_return_all_delegations(struct super_block *sb)
 {
 	struct nfs_client *clp = NFS_SB(sb)->nfs_client;
 	struct nfs_delegation *delegation;
-	struct inode *inode;
 
 	if (clp == NULL)
 		return;
-restart:
 	rcu_read_lock();
 	list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
-		if (delegation->inode->i_sb != sb)
-			continue;
-		inode = igrab(delegation->inode);
-		if (inode == NULL)
-			continue;
-		spin_lock(&clp->cl_lock);
-		delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
-		spin_unlock(&clp->cl_lock);
-		rcu_read_unlock();
-		if (delegation != NULL)
-			__nfs_inode_return_delegation(inode, delegation);
-		iput(inode);
-		goto restart;
+		spin_lock(&delegation->lock);
+		if (delegation->inode != NULL && delegation->inode->i_sb == sb)
+			set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
+		spin_unlock(&delegation->lock);
 	}
 	rcu_read_unlock();
+	nfs_client_return_marked_delegations(clp);
 }
 
-static int nfs_do_expire_all_delegations(void *ptr)
+static void nfs_client_mark_return_all_delegations(struct nfs_client *clp)
 {
-	struct nfs_client *clp = ptr;
 	struct nfs_delegation *delegation;
-	struct inode *inode;
 
-	allow_signal(SIGKILL);
-restart:
-	if (test_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) != 0)
-		goto out;
-	if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0)
-		goto out;
 	rcu_read_lock();
 	list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
-		inode = igrab(delegation->inode);
-		if (inode == NULL)
-			continue;
-		spin_lock(&clp->cl_lock);
-		delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
-		spin_unlock(&clp->cl_lock);
-		rcu_read_unlock();
-		if (delegation)
-			__nfs_inode_return_delegation(inode, delegation);
-		iput(inode);
-		goto restart;
+		set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
+		set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
 	}
 	rcu_read_unlock();
-out:
-	nfs_put_client(clp);
-	module_put_and_exit(0);
+}
+
+static void nfs_delegation_run_state_manager(struct nfs_client *clp)
+{
+	if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state))
+		nfs4_schedule_state_manager(clp);
 }
 
 void nfs_expire_all_delegations(struct nfs_client *clp)
 {
-	struct task_struct *task;
-
-	__module_get(THIS_MODULE);
-	atomic_inc(&clp->cl_count);
-	task = kthread_run(nfs_do_expire_all_delegations, clp,
-			"%s-delegreturn",
-			rpc_peeraddr2str(clp->cl_rpcclient,
-					RPC_DISPLAY_ADDR));
-	if (!IS_ERR(task))
-		return;
-	nfs_put_client(clp);
-	module_put(THIS_MODULE);
+	nfs_client_mark_return_all_delegations(clp);
+	nfs_delegation_run_state_manager(clp);
 }
 
 /*
@@ -363,68 +399,29 @@ void nfs_expire_all_delegations(struct nfs_client *clp)
  */
 void nfs_handle_cb_pathdown(struct nfs_client *clp)
 {
-	struct nfs_delegation *delegation;
-	struct inode *inode;
-
 	if (clp == NULL)
 		return;
-restart:
+	nfs_client_mark_return_all_delegations(clp);
+}
+
+static void nfs_client_mark_return_unreferenced_delegations(struct nfs_client *clp)
+{
+	struct nfs_delegation *delegation;
+
 	rcu_read_lock();
 	list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
-		inode = igrab(delegation->inode);
-		if (inode == NULL)
+		if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags))
 			continue;
-		spin_lock(&clp->cl_lock);
-		delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
-		spin_unlock(&clp->cl_lock);
-		rcu_read_unlock();
-		if (delegation != NULL)
-			__nfs_inode_return_delegation(inode, delegation);
-		iput(inode);
-		goto restart;
+		set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
+		set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
 	}
 	rcu_read_unlock();
 }
 
-struct recall_threadargs {
-	struct inode *inode;
-	struct nfs_client *clp;
-	const nfs4_stateid *stateid;
-
-	struct completion started;
-	int result;
-};
-
-static int recall_thread(void *data)
+void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
 {
-	struct recall_threadargs *args = (struct recall_threadargs *)data;
-	struct inode *inode = igrab(args->inode);
-	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
-	struct nfs_inode *nfsi = NFS_I(inode);
-	struct nfs_delegation *delegation;
-
-	daemonize("nfsv4-delegreturn");
-
-	nfs_msync_inode(inode);
-	down_read(&clp->cl_sem);
-	down_write(&nfsi->rwsem);
-	spin_lock(&clp->cl_lock);
-	delegation = nfs_detach_delegation_locked(nfsi, args->stateid);
-	if (delegation != NULL)
-		args->result = 0;
-	else
-		args->result = -ENOENT;
-	spin_unlock(&clp->cl_lock);
-	complete(&args->started);
-	nfs_delegation_claim_opens(inode, args->stateid);
-	up_write(&nfsi->rwsem);
-	up_read(&clp->cl_sem);
-	nfs_msync_inode(inode);
-
-	if (delegation != NULL)
-		nfs_do_return_delegation(inode, delegation, 1);
-	iput(inode);
-	module_put_and_exit(0);
+	nfs_client_mark_return_unreferenced_delegations(clp);
+	nfs_delegation_run_state_manager(clp);
 }
 
 /*
@@ -432,22 +429,20 @@ static int recall_thread(void *data)
  */
 int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid)
 {
-	struct recall_threadargs data = {
-		.inode = inode,
-		.stateid = stateid,
-	};
-	int status;
+	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+	struct nfs_delegation *delegation;
 
-	init_completion(&data.started);
-	__module_get(THIS_MODULE);
-	status = kernel_thread(recall_thread, &data, CLONE_KERNEL);
-	if (status < 0)
-		goto out_module_put;
-	wait_for_completion(&data.started);
-	return data.result;
-out_module_put:
-	module_put(THIS_MODULE);
-	return status;
+	rcu_read_lock();
+	delegation = rcu_dereference(NFS_I(inode)->delegation);
+	if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data,
+				sizeof(delegation->stateid.data)) != 0) {
+		rcu_read_unlock();
+		return -ENOENT;
+	}
+	nfs_mark_return_delegation(clp, delegation);
+	rcu_read_unlock();
+	nfs_delegation_run_state_manager(clp);
+	return 0;
 }
 
 /*
@@ -459,10 +454,14 @@ struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs
 	struct inode *res = NULL;
 	rcu_read_lock();
 	list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
-		if (nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) {
+		spin_lock(&delegation->lock);
+		if (delegation->inode != NULL &&
+		    nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) {
 			res = igrab(delegation->inode);
-			break;
 		}
+		spin_unlock(&delegation->lock);
+		if (res != NULL)
+			break;
 	}
 	rcu_read_unlock();
 	return res;
@@ -476,7 +475,7 @@ void nfs_delegation_mark_reclaim(struct nfs_client *clp)
 	struct nfs_delegation *delegation;
 	rcu_read_lock();
 	list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list)
-		delegation->flags |= NFS_DELEGATION_NEED_RECLAIM;
+		set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
 	rcu_read_unlock();
 }
 
@@ -486,17 +485,22 @@ void nfs_delegation_mark_reclaim(struct nfs_client *clp)
 void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
 {
 	struct nfs_delegation *delegation;
+	struct inode *inode;
 restart:
 	rcu_read_lock();
 	list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
-		if ((delegation->flags & NFS_DELEGATION_NEED_RECLAIM) == 0)
+		if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0)
+			continue;
+		inode = nfs_delegation_grab_inode(delegation);
+		if (inode == NULL)
 			continue;
 		spin_lock(&clp->cl_lock);
-		delegation = nfs_detach_delegation_locked(NFS_I(delegation->inode), NULL);
+		delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
 		spin_unlock(&clp->cl_lock);
 		rcu_read_unlock();
 		if (delegation != NULL)
 			nfs_free_delegation(delegation);
+		iput(inode);
 		goto restart;
 	}
 	rcu_read_unlock();
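A theme throughout this file is the conversion of a plain `long flags` updated with `|=` and `&=` into an `unsigned long` updated with set_bit()/clear_bit()/test_and_clear_bit(). The plain operators compile to non-atomic read-modify-write sequences, so two CPUs flipping different flags can silently lose one update; the bitops are atomic per bit. An illustrative userspace analogue using the GCC/Clang __atomic builtins (the kernel's own bitops are arch-specific):

static unsigned long flags;

/* Non-atomic: two threads doing lost_update(A) and lost_update(B)
 * can each read the same old value, and one bit silently vanishes. */
static void lost_update(int bit)
{
	flags |= 1UL << bit;		/* load, or, store: racy */
}

/* Atomic analogue of the kernel's set_bit() */
static void atomic_set_bit(int bit, unsigned long *addr)
{
	__atomic_fetch_or(addr, 1UL << bit, __ATOMIC_RELAXED);
}

/* Atomic analogue of the kernel's test_and_clear_bit() */
static int atomic_test_and_clear_bit(int bit, unsigned long *addr)
{
	unsigned long mask = 1UL << bit;

	return (__atomic_fetch_and(addr, ~mask, __ATOMIC_RELAXED) & mask) != 0;
}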
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index f1c5e2a5d88e..09f383795174 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -17,14 +17,20 @@ struct nfs_delegation {
 	struct rpc_cred *cred;
 	struct inode *inode;
 	nfs4_stateid stateid;
-	int type;
-#define NFS_DELEGATION_NEED_RECLAIM 1
-	long flags;
+	fmode_t type;
 	loff_t maxsize;
 	__u64 change_attr;
+	unsigned long flags;
+	spinlock_t lock;
 	struct rcu_head rcu;
 };
 
+enum {
+	NFS_DELEGATION_NEED_RECLAIM = 0,
+	NFS_DELEGATION_RETURN,
+	NFS_DELEGATION_REFERENCED,
+};
+
 int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
 void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
 int nfs_inode_return_delegation(struct inode *inode);
@@ -32,9 +38,11 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s
 void nfs_inode_return_delegation_noreclaim(struct inode *inode);
 
 struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle);
-void nfs_return_all_delegations(struct super_block *sb);
+void nfs_super_return_all_delegations(struct super_block *sb);
 void nfs_expire_all_delegations(struct nfs_client *clp);
+void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
 void nfs_handle_cb_pathdown(struct nfs_client *clp);
+void nfs_client_return_marked_delegations(struct nfs_client *clp);
 
 void nfs_delegation_mark_reclaim(struct nfs_client *clp);
 void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
@@ -45,22 +53,11 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
 int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl);
 int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode);
 
-static inline int nfs_have_delegation(struct inode *inode, int flags)
-{
-	struct nfs_delegation *delegation;
-	int ret = 0;
-
-	flags &= FMODE_READ|FMODE_WRITE;
-	rcu_read_lock();
-	delegation = rcu_dereference(NFS_I(inode)->delegation);
-	if (delegation != NULL && (delegation->type & flags) == flags)
-		ret = 1;
-	rcu_read_unlock();
-	return ret;
-}
+void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
+int nfs_have_delegation(struct inode *inode, fmode_t flags);
 
 #else
-static inline int nfs_have_delegation(struct inode *inode, int flags)
+static inline int nfs_have_delegation(struct inode *inode, fmode_t flags)
 {
 	return 0;
 }
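The NFS_DELEGATION_REFERENCED bit defined above drives a second-chance (clock-style) expiry policy: every successful use of a delegation sets the bit, and the expiry pass clears it with test_and_clear_bit(), marking for return only the delegations whose bit was already clear, i.e. unused since the previous pass. A small userspace sketch of the policy (single-threaded, so plain bools stand in for the atomic bitops):

#include <stdbool.h>

struct entry {
	bool referenced;	/* set on every use */
	bool marked_return;
};

/* nfs_mark_delegation_referenced() analogue */
static void on_use(struct entry *e)
{
	e->referenced = true;
}

/* One expiry pass: entries used since the last pass get a second
 * chance; idle ones are marked for return. */
static void expire_unreferenced(struct entry *table, int n)
{
	for (int i = 0; i < n; i++) {
		if (table[i].referenced) {
			table[i].referenced = false;	/* test_and_clear */
			continue;
		}
		table[i].marked_return = true;
	}
}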
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 3e64b98f3a93..e35c8199f82f 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -799,6 +799,9 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
 		goto out_bad;
 	}
 
+	if (nfs_have_delegation(inode, FMODE_READ))
+		goto out_set_verifier;
+
 	/* Force a full look up iff the parent directory has changed */
 	if (!nfs_is_exclusive_create(dir, nd) && nfs_check_verifier(dir, dentry)) {
 		if (nfs_lookup_verify_inode(inode, nd))
@@ -817,6 +820,7 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
 	if ((error = nfs_refresh_inode(inode, &fattr)) != 0)
 		goto out_bad;
 
+out_set_verifier:
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
  out_valid:
 	dput(parent);
@@ -973,7 +977,7 @@ struct dentry_operations nfs4_dentry_operations = {
  * Use intent information to determine whether we need to substitute
  * the NFSv4-style stateful OPEN for the LOOKUP call
  */
-static int is_atomic_open(struct inode *dir, struct nameidata *nd)
+static int is_atomic_open(struct nameidata *nd)
 {
 	if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_OPEN) == 0)
 		return 0;
@@ -996,7 +1000,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
 		dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
 
 	/* Check that we are indeed trying to open this file */
-	if (!is_atomic_open(dir, nd))
+	if (!is_atomic_open(nd))
 		goto no_open;
 
 	if (dentry->d_name.len > NFS_SERVER(dir)->namelen) {
@@ -1047,10 +1051,10 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
 	struct inode *dir;
 	int openflags, ret = 0;
 
+	if (!is_atomic_open(nd))
+		goto no_open;
 	parent = dget_parent(dentry);
 	dir = parent->d_inode;
-	if (!is_atomic_open(dir, nd))
-		goto no_open;
 	/* We can't create new files in nfs_open_revalidate(), so we
 	 * optimize away revalidation of negative dentries.
 	 */
@@ -1062,11 +1066,11 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
 
 	/* NFS only supports OPEN on regular files */
 	if (!S_ISREG(inode->i_mode))
-		goto no_open;
+		goto no_open_dput;
 	openflags = nd->intent.open.flags;
 	/* We cannot do exclusive creation on a positive dentry */
 	if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
-		goto no_open;
+		goto no_open_dput;
 	/* We can't create new files, or truncate existing ones here */
 	openflags &= ~(O_CREAT|O_TRUNC);
 
@@ -1081,10 +1085,9 @@ out:
 	if (!ret)
 		d_drop(dentry);
 	return ret;
-no_open:
+no_open_dput:
 	dput(parent);
-	if (inode != NULL && nfs_have_delegation(inode, FMODE_READ))
-		return 1;
+no_open:
 	return nfs_lookup_revalidate(dentry, nd);
 }
 #endif /* CONFIG_NFSV4 */
@@ -1794,7 +1797,8 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
 	cache = nfs_access_search_rbtree(inode, cred);
 	if (cache == NULL)
 		goto out;
-	if (!time_in_range(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
+	if (!nfs_have_delegation(inode, FMODE_READ) &&
+	    !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
 		goto out_stale;
 	res->jiffies = cache->jiffies;
 	res->cred = cache->cred;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index d319b49f8f06..90f292b520d2 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -354,7 +354,7 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
 		file->f_path.dentry->d_name.name,
 		mapping->host->i_ino, len, (long long) pos);
 
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
 	*pagep = page;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index d22eb383e1cf..0c381686171e 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -592,7 +592,7 @@ static void nfs_file_set_open_context(struct file *filp, struct nfs_open_context
 /*
  * Given an inode, search for an open context with the desired characteristics
  */
-struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, int mode)
+struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_open_context *pos, *ctx = NULL;
@@ -712,14 +712,7 @@ int nfs_attribute_timeout(struct inode *inode)
 
 	if (nfs_have_delegation(inode, FMODE_READ))
 		return 0;
-	/*
-	 * Special case: if the attribute timeout is set to 0, then always
-	 * treat the cache as having expired (unless holding
-	 * a delegation).
-	 */
-	if (nfsi->attrtimeo == 0)
-		return 1;
-	return !time_in_range(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
+	return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
 }
 
 /**
@@ -1182,7 +1175,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		nfsi->attrtimeo_timestamp = now;
 		nfsi->attr_gencount = nfs_inc_attr_generation_counter();
 	} else {
-		if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
+		if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
 			if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode))
 				nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
 			nfsi->attrtimeo_timestamp = now;
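The switch from time_in_range() to time_in_range_open() in these inode.c hunks is also what allows deleting the attrtimeo == 0 special case above: the closed interval [t, t + 0] still contains jiffies == t, so a zero timeout never looked expired at the instant it was stamped, while the half-open interval [t, t) is empty and expires immediately. A sketch of the two checks with jiffies-style wraparound-safe comparisons, assuming the kernel's definitions in include/linux/jiffies.h:

/* Wraparound-safe comparisons, as in include/linux/jiffies.h */
#define time_after_eq(a, b)	((long)((a) - (b)) >= 0)
#define time_before(a, b)	((long)((b) - (a)) > 0)
#define time_before_eq(a, b)	((long)((b) - (a)) >= 0)

/* Closed interval: b <= a <= c */
#define time_in_range(a, b, c) \
	(time_after_eq(a, b) && time_before_eq(a, c))

/* Half-open interval: b <= a < c. With c == b the range is empty,
 * so a timeout of 0 is always treated as already expired. */
#define time_in_range_open(a, b, c) \
	(time_after_eq(a, b) && time_before(a, c))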
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index d212ee41caf2..340ede8f608f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -63,6 +63,20 @@ struct nfs_parsed_mount_data {
 	struct security_mnt_opts lsm_opts;
 };
 
+/* mount_clnt.c */
+struct nfs_mount_request {
+	struct sockaddr		*sap;
+	size_t			salen;
+	char			*hostname;
+	char			*dirpath;
+	u32			version;
+	unsigned short		protocol;
+	struct nfs_fh		*fh;
+	int			noresvport;
+};
+
+extern int nfs_mount(struct nfs_mount_request *info);
+
 /* client.c */
 extern struct rpc_program nfs_program;
 
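The new struct nfs_mount_request replaces nfs_mount()'s seven positional parameters with a single argument block, which lets callers name what they pass and lets options such as noresvport be added without touching every call site. A minimal caller-side sketch (types simplified for userspace, u32 becomes unsigned int, and the hostname/path values are purely illustrative):

#include <stddef.h>

struct sockaddr;
struct nfs_fh;

struct nfs_mount_request {
	struct sockaddr	*sap;
	size_t		salen;
	char		*hostname;
	char		*dirpath;
	unsigned int	version;	/* u32 in the kernel */
	unsigned short	protocol;
	struct nfs_fh	*fh;
	int		noresvport;
};

int nfs_mount(struct nfs_mount_request *info);

int example(struct sockaddr *sap, size_t salen, struct nfs_fh *fh)
{
	/* Designated initializers: any option left out defaults to zero */
	struct nfs_mount_request request = {
		.sap		= sap,
		.salen		= salen,
		.hostname	= "server.example.com",
		.dirpath	= "/export",
		.version	= 3,		/* NFS_MNT3_VERSION */
		.fh		= fh,
	};

	return nfs_mount(&request);
}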
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 086a6830d785..ca905a5bb1ba 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -29,47 +29,43 @@ struct mnt_fhstatus {
 
 /**
  * nfs_mount - Obtain an NFS file handle for the given host and path
- * @addr: pointer to server's address
- * @len: size of server's address
- * @hostname: name of server host, or NULL
- * @path: pointer to string containing export path to mount
- * @version: mount version to use for this request
- * @protocol: transport protocol to use for the request
- * @fh: pointer to location to place returned file handle
+ * @info: pointer to mount request arguments
  *
  * Uses default timeout parameters specified by underlying transport.
  */
-int nfs_mount(struct sockaddr *addr, size_t len, char *hostname, char *path,
-	      int version, int protocol, struct nfs_fh *fh)
+int nfs_mount(struct nfs_mount_request *info)
 {
 	struct mnt_fhstatus	result = {
-		.fh		= fh
+		.fh		= info->fh
 	};
 	struct rpc_message msg	= {
-		.rpc_argp	= path,
+		.rpc_argp	= info->dirpath,
 		.rpc_resp	= &result,
 	};
 	struct rpc_create_args args = {
-		.protocol	= protocol,
-		.address	= addr,
-		.addrsize	= len,
-		.servername	= hostname,
+		.protocol	= info->protocol,
+		.address	= info->sap,
+		.addrsize	= info->salen,
+		.servername	= info->hostname,
 		.program	= &mnt_program,
-		.version	= version,
+		.version	= info->version,
 		.authflavor	= RPC_AUTH_UNIX,
-		.flags		= 0,
 	};
 	struct rpc_clnt		*mnt_clnt;
 	int			status;
 
 	dprintk("NFS: sending MNT request for %s:%s\n",
-		(hostname ? hostname : "server"), path);
+		(info->hostname ? info->hostname : "server"),
+		info->dirpath);
+
+	if (info->noresvport)
+		args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
 
 	mnt_clnt = rpc_create(&args);
 	if (IS_ERR(mnt_clnt))
 		goto out_clnt_err;
 
-	if (version == NFS_MNT3_VERSION)
+	if (info->version == NFS_MNT3_VERSION)
 		msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC3_MNT];
 	else
 		msg.rpc_proc = &mnt_clnt->cl_procinfo[MNTPROC_MNT];
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index ea790645fda6..4e4d33204376 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -38,8 +38,12 @@ struct idmap;
 		((err) != NFSERR_NOFILEHANDLE))
 
 enum nfs4_client_state {
-	NFS4CLNT_STATE_RECOVER  = 0,
+	NFS4CLNT_MANAGER_RUNNING  = 0,
+	NFS4CLNT_CHECK_LEASE,
 	NFS4CLNT_LEASE_EXPIRED,
+	NFS4CLNT_RECLAIM_REBOOT,
+	NFS4CLNT_RECLAIM_NOGRACE,
+	NFS4CLNT_DELEGRETURN,
 };
 
 /*
@@ -90,12 +94,18 @@ struct nfs4_state_owner {
 
 	spinlock_t	     so_lock;
 	atomic_t	     so_count;
+	unsigned long	     so_flags;
 	struct list_head     so_states;
 	struct list_head     so_delegations;
 	struct nfs_seqid_counter so_seqid;
 	struct rpc_sequence  so_sequence;
 };
 
+enum {
+	NFS_OWNER_RECLAIM_REBOOT,
+	NFS_OWNER_RECLAIM_NOGRACE
+};
+
 /*
  * struct nfs4_state maintains the client-side state for a given
  * (state_owner,inode) tuple (OPEN) or state_owner (LOCK).
@@ -128,6 +138,8 @@ enum {
 	NFS_O_RDONLY_STATE,		/* OPEN stateid has read-only state */
 	NFS_O_WRONLY_STATE,		/* OPEN stateid has write-only state */
 	NFS_O_RDWR_STATE,		/* OPEN stateid has read/write state */
+	NFS_STATE_RECLAIM_REBOOT,	/* OPEN stateid server rebooted */
+	NFS_STATE_RECLAIM_NOGRACE,	/* OPEN stateid needs to recover state */
 };
 
 struct nfs4_state {
@@ -149,7 +161,7 @@ struct nfs4_state {
 	unsigned int n_rdonly;		/* Number of read-only references */
 	unsigned int n_wronly;		/* Number of write-only references */
 	unsigned int n_rdwr;		/* Number of read/write references */
-	int state;			/* State on the server (R,W, or RW) */
+	fmode_t state;			/* State on the server (R,W, or RW) */
 	atomic_t count;
 };
 
@@ -157,9 +169,12 @@ struct nfs4_state {
 struct nfs4_exception {
 	long timeout;
 	int retry;
+	struct nfs4_state *state;
 };
 
 struct nfs4_state_recovery_ops {
+	int owner_flag_bit;
+	int state_flag_bit;
 	int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *);
 	int (*recover_lock)(struct nfs4_state *, struct file_lock *);
 };
@@ -174,7 +189,6 @@ extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t);
 
 
 /* nfs4proc.c */
-extern int nfs4_map_errors(int err);
 extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *);
 extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *);
 extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
@@ -187,7 +201,7 @@ extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
 		struct nfs4_fs_locations *fs_locations, struct page *page);
 
 extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops;
-extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops;
+extern struct nfs4_state_recovery_ops nfs4_nograce_recovery_ops;
 
 extern const u32 nfs4_fattr_bitmap[2];
 extern const u32 nfs4_statfs_bitmap[2];
@@ -202,16 +216,18 @@ extern void nfs4_kill_renewd(struct nfs_client *);
 extern void nfs4_renew_state(struct work_struct *);
 
 /* nfs4state.c */
-struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp);
+struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
 
 extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
 extern void nfs4_put_state_owner(struct nfs4_state_owner *);
 extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
 extern void nfs4_put_open_state(struct nfs4_state *);
-extern void nfs4_close_state(struct path *, struct nfs4_state *, mode_t);
-extern void nfs4_close_sync(struct path *, struct nfs4_state *, mode_t);
-extern void nfs4_state_set_mode_locked(struct nfs4_state *, mode_t);
+extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t);
+extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t);
+extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
 extern void nfs4_schedule_state_recovery(struct nfs_client *);
+extern void nfs4_schedule_state_manager(struct nfs_client *);
+extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state);
 extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
 extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
 extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
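The prototype churn from int/mode_t to fmode_t across these headers is about type checking: in the kernel, fmode_t is a sparse "bitwise" typedef, so mixing FMODE_* values with plain integers, or with the unrelated mode_t permission bits, is flagged by static analysis instead of compiling silently. A hedged sketch of the mechanism (the exact macro spellings in the kernel headers may differ):

/* Under sparse (__CHECKER__), __bitwise makes a typedef a distinct
 * type: implicit conversion to or from plain integers is reported.
 * Ordinary compilers see plain unsigned int. */
#ifdef __CHECKER__
#define __bitwise	__attribute__((bitwise))
#define __force		__attribute__((force))
#else
#define __bitwise
#define __force
#endif

typedef unsigned int __bitwise fmode_t;

#define FMODE_READ	((__force fmode_t)1)
#define FMODE_WRITE	((__force fmode_t)2)

static int can_read(fmode_t fmode)
{
	return (fmode & FMODE_READ) != 0;	/* fmode & fmode: allowed */
}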
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 83e700a2b0c0..8dde84b988d9 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -62,14 +62,12 @@
 struct nfs4_opendata;
 static int _nfs4_proc_open(struct nfs4_opendata *data);
 static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
-static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *);
-static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception);
-static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp);
+static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
 static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
 static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
 
 /* Prevent leaks of NFSv4 errors into userland */
-int nfs4_map_errors(int err)
+static int nfs4_map_errors(int err)
 {
 	if (err < -1000) {
 		dprintk("%s could not handle NFSv4 error %d\n",
@@ -195,6 +193,83 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
 	kunmap_atomic(start, KM_USER0);
 }
 
+static int nfs4_wait_bit_killable(void *word)
+{
+	if (fatal_signal_pending(current))
+		return -ERESTARTSYS;
+	schedule();
+	return 0;
+}
+
+static int nfs4_wait_clnt_recover(struct nfs_client *clp)
+{
+	int res;
+
+	might_sleep();
+
+	res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
+			nfs4_wait_bit_killable, TASK_KILLABLE);
+	return res;
+}
+
+static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
+{
+	int res = 0;
+
+	might_sleep();
+
+	if (*timeout <= 0)
+		*timeout = NFS4_POLL_RETRY_MIN;
+	if (*timeout > NFS4_POLL_RETRY_MAX)
+		*timeout = NFS4_POLL_RETRY_MAX;
+	schedule_timeout_killable(*timeout);
+	if (fatal_signal_pending(current))
+		res = -ERESTARTSYS;
+	*timeout <<= 1;
+	return res;
+}
+
+/* This is the error handling routine for processes that are allowed
+ * to sleep.
+ */
+static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
+{
+	struct nfs_client *clp = server->nfs_client;
+	struct nfs4_state *state = exception->state;
+	int ret = errorcode;
+
+	exception->retry = 0;
+	switch(errorcode) {
+		case 0:
+			return 0;
+		case -NFS4ERR_ADMIN_REVOKED:
+		case -NFS4ERR_BAD_STATEID:
+		case -NFS4ERR_OPENMODE:
+			if (state == NULL)
+				break;
+			nfs4_state_mark_reclaim_nograce(clp, state);
+		case -NFS4ERR_STALE_CLIENTID:
+		case -NFS4ERR_STALE_STATEID:
+		case -NFS4ERR_EXPIRED:
+			nfs4_schedule_state_recovery(clp);
+			ret = nfs4_wait_clnt_recover(clp);
+			if (ret == 0)
+				exception->retry = 1;
+			break;
+		case -NFS4ERR_FILE_OPEN:
+		case -NFS4ERR_GRACE:
+		case -NFS4ERR_DELAY:
+			ret = nfs4_delay(server->client, &exception->timeout);
+			if (ret != 0)
+				break;
+		case -NFS4ERR_OLD_STATEID:
+			exception->retry = 1;
+	}
+	/* We failed to handle the error */
+	return nfs4_map_errors(ret);
+}
+
+
 static void renew_lease(const struct nfs_server *server, unsigned long timestamp)
 {
 	struct nfs_client *clp = server->nfs_client;
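nfs4_delay() above is a capped exponential backoff: the first NFS4ERR_DELAY waits NFS4_POLL_RETRY_MIN, every retry doubles the wait, and NFS4_POLL_RETRY_MAX caps it. The same policy as a compilable userspace function (the millisecond constants are illustrative; the kernel works in jiffies and sleeps killably):

#include <unistd.h>

#define POLL_RETRY_MIN_MS	100
#define POLL_RETRY_MAX_MS	15000

/* Sleep for *timeout milliseconds, then double it for the next call,
 * clamped to [POLL_RETRY_MIN_MS, POLL_RETRY_MAX_MS]. Mirrors the
 * control flow of nfs4_delay(). */
static void backoff_delay(long *timeout)
{
	if (*timeout <= 0)
		*timeout = POLL_RETRY_MIN_MS;
	if (*timeout > POLL_RETRY_MAX_MS)
		*timeout = POLL_RETRY_MAX_MS;
	usleep((unsigned int)*timeout * 1000);
	*timeout <<= 1;
}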
@@ -248,7 +323,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
 }
 
 static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
-		struct nfs4_state_owner *sp, int flags,
+		struct nfs4_state_owner *sp, fmode_t fmode, int flags,
 		const struct iattr *attrs)
 {
 	struct dentry *parent = dget_parent(path->dentry);
@@ -268,7 +343,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
 	p->owner = sp;
 	atomic_inc(&sp->so_count);
 	p->o_arg.fh = NFS_FH(dir);
-	p->o_arg.open_flags = flags,
+	p->o_arg.open_flags = flags;
+	p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);
 	p->o_arg.clientid = server->nfs_client->cl_clientid;
 	p->o_arg.id = sp->so_owner_id.id;
 	p->o_arg.name = &p->path.dentry->d_name;
@@ -324,10 +400,13 @@ static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task)
 	return ret;
 }
 
-static int can_open_cached(struct nfs4_state *state, int mode)
+static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode)
 {
 	int ret = 0;
-	switch (mode & (FMODE_READ|FMODE_WRITE|O_EXCL)) {
+
+	if (open_mode & O_EXCL)
+		goto out;
+	switch (mode & (FMODE_READ|FMODE_WRITE)) {
 		case FMODE_READ:
 			ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0;
 			break;
@@ -337,21 +416,23 @@ static int can_open_cached(struct nfs4_state *state, int mode)
 		case FMODE_READ|FMODE_WRITE:
 			ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0;
 	}
+out:
 	return ret;
 }
 
-static int can_open_delegated(struct nfs_delegation *delegation, mode_t open_flags)
+static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)
 {
-	if ((delegation->type & open_flags) != open_flags)
+	if ((delegation->type & fmode) != fmode)
 		return 0;
-	if (delegation->flags & NFS_DELEGATION_NEED_RECLAIM)
+	if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags))
 		return 0;
+	nfs_mark_delegation_referenced(delegation);
 	return 1;
 }
 
-static void update_open_stateflags(struct nfs4_state *state, mode_t open_flags)
+static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode)
 {
-	switch (open_flags) {
+	switch (fmode) {
 		case FMODE_WRITE:
 			state->n_wronly++;
 			break;
@@ -361,15 +442,15 @@ static void update_open_stateflags(struct nfs4_state *state, mode_t open_flags)
 		case FMODE_READ|FMODE_WRITE:
 			state->n_rdwr++;
 	}
-	nfs4_state_set_mode_locked(state, state->state | open_flags);
+	nfs4_state_set_mode_locked(state, state->state | fmode);
 }
 
-static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags)
+static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
 {
 	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
 		memcpy(state->stateid.data, stateid->data, sizeof(state->stateid.data));
 	memcpy(state->open_stateid.data, stateid->data, sizeof(state->open_stateid.data));
-	switch (open_flags) {
+	switch (fmode) {
 		case FMODE_READ:
 			set_bit(NFS_O_RDONLY_STATE, &state->flags);
 			break;
@@ -381,16 +462,15 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *
 	}
 }
 
-static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags)
+static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
 {
 	write_seqlock(&state->seqlock);
-	nfs_set_open_stateid_locked(state, stateid, open_flags);
+	nfs_set_open_stateid_locked(state, stateid, fmode);
 	write_sequnlock(&state->seqlock);
 }
 
-static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *deleg_stateid, int open_flags)
+static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode)
 {
-	open_flags &= (FMODE_READ|FMODE_WRITE);
 	/*
 	 * Protect the call to nfs4_state_set_mode_locked and
 	 * serialise the stateid update
@@ -401,20 +481,60 @@ static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_sta
 		set_bit(NFS_DELEGATED_STATE, &state->flags);
 	}
 	if (open_stateid != NULL)
-		nfs_set_open_stateid_locked(state, open_stateid, open_flags);
+		nfs_set_open_stateid_locked(state, open_stateid, fmode);
 	write_sequnlock(&state->seqlock);
 	spin_lock(&state->owner->so_lock);
-	update_open_stateflags(state, open_flags);
+	update_open_stateflags(state, fmode);
 	spin_unlock(&state->owner->so_lock);
 }
 
-static void nfs4_return_incompatible_delegation(struct inode *inode, mode_t open_flags)
+static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *delegation, fmode_t fmode)
+{
+	struct nfs_inode *nfsi = NFS_I(state->inode);
+	struct nfs_delegation *deleg_cur;
+	int ret = 0;
+
+	fmode &= (FMODE_READ|FMODE_WRITE);
+
+	rcu_read_lock();
+	deleg_cur = rcu_dereference(nfsi->delegation);
+	if (deleg_cur == NULL)
+		goto no_delegation;
+
+	spin_lock(&deleg_cur->lock);
+	if (nfsi->delegation != deleg_cur ||
+	    (deleg_cur->type & fmode) != fmode)
+		goto no_delegation_unlock;
+
+	if (delegation == NULL)
+		delegation = &deleg_cur->stateid;
+	else if (memcmp(deleg_cur->stateid.data, delegation->data, NFS4_STATEID_SIZE) != 0)
+		goto no_delegation_unlock;
+
+	nfs_mark_delegation_referenced(deleg_cur);
+	__update_open_stateid(state, open_stateid, &deleg_cur->stateid, fmode);
+	ret = 1;
+no_delegation_unlock:
+	spin_unlock(&deleg_cur->lock);
+no_delegation:
+	rcu_read_unlock();
+
+	if (!ret && open_stateid != NULL) {
+		__update_open_stateid(state, open_stateid, NULL, fmode);
+		ret = 1;
+	}
+
+	return ret;
+}
+
+
+static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmode)
 {
 	struct nfs_delegation *delegation;
 
 	rcu_read_lock();
 	delegation = rcu_dereference(NFS_I(inode)->delegation);
-	if (delegation == NULL || (delegation->type & open_flags) == open_flags) {
+	if (delegation == NULL || (delegation->type & fmode) == fmode) {
 		rcu_read_unlock();
 		return;
 	}
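The new update_open_stateid() above demonstrates how to promote an RCU lookup into a stable reference: load the pointer under rcu_read_lock(), take the object's own spinlock, then re-check that the object is still the published one (nfsi->delegation != deleg_cur) before trusting its fields. A userspace analogue with C11 atomics and a mutex; names are illustrative, and object lifetime is assumed managed elsewhere (in the kernel, RCU guarantees it for the read-side critical section):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct deleg {
	pthread_mutex_t lock;	/* assumed initialized at allocation */
	int type;
};

/* Published pointer; writers swap it, readers load it locklessly. */
static _Atomic(struct deleg *) current_deleg;

static bool try_use(int want_type)
{
	struct deleg *d = atomic_load(&current_deleg);	/* rcu_dereference() analogue */
	bool ok = false;

	if (d == NULL)
		return false;
	pthread_mutex_lock(&d->lock);
	/* Re-check: the published pointer may have been replaced between
	 * the lockless load and taking the object's lock. */
	if (atomic_load(&current_deleg) == d && (d->type & want_type) == want_type)
		ok = true;
	pthread_mutex_unlock(&d->lock);
	return ok;
}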
@@ -427,27 +547,28 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
 	struct nfs4_state *state = opendata->state;
 	struct nfs_inode *nfsi = NFS_I(state->inode);
 	struct nfs_delegation *delegation;
-	int open_mode = opendata->o_arg.open_flags & (FMODE_READ|FMODE_WRITE|O_EXCL);
+	int open_mode = opendata->o_arg.open_flags & O_EXCL;
+	fmode_t fmode = opendata->o_arg.fmode;
 	nfs4_stateid stateid;
 	int ret = -EAGAIN;
 
-	rcu_read_lock();
-	delegation = rcu_dereference(nfsi->delegation);
 	for (;;) {
-		if (can_open_cached(state, open_mode)) {
+		if (can_open_cached(state, fmode, open_mode)) {
 			spin_lock(&state->owner->so_lock);
-			if (can_open_cached(state, open_mode)) {
-				update_open_stateflags(state, open_mode);
+			if (can_open_cached(state, fmode, open_mode)) {
+				update_open_stateflags(state, fmode);
 				spin_unlock(&state->owner->so_lock);
-				rcu_read_unlock();
 				goto out_return_state;
 			}
 			spin_unlock(&state->owner->so_lock);
 		}
-		if (delegation == NULL)
-			break;
-		if (!can_open_delegated(delegation, open_mode))
+		rcu_read_lock();
+		delegation = rcu_dereference(nfsi->delegation);
+		if (delegation == NULL ||
+		    !can_open_delegated(delegation, fmode)) {
+			rcu_read_unlock();
 			break;
+		}
 		/* Save the delegation */
 		memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data));
 		rcu_read_unlock();
@@ -455,19 +576,11 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
 		if (ret != 0)
 			goto out;
 		ret = -EAGAIN;
-		rcu_read_lock();
-		delegation = rcu_dereference(nfsi->delegation);
-		/* If no delegation, try a cached open */
-		if (delegation == NULL)
-			continue;
-		/* Is the delegation still valid? */
-		if (memcmp(stateid.data, delegation->stateid.data, sizeof(stateid.data)) != 0)
-			continue;
-		rcu_read_unlock();
-		update_open_stateid(state, NULL, &stateid, open_mode);
-		goto out_return_state;
+
+		/* Try to update the stateid using the delegation */
+		if (update_open_stateid(state, NULL, &stateid, fmode))
+			goto out_return_state;
 	}
-	rcu_read_unlock();
 out:
 	return ERR_PTR(ret);
 out_return_state:
@@ -480,7 +593,6 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data
 	struct inode *inode;
 	struct nfs4_state *state = NULL;
 	struct nfs_delegation *delegation;
-	nfs4_stateid *deleg_stateid = NULL;
 	int ret;
 
 	if (!data->rpc_done) {
@@ -507,7 +619,7 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data
 		if (delegation)
 			delegation_flags = delegation->flags;
 		rcu_read_unlock();
-		if (!(delegation_flags & NFS_DELEGATION_NEED_RECLAIM))
+		if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0)
 			nfs_inode_set_delegation(state->inode,
 					data->owner->so_cred,
 					&data->o_res);
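Note that the test above works on a snapshot of the flags word taken under RCU, so it builds the mask by hand rather than calling test_bit(): with the delegation.h change, NFS_DELEGATION_NEED_RECLAIM is now a bit number (0), not a mask value, hence the explicit `1UL<<NFS_DELEGATION_NEED_RECLAIM`. In brief:

enum { NEED_RECLAIM = 0, RETURN, REFERENCED };	/* bit numbers, not masks */

static int snapshot_has(unsigned long snapshot, int bit)
{
	return (snapshot & (1UL << bit)) != 0;	/* build the mask explicitly */
}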
@@ -516,12 +628,9 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data
 					data->owner->so_cred,
 					&data->o_res);
 	}
-	rcu_read_lock();
-	delegation = rcu_dereference(NFS_I(inode)->delegation);
-	if (delegation != NULL)
-		deleg_stateid = &delegation->stateid;
-	update_open_stateid(state, &data->o_res.stateid, deleg_stateid, data->o_arg.open_flags);
-	rcu_read_unlock();
+
+	update_open_stateid(state, &data->o_res.stateid, NULL,
+			data->o_arg.fmode);
 	iput(inode);
 out:
 	return state;
@@ -552,7 +661,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
 {
 	struct nfs4_opendata *opendata;
 
-	opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, NULL);
+	opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL);
 	if (opendata == NULL)
 		return ERR_PTR(-ENOMEM);
 	opendata->state = state;
@@ -560,12 +669,13 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
 	return opendata;
 }
 
-static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, mode_t openflags, struct nfs4_state **res)
+static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmode, struct nfs4_state **res)
 {
 	struct nfs4_state *newstate;
 	int ret;
 
-	opendata->o_arg.open_flags = openflags;
+	opendata->o_arg.open_flags = 0;
+	opendata->o_arg.fmode = fmode;
 	memset(&opendata->o_res, 0, sizeof(opendata->o_res));
 	memset(&opendata->c_res, 0, sizeof(opendata->c_res));
 	nfs4_init_opendata_res(opendata);
@@ -575,7 +685,7 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, mode_t openf
 	newstate = nfs4_opendata_to_nfs4_state(opendata);
 	if (IS_ERR(newstate))
 		return PTR_ERR(newstate);
-	nfs4_close_state(&opendata->path, newstate, openflags);
+	nfs4_close_state(&opendata->path, newstate, fmode);
 	*res = newstate;
 	return 0;
 }
@@ -631,7 +741,7 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
 {
 	struct nfs_delegation *delegation;
 	struct nfs4_opendata *opendata;
-	int delegation_type = 0;
+	fmode_t delegation_type = 0;
 	int status;
 
 	opendata = nfs4_open_recoverdata_alloc(ctx, state);
@@ -641,7 +751,7 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
 	opendata->o_arg.fh = NFS_FH(state->inode);
 	rcu_read_lock();
 	delegation = rcu_dereference(NFS_I(state->inode)->delegation);
-	if (delegation != NULL && (delegation->flags & NFS_DELEGATION_NEED_RECLAIM) != 0)
+	if (delegation != NULL && test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) != 0)
 		delegation_type = delegation->type;
 	rcu_read_unlock();
 	opendata->o_arg.u.delegation_type = delegation_type;
@@ -744,7 +854,7 @@ static void nfs4_open_confirm_release(void *calldata)
 		goto out_free;
 	state = nfs4_opendata_to_nfs4_state(data);
 	if (!IS_ERR(state))
-		nfs4_close_state(&data->path, state, data->o_arg.open_flags);
+		nfs4_close_state(&data->path, state, data->o_arg.fmode);
 out_free:
 	nfs4_opendata_put(data);
 }
@@ -808,12 +918,12 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
808 if (data->state != NULL) { 918 if (data->state != NULL) {
809 struct nfs_delegation *delegation; 919 struct nfs_delegation *delegation;
810 920
811 if (can_open_cached(data->state, data->o_arg.open_flags & (FMODE_READ|FMODE_WRITE|O_EXCL))) 921 if (can_open_cached(data->state, data->o_arg.fmode, data->o_arg.open_flags))
812 goto out_no_action; 922 goto out_no_action;
813 rcu_read_lock(); 923 rcu_read_lock();
814 delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); 924 delegation = rcu_dereference(NFS_I(data->state->inode)->delegation);
815 if (delegation != NULL && 925 if (delegation != NULL &&
816 (delegation->flags & NFS_DELEGATION_NEED_RECLAIM) == 0) { 926 test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) {
817 rcu_read_unlock(); 927 rcu_read_unlock();
818 goto out_no_action; 928 goto out_no_action;
819 } 929 }
@@ -877,7 +987,7 @@ static void nfs4_open_release(void *calldata)
877 goto out_free; 987 goto out_free;
878 state = nfs4_opendata_to_nfs4_state(data); 988 state = nfs4_opendata_to_nfs4_state(data);
879 if (!IS_ERR(state)) 989 if (!IS_ERR(state))
880 nfs4_close_state(&data->path, state, data->o_arg.open_flags); 990 nfs4_close_state(&data->path, state, data->o_arg.fmode);
881out_free: 991out_free:
882 nfs4_opendata_put(data); 992 nfs4_opendata_put(data);
883} 993}
@@ -955,10 +1065,11 @@ static int nfs4_recover_expired_lease(struct nfs_server *server)
955 int ret; 1065 int ret;
956 1066
957 for (;;) { 1067 for (;;) {
958 ret = nfs4_wait_clnt_recover(server->client, clp); 1068 ret = nfs4_wait_clnt_recover(clp);
959 if (ret != 0) 1069 if (ret != 0)
960 return ret; 1070 return ret;
961 if (!test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) 1071 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
1072 !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state))
962 break; 1073 break;
963 nfs4_schedule_state_recovery(clp); 1074 nfs4_schedule_state_recovery(clp);
964 } 1075 }
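
Note the inversion in nfs4_recover_expired_lease(): the caller no longer clears NFS4CLNT_LEASE_EXPIRED itself, it only kicks the recovery machinery and loops until both the LEASE_EXPIRED and the new CHECK_LEASE bits have been cleared on its behalf. A userspace model of that wait loop, with schedule_recovery() standing in for the asynchronous state manager:

    #include <stdatomic.h>
    #include <sched.h>
    #include <stdio.h>

    enum { LEASE_EXPIRED = 1 << 0, CHECK_LEASE = 1 << 1 };

    static atomic_uint cl_state;

    /* Stand-in for nfs4_schedule_state_recovery(): a manager thread would
     * clear these bits once the lease is re-established. */
    static void schedule_recovery(void)
    {
            atomic_fetch_and(&cl_state, ~(unsigned)(LEASE_EXPIRED | CHECK_LEASE));
    }

    static void recover_expired_lease(void)
    {
            for (;;) {
                    /* nfs4_wait_clnt_recover() would block here */
                    if (!(atomic_load(&cl_state) & (LEASE_EXPIRED | CHECK_LEASE)))
                            break;
                    schedule_recovery();
                    sched_yield();
            }
    }

    int main(void)
    {
            atomic_store(&cl_state, LEASE_EXPIRED);
            recover_expired_lease();
            puts("lease valid again");
            return 0;
    }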
@@ -993,8 +1104,9 @@ static inline int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4
 
 	do {
 		err = _nfs4_open_expired(ctx, state);
-		if (err == -NFS4ERR_DELAY)
-			nfs4_handle_exception(server, err, &exception);
+		if (err != -NFS4ERR_DELAY)
+			break;
+		nfs4_handle_exception(server, err, &exception);
 	} while (exception.retry);
 	return err;
 }
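
The retry loop is tightened here: any result other than NFS4ERR_DELAY now terminates the loop immediately, and only DELAY goes through the exception handler, which sleeps with a doubling timeout before retrying. A minimal sketch of that backoff shape, with do_op() as a hypothetical operation that succeeds on the third attempt:

    #include <stdio.h>
    #include <unistd.h>

    #define ERR_DELAY (-1)	/* illustrative stand-in for -NFS4ERR_DELAY */

    static int do_op(int attempt) { return attempt < 3 ? ERR_DELAY : 0; }

    int main(void)
    {
            long timeout_ms = 100;	/* doubles per retry, as nfs4_delay() did */
            int attempt = 0, err;

            do {
                    err = do_op(++attempt);
                    if (err != ERR_DELAY)
                            break;		/* any other result is final */
                    usleep(timeout_ms * 1000);
                    timeout_ms <<= 1;
            } while (1);
            printf("finished after %d attempts, err=%d\n", attempt, err);
            return 0;
    }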
@@ -1031,12 +1143,11 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct
 /*
  * Returns a referenced nfs4_state
  */
-static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res)
+static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res)
 {
 	struct nfs4_state_owner *sp;
 	struct nfs4_state *state = NULL;
 	struct nfs_server *server = NFS_SERVER(dir);
-	struct nfs_client *clp = server->nfs_client;
 	struct nfs4_opendata *opendata;
 	int status;
 
@@ -1050,12 +1161,11 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct
 	if (status != 0)
 		goto err_put_state_owner;
 	if (path->dentry->d_inode != NULL)
-		nfs4_return_incompatible_delegation(path->dentry->d_inode, flags & (FMODE_READ|FMODE_WRITE));
-	down_read(&clp->cl_sem);
+		nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode);
 	status = -ENOMEM;
-	opendata = nfs4_opendata_alloc(path, sp, flags, sattr);
+	opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr);
 	if (opendata == NULL)
-		goto err_release_rwsem;
+		goto err_put_state_owner;
 
 	if (path->dentry->d_inode != NULL)
 		opendata->state = nfs4_get_open_state(path->dentry->d_inode, sp);
@@ -1073,13 +1183,10 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct
 		goto err_opendata_put;
 	nfs4_opendata_put(opendata);
 	nfs4_put_state_owner(sp);
-	up_read(&clp->cl_sem);
 	*res = state;
 	return 0;
err_opendata_put:
 	nfs4_opendata_put(opendata);
-err_release_rwsem:
-	up_read(&clp->cl_sem);
err_put_state_owner:
 	nfs4_put_state_owner(sp);
out_err:
@@ -1088,14 +1195,14 @@ out_err:
 }
 
 
-static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, int flags, struct iattr *sattr, struct rpc_cred *cred)
+static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred)
 {
 	struct nfs4_exception exception = { };
 	struct nfs4_state *res;
 	int status;
 
 	do {
-		status = _nfs4_do_open(dir, path, flags, sattr, cred, &res);
+		status = _nfs4_do_open(dir, path, fmode, flags, sattr, cred, &res);
 		if (status == 0)
 			break;
 		/* NOTE: BAD_SEQID means the server and client disagree about the
@@ -1230,10 +1337,13 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 			renew_lease(server, calldata->timestamp);
 			break;
 		case -NFS4ERR_STALE_STATEID:
+		case -NFS4ERR_OLD_STATEID:
+		case -NFS4ERR_BAD_STATEID:
 		case -NFS4ERR_EXPIRED:
-			break;
+			if (calldata->arg.fmode == 0)
+				break;
 		default:
-			if (nfs4_async_handle_error(task, server) == -EAGAIN) {
+			if (nfs4_async_handle_error(task, server, state) == -EAGAIN) {
 				rpc_restart_call(task);
 				return;
 			}
@@ -1272,10 +1382,10 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 	nfs_fattr_init(calldata->res.fattr);
 	if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0) {
 		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
-		calldata->arg.open_flags = FMODE_READ;
+		calldata->arg.fmode = FMODE_READ;
 	} else if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0) {
 		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
-		calldata->arg.open_flags = FMODE_WRITE;
+		calldata->arg.fmode = FMODE_WRITE;
 	}
 	calldata->timestamp = jiffies;
 	rpc_call_start(task);
@@ -1328,6 +1438,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
 	calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid);
 	if (calldata->arg.seqid == NULL)
 		goto out_free_calldata;
+	calldata->arg.fmode = 0;
 	calldata->arg.bitmask = server->attr_bitmask;
 	calldata->res.fattr = &calldata->fattr;
 	calldata->res.seqid = calldata->arg.seqid;
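
The close path now encodes its intent in arg.fmode: nfs4_do_close() initializes it to 0 (close everything), and nfs4_close_prepare() raises it to FMODE_READ or FMODE_WRITE when some open state must survive, switching the RPC to OPEN_DOWNGRADE. A small model of that decision, assuming only the two access bits matter:

    #include <stdio.h>

    #define FMODE_READ  0x1u
    #define FMODE_WRITE 0x2u

    /* Userspace model: choose CLOSE or OPEN_DOWNGRADE from what remains open. */
    static const char *close_op(unsigned int remaining, unsigned int *fmode)
    {
            *fmode = 0;				/* default: full CLOSE */
            if (remaining == FMODE_READ || remaining == FMODE_WRITE) {
                    *fmode = remaining;		/* downgrade to the survivor */
                    return "OPEN_DOWNGRADE";
            }
            return remaining ? "keep open" : "CLOSE";
    }

    int main(void)
    {
            unsigned int fmode;
            const char *op = close_op(FMODE_READ, &fmode);

            printf("%s (fmode=%#x)\n", op, fmode);
            return 0;
    }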
@@ -1354,13 +1465,13 @@ out:
 	return status;
 }
 
-static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state *state)
+static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state *state, fmode_t fmode)
 {
 	struct file *filp;
 	int ret;
 
 	/* If the open_intent is for execute, we have an extra check to make */
-	if (nd->intent.open.flags & FMODE_EXEC) {
+	if (fmode & FMODE_EXEC) {
 		ret = nfs_may_open(state->inode,
 				state->owner->so_cred,
 				nd->intent.open.flags);
@@ -1376,7 +1487,7 @@ static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct
 	}
 	ret = PTR_ERR(filp);
out_close:
-	nfs4_close_sync(path, state, nd->intent.open.flags);
+	nfs4_close_sync(path, state, fmode & (FMODE_READ|FMODE_WRITE));
 	return ret;
 }
 
@@ -1392,6 +1503,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 	struct rpc_cred *cred;
 	struct nfs4_state *state;
 	struct dentry *res;
+	fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 
 	if (nd->flags & LOOKUP_CREATE) {
 		attr.ia_mode = nd->intent.open.create_mode;
@@ -1409,7 +1521,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 	parent = dentry->d_parent;
 	/* Protect against concurrent sillydeletes */
 	nfs_block_sillyrename(parent);
-	state = nfs4_do_open(dir, &path, nd->intent.open.flags, &attr, cred);
+	state = nfs4_do_open(dir, &path, fmode, nd->intent.open.flags, &attr, cred);
 	put_rpccred(cred);
 	if (IS_ERR(state)) {
 		if (PTR_ERR(state) == -ENOENT) {
@@ -1424,7 +1536,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 	path.dentry = res;
 	nfs_set_verifier(path.dentry, nfs_save_change_attribute(dir));
 	nfs_unblock_sillyrename(parent);
-	nfs4_intent_set_file(nd, &path, state);
+	nfs4_intent_set_file(nd, &path, state, fmode);
 	return res;
 }
 
@@ -1437,11 +1549,12 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
 	};
 	struct rpc_cred *cred;
 	struct nfs4_state *state;
+	fmode_t fmode = openflags & (FMODE_READ | FMODE_WRITE);
 
 	cred = rpc_lookup_cred();
 	if (IS_ERR(cred))
 		return PTR_ERR(cred);
-	state = nfs4_do_open(dir, &path, openflags, NULL, cred);
+	state = nfs4_do_open(dir, &path, fmode, openflags, NULL, cred);
 	put_rpccred(cred);
 	if (IS_ERR(state)) {
 		switch (PTR_ERR(state)) {
@@ -1458,10 +1571,10 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
 	}
 	if (state->inode == dentry->d_inode) {
 		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-		nfs4_intent_set_file(nd, &path, state);
+		nfs4_intent_set_file(nd, &path, state, fmode);
 		return 1;
 	}
-	nfs4_close_sync(&path, state, openflags);
+	nfs4_close_sync(&path, state, fmode);
out_drop:
 	d_drop(dentry);
 	return 0;
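
With the fmode computed once at the top of each entry point, the earlier open_prepare() hunk can ask can_open_cached() a cleaner question: does cached open state already cover this access mode, given these open flags? A hypothetical model of such a check (the kernel's real predicate also considers delegation state):

    #include <stdbool.h>
    #include <stdio.h>

    #define FMODE_READ  0x1u
    #define FMODE_WRITE 0x2u
    #define MY_O_EXCL   0x4		/* illustrative flag value, not the kernel's O_EXCL */

    /* The cached state must cover the requested mode; O_EXCL opens always
     * go to the server so it can apply exclusive-create semantics. */
    static bool can_open_cached(unsigned int cached_mode, unsigned int fmode, int open_flags)
    {
            if (open_flags & MY_O_EXCL)
                    return false;
            return (cached_mode & fmode) == fmode;
    }

    int main(void)
    {
            printf("%d\n", can_open_cached(FMODE_READ | FMODE_WRITE, FMODE_READ, 0)); /* 1 */
            printf("%d\n", can_open_cached(FMODE_READ, FMODE_WRITE, 0));              /* 0 */
            return 0;
    }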
@@ -1887,6 +2000,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 	};
 	struct nfs4_state *state;
 	struct rpc_cred *cred;
+	fmode_t fmode = flags & (FMODE_READ | FMODE_WRITE);
 	int status = 0;
 
 	cred = rpc_lookup_cred();
@@ -1894,7 +2008,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		status = PTR_ERR(cred);
 		goto out;
 	}
-	state = nfs4_do_open(dir, &path, flags, sattr, cred);
+	state = nfs4_do_open(dir, &path, fmode, flags, sattr, cred);
 	d_drop(dentry);
 	if (IS_ERR(state)) {
 		status = PTR_ERR(state);
@@ -1910,9 +2024,9 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		nfs_post_op_update_inode(state->inode, &fattr);
 	}
 	if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0)
-		status = nfs4_intent_set_file(nd, &path, state);
+		status = nfs4_intent_set_file(nd, &path, state, fmode);
 	else
-		nfs4_close_sync(&path, state, flags);
+		nfs4_close_sync(&path, state, fmode);
out_putcred:
 	put_rpccred(cred);
out:
@@ -1974,7 +2088,7 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 {
 	struct nfs_removeres *res = task->tk_msg.rpc_resp;
 
-	if (nfs4_async_handle_error(task, res->server) == -EAGAIN)
+	if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
 		return 0;
 	update_changeattr(dir, &res->cinfo);
 	nfs_post_op_update_inode(dir, &res->dir_attr);
@@ -2402,7 +2516,7 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
 {
 	struct nfs_server *server = NFS_SERVER(data->inode);
 
-	if (nfs4_async_handle_error(task, server) == -EAGAIN) {
+	if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
 		rpc_restart_call(task);
 		return -EAGAIN;
 	}
@@ -2423,7 +2537,7 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
 {
 	struct inode *inode = data->inode;
 
-	if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) {
+	if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
 		rpc_restart_call(task);
 		return -EAGAIN;
 	}
@@ -2449,7 +2563,7 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
 {
 	struct inode *inode = data->inode;
 
-	if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) {
+	if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
 		rpc_restart_call(task);
 		return -EAGAIN;
 	}
@@ -2742,19 +2856,25 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
 }
 
 static int
-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server)
+nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
 {
 	struct nfs_client *clp = server->nfs_client;
 
 	if (!clp || task->tk_status >= 0)
 		return 0;
 	switch(task->tk_status) {
+		case -NFS4ERR_ADMIN_REVOKED:
+		case -NFS4ERR_BAD_STATEID:
+		case -NFS4ERR_OPENMODE:
+			if (state == NULL)
+				break;
+			nfs4_state_mark_reclaim_nograce(clp, state);
 		case -NFS4ERR_STALE_CLIENTID:
 		case -NFS4ERR_STALE_STATEID:
 		case -NFS4ERR_EXPIRED:
 			rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
 			nfs4_schedule_state_recovery(clp);
-			if (test_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) == 0)
+			if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
 				rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
 			task->tk_status = 0;
 			return -EAGAIN;
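
The extra state argument lets the async error handler treat per-stateid revocations (ADMIN_REVOKED, BAD_STATEID, OPENMODE) like the lease-wide errors: the state is first marked for no-grace reclaim, then control deliberately falls through into the shared "park the task and wake the state manager" path. A compact model of that fallthrough, with made-up error names:

    #include <stdio.h>

    enum err { E_ADMIN_REVOKED, E_BAD_STATEID, E_STALE_CLIENTID, E_OTHER };

    static const char *handle(enum err e, int have_state)
    {
            switch (e) {
            case E_ADMIN_REVOKED:
            case E_BAD_STATEID:
                    if (!have_state)
                            break;
                    puts("mark state for nograce reclaim");
                    /* fall through into the shared recovery path */
            case E_STALE_CLIENTID:
                    puts("schedule recovery, requeue task");
                    return "-EAGAIN";
            default:
                    break;
            }
            return "0";
    }

    int main(void)
    {
            printf("result: %s\n", handle(E_BAD_STATEID, 1));
            return 0;
    }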
@@ -2772,79 +2892,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server)
 	return 0;
 }
 
-static int nfs4_wait_bit_killable(void *word)
-{
-	if (fatal_signal_pending(current))
-		return -ERESTARTSYS;
-	schedule();
-	return 0;
-}
-
-static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp)
-{
-	int res;
-
-	might_sleep();
-
-	rwsem_acquire(&clp->cl_sem.dep_map, 0, 0, _RET_IP_);
-
-	res = wait_on_bit(&clp->cl_state, NFS4CLNT_STATE_RECOVER,
-			nfs4_wait_bit_killable, TASK_KILLABLE);
-
-	rwsem_release(&clp->cl_sem.dep_map, 1, _RET_IP_);
-	return res;
-}
-
-static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
-{
-	int res = 0;
-
-	might_sleep();
-
-	if (*timeout <= 0)
-		*timeout = NFS4_POLL_RETRY_MIN;
-	if (*timeout > NFS4_POLL_RETRY_MAX)
-		*timeout = NFS4_POLL_RETRY_MAX;
-	schedule_timeout_killable(*timeout);
-	if (fatal_signal_pending(current))
-		res = -ERESTARTSYS;
-	*timeout <<= 1;
-	return res;
-}
-
-/* This is the error handling routine for processes that are allowed
- * to sleep.
- */
-static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
-{
-	struct nfs_client *clp = server->nfs_client;
-	int ret = errorcode;
-
-	exception->retry = 0;
-	switch(errorcode) {
-		case 0:
-			return 0;
-		case -NFS4ERR_STALE_CLIENTID:
-		case -NFS4ERR_STALE_STATEID:
-		case -NFS4ERR_EXPIRED:
-			nfs4_schedule_state_recovery(clp);
-			ret = nfs4_wait_clnt_recover(server->client, clp);
-			if (ret == 0)
-				exception->retry = 1;
-			break;
-		case -NFS4ERR_FILE_OPEN:
-		case -NFS4ERR_GRACE:
-		case -NFS4ERR_DELAY:
-			ret = nfs4_delay(server->client, &exception->timeout);
-			if (ret != 0)
-				break;
-		case -NFS4ERR_OLD_STATEID:
-			exception->retry = 1;
-	}
-	/* We failed to handle the error */
-	return nfs4_map_errors(ret);
-}
-
 int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred)
 {
 	nfs4_verifier sc_verifier;
@@ -2916,7 +2963,6 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cre
 		spin_lock(&clp->cl_lock);
 		clp->cl_lease_time = fsinfo.lease_time * HZ;
 		clp->cl_last_renewal = now;
-		clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
 		spin_unlock(&clp->cl_lock);
 	}
 	return status;
@@ -3074,7 +3120,6 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
 	struct nfs4_lock_state *lsp;
 	int status;
 
-	down_read(&clp->cl_sem);
 	arg.lock_owner.clientid = clp->cl_clientid;
 	status = nfs4_set_lock_state(state, request);
 	if (status != 0)
@@ -3091,7 +3136,6 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
 	}
 	request->fl_ops->fl_release_private(request);
out:
-	up_read(&clp->cl_sem);
 	return status;
 }
 
@@ -3181,11 +3225,13 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 					sizeof(calldata->lsp->ls_stateid.data));
 			renew_lease(calldata->server, calldata->timestamp);
 			break;
+		case -NFS4ERR_BAD_STATEID:
+		case -NFS4ERR_OLD_STATEID:
 		case -NFS4ERR_STALE_STATEID:
 		case -NFS4ERR_EXPIRED:
 			break;
 		default:
-			if (nfs4_async_handle_error(task, calldata->server) == -EAGAIN)
+			if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
 				rpc_restart_call(task);
 	}
 }
@@ -3248,6 +3294,7 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
 
 static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)
 {
+	struct nfs_inode *nfsi = NFS_I(state->inode);
 	struct nfs_seqid *seqid;
 	struct nfs4_lock_state *lsp;
 	struct rpc_task *task;
@@ -3257,8 +3304,12 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
 	status = nfs4_set_lock_state(state, request);
 	/* Unlock _before_ we do the RPC call */
 	request->fl_flags |= FL_EXISTS;
-	if (do_vfs_lock(request->fl_file, request) == -ENOENT)
+	down_read(&nfsi->rwsem);
+	if (do_vfs_lock(request->fl_file, request) == -ENOENT) {
+		up_read(&nfsi->rwsem);
 		goto out;
+	}
+	up_read(&nfsi->rwsem);
 	if (status != 0)
 		goto out;
 	/* Is this a delegated lock? */
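
Locking traffic now serializes against delegation recall on the per-inode rwsem instead of the client-wide cl_sem: the lock and unlock paths take it for reading, while recall (and the reclaim path below) takes it for writing. A userspace model of that shape using POSIX rwlocks, with do_vfs_unlock() as a hypothetical stand-in:

    #include <pthread.h>
    #include <stdio.h>

    /* One rwlock per file, not one per client: readers are lock/unlock
     * requests, the writer would be delegation recall. */
    static pthread_rwlock_t inode_rwsem = PTHREAD_RWLOCK_INITIALIZER;

    static int do_vfs_unlock(void)
    {
            return 0;	/* stand-in for do_vfs_lock() on the unlock path */
    }

    int main(void)
    {
            pthread_rwlock_rdlock(&inode_rwsem);
            int status = do_vfs_unlock();
            pthread_rwlock_unlock(&inode_rwsem);
            printf("unlock status %d\n", status);
            return 0;
    }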
@@ -3484,7 +3535,7 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
 
 static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
 {
-	struct nfs_client *clp = state->owner->so_client;
+	struct nfs_inode *nfsi = NFS_I(state->inode);
 	unsigned char fl_flags = request->fl_flags;
 	int status;
 
@@ -3496,19 +3547,13 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
 	status = do_vfs_lock(request->fl_file, request);
 	if (status < 0)
 		goto out;
-	down_read(&clp->cl_sem);
+	down_read(&nfsi->rwsem);
 	if (test_bit(NFS_DELEGATED_STATE, &state->flags)) {
-		struct nfs_inode *nfsi = NFS_I(state->inode);
 		/* Yes: cache locks! */
-		down_read(&nfsi->rwsem);
 		/* ...but avoid races with delegation recall... */
-		if (test_bit(NFS_DELEGATED_STATE, &state->flags)) {
-			request->fl_flags = fl_flags & ~FL_SLEEP;
-			status = do_vfs_lock(request->fl_file, request);
-			up_read(&nfsi->rwsem);
-			goto out_unlock;
-		}
-		up_read(&nfsi->rwsem);
+		request->fl_flags = fl_flags & ~FL_SLEEP;
+		status = do_vfs_lock(request->fl_file, request);
+		goto out_unlock;
 	}
 	status = _nfs4_do_setlk(state, cmd, request, 0);
 	if (status != 0)
@@ -3518,7 +3563,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
 	if (do_vfs_lock(request->fl_file, request) < 0)
 		printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__);
out_unlock:
-	up_read(&clp->cl_sem);
+	up_read(&nfsi->rwsem);
out:
 	request->fl_flags = fl_flags;
 	return status;
@@ -3664,11 +3709,15 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
 }
 
 struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops = {
+	.owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT,
+	.state_flag_bit	= NFS_STATE_RECLAIM_REBOOT,
 	.recover_open	= nfs4_open_reclaim,
 	.recover_lock	= nfs4_lock_reclaim,
 };
 
-struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops = {
+struct nfs4_state_recovery_ops nfs4_nograce_recovery_ops = {
+	.owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
+	.state_flag_bit	= NFS_STATE_RECLAIM_NOGRACE,
 	.recover_open	= nfs4_open_expired,
 	.recover_lock	= nfs4_lock_expired,
 };
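
Each recovery-ops table now also names the owner and state flag bits that select its work, so a single generic walker can serve both the reboot and the no-grace ("network partition") variants. A minimal sketch of an ops table driving such a walker; all names here are illustrative:

    #include <stdio.h>

    struct recovery_ops {
            unsigned int state_flag_bit;
            int (*recover_open)(unsigned int *state_flags);
    };

    static int reclaim_open(unsigned int *flags) { (void)flags; return puts("reclaim"); }
    static int expire_open(unsigned int *flags) { (void)flags; return puts("expired"); }

    static const struct recovery_ops reboot_ops  = { 1u << 0, reclaim_open };
    static const struct recovery_ops nograce_ops = { 1u << 1, expire_open };

    /* Generic walker: only visit states carrying this variant's flag bit. */
    static void do_reclaim(unsigned int *state_flags, const struct recovery_ops *ops)
    {
            if (*state_flags & ops->state_flag_bit) {
                    *state_flags &= ~ops->state_flag_bit;
                    ops->recover_open(state_flags);
            }
    }

    int main(void)
    {
            unsigned int flags = (1u << 0) | (1u << 1);

            do_reclaim(&flags, &reboot_ops);	/* prints "reclaim" */
            do_reclaim(&flags, &nograce_ops);	/* prints "expired" */
            return 0;
    }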
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 3305acbbe2ae..f524e932ff7b 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -65,7 +65,6 @@ nfs4_renew_state(struct work_struct *work)
 	long lease, timeout;
 	unsigned long last, now;
 
-	down_read(&clp->cl_sem);
 	dprintk("%s: start\n", __func__);
 	/* Are there any active superblocks? */
 	if (list_empty(&clp->cl_superblocks))
@@ -77,17 +76,19 @@ nfs4_renew_state(struct work_struct *work)
 	timeout = (2 * lease) / 3 + (long)last - (long)now;
 	/* Are we close to a lease timeout? */
 	if (time_after(now, last + lease/3)) {
-		cred = nfs4_get_renew_cred(clp);
+		cred = nfs4_get_renew_cred_locked(clp);
+		spin_unlock(&clp->cl_lock);
 		if (cred == NULL) {
-			set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
-			spin_unlock(&clp->cl_lock);
+			if (list_empty(&clp->cl_delegations)) {
+				set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+				goto out;
+			}
 			nfs_expire_all_delegations(clp);
-			goto out;
+		} else {
+			/* Queue an asynchronous RENEW. */
+			nfs4_proc_async_renew(clp, cred);
+			put_rpccred(cred);
 		}
-		spin_unlock(&clp->cl_lock);
-		/* Queue an asynchronous RENEW. */
-		nfs4_proc_async_renew(clp, cred);
-		put_rpccred(cred);
 		timeout = (2 * lease) / 3;
 		spin_lock(&clp->cl_lock);
 	} else
@@ -100,12 +101,11 @@ nfs4_renew_state(struct work_struct *work)
 	cancel_delayed_work(&clp->cl_renewd);
 	schedule_delayed_work(&clp->cl_renewd, timeout);
 	spin_unlock(&clp->cl_lock);
+	nfs_expire_unreferenced_delegations(clp);
out:
-	up_read(&clp->cl_sem);
 	dprintk("%s: done\n", __func__);
 }
 
-/* Must be called with clp->cl_sem locked for writes */
 void
nfs4_schedule_state_renewal(struct nfs_client *clp)
 {
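
The renewal heuristic above fires once more than a third of the lease has elapsed since the last renewal, and reschedules itself two thirds of a lease later. A small standalone illustration of the arithmetic, with made-up timestamps:

    #include <stdio.h>

    int main(void)
    {
            long lease = 90;		/* seconds, e.g. from FSINFO */
            long last = 100, now = 145;	/* timestamps */
            long timeout = (2 * lease) / 3 + last - now;

            if (now > last + lease / 3)
                    printf("renew now, recheck in %ld s\n", (2 * lease) / 3);
            else
                    printf("recheck in %ld s\n", timeout);
            return 0;
    }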
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 401ef8b28f97..2022fe47966f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -71,14 +71,12 @@ static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred)
 	return status;
 }
 
-static struct rpc_cred *nfs4_get_machine_cred(struct nfs_client *clp)
+static struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp)
 {
 	struct rpc_cred *cred = NULL;
 
-	spin_lock(&clp->cl_lock);
 	if (clp->cl_machine_cred != NULL)
 		cred = get_rpccred(clp->cl_machine_cred);
-	spin_unlock(&clp->cl_lock);
 	return cred;
 }
 
@@ -94,7 +92,7 @@ static void nfs4_clear_machine_cred(struct nfs_client *clp)
 	put_rpccred(cred);
 }
 
-struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp)
+struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
 {
 	struct nfs4_state_owner *sp;
 	struct rb_node *pos;
@@ -110,13 +108,24 @@ struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp)
 	return cred;
 }
 
+static struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp)
+{
+	struct rpc_cred *cred;
+
+	spin_lock(&clp->cl_lock);
+	cred = nfs4_get_renew_cred_locked(clp);
+	spin_unlock(&clp->cl_lock);
+	return cred;
+}
+
 static struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
 {
 	struct nfs4_state_owner *sp;
 	struct rb_node *pos;
 	struct rpc_cred *cred;
 
-	cred = nfs4_get_machine_cred(clp);
+	spin_lock(&clp->cl_lock);
+	cred = nfs4_get_machine_cred_locked(clp);
 	if (cred != NULL)
 		goto out;
 	pos = rb_first(&clp->cl_state_owners);
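
The credential helpers follow the common kernel _locked convention: the _locked variant assumes the caller already holds cl_lock, and a thin wrapper takes the lock for standalone callers such as the renew daemon. A minimal model of the split:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t cl_lock = PTHREAD_MUTEX_INITIALIZER;
    static int cred_store = 42;

    /* Caller must hold cl_lock. */
    static int get_cred_locked(void)
    {
            return cred_store;
    }

    /* Wrapper for callers that do not already hold the lock. */
    static int get_cred(void)
    {
            pthread_mutex_lock(&cl_lock);
            int cred = get_cred_locked();
            pthread_mutex_unlock(&cl_lock);
            return cred;
    }

    int main(void)
    {
            printf("cred %d\n", get_cred());
            return 0;
    }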
@@ -125,6 +134,7 @@ static struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
 		cred = get_rpccred(sp->so_cred);
 	}
out:
+	spin_unlock(&clp->cl_lock);
 	return cred;
 }
 
@@ -295,10 +305,6 @@ nfs4_drop_state_owner(struct nfs4_state_owner *sp)
 	}
 }
 
-/*
- * Note: must be called with clp->cl_sem held in order to prevent races
- * with reboot recovery!
- */
 struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred)
 {
 	struct nfs_client *clp = server->nfs_client;
@@ -327,10 +333,6 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
 	return sp;
 }
 
-/*
- * Must be called with clp->cl_sem held in order to avoid races
- * with state recovery...
- */
 void nfs4_put_state_owner(struct nfs4_state_owner *sp)
 {
 	struct nfs_client *clp = sp->so_client;
@@ -361,18 +363,18 @@ nfs4_alloc_open_state(void)
 }
 
 void
-nfs4_state_set_mode_locked(struct nfs4_state *state, mode_t mode)
+nfs4_state_set_mode_locked(struct nfs4_state *state, fmode_t fmode)
 {
-	if (state->state == mode)
+	if (state->state == fmode)
 		return;
 	/* NB! List reordering - see the reclaim code for why. */
-	if ((mode & FMODE_WRITE) != (state->state & FMODE_WRITE)) {
-		if (mode & FMODE_WRITE)
+	if ((fmode & FMODE_WRITE) != (state->state & FMODE_WRITE)) {
+		if (fmode & FMODE_WRITE)
 			list_move(&state->open_states, &state->owner->so_states);
 		else
 			list_move_tail(&state->open_states, &state->owner->so_states);
 	}
-	state->state = mode;
+	state->state = fmode;
 }
 
 static struct nfs4_state *
@@ -432,10 +434,6 @@ out:
 	return state;
 }
 
-/*
- * Beware! Caller must be holding exactly one
- * reference to clp->cl_sem!
- */
 void nfs4_put_open_state(struct nfs4_state *state)
 {
 	struct inode *inode = state->inode;
@@ -456,16 +454,16 @@ void nfs4_put_open_state(struct nfs4_state *state)
 /*
  * Close the current file.
  */
-static void __nfs4_close(struct path *path, struct nfs4_state *state, mode_t mode, int wait)
+static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fmode, int wait)
 {
 	struct nfs4_state_owner *owner = state->owner;
 	int call_close = 0;
-	int newstate;
+	fmode_t newstate;
 
 	atomic_inc(&owner->so_count);
 	/* Protect against nfs4_find_state() */
 	spin_lock(&owner->so_lock);
-	switch (mode & (FMODE_READ | FMODE_WRITE)) {
+	switch (fmode & (FMODE_READ | FMODE_WRITE)) {
 		case FMODE_READ:
 			state->n_rdonly--;
 			break;
@@ -500,14 +498,14 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state, mode_t mod
 	nfs4_do_close(path, state, wait);
 }
 
-void nfs4_close_state(struct path *path, struct nfs4_state *state, mode_t mode)
+void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
 {
-	__nfs4_close(path, state, mode, 0);
+	__nfs4_close(path, state, fmode, 0);
 }
 
-void nfs4_close_sync(struct path *path, struct nfs4_state *state, mode_t mode)
+void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode)
 {
-	__nfs4_close(path, state, mode, 1);
+	__nfs4_close(path, state, fmode, 1);
 }
 
 /*
@@ -568,7 +566,6 @@ static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
  * Return a compatible lock_state. If no initialized lock_state structure
  * exists, return an uninitialized one.
  *
- * The caller must be holding clp->cl_sem
  */
static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner)
 {
@@ -770,32 +767,34 @@ unlock:
 	return status;
 }
 
-static int reclaimer(void *);
+static int nfs4_run_state_manager(void *);
 
-static inline void nfs4_clear_recover_bit(struct nfs_client *clp)
+static void nfs4_clear_state_manager_bit(struct nfs_client *clp)
 {
 	smp_mb__before_clear_bit();
-	clear_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state);
+	clear_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state);
 	smp_mb__after_clear_bit();
-	wake_up_bit(&clp->cl_state, NFS4CLNT_STATE_RECOVER);
+	wake_up_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING);
 	rpc_wake_up(&clp->cl_rpcwaitq);
 }
 
 /*
- * State recovery routine
+ * Schedule the nfs_client asynchronous state management routine
 */
-static void nfs4_recover_state(struct nfs_client *clp)
+void nfs4_schedule_state_manager(struct nfs_client *clp)
 {
 	struct task_struct *task;
 
+	if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
+		return;
 	__module_get(THIS_MODULE);
 	atomic_inc(&clp->cl_count);
-	task = kthread_run(reclaimer, clp, "%s-reclaim",
+	task = kthread_run(nfs4_run_state_manager, clp, "%s-manager",
 				rpc_peeraddr2str(clp->cl_rpcclient,
 							RPC_DISPLAY_ADDR));
 	if (!IS_ERR(task))
 		return;
-	nfs4_clear_recover_bit(clp);
+	nfs4_clear_state_manager_bit(clp);
 	nfs_put_client(clp);
 	module_put(THIS_MODULE);
 }
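
The scheduling contract here is the classic singleton-worker pattern: whoever wins the test-and-set on MANAGER_RUNNING spawns the one manager thread; everyone else merely sets work bits and returns, trusting the running manager to pick them up. A compact userspace model:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_flag manager_running = ATOMIC_FLAG_INIT;

    static void schedule_state_manager(void)
    {
            if (atomic_flag_test_and_set(&manager_running))
                    return;		/* a manager is already running */
            puts("spawning state manager thread");
            /* the kthread_run(...) equivalent would go here */
    }

    int main(void)
    {
            schedule_state_manager();	/* spawns */
            schedule_state_manager();	/* no-op */
            return 0;
    }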
@@ -807,16 +806,42 @@ void nfs4_schedule_state_recovery(struct nfs_client *clp)
 {
 	if (!clp)
 		return;
-	if (test_and_set_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) == 0)
-		nfs4_recover_state(clp);
+	if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
+		set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
+	nfs4_schedule_state_manager(clp);
 }
 
-static int nfs4_reclaim_locks(struct nfs4_state_recovery_ops *ops, struct nfs4_state *state)
+static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
+{
+
+	set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
+	/* Don't recover state that expired before the reboot */
+	if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) {
+		clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
+		return 0;
+	}
+	set_bit(NFS_OWNER_RECLAIM_REBOOT, &state->owner->so_flags);
+	set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
+	return 1;
+}
+
+int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
+{
+	set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
+	clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
+	set_bit(NFS_OWNER_RECLAIM_NOGRACE, &state->owner->so_flags);
+	set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
+	return 1;
+}
+
+static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops)
 {
 	struct inode *inode = state->inode;
+	struct nfs_inode *nfsi = NFS_I(inode);
 	struct file_lock *fl;
 	int status = 0;
 
+	down_write(&nfsi->rwsem);
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
 			continue;
@@ -858,28 +885,34 @@ static int nfs4_reclaim_open_state(struct nfs4_state_recovery_ops *ops, struct n
 	 * recovering after a network partition or a reboot from a
 	 * server that doesn't support a grace period.
 	 */
+restart:
+	spin_lock(&sp->so_lock);
 	list_for_each_entry(state, &sp->so_states, open_states) {
+		if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
+			continue;
 		if (state->state == 0)
 			continue;
+		atomic_inc(&state->count);
+		spin_unlock(&sp->so_lock);
 		status = ops->recover_open(sp, state);
 		if (status >= 0) {
-			status = nfs4_reclaim_locks(ops, state);
-			if (status < 0)
-				goto out_err;
-			list_for_each_entry(lock, &state->lock_states, ls_locks) {
-				if (!(lock->ls_flags & NFS_LOCK_INITIALIZED))
-					printk("%s: Lock reclaim failed!\n",
+			status = nfs4_reclaim_locks(state, ops);
+			if (status >= 0) {
+				list_for_each_entry(lock, &state->lock_states, ls_locks) {
+					if (!(lock->ls_flags & NFS_LOCK_INITIALIZED))
+						printk("%s: Lock reclaim failed!\n",
 							__func__);
+				}
+				nfs4_put_open_state(state);
+				goto restart;
 			}
-			continue;
 		}
 		switch (status) {
 			default:
 				printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n",
 						__func__, status);
 			case -ENOENT:
-			case -NFS4ERR_RECLAIM_BAD:
-			case -NFS4ERR_RECLAIM_CONFLICT:
+			case -ESTALE:
 				/*
 				 * Open state on this file cannot be recovered
 				 * All we can do is revert to using the zero stateid.
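
The reclaim walker now pins each state with a reference, drops the owner lock for the slow recovery RPCs, and then rescans the list from the top, since the list may have changed while the lock was released; the per-state flag bit guarantees each entry is visited once. A small model of that restart pattern:

    #include <stdio.h>

    #define NEEDS_WORK 0x1u

    int main(void)
    {
            unsigned int items[] = { NEEDS_WORK, 0, NEEDS_WORK };
            int n = 3, i;

    restart:
            /* the spin lock would be taken here */
            for (i = 0; i < n; i++) {
                    if (!(items[i] & NEEDS_WORK))
                            continue;
                    items[i] &= ~NEEDS_WORK;
                    /* unlock, do the slow recovery step, then rescan */
                    printf("recovered item %d\n", i);
                    goto restart;
            }
            /* unlock; nothing flagged remains */
            return 0;
    }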
@@ -889,84 +922,176 @@
 				/* Mark the file as being 'closed' */
 				state->state = 0;
 				break;
+			case -NFS4ERR_RECLAIM_BAD:
+			case -NFS4ERR_RECLAIM_CONFLICT:
+				nfs4_state_mark_reclaim_nograce(sp->so_client, state);
+				break;
 			case -NFS4ERR_EXPIRED:
 			case -NFS4ERR_NO_GRACE:
+				nfs4_state_mark_reclaim_nograce(sp->so_client, state);
 			case -NFS4ERR_STALE_CLIENTID:
 				goto out_err;
 		}
+		nfs4_put_open_state(state);
+		goto restart;
 	}
+	spin_unlock(&sp->so_lock);
 	return 0;
out_err:
+	nfs4_put_open_state(state);
 	return status;
 }
 
-static void nfs4_state_mark_reclaim(struct nfs_client *clp)
+static void nfs4_clear_open_state(struct nfs4_state *state)
+{
+	struct nfs4_lock_state *lock;
+
+	clear_bit(NFS_DELEGATED_STATE, &state->flags);
+	clear_bit(NFS_O_RDONLY_STATE, &state->flags);
+	clear_bit(NFS_O_WRONLY_STATE, &state->flags);
+	clear_bit(NFS_O_RDWR_STATE, &state->flags);
+	list_for_each_entry(lock, &state->lock_states, ls_locks) {
+		lock->ls_seqid.flags = 0;
+		lock->ls_flags &= ~NFS_LOCK_INITIALIZED;
+	}
+}
+
+static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
 {
 	struct nfs4_state_owner *sp;
 	struct rb_node *pos;
 	struct nfs4_state *state;
-	struct nfs4_lock_state *lock;
 
 	/* Reset all sequence ids to zero */
 	for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
 		sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
-		sp->so_seqid.counter = 0;
 		sp->so_seqid.flags = 0;
 		spin_lock(&sp->so_lock);
 		list_for_each_entry(state, &sp->so_states, open_states) {
-			clear_bit(NFS_DELEGATED_STATE, &state->flags);
-			clear_bit(NFS_O_RDONLY_STATE, &state->flags);
-			clear_bit(NFS_O_WRONLY_STATE, &state->flags);
-			clear_bit(NFS_O_RDWR_STATE, &state->flags);
-			list_for_each_entry(lock, &state->lock_states, ls_locks) {
-				lock->ls_seqid.counter = 0;
-				lock->ls_seqid.flags = 0;
-				lock->ls_flags &= ~NFS_LOCK_INITIALIZED;
-			}
+			if (mark_reclaim(clp, state))
+				nfs4_clear_open_state(state);
 		}
 		spin_unlock(&sp->so_lock);
 	}
 }
 
-static int reclaimer(void *ptr)
+static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
+{
+	/* Mark all delegations for reclaim */
+	nfs_delegation_mark_reclaim(clp);
+	nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot);
+}
+
+static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
 {
-	struct nfs_client *clp = ptr;
 	struct nfs4_state_owner *sp;
 	struct rb_node *pos;
-	struct nfs4_state_recovery_ops *ops;
-	struct rpc_cred *cred;
+	struct nfs4_state *state;
+
+	if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
+		return;
+
+	for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
+		sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
+		spin_lock(&sp->so_lock);
+		list_for_each_entry(state, &sp->so_states, open_states) {
+			if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags))
+				continue;
+			nfs4_state_mark_reclaim_nograce(clp, state);
+		}
+		spin_unlock(&sp->so_lock);
+	}
+
+	nfs_delegation_reap_unclaimed(clp);
+}
+
+static void nfs_delegation_clear_all(struct nfs_client *clp)
+{
+	nfs_delegation_mark_reclaim(clp);
+	nfs_delegation_reap_unclaimed(clp);
+}
+
+static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
+{
+	nfs_delegation_clear_all(clp);
+	nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
+}
+
+static void nfs4_state_end_reclaim_nograce(struct nfs_client *clp)
+{
+	clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
+}
+
+static void nfs4_recovery_handle_error(struct nfs_client *clp, int error)
+{
+	switch (error) {
+		case -NFS4ERR_CB_PATH_DOWN:
+			nfs_handle_cb_pathdown(clp);
+			break;
+		case -NFS4ERR_STALE_CLIENTID:
+		case -NFS4ERR_LEASE_MOVED:
+			set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+			nfs4_state_start_reclaim_reboot(clp);
+			break;
+		case -NFS4ERR_EXPIRED:
+			set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+			nfs4_state_start_reclaim_nograce(clp);
+	}
+}
+
+static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops)
+{
+	struct rb_node *pos;
 	int status = 0;
 
-	allow_signal(SIGKILL);
+restart:
+	spin_lock(&clp->cl_lock);
+	for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
+		struct nfs4_state_owner *sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
+		if (!test_and_clear_bit(ops->owner_flag_bit, &sp->so_flags))
+			continue;
+		atomic_inc(&sp->so_count);
+		spin_unlock(&clp->cl_lock);
+		status = nfs4_reclaim_open_state(sp, ops);
+		if (status < 0) {
+			set_bit(ops->owner_flag_bit, &sp->so_flags);
+			nfs4_put_state_owner(sp);
+			nfs4_recovery_handle_error(clp, status);
+			return status;
+		}
+		nfs4_put_state_owner(sp);
+		goto restart;
+	}
+	spin_unlock(&clp->cl_lock);
+	return status;
+}
 
-	/* Ensure exclusive access to NFSv4 state */
-	down_write(&clp->cl_sem);
-	/* Are there any NFS mounts out there? */
-	if (list_empty(&clp->cl_superblocks))
-		goto out;
-restart_loop:
-	ops = &nfs4_network_partition_recovery_ops;
-	/* Are there any open files on this volume? */
+static int nfs4_check_lease(struct nfs_client *clp)
+{
+	struct rpc_cred *cred;
+	int status = -NFS4ERR_EXPIRED;
+
+	/* Is the client already known to have an expired lease? */
+	if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
+		return 0;
 	cred = nfs4_get_renew_cred(clp);
-	if (cred != NULL) {
-		/* Yes there are: try to renew the old lease */
-		status = nfs4_proc_renew(clp, cred);
-		put_rpccred(cred);
-		switch (status) {
-			case 0:
-			case -NFS4ERR_CB_PATH_DOWN:
-				goto out;
-			case -NFS4ERR_STALE_CLIENTID:
-			case -NFS4ERR_LEASE_MOVED:
-				ops = &nfs4_reboot_recovery_ops;
-		}
-	} else {
-		/* "reboot" to ensure we clear all state on the server */
-		clp->cl_boot_time = CURRENT_TIME;
+	if (cred == NULL) {
+		cred = nfs4_get_setclientid_cred(clp);
+		if (cred == NULL)
+			goto out;
 	}
-	/* We're going to have to re-establish a clientid */
-	nfs4_state_mark_reclaim(clp);
-	status = -ENOENT;
+	status = nfs4_proc_renew(clp, cred);
+	put_rpccred(cred);
out:
+	nfs4_recovery_handle_error(clp, status);
+	return status;
+}
+
+static int nfs4_reclaim_lease(struct nfs_client *clp)
+{
+	struct rpc_cred *cred;
+	int status = -ENOENT;
+
 	cred = nfs4_get_setclientid_cred(clp);
 	if (cred != NULL) {
 		status = nfs4_init_client(clp, cred);
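
nfs4_check_lease() replaces the old inline renewal probe: it issues a RENEW-like call with whatever credential it can find and funnels the result through one shared handler that translates protocol errors into recovery work bits. A minimal model of that probe-and-dispatch shape; proc_renew() is a stand-in, while NFS4ERR_STALE_CLIENTID's value is the real protocol constant used illustratively:

    #include <stdio.h>

    #define NFS4ERR_STALE_CLIENTID 10022

    static int proc_renew(void) { return -NFS4ERR_STALE_CLIENTID; }

    static void recovery_handle_error(int error)
    {
            if (error == -NFS4ERR_STALE_CLIENTID)
                    puts("lease lost: set LEASE_EXPIRED, start reboot reclaim");
    }

    int main(void)
    {
            int status = proc_renew();
            recovery_handle_error(status);
            return status ? 1 : 0;
    }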
@@ -974,42 +1099,90 @@ restart_loop:
 		/* Handle case where the user hasn't set up machine creds */
 		if (status == -EACCES && cred == clp->cl_machine_cred) {
 			nfs4_clear_machine_cred(clp);
-			goto restart_loop;
+			status = -EAGAIN;
 		}
 	}
-	if (status)
-		goto out_error;
-	/* Mark all delegations for reclaim */
-	nfs_delegation_mark_reclaim(clp);
-	/* Note: list is protected by exclusive lock on cl->cl_sem */
-	for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
-		sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
-		status = nfs4_reclaim_open_state(ops, sp);
-		if (status < 0) {
-			if (status == -NFS4ERR_NO_GRACE) {
-				ops = &nfs4_network_partition_recovery_ops;
-				status = nfs4_reclaim_open_state(ops, sp);
+	return status;
+}
+
+static void nfs4_state_manager(struct nfs_client *clp)
+{
+	int status = 0;
+
+	/* Ensure exclusive access to NFSv4 state */
+	for(;;) {
+		if (test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) {
+			/* We're going to have to re-establish a clientid */
+			status = nfs4_reclaim_lease(clp);
+			if (status) {
+				set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+				if (status == -EAGAIN)
+					continue;
+				goto out_error;
 			}
+			clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
+		}
+
+		if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
+			status = nfs4_check_lease(clp);
+			if (status != 0)
+				continue;
+		}
+
+		/* First recover reboot state... */
+		if (test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) {
+			status = nfs4_do_reclaim(clp, &nfs4_reboot_recovery_ops);
 			if (status == -NFS4ERR_STALE_CLIENTID)
-				goto restart_loop;
-			if (status == -NFS4ERR_EXPIRED)
-				goto restart_loop;
+				continue;
+			nfs4_state_end_reclaim_reboot(clp);
+			continue;
+		}
+
+		/* Now recover expired state... */
+		if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) {
+			status = nfs4_do_reclaim(clp, &nfs4_nograce_recovery_ops);
+			if (status < 0) {
+				set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
+				if (status == -NFS4ERR_STALE_CLIENTID)
+					continue;
+				if (status == -NFS4ERR_EXPIRED)
+					continue;
+				goto out_error;
+			} else
+				nfs4_state_end_reclaim_nograce(clp);
+			continue;
 		}
+
+		if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) {
+			nfs_client_return_marked_delegations(clp);
+			continue;
+		}
+
+		nfs4_clear_state_manager_bit(clp);
+		/* Did we race with an attempt to give us more work? */
+		if (clp->cl_state == 0)
+			break;
+		if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
+			break;
 	}
-	nfs_delegation_reap_unclaimed(clp);
-out:
-	up_write(&clp->cl_sem);
-	if (status == -NFS4ERR_CB_PATH_DOWN)
-		nfs_handle_cb_pathdown(clp);
-	nfs4_clear_recover_bit(clp);
+	return;
+out_error:
+	printk(KERN_WARNING "Error: state manager failed on NFSv4 server %s"
+			" with error %d\n", clp->cl_hostname, -status);
+	if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
+		nfs4_state_end_reclaim_reboot(clp);
+	nfs4_clear_state_manager_bit(clp);
+}
+
+static int nfs4_run_state_manager(void *ptr)
+{
+	struct nfs_client *clp = ptr;
+
+	allow_signal(SIGKILL);
+	nfs4_state_manager(clp);
 	nfs_put_client(clp);
 	module_put_and_exit(0);
 	return 0;
-out_error:
-	printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %s"
-			" with error %d\n", clp->cl_hostname, -status);
-	set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
-	goto out;
 }
 
 /*
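
The monolithic reclaimer thus becomes a state manager loop: each pass consumes at most one work bit, restarts from the top after every step (so lease recovery always precedes reboot reclaim, which precedes no-grace reclaim), and exits only when no bits remain, rechecking once after clearing its running bit to close the race with late work. A userspace model of the dispatch order:

    #include <stdio.h>

    enum {
            LEASE_EXPIRED   = 1 << 0,
            CHECK_LEASE     = 1 << 1,
            RECLAIM_REBOOT  = 1 << 2,
            RECLAIM_NOGRACE = 1 << 3,
    };

    static unsigned int cl_state = LEASE_EXPIRED | RECLAIM_NOGRACE;

    int main(void)
    {
            for (;;) {
                    if (cl_state & LEASE_EXPIRED) {
                            cl_state &= ~(LEASE_EXPIRED | CHECK_LEASE);
                            puts("re-establish clientid; queue reboot reclaim");
                            cl_state |= RECLAIM_REBOOT;
                            continue;
                    }
                    if (cl_state & RECLAIM_REBOOT) {
                            cl_state &= ~RECLAIM_REBOOT;
                            puts("reclaim reboot state");
                            continue;
                    }
                    if (cl_state & RECLAIM_NOGRACE) {
                            cl_state &= ~RECLAIM_NOGRACE;
                            puts("reclaim expired state");
                            continue;
                    }
                    if (cl_state == 0)
                            break;	/* raced-in work would loop again */
            }
            return 0;
    }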
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index b916297d2334..d1e4c8f8a0a9 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -8,7 +8,7 @@
  *
  *  Kendrick Smith <kmsmith@umich.edu>
  *  Andy Adamson   <andros@umich.edu>
  *
  *  Redistribution and use in source and binary forms, with or without
  *  modification, are permitted provided that the following conditions
  *  are met:
@@ -67,7 +67,7 @@ static int nfs4_stat_to_errno(int);
 #define NFS4_MAXTAGLEN		0
 #endif
 
 /* lock,open owner id:
  * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2)
  */
 #define open_owner_id_maxsz	(1 + 4)
@@ -541,6 +541,7 @@ static struct {
 struct compound_hdr {
 	int32_t		status;
 	uint32_t	nops;
+	__be32 *	nops_p;
 	uint32_t	taglen;
 	char *		tag;
 };
@@ -578,7 +579,7 @@ static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *
578 xdr_encode_opaque(p, str, len); 579 xdr_encode_opaque(p, str, len);
579} 580}
580 581
581static int encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) 582static void encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
582{ 583{
583 __be32 *p; 584 __be32 *p;
584 585
@@ -588,8 +589,13 @@ static int encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
588 WRITE32(hdr->taglen); 589 WRITE32(hdr->taglen);
589 WRITEMEM(hdr->tag, hdr->taglen); 590 WRITEMEM(hdr->tag, hdr->taglen);
590 WRITE32(NFS4_MINOR_VERSION); 591 WRITE32(NFS4_MINOR_VERSION);
592 hdr->nops_p = p;
591 WRITE32(hdr->nops); 593 WRITE32(hdr->nops);
592 return 0; 594}
595
596static void encode_nops(struct compound_hdr *hdr)
597{
598 *hdr->nops_p = htonl(hdr->nops);
593} 599}
594 600
595static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf) 601static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf)
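The new nops_p field exists so the COMPOUND header can be emitted before the operation count is known: encode_compound_hdr() writes a placeholder and remembers its address, each encode_* helper now bumps hdr->nops as it emits an operation, and encode_nops() backfills the real count at the end. A standalone toy model of that backfill (the flat buffer here is invented for illustration; the real code writes through an xdr_stream):

#include <arpa/inet.h>	/* htonl(), ntohl() */
#include <stdint.h>
#include <stdio.h>

struct toy_hdr {
	uint32_t nops;		/* running operation count */
	uint32_t *nops_p;	/* where the placeholder word lives */
};

static uint32_t buf[16];
static uint32_t *next_word = buf;

static void encode_hdr(struct toy_hdr *hdr)
{
	hdr->nops_p = next_word;		/* remember the slot...   */
	*next_word++ = htonl(hdr->nops);	/* ...and write 0 for now */
}

static void encode_op(struct toy_hdr *hdr, uint32_t opcode)
{
	*next_word++ = htonl(opcode);
	hdr->nops++;				/* count as we emit */
}

static void encode_nops(struct toy_hdr *hdr)
{
	*hdr->nops_p = htonl(hdr->nops);	/* backfill the real count */
}

int main(void)
{
	struct toy_hdr hdr = { .nops = 0 };

	encode_hdr(&hdr);
	encode_op(&hdr, 22);	/* OP_PUTFH */
	encode_op(&hdr, 9);	/* OP_GETATTR */
	encode_nops(&hdr);
	printf("nops on the wire: %u\n", ntohl(buf[0]));	/* prints 2 */
	return 0;
}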
@@ -601,7 +607,7 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve
601 xdr_encode_opaque_fixed(p, verf->data, NFS4_VERIFIER_SIZE); 607 xdr_encode_opaque_fixed(p, verf->data, NFS4_VERIFIER_SIZE);
602} 608}
603 609
604static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server) 610static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server)
605{ 611{
606 char owner_name[IDMAP_NAMESZ]; 612 char owner_name[IDMAP_NAMESZ];
607 char owner_group[IDMAP_NAMESZ]; 613 char owner_group[IDMAP_NAMESZ];
@@ -612,7 +618,6 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
612 int len; 618 int len;
613 uint32_t bmval0 = 0; 619 uint32_t bmval0 = 0;
614 uint32_t bmval1 = 0; 620 uint32_t bmval1 = 0;
615 int status;
616 621
617 /* 622 /*
618 * We reserve enough space to write the entire attribute buffer at once. 623 * We reserve enough space to write the entire attribute buffer at once.
@@ -709,7 +714,7 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
709 bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; 714 bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
710 WRITE32(NFS4_SET_TO_SERVER_TIME); 715 WRITE32(NFS4_SET_TO_SERVER_TIME);
711 } 716 }
712 717
713 /* 718 /*
714 * Now we backfill the bitmap and the attribute buffer length. 719 * Now we backfill the bitmap and the attribute buffer length.
715 */ 720 */
@@ -723,23 +728,20 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
723 *q++ = htonl(bmval1); 728 *q++ = htonl(bmval1);
724 *q++ = htonl(len); 729 *q++ = htonl(len);
725 730
726 status = 0;
727/* out: */ 731/* out: */
728 return status;
729} 732}
730 733
731static int encode_access(struct xdr_stream *xdr, u32 access) 734static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hdr *hdr)
732{ 735{
733 __be32 *p; 736 __be32 *p;
734 737
735 RESERVE_SPACE(8); 738 RESERVE_SPACE(8);
736 WRITE32(OP_ACCESS); 739 WRITE32(OP_ACCESS);
737 WRITE32(access); 740 WRITE32(access);
738 741 hdr->nops++;
739 return 0;
740} 742}
741 743
742static int encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg) 744static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
743{ 745{
744 __be32 *p; 746 __be32 *p;
745 747
@@ -747,26 +749,24 @@ static int encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg)
747 WRITE32(OP_CLOSE); 749 WRITE32(OP_CLOSE);
748 WRITE32(arg->seqid->sequence->counter); 750 WRITE32(arg->seqid->sequence->counter);
749 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); 751 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
750 752 hdr->nops++;
751 return 0;
752} 753}
753 754
754static int encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args) 755static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr)
755{ 756{
756 __be32 *p; 757 __be32 *p;
 757 758
 758 RESERVE_SPACE(16); 759 RESERVE_SPACE(16);
 759 WRITE32(OP_COMMIT); 760 WRITE32(OP_COMMIT);
 760 WRITE64(args->offset); 761 WRITE64(args->offset);
 761 WRITE32(args->count); 762 WRITE32(args->count);
 762
 763 return 0; 763 hdr->nops++;
764} 764}
765 765
766static int encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create) 766static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create, struct compound_hdr *hdr)
767{ 767{
768 __be32 *p; 768 __be32 *p;
769 769
770 RESERVE_SPACE(8); 770 RESERVE_SPACE(8);
771 WRITE32(OP_CREATE); 771 WRITE32(OP_CREATE);
772 WRITE32(create->ftype); 772 WRITE32(create->ftype);
@@ -791,64 +791,62 @@ static int encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *c
791 RESERVE_SPACE(4 + create->name->len); 791 RESERVE_SPACE(4 + create->name->len);
792 WRITE32(create->name->len); 792 WRITE32(create->name->len);
793 WRITEMEM(create->name->name, create->name->len); 793 WRITEMEM(create->name->name, create->name->len);
794 hdr->nops++;
794 795
795 return encode_attrs(xdr, create->attrs, create->server); 796 encode_attrs(xdr, create->attrs, create->server);
796} 797}
797 798
798static int encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap) 799static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
799{ 800{
800 __be32 *p; 801 __be32 *p;
801 802
802 RESERVE_SPACE(12); 803 RESERVE_SPACE(12);
803 WRITE32(OP_GETATTR); 804 WRITE32(OP_GETATTR);
804 WRITE32(1); 805 WRITE32(1);
805 WRITE32(bitmap); 806 WRITE32(bitmap);
806 return 0; 807 hdr->nops++;
807} 808}
808 809
809static int encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1) 810static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr)
810{ 811{
811 __be32 *p; 812 __be32 *p;
812 813
813 RESERVE_SPACE(16); 814 RESERVE_SPACE(16);
814 WRITE32(OP_GETATTR); 815 WRITE32(OP_GETATTR);
815 WRITE32(2); 816 WRITE32(2);
816 WRITE32(bm0); 817 WRITE32(bm0);
817 WRITE32(bm1); 818 WRITE32(bm1);
818 return 0; 819 hdr->nops++;
819} 820}
820 821
821static int encode_getfattr(struct xdr_stream *xdr, const u32* bitmask) 822static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
822{ 823{
823 return encode_getattr_two(xdr, 824 encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
824 bitmask[0] & nfs4_fattr_bitmap[0], 825 bitmask[1] & nfs4_fattr_bitmap[1], hdr);
825 bitmask[1] & nfs4_fattr_bitmap[1]);
826} 826}
827 827
828static int encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask) 828static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
829{ 829{
830 return encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], 830 encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0],
831 bitmask[1] & nfs4_fsinfo_bitmap[1]); 831 bitmask[1] & nfs4_fsinfo_bitmap[1], hdr);
832} 832}
833 833
834static int encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask) 834static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
835{ 835{
836 return encode_getattr_two(xdr, 836 encode_getattr_two(xdr, bitmask[0] & nfs4_fs_locations_bitmap[0],
837 bitmask[0] & nfs4_fs_locations_bitmap[0], 837 bitmask[1] & nfs4_fs_locations_bitmap[1], hdr);
838 bitmask[1] & nfs4_fs_locations_bitmap[1]);
839} 838}
840 839
841static int encode_getfh(struct xdr_stream *xdr) 840static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
842{ 841{
843 __be32 *p; 842 __be32 *p;
844 843
845 RESERVE_SPACE(4); 844 RESERVE_SPACE(4);
846 WRITE32(OP_GETFH); 845 WRITE32(OP_GETFH);
847 846 hdr->nops++;
848 return 0;
849} 847}
850 848
851static int encode_link(struct xdr_stream *xdr, const struct qstr *name) 849static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
852{ 850{
853 __be32 *p; 851 __be32 *p;
854 852
@@ -856,8 +854,7 @@ static int encode_link(struct xdr_stream *xdr, const struct qstr *name)
856 WRITE32(OP_LINK); 854 WRITE32(OP_LINK);
857 WRITE32(name->len); 855 WRITE32(name->len);
858 WRITEMEM(name->name, name->len); 856 WRITEMEM(name->name, name->len);
859 857 hdr->nops++;
860 return 0;
861} 858}
862 859
863static inline int nfs4_lock_type(struct file_lock *fl, int block) 860static inline int nfs4_lock_type(struct file_lock *fl, int block)
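encode_getfattr(), encode_fsinfo() and encode_fs_locations() above are thin wrappers that intersect the caller's bitmask with a per-request-type supported set before delegating to encode_getattr_two(). A hedged sketch of that masking (the bitmap values are made up; the real nfs4_fattr_bitmap[] constants differ):

#include <stdint.h>
#include <stdio.h>

/* Invented attribute masks; the real FATTR4_WORD* bits differ. */
static const uint32_t fattr_bitmap[2] = { 0x0018091au, 0x00b0a23au };

/* What encode_getattr_two() ultimately emits: a two-word bitmap. */
static void getattr_two(uint32_t bm0, uint32_t bm1)
{
	printf("GETATTR bitmap: %08x:%08x\n", bm0, bm1);
}

/* The wrapper pattern: request only attributes that are both asked
 * for by the caller and meaningful for this request type. */
static void getfattr(const uint32_t *bitmask)
{
	getattr_two(bitmask[0] & fattr_bitmap[0],
		    bitmask[1] & fattr_bitmap[1]);
}

int main(void)
{
	const uint32_t ask_everything[2] = { 0xffffffffu, 0xffffffffu };

	getfattr(ask_everything);	/* only supported bits survive */
	return 0;
}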
@@ -878,7 +875,7 @@ static inline uint64_t nfs4_lock_length(struct file_lock *fl)
878 * opcode,type,reclaim,offset,length,new_lock_owner = 32 875 * opcode,type,reclaim,offset,length,new_lock_owner = 32
879 * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40 876 * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40
880 */ 877 */
881static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args) 878static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args, struct compound_hdr *hdr)
882{ 879{
883 __be32 *p; 880 __be32 *p;
884 881
@@ -904,11 +901,10 @@ static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args)
904 WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE); 901 WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE);
905 WRITE32(args->lock_seqid->sequence->counter); 902 WRITE32(args->lock_seqid->sequence->counter);
906 } 903 }
907 904 hdr->nops++;
908 return 0;
909} 905}
910 906
911static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args) 907static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args, struct compound_hdr *hdr)
912{ 908{
913 __be32 *p; 909 __be32 *p;
914 910
@@ -921,11 +917,10 @@ static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *arg
921 WRITE32(16); 917 WRITE32(16);
922 WRITEMEM("lock id:", 8); 918 WRITEMEM("lock id:", 8);
923 WRITE64(args->lock_owner.id); 919 WRITE64(args->lock_owner.id);
924 920 hdr->nops++;
925 return 0;
926} 921}
927 922
928static int encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args) 923static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args, struct compound_hdr *hdr)
929{ 924{
930 __be32 *p; 925 __be32 *p;
931 926
@@ -936,11 +931,10 @@ static int encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *arg
936 WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE); 931 WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE);
937 WRITE64(args->fl->fl_start); 932 WRITE64(args->fl->fl_start);
938 WRITE64(nfs4_lock_length(args->fl)); 933 WRITE64(nfs4_lock_length(args->fl));
939 934 hdr->nops++;
940 return 0;
941} 935}
942 936
943static int encode_lookup(struct xdr_stream *xdr, const struct qstr *name) 937static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
944{ 938{
945 int len = name->len; 939 int len = name->len;
946 __be32 *p; 940 __be32 *p;
@@ -949,27 +943,26 @@ static int encode_lookup(struct xdr_stream *xdr, const struct qstr *name)
949 WRITE32(OP_LOOKUP); 943 WRITE32(OP_LOOKUP);
950 WRITE32(len); 944 WRITE32(len);
951 WRITEMEM(name->name, len); 945 WRITEMEM(name->name, len);
952 946 hdr->nops++;
953 return 0;
954} 947}
955 948
956static void encode_share_access(struct xdr_stream *xdr, int open_flags) 949static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
957{ 950{
958 __be32 *p; 951 __be32 *p;
959 952
960 RESERVE_SPACE(8); 953 RESERVE_SPACE(8);
961 switch (open_flags & (FMODE_READ|FMODE_WRITE)) { 954 switch (fmode & (FMODE_READ|FMODE_WRITE)) {
962 case FMODE_READ: 955 case FMODE_READ:
963 WRITE32(NFS4_SHARE_ACCESS_READ); 956 WRITE32(NFS4_SHARE_ACCESS_READ);
964 break; 957 break;
965 case FMODE_WRITE: 958 case FMODE_WRITE:
966 WRITE32(NFS4_SHARE_ACCESS_WRITE); 959 WRITE32(NFS4_SHARE_ACCESS_WRITE);
967 break; 960 break;
968 case FMODE_READ|FMODE_WRITE: 961 case FMODE_READ|FMODE_WRITE:
969 WRITE32(NFS4_SHARE_ACCESS_BOTH); 962 WRITE32(NFS4_SHARE_ACCESS_BOTH);
970 break; 963 break;
971 default: 964 default:
972 BUG(); 965 WRITE32(0);
973 } 966 }
974 WRITE32(0); /* for linux, share_deny = 0 always */ 967 WRITE32(0); /* for linux, share_deny = 0 always */
975} 968}
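Two things change in encode_share_access() above: it now takes an fmode_t, and an impossible mode encodes 0 instead of taking down the client with BUG(). A userspace rendering of the same mapping (the NFS4_SHARE_ACCESS_* values are RFC 3530's share_access constants; the FMODE_* values mirror Linux's fmode_t bits):

#include <stdint.h>
#include <stdio.h>

#define FMODE_READ		0x1u	/* mirrors Linux's fmode_t bits */
#define FMODE_WRITE		0x2u
#define NFS4_SHARE_ACCESS_READ	1u	/* RFC 3530 share_access values */
#define NFS4_SHARE_ACCESS_WRITE	2u
#define NFS4_SHARE_ACCESS_BOTH	3u

static uint32_t share_access(uint32_t fmode)
{
	switch (fmode & (FMODE_READ | FMODE_WRITE)) {
	case FMODE_READ:
		return NFS4_SHARE_ACCESS_READ;
	case FMODE_WRITE:
		return NFS4_SHARE_ACCESS_WRITE;
	case FMODE_READ | FMODE_WRITE:
		return NFS4_SHARE_ACCESS_BOTH;
	default:
		return 0;	/* as in the diff: encode 0, don't BUG() */
	}
}

int main(void)
{
	printf("%u %u %u %u\n",
	       share_access(FMODE_READ),
	       share_access(FMODE_WRITE),
	       share_access(FMODE_READ | FMODE_WRITE),
	       share_access(0));	/* 1 2 3 0 */
	return 0;
}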
@@ -984,7 +977,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
984 RESERVE_SPACE(8); 977 RESERVE_SPACE(8);
985 WRITE32(OP_OPEN); 978 WRITE32(OP_OPEN);
986 WRITE32(arg->seqid->sequence->counter); 979 WRITE32(arg->seqid->sequence->counter);
987 encode_share_access(xdr, arg->open_flags); 980 encode_share_access(xdr, arg->fmode);
988 RESERVE_SPACE(28); 981 RESERVE_SPACE(28);
989 WRITE64(arg->clientid); 982 WRITE64(arg->clientid);
990 WRITE32(16); 983 WRITE32(16);
@@ -998,13 +991,13 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
998 991
999 RESERVE_SPACE(4); 992 RESERVE_SPACE(4);
1000 switch(arg->open_flags & O_EXCL) { 993 switch(arg->open_flags & O_EXCL) {
1001 case 0: 994 case 0:
1002 WRITE32(NFS4_CREATE_UNCHECKED); 995 WRITE32(NFS4_CREATE_UNCHECKED);
1003 encode_attrs(xdr, arg->u.attrs, arg->server); 996 encode_attrs(xdr, arg->u.attrs, arg->server);
1004 break; 997 break;
1005 default: 998 default:
1006 WRITE32(NFS4_CREATE_EXCLUSIVE); 999 WRITE32(NFS4_CREATE_EXCLUSIVE);
1007 encode_nfs4_verifier(xdr, &arg->u.verifier); 1000 encode_nfs4_verifier(xdr, &arg->u.verifier);
1008 } 1001 }
1009} 1002}
1010 1003
@@ -1014,33 +1007,33 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a
1014 1007
1015 RESERVE_SPACE(4); 1008 RESERVE_SPACE(4);
1016 switch (arg->open_flags & O_CREAT) { 1009 switch (arg->open_flags & O_CREAT) {
1017 case 0: 1010 case 0:
1018 WRITE32(NFS4_OPEN_NOCREATE); 1011 WRITE32(NFS4_OPEN_NOCREATE);
1019 break; 1012 break;
1020 default: 1013 default:
1021 BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL); 1014 BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL);
1022 WRITE32(NFS4_OPEN_CREATE); 1015 WRITE32(NFS4_OPEN_CREATE);
1023 encode_createmode(xdr, arg); 1016 encode_createmode(xdr, arg);
1024 } 1017 }
1025} 1018}
1026 1019
1027static inline void encode_delegation_type(struct xdr_stream *xdr, int delegation_type) 1020static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delegation_type)
1028{ 1021{
1029 __be32 *p; 1022 __be32 *p;
1030 1023
1031 RESERVE_SPACE(4); 1024 RESERVE_SPACE(4);
1032 switch (delegation_type) { 1025 switch (delegation_type) {
1033 case 0: 1026 case 0:
1034 WRITE32(NFS4_OPEN_DELEGATE_NONE); 1027 WRITE32(NFS4_OPEN_DELEGATE_NONE);
1035 break; 1028 break;
1036 case FMODE_READ: 1029 case FMODE_READ:
1037 WRITE32(NFS4_OPEN_DELEGATE_READ); 1030 WRITE32(NFS4_OPEN_DELEGATE_READ);
1038 break; 1031 break;
1039 case FMODE_WRITE|FMODE_READ: 1032 case FMODE_WRITE|FMODE_READ:
1040 WRITE32(NFS4_OPEN_DELEGATE_WRITE); 1033 WRITE32(NFS4_OPEN_DELEGATE_WRITE);
1041 break; 1034 break;
1042 default: 1035 default:
1043 BUG(); 1036 BUG();
1044 } 1037 }
1045} 1038}
1046 1039
@@ -1053,7 +1046,7 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *
1053 encode_string(xdr, name->len, name->name); 1046 encode_string(xdr, name->len, name->name);
1054} 1047}
1055 1048
1056static inline void encode_claim_previous(struct xdr_stream *xdr, int type) 1049static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type)
1057{ 1050{
1058 __be32 *p; 1051 __be32 *p;
1059 1052
@@ -1072,27 +1065,27 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc
1072 encode_string(xdr, name->len, name->name); 1065 encode_string(xdr, name->len, name->name);
1073} 1066}
1074 1067
1075static int encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg) 1068static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, struct compound_hdr *hdr)
1076{ 1069{
1077 encode_openhdr(xdr, arg); 1070 encode_openhdr(xdr, arg);
1078 encode_opentype(xdr, arg); 1071 encode_opentype(xdr, arg);
1079 switch (arg->claim) { 1072 switch (arg->claim) {
1080 case NFS4_OPEN_CLAIM_NULL: 1073 case NFS4_OPEN_CLAIM_NULL:
1081 encode_claim_null(xdr, arg->name); 1074 encode_claim_null(xdr, arg->name);
1082 break; 1075 break;
1083 case NFS4_OPEN_CLAIM_PREVIOUS: 1076 case NFS4_OPEN_CLAIM_PREVIOUS:
1084 encode_claim_previous(xdr, arg->u.delegation_type); 1077 encode_claim_previous(xdr, arg->u.delegation_type);
1085 break; 1078 break;
1086 case NFS4_OPEN_CLAIM_DELEGATE_CUR: 1079 case NFS4_OPEN_CLAIM_DELEGATE_CUR:
1087 encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation); 1080 encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation);
1088 break; 1081 break;
1089 default: 1082 default:
1090 BUG(); 1083 BUG();
1091 } 1084 }
1092 return 0; 1085 hdr->nops++;
1093} 1086}
1094 1087
1095static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg) 1088static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg, struct compound_hdr *hdr)
1096{ 1089{
1097 __be32 *p; 1090 __be32 *p;
1098 1091
@@ -1100,11 +1093,10 @@ static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_con
1100 WRITE32(OP_OPEN_CONFIRM); 1093 WRITE32(OP_OPEN_CONFIRM);
1101 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); 1094 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
1102 WRITE32(arg->seqid->sequence->counter); 1095 WRITE32(arg->seqid->sequence->counter);
1103 1096 hdr->nops++;
1104 return 0;
1105} 1097}
1106 1098
1107static int encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg) 1099static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
1108{ 1100{
1109 __be32 *p; 1101 __be32 *p;
1110 1102
@@ -1112,12 +1104,12 @@ static int encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closea
1112 WRITE32(OP_OPEN_DOWNGRADE); 1104 WRITE32(OP_OPEN_DOWNGRADE);
1113 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); 1105 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
1114 WRITE32(arg->seqid->sequence->counter); 1106 WRITE32(arg->seqid->sequence->counter);
1115 encode_share_access(xdr, arg->open_flags); 1107 encode_share_access(xdr, arg->fmode);
1116 return 0; 1108 hdr->nops++;
1117} 1109}
1118 1110
1119static int 1111static void
1120encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh) 1112encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hdr *hdr)
1121{ 1113{
1122 int len = fh->size; 1114 int len = fh->size;
1123 __be32 *p; 1115 __be32 *p;
@@ -1126,18 +1118,16 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh)
1126 WRITE32(OP_PUTFH); 1118 WRITE32(OP_PUTFH);
1127 WRITE32(len); 1119 WRITE32(len);
1128 WRITEMEM(fh->data, len); 1120 WRITEMEM(fh->data, len);
1129 1121 hdr->nops++;
1130 return 0;
1131} 1122}
1132 1123
1133static int encode_putrootfh(struct xdr_stream *xdr) 1124static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1134{ 1125{
1135 __be32 *p; 1126 __be32 *p;
1136 1127
1137 RESERVE_SPACE(4); 1128 RESERVE_SPACE(4);
1138 WRITE32(OP_PUTROOTFH); 1129 WRITE32(OP_PUTROOTFH);
1139
1140 return 0; 1130 hdr->nops++;
1141} 1131}
1142 1132
1143static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx) 1133static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx)
@@ -1153,7 +1143,7 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
1153 WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE); 1143 WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE);
1154} 1144}
1155 1145
1156static int encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args) 1146static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr)
1157{ 1147{
1158 __be32 *p; 1148 __be32 *p;
1159 1149
@@ -1165,11 +1155,10 @@ static int encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args)
1165 RESERVE_SPACE(12); 1155 RESERVE_SPACE(12);
1166 WRITE64(args->offset); 1156 WRITE64(args->offset);
1167 WRITE32(args->count); 1157 WRITE32(args->count);
1168 1158 hdr->nops++;
1169 return 0;
1170} 1159}
1171 1160
1172static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req) 1161static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
1173{ 1162{
1174 uint32_t attrs[2] = { 1163 uint32_t attrs[2] = {
1175 FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID, 1164 FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID,
@@ -1191,6 +1180,7 @@ static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
1191 attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; 1180 attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
1192 WRITE32(attrs[0] & readdir->bitmask[0]); 1181 WRITE32(attrs[0] & readdir->bitmask[0]);
1193 WRITE32(attrs[1] & readdir->bitmask[1]); 1182 WRITE32(attrs[1] & readdir->bitmask[1]);
1183 hdr->nops++;
1194 dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", 1184 dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
1195 __func__, 1185 __func__,
1196 (unsigned long long)readdir->cookie, 1186 (unsigned long long)readdir->cookie,
@@ -1198,21 +1188,18 @@ static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
1198 ((u32 *)readdir->verifier.data)[1], 1188 ((u32 *)readdir->verifier.data)[1],
1199 attrs[0] & readdir->bitmask[0], 1189 attrs[0] & readdir->bitmask[0],
1200 attrs[1] & readdir->bitmask[1]); 1190 attrs[1] & readdir->bitmask[1]);
1201
1202 return 0;
1203} 1191}
1204 1192
1205static int encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req) 1193static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr)
1206{ 1194{
1207 __be32 *p; 1195 __be32 *p;
1208 1196
1209 RESERVE_SPACE(4); 1197 RESERVE_SPACE(4);
1210 WRITE32(OP_READLINK); 1198 WRITE32(OP_READLINK);
1211 1199 hdr->nops++;
1212 return 0;
1213} 1200}
1214 1201
1215static int encode_remove(struct xdr_stream *xdr, const struct qstr *name) 1202static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
1216{ 1203{
1217 __be32 *p; 1204 __be32 *p;
1218 1205
@@ -1220,11 +1207,10 @@ static int encode_remove(struct xdr_stream *xdr, const struct qstr *name)
1220 WRITE32(OP_REMOVE); 1207 WRITE32(OP_REMOVE);
1221 WRITE32(name->len); 1208 WRITE32(name->len);
1222 WRITEMEM(name->name, name->len); 1209 WRITEMEM(name->name, name->len);
1223 1210 hdr->nops++;
1224 return 0;
1225} 1211}
1226 1212
1227static int encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname) 1213static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname, struct compound_hdr *hdr)
1228{ 1214{
1229 __be32 *p; 1215 __be32 *p;
1230 1216
@@ -1232,38 +1218,35 @@ static int encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, con
1232 WRITE32(OP_RENAME); 1218 WRITE32(OP_RENAME);
1233 WRITE32(oldname->len); 1219 WRITE32(oldname->len);
1234 WRITEMEM(oldname->name, oldname->len); 1220 WRITEMEM(oldname->name, oldname->len);
1235 1221
1236 RESERVE_SPACE(4 + newname->len); 1222 RESERVE_SPACE(4 + newname->len);
1237 WRITE32(newname->len); 1223 WRITE32(newname->len);
1238 WRITEMEM(newname->name, newname->len); 1224 WRITEMEM(newname->name, newname->len);
1239 1225 hdr->nops++;
1240 return 0;
1241} 1226}
1242 1227
1243static int encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid) 1228static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid, struct compound_hdr *hdr)
1244{ 1229{
1245 __be32 *p; 1230 __be32 *p;
1246 1231
1247 RESERVE_SPACE(12); 1232 RESERVE_SPACE(12);
1248 WRITE32(OP_RENEW); 1233 WRITE32(OP_RENEW);
1249 WRITE64(client_stateid->cl_clientid); 1234 WRITE64(client_stateid->cl_clientid);
1250 1235 hdr->nops++;
1251 return 0;
1252} 1236}
1253 1237
1254static int 1238static void
1255encode_restorefh(struct xdr_stream *xdr) 1239encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1256{ 1240{
1257 __be32 *p; 1241 __be32 *p;
1258 1242
1259 RESERVE_SPACE(4); 1243 RESERVE_SPACE(4);
1260 WRITE32(OP_RESTOREFH); 1244 WRITE32(OP_RESTOREFH);
1261 1245 hdr->nops++;
1262 return 0;
1263} 1246}
1264 1247
1265static int 1248static int
1266encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg) 1249encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compound_hdr *hdr)
1267{ 1250{
1268 __be32 *p; 1251 __be32 *p;
1269 1252
@@ -1278,36 +1261,32 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg)
1278 RESERVE_SPACE(4); 1261 RESERVE_SPACE(4);
1279 WRITE32(arg->acl_len); 1262 WRITE32(arg->acl_len);
1280 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); 1263 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
1264 hdr->nops++;
1281 return 0; 1265 return 0;
1282} 1266}
1283 1267
1284static int 1268static void
1285encode_savefh(struct xdr_stream *xdr) 1269encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1286{ 1270{
1287 __be32 *p; 1271 __be32 *p;
1288 1272
1289 RESERVE_SPACE(4); 1273 RESERVE_SPACE(4);
1290 WRITE32(OP_SAVEFH); 1274 WRITE32(OP_SAVEFH);
1291 1275 hdr->nops++;
1292 return 0;
1293} 1276}
1294 1277
1295static int encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server) 1278static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server, struct compound_hdr *hdr)
1296{ 1279{
1297 int status;
1298 __be32 *p; 1280 __be32 *p;
1299 1281
1300 RESERVE_SPACE(4+NFS4_STATEID_SIZE); 1282 RESERVE_SPACE(4+NFS4_STATEID_SIZE);
1301 WRITE32(OP_SETATTR); 1283 WRITE32(OP_SETATTR);
1302 WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE); 1284 WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE);
1303 1285 hdr->nops++;
1304 if ((status = encode_attrs(xdr, arg->iap, server))) 1286 encode_attrs(xdr, arg->iap, server);
1305 return status;
1306
1307 return 0;
1308} 1287}
1309 1288
1310static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid) 1289static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
1311{ 1290{
1312 __be32 *p; 1291 __be32 *p;
1313 1292
@@ -1322,23 +1301,21 @@ static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclien
1322 encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); 1301 encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
1323 RESERVE_SPACE(4); 1302 RESERVE_SPACE(4);
1324 WRITE32(setclientid->sc_cb_ident); 1303 WRITE32(setclientid->sc_cb_ident);
1325 1304 hdr->nops++;
1326 return 0;
1327} 1305}
1328 1306
1329static int encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state) 1307static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state, struct compound_hdr *hdr)
1330{ 1308{
1331 __be32 *p; 1309 __be32 *p;
1332 1310
1333 RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE); 1311 RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE);
1334 WRITE32(OP_SETCLIENTID_CONFIRM); 1312 WRITE32(OP_SETCLIENTID_CONFIRM);
1335 WRITE64(client_state->cl_clientid); 1313 WRITE64(client_state->cl_clientid);
1336 WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); 1314 WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
1337
1338 return 0; 1315 hdr->nops++;
1339} 1316}
1340 1317
1341static int encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args) 1318static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr)
1342{ 1319{
1343 __be32 *p; 1320 __be32 *p;
1344 1321
@@ -1353,11 +1330,10 @@ static int encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args
1353 WRITE32(args->count); 1330 WRITE32(args->count);
1354 1331
1355 xdr_write_pages(xdr, args->pages, args->pgbase, args->count); 1332 xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
1356 1333 hdr->nops++;
1357 return 0;
1358} 1334}
1359 1335
1360static int encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid) 1336static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid, struct compound_hdr *hdr)
1361{ 1337{
1362 __be32 *p; 1338 __be32 *p;
1363 1339
@@ -1365,8 +1341,7 @@ static int encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *statei
1365 1341
1366 WRITE32(OP_DELEGRETURN); 1342 WRITE32(OP_DELEGRETURN);
1367 WRITEMEM(stateid->data, NFS4_STATEID_SIZE); 1343 WRITEMEM(stateid->data, NFS4_STATEID_SIZE);
1368 return 0; 1344 hdr->nops++;
1369
1370} 1345}
1371/* 1346/*
1372 * END OF "GENERIC" ENCODE ROUTINES. 1347 * END OF "GENERIC" ENCODE ROUTINES.
@@ -1379,21 +1354,16 @@ static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs
1379{ 1354{
1380 struct xdr_stream xdr; 1355 struct xdr_stream xdr;
1381 struct compound_hdr hdr = { 1356 struct compound_hdr hdr = {
1382 .nops = 3, 1357 .nops = 0,
1383 }; 1358 };
1384 int status;
1385 1359
1386 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1360 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1387 encode_compound_hdr(&xdr, &hdr); 1361 encode_compound_hdr(&xdr, &hdr);
1388 status = encode_putfh(&xdr, args->fh); 1362 encode_putfh(&xdr, args->fh, &hdr);
1389 if (status != 0) 1363 encode_access(&xdr, args->access, &hdr);
1390 goto out; 1364 encode_getfattr(&xdr, args->bitmask, &hdr);
1391 status = encode_access(&xdr, args->access); 1365 encode_nops(&hdr);
1392 if (status != 0) 1366 return 0;
1393 goto out;
1394 status = encode_getfattr(&xdr, args->bitmask);
1395out:
1396 return status;
1397} 1367}
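Every request encoder from here on is converted to this same straight-line shape: hdr.nops starts at 0, the void encode_* helpers count operations as a side effect, encode_nops() backfills the count, and the function returns 0 unconditionally. Schematically, reusing the toy encode_hdr()/encode_op()/encode_nops() helpers sketched earlier (opcode numbers per RFC 3530):

/* Shape of a converted encoder (ACCESS = PUTFH + ACCESS + GETATTR),
 * in terms of the toy helpers above. */
static int toy_enc_access(void)
{
	struct toy_hdr hdr = { .nops = 0 };

	encode_hdr(&hdr);	/* writes the placeholder count */
	encode_op(&hdr, 22);	/* OP_PUTFH   */
	encode_op(&hdr, 3);	/* OP_ACCESS  */
	encode_op(&hdr, 9);	/* OP_GETATTR */
	encode_nops(&hdr);	/* backfills 3 over the placeholder */
	return 0;		/* encoding can no longer fail midway */
}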
1398 1368
1399/* 1369/*
@@ -1403,21 +1373,17 @@ static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, __be32 *p, const struct nfs
1403{ 1373{
1404 struct xdr_stream xdr; 1374 struct xdr_stream xdr;
1405 struct compound_hdr hdr = { 1375 struct compound_hdr hdr = {
1406 .nops = 4, 1376 .nops = 0,
1407 }; 1377 };
1408 int status;
1409 1378
1410 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1379 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1411 encode_compound_hdr(&xdr, &hdr); 1380 encode_compound_hdr(&xdr, &hdr);
1412 if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) 1381 encode_putfh(&xdr, args->dir_fh, &hdr);
1413 goto out; 1382 encode_lookup(&xdr, args->name, &hdr);
1414 if ((status = encode_lookup(&xdr, args->name)) != 0) 1383 encode_getfh(&xdr, &hdr);
1415 goto out; 1384 encode_getfattr(&xdr, args->bitmask, &hdr);
1416 if ((status = encode_getfh(&xdr)) != 0) 1385 encode_nops(&hdr);
1417 goto out; 1386 return 0;
1418 status = encode_getfattr(&xdr, args->bitmask);
1419out:
1420 return status;
1421} 1387}
1422 1388
1423/* 1389/*
@@ -1427,18 +1393,16 @@ static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, __be32 *p, const struc
1427{ 1393{
1428 struct xdr_stream xdr; 1394 struct xdr_stream xdr;
1429 struct compound_hdr hdr = { 1395 struct compound_hdr hdr = {
1430 .nops = 3, 1396 .nops = 0,
1431 }; 1397 };
1432 int status;
1433 1398
1434 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1399 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1435 encode_compound_hdr(&xdr, &hdr); 1400 encode_compound_hdr(&xdr, &hdr);
1436 if ((status = encode_putrootfh(&xdr)) != 0) 1401 encode_putrootfh(&xdr, &hdr);
1437 goto out; 1402 encode_getfh(&xdr, &hdr);
1438 if ((status = encode_getfh(&xdr)) == 0) 1403 encode_getfattr(&xdr, args->bitmask, &hdr);
1439 status = encode_getfattr(&xdr, args->bitmask); 1404 encode_nops(&hdr);
1440out: 1405 return 0;
1441 return status;
1442} 1406}
1443 1407
1444/* 1408/*
@@ -1448,19 +1412,16 @@ static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs
1448{ 1412{
1449 struct xdr_stream xdr; 1413 struct xdr_stream xdr;
1450 struct compound_hdr hdr = { 1414 struct compound_hdr hdr = {
1451 .nops = 3, 1415 .nops = 0,
1452 }; 1416 };
1453 int status;
1454 1417
1455 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1418 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1456 encode_compound_hdr(&xdr, &hdr); 1419 encode_compound_hdr(&xdr, &hdr);
1457 if ((status = encode_putfh(&xdr, args->fh)) != 0) 1420 encode_putfh(&xdr, args->fh, &hdr);
1458 goto out; 1421 encode_remove(&xdr, &args->name, &hdr);
1459 if ((status = encode_remove(&xdr, &args->name)) != 0) 1422 encode_getfattr(&xdr, args->bitmask, &hdr);
1460 goto out; 1423 encode_nops(&hdr);
1461 status = encode_getfattr(&xdr, args->bitmask); 1424 return 0;
1462out:
1463 return status;
1464} 1425}
1465 1426
1466/* 1427/*
@@ -1470,27 +1431,20 @@ static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs
1470{ 1431{
1471 struct xdr_stream xdr; 1432 struct xdr_stream xdr;
1472 struct compound_hdr hdr = { 1433 struct compound_hdr hdr = {
1473 .nops = 7, 1434 .nops = 0,
1474 }; 1435 };
1475 int status;
1476 1436
1477 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1437 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1478 encode_compound_hdr(&xdr, &hdr); 1438 encode_compound_hdr(&xdr, &hdr);
1479 if ((status = encode_putfh(&xdr, args->old_dir)) != 0) 1439 encode_putfh(&xdr, args->old_dir, &hdr);
1480 goto out; 1440 encode_savefh(&xdr, &hdr);
1481 if ((status = encode_savefh(&xdr)) != 0) 1441 encode_putfh(&xdr, args->new_dir, &hdr);
1482 goto out; 1442 encode_rename(&xdr, args->old_name, args->new_name, &hdr);
1483 if ((status = encode_putfh(&xdr, args->new_dir)) != 0) 1443 encode_getfattr(&xdr, args->bitmask, &hdr);
1484 goto out; 1444 encode_restorefh(&xdr, &hdr);
1485 if ((status = encode_rename(&xdr, args->old_name, args->new_name)) != 0) 1445 encode_getfattr(&xdr, args->bitmask, &hdr);
1486 goto out; 1446 encode_nops(&hdr);
1487 if ((status = encode_getfattr(&xdr, args->bitmask)) != 0) 1447 return 0;
1488 goto out;
1489 if ((status = encode_restorefh(&xdr)) != 0)
1490 goto out;
1491 status = encode_getfattr(&xdr, args->bitmask);
1492out:
1493 return status;
1494} 1448}
1495 1449
1496/* 1450/*
@@ -1500,27 +1454,20 @@ static int nfs4_xdr_enc_link(struct rpc_rqst *req, __be32 *p, const struct nfs4_
1500{ 1454{
1501 struct xdr_stream xdr; 1455 struct xdr_stream xdr;
1502 struct compound_hdr hdr = { 1456 struct compound_hdr hdr = {
1503 .nops = 7, 1457 .nops = 0,
1504 }; 1458 };
1505 int status;
1506 1459
1507 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1460 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1508 encode_compound_hdr(&xdr, &hdr); 1461 encode_compound_hdr(&xdr, &hdr);
1509 if ((status = encode_putfh(&xdr, args->fh)) != 0) 1462 encode_putfh(&xdr, args->fh, &hdr);
1510 goto out; 1463 encode_savefh(&xdr, &hdr);
1511 if ((status = encode_savefh(&xdr)) != 0) 1464 encode_putfh(&xdr, args->dir_fh, &hdr);
1512 goto out; 1465 encode_link(&xdr, args->name, &hdr);
1513 if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) 1466 encode_getfattr(&xdr, args->bitmask, &hdr);
1514 goto out; 1467 encode_restorefh(&xdr, &hdr);
1515 if ((status = encode_link(&xdr, args->name)) != 0) 1468 encode_getfattr(&xdr, args->bitmask, &hdr);
1516 goto out; 1469 encode_nops(&hdr);
1517 if ((status = encode_getfattr(&xdr, args->bitmask)) != 0) 1470 return 0;
1518 goto out;
1519 if ((status = encode_restorefh(&xdr)) != 0)
1520 goto out;
1521 status = encode_getfattr(&xdr, args->bitmask);
1522out:
1523 return status;
1524} 1471}
1525 1472
1526/* 1473/*
@@ -1530,27 +1477,20 @@ static int nfs4_xdr_enc_create(struct rpc_rqst *req, __be32 *p, const struct nfs
1530{ 1477{
1531 struct xdr_stream xdr; 1478 struct xdr_stream xdr;
1532 struct compound_hdr hdr = { 1479 struct compound_hdr hdr = {
1533 .nops = 7, 1480 .nops = 0,
1534 }; 1481 };
1535 int status;
1536 1482
1537 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1483 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1538 encode_compound_hdr(&xdr, &hdr); 1484 encode_compound_hdr(&xdr, &hdr);
1539 if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) 1485 encode_putfh(&xdr, args->dir_fh, &hdr);
1540 goto out; 1486 encode_savefh(&xdr, &hdr);
1541 if ((status = encode_savefh(&xdr)) != 0) 1487 encode_create(&xdr, args, &hdr);
1542 goto out; 1488 encode_getfh(&xdr, &hdr);
1543 if ((status = encode_create(&xdr, args)) != 0) 1489 encode_getfattr(&xdr, args->bitmask, &hdr);
1544 goto out; 1490 encode_restorefh(&xdr, &hdr);
1545 if ((status = encode_getfh(&xdr)) != 0) 1491 encode_getfattr(&xdr, args->bitmask, &hdr);
1546 goto out; 1492 encode_nops(&hdr);
1547 if ((status = encode_getfattr(&xdr, args->bitmask)) != 0) 1493 return 0;
1548 goto out;
1549 if ((status = encode_restorefh(&xdr)) != 0)
1550 goto out;
1551 status = encode_getfattr(&xdr, args->bitmask);
1552out:
1553 return status;
1554} 1494}
1555 1495
1556/* 1496/*
@@ -1568,15 +1508,15 @@ static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nf
1568{ 1508{
1569 struct xdr_stream xdr; 1509 struct xdr_stream xdr;
1570 struct compound_hdr hdr = { 1510 struct compound_hdr hdr = {
1571 .nops = 2, 1511 .nops = 0,
1572 }; 1512 };
1573 int status;
1574 1513
1575 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1514 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1576 encode_compound_hdr(&xdr, &hdr); 1515 encode_compound_hdr(&xdr, &hdr);
1577 if ((status = encode_putfh(&xdr, args->fh)) == 0) 1516 encode_putfh(&xdr, args->fh, &hdr);
1578 status = encode_getfattr(&xdr, args->bitmask); 1517 encode_getfattr(&xdr, args->bitmask, &hdr);
1579 return status; 1518 encode_nops(&hdr);
1519 return 0;
1580} 1520}
1581 1521
1582/* 1522/*
@@ -1584,23 +1524,18 @@ static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nf
1584 */ 1524 */
1585static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args) 1525static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args)
1586{ 1526{
1587 struct xdr_stream xdr; 1527 struct xdr_stream xdr;
1588 struct compound_hdr hdr = { 1528 struct compound_hdr hdr = {
1589 .nops = 3, 1529 .nops = 0,
1590 }; 1530 };
1591 int status; 1531
1592 1532 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1593 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1533 encode_compound_hdr(&xdr, &hdr);
1594 encode_compound_hdr(&xdr, &hdr); 1534 encode_putfh(&xdr, args->fh, &hdr);
1595 status = encode_putfh(&xdr, args->fh); 1535 encode_close(&xdr, args, &hdr);
1596 if(status) 1536 encode_getfattr(&xdr, args->bitmask, &hdr);
1597 goto out; 1537 encode_nops(&hdr);
1598 status = encode_close(&xdr, args); 1538 return 0;
1599 if (status != 0)
1600 goto out;
1601 status = encode_getfattr(&xdr, args->bitmask);
1602out:
1603 return status;
1604} 1539}
1605 1540
1606/* 1541/*
@@ -1610,33 +1545,20 @@ static int nfs4_xdr_enc_open(struct rpc_rqst *req, __be32 *p, struct nfs_openarg
1610{ 1545{
1611 struct xdr_stream xdr; 1546 struct xdr_stream xdr;
1612 struct compound_hdr hdr = { 1547 struct compound_hdr hdr = {
1613 .nops = 7, 1548 .nops = 0,
1614 }; 1549 };
1615 int status;
1616 1550
1617 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1551 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1618 encode_compound_hdr(&xdr, &hdr); 1552 encode_compound_hdr(&xdr, &hdr);
1619 status = encode_putfh(&xdr, args->fh); 1553 encode_putfh(&xdr, args->fh, &hdr);
1620 if (status) 1554 encode_savefh(&xdr, &hdr);
1621 goto out; 1555 encode_open(&xdr, args, &hdr);
1622 status = encode_savefh(&xdr); 1556 encode_getfh(&xdr, &hdr);
1623 if (status) 1557 encode_getfattr(&xdr, args->bitmask, &hdr);
1624 goto out; 1558 encode_restorefh(&xdr, &hdr);
1625 status = encode_open(&xdr, args); 1559 encode_getfattr(&xdr, args->bitmask, &hdr);
1626 if (status) 1560 encode_nops(&hdr);
1627 goto out; 1561 return 0;
1628 status = encode_getfh(&xdr);
1629 if (status)
1630 goto out;
1631 status = encode_getfattr(&xdr, args->bitmask);
1632 if (status)
1633 goto out;
1634 status = encode_restorefh(&xdr);
1635 if (status)
1636 goto out;
1637 status = encode_getfattr(&xdr, args->bitmask);
1638out:
1639 return status;
1640} 1562}
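The OPEN compound keeps its seven-operation layout; only the hand-maintained .nops = 7 disappears. The sequence is worth spelling out, since the two GETATTRs are not redundant: one describes the newly opened file, the other refreshes the parent directory after RESTOREFH (keeping the client's cached directory attributes coherent; that rationale is an inference from the code, not stated in the patch). A short annotated listing, operation names per RFC 3530:

#include <stdio.h>

int main(void)
{
	static const char *const open_ops[] = {
		"PUTFH(dir)",	/* current fh = parent directory      */
		"SAVEFH",	/* stash the directory fh             */
		"OPEN",		/* current fh becomes the opened file */
		"GETFH",	/* hand the new file handle back      */
		"GETATTR",	/* attributes of the opened file      */
		"RESTOREFH",	/* current fh = the directory again   */
		"GETATTR",	/* pick up the directory's new change attribute */
	};
	for (unsigned int i = 0; i < sizeof(open_ops) / sizeof(open_ops[0]); i++)
		printf("%u: %s\n", i + 1, open_ops[i]);
	return 0;
}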
1641 1563
1642/* 1564/*
@@ -1646,18 +1568,15 @@ static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, __be32 *p, struct nfs
1646{ 1568{
1647 struct xdr_stream xdr; 1569 struct xdr_stream xdr;
1648 struct compound_hdr hdr = { 1570 struct compound_hdr hdr = {
1649 .nops = 2, 1571 .nops = 0,
1650 }; 1572 };
1651 int status;
1652 1573
1653 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1574 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1654 encode_compound_hdr(&xdr, &hdr); 1575 encode_compound_hdr(&xdr, &hdr);
1655 status = encode_putfh(&xdr, args->fh); 1576 encode_putfh(&xdr, args->fh, &hdr);
1656 if(status) 1577 encode_open_confirm(&xdr, args, &hdr);
1657 goto out; 1578 encode_nops(&hdr);
1658 status = encode_open_confirm(&xdr, args); 1579 return 0;
1659out:
1660 return status;
1661} 1580}
1662 1581
1663/* 1582/*
@@ -1667,21 +1586,16 @@ static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, __be32 *p, struct nfs_
1667{ 1586{
1668 struct xdr_stream xdr; 1587 struct xdr_stream xdr;
1669 struct compound_hdr hdr = { 1588 struct compound_hdr hdr = {
1670 .nops = 3, 1589 .nops = 0,
1671 }; 1590 };
1672 int status;
1673 1591
1674 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1592 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1675 encode_compound_hdr(&xdr, &hdr); 1593 encode_compound_hdr(&xdr, &hdr);
1676 status = encode_putfh(&xdr, args->fh); 1594 encode_putfh(&xdr, args->fh, &hdr);
1677 if (status) 1595 encode_open(&xdr, args, &hdr);
1678 goto out; 1596 encode_getfattr(&xdr, args->bitmask, &hdr);
1679 status = encode_open(&xdr, args); 1597 encode_nops(&hdr);
1680 if (status) 1598 return 0;
1681 goto out;
1682 status = encode_getfattr(&xdr, args->bitmask);
1683out:
1684 return status;
1685} 1599}
1686 1600
1687/* 1601/*
@@ -1691,21 +1605,16 @@ static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, __be32 *p, struct n
1691{ 1605{
1692 struct xdr_stream xdr; 1606 struct xdr_stream xdr;
1693 struct compound_hdr hdr = { 1607 struct compound_hdr hdr = {
1694 .nops = 3, 1608 .nops = 0,
1695 }; 1609 };
1696 int status;
1697 1610
1698 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1611 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1699 encode_compound_hdr(&xdr, &hdr); 1612 encode_compound_hdr(&xdr, &hdr);
1700 status = encode_putfh(&xdr, args->fh); 1613 encode_putfh(&xdr, args->fh, &hdr);
1701 if (status) 1614 encode_open_downgrade(&xdr, args, &hdr);
1702 goto out; 1615 encode_getfattr(&xdr, args->bitmask, &hdr);
1703 status = encode_open_downgrade(&xdr, args); 1616 encode_nops(&hdr);
1704 if (status != 0) 1617 return 0;
1705 goto out;
1706 status = encode_getfattr(&xdr, args->bitmask);
1707out:
1708 return status;
1709} 1618}
1710 1619
1711/* 1620/*
@@ -1715,18 +1624,15 @@ static int nfs4_xdr_enc_lock(struct rpc_rqst *req, __be32 *p, struct nfs_lock_ar
1715{ 1624{
1716 struct xdr_stream xdr; 1625 struct xdr_stream xdr;
1717 struct compound_hdr hdr = { 1626 struct compound_hdr hdr = {
1718 .nops = 2, 1627 .nops = 0,
1719 }; 1628 };
1720 int status;
1721 1629
1722 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1630 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1723 encode_compound_hdr(&xdr, &hdr); 1631 encode_compound_hdr(&xdr, &hdr);
1724 status = encode_putfh(&xdr, args->fh); 1632 encode_putfh(&xdr, args->fh, &hdr);
1725 if(status) 1633 encode_lock(&xdr, args, &hdr);
1726 goto out; 1634 encode_nops(&hdr);
1727 status = encode_lock(&xdr, args); 1635 return 0;
1728out:
1729 return status;
1730} 1636}
1731 1637
1732/* 1638/*
@@ -1736,18 +1642,15 @@ static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, __be32 *p, struct nfs_lockt_
1736{ 1642{
1737 struct xdr_stream xdr; 1643 struct xdr_stream xdr;
1738 struct compound_hdr hdr = { 1644 struct compound_hdr hdr = {
1739 .nops = 2, 1645 .nops = 0,
1740 }; 1646 };
1741 int status;
1742 1647
1743 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1648 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1744 encode_compound_hdr(&xdr, &hdr); 1649 encode_compound_hdr(&xdr, &hdr);
1745 status = encode_putfh(&xdr, args->fh); 1650 encode_putfh(&xdr, args->fh, &hdr);
1746 if(status) 1651 encode_lockt(&xdr, args, &hdr);
1747 goto out; 1652 encode_nops(&hdr);
1748 status = encode_lockt(&xdr, args); 1653 return 0;
1749out:
1750 return status;
1751} 1654}
1752 1655
1753/* 1656/*
@@ -1757,18 +1660,15 @@ static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_
1757{ 1660{
1758 struct xdr_stream xdr; 1661 struct xdr_stream xdr;
1759 struct compound_hdr hdr = { 1662 struct compound_hdr hdr = {
1760 .nops = 2, 1663 .nops = 0,
1761 }; 1664 };
1762 int status;
1763 1665
1764 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1666 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1765 encode_compound_hdr(&xdr, &hdr); 1667 encode_compound_hdr(&xdr, &hdr);
1766 status = encode_putfh(&xdr, args->fh); 1668 encode_putfh(&xdr, args->fh, &hdr);
1767 if(status) 1669 encode_locku(&xdr, args, &hdr);
1768 goto out; 1670 encode_nops(&hdr);
1769 status = encode_locku(&xdr, args); 1671 return 0;
1770out:
1771 return status;
1772} 1672}
1773 1673
1774/* 1674/*
@@ -1778,18 +1678,15 @@ static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct n
1778{ 1678{
1779 struct xdr_stream xdr; 1679 struct xdr_stream xdr;
1780 struct compound_hdr hdr = { 1680 struct compound_hdr hdr = {
1781 .nops = 2, 1681 .nops = 0,
1782 }; 1682 };
1783 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 1683 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
1784 unsigned int replen; 1684 unsigned int replen;
1785 int status;
1786 1685
1787 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1686 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1788 encode_compound_hdr(&xdr, &hdr); 1687 encode_compound_hdr(&xdr, &hdr);
1789 status = encode_putfh(&xdr, args->fh); 1688 encode_putfh(&xdr, args->fh, &hdr);
1790 if(status) 1689 encode_readlink(&xdr, args, req, &hdr);
1791 goto out;
1792 status = encode_readlink(&xdr, args, req);
1793 1690
1794 /* set up reply kvec 1691 /* set up reply kvec
1795 * toplevel_status + taglen + rescount + OP_PUTFH + status 1692 * toplevel_status + taglen + rescount + OP_PUTFH + status
@@ -1798,9 +1695,8 @@ static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct n
1798 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_readlink_sz) << 2; 1695 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_readlink_sz) << 2;
1799 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 1696 xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages,
1800 args->pgbase, args->pglen); 1697 args->pgbase, args->pglen);
1801 1698 encode_nops(&hdr);
1802out: 1699 return 0;
1803 return status;
1804} 1700}
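Like the readdir, read and getacl encoders below, this one also pre-sizes the receive buffer: replen counts the fixed part of the reply in 32-bit XDR words and shifts left by 2 to convert to bytes, after which xdr_inline_pages() aims everything past that offset (the link target itself) into the caller's pages. A worked example of the arithmetic (the word counts are invented for illustration):

#include <stdio.h>

int main(void)
{
	/* Invented word counts; the kernel derives these from macros. */
	unsigned int rpc_rephdrsize = 28;	/* RPC reply header, words  */
	unsigned int au_rslack = 2;		/* auth-flavor slack, words */
	unsigned int dec_readlink_sz = 7;	/* fixed reply part, words  */

	/* Words -> bytes: one XDR word is 4 bytes, hence the << 2. */
	unsigned int replen = (rpc_rephdrsize + au_rslack + dec_readlink_sz) << 2;

	/* xdr_inline_pages() would land everything past this offset
	 * straight in the caller-supplied pages. */
	printf("page data starts at byte %u of the reply\n", replen);
	return 0;
}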
1805 1701
1806/* 1702/*
@@ -1810,18 +1706,15 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf
1810{ 1706{
1811 struct xdr_stream xdr; 1707 struct xdr_stream xdr;
1812 struct compound_hdr hdr = { 1708 struct compound_hdr hdr = {
1813 .nops = 2, 1709 .nops = 0,
1814 }; 1710 };
1815 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 1711 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
1816 int replen; 1712 int replen;
1817 int status;
1818 1713
1819 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1714 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1820 encode_compound_hdr(&xdr, &hdr); 1715 encode_compound_hdr(&xdr, &hdr);
1821 status = encode_putfh(&xdr, args->fh); 1716 encode_putfh(&xdr, args->fh, &hdr);
1822 if(status) 1717 encode_readdir(&xdr, args, req, &hdr);
1823 goto out;
1824 status = encode_readdir(&xdr, args, req);
1825 1718
1826 /* set up reply kvec 1719 /* set up reply kvec
1827 * toplevel_status + taglen + rescount + OP_PUTFH + status 1720 * toplevel_status + taglen + rescount + OP_PUTFH + status
@@ -1833,9 +1726,8 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf
1833 dprintk("%s: inlined page args = (%u, %p, %u, %u)\n", 1726 dprintk("%s: inlined page args = (%u, %p, %u, %u)\n",
1834 __func__, replen, args->pages, 1727 __func__, replen, args->pages,
1835 args->pgbase, args->count); 1728 args->pgbase, args->count);
1836 1729 encode_nops(&hdr);
1837out: 1730 return 0;
1838 return status;
1839} 1731}
1840 1732
1841/* 1733/*
@@ -1846,18 +1738,14 @@ static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readarg
1846 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 1738 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
1847 struct xdr_stream xdr; 1739 struct xdr_stream xdr;
1848 struct compound_hdr hdr = { 1740 struct compound_hdr hdr = {
1849 .nops = 2, 1741 .nops = 0,
1850 }; 1742 };
1851 int replen, status; 1743 int replen;
1852 1744
1853 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1745 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1854 encode_compound_hdr(&xdr, &hdr); 1746 encode_compound_hdr(&xdr, &hdr);
1855 status = encode_putfh(&xdr, args->fh); 1747 encode_putfh(&xdr, args->fh, &hdr);
1856 if (status) 1748 encode_read(&xdr, args, &hdr);
1857 goto out;
1858 status = encode_read(&xdr, args);
1859 if (status)
1860 goto out;
1861 1749
1862 /* set up reply kvec 1750 /* set up reply kvec
1863 * toplevel status + taglen=0 + rescount + OP_PUTFH + status 1751 * toplevel status + taglen=0 + rescount + OP_PUTFH + status
@@ -1867,33 +1755,27 @@ static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readarg
1867 xdr_inline_pages(&req->rq_rcv_buf, replen, 1755 xdr_inline_pages(&req->rq_rcv_buf, replen,
1868 args->pages, args->pgbase, args->count); 1756 args->pages, args->pgbase, args->count);
1869 req->rq_rcv_buf.flags |= XDRBUF_READ; 1757 req->rq_rcv_buf.flags |= XDRBUF_READ;
1870out: 1758 encode_nops(&hdr);
1871 return status; 1759 return 0;
1872} 1760}
1873 1761
1874/* 1762/*
1875 * Encode a SETATTR request 1763 * Encode a SETATTR request
1876 */ 1764 */
1877static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_setattrargs *args) 1765static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_setattrargs *args)
1878
1879{ 1766{
1880 struct xdr_stream xdr; 1767 struct xdr_stream xdr;
1881 struct compound_hdr hdr = { 1768 struct compound_hdr hdr = {
1882 .nops = 3, 1769 .nops = 0,
1883 }; 1770 };
1884 int status; 1771
1885 1772 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1886 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1773 encode_compound_hdr(&xdr, &hdr);
1887 encode_compound_hdr(&xdr, &hdr); 1774 encode_putfh(&xdr, args->fh, &hdr);
1888 status = encode_putfh(&xdr, args->fh); 1775 encode_setattr(&xdr, args, args->server, &hdr);
1889 if(status) 1776 encode_getfattr(&xdr, args->bitmask, &hdr);
1890 goto out; 1777 encode_nops(&hdr);
1891 status = encode_setattr(&xdr, args, args->server); 1778 return 0;
1892 if(status)
1893 goto out;
1894 status = encode_getfattr(&xdr, args->bitmask);
1895out:
1896 return status;
1897} 1779}
1898 1780
1899/* 1781/*
@@ -1906,22 +1788,21 @@ nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p,
1906 struct xdr_stream xdr; 1788 struct xdr_stream xdr;
1907 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 1789 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
1908 struct compound_hdr hdr = { 1790 struct compound_hdr hdr = {
1909 .nops = 2, 1791 .nops = 0,
1910 }; 1792 };
1911 int replen, status; 1793 int replen;
1912 1794
1913 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1795 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1914 encode_compound_hdr(&xdr, &hdr); 1796 encode_compound_hdr(&xdr, &hdr);
1915 status = encode_putfh(&xdr, args->fh); 1797 encode_putfh(&xdr, args->fh, &hdr);
1916 if (status) 1798 encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr);
1917 goto out; 1799
1918 status = encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0);
1919 /* set up reply buffer: */ 1800 /* set up reply buffer: */
1920 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_getacl_sz) << 2; 1801 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_getacl_sz) << 2;
1921 xdr_inline_pages(&req->rq_rcv_buf, replen, 1802 xdr_inline_pages(&req->rq_rcv_buf, replen,
1922 args->acl_pages, args->acl_pgbase, args->acl_len); 1803 args->acl_pages, args->acl_pgbase, args->acl_len);
1923out: 1804 encode_nops(&hdr);
1924 return status; 1805 return 0;
1925} 1806}
1926 1807
1927/* 1808/*
@@ -1931,22 +1812,17 @@ static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writea
1931{ 1812{
1932 struct xdr_stream xdr; 1813 struct xdr_stream xdr;
1933 struct compound_hdr hdr = { 1814 struct compound_hdr hdr = {
1934 .nops = 3, 1815 .nops = 0,
1935 }; 1816 };
1936 int status;
1937 1817
1938 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1818 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1939 encode_compound_hdr(&xdr, &hdr); 1819 encode_compound_hdr(&xdr, &hdr);
1940 status = encode_putfh(&xdr, args->fh); 1820 encode_putfh(&xdr, args->fh, &hdr);
1941 if (status) 1821 encode_write(&xdr, args, &hdr);
1942 goto out;
1943 status = encode_write(&xdr, args);
1944 if (status)
1945 goto out;
1946 req->rq_snd_buf.flags |= XDRBUF_WRITE; 1822 req->rq_snd_buf.flags |= XDRBUF_WRITE;
1947 status = encode_getfattr(&xdr, args->bitmask); 1823 encode_getfattr(&xdr, args->bitmask, &hdr);
1948out: 1824 encode_nops(&hdr);
1949 return status; 1825 return 0;
1950} 1826}
1951 1827
1952/* 1828/*
@@ -1956,21 +1832,16 @@ static int nfs4_xdr_enc_commit(struct rpc_rqst *req, __be32 *p, struct nfs_write
1956{ 1832{
1957 struct xdr_stream xdr; 1833 struct xdr_stream xdr;
1958 struct compound_hdr hdr = { 1834 struct compound_hdr hdr = {
1959 .nops = 3, 1835 .nops = 0,
1960 }; 1836 };
1961 int status;
1962 1837
1963 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1838 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1964 encode_compound_hdr(&xdr, &hdr); 1839 encode_compound_hdr(&xdr, &hdr);
1965 status = encode_putfh(&xdr, args->fh); 1840 encode_putfh(&xdr, args->fh, &hdr);
1966 if (status) 1841 encode_commit(&xdr, args, &hdr);
1967 goto out; 1842 encode_getfattr(&xdr, args->bitmask, &hdr);
1968 status = encode_commit(&xdr, args); 1843 encode_nops(&hdr);
1969 if (status) 1844 return 0;
1970 goto out;
1971 status = encode_getfattr(&xdr, args->bitmask);
1972out:
1973 return status;
1974} 1845}
1975 1846
1976/* 1847/*
@@ -1980,16 +1851,15 @@ static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs4_fsin
1980{ 1851{
1981 struct xdr_stream xdr; 1852 struct xdr_stream xdr;
1982 struct compound_hdr hdr = { 1853 struct compound_hdr hdr = {
1983 .nops = 2, 1854 .nops = 0,
1984 }; 1855 };
1985 int status;
1986 1856
1987 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1857 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
1988 encode_compound_hdr(&xdr, &hdr); 1858 encode_compound_hdr(&xdr, &hdr);
1989 status = encode_putfh(&xdr, args->fh); 1859 encode_putfh(&xdr, args->fh, &hdr);
1990 if (!status) 1860 encode_fsinfo(&xdr, args->bitmask, &hdr);
1991 status = encode_fsinfo(&xdr, args->bitmask); 1861 encode_nops(&hdr);
1992 return status; 1862 return 0;
1993} 1863}
1994 1864
1995/* 1865/*
@@ -1999,17 +1869,16 @@ static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, __be32 *p, const struct n
1999{ 1869{
2000 struct xdr_stream xdr; 1870 struct xdr_stream xdr;
2001 struct compound_hdr hdr = { 1871 struct compound_hdr hdr = {
2002 .nops = 2, 1872 .nops = 0,
2003 }; 1873 };
2004 int status;
2005 1874
2006 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1875 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2007 encode_compound_hdr(&xdr, &hdr); 1876 encode_compound_hdr(&xdr, &hdr);
2008 status = encode_putfh(&xdr, args->fh); 1877 encode_putfh(&xdr, args->fh, &hdr);
2009 if (!status) 1878 encode_getattr_one(&xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],
2010 status = encode_getattr_one(&xdr, 1879 &hdr);
2011 args->bitmask[0] & nfs4_pathconf_bitmap[0]); 1880 encode_nops(&hdr);
2012 return status; 1881 return 0;
2013} 1882}
2014 1883
2015/* 1884/*
@@ -2019,18 +1888,16 @@ static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs
2019{ 1888{
2020 struct xdr_stream xdr; 1889 struct xdr_stream xdr;
2021 struct compound_hdr hdr = { 1890 struct compound_hdr hdr = {
2022 .nops = 2, 1891 .nops = 0,
2023 }; 1892 };
2024 int status;
2025 1893
2026 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1894 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2027 encode_compound_hdr(&xdr, &hdr); 1895 encode_compound_hdr(&xdr, &hdr);
2028 status = encode_putfh(&xdr, args->fh); 1896 encode_putfh(&xdr, args->fh, &hdr);
2029 if (status == 0) 1897 encode_getattr_two(&xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],
2030 status = encode_getattr_two(&xdr, 1898 args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr);
2031 args->bitmask[0] & nfs4_statfs_bitmap[0], 1899 encode_nops(&hdr);
2032 args->bitmask[1] & nfs4_statfs_bitmap[1]); 1900 return 0;
2033 return status;
2034} 1901}
2035 1902
2036/* 1903/*
@@ -2040,19 +1907,18 @@ static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p, const struc
2040{ 1907{
2041 struct xdr_stream xdr; 1908 struct xdr_stream xdr;
2042 struct compound_hdr hdr = { 1909 struct compound_hdr hdr = {
2043 .nops = 2, 1910 .nops = 0,
2044 }; 1911 };
2045 int status;
2046 1912
2047 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1913 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2048 encode_compound_hdr(&xdr, &hdr); 1914 encode_compound_hdr(&xdr, &hdr);
2049 status = encode_putfh(&xdr, fhandle); 1915 encode_putfh(&xdr, fhandle, &hdr);
2050 if (status == 0) 1916 encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
2051 status = encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS| 1917 FATTR4_WORD0_LINK_SUPPORT|
2052 FATTR4_WORD0_LINK_SUPPORT| 1918 FATTR4_WORD0_SYMLINK_SUPPORT|
2053 FATTR4_WORD0_SYMLINK_SUPPORT| 1919 FATTR4_WORD0_ACLSUPPORT, &hdr);
2054 FATTR4_WORD0_ACLSUPPORT); 1920 encode_nops(&hdr);
2055 return status; 1921 return 0;
2056} 1922}
2057 1923
2058/* 1924/*
@@ -2062,12 +1928,14 @@ static int nfs4_xdr_enc_renew(struct rpc_rqst *req, __be32 *p, struct nfs_client
2062{ 1928{
2063 struct xdr_stream xdr; 1929 struct xdr_stream xdr;
2064 struct compound_hdr hdr = { 1930 struct compound_hdr hdr = {
2065 .nops = 1, 1931 .nops = 0,
2066 }; 1932 };
2067 1933
2068 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1934 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2069 encode_compound_hdr(&xdr, &hdr); 1935 encode_compound_hdr(&xdr, &hdr);
2070 return encode_renew(&xdr, clp); 1936 encode_renew(&xdr, clp, &hdr);
1937 encode_nops(&hdr);
1938 return 0;
2071} 1939}
2072 1940
2073/* 1941/*
@@ -2077,12 +1945,14 @@ static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4
2077{ 1945{
2078 struct xdr_stream xdr; 1946 struct xdr_stream xdr;
2079 struct compound_hdr hdr = { 1947 struct compound_hdr hdr = {
2080 .nops = 1, 1948 .nops = 0,
2081 }; 1949 };
2082 1950
2083 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1951 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2084 encode_compound_hdr(&xdr, &hdr); 1952 encode_compound_hdr(&xdr, &hdr);
2085 return encode_setclientid(&xdr, sc); 1953 encode_setclientid(&xdr, sc, &hdr);
1954 encode_nops(&hdr);
1955 return 0;
2086} 1956}
2087 1957
2088/* 1958/*
@@ -2092,19 +1962,17 @@ static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str
2092{ 1962{
2093 struct xdr_stream xdr; 1963 struct xdr_stream xdr;
2094 struct compound_hdr hdr = { 1964 struct compound_hdr hdr = {
2095 .nops = 3, 1965 .nops = 0,
2096 }; 1966 };
2097 const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; 1967 const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
2098 int status;
2099 1968
2100 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1969 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2101 encode_compound_hdr(&xdr, &hdr); 1970 encode_compound_hdr(&xdr, &hdr);
2102 status = encode_setclientid_confirm(&xdr, clp); 1971 encode_setclientid_confirm(&xdr, clp, &hdr);
2103 if (!status) 1972 encode_putrootfh(&xdr, &hdr);
2104 status = encode_putrootfh(&xdr); 1973 encode_fsinfo(&xdr, lease_bitmap, &hdr);
2105 if (!status) 1974 encode_nops(&hdr);
2106 status = encode_fsinfo(&xdr, lease_bitmap); 1975 return 0;
2107 return status;
2108} 1976}
2109 1977
2110/* 1978/*
@@ -2114,21 +1982,16 @@ static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, __be32 *p, const struc
2114{ 1982{
2115 struct xdr_stream xdr; 1983 struct xdr_stream xdr;
2116 struct compound_hdr hdr = { 1984 struct compound_hdr hdr = {
2117 .nops = 3, 1985 .nops = 0,
2118 }; 1986 };
2119 int status;
2120 1987
2121 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1988 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2122 encode_compound_hdr(&xdr, &hdr); 1989 encode_compound_hdr(&xdr, &hdr);
2123 status = encode_putfh(&xdr, args->fhandle); 1990 encode_putfh(&xdr, args->fhandle, &hdr);
2124 if (status != 0) 1991 encode_delegreturn(&xdr, args->stateid, &hdr);
2125 goto out; 1992 encode_getfattr(&xdr, args->bitmask, &hdr);
2126 status = encode_delegreturn(&xdr, args->stateid); 1993 encode_nops(&hdr);
2127 if (status != 0) 1994 return 0;
2128 goto out;
2129 status = encode_getfattr(&xdr, args->bitmask);
2130out:
2131 return status;
2132} 1995}
2133 1996
2134/* 1997/*
@@ -2138,20 +2001,17 @@ static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs
2138{ 2001{
2139 struct xdr_stream xdr; 2002 struct xdr_stream xdr;
2140 struct compound_hdr hdr = { 2003 struct compound_hdr hdr = {
2141 .nops = 3, 2004 .nops = 0,
2142 }; 2005 };
2143 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 2006 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
2144 int replen; 2007 int replen;
2145 int status;
2146 2008
2147 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2009 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2148 encode_compound_hdr(&xdr, &hdr); 2010 encode_compound_hdr(&xdr, &hdr);
2149 if ((status = encode_putfh(&xdr, args->dir_fh)) != 0) 2011 encode_putfh(&xdr, args->dir_fh, &hdr);
2150 goto out; 2012 encode_lookup(&xdr, args->name, &hdr);
2151 if ((status = encode_lookup(&xdr, args->name)) != 0) 2013 encode_fs_locations(&xdr, args->bitmask, &hdr);
2152 goto out; 2014
2153 if ((status = encode_fs_locations(&xdr, args->bitmask)) != 0)
2154 goto out;
2155 /* set up reply 2015 /* set up reply
2156 * toplevel_status + OP_PUTFH + status 2016 * toplevel_status + OP_PUTFH + status
2157 * + OP_LOOKUP + status + OP_GETATTR + status = 7 2017 * + OP_LOOKUP + status + OP_GETATTR + status = 7
@@ -2159,8 +2019,8 @@ static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs
2159 replen = (RPC_REPHDRSIZE + auth->au_rslack + 7) << 2; 2019 replen = (RPC_REPHDRSIZE + auth->au_rslack + 7) << 2;
2160 xdr_inline_pages(&req->rq_rcv_buf, replen, &args->page, 2020 xdr_inline_pages(&req->rq_rcv_buf, replen, &args->page,
2161 0, PAGE_SIZE); 2021 0, PAGE_SIZE);
2162out: 2022 encode_nops(&hdr);
2163 return status; 2023 return 0;
2164} 2024}
2165 2025
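The common thread in the encode hunks above: compound_hdr.nops now starts at zero, each encode helper bumps it as a side effect instead of returning a status (the setacl encoder further down is the lone holdout), and a final encode_nops() call patches the real operation count into the word that encode_compound_hdr() reserved in the send buffer. A minimal sketch of that mechanism, assuming a hypothetical nops_p pointer that remembers where the count lives; the actual struct layout may differ:

    /* sketch only; nops_p is an assumed field name, not quoted from the patch */
    struct compound_hdr {
            u32     nops;           /* operations encoded so far */
            __be32  *nops_p;        /* where the count word was reserved */
    };

    static void encode_compound_hdr(struct xdr_stream *xdr,
                                    struct compound_hdr *hdr)
    {
            __be32 *p = xdr_reserve_space(xdr, 8);

            *p++ = cpu_to_be32(0);          /* zero-length tag */
            hdr->nops_p = p;                /* remember the count slot */
            *p = cpu_to_be32(hdr->nops);    /* provisional: still zero */
    }

    static void encode_nops(struct compound_hdr *hdr)
    {
            *hdr->nops_p = cpu_to_be32(hdr->nops);  /* backpatch final count */
    }

Deferring the count is what lets every goto-out ladder above collapse into straight-line void calls.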
2166/* 2026/*
@@ -2217,11 +2077,13 @@ static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
2217 READ_BUF(8); 2077 READ_BUF(8);
2218 READ32(hdr->status); 2078 READ32(hdr->status);
2219 READ32(hdr->taglen); 2079 READ32(hdr->taglen);
2220 2080
2221 READ_BUF(hdr->taglen + 4); 2081 READ_BUF(hdr->taglen + 4);
2222 hdr->tag = (char *)p; 2082 hdr->tag = (char *)p;
2223 p += XDR_QUADLEN(hdr->taglen); 2083 p += XDR_QUADLEN(hdr->taglen);
2224 READ32(hdr->nops); 2084 READ32(hdr->nops);
2085 if (unlikely(hdr->nops < 1))
2086 return nfs4_stat_to_errno(hdr->status);
2225 return 0; 2087 return 0;
2226} 2088}
2227 2089
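The guard added here refuses any reply whose compound header claims fewer than one operation: with no per-op results to decode, the compound-level status is translated to a local errno on the spot. That is also why later hunks (dec_fsinfo, dec_setclientid, dec_setclientid_confirm) can drop their trailing "if (!status) status = nfs4_stat_to_errno(hdr.status);" lines. Schematically, the translation the guard leans on looks like this; the real function walks a much larger table:

    /* illustrative subset of the NFSv4-status-to-errno mapping */
    static int nfs4_stat_to_errno(int stat)
    {
            switch (stat) {
            case NFS4_OK:           return 0;
            case NFS4ERR_NOENT:     return -ENOENT;
            case NFS4ERR_ACCESS:    return -EACCES;
            default:                return -EIO;    /* abridged fallback */
            }
    }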
@@ -3047,8 +2909,7 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
3047static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) 2909static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
3048{ 2910{
3049 __be32 *savep; 2911 __be32 *savep;
3050 uint32_t attrlen, 2912 uint32_t attrlen, bitmap[2] = {0};
3051 bitmap[2] = {0};
3052 int status; 2913 int status;
3053 2914
3054 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 2915 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -3070,14 +2931,13 @@ xdr_error:
3070 dprintk("%s: xdr returned %d!\n", __func__, -status); 2931 dprintk("%s: xdr returned %d!\n", __func__, -status);
3071 return status; 2932 return status;
3072} 2933}
3073 2934
3074static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) 2935static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
3075{ 2936{
3076 __be32 *savep; 2937 __be32 *savep;
3077 uint32_t attrlen, 2938 uint32_t attrlen, bitmap[2] = {0};
3078 bitmap[2] = {0};
3079 int status; 2939 int status;
3080 2940
3081 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 2941 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
3082 goto xdr_error; 2942 goto xdr_error;
3083 if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) 2943 if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
@@ -3107,10 +2967,9 @@ xdr_error:
3107static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) 2967static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf)
3108{ 2968{
3109 __be32 *savep; 2969 __be32 *savep;
3110 uint32_t attrlen, 2970 uint32_t attrlen, bitmap[2] = {0};
3111 bitmap[2] = {0};
3112 int status; 2971 int status;
3113 2972
3114 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 2973 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
3115 goto xdr_error; 2974 goto xdr_error;
3116 if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) 2975 if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
@@ -3256,7 +3115,7 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
3256static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) 3115static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
3257{ 3116{
3258 int status; 3117 int status;
3259 3118
3260 status = decode_op_hdr(xdr, OP_LINK); 3119 status = decode_op_hdr(xdr, OP_LINK);
3261 if (status) 3120 if (status)
3262 return status; 3121 return status;
@@ -3344,27 +3203,27 @@ static int decode_lookup(struct xdr_stream *xdr)
3344/* This is too sick! */ 3203/* This is too sick! */
3345static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize) 3204static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
3346{ 3205{
3347 __be32 *p; 3206 __be32 *p;
3348 uint32_t limit_type, nblocks, blocksize; 3207 uint32_t limit_type, nblocks, blocksize;
3349 3208
3350 READ_BUF(12); 3209 READ_BUF(12);
3351 READ32(limit_type); 3210 READ32(limit_type);
3352 switch (limit_type) { 3211 switch (limit_type) {
3353 case 1: 3212 case 1:
3354 READ64(*maxsize); 3213 READ64(*maxsize);
3355 break; 3214 break;
3356 case 2: 3215 case 2:
3357 READ32(nblocks); 3216 READ32(nblocks);
3358 READ32(blocksize); 3217 READ32(blocksize);
3359 *maxsize = (uint64_t)nblocks * (uint64_t)blocksize; 3218 *maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
3360 } 3219 }
3361 return 0; 3220 return 0;
3362} 3221}
3363 3222
3364static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) 3223static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
3365{ 3224{
3366 __be32 *p; 3225 __be32 *p;
3367 uint32_t delegation_type; 3226 uint32_t delegation_type;
3368 3227
3369 READ_BUF(4); 3228 READ_BUF(4);
3370 READ32(delegation_type); 3229 READ32(delegation_type);
@@ -3375,13 +3234,14 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
3375 READ_BUF(NFS4_STATEID_SIZE+4); 3234 READ_BUF(NFS4_STATEID_SIZE+4);
3376 COPYMEM(res->delegation.data, NFS4_STATEID_SIZE); 3235 COPYMEM(res->delegation.data, NFS4_STATEID_SIZE);
3377 READ32(res->do_recall); 3236 READ32(res->do_recall);
3237
3378 switch (delegation_type) { 3238 switch (delegation_type) {
3379 case NFS4_OPEN_DELEGATE_READ: 3239 case NFS4_OPEN_DELEGATE_READ:
3380 res->delegation_type = FMODE_READ; 3240 res->delegation_type = FMODE_READ;
3381 break; 3241 break;
3382 case NFS4_OPEN_DELEGATE_WRITE: 3242 case NFS4_OPEN_DELEGATE_WRITE:
3383 res->delegation_type = FMODE_WRITE|FMODE_READ; 3243 res->delegation_type = FMODE_WRITE|FMODE_READ;
3384 if (decode_space_limit(xdr, &res->maxsize) < 0) 3244 if (decode_space_limit(xdr, &res->maxsize) < 0)
3385 return -EIO; 3245 return -EIO;
3386 } 3246 }
3387 return decode_ace(xdr, NULL, res->server->nfs_client); 3247 return decode_ace(xdr, NULL, res->server->nfs_client);
@@ -3389,27 +3249,27 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
3389 3249
3390static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) 3250static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
3391{ 3251{
3392 __be32 *p; 3252 __be32 *p;
3393 uint32_t savewords, bmlen, i; 3253 uint32_t savewords, bmlen, i;
3394 int status; 3254 int status;
3395 3255
3396 status = decode_op_hdr(xdr, OP_OPEN); 3256 status = decode_op_hdr(xdr, OP_OPEN);
3397 if (status != -EIO) 3257 if (status != -EIO)
3398 nfs_increment_open_seqid(status, res->seqid); 3258 nfs_increment_open_seqid(status, res->seqid);
3399 if (status) 3259 if (status)
3400 return status; 3260 return status;
3401 READ_BUF(NFS4_STATEID_SIZE); 3261 READ_BUF(NFS4_STATEID_SIZE);
3402 COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); 3262 COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
3403 3263
3404 decode_change_info(xdr, &res->cinfo); 3264 decode_change_info(xdr, &res->cinfo);
3405 3265
3406 READ_BUF(8); 3266 READ_BUF(8);
3407 READ32(res->rflags); 3267 READ32(res->rflags);
3408 READ32(bmlen); 3268 READ32(bmlen);
3409 if (bmlen > 10) 3269 if (bmlen > 10)
3410 goto xdr_error; 3270 goto xdr_error;
3411 3271
3412 READ_BUF(bmlen << 2); 3272 READ_BUF(bmlen << 2);
3413 savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE); 3273 savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE);
3414 for (i = 0; i < savewords; ++i) 3274 for (i = 0; i < savewords; ++i)
3415 READ32(res->attrset[i]); 3275 READ32(res->attrset[i]);
@@ -3424,17 +3284,17 @@ xdr_error:
3424 3284
3425static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res) 3285static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res)
3426{ 3286{
3427 __be32 *p; 3287 __be32 *p;
3428 int status; 3288 int status;
3429 3289
3430 status = decode_op_hdr(xdr, OP_OPEN_CONFIRM); 3290 status = decode_op_hdr(xdr, OP_OPEN_CONFIRM);
3431 if (status != -EIO) 3291 if (status != -EIO)
3432 nfs_increment_open_seqid(status, res->seqid); 3292 nfs_increment_open_seqid(status, res->seqid);
3433 if (status) 3293 if (status)
3434 return status; 3294 return status;
3435 READ_BUF(NFS4_STATEID_SIZE); 3295 READ_BUF(NFS4_STATEID_SIZE);
3436 COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); 3296 COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
3437 return 0; 3297 return 0;
3438} 3298}
3439 3299
3440static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res) 3300static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res)
@@ -3562,7 +3422,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
3562 dprintk("NFS: readdir reply truncated!\n"); 3422 dprintk("NFS: readdir reply truncated!\n");
3563 entry[1] = 1; 3423 entry[1] = 1;
3564 } 3424 }
3565out: 3425out:
3566 kunmap_atomic(kaddr, KM_USER0); 3426 kunmap_atomic(kaddr, KM_USER0);
3567 return 0; 3427 return 0;
3568short_pkt: 3428short_pkt:
@@ -3718,7 +3578,6 @@ static int decode_setattr(struct xdr_stream *xdr, struct nfs_setattrres *res)
3718 uint32_t bmlen; 3578 uint32_t bmlen;
3719 int status; 3579 int status;
3720 3580
3721
3722 status = decode_op_hdr(xdr, OP_SETATTR); 3581 status = decode_op_hdr(xdr, OP_SETATTR);
3723 if (status) 3582 if (status)
3724 return status; 3583 return status;
@@ -3738,7 +3597,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
3738 READ32(opnum); 3597 READ32(opnum);
3739 if (opnum != OP_SETCLIENTID) { 3598 if (opnum != OP_SETCLIENTID) {
3740 dprintk("nfs: decode_setclientid: Server returned operation" 3599 dprintk("nfs: decode_setclientid: Server returned operation"
3741 " %d\n", opnum); 3600 " %d\n", opnum);
3742 return -EIO; 3601 return -EIO;
3743 } 3602 }
3744 READ32(nfserr); 3603 READ32(nfserr);
@@ -3792,34 +3651,34 @@ static int decode_delegreturn(struct xdr_stream *xdr)
3792} 3651}
3793 3652
3794/* 3653/*
3654 * END OF "GENERIC" DECODE ROUTINES.
3655 */
3656
3657/*
3795 * Decode OPEN_DOWNGRADE response 3658 * Decode OPEN_DOWNGRADE response
3796 */ 3659 */
3797static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res) 3660static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
3798{ 3661{
3799 struct xdr_stream xdr; 3662 struct xdr_stream xdr;
3800 struct compound_hdr hdr; 3663 struct compound_hdr hdr;
3801 int status; 3664 int status;
3802 3665
3803 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3666 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3804 status = decode_compound_hdr(&xdr, &hdr); 3667 status = decode_compound_hdr(&xdr, &hdr);
3805 if (status) 3668 if (status)
3806 goto out; 3669 goto out;
3807 status = decode_putfh(&xdr); 3670 status = decode_putfh(&xdr);
3808 if (status) 3671 if (status)
3809 goto out; 3672 goto out;
3810 status = decode_open_downgrade(&xdr, res); 3673 status = decode_open_downgrade(&xdr, res);
3811 if (status != 0) 3674 if (status != 0)
3812 goto out; 3675 goto out;
3813 decode_getfattr(&xdr, res->fattr, res->server); 3676 decode_getfattr(&xdr, res->fattr, res->server);
3814out: 3677out:
3815 return status; 3678 return status;
3816} 3679}
3817 3680
3818/* 3681/*
3819 * END OF "GENERIC" DECODE ROUTINES.
3820 */
3821
3822/*
3823 * Decode ACCESS response 3682 * Decode ACCESS response
3824 */ 3683 */
3825static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_accessres *res) 3684static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_accessres *res)
@@ -3827,7 +3686,7 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_ac
3827 struct xdr_stream xdr; 3686 struct xdr_stream xdr;
3828 struct compound_hdr hdr; 3687 struct compound_hdr hdr;
3829 int status; 3688 int status;
3830 3689
3831 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3690 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3832 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3691 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3833 goto out; 3692 goto out;
@@ -3850,7 +3709,7 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lo
3850 struct xdr_stream xdr; 3709 struct xdr_stream xdr;
3851 struct compound_hdr hdr; 3710 struct compound_hdr hdr;
3852 int status; 3711 int status;
3853 3712
3854 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3713 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3855 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3714 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3856 goto out; 3715 goto out;
@@ -3873,7 +3732,7 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nf
3873 struct xdr_stream xdr; 3732 struct xdr_stream xdr;
3874 struct compound_hdr hdr; 3733 struct compound_hdr hdr;
3875 int status; 3734 int status;
3876 3735
3877 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3736 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3878 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3737 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3879 goto out; 3738 goto out;
@@ -3893,7 +3752,7 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem
3893 struct xdr_stream xdr; 3752 struct xdr_stream xdr;
3894 struct compound_hdr hdr; 3753 struct compound_hdr hdr;
3895 int status; 3754 int status;
3896 3755
3897 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3756 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3898 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3757 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3899 goto out; 3758 goto out;
@@ -3914,7 +3773,7 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_re
3914 struct xdr_stream xdr; 3773 struct xdr_stream xdr;
3915 struct compound_hdr hdr; 3774 struct compound_hdr hdr;
3916 int status; 3775 int status;
3917 3776
3918 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3777 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3919 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3778 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3920 goto out; 3779 goto out;
@@ -3944,7 +3803,7 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link
3944 struct xdr_stream xdr; 3803 struct xdr_stream xdr;
3945 struct compound_hdr hdr; 3804 struct compound_hdr hdr;
3946 int status; 3805 int status;
3947 3806
3948 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3807 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3949 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3808 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3950 goto out; 3809 goto out;
@@ -3977,7 +3836,7 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_cr
3977 struct xdr_stream xdr; 3836 struct xdr_stream xdr;
3978 struct compound_hdr hdr; 3837 struct compound_hdr hdr;
3979 int status; 3838 int status;
3980 3839
3981 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3840 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
3982 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3841 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
3983 goto out; 3842 goto out;
@@ -4014,7 +3873,7 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_g
4014 struct xdr_stream xdr; 3873 struct xdr_stream xdr;
4015 struct compound_hdr hdr; 3874 struct compound_hdr hdr;
4016 int status; 3875 int status;
4017 3876
4018 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3877 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
4019 status = decode_compound_hdr(&xdr, &hdr); 3878 status = decode_compound_hdr(&xdr, &hdr);
4020 if (status) 3879 if (status)
@@ -4025,7 +3884,6 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_g
4025 status = decode_getfattr(&xdr, res->fattr, res->server); 3884 status = decode_getfattr(&xdr, res->fattr, res->server);
4026out: 3885out:
4027 return status; 3886 return status;
4028
4029} 3887}
4030 3888
4031/* 3889/*
@@ -4034,21 +3892,20 @@ out:
4034static int 3892static int
4035nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args) 3893nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args)
4036{ 3894{
4037 struct xdr_stream xdr; 3895 struct xdr_stream xdr;
4038 struct compound_hdr hdr = { 3896 struct compound_hdr hdr = {
4039 .nops = 2, 3897 .nops = 0,
4040 }; 3898 };
4041 int status; 3899 int status;
4042 3900
4043 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 3901 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
4044 encode_compound_hdr(&xdr, &hdr); 3902 encode_compound_hdr(&xdr, &hdr);
4045 status = encode_putfh(&xdr, args->fh); 3903 encode_putfh(&xdr, args->fh, &hdr);
4046 if (status) 3904 status = encode_setacl(&xdr, args, &hdr);
4047 goto out; 3905 encode_nops(&hdr);
4048 status = encode_setacl(&xdr, args); 3906 return status;
4049out:
4050 return status;
4051} 3907}
3908
4052/* 3909/*
4053 * Decode SETACL response 3910 * Decode SETACL response
4054 */ 3911 */
@@ -4099,18 +3956,18 @@ out:
4099 */ 3956 */
4100static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res) 3957static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
4101{ 3958{
4102 struct xdr_stream xdr; 3959 struct xdr_stream xdr;
4103 struct compound_hdr hdr; 3960 struct compound_hdr hdr;
4104 int status; 3961 int status;
4105 3962
4106 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3963 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
4107 status = decode_compound_hdr(&xdr, &hdr); 3964 status = decode_compound_hdr(&xdr, &hdr);
4108 if (status) 3965 if (status)
4109 goto out; 3966 goto out;
4110 status = decode_putfh(&xdr); 3967 status = decode_putfh(&xdr);
4111 if (status) 3968 if (status)
4112 goto out; 3969 goto out;
4113 status = decode_close(&xdr, res); 3970 status = decode_close(&xdr, res);
4114 if (status != 0) 3971 if (status != 0)
4115 goto out; 3972 goto out;
4116 /* 3973 /*
@@ -4121,7 +3978,7 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
4121 */ 3978 */
4122 decode_getfattr(&xdr, res->fattr, res->server); 3979 decode_getfattr(&xdr, res->fattr, res->server);
4123out: 3980out:
4124 return status; 3981 return status;
4125} 3982}
4126 3983
4127/* 3984/*
@@ -4129,23 +3986,23 @@ out:
4129 */ 3986 */
4130static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res) 3987static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
4131{ 3988{
4132 struct xdr_stream xdr; 3989 struct xdr_stream xdr;
4133 struct compound_hdr hdr; 3990 struct compound_hdr hdr;
4134 int status; 3991 int status;
4135 3992
4136 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3993 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
4137 status = decode_compound_hdr(&xdr, &hdr); 3994 status = decode_compound_hdr(&xdr, &hdr);
4138 if (status) 3995 if (status)
4139 goto out; 3996 goto out;
4140 status = decode_putfh(&xdr); 3997 status = decode_putfh(&xdr);
4141 if (status) 3998 if (status)
4142 goto out; 3999 goto out;
4143 status = decode_savefh(&xdr); 4000 status = decode_savefh(&xdr);
4001 if (status)
4002 goto out;
4003 status = decode_open(&xdr, res);
4144 if (status) 4004 if (status)
4145 goto out; 4005 goto out;
4146 status = decode_open(&xdr, res);
4147 if (status)
4148 goto out;
4149 if (decode_getfh(&xdr, &res->fh) != 0) 4006 if (decode_getfh(&xdr, &res->fh) != 0)
4150 goto out; 4007 goto out;
4151 if (decode_getfattr(&xdr, res->f_attr, res->server) != 0) 4008 if (decode_getfattr(&xdr, res->f_attr, res->server) != 0)
@@ -4154,7 +4011,7 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openr
4154 goto out; 4011 goto out;
4155 decode_getfattr(&xdr, res->dir_attr, res->server); 4012 decode_getfattr(&xdr, res->dir_attr, res->server);
4156out: 4013out:
4157 return status; 4014 return status;
4158} 4015}
4159 4016
4160/* 4017/*
@@ -4162,20 +4019,20 @@ out:
4162 */ 4019 */
4163static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, __be32 *p, struct nfs_open_confirmres *res) 4020static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, __be32 *p, struct nfs_open_confirmres *res)
4164{ 4021{
4165 struct xdr_stream xdr; 4022 struct xdr_stream xdr;
4166 struct compound_hdr hdr; 4023 struct compound_hdr hdr;
4167 int status; 4024 int status;
4168 4025
4169 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 4026 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
4170 status = decode_compound_hdr(&xdr, &hdr); 4027 status = decode_compound_hdr(&xdr, &hdr);
4171 if (status) 4028 if (status)
4172 goto out; 4029 goto out;
4173 status = decode_putfh(&xdr); 4030 status = decode_putfh(&xdr);
4174 if (status) 4031 if (status)
4175 goto out; 4032 goto out;
4176 status = decode_open_confirm(&xdr, res); 4033 status = decode_open_confirm(&xdr, res);
4177out: 4034out:
4178 return status; 4035 return status;
4179} 4036}
4180 4037
4181/* 4038/*
@@ -4183,23 +4040,23 @@ out:
4183 */ 4040 */
4184static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res) 4041static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
4185{ 4042{
4186 struct xdr_stream xdr; 4043 struct xdr_stream xdr;
4187 struct compound_hdr hdr; 4044 struct compound_hdr hdr;
4188 int status; 4045 int status;
4189 4046
4190 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 4047 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
4191 status = decode_compound_hdr(&xdr, &hdr); 4048 status = decode_compound_hdr(&xdr, &hdr);
4192 if (status) 4049 if (status)
4193 goto out; 4050 goto out;
4194 status = decode_putfh(&xdr); 4051 status = decode_putfh(&xdr);
4195 if (status) 4052 if (status)
4196 goto out; 4053 goto out;
4197 status = decode_open(&xdr, res); 4054 status = decode_open(&xdr, res);
4198 if (status) 4055 if (status)
4199 goto out; 4056 goto out;
4200 decode_getfattr(&xdr, res->f_attr, res->server); 4057 decode_getfattr(&xdr, res->f_attr, res->server);
4201out: 4058out:
4202 return status; 4059 return status;
4203} 4060}
4204 4061
4205/* 4062/*
@@ -4207,25 +4064,25 @@ out:
4207 */ 4064 */
4208static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_setattrres *res) 4065static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_setattrres *res)
4209{ 4066{
4210 struct xdr_stream xdr; 4067 struct xdr_stream xdr;
4211 struct compound_hdr hdr; 4068 struct compound_hdr hdr;
4212 int status; 4069 int status;
4213 4070
4214 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 4071 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
4215 status = decode_compound_hdr(&xdr, &hdr); 4072 status = decode_compound_hdr(&xdr, &hdr);
4216 if (status) 4073 if (status)
4217 goto out; 4074 goto out;
4218 status = decode_putfh(&xdr); 4075 status = decode_putfh(&xdr);
4219 if (status) 4076 if (status)
4220 goto out; 4077 goto out;
4221 status = decode_setattr(&xdr, res); 4078 status = decode_setattr(&xdr, res);
4222 if (status) 4079 if (status)
4223 goto out; 4080 goto out;
4224 status = decode_getfattr(&xdr, res->fattr, res->server); 4081 status = decode_getfattr(&xdr, res->fattr, res->server);
4225 if (status == NFS4ERR_DELAY) 4082 if (status == NFS4ERR_DELAY)
4226 status = 0; 4083 status = 0;
4227out: 4084out:
4228 return status; 4085 return status;
4229} 4086}
4230 4087
4231/* 4088/*
@@ -4421,8 +4278,6 @@ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs_fsinf
4421 status = decode_putfh(&xdr); 4278 status = decode_putfh(&xdr);
4422 if (!status) 4279 if (!status)
4423 status = decode_fsinfo(&xdr, fsinfo); 4280 status = decode_fsinfo(&xdr, fsinfo);
4424 if (!status)
4425 status = nfs4_stat_to_errno(hdr.status);
4426 return status; 4281 return status;
4427} 4282}
4428 4283
@@ -4511,8 +4366,6 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
4511 status = decode_compound_hdr(&xdr, &hdr); 4366 status = decode_compound_hdr(&xdr, &hdr);
4512 if (!status) 4367 if (!status)
4513 status = decode_setclientid(&xdr, clp); 4368 status = decode_setclientid(&xdr, clp);
4514 if (!status)
4515 status = nfs4_stat_to_errno(hdr.status);
4516 return status; 4369 return status;
4517} 4370}
4518 4371
@@ -4533,8 +4386,6 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str
4533 status = decode_putrootfh(&xdr); 4386 status = decode_putrootfh(&xdr);
4534 if (!status) 4387 if (!status)
4535 status = decode_fsinfo(&xdr, fsinfo); 4388 status = decode_fsinfo(&xdr, fsinfo);
4536 if (!status)
4537 status = nfs4_stat_to_errno(hdr.status);
4538 return status; 4389 return status;
4539} 4390}
4540 4391
@@ -4715,7 +4566,7 @@ nfs4_stat_to_errno(int stat)
4715 .p_replen = NFS4_##restype##_sz, \ 4566 .p_replen = NFS4_##restype##_sz, \
4716 .p_statidx = NFSPROC4_CLNT_##proc, \ 4567 .p_statidx = NFSPROC4_CLNT_##proc, \
4717 .p_name = #proc, \ 4568 .p_name = #proc, \
4718 } 4569}
4719 4570
4720struct rpc_procinfo nfs4_procedures[] = { 4571struct rpc_procinfo nfs4_procedures[] = {
4721 PROC(READ, enc_read, dec_read), 4572 PROC(READ, enc_read, dec_read),
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index d74d16ce0d49..d9ef602fbc5a 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -86,6 +86,8 @@
86#include <net/ipconfig.h> 86#include <net/ipconfig.h>
87#include <linux/parser.h> 87#include <linux/parser.h>
88 88
89#include "internal.h"
90
89/* Define this to allow debugging output */ 91/* Define this to allow debugging output */
90#undef NFSROOT_DEBUG 92#undef NFSROOT_DEBUG
91#define NFSDBG_FACILITY NFSDBG_ROOT 93#define NFSDBG_FACILITY NFSDBG_ROOT
@@ -100,7 +102,7 @@ static char nfs_root_name[256] __initdata = "";
100static __be32 servaddr __initdata = 0; 102static __be32 servaddr __initdata = 0;
101 103
102/* Name of directory to mount */ 104/* Name of directory to mount */
103static char nfs_path[NFS_MAXPATHLEN] __initdata = { 0, }; 105static char nfs_export_path[NFS_MAXPATHLEN] __initdata = { 0, };
104 106
105/* NFS-related data */ 107/* NFS-related data */
106static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */ 108static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */
@@ -312,7 +314,7 @@ static int __init root_nfs_name(char *name)
312 printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n"); 314 printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n");
313 return -1; 315 return -1;
314 } 316 }
315 sprintf(nfs_path, buf, cp); 317 sprintf(nfs_export_path, buf, cp);
316 318
317 return 1; 319 return 1;
318} 320}
@@ -340,7 +342,7 @@ static int __init root_nfs_addr(void)
340static void __init root_nfs_print(void) 342static void __init root_nfs_print(void)
341{ 343{
342 printk(KERN_NOTICE "Root-NFS: Mounting %s on server %s as root\n", 344 printk(KERN_NOTICE "Root-NFS: Mounting %s on server %s as root\n",
343 nfs_path, nfs_data.hostname); 345 nfs_export_path, nfs_data.hostname);
344 printk(KERN_NOTICE "Root-NFS: rsize = %d, wsize = %d, timeo = %d, retrans = %d\n", 346 printk(KERN_NOTICE "Root-NFS: rsize = %d, wsize = %d, timeo = %d, retrans = %d\n",
345 nfs_data.rsize, nfs_data.wsize, nfs_data.timeo, nfs_data.retrans); 347 nfs_data.rsize, nfs_data.wsize, nfs_data.timeo, nfs_data.retrans);
346 printk(KERN_NOTICE "Root-NFS: acreg (min,max) = (%d,%d), acdir (min,max) = (%d,%d)\n", 348 printk(KERN_NOTICE "Root-NFS: acreg (min,max) = (%d,%d), acdir (min,max) = (%d,%d)\n",
@@ -485,18 +487,23 @@ static int __init root_nfs_get_handle(void)
485{ 487{
486 struct nfs_fh fh; 488 struct nfs_fh fh;
487 struct sockaddr_in sin; 489 struct sockaddr_in sin;
490 struct nfs_mount_request request = {
491 .sap = (struct sockaddr *)&sin,
492 .salen = sizeof(sin),
493 .dirpath = nfs_export_path,
494 .version = (nfs_data.flags & NFS_MOUNT_VER3) ?
495 NFS_MNT3_VERSION : NFS_MNT_VERSION,
496 .protocol = (nfs_data.flags & NFS_MOUNT_TCP) ?
497 XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP,
498 .fh = &fh,
499 };
488 int status; 500 int status;
489 int protocol = (nfs_data.flags & NFS_MOUNT_TCP) ?
490 XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP;
491 int version = (nfs_data.flags & NFS_MOUNT_VER3) ?
492 NFS_MNT3_VERSION : NFS_MNT_VERSION;
493 501
494 set_sockaddr(&sin, servaddr, htons(mount_port)); 502 set_sockaddr(&sin, servaddr, htons(mount_port));
495 status = nfs_mount((struct sockaddr *) &sin, sizeof(sin), NULL, 503 status = nfs_mount(&request);
496 nfs_path, version, protocol, &fh);
497 if (status < 0) 504 if (status < 0)
498 printk(KERN_ERR "Root-NFS: Server returned error %d " 505 printk(KERN_ERR "Root-NFS: Server returned error %d "
499 "while mounting %s\n", status, nfs_path); 506 "while mounting %s\n", status, nfs_export_path);
500 else { 507 else {
501 nfs_data.root.size = fh.size; 508 nfs_data.root.size = fh.size;
502 memcpy(nfs_data.root.data, fh.data, fh.size); 509 memcpy(nfs_data.root.data, fh.data, fh.size);
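nfs_mount() now takes a single struct nfs_mount_request rather than seven positional arguments. A hedged caller sketch; the export path and address setup here are illustrative, not from the patch:

    struct nfs_fh fh;
    struct sockaddr_in sin;                 /* assume filled in elsewhere */
    struct nfs_mount_request request = {
            .sap      = (struct sockaddr *)&sin,
            .salen    = sizeof(sin),
            .dirpath  = "/export/root",     /* hypothetical path */
            .version  = NFS_MNT3_VERSION,
            .protocol = XPRT_TRANSPORT_TCP,
            .fh       = &fh,
    };
    int status = nfs_mount(&request);

Designated initializers zero every member left unset, so optional fields such as .hostname, and the .noresvport flag added in the super.c hunks below, can grow the struct without touching existing callers.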
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 40d17987d0e8..f856004bb7fa 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -533,12 +533,6 @@ readpage_async_filler(void *data, struct page *page)
533 unsigned int len; 533 unsigned int len;
534 int error; 534 int error;
535 535
536 error = nfs_wb_page(inode, page);
537 if (error)
538 goto out_unlock;
539 if (PageUptodate(page))
540 goto out_unlock;
541
542 len = nfs_page_length(page); 536 len = nfs_page_length(page);
543 if (len == 0) 537 if (len == 0)
544 return nfs_return_empty_page(page); 538 return nfs_return_empty_page(page);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index bb0313ac9e1f..d6686f4786dc 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -75,6 +75,7 @@ enum {
75 Opt_acl, Opt_noacl, 75 Opt_acl, Opt_noacl,
76 Opt_rdirplus, Opt_nordirplus, 76 Opt_rdirplus, Opt_nordirplus,
77 Opt_sharecache, Opt_nosharecache, 77 Opt_sharecache, Opt_nosharecache,
78 Opt_resvport, Opt_noresvport,
78 79
79 /* Mount options that take integer arguments */ 80 /* Mount options that take integer arguments */
80 Opt_port, 81 Opt_port,
@@ -129,6 +130,8 @@ static const match_table_t nfs_mount_option_tokens = {
129 { Opt_nordirplus, "nordirplus" }, 130 { Opt_nordirplus, "nordirplus" },
130 { Opt_sharecache, "sharecache" }, 131 { Opt_sharecache, "sharecache" },
131 { Opt_nosharecache, "nosharecache" }, 132 { Opt_nosharecache, "nosharecache" },
133 { Opt_resvport, "resvport" },
134 { Opt_noresvport, "noresvport" },
132 135
133 { Opt_port, "port=%u" }, 136 { Opt_port, "port=%u" },
134 { Opt_rsize, "rsize=%u" }, 137 { Opt_rsize, "rsize=%u" },
@@ -512,7 +515,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
512 { NFS_MOUNT_NONLM, ",nolock", "" }, 515 { NFS_MOUNT_NONLM, ",nolock", "" },
513 { NFS_MOUNT_NOACL, ",noacl", "" }, 516 { NFS_MOUNT_NOACL, ",noacl", "" },
514 { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" }, 517 { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" },
515 { NFS_MOUNT_UNSHARED, ",nosharecache", ""}, 518 { NFS_MOUNT_UNSHARED, ",nosharecache", "" },
519 { NFS_MOUNT_NORESVPORT, ",noresvport", "" },
516 { 0, NULL, NULL } 520 { 0, NULL, NULL }
517 }; 521 };
518 const struct proc_nfs_info *nfs_infop; 522 const struct proc_nfs_info *nfs_infop;
@@ -1033,6 +1037,12 @@ static int nfs_parse_mount_options(char *raw,
1033 case Opt_nosharecache: 1037 case Opt_nosharecache:
1034 mnt->flags |= NFS_MOUNT_UNSHARED; 1038 mnt->flags |= NFS_MOUNT_UNSHARED;
1035 break; 1039 break;
1040 case Opt_resvport:
1041 mnt->flags &= ~NFS_MOUNT_NORESVPORT;
1042 break;
1043 case Opt_noresvport:
1044 mnt->flags |= NFS_MOUNT_NORESVPORT;
1045 break;
1036 1046
1037 /* 1047 /*
1038 * options that take numeric values 1048 * options that take numeric values
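The new resvport/noresvport pair follows the usual match_table_t pattern for boolean mount options: the token table maps the strings, and the parser flips NFS_MOUNT_NORESVPORT in mnt->flags. A condensed sketch of that plumbing; the enum values and surrounding parse loop are omitted:

    static const match_table_t tokens = {
            { Opt_resvport,   "resvport" },
            { Opt_noresvport, "noresvport" },
            { Opt_err,        NULL }
    };

    switch (match_token(p, tokens, args)) {
    case Opt_resvport:
            mnt->flags &= ~NFS_MOUNT_NORESVPORT;    /* default: reserved source port */
            break;
    case Opt_noresvport:
            mnt->flags |= NFS_MOUNT_NORESVPORT;     /* permit a non-privileged port */
            break;
    }

From userspace this surfaces as, for example, "mount -o noresvport server:/export /mnt", and the nfs_show_mount_options hunk above makes the flag visible again in /proc/mounts.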
@@ -1327,8 +1337,14 @@ out_security_failure:
1327static int nfs_try_mount(struct nfs_parsed_mount_data *args, 1337static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1328 struct nfs_fh *root_fh) 1338 struct nfs_fh *root_fh)
1329{ 1339{
1330 struct sockaddr *sap = (struct sockaddr *)&args->mount_server.address; 1340 struct nfs_mount_request request = {
1331 char *hostname; 1341 .sap = (struct sockaddr *)
1342 &args->mount_server.address,
1343 .dirpath = args->nfs_server.export_path,
1344 .protocol = args->mount_server.protocol,
1345 .fh = root_fh,
1346 .noresvport = args->flags & NFS_MOUNT_NORESVPORT,
1347 };
1332 int status; 1348 int status;
1333 1349
1334 if (args->mount_server.version == 0) { 1350 if (args->mount_server.version == 0) {
@@ -1337,42 +1353,38 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1337 else 1353 else
1338 args->mount_server.version = NFS_MNT_VERSION; 1354 args->mount_server.version = NFS_MNT_VERSION;
1339 } 1355 }
1356 request.version = args->mount_server.version;
1340 1357
1341 if (args->mount_server.hostname) 1358 if (args->mount_server.hostname)
1342 hostname = args->mount_server.hostname; 1359 request.hostname = args->mount_server.hostname;
1343 else 1360 else
1344 hostname = args->nfs_server.hostname; 1361 request.hostname = args->nfs_server.hostname;
1345 1362
1346 /* 1363 /*
1347 * Construct the mount server's address. 1364 * Construct the mount server's address.
1348 */ 1365 */
1349 if (args->mount_server.address.ss_family == AF_UNSPEC) { 1366 if (args->mount_server.address.ss_family == AF_UNSPEC) {
1350 memcpy(sap, &args->nfs_server.address, 1367 memcpy(request.sap, &args->nfs_server.address,
1351 args->nfs_server.addrlen); 1368 args->nfs_server.addrlen);
1352 args->mount_server.addrlen = args->nfs_server.addrlen; 1369 args->mount_server.addrlen = args->nfs_server.addrlen;
1353 } 1370 }
1371 request.salen = args->mount_server.addrlen;
1354 1372
1355 /* 1373 /*
1356 * autobind will be used if mount_server.port == 0 1374 * autobind will be used if mount_server.port == 0
1357 */ 1375 */
1358 nfs_set_port(sap, args->mount_server.port); 1376 nfs_set_port(request.sap, args->mount_server.port);
1359 1377
1360 /* 1378 /*
1361 * Now ask the mount server to map our export path 1379 * Now ask the mount server to map our export path
1362 * to a file handle. 1380 * to a file handle.
1363 */ 1381 */
1364 status = nfs_mount(sap, 1382 status = nfs_mount(&request);
1365 args->mount_server.addrlen,
1366 hostname,
1367 args->nfs_server.export_path,
1368 args->mount_server.version,
1369 args->mount_server.protocol,
1370 root_fh);
1371 if (status == 0) 1383 if (status == 0)
1372 return 0; 1384 return 0;
1373 1385
1374 dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", 1386 dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
1375 hostname, status); 1387 request.hostname, status);
1376 return status; 1388 return status;
1377} 1389}
1378 1390
@@ -2419,7 +2431,7 @@ static void nfs4_kill_super(struct super_block *sb)
2419{ 2431{
2420 struct nfs_server *server = NFS_SB(sb); 2432 struct nfs_server *server = NFS_SB(sb);
2421 2433
2422 nfs_return_all_delegations(sb); 2434 nfs_super_return_all_delegations(sb);
2423 kill_anon_super(sb); 2435 kill_anon_super(sb);
2424 2436
2425 nfs4_renewd_prepare_shutdown(server); 2437 nfs4_renewd_prepare_shutdown(server);
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index c11f5375d7c1..04133aacb1e5 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -29,8 +29,8 @@
29 29
30MODULE_LICENSE("GPL"); 30MODULE_LICENSE("GPL");
31 31
32EXPORT_SYMBOL(nfsacl_encode); 32EXPORT_SYMBOL_GPL(nfsacl_encode);
33EXPORT_SYMBOL(nfsacl_decode); 33EXPORT_SYMBOL_GPL(nfsacl_decode);
34 34
35struct nfsacl_encode_desc { 35struct nfsacl_encode_desc {
36 struct xdr_array2_desc desc; 36 struct xdr_array2_desc desc;
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index b1acbd6ab6fb..b27451909dff 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -38,9 +38,10 @@ static struct file *do_open(char *name, int flags)
38 return ERR_PTR(error); 38 return ERR_PTR(error);
39 39
40 if (flags == O_RDWR) 40 if (flags == O_RDWR)
41 error = may_open(&nd,MAY_READ|MAY_WRITE,FMODE_READ|FMODE_WRITE); 41 error = may_open(&nd.path, MAY_READ|MAY_WRITE,
42 FMODE_READ|FMODE_WRITE);
42 else 43 else
43 error = may_open(&nd, MAY_WRITE, FMODE_WRITE); 44 error = may_open(&nd.path, MAY_WRITE, FMODE_WRITE);
44 45
45 if (!error) 46 if (!error)
46 return dentry_open(nd.path.dentry, nd.path.mnt, flags, 47 return dentry_open(nd.path.dentry, nd.path.mnt, flags,
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 0184fe9b514c..c903e04aa217 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -76,10 +76,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
76 76
77 ret = set_groups(new, gi); 77 ret = set_groups(new, gi);
78 put_group_info(gi); 78 put_group_info(gi);
79 if (!ret) 79 if (ret < 0)
80 goto error; 80 goto error;
81 81
82 if (new->uid) 82 if (new->fsuid)
83 new->cap_effective = cap_drop_nfsd_set(new->cap_effective); 83 new->cap_effective = cap_drop_nfsd_set(new->cap_effective);
84 else 84 else
85 new->cap_effective = cap_raise_nfsd_set(new->cap_effective, 85 new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
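Two fixes in one hunk. First, set_groups() returns 0 on success and a negative errno on failure, so the old test "if (!ret) goto error;" bailed out precisely when the call had succeeded. Second, the capability adjustment should key off the filesystem uid this function has just remapped, not the untouched real uid. The corrected convention, annotated:

    ret = set_groups(new, gi);      /* 0 on success, -errno on failure */
    put_group_info(gi);
    if (ret < 0)
            goto error;

    if (new->fsuid)                 /* non-root fsuid: drop the nfsd caps */
            new->cap_effective = cap_drop_nfsd_set(new->cap_effective);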
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 094747a1227c..c464181b5994 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -53,9 +53,6 @@
53#define NFSPROC4_CB_NULL 0 53#define NFSPROC4_CB_NULL 0
54#define NFSPROC4_CB_COMPOUND 1 54#define NFSPROC4_CB_COMPOUND 1
55 55
56/* declarations */
57static const struct rpc_call_ops nfs4_cb_null_ops;
58
59/* Index of predefined Linux callback client operations */ 56/* Index of predefined Linux callback client operations */
60 57
61enum { 58enum {
@@ -358,6 +355,7 @@ static struct rpc_program cb_program = {
358 .nrvers = ARRAY_SIZE(nfs_cb_version), 355 .nrvers = ARRAY_SIZE(nfs_cb_version),
359 .version = nfs_cb_version, 356 .version = nfs_cb_version,
360 .stats = &cb_stats, 357 .stats = &cb_stats,
358 .pipe_dir_name = "/nfsd4_cb",
361}; 359};
362 360
363/* Reference counting, callback cleanup, etc., all look racy as heck. 361/* Reference counting, callback cleanup, etc., all look racy as heck.
@@ -382,8 +380,9 @@ static int do_probe_callback(void *data)
382 .program = &cb_program, 380 .program = &cb_program,
383 .prognumber = cb->cb_prog, 381 .prognumber = cb->cb_prog,
384 .version = nfs_cb_version[1]->number, 382 .version = nfs_cb_version[1]->number,
385 .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */ 383 .authflavor = clp->cl_flavor,
386 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), 384 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
385 .client_name = clp->cl_principal,
387 }; 386 };
388 struct rpc_message msg = { 387 struct rpc_message msg = {
389 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], 388 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
@@ -392,6 +391,11 @@ static int do_probe_callback(void *data)
392 struct rpc_clnt *client; 391 struct rpc_clnt *client;
393 int status; 392 int status;
394 393
394 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) {
395 status = nfserr_cb_path_down;
396 goto out_err;
397 }
398
395 /* Initialize address */ 399 /* Initialize address */
396 memset(&addr, 0, sizeof(addr)); 400 memset(&addr, 0, sizeof(addr));
397 addr.sin_family = AF_INET; 401 addr.sin_family = AF_INET;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 669461e291ae..9fa60a3ad48c 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -946,6 +946,11 @@ encode_op:
946 nfsd4_encode_operation(resp, op); 946 nfsd4_encode_operation(resp, op);
947 status = op->status; 947 status = op->status;
948 } 948 }
949
950 dprintk("nfsv4 compound op %p opcnt %d #%d: %d: status %d\n",
951 args->ops, args->opcnt, resp->opcnt, op->opnum,
952 be32_to_cpu(status));
953
949 if (cstate->replay_owner) { 954 if (cstate->replay_owner) {
950 nfs4_put_stateowner(cstate->replay_owner); 955 nfs4_put_stateowner(cstate->replay_owner);
951 cstate->replay_owner = NULL; 956 cstate->replay_owner = NULL;
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 0f9d6efaa62b..74f7b67567fd 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -116,9 +116,9 @@ nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname)
116 116
117 md5_to_hex(dname, cksum.data); 117 md5_to_hex(dname, cksum.data);
118 118
119 kfree(cksum.data);
120 status = nfs_ok; 119 status = nfs_ok;
121out: 120out:
121 kfree(cksum.data);
122 crypto_free_hash(desc.tfm); 122 crypto_free_hash(desc.tfm);
123out_no_tfm: 123out_no_tfm:
124 return status; 124 return status;
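The nfs4recover.c change is a one-line leak fix: cksum.data was previously freed only on the success path, so any failure that jumped to the out label leaked the digest buffer. Moving the kfree() below the label restores the standard goto-out cleanup idiom, schematically:

    status = some_step();           /* any helper that can fail */
    if (status)
            goto out;               /* error path no longer leaks */
    status = nfs_ok;
    out:
            kfree(cksum.data);      /* now runs on success and error alike */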
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index bf4cd46a5a11..88db7d3ec120 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -54,6 +54,7 @@
54#include <linux/mutex.h> 54#include <linux/mutex.h>
55#include <linux/lockd/bind.h> 55#include <linux/lockd/bind.h>
56#include <linux/module.h> 56#include <linux/module.h>
57#include <linux/sunrpc/svcauth_gss.h>
57 58
58#define NFSDDBG_FACILITY NFSDDBG_PROC 59#define NFSDDBG_FACILITY NFSDDBG_PROC
59 60
@@ -377,6 +378,7 @@ free_client(struct nfs4_client *clp)
377 shutdown_callback_client(clp); 378 shutdown_callback_client(clp);
378 if (clp->cl_cred.cr_group_info) 379 if (clp->cl_cred.cr_group_info)
379 put_group_info(clp->cl_cred.cr_group_info); 380 put_group_info(clp->cl_cred.cr_group_info);
381 kfree(clp->cl_principal);
380 kfree(clp->cl_name.data); 382 kfree(clp->cl_name.data);
381 kfree(clp); 383 kfree(clp);
382} 384}
@@ -696,6 +698,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
696 unsigned int strhashval; 698 unsigned int strhashval;
697 struct nfs4_client *conf, *unconf, *new; 699 struct nfs4_client *conf, *unconf, *new;
698 __be32 status; 700 __be32 status;
701 char *princ;
699 char dname[HEXDIR_LEN]; 702 char dname[HEXDIR_LEN];
700 703
701 if (!check_name(clname)) 704 if (!check_name(clname))
@@ -783,6 +786,15 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
783 } 786 }
784 copy_verf(new, &clverifier); 787 copy_verf(new, &clverifier);
785 new->cl_addr = sin->sin_addr.s_addr; 788 new->cl_addr = sin->sin_addr.s_addr;
789 new->cl_flavor = rqstp->rq_flavor;
790 princ = svc_gss_principal(rqstp);
791 if (princ) {
792 new->cl_principal = kstrdup(princ, GFP_KERNEL);
793 if (new->cl_principal == NULL) {
794 free_client(new);
795 goto out;
796 }
797 }
786 copy_cred(&new->cl_cred, &rqstp->rq_cred); 798 copy_cred(&new->cl_cred, &rqstp->rq_cred);
787 gen_confirm(new); 799 gen_confirm(new);
788 gen_callback(new, setclid); 800 gen_callback(new, setclid);
@@ -2404,6 +2416,26 @@ out:
2404#define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) 2416#define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS)
2405#define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) 2417#define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1)
2406 2418
2419static inline u64
2420end_offset(u64 start, u64 len)
2421{
2422 u64 end;
2423
2424 end = start + len;
2425 return end >= start ? end: NFS4_MAX_UINT64;
2426}
2427
2428/* last octet in a range */
2429static inline u64
2430last_byte_offset(u64 start, u64 len)
2431{
2432 u64 end;
2433
2434 BUG_ON(!len);
2435 end = start + len;
2436 return end > start ? end - 1: NFS4_MAX_UINT64;
2437}
2438
2407#define lockownerid_hashval(id) \ 2439#define lockownerid_hashval(id) \
2408 ((id) & LOCK_HASH_MASK) 2440 ((id) & LOCK_HASH_MASK)
2409 2441
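NFSv4 expresses a lock range as an (offset, length) pair where a length of all ones means "to end of file", so naive offset + length arithmetic can wrap around. end_offset() clamps a wrapped sum to NFS4_MAX_UINT64, and last_byte_offset() does the same for the inclusive end that struct file_lock wants; the lock, lockt and locku hunks below then set fl_end in a single line. Worked values, illustrative only:

    end_offset(100, 10);                    /* == 110: first byte past the range */
    last_byte_offset(100, 10);              /* == 109: bytes 100..109 inclusive */
    last_byte_offset(8, NFS4_MAX_UINT64);   /* sum wraps, so clamped to
                                               NFS4_MAX_UINT64: lock runs to EOF */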
@@ -2423,13 +2455,13 @@ static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE];
2423static struct nfs4_stateid * 2455static struct nfs4_stateid *
2424find_stateid(stateid_t *stid, int flags) 2456find_stateid(stateid_t *stid, int flags)
2425{ 2457{
2426 struct nfs4_stateid *local = NULL; 2458 struct nfs4_stateid *local;
2427 u32 st_id = stid->si_stateownerid; 2459 u32 st_id = stid->si_stateownerid;
2428 u32 f_id = stid->si_fileid; 2460 u32 f_id = stid->si_fileid;
2429 unsigned int hashval; 2461 unsigned int hashval;
2430 2462
2431 dprintk("NFSD: find_stateid flags 0x%x\n",flags); 2463 dprintk("NFSD: find_stateid flags 0x%x\n",flags);
2432 if ((flags & LOCK_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) { 2464 if (flags & (LOCK_STATE | RD_STATE | WR_STATE)) {
2433 hashval = stateid_hashval(st_id, f_id); 2465 hashval = stateid_hashval(st_id, f_id);
2434 list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) { 2466 list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) {
2435 if ((local->st_stateid.si_stateownerid == st_id) && 2467 if ((local->st_stateid.si_stateownerid == st_id) &&
@@ -2437,7 +2469,8 @@ find_stateid(stateid_t *stid, int flags)
2437 return local; 2469 return local;
2438 } 2470 }
2439 } 2471 }
2440 if ((flags & OPEN_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) { 2472
2473 if (flags & (OPEN_STATE | RD_STATE | WR_STATE)) {
2441 hashval = stateid_hashval(st_id, f_id); 2474 hashval = stateid_hashval(st_id, f_id);
2442 list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) { 2475 list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) {
2443 if ((local->st_stateid.si_stateownerid == st_id) && 2476 if ((local->st_stateid.si_stateownerid == st_id) &&
@@ -2506,8 +2539,8 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
2506 deny->ld_clientid.cl_id = 0; 2539 deny->ld_clientid.cl_id = 0;
2507 } 2540 }
2508 deny->ld_start = fl->fl_start; 2541 deny->ld_start = fl->fl_start;
2509 deny->ld_length = ~(u64)0; 2542 deny->ld_length = NFS4_MAX_UINT64;
2510 if (fl->fl_end != ~(u64)0) 2543 if (fl->fl_end != NFS4_MAX_UINT64)
2511 deny->ld_length = fl->fl_end - fl->fl_start + 1; 2544 deny->ld_length = fl->fl_end - fl->fl_start + 1;
2512 deny->ld_type = NFS4_READ_LT; 2545 deny->ld_type = NFS4_READ_LT;
2513 if (fl->fl_type != F_RDLCK) 2546 if (fl->fl_type != F_RDLCK)
@@ -2604,7 +2637,7 @@ out:
2604static int 2637static int
2605check_lock_length(u64 offset, u64 length) 2638check_lock_length(u64 offset, u64 length)
2606{ 2639{
2607 return ((length == 0) || ((length != ~(u64)0) && 2640 return ((length == 0) || ((length != NFS4_MAX_UINT64) &&
2608 LOFF_OVERFLOW(offset, length))); 2641 LOFF_OVERFLOW(offset, length)));
2609} 2642}
2610 2643
@@ -2724,11 +2757,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2724 file_lock.fl_lmops = &nfsd_posix_mng_ops; 2757 file_lock.fl_lmops = &nfsd_posix_mng_ops;
2725 2758
2726 file_lock.fl_start = lock->lk_offset; 2759 file_lock.fl_start = lock->lk_offset;
2727 if ((lock->lk_length == ~(u64)0) || 2760 file_lock.fl_end = last_byte_offset(lock->lk_offset, lock->lk_length);
2728 LOFF_OVERFLOW(lock->lk_offset, lock->lk_length))
2729 file_lock.fl_end = ~(u64)0;
2730 else
2731 file_lock.fl_end = lock->lk_offset + lock->lk_length - 1;
2732 nfs4_transform_lock_offset(&file_lock); 2761 nfs4_transform_lock_offset(&file_lock);
2733 2762
2734 /* 2763 /*
@@ -2769,6 +2798,25 @@ out:
2769} 2798}
2770 2799
2771/* 2800/*
2801 * The NFSv4 spec allows a client to do a LOCKT without holding an OPEN,
2802 * so we do a temporary open here just to get an open file to pass to
2803 * vfs_test_lock. (Arguably perhaps test_lock should be done with an
2804 * inode operation.)
2805 */
2806static int nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock)
2807{
2808 struct file *file;
2809 int err;
2810
2811 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
2812 if (err)
2813 return err;
2814 err = vfs_test_lock(file, lock);
2815 nfsd_close(file);
2816 return err;
2817}
2818
2819/*
2772 * LOCKT operation 2820 * LOCKT operation
2773 */ 2821 */
2774__be32 2822__be32
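A plausible caller flow for the new helper (assumed here; the conflict handling is not part of this hunk): vfs_test_lock() leaves fl_type set to F_UNLCK when nothing conflicts, so LOCKT can turn a surviving lock description straight into a denial:

    error = nfsd_test_lock(rqstp, &cstate->current_fh, &file_lock);
    if (error)
            return nfserrno(error);
    if (file_lock.fl_type != F_UNLCK)       /* a conflicting lock exists */
            nfs4_set_lock_denied(&file_lock, &lockt->lt_denied);

The temporary nfsd_open()/nfsd_close() pair exists because vfs_test_lock() needs a genuine struct file, and a LOCKT client may never have sent an OPEN; the old trick of passing a zeroed struct file with only the dentry set is what the next hunk deletes.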
@@ -2776,7 +2824,6 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2776 struct nfsd4_lockt *lockt) 2824 struct nfsd4_lockt *lockt)
2777{ 2825{
2778 struct inode *inode; 2826 struct inode *inode;
2779 struct file file;
2780 struct file_lock file_lock; 2827 struct file_lock file_lock;
2781 int error; 2828 int error;
2782 __be32 status; 2829 __be32 status;
@@ -2827,23 +2874,12 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2827 file_lock.fl_lmops = &nfsd_posix_mng_ops; 2874 file_lock.fl_lmops = &nfsd_posix_mng_ops;
2828 2875
2829 file_lock.fl_start = lockt->lt_offset; 2876 file_lock.fl_start = lockt->lt_offset;
2830 if ((lockt->lt_length == ~(u64)0) || LOFF_OVERFLOW(lockt->lt_offset, lockt->lt_length)) 2877 file_lock.fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length);
2831 file_lock.fl_end = ~(u64)0;
2832 else
2833 file_lock.fl_end = lockt->lt_offset + lockt->lt_length - 1;
2834 2878
2835 nfs4_transform_lock_offset(&file_lock); 2879 nfs4_transform_lock_offset(&file_lock);
2836 2880
2837 /* vfs_test_lock uses the struct file _only_ to resolve the inode.
2838 * since LOCKT doesn't require an OPEN, and therefore a struct
2839 * file may not exist, pass vfs_test_lock a struct file with
2840 * only the dentry:inode set.
2841 */
2842 memset(&file, 0, sizeof (struct file));
2843 file.f_path.dentry = cstate->current_fh.fh_dentry;
2844
2845 status = nfs_ok; 2881 status = nfs_ok;
2846 error = vfs_test_lock(&file, &file_lock); 2882 error = nfsd_test_lock(rqstp, &cstate->current_fh, &file_lock);
2847 if (error) { 2883 if (error) {
2848 status = nfserrno(error); 2884 status = nfserrno(error);
2849 goto out; 2885 goto out;
@@ -2894,10 +2930,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2894 file_lock.fl_lmops = &nfsd_posix_mng_ops; 2930 file_lock.fl_lmops = &nfsd_posix_mng_ops;
2895 file_lock.fl_start = locku->lu_offset; 2931 file_lock.fl_start = locku->lu_offset;
2896 2932
2897 if ((locku->lu_length == ~(u64)0) || LOFF_OVERFLOW(locku->lu_offset, locku->lu_length)) 2933 file_lock.fl_end = last_byte_offset(locku->lu_offset, locku->lu_length);
2898 file_lock.fl_end = ~(u64)0;
2899 else
2900 file_lock.fl_end = locku->lu_offset + locku->lu_length - 1;
2901 nfs4_transform_lock_offset(&file_lock); 2934 nfs4_transform_lock_offset(&file_lock);
2902 2935
2903 /* 2936 /*
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index afcdf4b76843..f65953be39c0 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1,6 +1,4 @@
1/* 1/*
2 * fs/nfs/nfs4xdr.c
3 *
4 * Server-side XDR for NFSv4 2 * Server-side XDR for NFSv4
5 * 3 *
6 * Copyright (c) 2002 The Regents of the University of Michigan. 4 * Copyright (c) 2002 The Regents of the University of Michigan.
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 77d7b8c531a6..3d93b2064ce5 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -84,6 +84,8 @@ static ssize_t write_unexport(struct file *file, char *buf, size_t size);
84static ssize_t write_getfd(struct file *file, char *buf, size_t size); 84static ssize_t write_getfd(struct file *file, char *buf, size_t size);
85static ssize_t write_getfs(struct file *file, char *buf, size_t size); 85static ssize_t write_getfs(struct file *file, char *buf, size_t size);
86static ssize_t write_filehandle(struct file *file, char *buf, size_t size); 86static ssize_t write_filehandle(struct file *file, char *buf, size_t size);
87static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size);
88static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size);
87static ssize_t write_threads(struct file *file, char *buf, size_t size); 89static ssize_t write_threads(struct file *file, char *buf, size_t size);
88static ssize_t write_pool_threads(struct file *file, char *buf, size_t size); 90static ssize_t write_pool_threads(struct file *file, char *buf, size_t size);
89static ssize_t write_versions(struct file *file, char *buf, size_t size); 91static ssize_t write_versions(struct file *file, char *buf, size_t size);
@@ -94,9 +96,6 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
 static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
 #endif
 
-static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size);
-static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size);
-
 static ssize_t (*write_op[])(struct file *, char *, size_t) = {
 	[NFSD_Svc] = write_svc,
 	[NFSD_Add] = write_add,
@@ -106,8 +105,8 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
 	[NFSD_Getfd] = write_getfd,
 	[NFSD_Getfs] = write_getfs,
 	[NFSD_Fh] = write_filehandle,
-	[NFSD_FO_UnlockIP] = failover_unlock_ip,
-	[NFSD_FO_UnlockFS] = failover_unlock_fs,
+	[NFSD_FO_UnlockIP] = write_unlock_ip,
+	[NFSD_FO_UnlockFS] = write_unlock_fs,
 	[NFSD_Threads] = write_threads,
 	[NFSD_Pool_Threads] = write_pool_threads,
 	[NFSD_Versions] = write_versions,
@@ -176,10 +175,24 @@ static const struct file_operations exports_operations = {
 /*----------------------------------------------------------------------------*/
 /*
  * payload - write methods
- * If the method has a response, the response should be put in buf,
- * and the length returned.  Otherwise return 0 or and -error.
  */
 
+/**
+ * write_svc - Start kernel's NFSD server
+ *
+ * Deprecated.  /proc/fs/nfsd/threads is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *			buf:		struct nfsctl_svc
+ *					svc_port:	port number of this
+ *							server's listener
+ *					svc_nthreads:	number of threads to start
+ *			size:		size in bytes of passed in nfsctl_svc
+ * Output:
+ *	On success:	returns zero
+ *	On error:	return code is negative errno value
+ */
 static ssize_t write_svc(struct file *file, char *buf, size_t size)
 {
 	struct nfsctl_svc *data;
@@ -189,6 +202,30 @@ static ssize_t write_svc(struct file *file, char *buf, size_t size)
 	return nfsd_svc(data->svc_port, data->svc_nthreads);
 }
 
+/**
+ * write_add - Add or modify client entry in auth unix cache
+ *
+ * Deprecated.  /proc/net/rpc/auth.unix.ip is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *			buf:		struct nfsctl_client
+ *					cl_ident:	'\0'-terminated C string
+ *							containing domain name
+ *							of client
+ *					cl_naddr:	no. of items in cl_addrlist
+ *					cl_addrlist:	array of client addresses
+ *					cl_fhkeytype:	ignored
+ *					cl_fhkeylen:	ignored
+ *					cl_fhkey:	ignored
+ *			size:		size in bytes of passed in nfsctl_client
+ * Output:
+ *	On success:	returns zero
+ *	On error:	return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in, since
+ * nfsctl_client.cl_addrlist contains only in_addr fields for addresses.
+ */
 static ssize_t write_add(struct file *file, char *buf, size_t size)
 {
 	struct nfsctl_client *data;
@@ -198,6 +235,30 @@ static ssize_t write_add(struct file *file, char *buf, size_t size)
 	return exp_addclient(data);
 }
 
+/**
+ * write_del - Remove client from auth unix cache
+ *
+ * Deprecated.  /proc/net/rpc/auth.unix.ip is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *			buf:		struct nfsctl_client
+ *					cl_ident:	'\0'-terminated C string
+ *							containing domain name
+ *							of client
+ *					cl_naddr:	ignored
+ *					cl_addrlist:	ignored
+ *					cl_fhkeytype:	ignored
+ *					cl_fhkeylen:	ignored
+ *					cl_fhkey:	ignored
+ *			size:		size in bytes of passed in nfsctl_client
+ * Output:
+ *	On success:	returns zero
+ *	On error:	return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in, since
+ * nfsctl_client.cl_addrlist contains only in_addr fields for addresses.
+ */
 static ssize_t write_del(struct file *file, char *buf, size_t size)
 {
 	struct nfsctl_client *data;
@@ -207,6 +268,33 @@ static ssize_t write_del(struct file *file, char *buf, size_t size)
 	return exp_delclient(data);
 }
 
+/**
+ * write_export - Export part or all of a local file system
+ *
+ * Deprecated.  /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *			buf:		struct nfsctl_export
+ *					ex_client:	'\0'-terminated C string
+ *							containing domain name
+ *							of client allowed to access
+ *							this export
+ *					ex_path:	'\0'-terminated C string
+ *							containing pathname of
+ *							directory in local file system
+ *					ex_dev:		fsid to use for this export
+ *					ex_ino:		ignored
+ *					ex_flags:	export flags for this export
+ *					ex_anon_uid:	UID to use for anonymous
+ *							requests
+ *					ex_anon_gid:	GID to use for anonymous
+ *							requests
+ *			size:		size in bytes of passed in nfsctl_export
+ * Output:
+ *	On success:	returns zero
+ *	On error:	return code is negative errno value
+ */
 static ssize_t write_export(struct file *file, char *buf, size_t size)
 {
 	struct nfsctl_export *data;
@@ -216,6 +304,31 @@ static ssize_t write_export(struct file *file, char *buf, size_t size)
 	return exp_export(data);
 }
 
+/**
+ * write_unexport - Unexport a previously exported file system
+ *
+ * Deprecated.  /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *			buf:		struct nfsctl_export
+ *					ex_client:	'\0'-terminated C string
+ *							containing domain name
+ *							of client no longer allowed
+ *							to access this export
+ *					ex_path:	'\0'-terminated C string
+ *							containing pathname of
+ *							directory in local file system
+ *					ex_dev:		ignored
+ *					ex_ino:		ignored
+ *					ex_flags:	ignored
+ *					ex_anon_uid:	ignored
+ *					ex_anon_gid:	ignored
+ *			size:		size in bytes of passed in nfsctl_export
+ * Output:
+ *	On success:	returns zero
+ *	On error:	return code is negative errno value
+ */
 static ssize_t write_unexport(struct file *file, char *buf, size_t size)
 {
 	struct nfsctl_export *data;
@@ -226,6 +339,30 @@ static ssize_t write_unexport(struct file *file, char *buf, size_t size)
 	return exp_unexport(data);
 }
 
+/**
+ * write_getfs - Get a variable-length NFS file handle by path
+ *
+ * Deprecated.  /proc/fs/nfsd/filehandle is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *			buf:		struct nfsctl_fsparm
+ *					gd_addr:	socket address of client
+ *					gd_path:	'\0'-terminated C string
+ *							containing pathname of
+ *							directory in local file system
+ *					gd_maxlen:	maximum size of returned file
+ *							handle
+ *			size:		size in bytes of passed in nfsctl_fsparm
+ * Output:
+ *	On success:	passed-in buffer filled with a knfsd_fh structure
+ *			(a variable-length raw NFS file handle);
+ *			return code is the size in bytes of the file handle
+ *	On error:	return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in, since gd_addr
+ * is the same size as a struct sockaddr_in.
+ */
 static ssize_t write_getfs(struct file *file, char *buf, size_t size)
 {
 	struct nfsctl_fsparm *data;
@@ -265,6 +402,29 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size)
 	return err;
 }
 
+/**
+ * write_getfd - Get a fixed-length NFS file handle by path (used by mountd)
+ *
+ * Deprecated.  /proc/fs/nfsd/filehandle is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *			buf:		struct nfsctl_fdparm
+ *					gd_addr:	socket address of client
+ *					gd_path:	'\0'-terminated C string
+ *							containing pathname of
+ *							directory in local file system
+ *					gd_version:	fdparm structure version
+ *			size:		size in bytes of passed in nfsctl_fdparm
+ * Output:
+ *	On success:	passed-in buffer filled with nfsctl_res
+ *			(a fixed-length raw NFS file handle);
+ *			return code is the size in bytes of the file handle
+ *	On error:	return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in, since gd_addr
+ * is the same size as a struct sockaddr_in.
+ */
 static ssize_t write_getfd(struct file *file, char *buf, size_t size)
 {
 	struct nfsctl_fdparm *data;
@@ -309,7 +469,23 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
 	return err;
 }
 
-static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size)
+/**
+ * write_unlock_ip - Release all locks used by a client
+ *
+ * Experimental.
+ *
+ * Input:
+ *			buf:		'\n'-terminated C string containing a
+ *					presentation format IPv4 address
+ *			size:		length of C string in @buf
+ * Output:
+ *	On success:	returns zero if all specified locks were released;
+ *			returns one if one or more locks were not released
+ *	On error:	return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in
+ */
+static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
 {
 	struct sockaddr_in sin = {
 		.sin_family = AF_INET,
@@ -339,7 +515,21 @@ static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size)
 	return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin);
 }
 
-static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size)
+/**
+ * write_unlock_fs - Release all locks on a local file system
+ *
+ * Experimental.
+ *
+ * Input:
+ *			buf:		'\n'-terminated C string containing the
+ *					absolute pathname of a local file system
+ *			size:		length of C string in @buf
+ * Output:
+ *	On success:	returns zero if all specified locks were released;
+ *			returns one if one or more locks were not released
+ *	On error:	return code is negative errno value
+ */
+static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
 {
 	struct path path;
 	char *fo_path;
@@ -360,21 +550,44 @@ static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size)
 	if (error)
 		return error;
 
+	/*
+	 * XXX: Needs better sanity checking.  Otherwise we could end up
+	 * releasing locks on the wrong file system.
+	 *
+	 * For example:
+	 * 1.  Does the path refer to a directory?
+	 * 2.  Is that directory a mount point, or
+	 * 3.  Is that directory the root of an exported file system?
+	 */
 	error = nlmsvc_unlock_all_by_sb(path.mnt->mnt_sb);
 
 	path_put(&path);
 	return error;
 }
 
+/**
+ * write_filehandle - Get a variable-length NFS file handle by path
+ *
+ * On input, the buffer contains a '\n'-terminated C string comprised of
+ * three alphanumeric words separated by whitespace.  The string may
+ * contain escape sequences.
+ *
+ * Input:
+ *			buf:
+ *					domain:		client domain name
+ *					path:		export pathname
+ *					maxsize:	numeric maximum size of
+ *							@buf
+ *			size:		length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C
+ *			string containing a ASCII hex text version
+ *			of the NFS file handle;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is negative errno value
+ */
 static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
 {
-	/* request is:
-	 *   domain path maxsize
-	 * response is
-	 *   filehandle
-	 *
-	 * qword quoting is used, so filehandle will be \x....
-	 */
 	char *dname, *path;
 	int uninitialized_var(maxsize);
 	char *mesg = buf;
@@ -391,11 +604,13 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
 
 	dname = mesg;
 	len = qword_get(&mesg, dname, size);
-	if (len <= 0) return -EINVAL;
+	if (len <= 0)
+		return -EINVAL;
 
 	path = dname+len+1;
 	len = qword_get(&mesg, path, size);
-	if (len <= 0) return -EINVAL;
+	if (len <= 0)
+		return -EINVAL;
 
 	len = get_int(&mesg, &maxsize);
 	if (len)
@@ -419,17 +634,43 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
 	if (len)
 		return len;
 
-	mesg = buf; len = SIMPLE_TRANSACTION_LIMIT;
+	mesg = buf;
+	len = SIMPLE_TRANSACTION_LIMIT;
 	qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size);
 	mesg[-1] = '\n';
 	return mesg - buf;
 }
 
+/**
+ * write_threads - Start NFSD, or report the current number of running threads
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C
+ *			string numeric value representing the number of
+ *			running NFSD threads;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing an unsigned
+ *					integer value representing the
+ *					number of NFSD threads to start
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	NFS service is started;
+ *			passed-in buffer filled with '\n'-terminated C
+ *			string numeric value representing the number of
+ *			running NFSD threads;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
 static ssize_t write_threads(struct file *file, char *buf, size_t size)
 {
-	/* if size > 0, look for a number of threads and call nfsd_svc
-	 * then write out number of threads as reply
-	 */
 	char *mesg = buf;
 	int rv;
 	if (size > 0) {
@@ -437,9 +678,9 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
 		rv = get_int(&mesg, &newthreads);
 		if (rv)
 			return rv;
-		if (newthreads <0)
+		if (newthreads < 0)
 			return -EINVAL;
-		rv = nfsd_svc(2049, newthreads);
+		rv = nfsd_svc(NFS_PORT, newthreads);
 		if (rv)
 			return rv;
 	}
@@ -447,6 +688,28 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
 	return strlen(buf);
 }
 
+/**
+ * write_pool_threads - Set or report the current number of threads per pool
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing whitespace-
+ *					separated unsigned integer values
+ *					representing the number of NFSD
+ *					threads to start in each pool
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C
+ *			string containing integer values representing the
+ *			number of NFSD threads in each pool;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
 static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
 {
 	/* if size > 0, look for an array of number of threads per node
@@ -517,10 +780,6 @@ out_free:
 
 static ssize_t __write_versions(struct file *file, char *buf, size_t size)
 {
-	/*
-	 * Format:
-	 *	[-/+]vers [-/+]vers ...
-	 */
 	char *mesg = buf;
 	char *vers, sign;
 	int len, num;
@@ -578,6 +837,38 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
 	return len;
 }
 
+/**
+ * write_versions - Set or report the available NFS protocol versions
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C
+ *			string containing positive or negative integer
+ *			values representing the current status of each
+ *			protocol version;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing whitespace-
+ *					separated positive or negative
+ *					integer values representing NFS
+ *					protocol versions to enable ("+n")
+ *					or disable ("-n")
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	status of zero or more protocol versions has
+ *			been updated; passed-in buffer filled with
+ *			'\n'-terminated C string containing positive
+ *			or negative integer values representing the
+ *			current status of each protocol version;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
 static ssize_t write_versions(struct file *file, char *buf, size_t size)
 {
 	ssize_t rv;
@@ -687,6 +978,75 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
 	return -EINVAL;
 }
 
+/**
+ * write_ports - Pass a socket file descriptor or transport name to listen on
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ * Output:
+ *	On success:	passed-in buffer filled with a '\n'-terminated C
+ *			string containing a whitespace-separated list of
+ *			named NFSD listeners;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing an unsigned
+ *					integer value representing a bound
+ *					but unconnected socket that is to be
+ *					used as an NFSD listener
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	NFS service is started;
+ *			passed-in buffer filled with a '\n'-terminated C
+ *			string containing a unique alphanumeric name of
+ *			the listener;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing a "-" followed
+ *					by an integer value representing a
+ *					previously passed in socket file
+ *					descriptor
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	NFS service no longer listens on that socket;
+ *			passed-in buffer filled with a '\n'-terminated C
+ *			string containing a unique name of the listener;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing a transport
+ *					name and an unsigned integer value
+ *					representing the port to listen on,
+ *					separated by whitespace
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	returns zero; NFS service is started
+ *	On error:	return code is a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing a "-" followed
+ *					by a transport name and an unsigned
+ *					integer value representing the port
+ *					to listen on, separated by whitespace
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	returns zero; NFS service no longer listens
+ *			on that transport
+ *	On error:	return code is a negative errno value
+ */
 static ssize_t write_ports(struct file *file, char *buf, size_t size)
 {
 	ssize_t rv;
@@ -700,6 +1060,27 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
 
 int nfsd_max_blksize;
 
+/**
+ * write_maxblksize - Set or report the current NFS blksize
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing an unsigned
+ *					integer value representing the new
+ *					NFS blksize
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C string
+ *			containing numeric value of the current NFS blksize
+ *			setting;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
 static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
 {
 	char *mesg = buf;
@@ -752,6 +1133,27 @@ static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
 	return strlen(buf);
 }
 
+/**
+ * write_leasetime - Set or report the current NFSv4 lease time
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing an unsigned
+ *					integer value representing the new
+ *					NFSv4 lease expiry time
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C
+ *			string containing unsigned integer value of the
+ *			current lease expiry time;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
 static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
 {
 	ssize_t rv;
@@ -788,6 +1190,27 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
 	return strlen(buf);
 }
 
+/**
+ * write_recoverydir - Set or report the pathname of the recovery directory
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing the pathname
+ *					of the directory on a local file
+ *					system containing permanent NFSv4
+ *					recovery data
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C string
+ *			containing the current recovery pathname setting;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
 static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
 {
 	ssize_t rv;
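
All of the kernel-doc comments added above describe the same transaction-style interface: userspace writes a request into a file under /proc/fs/nfsd and reads the reply back from the same descriptor. A hedged userspace sketch of that pattern against the threads file (assumes root privileges and a mounted nfsd filesystem; error handling kept minimal):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char reply[128];
		ssize_t n;
		int fd = open("/proc/fs/nfsd/threads", O_RDWR);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (write(fd, "8\n", 2) < 0)	/* ask for 8 nfsd threads */
			perror("write");
		n = read(fd, reply, sizeof(reply) - 1);	/* reply: thread count */
		if (n > 0) {
			reply[n] = '\0';
			printf("running threads: %s", reply);
		}
		close(fd);
		return 0;
	}
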
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index f0da7d9c3a92..9f1ca17293d3 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -258,14 +258,32 @@ out:
 	return error;
 }
 
-/*
- * Perform sanity checks on the dentry in a client's file handle.
+/**
+ * fh_verify - filehandle lookup and access checking
+ * @rqstp: pointer to current rpc request
+ * @fhp: filehandle to be verified
+ * @type: expected type of object pointed to by filehandle
+ * @access: type of access needed to object
+ *
+ * Look up a dentry from the on-the-wire filehandle, check the client's
+ * access to the export, and set the current task's credentials.
+ *
+ * Regardless of success or failure of fh_verify(), fh_put() should be
+ * called on @fhp when the caller is finished with the filehandle.
  *
- * Note that the file handle dentry may need to be freed even after
- * an error return.
+ * fh_verify() may be called multiple times on a given filehandle, for
+ * example, when processing an NFSv4 compound.  The first call will look
+ * up a dentry using the on-the-wire filehandle.  Subsequent calls will
+ * skip the lookup and just perform the other checks and possibly change
+ * the current task's credentials.
  *
- * This is only called at the start of an nfsproc call, so fhp points to
- * a svc_fh which is all 0 except for the over-the-wire file handle.
+ * @type specifies the type of object expected using one of the S_IF*
+ * constants defined in include/linux/stat.h.  The caller may use zero
+ * to indicate that it doesn't care, or a negative integer to indicate
+ * that it expects something not of the given type.
+ *
+ * @access is formed from the NFSD_MAY_* constants defined in
+ * include/linux/nfsd/nfsd.h.
  */
 __be32
 fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
@@ -466,6 +484,8 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
 			goto retry;
 			break;
 		}
+	} else if (exp->ex_flags & NFSEXP_FSID) {
+		fsid_type = FSID_NUM;
 	} else if (exp->ex_uuid) {
 		if (fhp->fh_maxsize >= 64) {
 			if (root_export)
@@ -478,9 +498,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
 			else
 				fsid_type = FSID_UUID4_INUM;
 		}
-	} else if (exp->ex_flags & NFSEXP_FSID)
-		fsid_type = FSID_NUM;
-	else if (!old_valid_dev(ex_dev))
+	} else if (!old_valid_dev(ex_dev))
 		/* for newer device numbers, we must use a newer fsid format */
 		fsid_type = FSID_ENCODE_DEV;
 	else
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 5cffeca7acef..6f7f26351227 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -622,6 +622,7 @@ nfserrno (int errno)
 	{ nfserr_badname, -ESRCH },
 	{ nfserr_io, -ETXTBSY },
 	{ nfserr_notsupp, -EOPNOTSUPP },
+	{ nfserr_toosmall, -ETOOSMALL },
 	};
 	int	i;
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index d1c5f787b365..6e50aaa56ca2 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -764,7 +764,6 @@ static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
 
 	return err;
 }
-
 
 static int
 nfsd_sync(struct file *filp)
@@ -1211,7 +1210,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	dirp = dentry->d_inode;
 
 	err = nfserr_notdir;
-	if(!dirp->i_op || !dirp->i_op->lookup)
+	if (!dirp->i_op->lookup)
 		goto out;
 	/*
 	 * Check whether the response file handle has been verified yet.
@@ -1347,7 +1346,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	/* Get all the sanity checks out of the way before
 	 * we lock the parent. */
 	err = nfserr_notdir;
-	if(!dirp->i_op || !dirp->i_op->lookup)
+	if (!dirp->i_op->lookup)
 		goto out;
 	fh_lock_nested(fhp, I_MUTEX_PARENT);
 
@@ -1482,7 +1481,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
 	inode = dentry->d_inode;
 
 	err = nfserr_inval;
-	if (!inode->i_op || !inode->i_op->readlink)
+	if (!inode->i_op->readlink)
 		goto out;
 
 	touch_atime(fhp->fh_export->ex_path.mnt, dentry);
@@ -2162,7 +2161,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
 	size_t size;
 	int error;
 
-	if (!IS_POSIXACL(inode) || !inode->i_op ||
+	if (!IS_POSIXACL(inode) ||
 	    !inode->i_op->setxattr || !inode->i_op->removexattr)
 		return -EOPNOTSUPP;
 	switch(type) {
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
new file mode 100644
index 000000000000..50914d7303c6
--- /dev/null
+++ b/fs/notify/Kconfig
@@ -0,0 +1,2 @@
+source "fs/notify/dnotify/Kconfig"
+source "fs/notify/inotify/Kconfig"
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
new file mode 100644
index 000000000000..5a95b6010ce7
--- /dev/null
+++ b/fs/notify/Makefile
@@ -0,0 +1,2 @@
+obj-y += dnotify/
+obj-y += inotify/
diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig
new file mode 100644
index 000000000000..26adf5dfa646
--- /dev/null
+++ b/fs/notify/dnotify/Kconfig
@@ -0,0 +1,10 @@
+config DNOTIFY
+	bool "Dnotify support"
+	default y
+	help
+	  Dnotify is a directory-based per-fd file change notification system
+	  that uses signals to communicate events to user-space.  There exist
+	  superior alternatives, but some applications may still rely on
+	  dnotify.
+
+	  If unsure, say Y.
diff --git a/fs/notify/dnotify/Makefile b/fs/notify/dnotify/Makefile
new file mode 100644
index 000000000000..f145251dcadb
--- /dev/null
+++ b/fs/notify/dnotify/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_DNOTIFY) += dnotify.o
diff --git a/fs/dnotify.c b/fs/notify/dnotify/dnotify.c
index 676073b8dda5..b0aa2cde80bd 100644
--- a/fs/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -115,9 +115,6 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	dn->dn_next = inode->i_dnotify;
 	inode->i_dnotify = dn;
 	spin_unlock(&inode->i_lock);
-
-	if (filp->f_op && filp->f_op->dir_notify)
-		return filp->f_op->dir_notify(filp, arg);
 	return 0;
 
 out_free:
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
new file mode 100644
index 000000000000..446792841023
--- /dev/null
+++ b/fs/notify/inotify/Kconfig
@@ -0,0 +1,27 @@
+config INOTIFY
+	bool "Inotify file change notification support"
+	default y
+	---help---
+	  Say Y here to enable inotify support.  Inotify is a file change
+	  notification system and a replacement for dnotify.  Inotify fixes
+	  numerous shortcomings in dnotify and introduces several new features
+	  including multiple file events, one-shot support, and unmount
+	  notification.
+
+	  For more information, see <file:Documentation/filesystems/inotify.txt>
+
+	  If unsure, say Y.
+
+config INOTIFY_USER
+	bool "Inotify support for userspace"
+	depends on INOTIFY
+	default y
+	---help---
+	  Say Y here to enable inotify support for userspace, including the
+	  associated system calls.  Inotify allows monitoring of both files and
+	  directories via a single open fd.  Events are read from the file
+	  descriptor, which is also select()- and poll()-able.
+
+	  For more information, see <file:Documentation/filesystems/inotify.txt>
+
+	  If unsure, say Y.
diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile
new file mode 100644
index 000000000000..e290f3bb9d8d
--- /dev/null
+++ b/fs/notify/inotify/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_INOTIFY)		+= inotify.o
+obj-$(CONFIG_INOTIFY_USER)	+= inotify_user.o
diff --git a/fs/inotify.c b/fs/notify/inotify/inotify.c
index dae3f28f30d4..dae3f28f30d4 100644
--- a/fs/inotify.c
+++ b/fs/notify/inotify/inotify.c
diff --git a/fs/inotify_user.c b/fs/notify/inotify/inotify_user.c
index e2425bbd871f..81b8644b0136 100644
--- a/fs/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -76,10 +76,10 @@ struct inotify_device {
 	struct mutex		ev_mutex;	/* protects event queue */
 	struct mutex		up_mutex;	/* synchronizes watch updates */
 	struct list_head	events;		/* list of queued events */
-	atomic_t		count;		/* reference count */
 	struct user_struct	*user;		/* user who opened this dev */
 	struct inotify_handle	*ih;		/* inotify handle */
 	struct fasync_struct	*fa;		/* async notification */
+	atomic_t		count;		/* reference count */
 	unsigned int		queue_size;	/* size of the queue (bytes) */
 	unsigned int		event_count;	/* number of pending events */
 	unsigned int		max_events;	/* maximum number of events */
@@ -704,7 +704,7 @@ fput_and_out:
 	return ret;
 }
 
-asmlinkage long sys_inotify_rm_watch(int fd, u32 wd)
+asmlinkage long sys_inotify_rm_watch(int fd, __s32 wd)
 {
 	struct file *filp;
 	struct inotify_device *dev;
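
For context, inotify_rm_watch() is the userspace-facing wrapper for the syscall whose wd argument becomes signed here. A minimal sketch of the add/remove flow using only the long-standing libc wrappers:

	#include <stdio.h>
	#include <sys/inotify.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = inotify_init();
		int wd;

		if (fd < 0) {
			perror("inotify_init");
			return 1;
		}
		wd = inotify_add_watch(fd, "/tmp", IN_CREATE);
		if (wd < 0) {
			perror("inotify_add_watch");
			return 1;
		}
		/* ... read() struct inotify_event records from fd here ... */
		if (inotify_rm_watch(fd, wd) < 0)	/* wd is a signed value */
			perror("inotify_rm_watch");
		close(fd);
		return 0;
	}
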
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index e9da092e2772..86bef156cf0a 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1406,9 +1406,6 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
 		ni->allocated_size = sle64_to_cpu(
 				a->data.non_resident.allocated_size);
 	}
-	/* Setup the operations for this attribute inode. */
-	vi->i_op = NULL;
-	vi->i_fop = NULL;
 	if (NInoMstProtected(ni))
 		vi->i_mapping->a_ops = &ntfs_mst_aops;
 	else
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 589dcdfdfe3c..01596079dd63 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o
 ocfs2-objs := \
 	alloc.o			\
 	aops.o			\
+	blockcheck.o		\
 	buffer_head_io.o	\
 	dcache.o		\
 	dir.o			\
@@ -35,8 +36,14 @@ ocfs2-objs := \
 	sysfile.o		\
 	uptodate.o		\
 	ver.o			\
+	quota_local.o		\
+	quota_global.o		\
 	xattr.o
 
+ifeq ($(CONFIG_OCFS2_FS_POSIX_ACL),y)
+ocfs2-objs += acl.o
+endif
+
 ocfs2_stackglue-objs := stackglue.o
 ocfs2_stack_o2cb-objs := stack_o2cb.o
 ocfs2_stack_user-objs := stack_user.o
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
new file mode 100644
index 000000000000..12dfb44c22e5
--- /dev/null
+++ b/fs/ocfs2/acl.c
@@ -0,0 +1,479 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * acl.c
+ *
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
+ *
+ * CREDITS:
+ * Lots of code in this file is copy from linux/fs/ext3/acl.c.
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "alloc.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "ocfs2_fs.h"
+
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * Convert from xattr value to acl struct.
+ */
+static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size)
+{
+	int n, count;
+	struct posix_acl *acl;
+
+	if (!value)
+		return NULL;
+	if (size < sizeof(struct posix_acl_entry))
+		return ERR_PTR(-EINVAL);
+
+	count = size / sizeof(struct posix_acl_entry);
+	if (count < 0)
+		return ERR_PTR(-EINVAL);
+	if (count == 0)
+		return NULL;
+
+	acl = posix_acl_alloc(count, GFP_NOFS);
+	if (!acl)
+		return ERR_PTR(-ENOMEM);
+	for (n = 0; n < count; n++) {
+		struct ocfs2_acl_entry *entry =
+			(struct ocfs2_acl_entry *)value;
+
+		acl->a_entries[n].e_tag  = le16_to_cpu(entry->e_tag);
+		acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
+		acl->a_entries[n].e_id   = le32_to_cpu(entry->e_id);
+		value += sizeof(struct posix_acl_entry);
+
+	}
+	return acl;
+}
+
+/*
+ * Convert acl struct to xattr value.
+ */
+static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size)
+{
+	struct ocfs2_acl_entry *entry = NULL;
+	char *ocfs2_acl;
+	size_t n;
+
+	*size = acl->a_count * sizeof(struct posix_acl_entry);
+
+	ocfs2_acl = kmalloc(*size, GFP_NOFS);
+	if (!ocfs2_acl)
+		return ERR_PTR(-ENOMEM);
+
+	entry = (struct ocfs2_acl_entry *)ocfs2_acl;
+	for (n = 0; n < acl->a_count; n++, entry++) {
+		entry->e_tag  = cpu_to_le16(acl->a_entries[n].e_tag);
+		entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
+		entry->e_id   = cpu_to_le32(acl->a_entries[n].e_id);
+	}
+	return ocfs2_acl;
+}
+
+static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode,
+					      int type,
+					      struct buffer_head *di_bh)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int name_index;
+	char *value = NULL;
+	struct posix_acl *acl;
+	int retval;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return NULL;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+
+	retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index, "", NULL, 0);
+	if (retval > 0) {
+		value = kmalloc(retval, GFP_NOFS);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
+						"", value, retval);
+	}
+
+	if (retval > 0)
+		acl = ocfs2_acl_from_xattr(value, retval);
+	else if (retval == -ENODATA || retval == 0)
+		acl = NULL;
+	else
+		acl = ERR_PTR(retval);
+
+	kfree(value);
+
+	return acl;
+}
+
+
+/*
+ * Get posix acl.
+ */
+static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *di_bh = NULL;
+	struct posix_acl *acl;
+	int ret;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return NULL;
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		acl = ERR_PTR(ret);
+		return acl;
+	}
+
+	acl = ocfs2_get_acl_nolock(inode, type, di_bh);
+
+	ocfs2_inode_unlock(inode, 0);
+
+	brelse(di_bh);
+
+	return acl;
+}
+
+/*
+ * Set the access or default ACL of an inode.
+ */
+static int ocfs2_set_acl(handle_t *handle,
+			 struct inode *inode,
+			 struct buffer_head *di_bh,
+			 int type,
+			 struct posix_acl *acl,
+			 struct ocfs2_alloc_context *meta_ac,
+			 struct ocfs2_alloc_context *data_ac)
+{
+	int name_index;
+	void *value = NULL;
+	size_t size = 0;
+	int ret;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
+		if (acl) {
+			mode_t mode = inode->i_mode;
+			ret = posix_acl_equiv_mode(acl, &mode);
+			if (ret < 0)
+				return ret;
+			else {
+				inode->i_mode = mode;
+				if (ret == 0)
+					acl = NULL;
+			}
+		}
+		break;
+	case ACL_TYPE_DEFAULT:
+		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		if (!S_ISDIR(inode->i_mode))
+			return acl ? -EACCES : 0;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (acl) {
+		value = ocfs2_acl_to_xattr(acl, &size);
+		if (IS_ERR(value))
+			return (int)PTR_ERR(value);
+	}
+
+	if (handle)
+		ret = ocfs2_xattr_set_handle(handle, inode, di_bh, name_index,
+					     "", value, size, 0,
+					     meta_ac, data_ac);
+	else
+		ret = ocfs2_xattr_set(inode, name_index, "", value, size, 0);
+
+	kfree(value);
+
+	return ret;
+}
+
+int ocfs2_check_acl(struct inode *inode, int mask)
+{
+	struct posix_acl *acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
+
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl) {
+		int ret = posix_acl_permission(inode, acl, mask);
+		posix_acl_release(acl);
+		return ret;
+	}
+
+	return -EAGAIN;
+}
+
+int ocfs2_acl_chmod(struct inode *inode)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct posix_acl *acl, *clone;
+	int ret;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return 0;
+
+	acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
+	if (IS_ERR(acl) || !acl)
+		return PTR_ERR(acl);
+	clone = posix_acl_clone(acl, GFP_KERNEL);
+	posix_acl_release(acl);
+	if (!clone)
+		return -ENOMEM;
+	ret = posix_acl_chmod_masq(clone, inode->i_mode);
+	if (!ret)
+		ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS,
+				    clone, NULL, NULL);
+	posix_acl_release(clone);
+	return ret;
+}
+
+/*
+ * Initialize the ACLs of a new inode. If parent directory has default ACL,
+ * then clone to new inode. Called from ocfs2_mknod.
+ */
+int ocfs2_init_acl(handle_t *handle,
+		   struct inode *inode,
+		   struct inode *dir,
+		   struct buffer_head *di_bh,
+		   struct buffer_head *dir_bh,
+		   struct ocfs2_alloc_context *meta_ac,
+		   struct ocfs2_alloc_context *data_ac)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct posix_acl *acl = NULL;
+	int ret = 0;
+
+	if (!S_ISLNK(inode->i_mode)) {
+		if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+			acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT,
+						   dir_bh);
+			if (IS_ERR(acl))
+				return PTR_ERR(acl);
+		}
+		if (!acl)
+			inode->i_mode &= ~current->fs->umask;
+	}
+	if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
+		struct posix_acl *clone;
+		mode_t mode;
+
+		if (S_ISDIR(inode->i_mode)) {
+			ret = ocfs2_set_acl(handle, inode, di_bh,
+					    ACL_TYPE_DEFAULT, acl,
+					    meta_ac, data_ac);
+			if (ret)
+				goto cleanup;
+		}
+		clone = posix_acl_clone(acl, GFP_NOFS);
+		ret = -ENOMEM;
+		if (!clone)
+			goto cleanup;
+
+		mode = inode->i_mode;
+		ret = posix_acl_create_masq(clone, &mode);
+		if (ret >= 0) {
+			inode->i_mode = mode;
+			if (ret > 0) {
+				ret = ocfs2_set_acl(handle, inode,
+						    di_bh, ACL_TYPE_ACCESS,
+						    clone, meta_ac, data_ac);
+			}
+		}
+		posix_acl_release(clone);
+	}
+cleanup:
+	posix_acl_release(acl);
+	return ret;
+}
+
+static size_t ocfs2_xattr_list_acl_access(struct inode *inode,
+					  char *list,
+					  size_t list_len,
+					  const char *name,
+					  size_t name_len)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return 0;
+
+	if (list && size <= list_len)
+		memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
+	return size;
+}
+
+static size_t ocfs2_xattr_list_acl_default(struct inode *inode,
+					   char *list,
+					   size_t list_len,
+					   const char *name,
+					   size_t name_len)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return 0;
+
+	if (list && size <= list_len)
+		memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
+	return size;
+}
+
+static int ocfs2_xattr_get_acl(struct inode *inode,
+			       int type,
+			       void *buffer,
+			       size_t size)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct posix_acl *acl;
+	int ret;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return -EOPNOTSUPP;
+
+	acl = ocfs2_get_acl(inode, type);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl == NULL)
+		return -ENODATA;
+	ret = posix_acl_to_xattr(acl, buffer, size);
+	posix_acl_release(acl);
+
+	return ret;
+}
+
+static int ocfs2_xattr_get_acl_access(struct inode *inode,
+				      const char *name,
+				      void *buffer,
+				      size_t size)
+{
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	return ocfs2_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
+}
+
+static int ocfs2_xattr_get_acl_default(struct inode *inode,
+				       const char *name,
+				       void *buffer,
+				       size_t size)
+{
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	return ocfs2_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
+}
+
+static int ocfs2_xattr_set_acl(struct inode *inode,
+			       int type,
+			       const void *value,
+			       size_t size)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct posix_acl *acl;
+	int ret = 0;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return -EOPNOTSUPP;
+
+	if (!is_owner_or_cap(inode))
+		return -EPERM;
+
+	if (value) {
+		acl = posix_acl_from_xattr(value, size);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		else if (acl) {
+			ret = posix_acl_valid(acl);
+			if (ret)
+				goto cleanup;
+		}
+	} else
+		acl = NULL;
+
+	ret = ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
+
+cleanup:
+	posix_acl_release(acl);
+	return ret;
+}
+
+static int ocfs2_xattr_set_acl_access(struct inode *inode,
+				      const char *name,
+				      const void *value,
+				      size_t size,
+				      int flags)
+{
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	return ocfs2_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
+}
+
+static int ocfs2_xattr_set_acl_default(struct inode *inode,
+				       const char *name,
+				       const void *value,
+				       size_t size,
+				       int flags)
+{
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	return ocfs2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
+}
+
+struct xattr_handler ocfs2_xattr_acl_access_handler = {
+	.prefix	= POSIX_ACL_XATTR_ACCESS,
+	.list	= ocfs2_xattr_list_acl_access,
+	.get	= ocfs2_xattr_get_acl_access,
+	.set	= ocfs2_xattr_set_acl_access,
+};
+
+struct xattr_handler ocfs2_xattr_acl_default_handler = {
+	.prefix	= POSIX_ACL_XATTR_DEFAULT,
+	.list	= ocfs2_xattr_list_acl_default,
+	.get	= ocfs2_xattr_get_acl_default,
+	.set	= ocfs2_xattr_set_acl_default,
+};
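
The converters at the top of this file pin the on-disk byte order of each ACL entry with cpu_to_le*()/le*_to_cpu(). A standalone userspace sketch of the same round-trip, substituting the glibc <endian.h> helpers for the kernel macros (the struct mirrors ocfs2_acl_entry and is illustrative only):

	#include <assert.h>
	#include <endian.h>
	#include <stdint.h>

	/* Little-endian on-disk layout, modeled on struct ocfs2_acl_entry. */
	struct disk_acl_entry {
		uint16_t e_tag;
		uint16_t e_perm;
		uint32_t e_id;
	};

	int main(void)
	{
		struct disk_acl_entry d;
		uint16_t tag = 2, perm = 6;	/* illustrative values */
		uint32_t id = 1000;

		d.e_tag = htole16(tag);		/* cpu_to_le16() on write */
		d.e_perm = htole16(perm);
		d.e_id = htole32(id);

		/* le16_to_cpu()/le32_to_cpu() on read restore the values
		 * regardless of host byte order. */
		assert(le16toh(d.e_tag) == tag);
		assert(le16toh(d.e_perm) == perm);
		assert(le32toh(d.e_id) == id);
		return 0;
	}
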
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
new file mode 100644
index 000000000000..8f6389ed4da5
--- /dev/null
+++ b/fs/ocfs2/acl.h
@@ -0,0 +1,58 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * acl.h
+ *
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_ACL_H
+#define OCFS2_ACL_H
+
+#include <linux/posix_acl_xattr.h>
+
+struct ocfs2_acl_entry {
+	__le16 e_tag;
+	__le16 e_perm;
+	__le32 e_id;
+};
+
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+
+extern int ocfs2_check_acl(struct inode *, int);
+extern int ocfs2_acl_chmod(struct inode *);
+extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
+			  struct buffer_head *, struct buffer_head *,
+			  struct ocfs2_alloc_context *,
+			  struct ocfs2_alloc_context *);
+
+#else /* CONFIG_OCFS2_FS_POSIX_ACL*/
+
+#define ocfs2_check_acl NULL
+static inline int ocfs2_acl_chmod(struct inode *inode)
+{
+	return 0;
+}
+static inline int ocfs2_init_acl(handle_t *handle,
+				 struct inode *inode,
+				 struct inode *dir,
+				 struct buffer_head *di_bh,
+				 struct buffer_head *dir_bh,
+				 struct ocfs2_alloc_context *meta_ac,
+				 struct ocfs2_alloc_context *data_ac)
+{
+	return 0;
+}
+
+#endif /* CONFIG_OCFS2_FS_POSIX_ACL*/
+
+#endif /* OCFS2_ACL_H */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 0cc2deb9394c..d861096c9d81 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -28,6 +28,7 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/swap.h>
+#include <linux/quotaops.h>
 
 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
@@ -36,6 +37,7 @@
 
 #include "alloc.h"
 #include "aops.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "inode.h"
@@ -46,6 +48,7 @@
 #include "file.h"
 #include "super.h"
 #include "uptodate.h"
+#include "xattr.h"
 
 #include "buffer_head_io.h"
 
@@ -187,20 +190,12 @@ static int ocfs2_dinode_insert_check(struct inode *inode,
 static int ocfs2_dinode_sanity_check(struct inode *inode,
 				     struct ocfs2_extent_tree *et)
 {
-	int ret = 0;
-	struct ocfs2_dinode *di;
+	struct ocfs2_dinode *di = et->et_object;
 
 	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
+	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
 
-	di = et->et_object;
-	if (!OCFS2_IS_VALID_DINODE(di)) {
-		ret = -EIO;
-		ocfs2_error(inode->i_sb,
-			    "Inode %llu has invalid path root",
-			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
-	}
-
-	return ret;
+	return 0;
 }
 
 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
@@ -213,36 +208,33 @@ static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
 
 static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
 {
-	struct ocfs2_xattr_value_root *xv = et->et_object;
+	struct ocfs2_xattr_value_buf *vb = et->et_object;
 
-	et->et_root_el = &xv->xr_list;
+	et->et_root_el = &vb->vb_xv->xr_list;
 }
 
 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
 					      u64 blkno)
 {
-	struct ocfs2_xattr_value_root *xv =
-		(struct ocfs2_xattr_value_root *)et->et_object;
+	struct ocfs2_xattr_value_buf *vb = et->et_object;
 
-	xv->xr_last_eb_blk = cpu_to_le64(blkno);
+	vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno);
 }
 
 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
 {
-	struct ocfs2_xattr_value_root *xv =
-		(struct ocfs2_xattr_value_root *) et->et_object;
+	struct ocfs2_xattr_value_buf *vb = et->et_object;
 
-	return le64_to_cpu(xv->xr_last_eb_blk);
+	return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
 }
 
 static void ocfs2_xattr_value_update_clusters(struct inode *inode,
 					      struct ocfs2_extent_tree *et,
 					      u32 clusters)
 {
-	struct ocfs2_xattr_value_root *xv =
-		(struct ocfs2_xattr_value_root *)et->et_object;
+	struct ocfs2_xattr_value_buf *vb = et->et_object;
 
-	le32_add_cpu(&xv->xr_clusters, clusters);
+	le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
 }
 
 static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
@@ -304,11 +296,13 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
 				     struct inode *inode,
 				     struct buffer_head *bh,
+				     ocfs2_journal_access_func access,
 				     void *obj,
 				     struct ocfs2_extent_tree_operations *ops)
 {
 	et->et_ops = ops;
 	et->et_root_bh = bh;
+	et->et_root_journal_access = access;
 	if (!obj)
 		obj = (void *)bh->b_data;
 	et->et_object = obj;
324 struct inode *inode, 318 struct inode *inode,
325 struct buffer_head *bh) 319 struct buffer_head *bh)
326{ 320{
327 __ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops); 321 __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_di,
322 NULL, &ocfs2_dinode_et_ops);
328} 323}
329 324
330void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, 325void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
331 struct inode *inode, 326 struct inode *inode,
332 struct buffer_head *bh) 327 struct buffer_head *bh)
333{ 328{
334 __ocfs2_init_extent_tree(et, inode, bh, NULL, 329 __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_xb,
335 &ocfs2_xattr_tree_et_ops); 330 NULL, &ocfs2_xattr_tree_et_ops);
336} 331}
337 332
338void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, 333void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
339 struct inode *inode, 334 struct inode *inode,
340 struct buffer_head *bh, 335 struct ocfs2_xattr_value_buf *vb)
341 struct ocfs2_xattr_value_root *xv)
342{ 336{
343 __ocfs2_init_extent_tree(et, inode, bh, xv, 337 __ocfs2_init_extent_tree(et, inode, vb->vb_bh, vb->vb_access, vb,
344 &ocfs2_xattr_value_et_ops); 338 &ocfs2_xattr_value_et_ops);
345} 339}
346 340
@@ -362,6 +356,15 @@ static inline void ocfs2_et_update_clusters(struct inode *inode,
362 et->et_ops->eo_update_clusters(inode, et, clusters); 356 et->et_ops->eo_update_clusters(inode, et, clusters);
363} 357}
364 358
359static inline int ocfs2_et_root_journal_access(handle_t *handle,
360 struct inode *inode,
361 struct ocfs2_extent_tree *et,
362 int type)
363{
364 return et->et_root_journal_access(handle, inode, et->et_root_bh,
365 type);
366}
367
365static inline int ocfs2_et_insert_check(struct inode *inode, 368static inline int ocfs2_et_insert_check(struct inode *inode,
366 struct ocfs2_extent_tree *et, 369 struct ocfs2_extent_tree *et,
367 struct ocfs2_extent_rec *rec) 370 struct ocfs2_extent_rec *rec)
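
Taken together, the new et_root_journal_access field and this helper replace direct ocfs2_journal_access() calls on the root buffer. A minimal sketch of the resulting caller pattern (example_dirty_root is hypothetical; the helper and flag names come from the hunks above):

static int example_dirty_root(handle_t *handle, struct inode *inode,
                              struct ocfs2_extent_tree *et)
{
        int ret;

        /* Dispatches to the access function stored at init time,
         * e.g. ocfs2_journal_access_di for a dinode-rooted tree. */
        ret = ocfs2_et_root_journal_access(handle, inode, et,
                                           OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret)
                mlog_errno(ret);
        return ret;
}
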
@@ -402,12 +405,14 @@ struct ocfs2_path_item {
402#define OCFS2_MAX_PATH_DEPTH 5 405#define OCFS2_MAX_PATH_DEPTH 5
403 406
404struct ocfs2_path { 407struct ocfs2_path {
405 int p_tree_depth; 408 int p_tree_depth;
406 struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; 409 ocfs2_journal_access_func p_root_access;
410 struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
407}; 411};
408 412
409#define path_root_bh(_path) ((_path)->p_node[0].bh) 413#define path_root_bh(_path) ((_path)->p_node[0].bh)
410#define path_root_el(_path) ((_path)->p_node[0].el) 414#define path_root_el(_path) ((_path)->p_node[0].el)
415#define path_root_access(_path)((_path)->p_root_access)
411#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) 416#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
412#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) 417#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
413#define path_num_items(_path) ((_path)->p_tree_depth + 1) 418#define path_num_items(_path) ((_path)->p_tree_depth + 1)
@@ -440,6 +445,8 @@ static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
440 */ 445 */
441 if (keep_root) 446 if (keep_root)
442 depth = le16_to_cpu(path_root_el(path)->l_tree_depth); 447 depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
448 else
449 path_root_access(path) = NULL;
443 450
444 path->p_tree_depth = depth; 451 path->p_tree_depth = depth;
445} 452}
@@ -465,6 +472,7 @@ static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
465 472
466 BUG_ON(path_root_bh(dest) != path_root_bh(src)); 473 BUG_ON(path_root_bh(dest) != path_root_bh(src));
467 BUG_ON(path_root_el(dest) != path_root_el(src)); 474 BUG_ON(path_root_el(dest) != path_root_el(src));
475 BUG_ON(path_root_access(dest) != path_root_access(src));
468 476
469 ocfs2_reinit_path(dest, 1); 477 ocfs2_reinit_path(dest, 1);
470 478
@@ -486,6 +494,7 @@ static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
486 int i; 494 int i;
487 495
488 BUG_ON(path_root_bh(dest) != path_root_bh(src)); 496 BUG_ON(path_root_bh(dest) != path_root_bh(src));
497 BUG_ON(path_root_access(dest) != path_root_access(src));
489 498
490 for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { 499 for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
491 brelse(dest->p_node[i].bh); 500 brelse(dest->p_node[i].bh);
@@ -521,7 +530,8 @@ static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
521} 530}
522 531
523static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, 532static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
524 struct ocfs2_extent_list *root_el) 533 struct ocfs2_extent_list *root_el,
534 ocfs2_journal_access_func access)
525{ 535{
526 struct ocfs2_path *path; 536 struct ocfs2_path *path;
527 537
@@ -533,11 +543,48 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
533 get_bh(root_bh); 543 get_bh(root_bh);
534 path_root_bh(path) = root_bh; 544 path_root_bh(path) = root_bh;
535 path_root_el(path) = root_el; 545 path_root_el(path) = root_el;
546 path_root_access(path) = access;
536 } 547 }
537 548
538 return path; 549 return path;
539} 550}
540 551
552static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
553{
554 return ocfs2_new_path(path_root_bh(path), path_root_el(path),
555 path_root_access(path));
556}
557
558static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
559{
560 return ocfs2_new_path(et->et_root_bh, et->et_root_el,
561 et->et_root_journal_access);
562}
563
564/*
565 * Journal the buffer at depth idx. Buffers at idx>0 are extent
566 * blocks; the buffer at idx 0 is journaled via the root_access function.
567 *
568 * I don't like the way this function's name looks next to
569 * ocfs2_journal_access_path(), but I don't have a better one.
570 */
571static int ocfs2_path_bh_journal_access(handle_t *handle,
572 struct inode *inode,
573 struct ocfs2_path *path,
574 int idx)
575{
576 ocfs2_journal_access_func access = path_root_access(path);
577
578 if (!access)
579 access = ocfs2_journal_access;
580
581 if (idx)
582 access = ocfs2_journal_access_eb;
583
584 return access(handle, inode, path->p_node[idx].bh,
585 OCFS2_JOURNAL_ACCESS_WRITE);
586}
587
541/* 588/*
542 * Convenience function to journal all components in a path. 589 * Convenience function to journal all components in a path.
543 */ 590 */
@@ -550,8 +597,7 @@ static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
550 goto out; 597 goto out;
551 598
552 for(i = 0; i < path_num_items(path); i++) { 599 for(i = 0; i < path_num_items(path); i++) {
553 ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh, 600 ret = ocfs2_path_bh_journal_access(handle, inode, path, i);
554 OCFS2_JOURNAL_ACCESS_WRITE);
555 if (ret < 0) { 601 if (ret < 0) {
556 mlog_errno(ret); 602 mlog_errno(ret);
557 goto out; 603 goto out;
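
For orientation, a hedged sketch of how a caller now builds and journals a path; ocfs2_find_path() and ocfs2_free_path() are assumed to be the existing alloc.c helpers of those names, not part of this patch:

static int example_journal_whole_path(struct inode *inode, handle_t *handle,
                                      struct ocfs2_extent_tree *et, u32 cpos)
{
        int ret;
        struct ocfs2_path *path = ocfs2_new_path_from_et(et);

        if (!path)
                return -ENOMEM;

        ret = ocfs2_find_path(inode, path, cpos);       /* assumed helper */
        if (!ret)
                /* The root is journaled with the inherited p_root_access;
                 * deeper nodes with ocfs2_journal_access_eb(). */
                ret = ocfs2_journal_access_path(inode, handle, path);

        ocfs2_free_path(path);                          /* assumed helper */
        return ret;
}
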
@@ -686,6 +732,80 @@ struct ocfs2_merge_ctxt {
686 int c_split_covers_rec; 732 int c_split_covers_rec;
687}; 733};
688 734
735static int ocfs2_validate_extent_block(struct super_block *sb,
736 struct buffer_head *bh)
737{
738 int rc;
739 struct ocfs2_extent_block *eb =
740 (struct ocfs2_extent_block *)bh->b_data;
741
742 mlog(0, "Validating extent block %llu\n",
743 (unsigned long long)bh->b_blocknr);
744
745 BUG_ON(!buffer_uptodate(bh));
746
747 /*
748 * If the ECC check fails, we return the error but otherwise
749 * leave the filesystem running. We know any error is
750 * local to this block.
751 */
752 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
753 if (rc) {
754 mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
755 (unsigned long long)bh->b_blocknr);
756 return rc;
757 }
758
759 /*
760 * Errors after here are fatal.
761 */
762
763 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
764 ocfs2_error(sb,
765 "Extent block #%llu has bad signature %.*s",
766 (unsigned long long)bh->b_blocknr, 7,
767 eb->h_signature);
768 return -EINVAL;
769 }
770
771 if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
772 ocfs2_error(sb,
773 "Extent block #%llu has an invalid h_blkno "
774 "of %llu",
775 (unsigned long long)bh->b_blocknr,
776 (unsigned long long)le64_to_cpu(eb->h_blkno));
777 return -EINVAL;
778 }
779
780 if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
781 ocfs2_error(sb,
782 "Extent block #%llu has an invalid "
783 "h_fs_generation of #%u",
784 (unsigned long long)bh->b_blocknr,
785 le32_to_cpu(eb->h_fs_generation));
786 return -EINVAL;
787 }
788
789 return 0;
790}
791
792int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
793 struct buffer_head **bh)
794{
795 int rc;
796 struct buffer_head *tmp = *bh;
797
798 rc = ocfs2_read_block(inode, eb_blkno, &tmp,
799 ocfs2_validate_extent_block);
800
801 /* If ocfs2_read_block() got us a new bh, pass it up. */
802 if (!rc && !*bh)
803 *bh = tmp;
804
805 return rc;
806}
807
808
689/* 809/*
690 * How many free extents have we got before we need more meta data? 810 * How many free extents have we got before we need more meta data?
691 */ 811 */
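
With validation folded into the read path, callers no longer hand-check the signature. A minimal sketch of the new pattern (example_read_eb is hypothetical; the read call is quoted from the hunk above):

static int example_read_eb(struct inode *inode, u64 blkno)
{
        struct buffer_head *bh = NULL;  /* NULL: let the read allocate it */
        int ret;

        ret = ocfs2_read_extent_block(inode, blkno, &bh);
        if (ret) {
                mlog_errno(ret);
                return ret;
        }

        /* bh->b_data now holds a signature-, blkno-, generation-, and
         * ECC-checked struct ocfs2_extent_block. */
        brelse(bh);
        return 0;
}
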
@@ -705,8 +825,7 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
705 last_eb_blk = ocfs2_et_get_last_eb_blk(et); 825 last_eb_blk = ocfs2_et_get_last_eb_blk(et);
706 826
707 if (last_eb_blk) { 827 if (last_eb_blk) {
708 retval = ocfs2_read_block(inode, last_eb_blk, 828 retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
709 &eb_bh);
710 if (retval < 0) { 829 if (retval < 0) {
711 mlog_errno(retval); 830 mlog_errno(retval);
712 goto bail; 831 goto bail;
@@ -768,8 +887,8 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
768 } 887 }
769 ocfs2_set_new_buffer_uptodate(inode, bhs[i]); 888 ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
770 889
771 status = ocfs2_journal_access(handle, inode, bhs[i], 890 status = ocfs2_journal_access_eb(handle, inode, bhs[i],
772 OCFS2_JOURNAL_ACCESS_CREATE); 891 OCFS2_JOURNAL_ACCESS_CREATE);
773 if (status < 0) { 892 if (status < 0) {
774 mlog_errno(status); 893 mlog_errno(status);
775 goto bail; 894 goto bail;
@@ -908,15 +1027,12 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
908 for(i = 0; i < new_blocks; i++) { 1027 for(i = 0; i < new_blocks; i++) {
909 bh = new_eb_bhs[i]; 1028 bh = new_eb_bhs[i];
910 eb = (struct ocfs2_extent_block *) bh->b_data; 1029 eb = (struct ocfs2_extent_block *) bh->b_data;
911 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 1030 /* ocfs2_create_new_meta_bhs() should create it right! */
912 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 1031 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
913 status = -EIO;
914 goto bail;
915 }
916 eb_el = &eb->h_list; 1032 eb_el = &eb->h_list;
917 1033
918 status = ocfs2_journal_access(handle, inode, bh, 1034 status = ocfs2_journal_access_eb(handle, inode, bh,
919 OCFS2_JOURNAL_ACCESS_CREATE); 1035 OCFS2_JOURNAL_ACCESS_CREATE);
920 if (status < 0) { 1036 if (status < 0) {
921 mlog_errno(status); 1037 mlog_errno(status);
922 goto bail; 1038 goto bail;
@@ -955,21 +1071,21 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
955 * journal_dirty erroring as it won't unless we've aborted the 1071 * journal_dirty erroring as it won't unless we've aborted the
956 * handle (in which case we would never be here) so reserving 1072 * handle (in which case we would never be here) so reserving
957 * the write with journal_access is all we need to do. */ 1073 * the write with journal_access is all we need to do. */
958 status = ocfs2_journal_access(handle, inode, *last_eb_bh, 1074 status = ocfs2_journal_access_eb(handle, inode, *last_eb_bh,
959 OCFS2_JOURNAL_ACCESS_WRITE); 1075 OCFS2_JOURNAL_ACCESS_WRITE);
960 if (status < 0) { 1076 if (status < 0) {
961 mlog_errno(status); 1077 mlog_errno(status);
962 goto bail; 1078 goto bail;
963 } 1079 }
964 status = ocfs2_journal_access(handle, inode, et->et_root_bh, 1080 status = ocfs2_et_root_journal_access(handle, inode, et,
965 OCFS2_JOURNAL_ACCESS_WRITE); 1081 OCFS2_JOURNAL_ACCESS_WRITE);
966 if (status < 0) { 1082 if (status < 0) {
967 mlog_errno(status); 1083 mlog_errno(status);
968 goto bail; 1084 goto bail;
969 } 1085 }
970 if (eb_bh) { 1086 if (eb_bh) {
971 status = ocfs2_journal_access(handle, inode, eb_bh, 1087 status = ocfs2_journal_access_eb(handle, inode, eb_bh,
972 OCFS2_JOURNAL_ACCESS_WRITE); 1088 OCFS2_JOURNAL_ACCESS_WRITE);
973 if (status < 0) { 1089 if (status < 0) {
974 mlog_errno(status); 1090 mlog_errno(status);
975 goto bail; 1091 goto bail;
@@ -1052,17 +1168,14 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
1052 } 1168 }
1053 1169
1054 eb = (struct ocfs2_extent_block *) new_eb_bh->b_data; 1170 eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
1055 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 1171 /* ocfs2_create_new_meta_bhs() should create it right! */
1056 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 1172 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1057 status = -EIO;
1058 goto bail;
1059 }
1060 1173
1061 eb_el = &eb->h_list; 1174 eb_el = &eb->h_list;
1062 root_el = et->et_root_el; 1175 root_el = et->et_root_el;
1063 1176
1064 status = ocfs2_journal_access(handle, inode, new_eb_bh, 1177 status = ocfs2_journal_access_eb(handle, inode, new_eb_bh,
1065 OCFS2_JOURNAL_ACCESS_CREATE); 1178 OCFS2_JOURNAL_ACCESS_CREATE);
1066 if (status < 0) { 1179 if (status < 0) {
1067 mlog_errno(status); 1180 mlog_errno(status);
1068 goto bail; 1181 goto bail;
@@ -1080,8 +1193,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
1080 goto bail; 1193 goto bail;
1081 } 1194 }
1082 1195
1083 status = ocfs2_journal_access(handle, inode, et->et_root_bh, 1196 status = ocfs2_et_root_journal_access(handle, inode, et,
1084 OCFS2_JOURNAL_ACCESS_WRITE); 1197 OCFS2_JOURNAL_ACCESS_WRITE);
1085 if (status < 0) { 1198 if (status < 0) {
1086 mlog_errno(status); 1199 mlog_errno(status);
1087 goto bail; 1200 goto bail;
@@ -1176,18 +1289,13 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
1176 brelse(bh); 1289 brelse(bh);
1177 bh = NULL; 1290 bh = NULL;
1178 1291
1179 status = ocfs2_read_block(inode, blkno, &bh); 1292 status = ocfs2_read_extent_block(inode, blkno, &bh);
1180 if (status < 0) { 1293 if (status < 0) {
1181 mlog_errno(status); 1294 mlog_errno(status);
1182 goto bail; 1295 goto bail;
1183 } 1296 }
1184 1297
1185 eb = (struct ocfs2_extent_block *) bh->b_data; 1298 eb = (struct ocfs2_extent_block *) bh->b_data;
1186 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1187 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1188 status = -EIO;
1189 goto bail;
1190 }
1191 el = &eb->h_list; 1299 el = &eb->h_list;
1192 1300
1193 if (le16_to_cpu(el->l_next_free_rec) < 1301 if (le16_to_cpu(el->l_next_free_rec) <
@@ -1540,7 +1648,7 @@ static int __ocfs2_find_path(struct inode *inode,
1540 1648
1541 brelse(bh); 1649 brelse(bh);
1542 bh = NULL; 1650 bh = NULL;
1543 ret = ocfs2_read_block(inode, blkno, &bh); 1651 ret = ocfs2_read_extent_block(inode, blkno, &bh);
1544 if (ret) { 1652 if (ret) {
1545 mlog_errno(ret); 1653 mlog_errno(ret);
1546 goto out; 1654 goto out;
@@ -1548,11 +1656,6 @@ static int __ocfs2_find_path(struct inode *inode,
1548 1656
1549 eb = (struct ocfs2_extent_block *) bh->b_data; 1657 eb = (struct ocfs2_extent_block *) bh->b_data;
1550 el = &eb->h_list; 1658 el = &eb->h_list;
1551 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1552 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1553 ret = -EIO;
1554 goto out;
1555 }
1556 1659
1557 if (le16_to_cpu(el->l_next_free_rec) > 1660 if (le16_to_cpu(el->l_next_free_rec) >
1558 le16_to_cpu(el->l_count)) { 1661 le16_to_cpu(el->l_count)) {
@@ -1860,25 +1963,23 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
1860 root_bh = left_path->p_node[subtree_index].bh; 1963 root_bh = left_path->p_node[subtree_index].bh;
1861 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 1964 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1862 1965
1863 ret = ocfs2_journal_access(handle, inode, root_bh, 1966 ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
1864 OCFS2_JOURNAL_ACCESS_WRITE); 1967 subtree_index);
1865 if (ret) { 1968 if (ret) {
1866 mlog_errno(ret); 1969 mlog_errno(ret);
1867 goto out; 1970 goto out;
1868 } 1971 }
1869 1972
1870 for(i = subtree_index + 1; i < path_num_items(right_path); i++) { 1973 for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1871 ret = ocfs2_journal_access(handle, inode, 1974 ret = ocfs2_path_bh_journal_access(handle, inode,
1872 right_path->p_node[i].bh, 1975 right_path, i);
1873 OCFS2_JOURNAL_ACCESS_WRITE);
1874 if (ret) { 1976 if (ret) {
1875 mlog_errno(ret); 1977 mlog_errno(ret);
1876 goto out; 1978 goto out;
1877 } 1979 }
1878 1980
1879 ret = ocfs2_journal_access(handle, inode, 1981 ret = ocfs2_path_bh_journal_access(handle, inode,
1880 left_path->p_node[i].bh, 1982 left_path, i);
1881 OCFS2_JOURNAL_ACCESS_WRITE);
1882 if (ret) { 1983 if (ret) {
1883 mlog_errno(ret); 1984 mlog_errno(ret);
1884 goto out; 1985 goto out;
@@ -2102,8 +2203,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
2102 2203
2103 *ret_left_path = NULL; 2204 *ret_left_path = NULL;
2104 2205
2105 left_path = ocfs2_new_path(path_root_bh(right_path), 2206 left_path = ocfs2_new_path_from_path(right_path);
2106 path_root_el(right_path));
2107 if (!left_path) { 2207 if (!left_path) {
2108 ret = -ENOMEM; 2208 ret = -ENOMEM;
2109 mlog_errno(ret); 2209 mlog_errno(ret);
@@ -2398,9 +2498,9 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2398 return -EAGAIN; 2498 return -EAGAIN;
2399 2499
2400 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) { 2500 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2401 ret = ocfs2_journal_access(handle, inode, 2501 ret = ocfs2_journal_access_eb(handle, inode,
2402 path_leaf_bh(right_path), 2502 path_leaf_bh(right_path),
2403 OCFS2_JOURNAL_ACCESS_WRITE); 2503 OCFS2_JOURNAL_ACCESS_WRITE);
2404 if (ret) { 2504 if (ret) {
2405 mlog_errno(ret); 2505 mlog_errno(ret);
2406 goto out; 2506 goto out;
@@ -2417,8 +2517,8 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2417 * We have to update i_last_eb_blk during the meta 2517 * We have to update i_last_eb_blk during the meta
2418 * data delete. 2518 * data delete.
2419 */ 2519 */
2420 ret = ocfs2_journal_access(handle, inode, et_root_bh, 2520 ret = ocfs2_et_root_journal_access(handle, inode, et,
2421 OCFS2_JOURNAL_ACCESS_WRITE); 2521 OCFS2_JOURNAL_ACCESS_WRITE);
2422 if (ret) { 2522 if (ret) {
2423 mlog_errno(ret); 2523 mlog_errno(ret);
2424 goto out; 2524 goto out;
@@ -2433,25 +2533,23 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2433 */ 2533 */
2434 BUG_ON(right_has_empty && !del_right_subtree); 2534 BUG_ON(right_has_empty && !del_right_subtree);
2435 2535
2436 ret = ocfs2_journal_access(handle, inode, root_bh, 2536 ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
2437 OCFS2_JOURNAL_ACCESS_WRITE); 2537 subtree_index);
2438 if (ret) { 2538 if (ret) {
2439 mlog_errno(ret); 2539 mlog_errno(ret);
2440 goto out; 2540 goto out;
2441 } 2541 }
2442 2542
2443 for(i = subtree_index + 1; i < path_num_items(right_path); i++) { 2543 for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2444 ret = ocfs2_journal_access(handle, inode, 2544 ret = ocfs2_path_bh_journal_access(handle, inode,
2445 right_path->p_node[i].bh, 2545 right_path, i);
2446 OCFS2_JOURNAL_ACCESS_WRITE);
2447 if (ret) { 2546 if (ret) {
2448 mlog_errno(ret); 2547 mlog_errno(ret);
2449 goto out; 2548 goto out;
2450 } 2549 }
2451 2550
2452 ret = ocfs2_journal_access(handle, inode, 2551 ret = ocfs2_path_bh_journal_access(handle, inode,
2453 left_path->p_node[i].bh, 2552 left_path, i);
2454 OCFS2_JOURNAL_ACCESS_WRITE);
2455 if (ret) { 2553 if (ret) {
2456 mlog_errno(ret); 2554 mlog_errno(ret);
2457 goto out; 2555 goto out;
@@ -2596,16 +2694,17 @@ out:
2596 2694
2597static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode, 2695static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2598 handle_t *handle, 2696 handle_t *handle,
2599 struct buffer_head *bh, 2697 struct ocfs2_path *path)
2600 struct ocfs2_extent_list *el)
2601{ 2698{
2602 int ret; 2699 int ret;
2700 struct buffer_head *bh = path_leaf_bh(path);
2701 struct ocfs2_extent_list *el = path_leaf_el(path);
2603 2702
2604 if (!ocfs2_is_empty_extent(&el->l_recs[0])) 2703 if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2605 return 0; 2704 return 0;
2606 2705
2607 ret = ocfs2_journal_access(handle, inode, bh, 2706 ret = ocfs2_path_bh_journal_access(handle, inode, path,
2608 OCFS2_JOURNAL_ACCESS_WRITE); 2707 path_num_items(path) - 1);
2609 if (ret) { 2708 if (ret) {
2610 mlog_errno(ret); 2709 mlog_errno(ret);
2611 goto out; 2710 goto out;
@@ -2644,8 +2743,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
2644 goto out; 2743 goto out;
2645 } 2744 }
2646 2745
2647 left_path = ocfs2_new_path(path_root_bh(path), 2746 left_path = ocfs2_new_path_from_path(path);
2648 path_root_el(path));
2649 if (!left_path) { 2747 if (!left_path) {
2650 ret = -ENOMEM; 2748 ret = -ENOMEM;
2651 mlog_errno(ret); 2749 mlog_errno(ret);
@@ -2654,8 +2752,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
2654 2752
2655 ocfs2_cp_path(left_path, path); 2753 ocfs2_cp_path(left_path, path);
2656 2754
2657 right_path = ocfs2_new_path(path_root_bh(path), 2755 right_path = ocfs2_new_path_from_path(path);
2658 path_root_el(path));
2659 if (!right_path) { 2756 if (!right_path) {
2660 ret = -ENOMEM; 2757 ret = -ENOMEM;
2661 mlog_errno(ret); 2758 mlog_errno(ret);
@@ -2689,9 +2786,8 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
2689 * Caller might still want to make changes to the 2786 * Caller might still want to make changes to the
2690 * tree root, so re-add it to the journal here. 2787 * tree root, so re-add it to the journal here.
2691 */ 2788 */
2692 ret = ocfs2_journal_access(handle, inode, 2789 ret = ocfs2_path_bh_journal_access(handle, inode,
2693 path_root_bh(left_path), 2790 left_path, 0);
2694 OCFS2_JOURNAL_ACCESS_WRITE);
2695 if (ret) { 2791 if (ret) {
2696 mlog_errno(ret); 2792 mlog_errno(ret);
2697 goto out; 2793 goto out;
@@ -2785,8 +2881,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2785 * We have a path to the left of this one - it needs 2881 * We have a path to the left of this one - it needs
2786 * an update too. 2882 * an update too.
2787 */ 2883 */
2788 left_path = ocfs2_new_path(path_root_bh(path), 2884 left_path = ocfs2_new_path_from_path(path);
2789 path_root_el(path));
2790 if (!left_path) { 2885 if (!left_path) {
2791 ret = -ENOMEM; 2886 ret = -ENOMEM;
2792 mlog_errno(ret); 2887 mlog_errno(ret);
@@ -2875,8 +2970,7 @@ rightmost_no_delete:
2875 * it up front. 2970 * it up front.
2876 */ 2971 */
2877 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle, 2972 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
2878 path_leaf_bh(path), 2973 path);
2879 path_leaf_el(path));
2880 if (ret) 2974 if (ret)
2881 mlog_errno(ret); 2975 mlog_errno(ret);
2882 goto out; 2976 goto out;
@@ -3027,8 +3121,7 @@ static int ocfs2_get_right_path(struct inode *inode,
3027 /* This function shouldn't be called for the rightmost leaf. */ 3121 /* This function shouldn't be called for the rightmost leaf. */
3028 BUG_ON(right_cpos == 0); 3122 BUG_ON(right_cpos == 0);
3029 3123
3030 right_path = ocfs2_new_path(path_root_bh(left_path), 3124 right_path = ocfs2_new_path_from_path(left_path);
3031 path_root_el(left_path));
3032 if (!right_path) { 3125 if (!right_path) {
3033 ret = -ENOMEM; 3126 ret = -ENOMEM;
3034 mlog_errno(ret); 3127 mlog_errno(ret);
@@ -3111,8 +3204,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
3111 root_bh = left_path->p_node[subtree_index].bh; 3204 root_bh = left_path->p_node[subtree_index].bh;
3112 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 3205 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3113 3206
3114 ret = ocfs2_journal_access(handle, inode, root_bh, 3207 ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
3115 OCFS2_JOURNAL_ACCESS_WRITE); 3208 subtree_index);
3116 if (ret) { 3209 if (ret) {
3117 mlog_errno(ret); 3210 mlog_errno(ret);
3118 goto out; 3211 goto out;
@@ -3120,17 +3213,15 @@ static int ocfs2_merge_rec_right(struct inode *inode,
3120 3213
3121 for (i = subtree_index + 1; 3214 for (i = subtree_index + 1;
3122 i < path_num_items(right_path); i++) { 3215 i < path_num_items(right_path); i++) {
3123 ret = ocfs2_journal_access(handle, inode, 3216 ret = ocfs2_path_bh_journal_access(handle, inode,
3124 right_path->p_node[i].bh, 3217 right_path, i);
3125 OCFS2_JOURNAL_ACCESS_WRITE);
3126 if (ret) { 3218 if (ret) {
3127 mlog_errno(ret); 3219 mlog_errno(ret);
3128 goto out; 3220 goto out;
3129 } 3221 }
3130 3222
3131 ret = ocfs2_journal_access(handle, inode, 3223 ret = ocfs2_path_bh_journal_access(handle, inode,
3132 left_path->p_node[i].bh, 3224 left_path, i);
3133 OCFS2_JOURNAL_ACCESS_WRITE);
3134 if (ret) { 3225 if (ret) {
3135 mlog_errno(ret); 3226 mlog_errno(ret);
3136 goto out; 3227 goto out;
@@ -3142,8 +3233,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
3142 right_rec = &el->l_recs[index + 1]; 3233 right_rec = &el->l_recs[index + 1];
3143 } 3234 }
3144 3235
3145 ret = ocfs2_journal_access(handle, inode, bh, 3236 ret = ocfs2_path_bh_journal_access(handle, inode, left_path,
3146 OCFS2_JOURNAL_ACCESS_WRITE); 3237 path_num_items(left_path) - 1);
3147 if (ret) { 3238 if (ret) {
3148 mlog_errno(ret); 3239 mlog_errno(ret);
3149 goto out; 3240 goto out;
@@ -3199,8 +3290,7 @@ static int ocfs2_get_left_path(struct inode *inode,
3199 /* This function shouldn't be called for the leftmost leaf. */ 3290 /* This function shouldn't be called for the leftmost leaf. */
3200 BUG_ON(left_cpos == 0); 3291 BUG_ON(left_cpos == 0);
3201 3292
3202 left_path = ocfs2_new_path(path_root_bh(right_path), 3293 left_path = ocfs2_new_path_from_path(right_path);
3203 path_root_el(right_path));
3204 if (!left_path) { 3294 if (!left_path) {
3205 ret = -ENOMEM; 3295 ret = -ENOMEM;
3206 mlog_errno(ret); 3296 mlog_errno(ret);
@@ -3283,8 +3373,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3283 root_bh = left_path->p_node[subtree_index].bh; 3373 root_bh = left_path->p_node[subtree_index].bh;
3284 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 3374 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3285 3375
3286 ret = ocfs2_journal_access(handle, inode, root_bh, 3376 ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
3287 OCFS2_JOURNAL_ACCESS_WRITE); 3377 subtree_index);
3288 if (ret) { 3378 if (ret) {
3289 mlog_errno(ret); 3379 mlog_errno(ret);
3290 goto out; 3380 goto out;
@@ -3292,17 +3382,15 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3292 3382
3293 for (i = subtree_index + 1; 3383 for (i = subtree_index + 1;
3294 i < path_num_items(right_path); i++) { 3384 i < path_num_items(right_path); i++) {
3295 ret = ocfs2_journal_access(handle, inode, 3385 ret = ocfs2_path_bh_journal_access(handle, inode,
3296 right_path->p_node[i].bh, 3386 right_path, i);
3297 OCFS2_JOURNAL_ACCESS_WRITE);
3298 if (ret) { 3387 if (ret) {
3299 mlog_errno(ret); 3388 mlog_errno(ret);
3300 goto out; 3389 goto out;
3301 } 3390 }
3302 3391
3303 ret = ocfs2_journal_access(handle, inode, 3392 ret = ocfs2_path_bh_journal_access(handle, inode,
3304 left_path->p_node[i].bh, 3393 left_path, i);
3305 OCFS2_JOURNAL_ACCESS_WRITE);
3306 if (ret) { 3394 if (ret) {
3307 mlog_errno(ret); 3395 mlog_errno(ret);
3308 goto out; 3396 goto out;
@@ -3314,8 +3402,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
3314 has_empty_extent = 1; 3402 has_empty_extent = 1;
3315 } 3403 }
3316 3404
3317 ret = ocfs2_journal_access(handle, inode, bh, 3405 ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
3318 OCFS2_JOURNAL_ACCESS_WRITE); 3406 path_num_items(right_path) - 1);
3319 if (ret) { 3407 if (ret) {
3320 mlog_errno(ret); 3408 mlog_errno(ret);
3321 goto out; 3409 goto out;
@@ -3732,8 +3820,7 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
3732 * leftmost leaf. 3820 * leftmost leaf.
3733 */ 3821 */
3734 if (left_cpos) { 3822 if (left_cpos) {
3735 left_path = ocfs2_new_path(path_root_bh(right_path), 3823 left_path = ocfs2_new_path_from_path(right_path);
3736 path_root_el(right_path));
3737 if (!left_path) { 3824 if (!left_path) {
3738 ret = -ENOMEM; 3825 ret = -ENOMEM;
3739 mlog_errno(ret); 3826 mlog_errno(ret);
@@ -3781,7 +3868,7 @@ static void ocfs2_split_record(struct inode *inode,
3781 struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el; 3868 struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3782 struct ocfs2_extent_rec *rec, *tmprec; 3869 struct ocfs2_extent_rec *rec, *tmprec;
3783 3870
3784 right_el = path_leaf_el(right_path);; 3871 right_el = path_leaf_el(right_path);
3785 if (left_path) 3872 if (left_path)
3786 left_el = path_leaf_el(left_path); 3873 left_el = path_leaf_el(left_path);
3787 3874
@@ -3958,8 +4045,8 @@ static int ocfs2_do_insert_extent(struct inode *inode,
3958 4045
3959 el = et->et_root_el; 4046 el = et->et_root_el;
3960 4047
3961 ret = ocfs2_journal_access(handle, inode, et->et_root_bh, 4048 ret = ocfs2_et_root_journal_access(handle, inode, et,
3962 OCFS2_JOURNAL_ACCESS_WRITE); 4049 OCFS2_JOURNAL_ACCESS_WRITE);
3963 if (ret) { 4050 if (ret) {
3964 mlog_errno(ret); 4051 mlog_errno(ret);
3965 goto out; 4052 goto out;
@@ -3970,7 +4057,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
3970 goto out_update_clusters; 4057 goto out_update_clusters;
3971 } 4058 }
3972 4059
3973 right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el); 4060 right_path = ocfs2_new_path_from_et(et);
3974 if (!right_path) { 4061 if (!right_path) {
3975 ret = -ENOMEM; 4062 ret = -ENOMEM;
3976 mlog_errno(ret); 4063 mlog_errno(ret);
@@ -4020,8 +4107,8 @@ static int ocfs2_do_insert_extent(struct inode *inode,
4020 * ocfs2_rotate_tree_right() might have extended the 4107 * ocfs2_rotate_tree_right() might have extended the
4021 * transaction without re-journaling our tree root. 4108 * transaction without re-journaling our tree root.
4022 */ 4109 */
4023 ret = ocfs2_journal_access(handle, inode, et->et_root_bh, 4110 ret = ocfs2_et_root_journal_access(handle, inode, et,
4024 OCFS2_JOURNAL_ACCESS_WRITE); 4111 OCFS2_JOURNAL_ACCESS_WRITE);
4025 if (ret) { 4112 if (ret) {
4026 mlog_errno(ret); 4113 mlog_errno(ret);
4027 goto out; 4114 goto out;
@@ -4082,8 +4169,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4082 goto out; 4169 goto out;
4083 4170
4084 if (left_cpos != 0) { 4171 if (left_cpos != 0) {
4085 left_path = ocfs2_new_path(path_root_bh(path), 4172 left_path = ocfs2_new_path_from_path(path);
4086 path_root_el(path));
4087 if (!left_path) 4173 if (!left_path)
4088 goto out; 4174 goto out;
4089 4175
@@ -4097,8 +4183,15 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4097 le16_to_cpu(new_el->l_count)) { 4183 le16_to_cpu(new_el->l_count)) {
4098 bh = path_leaf_bh(left_path); 4184 bh = path_leaf_bh(left_path);
4099 eb = (struct ocfs2_extent_block *)bh->b_data; 4185 eb = (struct ocfs2_extent_block *)bh->b_data;
4100 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, 4186 ocfs2_error(inode->i_sb,
4101 eb); 4187 "Extent block #%llu has an "
4188 "invalid l_next_free_rec of "
4189 "%d. It should have "
4190 "matched the l_count of %d",
4191 (unsigned long long)le64_to_cpu(eb->h_blkno),
4192 le16_to_cpu(new_el->l_next_free_rec),
4193 le16_to_cpu(new_el->l_count));
4194 status = -EINVAL;
4102 goto out; 4195 goto out;
4103 } 4196 }
4104 rec = &new_el->l_recs[ 4197 rec = &new_el->l_recs[
@@ -4132,8 +4225,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4132 if (right_cpos == 0) 4225 if (right_cpos == 0)
4133 goto out; 4226 goto out;
4134 4227
4135 right_path = ocfs2_new_path(path_root_bh(path), 4228 right_path = ocfs2_new_path_from_path(path);
4136 path_root_el(path));
4137 if (!right_path) 4229 if (!right_path)
4138 goto out; 4230 goto out;
4139 4231
@@ -4147,8 +4239,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4147 if (le16_to_cpu(new_el->l_next_free_rec) <= 1) { 4239 if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4148 bh = path_leaf_bh(right_path); 4240 bh = path_leaf_bh(right_path);
4149 eb = (struct ocfs2_extent_block *)bh->b_data; 4241 eb = (struct ocfs2_extent_block *)bh->b_data;
4150 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, 4242 ocfs2_error(inode->i_sb,
4151 eb); 4243 "Extent block #%llu has an "
4244 "invalid l_next_free_rec of %d",
4245 (unsigned long long)le64_to_cpu(eb->h_blkno),
4246 le16_to_cpu(new_el->l_next_free_rec));
4247 status = -EINVAL;
4152 goto out; 4248 goto out;
4153 } 4249 }
4154 rec = &new_el->l_recs[1]; 4250 rec = &new_el->l_recs[1];
@@ -4294,7 +4390,9 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4294 * ocfs2_figure_insert_type() and ocfs2_add_branch() 4390 * ocfs2_figure_insert_type() and ocfs2_add_branch()
4295 * may want it later. 4391 * may want it later.
4296 */ 4392 */
4297 ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh); 4393 ret = ocfs2_read_extent_block(inode,
4394 ocfs2_et_get_last_eb_blk(et),
4395 &bh);
4298 if (ret) { 4396 if (ret) {
4299 mlog_exit(ret); 4397 mlog_exit(ret);
4300 goto out; 4398 goto out;
@@ -4320,7 +4418,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4320 return 0; 4418 return 0;
4321 } 4419 }
4322 4420
4323 path = ocfs2_new_path(et->et_root_bh, et->et_root_el); 4421 path = ocfs2_new_path_from_et(et);
4324 if (!path) { 4422 if (!path) {
4325 ret = -ENOMEM; 4423 ret = -ENOMEM;
4326 mlog_errno(ret); 4424 mlog_errno(ret);
@@ -4531,9 +4629,9 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4531 4629
4532 BUG_ON(num_bits > clusters_to_add); 4630 BUG_ON(num_bits > clusters_to_add);
4533 4631
4534 /* reserve our write early -- insert_extent may update the inode */ 4632 /* reserve our write early -- insert_extent may update the tree root */
4535 status = ocfs2_journal_access(handle, inode, et->et_root_bh, 4633 status = ocfs2_et_root_journal_access(handle, inode, et,
4536 OCFS2_JOURNAL_ACCESS_WRITE); 4634 OCFS2_JOURNAL_ACCESS_WRITE);
4537 if (status < 0) { 4635 if (status < 0) {
4538 mlog_errno(status); 4636 mlog_errno(status);
4539 goto leave; 4637 goto leave;
@@ -4760,20 +4858,15 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
4760 if (path->p_tree_depth) { 4858 if (path->p_tree_depth) {
4761 struct ocfs2_extent_block *eb; 4859 struct ocfs2_extent_block *eb;
4762 4860
4763 ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), 4861 ret = ocfs2_read_extent_block(inode,
4764 &last_eb_bh); 4862 ocfs2_et_get_last_eb_blk(et),
4863 &last_eb_bh);
4765 if (ret) { 4864 if (ret) {
4766 mlog_exit(ret); 4865 mlog_exit(ret);
4767 goto out; 4866 goto out;
4768 } 4867 }
4769 4868
4770 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 4869 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4771 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
4772 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
4773 ret = -EROFS;
4774 goto out;
4775 }
4776
4777 rightmost_el = &eb->h_list; 4870 rightmost_el = &eb->h_list;
4778 } else 4871 } else
4779 rightmost_el = path_root_el(path); 4872 rightmost_el = path_root_el(path);
@@ -4854,7 +4947,7 @@ int ocfs2_mark_extent_written(struct inode *inode,
4854 if (et->et_ops == &ocfs2_dinode_et_ops) 4947 if (et->et_ops == &ocfs2_dinode_et_ops)
4855 ocfs2_extent_map_trunc(inode, 0); 4948 ocfs2_extent_map_trunc(inode, 0);
4856 4949
4857 left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el); 4950 left_path = ocfs2_new_path_from_et(et);
4858 if (!left_path) { 4951 if (!left_path) {
4859 ret = -ENOMEM; 4952 ret = -ENOMEM;
4860 mlog_errno(ret); 4953 mlog_errno(ret);
@@ -4918,8 +5011,9 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
4918 5011
4919 depth = path->p_tree_depth; 5012 depth = path->p_tree_depth;
4920 if (depth > 0) { 5013 if (depth > 0) {
4921 ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), 5014 ret = ocfs2_read_extent_block(inode,
4922 &last_eb_bh); 5015 ocfs2_et_get_last_eb_blk(et),
5016 &last_eb_bh);
4923 if (ret < 0) { 5017 if (ret < 0) {
4924 mlog_errno(ret); 5018 mlog_errno(ret);
4925 goto out; 5019 goto out;
@@ -5025,8 +5119,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5025 } 5119 }
5026 5120
5027 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) { 5121 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
5028 left_path = ocfs2_new_path(path_root_bh(path), 5122 left_path = ocfs2_new_path_from_path(path);
5029 path_root_el(path));
5030 if (!left_path) { 5123 if (!left_path) {
5031 ret = -ENOMEM; 5124 ret = -ENOMEM;
5032 mlog_errno(ret); 5125 mlog_errno(ret);
@@ -5135,7 +5228,7 @@ int ocfs2_remove_extent(struct inode *inode,
5135 5228
5136 ocfs2_extent_map_trunc(inode, 0); 5229 ocfs2_extent_map_trunc(inode, 0);
5137 5230
5138 path = ocfs2_new_path(et->et_root_bh, et->et_root_el); 5231 path = ocfs2_new_path_from_et(et);
5139 if (!path) { 5232 if (!path) {
5140 ret = -ENOMEM; 5233 ret = -ENOMEM;
5141 mlog_errno(ret); 5234 mlog_errno(ret);
@@ -5255,6 +5348,78 @@ out:
5255 return ret; 5348 return ret;
5256} 5349}
5257 5350
5351int ocfs2_remove_btree_range(struct inode *inode,
5352 struct ocfs2_extent_tree *et,
5353 u32 cpos, u32 phys_cpos, u32 len,
5354 struct ocfs2_cached_dealloc_ctxt *dealloc)
5355{
5356 int ret;
5357 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
5358 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5359 struct inode *tl_inode = osb->osb_tl_inode;
5360 handle_t *handle;
5361 struct ocfs2_alloc_context *meta_ac = NULL;
5362
5363 ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac);
5364 if (ret) {
5365 mlog_errno(ret);
5366 return ret;
5367 }
5368
5369 mutex_lock(&tl_inode->i_mutex);
5370
5371 if (ocfs2_truncate_log_needs_flush(osb)) {
5372 ret = __ocfs2_flush_truncate_log(osb);
5373 if (ret < 0) {
5374 mlog_errno(ret);
5375 goto out;
5376 }
5377 }
5378
5379 handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
5380 if (IS_ERR(handle)) {
5381 ret = PTR_ERR(handle);
5382 mlog_errno(ret);
5383 goto out;
5384 }
5385
5386 ret = ocfs2_et_root_journal_access(handle, inode, et,
5387 OCFS2_JOURNAL_ACCESS_WRITE);
5388 if (ret) {
5389 mlog_errno(ret);
5390 goto out;
5391 }
5392
5393 ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac,
5394 dealloc);
5395 if (ret) {
5396 mlog_errno(ret);
5397 goto out_commit;
5398 }
5399
5400 ocfs2_et_update_clusters(inode, et, -len);
5401
5402 ret = ocfs2_journal_dirty(handle, et->et_root_bh);
5403 if (ret) {
5404 mlog_errno(ret);
5405 goto out_commit;
5406 }
5407
5408 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
5409 if (ret)
5410 mlog_errno(ret);
5411
5412out_commit:
5413 ocfs2_commit_trans(osb, handle);
5414out:
5415 mutex_unlock(&tl_inode->i_mutex);
5416
5417 if (meta_ac)
5418 ocfs2_free_alloc_context(meta_ac);
5419
5420 return ret;
5421}
5422
5258int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) 5423int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
5259{ 5424{
5260 struct buffer_head *tl_bh = osb->osb_tl_bh; 5425 struct buffer_head *tl_bh = osb->osb_tl_bh;
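
A hedged sketch of a caller punching one extent out of a dinode-rooted tree with the new helper (example_punch is hypothetical; every function it calls appears in this patch):

static int example_punch(struct inode *inode, struct buffer_head *di_bh,
                         u32 cpos, u32 phys_cpos, u32 len)
{
        int ret, ret2;
        struct ocfs2_extent_tree et;
        struct ocfs2_cached_dealloc_ctxt dealloc;

        ocfs2_init_dealloc_ctxt(&dealloc);
        ocfs2_init_dinode_extent_tree(&et, inode, di_bh);

        /* Handles allocator locking, truncate-log flushing, the
         * transaction, and the truncate-log append internally. */
        ret = ocfs2_remove_btree_range(inode, &et, cpos, phys_cpos, len,
                                       &dealloc);
        if (ret)
                mlog_errno(ret);

        /* Frees whatever the remove queued. */
        ret2 = ocfs2_run_deallocs(OCFS2_SB(inode->i_sb), &dealloc);
        if (ret2)
                mlog_errno(ret2);

        return ret ? ret : ret2;
}
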
@@ -5308,13 +5473,13 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5308 start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); 5473 start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
5309 5474
5310 di = (struct ocfs2_dinode *) tl_bh->b_data; 5475 di = (struct ocfs2_dinode *) tl_bh->b_data;
5311 tl = &di->id2.i_dealloc;
5312 if (!OCFS2_IS_VALID_DINODE(di)) {
5313 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
5314 status = -EIO;
5315 goto bail;
5316 }
5317 5476
5477 /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated
5478 * by the underlying call to ocfs2_read_inode_block(), so any
5479 * corruption is a code bug */
5480 BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5481
5482 tl = &di->id2.i_dealloc;
5318 tl_count = le16_to_cpu(tl->tl_count); 5483 tl_count = le16_to_cpu(tl->tl_count);
5319 mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || 5484 mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
5320 tl_count == 0, 5485 tl_count == 0,
@@ -5332,8 +5497,8 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5332 goto bail; 5497 goto bail;
5333 } 5498 }
5334 5499
5335 status = ocfs2_journal_access(handle, tl_inode, tl_bh, 5500 status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
5336 OCFS2_JOURNAL_ACCESS_WRITE); 5501 OCFS2_JOURNAL_ACCESS_WRITE);
5337 if (status < 0) { 5502 if (status < 0) {
5338 mlog_errno(status); 5503 mlog_errno(status);
5339 goto bail; 5504 goto bail;
@@ -5394,8 +5559,8 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5394 while (i >= 0) { 5559 while (i >= 0) {
5395 /* Caller has given us at least enough credits to 5560 /* Caller has given us at least enough credits to
5396 * update the truncate log dinode */ 5561 * update the truncate log dinode */
5397 status = ocfs2_journal_access(handle, tl_inode, tl_bh, 5562 status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
5398 OCFS2_JOURNAL_ACCESS_WRITE); 5563 OCFS2_JOURNAL_ACCESS_WRITE);
5399 if (status < 0) { 5564 if (status < 0) {
5400 mlog_errno(status); 5565 mlog_errno(status);
5401 goto bail; 5566 goto bail;
@@ -5464,13 +5629,13 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5464 BUG_ON(mutex_trylock(&tl_inode->i_mutex)); 5629 BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5465 5630
5466 di = (struct ocfs2_dinode *) tl_bh->b_data; 5631 di = (struct ocfs2_dinode *) tl_bh->b_data;
5467 tl = &di->id2.i_dealloc;
5468 if (!OCFS2_IS_VALID_DINODE(di)) {
5469 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
5470 status = -EIO;
5471 goto out;
5472 }
5473 5632
5633 /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated
5634 * by the underlying call to ocfs2_read_inode_block(), so any
5635 * corruption is a code bug */
5636 BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5637
5638 tl = &di->id2.i_dealloc;
5474 num_to_flush = le16_to_cpu(tl->tl_used); 5639 num_to_flush = le16_to_cpu(tl->tl_used);
5475 mlog(0, "Flush %u records from truncate log #%llu\n", 5640 mlog(0, "Flush %u records from truncate log #%llu\n",
5476 num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno); 5641 num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
@@ -5586,7 +5751,7 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
5586 goto bail; 5751 goto bail;
5587 } 5752 }
5588 5753
5589 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); 5754 status = ocfs2_read_inode_block(inode, &bh);
5590 if (status < 0) { 5755 if (status < 0) {
5591 iput(inode); 5756 iput(inode);
5592 mlog_errno(status); 5757 mlog_errno(status);
@@ -5625,13 +5790,13 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5625 } 5790 }
5626 5791
5627 di = (struct ocfs2_dinode *) tl_bh->b_data; 5792 di = (struct ocfs2_dinode *) tl_bh->b_data;
5628 tl = &di->id2.i_dealloc;
5629 if (!OCFS2_IS_VALID_DINODE(di)) {
5630 OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
5631 status = -EIO;
5632 goto bail;
5633 }
5634 5793
5794 /* tl_bh is loaded from ocfs2_get_truncate_log_info(). It's
5795 * validated by the underlying call to ocfs2_read_inode_block(),
5796 * so any corruption is a code bug */
5797 BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5798
5799 tl = &di->id2.i_dealloc;
5635 if (le16_to_cpu(tl->tl_used)) { 5800 if (le16_to_cpu(tl->tl_used)) {
5636 mlog(0, "We'll have %u logs to recover\n", 5801 mlog(0, "We'll have %u logs to recover\n",
5637 le16_to_cpu(tl->tl_used)); 5802 le16_to_cpu(tl->tl_used));
@@ -5651,6 +5816,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5651 * tl_used. */ 5816 * tl_used. */
5652 tl->tl_used = 0; 5817 tl->tl_used = 0;
5653 5818
5819 ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
5654 status = ocfs2_write_block(osb, tl_bh, tl_inode); 5820 status = ocfs2_write_block(osb, tl_bh, tl_inode);
5655 if (status < 0) { 5821 if (status < 0) {
5656 mlog_errno(status); 5822 mlog_errno(status);
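
The one-line addition above is the price of bypassing the journal: ocfs2_write_block() goes straight to disk, so the block's ECC must be refreshed by hand first. A minimal sketch of the pattern (hypothetical caller; both calls are quoted from this hunk):

static int example_raw_dinode_write(struct ocfs2_super *osb,
                                    struct inode *inode,
                                    struct buffer_head *bh)
{
        struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;

        /* Journaled writes recompute this in the commit path;
         * raw writes must do it themselves. */
        ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check);
        return ocfs2_write_block(osb, bh, inode);
}
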
@@ -5800,7 +5966,10 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
5800 */ 5966 */
5801 5967
5802/* 5968/*
5803 * Describes a single block free from a suballocator 5969 * Describes a single bit freed from a suballocator. For the block
5970 * suballocators, it represents one block. For the global cluster
5971 * allocator, it represents a run of clusters, and free_bit holds
5972 * the number of clusters in the run.
5804 */ 5973 */
5805struct ocfs2_cached_block_free { 5974struct ocfs2_cached_block_free {
5806 struct ocfs2_cached_block_free *free_next; 5975 struct ocfs2_cached_block_free *free_next;
@@ -5815,10 +5984,10 @@ struct ocfs2_per_slot_free_list {
5815 struct ocfs2_cached_block_free *f_first; 5984 struct ocfs2_cached_block_free *f_first;
5816}; 5985};
5817 5986
5818static int ocfs2_free_cached_items(struct ocfs2_super *osb, 5987static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
5819 int sysfile_type, 5988 int sysfile_type,
5820 int slot, 5989 int slot,
5821 struct ocfs2_cached_block_free *head) 5990 struct ocfs2_cached_block_free *head)
5822{ 5991{
5823 int ret; 5992 int ret;
5824 u64 bg_blkno; 5993 u64 bg_blkno;
@@ -5893,6 +6062,82 @@ out:
5893 return ret; 6062 return ret;
5894} 6063}
5895 6064
6065int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6066 u64 blkno, unsigned int bit)
6067{
6068 int ret = 0;
6069 struct ocfs2_cached_block_free *item;
6070
6071 item = kmalloc(sizeof(*item), GFP_NOFS);
6072 if (item == NULL) {
6073 ret = -ENOMEM;
6074 mlog_errno(ret);
6075 return ret;
6076 }
6077
6078 mlog(0, "Insert clusters: (bit %u, blk %llu)\n",
6079 bit, (unsigned long long)blkno);
6080
6081 item->free_blk = blkno;
6082 item->free_bit = bit;
6083 item->free_next = ctxt->c_global_allocator;
6084
6085 ctxt->c_global_allocator = item;
6086 return ret;
6087}
6088
6089static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
6090 struct ocfs2_cached_block_free *head)
6091{
6092 struct ocfs2_cached_block_free *tmp;
6093 struct inode *tl_inode = osb->osb_tl_inode;
6094 handle_t *handle;
6095 int ret = 0;
6096
6097 mutex_lock(&tl_inode->i_mutex);
6098
6099 while (head) {
6100 if (ocfs2_truncate_log_needs_flush(osb)) {
6101 ret = __ocfs2_flush_truncate_log(osb);
6102 if (ret < 0) {
6103 mlog_errno(ret);
6104 break;
6105 }
6106 }
6107
6108 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
6109 if (IS_ERR(handle)) {
6110 ret = PTR_ERR(handle);
6111 mlog_errno(ret);
6112 break;
6113 }
6114
6115 ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
6116 head->free_bit);
6117
6118 ocfs2_commit_trans(osb, handle);
6119 tmp = head;
6120 head = head->free_next;
6121 kfree(tmp);
6122
6123 if (ret < 0) {
6124 mlog_errno(ret);
6125 break;
6126 }
6127 }
6128
6129 mutex_unlock(&tl_inode->i_mutex);
6130
6131 while (head) {
6132 /* Premature exit may have left some dangling items. */
6133 tmp = head;
6134 head = head->free_next;
6135 kfree(tmp);
6136 }
6137
6138 return ret;
6139}
6140
5896int ocfs2_run_deallocs(struct ocfs2_super *osb, 6141int ocfs2_run_deallocs(struct ocfs2_super *osb,
5897 struct ocfs2_cached_dealloc_ctxt *ctxt) 6142 struct ocfs2_cached_dealloc_ctxt *ctxt)
5898{ 6143{
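
A hedged sketch of the intended flow: queue cluster runs while a transaction is open, then drain them afterwards (example_defer_clusters is hypothetical; both helpers are added by this patch):

static int example_defer_clusters(struct ocfs2_super *osb,
                                  struct ocfs2_cached_dealloc_ctxt *ctxt,
                                  u64 start_blk, unsigned int num_clusters)
{
        int ret;

        /* Cheap and safe inside a transaction: just links an item
         * onto ctxt->c_global_allocator. */
        ret = ocfs2_cache_cluster_dealloc(ctxt, start_blk, num_clusters);
        if (ret) {
                mlog_errno(ret);
                return ret;
        }

        /* Later, with no transaction held, the queued runs are sent
         * through the truncate log. */
        return ocfs2_run_deallocs(osb, ctxt);
}
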
@@ -5908,8 +6153,10 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
5908 if (fl->f_first) { 6153 if (fl->f_first) {
5909 mlog(0, "Free items: (type %u, slot %d)\n", 6154 mlog(0, "Free items: (type %u, slot %d)\n",
5910 fl->f_inode_type, fl->f_slot); 6155 fl->f_inode_type, fl->f_slot);
5911 ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type, 6156 ret2 = ocfs2_free_cached_blocks(osb,
5912 fl->f_slot, fl->f_first); 6157 fl->f_inode_type,
6158 fl->f_slot,
6159 fl->f_first);
5913 if (ret2) 6160 if (ret2)
5914 mlog_errno(ret2); 6161 mlog_errno(ret2);
5915 if (!ret) 6162 if (!ret)
@@ -5920,6 +6167,17 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
5920 kfree(fl); 6167 kfree(fl);
5921 } 6168 }
5922 6169
6170 if (ctxt->c_global_allocator) {
6171 ret2 = ocfs2_free_cached_clusters(osb,
6172 ctxt->c_global_allocator);
6173 if (ret2)
6174 mlog_errno(ret2);
6175 if (!ret)
6176 ret = ret2;
6177
6178 ctxt->c_global_allocator = NULL;
6179 }
6180
5923 return ret; 6181 return ret;
5924} 6182}
5925 6183
@@ -6075,11 +6333,10 @@ static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6075 6333
6076 eb = (struct ocfs2_extent_block *) bh->b_data; 6334 eb = (struct ocfs2_extent_block *) bh->b_data;
6077 el = &eb->h_list; 6335 el = &eb->h_list;
6078 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 6336
6079 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 6337 /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
6080 ret = -EROFS; 6338 * Any corruption is a code bug. */
6081 goto out; 6339 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
6082 }
6083 6340
6084 *new_last_eb = bh; 6341 *new_last_eb = bh;
6085 get_bh(*new_last_eb); 6342 get_bh(*new_last_eb);
@@ -6326,8 +6583,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6326 } 6583 }
6327 6584
6328 if (last_eb_bh) { 6585 if (last_eb_bh) {
6329 status = ocfs2_journal_access(handle, inode, last_eb_bh, 6586 status = ocfs2_journal_access_eb(handle, inode, last_eb_bh,
6330 OCFS2_JOURNAL_ACCESS_WRITE); 6587 OCFS2_JOURNAL_ACCESS_WRITE);
6331 if (status < 0) { 6588 if (status < 0) {
6332 mlog_errno(status); 6589 mlog_errno(status);
6333 goto bail; 6590 goto bail;
@@ -6350,6 +6607,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6350 goto bail; 6607 goto bail;
6351 } 6608 }
6352 6609
6610 vfs_dq_free_space_nodirty(inode,
6611 ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
6353 spin_lock(&OCFS2_I(inode)->ip_lock); 6612 spin_lock(&OCFS2_I(inode)->ip_lock);
6354 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - 6613 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
6355 clusters_to_del; 6614 clusters_to_del;
@@ -6436,11 +6695,6 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6436 mlog_errno(ret); 6695 mlog_errno(ret);
6437 else if (ocfs2_should_order_data(inode)) { 6696 else if (ocfs2_should_order_data(inode)) {
6438 ret = ocfs2_jbd2_file_inode(handle, inode); 6697 ret = ocfs2_jbd2_file_inode(handle, inode);
6439#ifdef CONFIG_OCFS2_COMPAT_JBD
6440 ret = walk_page_buffers(handle, page_buffers(page),
6441 from, to, &partial,
6442 ocfs2_journal_dirty_data);
6443#endif
6444 if (ret < 0) 6698 if (ret < 0)
6445 mlog_errno(ret); 6699 mlog_errno(ret);
6446 } 6700 }
@@ -6663,6 +6917,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6663 struct page **pages = NULL; 6917 struct page **pages = NULL;
6664 loff_t end = osb->s_clustersize; 6918 loff_t end = osb->s_clustersize;
6665 struct ocfs2_extent_tree et; 6919 struct ocfs2_extent_tree et;
6920 int did_quota = 0;
6666 6921
6667 has_data = i_size_read(inode) ? 1 : 0; 6922 has_data = i_size_read(inode) ? 1 : 0;
6668 6923
@@ -6682,15 +6937,16 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6682 } 6937 }
6683 } 6938 }
6684 6939
6685 handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS); 6940 handle = ocfs2_start_trans(osb,
6941 ocfs2_inline_to_extents_credits(osb->sb));
6686 if (IS_ERR(handle)) { 6942 if (IS_ERR(handle)) {
6687 ret = PTR_ERR(handle); 6943 ret = PTR_ERR(handle);
6688 mlog_errno(ret); 6944 mlog_errno(ret);
6689 goto out_unlock; 6945 goto out_unlock;
6690 } 6946 }
6691 6947
6692 ret = ocfs2_journal_access(handle, inode, di_bh, 6948 ret = ocfs2_journal_access_di(handle, inode, di_bh,
6693 OCFS2_JOURNAL_ACCESS_WRITE); 6949 OCFS2_JOURNAL_ACCESS_WRITE);
6694 if (ret) { 6950 if (ret) {
6695 mlog_errno(ret); 6951 mlog_errno(ret);
6696 goto out_commit; 6952 goto out_commit;
@@ -6701,6 +6957,13 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6701 unsigned int page_end; 6957 unsigned int page_end;
6702 u64 phys; 6958 u64 phys;
6703 6959
6960 if (vfs_dq_alloc_space_nodirty(inode,
6961 ocfs2_clusters_to_bytes(osb->sb, 1))) {
6962 ret = -EDQUOT;
6963 goto out_commit;
6964 }
6965 did_quota = 1;
6966
6704 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, 6967 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
6705 &num); 6968 &num);
6706 if (ret) { 6969 if (ret) {
@@ -6774,6 +7037,10 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6774 } 7037 }
6775 7038
6776out_commit: 7039out_commit:
7040 if (ret < 0 && did_quota)
7041 vfs_dq_free_space_nodirty(inode,
7042 ocfs2_clusters_to_bytes(osb->sb, 1));
7043
6777 ocfs2_commit_trans(osb, handle); 7044 ocfs2_commit_trans(osb, handle);
6778 7045
6779out_unlock: 7046out_unlock:
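
The two quota hunks above follow one pattern: charge quota before claiming each cluster, remember that a charge was made, and refund it on the error path. A condensed, hypothetical sketch of that pattern (the vfs_dq_* calls and byte conversion are quoted from the hunks):

static int example_quota_charged_alloc(struct inode *inode,
                                       struct ocfs2_super *osb)
{
        int ret, did_quota = 0;

        if (vfs_dq_alloc_space_nodirty(inode,
                                       ocfs2_clusters_to_bytes(osb->sb, 1))) {
                ret = -EDQUOT;
                goto out;
        }
        did_quota = 1;

        ret = 0;        /* ... claim and fill the cluster here ... */

out:
        if (ret < 0 && did_quota)
                vfs_dq_free_space_nodirty(inode,
                                ocfs2_clusters_to_bytes(osb->sb, 1));
        return ret;
}
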
@@ -6813,7 +7080,8 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
6813 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, 7080 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
6814 i_size_read(inode)); 7081 i_size_read(inode));
6815 7082
6816 path = ocfs2_new_path(fe_bh, &di->id2.i_list); 7083 path = ocfs2_new_path(fe_bh, &di->id2.i_list,
7084 ocfs2_journal_access_di);
6817 if (!path) { 7085 if (!path) {
6818 status = -ENOMEM; 7086 status = -ENOMEM;
6819 mlog_errno(status); 7087 mlog_errno(status);
@@ -6984,20 +7252,14 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
6984 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); 7252 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
6985 7253
6986 if (fe->id2.i_list.l_tree_depth) { 7254 if (fe->id2.i_list.l_tree_depth) {
6987 status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk), 7255 status = ocfs2_read_extent_block(inode,
6988 &last_eb_bh); 7256 le64_to_cpu(fe->i_last_eb_blk),
7257 &last_eb_bh);
6989 if (status < 0) { 7258 if (status < 0) {
6990 mlog_errno(status); 7259 mlog_errno(status);
6991 goto bail; 7260 goto bail;
6992 } 7261 }
6993 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 7262 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6994 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
6995 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
6996
6997 brelse(last_eb_bh);
6998 status = -EIO;
6999 goto bail;
7000 }
7001 } 7263 }
7002 7264
7003 (*tc)->tc_last_eb_bh = last_eb_bh; 7265 (*tc)->tc_last_eb_bh = last_eb_bh;
@@ -7052,8 +7314,8 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7052 goto out; 7314 goto out;
7053 } 7315 }
7054 7316
7055 ret = ocfs2_journal_access(handle, inode, di_bh, 7317 ret = ocfs2_journal_access_di(handle, inode, di_bh,
7056 OCFS2_JOURNAL_ACCESS_WRITE); 7318 OCFS2_JOURNAL_ACCESS_WRITE);
7057 if (ret) { 7319 if (ret) {
7058 mlog_errno(ret); 7320 mlog_errno(ret);
7059 goto out_commit; 7321 goto out_commit;
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 70257c84cfbe..cceff5c37f47 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -45,7 +45,9 @@
45 * 45 *
46 * ocfs2_extent_tree contains info for the root of the b-tree, it must have a 46 * ocfs2_extent_tree contains info for the root of the b-tree, it must have a
47 * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree 47 * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
48 * functions. 48 * functions. With metadata ECC, we now call a different journal_access
49 * function for each type of metadata, so the tree root must also carry
50 * its root_journal_access function.
49 * ocfs2_extent_tree_operations abstract the normal operations we do for 51 * ocfs2_extent_tree_operations abstract the normal operations we do for
50 * the root of extent b-tree. 52 * the root of extent b-tree.
51 */ 53 */
@@ -54,6 +56,7 @@ struct ocfs2_extent_tree {
54 struct ocfs2_extent_tree_operations *et_ops; 56 struct ocfs2_extent_tree_operations *et_ops;
55 struct buffer_head *et_root_bh; 57 struct buffer_head *et_root_bh;
56 struct ocfs2_extent_list *et_root_el; 58 struct ocfs2_extent_list *et_root_el;
59 ocfs2_journal_access_func et_root_journal_access;
57 void *et_object; 60 void *et_object;
58 unsigned int et_max_leaf_clusters; 61 unsigned int et_max_leaf_clusters;
59}; 62};
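
For reference, the function-pointer type stored in et_root_journal_access; this is assumed to match the typedef in fs/ocfs2/journal.h rather than being quoted from this patch, though it agrees with how ocfs2_et_root_journal_access() invokes it above:

typedef int (*ocfs2_journal_access_func)(handle_t *handle,
                                         struct inode *inode,
                                         struct buffer_head *bh,
                                         int type);
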
@@ -68,10 +71,18 @@ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
68void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, 71void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
69 struct inode *inode, 72 struct inode *inode,
70 struct buffer_head *bh); 73 struct buffer_head *bh);
74struct ocfs2_xattr_value_buf;
71void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, 75void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
72 struct inode *inode, 76 struct inode *inode,
73 struct buffer_head *bh, 77 struct ocfs2_xattr_value_buf *vb);
74 struct ocfs2_xattr_value_root *xv); 78
79/*
80 * Read an extent block into *bh. If *bh is NULL, a bh will be
81 * allocated. This is a cached read. The extent block will be validated
82 * with ocfs2_validate_extent_block().
83 */
84int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
85 struct buffer_head **bh);
75 86
76struct ocfs2_alloc_context; 87struct ocfs2_alloc_context;
77int ocfs2_insert_extent(struct ocfs2_super *osb, 88int ocfs2_insert_extent(struct ocfs2_super *osb,
@@ -110,6 +121,11 @@ int ocfs2_remove_extent(struct inode *inode,
110 u32 cpos, u32 len, handle_t *handle, 121 u32 cpos, u32 len, handle_t *handle,
111 struct ocfs2_alloc_context *meta_ac, 122 struct ocfs2_alloc_context *meta_ac,
112 struct ocfs2_cached_dealloc_ctxt *dealloc); 123 struct ocfs2_cached_dealloc_ctxt *dealloc);
124int ocfs2_remove_btree_range(struct inode *inode,
125 struct ocfs2_extent_tree *et,
126 u32 cpos, u32 phys_cpos, u32 len,
127 struct ocfs2_cached_dealloc_ctxt *dealloc);
128
113int ocfs2_num_free_extents(struct ocfs2_super *osb, 129int ocfs2_num_free_extents(struct ocfs2_super *osb,
114 struct inode *inode, 130 struct inode *inode,
115 struct ocfs2_extent_tree *et); 131 struct ocfs2_extent_tree *et);
@@ -167,10 +183,18 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb);
  */
 struct ocfs2_cached_dealloc_ctxt {
         struct ocfs2_per_slot_free_list *c_first_suballocator;
+        struct ocfs2_cached_block_free *c_global_allocator;
 };
 static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
 {
         c->c_first_suballocator = NULL;
+        c->c_global_allocator = NULL;
+}
+int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+                                u64 blkno, unsigned int bit);
+static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
+{
+        return c->c_global_allocator != NULL;
 }
 int ocfs2_run_deallocs(struct ocfs2_super *osb,
                        struct ocfs2_cached_dealloc_ctxt *ctxt);
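The new c_global_allocator field lets a single ctxt queue frees against the global bitmap as well as the per-slot suballocators. A minimal sketch of the intended lifecycle, using only the declarations above (the wrapper function itself is invented for illustration, and error handling is pared down):

static int example_remove_and_dealloc(struct ocfs2_super *osb,
                                      struct inode *inode,
                                      struct ocfs2_extent_tree *et,
                                      u32 cpos, u32 phys_cpos, u32 len)
{
        int ret;
        struct ocfs2_cached_dealloc_ctxt dealloc;

        ocfs2_init_dealloc_ctxt(&dealloc);

        /* Queue the freed clusters instead of freeing them inline. */
        ret = ocfs2_remove_btree_range(inode, et, cpos, phys_cpos, len,
                                       &dealloc);
        if (ret)
                return ret;

        /* Run the queued frees once it is safe to take allocator locks. */
        return ocfs2_run_deallocs(osb, &dealloc);
}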
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index c22543b33420..a067a6cffb01 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -27,6 +27,7 @@
 #include <linux/swap.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/mpage.h>
+#include <linux/quotaops.h>
 
 #define MLOG_MASK_PREFIX ML_FILE_IO
 #include <cluster/masklog.h>
@@ -68,20 +69,13 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
                 goto bail;
         }
 
-        status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
+        status = ocfs2_read_inode_block(inode, &bh);
         if (status < 0) {
                 mlog_errno(status);
                 goto bail;
         }
         fe = (struct ocfs2_dinode *) bh->b_data;
 
-        if (!OCFS2_IS_VALID_DINODE(fe)) {
-                mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
-                     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
-                     fe->i_signature);
-                goto bail;
-        }
-
         if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
                                                     le32_to_cpu(fe->i_clusters))) {
                 mlog(ML_ERROR, "block offset is outside the allocated size: "
@@ -262,7 +256,7 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
         BUG_ON(!PageLocked(page));
         BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
 
-        ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+        ret = ocfs2_read_inode_block(inode, &di_bh);
         if (ret) {
                 mlog_errno(ret);
                 goto out;
@@ -481,12 +475,6 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
 
         if (ocfs2_should_order_data(inode)) {
                 ret = ocfs2_jbd2_file_inode(handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-                ret = walk_page_buffers(handle,
-                                        page_buffers(page),
-                                        from, to, NULL,
-                                        ocfs2_journal_dirty_data);
-#endif
                 if (ret < 0)
                         mlog_errno(ret);
         }
@@ -1072,15 +1060,8 @@ static void ocfs2_write_failure(struct inode *inode,
                 tmppage = wc->w_pages[i];
 
                 if (page_has_buffers(tmppage)) {
-                        if (ocfs2_should_order_data(inode)) {
+                        if (ocfs2_should_order_data(inode))
                                 ocfs2_jbd2_file_inode(wc->w_handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-                                walk_page_buffers(wc->w_handle,
-                                                  page_buffers(tmppage),
-                                                  from, to, NULL,
-                                                  ocfs2_journal_dirty_data);
-#endif
-                        }
 
                         block_commit_write(tmppage, from, to);
                 }
@@ -1531,8 +1512,8 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
                 goto out;
         }
 
-        ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+        ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
         if (ret) {
                 ocfs2_commit_trans(osb, handle);
 
@@ -1750,15 +1731,20 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 
         wc->w_handle = handle;
 
+        if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode,
+                        ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) {
+                ret = -EDQUOT;
+                goto out_commit;
+        }
         /*
          * We don't want this to fail in ocfs2_write_end(), so do it
          * here.
          */
-        ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+        ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
         if (ret) {
                 mlog_errno(ret);
-                goto out_commit;
+                goto out_quota;
         }
 
         /*
@@ -1771,14 +1757,14 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
                                          mmap_page);
         if (ret) {
                 mlog_errno(ret);
-                goto out_commit;
+                goto out_quota;
         }
 
         ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
                                           len);
         if (ret) {
                 mlog_errno(ret);
-                goto out_commit;
+                goto out_quota;
         }
 
         if (data_ac)
@@ -1790,6 +1776,10 @@ success:
         *pagep = wc->w_target_page;
         *fsdata = wc;
         return 0;
+out_quota:
+        if (clusters_to_alloc)
+                vfs_dq_free_space(inode,
+                                  ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
 out_commit:
         ocfs2_commit_trans(osb, handle);
 
@@ -1919,15 +1909,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
                 }
 
                 if (page_has_buffers(tmppage)) {
-                        if (ocfs2_should_order_data(inode)) {
+                        if (ocfs2_should_order_data(inode))
                                 ocfs2_jbd2_file_inode(wc->w_handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-                                walk_page_buffers(wc->w_handle,
-                                                  page_buffers(tmppage),
-                                                  from, to, NULL,
-                                                  ocfs2_journal_dirty_data);
-#endif
-                        }
                         block_commit_write(tmppage, from, to);
                 }
         }
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
new file mode 100644
index 000000000000..2a947c44e594
--- /dev/null
+++ b/fs/ocfs2/blockcheck.c
@@ -0,0 +1,477 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * blockcheck.c
5 *
6 * Checksum and ECC codes for the OCFS2 userspace library.
7 *
8 * Copyright (C) 2006, 2008 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License, version 2, as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#include <linux/kernel.h>
21#include <linux/types.h>
22#include <linux/crc32.h>
23#include <linux/buffer_head.h>
24#include <linux/bitops.h>
25#include <asm/byteorder.h>
26
27#include <cluster/masklog.h>
28
29#include "ocfs2.h"
30
31#include "blockcheck.h"
32
33
34/*
35 * We use the following conventions:
36 *
37 * d = # data bits
38 * p = # parity bits
39 * c = # total code bits (d + p)
40 */
41
42
43/*
44 * Calculate the bit offset in the hamming code buffer based on the bit's
45 * offset in the data buffer. Since the hamming code reserves all
46 * power-of-two bits for parity, the data bit number and the code bit
47 * number are offset by all the parity bits beforehand.
48 *
49 * Recall that bit numbers in hamming code are 1-based. This function
50 * takes the 0-based data bit from the caller.
51 *
52 * An example. Take bit 1 of the data buffer. 1 is a power of two (2^0),
53 * so it's a parity bit. 2 is a power of two (2^1), so it's a parity bit.
54 * 3 is not a power of two. So bit 1 of the data buffer ends up as bit 3
55 * in the code buffer.
56 *
57 * The caller can pass in *p if it wants to keep track of the most recent
58 * number of parity bits added. This allows the function to start the
59 * calculation at the last place.
60 */
61static unsigned int calc_code_bit(unsigned int i, unsigned int *p_cache)
62{
63 unsigned int b, p = 0;
64
65 /*
66 * Data bits are 0-based, but we're talking code bits, which
67 * are 1-based.
68 */
69 b = i + 1;
70
71 /* Use the cache if it is there */
72 if (p_cache)
73 p = *p_cache;
74 b += p;
75
76 /*
77 * For every power of two below our bit number, bump our bit.
78 *
79 * We compare with (b + 1) because we have to compare with what b
80 * would be _if_ it were bumped up by the parity bit. Capice?
81 *
82 * p is set above.
83 */
84 for (; (1 << p) < (b + 1); p++)
85 b++;
86
87 if (p_cache)
88 *p_cache = p;
89
90 return b;
91}
92
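/*
 * Illustrative only, under the definitions above: with the parity
 * cache threaded through, data bits 0..7 land on code bits 3, 5, 6,
 * 7, 9, 10, 11 and 12 -- every power-of-two code position (1, 2, 4,
 * 8, ...) is skipped because it is reserved for a parity bit.
 */
static void __maybe_unused calc_code_bit_example(void)
{
	unsigned int i, p = 0;

	for (i = 0; i < 8; i++)
		printk(KERN_DEBUG "data bit %u -> code bit %u\n",
		       i, calc_code_bit(i, &p));
}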
93/*
94 * This is the low level encoder function. It can be called across
95 * multiple hunks just like the crc32 code. 'd' is the number of bits
96 * _in_this_hunk_. nr is the bit offset of this hunk. So, if you had
97 * two 512B buffers, you would do it like so:
98 *
99 * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
100 * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
101 *
102 * If you just have one buffer, use ocfs2_hamming_encode_block().
103 */
104u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr)
105{
106 unsigned int i, b, p = 0;
107
108 BUG_ON(!d);
109
110 /*
111 * b is the hamming code bit number. Hamming code specifies a
112 * 1-based array, but C uses 0-based. So 'i' is for C, and 'b' is
113 * for the algorithm.
114 *
115 * The i++ in the for loop is so that the start offset passed
116 * to ocfs2_find_next_bit_set() is one greater than the previously
117 * found bit.
118 */
119 for (i = 0; (i = ocfs2_find_next_bit(data, d, i)) < d; i++)
120 {
121 /*
122 * i is the offset in this hunk, nr + i is the total bit
123 * offset.
124 */
125 b = calc_code_bit(nr + i, &p);
126
127 /*
128 * Data bits in the resultant code are checked by
129 * parity bits that are part of the bit number
130 * representation. Huh?
131 *
132 * <wikipedia href="http://en.wikipedia.org/wiki/Hamming_code">
133 * In other words, the parity bit at position 2^k
134 * checks bits in positions having bit k set in
135 * their binary representation. Conversely, for
136 * instance, bit 13, i.e. 1101(2), is checked by
137 * bits 1000(2) = 8, 0100(2)=4 and 0001(2) = 1.
138 * </wikipedia>
139 *
140 * Note that 'k' is the _code_ bit number. 'b' in
141 * our loop.
142 */
143 parity ^= b;
144 }
145
146 /* While the data buffer was treated as little endian, the
147 * return value is in host endian. */
148 return parity;
149}
150
151u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize)
152{
153 return ocfs2_hamming_encode(0, data, blocksize * 8, 0);
154}
155
156/*
157 * Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit
158 * offset of the current hunk.  If the bit to be fixed is not part of the
159 * current hunk, this does nothing.
160 *
161 * If you only have one hunk, use ocfs2_hamming_fix_block().
162 */
163void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
164 unsigned int fix)
165{
166 unsigned int i, b;
167
168 BUG_ON(!d);
169
170 /*
171 * If the bit to fix has an hweight of 1, it's a parity bit. One
172 * busted parity bit is its own error. Nothing to do here.
173 */
174 if (hweight32(fix) == 1)
175 return;
176
177 /*
178 * nr + d is the bit right past the data hunk we're looking at.
179 * If the fix is after that, there is nothing to do.
180 */
181 if (fix >= calc_code_bit(nr + d, NULL))
182 return;
183
184 /*
185 * nr is the offset in the data hunk we're starting at. Let's
186 * start b at the offset in the code buffer. See hamming_encode()
187 * for a more detailed description of 'b'.
188 */
189 b = calc_code_bit(nr, NULL);
190 /* If the fix is before this hunk, nothing to do */
191 if (fix < b)
192 return;
193
194 for (i = 0; i < d; i++, b++)
195 {
196 /* Skip past parity bits */
197 while (hweight32(b) == 1)
198 b++;
199
200 /*
201 * i is the offset in this data hunk.
202 * nr + i is the offset in the total data buffer.
203 * b is the offset in the total code buffer.
204 *
205 * Thus, when b == fix, bit i in the current hunk needs
206 * fixing.
207 */
208 if (b == fix)
209 {
210 if (ocfs2_test_bit(i, data))
211 ocfs2_clear_bit(i, data);
212 else
213 ocfs2_set_bit(i, data);
214 break;
215 }
216 }
217}
218
219void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
220 unsigned int fix)
221{
222 ocfs2_hamming_fix(data, blocksize * 8, 0, fix);
223}
224
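/*
 * Illustrative only: a self-check of the fix path above.  Flipping a
 * single data bit changes the block parity by exactly that bit's code
 * position, so stored-parity ^ recomputed-parity is the 'fix' value
 * that ocfs2_hamming_fix_block() needs to undo the damage.
 */
static void __maybe_unused hamming_fix_example(void *data,
					       unsigned int blocksize)
{
	u32 good = ocfs2_hamming_encode_block(data, blocksize);
	u32 bad;

	/* Inject a one-bit error at data bit 0. */
	if (ocfs2_test_bit(0, data))
		ocfs2_clear_bit(0, data);
	else
		ocfs2_set_bit(0, data);

	bad = ocfs2_hamming_encode_block(data, blocksize);
	ocfs2_hamming_fix_block(data, blocksize, good ^ bad);
	BUG_ON(ocfs2_hamming_encode_block(data, blocksize) != good);
}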
225/*
226 * This function generates check information for a block.
227 * data is the block to be checked. bc is a pointer to the
228 * ocfs2_block_check structure describing the crc32 and the ecc.
229 *
230 * bc should be a pointer inside data, as the function will
231 * take care of zeroing it before calculating the check information. If
232 * bc does not point inside data, the caller must make sure any inline
233 * ocfs2_block_check structures are zeroed.
234 *
235 * The data buffer must be in on-disk endian (little endian for ocfs2).
236 * bc will be filled with little-endian values and will be ready to go to
237 * disk.
238 */
239void ocfs2_block_check_compute(void *data, size_t blocksize,
240 struct ocfs2_block_check *bc)
241{
242 u32 crc;
243 u32 ecc;
244
245 memset(bc, 0, sizeof(struct ocfs2_block_check));
246
247 crc = crc32_le(~0, data, blocksize);
248 ecc = ocfs2_hamming_encode_block(data, blocksize);
249
250 /*
251 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
252 * larger than 16 bits.
253 */
254 BUG_ON(ecc > USHORT_MAX);
255
256 bc->bc_crc32e = cpu_to_le32(crc);
257 bc->bc_ecc = cpu_to_le16((u16)ecc);
258}
259
260/*
261 * This function validates existing check information. Like _compute,
262 * the function will take care of zeroing bc before calculating check codes.
263 * If bc is not a pointer inside data, the caller must have zeroed any
264 * inline ocfs2_block_check structures.
265 *
266 * Again, the data passed in should be the on-disk endian.
267 */
268int ocfs2_block_check_validate(void *data, size_t blocksize,
269 struct ocfs2_block_check *bc)
270{
271 int rc = 0;
272 struct ocfs2_block_check check;
273 u32 crc, ecc;
274
275 check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
276 check.bc_ecc = le16_to_cpu(bc->bc_ecc);
277
278 memset(bc, 0, sizeof(struct ocfs2_block_check));
279
280 /* Fast path - if the crc32 validates, we're good to go */
281 crc = crc32_le(~0, data, blocksize);
282 if (crc == check.bc_crc32e)
283 goto out;
284
285 mlog(ML_ERROR,
286 "CRC32 failed: stored: %u, computed %u. Applying ECC.\n",
287 (unsigned int)check.bc_crc32e, (unsigned int)crc);
288
289 /* Ok, try ECC fixups */
290 ecc = ocfs2_hamming_encode_block(data, blocksize);
291 ocfs2_hamming_fix_block(data, blocksize, ecc ^ check.bc_ecc);
292
293 /* And check the crc32 again */
294 crc = crc32_le(~0, data, blocksize);
295 if (crc == check.bc_crc32e)
296 goto out;
297
298 mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
299 (unsigned int)check.bc_crc32e, (unsigned int)crc);
300
301 rc = -EIO;
302
303out:
304 bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
305 bc->bc_ecc = cpu_to_le16(check.bc_ecc);
306
307 return rc;
308}
309
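/*
 * For reference, the structure filled by the routines above is a
 * little-endian crc32 followed by 16 bits of hamming parity.  The
 * authoritative definition lives in ocfs2_fs.h; this sketch only
 * mirrors how the fields are used here:
 *
 *	struct ocfs2_block_check {
 *		__le32 bc_crc32e;	(802.3 Ethernet II CRC32)
 *		__le16 bc_ecc;		(single-error-correcting parity)
 *		__le16 bc_reserved1;
 *	};
 */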
310/*
311 * This function generates check information for a list of buffer_heads.
312 * bhs is the blocks to be checked. bc is a pointer to the
313 * ocfs2_block_check structure describing the crc32 and the ecc.
314 *
315 * bc should be a pointer inside data, as the function will
316 * take care of zeroing it before calculating the check information. If
317 * bc does not point inside data, the caller must make sure any inline
318 * ocfs2_block_check structures are zeroed.
319 *
320 * The data buffer must be in on-disk endian (little endian for ocfs2).
321 * bc will be filled with little-endian values and will be ready to go to
322 * disk.
323 */
324void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
325 struct ocfs2_block_check *bc)
326{
327 int i;
328 u32 crc, ecc;
329
330 BUG_ON(nr < 0);
331
332 if (!nr)
333 return;
334
335 memset(bc, 0, sizeof(struct ocfs2_block_check));
336
337 for (i = 0, crc = ~0, ecc = 0; i < nr; i++) {
338 crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
339 /*
340 * The number of bits in a buffer is obviously b_size*8.
341 * The offset of this buffer is b_size*i, so the bit offset
342 * of this buffer is b_size*8*i.
343 */
344 ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
345 bhs[i]->b_size * 8,
346 bhs[i]->b_size * 8 * i);
347 }
348
349 /*
350 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
351 * larger than 16 bits.
352 */
353 BUG_ON(ecc > USHORT_MAX);
354
355 bc->bc_crc32e = cpu_to_le32(crc);
356 bc->bc_ecc = cpu_to_le16((u16)ecc);
357}
358
359/*
360 * This function validates existing check information on a list of
361 * buffer_heads. Like _compute_bhs, the function will take care of
362 * zeroing bc before calculating check codes. If bc is not a pointer
363 * inside data, the caller must have zeroed any inline
364 * ocfs2_block_check structures.
365 *
366 * Again, the data passed in should be the on-disk endian.
367 */
368int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
369 struct ocfs2_block_check *bc)
370{
371 int i, rc = 0;
372 struct ocfs2_block_check check;
373 u32 crc, ecc, fix;
374
375 BUG_ON(nr < 0);
376
377 if (!nr)
378 return 0;
379
380 check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
381 check.bc_ecc = le16_to_cpu(bc->bc_ecc);
382
383 memset(bc, 0, sizeof(struct ocfs2_block_check));
384
385 /* Fast path - if the crc32 validates, we're good to go */
386 for (i = 0, crc = ~0; i < nr; i++)
387 crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
388 if (crc == check.bc_crc32e)
389 goto out;
390
391 mlog(ML_ERROR,
392 "CRC32 failed: stored: %u, computed %u. Applying ECC.\n",
393 (unsigned int)check.bc_crc32e, (unsigned int)crc);
394
395 /* Ok, try ECC fixups */
396 for (i = 0, ecc = 0; i < nr; i++) {
397 /*
398 * The number of bits in a buffer is obviously b_size*8.
399 * The offset of this buffer is b_size*i, so the bit offset
400 * of this buffer is b_size*8*i.
401 */
402 ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
403 bhs[i]->b_size * 8,
404 bhs[i]->b_size * 8 * i);
405 }
406 fix = ecc ^ check.bc_ecc;
407 for (i = 0; i < nr; i++) {
408 /*
409 * Try the fix against each buffer. It will only affect
410 * one of them.
411 */
412 ocfs2_hamming_fix(bhs[i]->b_data, bhs[i]->b_size * 8,
413 bhs[i]->b_size * 8 * i, fix);
414 }
415
416 /* And check the crc32 again */
417 for (i = 0, crc = ~0; i < nr; i++)
418 crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
419 if (crc == check.bc_crc32e)
420 goto out;
421
422 mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
423 (unsigned int)check.bc_crc32e, (unsigned int)crc);
424
425 rc = -EIO;
426
427out:
428 bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
429 bc->bc_ecc = cpu_to_le16(check.bc_ecc);
430
431 return rc;
432}
433
434/*
435 * These are the main API. They check the superblock flag before
436 * calling the underlying operations.
437 *
438 * They expect the buffer(s) to be in disk format.
439 */
440void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
441 struct ocfs2_block_check *bc)
442{
443 if (ocfs2_meta_ecc(OCFS2_SB(sb)))
444 ocfs2_block_check_compute(data, sb->s_blocksize, bc);
445}
446
447int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
448 struct ocfs2_block_check *bc)
449{
450 int rc = 0;
451
452 if (ocfs2_meta_ecc(OCFS2_SB(sb)))
453 rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc);
454
455 return rc;
456}
457
458void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
459 struct buffer_head **bhs, int nr,
460 struct ocfs2_block_check *bc)
461{
462 if (ocfs2_meta_ecc(OCFS2_SB(sb)))
463 ocfs2_block_check_compute_bhs(bhs, nr, bc);
464}
465
466int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
467 struct buffer_head **bhs, int nr,
468 struct ocfs2_block_check *bc)
469{
470 int rc = 0;
471
472 if (ocfs2_meta_ecc(OCFS2_SB(sb)))
473 rc = ocfs2_block_check_validate_bhs(bhs, nr, bc);
474
475 return rc;
476}
477
diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h
new file mode 100644
index 000000000000..70ec3feda32f
--- /dev/null
+++ b/fs/ocfs2/blockcheck.h
@@ -0,0 +1,82 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * blockcheck.h
5 *
6 * Checksum and ECC codes for the OCFS2 userspace library.
7 *
8 * Copyright (C) 2004, 2008 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License, version 2, as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#ifndef OCFS2_BLOCKCHECK_H
21#define OCFS2_BLOCKCHECK_H
22
23
24/* High level block API */
25void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
26 struct ocfs2_block_check *bc);
27int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
28 struct ocfs2_block_check *bc);
29void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
30 struct buffer_head **bhs, int nr,
31 struct ocfs2_block_check *bc);
32int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
33 struct buffer_head **bhs, int nr,
34 struct ocfs2_block_check *bc);
35
36/* Lower level API */
37void ocfs2_block_check_compute(void *data, size_t blocksize,
38 struct ocfs2_block_check *bc);
39int ocfs2_block_check_validate(void *data, size_t blocksize,
40 struct ocfs2_block_check *bc);
41void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
42 struct ocfs2_block_check *bc);
43int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
44 struct ocfs2_block_check *bc);
45
46/*
47 * Hamming code functions
48 */
49
50/*
51 * Encoding hamming code parity bits for a buffer.
52 *
53 * This is the low level encoder function. It can be called across
54 * multiple hunks just like the crc32 code. 'd' is the number of bits
55 * _in_this_hunk_. nr is the bit offset of this hunk. So, if you had
56 * two 512B buffers, you would do it like so:
57 *
58 * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
59 * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
60 *
61 * If you just have one buffer, use ocfs2_hamming_encode_block().
62 */
63u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d,
64 unsigned int nr);
65/*
66 * Fix a buffer with a bit error. The 'fix' is the original parity
67 * xor'd with the parity calculated now.
68 *
69 * Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit
70 * offset of the current hunk.  If the bit to be fixed is not part of the
71 * current hunk, this does nothing.
72 *
73 * If you only have one buffer, use ocfs2_hamming_fix_block().
74 */
75void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
76 unsigned int fix);
77
78/* Convenience wrappers for a single buffer of data */
79extern u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize);
80extern void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
81 unsigned int fix);
82#endif
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 3a178ec48d7c..15c8e6deee2e 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -39,6 +39,18 @@
 
 #include "buffer_head_io.h"
 
+/*
+ * Bits on bh->b_state used by ocfs2.
+ *
+ * These MUST be after the JBD2 bits.  Hence, we use BH_JBDPrivateStart.
+ */
+enum ocfs2_state_bits {
+        BH_NeedsValidate = BH_JBDPrivateStart,
+};
+
+/* Expand the magic b_state functions */
+BUFFER_FNS(NeedsValidate, needs_validate);
+
 int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
                       struct inode *inode)
 {
@@ -166,7 +178,9 @@ bail:
 }
 
 int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
-                      struct buffer_head *bhs[], int flags)
+                      struct buffer_head *bhs[], int flags,
+                      int (*validate)(struct super_block *sb,
+                                      struct buffer_head *bh))
 {
         int status = 0;
         int i, ignore_cache = 0;
@@ -298,6 +312,8 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
 
                         clear_buffer_uptodate(bh);
                         get_bh(bh); /* for end_buffer_read_sync() */
+                        if (validate)
+                                set_buffer_needs_validate(bh);
                         bh->b_end_io = end_buffer_read_sync;
                         submit_bh(READ, bh);
                         continue;
@@ -328,6 +344,20 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
                         bhs[i] = NULL;
                         continue;
                 }
+
+                if (buffer_needs_validate(bh)) {
+                        /* We never set NeedsValidate if the
+                         * buffer was held by the journal, so
+                         * that better not have changed */
+                        BUG_ON(buffer_jbd(bh));
+                        clear_buffer_needs_validate(bh);
+                        status = validate(inode->i_sb, bh);
+                        if (status) {
+                                put_bh(bh);
+                                bhs[i] = NULL;
+                                continue;
+                        }
+                }
         }
 
         /* Always set the buffer in the cache, even if it was
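For readers who have not met the buffer_head helpers: the BUFFER_FNS(NeedsValidate, needs_validate) invocation above generates the accessors used in this hunk. Per include/linux/buffer_head.h it expands to roughly the following (a sketch of the expansion, not new code in the patch):

static inline void set_buffer_needs_validate(struct buffer_head *bh)
{
        set_bit(BH_NeedsValidate, &bh->b_state);
}

static inline void clear_buffer_needs_validate(struct buffer_head *bh)
{
        clear_bit(BH_NeedsValidate, &bh->b_state);
}

static inline int buffer_needs_validate(const struct buffer_head *bh)
{
        return test_bit(BH_NeedsValidate, &bh->b_state);
}

That lets the submit path mark a buffer before I/O and the completion path test-and-clear it afterwards, with no state beyond the existing b_state word.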
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index 75e1dcb1ade7..c75d682dadd8 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -31,21 +31,24 @@
 void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
                               int uptodate);
 
-static inline int ocfs2_read_block(struct inode *inode,
-                                   u64 off,
-                                   struct buffer_head **bh);
-
 int ocfs2_write_block(struct ocfs2_super *osb,
                       struct buffer_head *bh,
                       struct inode *inode);
-int ocfs2_read_blocks(struct inode *inode,
-                      u64 block,
-                      int nr,
-                      struct buffer_head *bhs[],
-                      int flags);
 int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
                            unsigned int nr, struct buffer_head *bhs[]);
 
+/*
+ * If not NULL, validate() will be called on a buffer that is freshly
+ * read from disk.  It will not be called if the buffer was in cache.
+ * Note that if validate() is being used for this buffer, it needs to
+ * be set even for a READAHEAD call, as it marks the buffer for later
+ * validation.
+ */
+int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
+                      struct buffer_head *bhs[], int flags,
+                      int (*validate)(struct super_block *sb,
+                                      struct buffer_head *bh));
+
 int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
                                 struct buffer_head *bh);
 
@@ -53,7 +56,9 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 #define OCFS2_BH_READAHEAD 8
 
 static inline int ocfs2_read_block(struct inode *inode, u64 off,
-                                   struct buffer_head **bh)
+                                   struct buffer_head **bh,
+                                   int (*validate)(struct super_block *sb,
+                                                   struct buffer_head *bh))
 {
         int status = 0;
 
@@ -63,7 +68,7 @@ static inline int ocfs2_read_block(struct inode *inode, u64 off,
                 goto bail;
         }
 
-        status = ocfs2_read_blocks(inode, off, 1, bh, 0);
+        status = ocfs2_read_blocks(inode, off, 1, bh, 0, validate);
 
 bail:
         return status;
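The validate hook slots in as shown below; ocfs2_validate_dir_block() in the fs/ocfs2/dir.c hunks later in this patch is a real instance. A minimal sketch, assuming a hypothetical metadata type that keeps its ocfs2_block_check in a member named check (both the type and the member are invented for illustration):

static int example_validate(struct super_block *sb, struct buffer_head *bh)
{
        /* 'struct example_block' and its 'check' member are invented;
         * every real ocfs2 metadata type has its own layout. */
        struct example_block *blk = (struct example_block *)bh->b_data;

        return ocfs2_validate_meta_ecc(sb, bh->b_data, &blk->check);
}

static int example_read_block(struct inode *inode, u64 blkno,
                              struct buffer_head **bh)
{
        /* Validation runs only when the read misses the uptodate cache. */
        return ocfs2_read_block(inode, blkno, bh, example_validate);
}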
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 6ebaa58e2c03..04697ba7f73e 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -854,7 +854,7 @@ static int o2hb_thread(void *data)
 
         while (!kthread_should_stop() && !reg->hr_unclean_stop) {
                 /* We track the time spent inside
-                 * o2hb_do_disk_heartbeat so that we avoid more then
+                 * o2hb_do_disk_heartbeat so that we avoid more than
                  * hr_timeout_ms between disk writes. On busy systems
                  * this should result in a heartbeat which is less
                  * likely to time itself out. */
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index d8a0cb92cef6..96df5416993e 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -110,6 +110,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
         define_mask(QUORUM),
         define_mask(EXPORT),
         define_mask(XATTR),
+        define_mask(QUOTA),
         define_mask(ERROR),
         define_mask(NOTICE),
         define_mask(KTHREAD),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 57670c680471..7e72a81bc2d4 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -113,6 +113,7 @@
 #define ML_QUORUM       0x0000000008000000ULL /* net connection quorum */
 #define ML_EXPORT       0x0000000010000000ULL /* ocfs2 export operations */
 #define ML_XATTR        0x0000000020000000ULL /* ocfs2 extended attributes */
+#define ML_QUOTA        0x0000000040000000ULL /* ocfs2 quota operations */
 /* bits that are infrequently given and frequently matched in the high word */
 #define ML_ERROR        0x0000000100000000ULL /* sent to KERN_ERR */
 #define ML_NOTICE       0x0000000200000000ULL /* sent to KERN_NOTICE */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 026e6eb85187..f2c4098cf337 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -40,6 +40,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
+#include <linux/quotaops.h>
 
 #define MLOG_MASK_PREFIX ML_NAMEI
 #include <cluster/masklog.h>
@@ -47,6 +48,7 @@
47#include "ocfs2.h" 48#include "ocfs2.h"
48 49
49#include "alloc.h" 50#include "alloc.h"
51#include "blockcheck.h"
50#include "dir.h" 52#include "dir.h"
51#include "dlmglue.h" 53#include "dlmglue.h"
52#include "extent_map.h" 54#include "extent_map.h"
@@ -82,47 +84,72 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
                                struct ocfs2_alloc_context *meta_ac,
                                struct buffer_head **new_bh);
 
-static struct buffer_head *ocfs2_bread(struct inode *inode,
-                                       int block, int *err, int reada)
-{
-        struct buffer_head *bh = NULL;
-        int tmperr;
-        u64 p_blkno;
-        int readflags = 0;
-
-        if (reada)
-                readflags |= OCFS2_BH_READAHEAD;
-
-        if (((u64)block << inode->i_sb->s_blocksize_bits) >=
-            i_size_read(inode)) {
-                BUG_ON(!reada);
-                return NULL;
-        }
-
-        down_read(&OCFS2_I(inode)->ip_alloc_sem);
-        tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
-                                             NULL);
-        up_read(&OCFS2_I(inode)->ip_alloc_sem);
-        if (tmperr < 0) {
-                mlog_errno(tmperr);
-                goto fail;
-        }
-
-        tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags);
-        if (tmperr < 0)
-                goto fail;
-
-        tmperr = 0;
-
-        *err = 0;
-        return bh;
-
-fail:
-        brelse(bh);
-        bh = NULL;
-
-        *err = -EIO;
-        return NULL;
+/*
+ * These are distinct checks because future versions of the file system will
+ * want to have a trailing dirent structure independent of indexing.
+ */
+static int ocfs2_dir_has_trailer(struct inode *dir)
+{
+        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+                return 0;
+
+        return ocfs2_meta_ecc(OCFS2_SB(dir->i_sb));
+}
+
+static int ocfs2_supports_dir_trailer(struct ocfs2_super *osb)
+{
+        return ocfs2_meta_ecc(osb);
+}
+
+static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
+{
+        return sb->s_blocksize - sizeof(struct ocfs2_dir_block_trailer);
+}
+
+#define ocfs2_trailer_from_bh(_bh, _sb) ((struct ocfs2_dir_block_trailer *) ((_bh)->b_data + ocfs2_dir_trailer_blk_off((_sb))))
+
+/* XXX ocfs2_block_dqtrailer() is similar but not quite - can we make
+ * them more consistent? */
+struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
+                                                            void *data)
+{
+        char *p = data;
+
+        p += blocksize - sizeof(struct ocfs2_dir_block_trailer);
+        return (struct ocfs2_dir_block_trailer *)p;
+}
+
+/*
+ * XXX: This is executed once on every dirent.  We should consider optimizing
+ * it.
+ */
+static int ocfs2_skip_dir_trailer(struct inode *dir,
+                                  struct ocfs2_dir_entry *de,
+                                  unsigned long offset,
+                                  unsigned long blklen)
+{
+        unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer);
+
+        if (!ocfs2_dir_has_trailer(dir))
+                return 0;
+
+        if (offset != toff)
+                return 0;
+
+        return 1;
+}
+
+static void ocfs2_init_dir_trailer(struct inode *inode,
+                                   struct buffer_head *bh)
+{
+        struct ocfs2_dir_block_trailer *trailer;
+
+        trailer = ocfs2_trailer_from_bh(bh, inode->i_sb);
+        strcpy(trailer->db_signature, OCFS2_DIR_TRAILER_SIGNATURE);
+        trailer->db_compat_rec_len =
+                        cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer));
+        trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
+        trailer->db_blkno = cpu_to_le64(bh->b_blocknr);
 }
 
 /*
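The trailer initialized above is deliberately shaped like an empty dirent: db_compat_rec_len covers the trailer bytes, so kernels that predate trailers walk over it harmlessly. A small sketch of the resulting block layout, using only the helpers introduced in this hunk (the debug function itself is invented):

static void __maybe_unused show_trailer_layout(struct super_block *sb,
                                               struct buffer_head *bh)
{
        /* Dirents may use [0, off); the trailer owns [off, blocksize)
         * and masquerades as an unusable empty dirent via its compat
         * fields, so pre-trailer dirent walkers simply skip it. */
        unsigned int off = ocfs2_dir_trailer_blk_off(sb);
        struct ocfs2_dir_block_trailer *trailer =
                ocfs2_trailer_from_bh(bh, sb);

        mlog(0, "trailer at %u of %lu, compat rec_len %u\n",
             off, sb->s_blocksize,
             le16_to_cpu(trailer->db_compat_rec_len));
}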
@@ -231,7 +258,7 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name,
         struct ocfs2_dinode *di;
         struct ocfs2_inline_data *data;
 
-        ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
+        ret = ocfs2_read_inode_block(dir, &di_bh);
         if (ret) {
                 mlog_errno(ret);
                 goto out;
@@ -250,6 +277,108 @@ out:
         return NULL;
 }
 
+static int ocfs2_validate_dir_block(struct super_block *sb,
+                                    struct buffer_head *bh)
+{
+        int rc;
+        struct ocfs2_dir_block_trailer *trailer =
+                ocfs2_trailer_from_bh(bh, sb);
+
+
+        /*
+         * We don't validate dirents here, that's handled
+         * in-place when the code walks them.
+         */
+        mlog(0, "Validating dirblock %llu\n",
+             (unsigned long long)bh->b_blocknr);
+
+        BUG_ON(!buffer_uptodate(bh));
+
+        /*
+         * If the ecc fails, we return the error but otherwise
+         * leave the filesystem running.  We know any error is
+         * local to this block.
+         *
+         * Note that we are safe to call this even if the directory
+         * doesn't have a trailer.  Filesystems without metaecc will do
+         * nothing, and filesystems with it will have one.
+         */
+        rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &trailer->db_check);
+        if (rc)
+                mlog(ML_ERROR, "Checksum failed for dinode %llu\n",
+                     (unsigned long long)bh->b_blocknr);
+
+        return rc;
+}
+
+/*
+ * This function forces all errors to -EIO for consistency with its
+ * predecessor, ocfs2_bread().  We haven't audited what returning the
+ * real error codes would do to callers.  We log the real codes with
+ * mlog_errno() before we squash them.
+ */
+static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
+                                struct buffer_head **bh, int flags)
+{
+        int rc = 0;
+        struct buffer_head *tmp = *bh;
+        struct ocfs2_dir_block_trailer *trailer;
+
+        rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
+                                    ocfs2_validate_dir_block);
+        if (rc) {
+                mlog_errno(rc);
+                goto out;
+        }
+
+        /*
+         * We check the trailer here rather than in
+         * ocfs2_validate_dir_block() because that function doesn't have
+         * the inode to test.
+         */
+        if (!(flags & OCFS2_BH_READAHEAD) &&
+            ocfs2_dir_has_trailer(inode)) {
+                trailer = ocfs2_trailer_from_bh(tmp, inode->i_sb);
+                if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
+                        rc = -EINVAL;
+                        ocfs2_error(inode->i_sb,
+                                    "Invalid dirblock #%llu: "
+                                    "signature = %.*s\n",
+                                    (unsigned long long)tmp->b_blocknr, 7,
+                                    trailer->db_signature);
+                        goto out;
+                }
+                if (le64_to_cpu(trailer->db_blkno) != tmp->b_blocknr) {
+                        rc = -EINVAL;
+                        ocfs2_error(inode->i_sb,
+                                    "Directory block #%llu has an invalid "
+                                    "db_blkno of %llu",
+                                    (unsigned long long)tmp->b_blocknr,
+                                    (unsigned long long)le64_to_cpu(trailer->db_blkno));
+                        goto out;
+                }
+                if (le64_to_cpu(trailer->db_parent_dinode) !=
+                    OCFS2_I(inode)->ip_blkno) {
+                        rc = -EINVAL;
+                        ocfs2_error(inode->i_sb,
+                                    "Directory block #%llu on dinode "
+                                    "#%llu has an invalid parent_dinode "
+                                    "of %llu",
+                                    (unsigned long long)tmp->b_blocknr,
+                                    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                                    (unsigned long long)le64_to_cpu(trailer->db_blkno));
+                        goto out;
+                }
+        }
+
+        /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
+        if (!*bh)
+                *bh = tmp;
+
+out:
+        return rc ? -EIO : 0;
+}
+
 static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
                                                struct inode *dir,
                                                struct ocfs2_dir_entry **res_dir)
@@ -296,15 +425,17 @@ restart:
                         }
                         num++;
 
-                        bh = ocfs2_bread(dir, b++, &err, 1);
+                        bh = NULL;
+                        err = ocfs2_read_dir_block(dir, b++, &bh,
+                                                   OCFS2_BH_READAHEAD);
                         bh_use[ra_max] = bh;
                 }
         }
         if ((bh = bh_use[ra_ptr++]) == NULL)
                 goto next;
-        if (ocfs2_read_block(dir, block, &bh)) {
+        if (ocfs2_read_dir_block(dir, block, &bh, 0)) {
                 /* read error, skip block & hope for the best.
-                 * ocfs2_read_block() has released the bh. */
+                 * ocfs2_read_dir_block() has released the bh. */
                 ocfs2_error(dir->i_sb, "reading directory %llu, "
                             "offset %lu\n",
                             (unsigned long long)OCFS2_I(dir)->ip_blkno,
@@ -381,14 +512,18 @@ int ocfs2_update_entry(struct inode *dir, handle_t *handle,
                        struct inode *new_entry_inode)
 {
         int ret;
+        ocfs2_journal_access_func access = ocfs2_journal_access_db;
 
         /*
          * The same code works fine for both inline-data and extent
-         * based directories, so no need to split this up.
+         * based directories, so no need to split this up.  The only
+         * difference is the journal_access function.
          */
 
-        ret = ocfs2_journal_access(handle, dir, de_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+                access = ocfs2_journal_access_di;
+
+        ret = access(handle, dir, de_bh, OCFS2_JOURNAL_ACCESS_WRITE);
         if (ret) {
                 mlog_errno(ret);
                 goto out;
@@ -410,9 +545,13 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
 {
         struct ocfs2_dir_entry *de, *pde;
         int i, status = -ENOENT;
+        ocfs2_journal_access_func access = ocfs2_journal_access_db;
 
         mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
 
+        if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+                access = ocfs2_journal_access_di;
+
         i = 0;
         pde = NULL;
         de = (struct ocfs2_dir_entry *) first_de;
@@ -423,8 +562,8 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
                         goto bail;
                 }
                 if (de == de_del) {
-                        status = ocfs2_journal_access(handle, dir, bh,
-                                                      OCFS2_JOURNAL_ACCESS_WRITE);
+                        status = access(handle, dir, bh,
+                                        OCFS2_JOURNAL_ACCESS_WRITE);
                         if (status < 0) {
                                 status = -EIO;
                                 mlog_errno(status);
@@ -458,7 +597,7 @@ static inline int ocfs2_delete_entry_id(handle_t *handle,
         struct ocfs2_dinode *di;
         struct ocfs2_inline_data *data;
 
-        ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
+        ret = ocfs2_read_inode_block(dir, &di_bh);
         if (ret) {
                 mlog_errno(ret);
                 goto out;
@@ -576,6 +715,16 @@ int __ocfs2_add_entry(handle_t *handle,
                 goto bail;
         }
 
+        /* We're guaranteed that we should have space, so we
+         * can't possibly have hit the trailer...right? */
+        mlog_bug_on_msg(ocfs2_skip_dir_trailer(dir, de, offset, size),
+                        "Hit dir trailer trying to insert %.*s "
+                        "(namelen %d) into directory %llu.  "
+                        "offset is %lu, trailer offset is %d\n",
+                        namelen, name, namelen,
+                        (unsigned long long)parent_fe_bh->b_blocknr,
+                        offset, ocfs2_dir_trailer_blk_off(dir->i_sb));
+
         if (ocfs2_dirent_would_fit(de, rec_len)) {
                 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
                 retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
@@ -584,8 +733,14 @@ int __ocfs2_add_entry(handle_t *handle,
                         goto bail;
                 }
 
-                status = ocfs2_journal_access(handle, dir, insert_bh,
-                                              OCFS2_JOURNAL_ACCESS_WRITE);
+                if (insert_bh == parent_fe_bh)
+                        status = ocfs2_journal_access_di(handle, dir,
+                                                         insert_bh,
+                                                         OCFS2_JOURNAL_ACCESS_WRITE);
+                else
+                        status = ocfs2_journal_access_db(handle, dir,
+                                                         insert_bh,
+                                                         OCFS2_JOURNAL_ACCESS_WRITE);
                 /* By now the buffer is marked for journaling */
                 offset += le16_to_cpu(de->rec_len);
                 if (le64_to_cpu(de->inode)) {
@@ -611,6 +766,7 @@ int __ocfs2_add_entry(handle_t *handle,
                         retval = 0;
                         goto bail;
                 }
+
                 offset += le16_to_cpu(de->rec_len);
                 de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
         }
@@ -636,7 +792,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
         struct ocfs2_inline_data *data;
         struct ocfs2_dir_entry *de;
 
-        ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+        ret = ocfs2_read_inode_block(inode, &di_bh);
         if (ret) {
                 mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
                      (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -724,7 +880,6 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
         int i, stored;
         struct buffer_head * bh, * tmp;
         struct ocfs2_dir_entry * de;
-        int err;
         struct super_block * sb = inode->i_sb;
         unsigned int ra_sectors = 16;
 
@@ -735,12 +890,8 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
 
         while (!error && !stored && *f_pos < i_size_read(inode)) {
                 blk = (*f_pos) >> sb->s_blocksize_bits;
-                bh = ocfs2_bread(inode, blk, &err, 0);
-                if (!bh) {
-                        mlog(ML_ERROR,
-                             "directory #%llu contains a hole at offset %lld\n",
-                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
-                             *f_pos);
+                if (ocfs2_read_dir_block(inode, blk, &bh, 0)) {
+                        /* Skip the corrupt dirblock and keep trying */
                         *f_pos += sb->s_blocksize - offset;
                         continue;
                 }
@@ -754,8 +905,10 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
                     || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) {
                         for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
                              i > 0; i--) {
-                                tmp = ocfs2_bread(inode, ++blk, &err, 1);
-                                brelse(tmp);
+                                tmp = NULL;
+                                if (!ocfs2_read_dir_block(inode, ++blk, &tmp,
+                                                          OCFS2_BH_READAHEAD))
+                                        brelse(tmp);
                         }
                         last_ra_blk = blk;
                         ra_sectors = 8;
@@ -828,6 +981,7 @@ revalidate:
                 }
                 offset = 0;
                 brelse(bh);
+                bh = NULL;
         }
 
         stored = 0;
@@ -1050,9 +1204,15 @@ int ocfs2_empty_dir(struct inode *inode)
         return !priv.seen_other;
 }
 
-static void ocfs2_fill_initial_dirents(struct inode *inode,
-                                       struct inode *parent,
-                                       char *start, unsigned int size)
+/*
+ * Fills "." and ".." dirents in a new directory block.  Returns dirent for
+ * "..", which might be used during creation of a directory with a trailing
+ * header.  It is otherwise safe to ignore the return code.
+ */
+static struct ocfs2_dir_entry *ocfs2_fill_initial_dirents(struct inode *inode,
+                                                          struct inode *parent,
+                                                          char *start,
+                                                          unsigned int size)
 {
         struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start;
 
@@ -1069,6 +1229,8 @@ static void ocfs2_fill_initial_dirents(struct inode *inode,
         de->name_len = 2;
         strcpy(de->name, "..");
         ocfs2_set_de_type(de, S_IFDIR);
+
+        return de;
 }
 
 /*
@@ -1086,8 +1248,8 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
         struct ocfs2_inline_data *data = &di->id2.i_data;
         unsigned int size = le16_to_cpu(data->id_count);
 
-        ret = ocfs2_journal_access(handle, inode, di_bh,
-                                   OCFS2_JOURNAL_ACCESS_WRITE);
+        ret = ocfs2_journal_access_di(handle, inode, di_bh,
+                                      OCFS2_JOURNAL_ACCESS_WRITE);
         if (ret) {
                 mlog_errno(ret);
                 goto out;
@@ -1121,10 +1283,15 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
                                  struct ocfs2_alloc_context *data_ac)
 {
         int status;
+        unsigned int size = osb->sb->s_blocksize;
         struct buffer_head *new_bh = NULL;
+        struct ocfs2_dir_entry *de;
 
         mlog_entry_void();
 
+        if (ocfs2_supports_dir_trailer(osb))
+                size = ocfs2_dir_trailer_blk_off(parent->i_sb);
+
         status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
                                      data_ac, NULL, &new_bh);
         if (status < 0) {
@@ -1134,16 +1301,17 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 
         ocfs2_set_new_buffer_uptodate(inode, new_bh);
 
-        status = ocfs2_journal_access(handle, inode, new_bh,
-                                      OCFS2_JOURNAL_ACCESS_CREATE);
+        status = ocfs2_journal_access_db(handle, inode, new_bh,
+                                         OCFS2_JOURNAL_ACCESS_CREATE);
         if (status < 0) {
                 mlog_errno(status);
                 goto bail;
         }
         memset(new_bh->b_data, 0, osb->sb->s_blocksize);
 
-        ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data,
-                                   osb->sb->s_blocksize);
+        de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size);
+        if (ocfs2_supports_dir_trailer(osb))
+                ocfs2_init_dir_trailer(inode, new_bh);
 
         status = ocfs2_journal_dirty(handle, new_bh);
         if (status < 0) {
@@ -1184,13 +1352,27 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
                                  data_ac);
 }
 
+/*
+ * Expand rec_len of the rightmost dirent in a directory block so that it
+ * contains the end of our valid space for dirents. We do this during
+ * expansion from an inline directory to one with extents. The first dir block
+ * in that case is taken from the inline data portion of the inode block.
+ *
+ * We add the dir trailer if this filesystem wants it.
+ */
 static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
-                                     unsigned int new_size)
+                                     struct super_block *sb)
 {
         struct ocfs2_dir_entry *de;
         struct ocfs2_dir_entry *prev_de;
         char *de_buf, *limit;
-        unsigned int bytes = new_size - old_size;
+        unsigned int new_size = sb->s_blocksize;
+        unsigned int bytes;
+
+        if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
+                new_size = ocfs2_dir_trailer_blk_off(sb);
+
+        bytes = new_size - old_size;
 
         limit = start + old_size;
         de_buf = start;
@@ -1216,9 +1398,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
                                   unsigned int blocks_wanted,
                                   struct buffer_head **first_block_bh)
 {
-        int ret, credits = OCFS2_INLINE_TO_EXTENTS_CREDITS;
         u32 alloc, bit_off, len;
         struct super_block *sb = dir->i_sb;
+        int ret, credits = ocfs2_inline_to_extents_credits(sb);
         u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits;
         struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
         struct ocfs2_inode_info *oi = OCFS2_I(dir);
@@ -1227,6 +1409,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
         handle_t *handle;
         struct ocfs2_extent_tree et;
+        int did_quota = 0;
 
         ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
 
@@ -1264,6 +1447,12 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
                 goto out_sem;
         }
 
+        if (vfs_dq_alloc_space_nodirty(dir,
+                                ocfs2_clusters_to_bytes(osb->sb, alloc))) {
+                ret = -EDQUOT;
+                goto out_commit;
+        }
+        did_quota = 1;
         /*
          * Try to claim as many clusters as the bitmap can give though
          * if we only get one now, that's enough to continue. The rest
@@ -1290,8 +1479,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 
         ocfs2_set_new_buffer_uptodate(dir, dirdata_bh);
 
-        ret = ocfs2_journal_access(handle, dir, dirdata_bh,
-                                   OCFS2_JOURNAL_ACCESS_CREATE);
+        ret = ocfs2_journal_access_db(handle, dir, dirdata_bh,
+                                      OCFS2_JOURNAL_ACCESS_CREATE);
         if (ret) {
                 mlog_errno(ret);
                 goto out_commit;
@@ -1300,8 +1489,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
         memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
         memset(dirdata_bh->b_data + i_size_read(dir), 0,
                sb->s_blocksize - i_size_read(dir));
-        ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir),
-                                 sb->s_blocksize);
+        ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), sb);
+        if (ocfs2_supports_dir_trailer(osb))
+                ocfs2_init_dir_trailer(dir, dirdata_bh);
 
         ret = ocfs2_journal_dirty(handle, dirdata_bh);
         if (ret) {
@@ -1317,8 +1507,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
          * We let the later dirent insert modify c/mtime - to the user
          * the data hasn't changed.
          */
-        ret = ocfs2_journal_access(handle, dir, di_bh,
-                                   OCFS2_JOURNAL_ACCESS_CREATE);
+        ret = ocfs2_journal_access_di(handle, dir, di_bh,
+                                      OCFS2_JOURNAL_ACCESS_CREATE);
         if (ret) {
                 mlog_errno(ret);
                 goto out_commit;
@@ -1386,6 +1576,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
         dirdata_bh = NULL;
 
 out_commit:
+        if (ret < 0 && did_quota)
+                vfs_dq_free_space_nodirty(dir,
+                        ocfs2_clusters_to_bytes(osb->sb, 2));
         ocfs2_commit_trans(osb, handle);
 
 out_sem:
@@ -1410,7 +1603,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
                                struct buffer_head **new_bh)
 {
         int status;
-        int extend;
+        int extend, did_quota = 0;
         u64 p_blkno, v_blkno;
 
         spin_lock(&OCFS2_I(dir)->ip_lock);
@@ -1420,6 +1613,13 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
         if (extend) {
                 u32 offset = OCFS2_I(dir)->ip_clusters;
 
+                if (vfs_dq_alloc_space_nodirty(dir,
+                                        ocfs2_clusters_to_bytes(sb, 1))) {
+                        status = -EDQUOT;
+                        goto bail;
+                }
+                did_quota = 1;
+
                 status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
                                               1, 0, parent_fe_bh, handle,
                                               data_ac, meta_ac, NULL);
@@ -1445,6 +1645,8 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
1445 } 1645 }
1446 status = 0; 1646 status = 0;
1447bail: 1647bail:
1648 if (did_quota && status < 0)
1649 vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
1448 mlog_exit(status); 1650 mlog_exit(status);
1449 return status; 1651 return status;
1450} 1652}
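
The quota hunks above all follow the same reserve-then-roll-back shape: claim the quota space before any allocation work that can fail, record the claim in a did_quota flag, and hand the space back on the shared error path. Below is a minimal userspace sketch of that shape; quota_alloc()/quota_free() and the byte budget are hypothetical stand-ins, not the vfs_dq_* API.

/*
 * Reserve-then-roll-back pattern, sketched with invented helpers.
 */
#include <stdio.h>

static long quota_remaining = 4096;   /* pretend per-user byte budget */

static int quota_alloc(long bytes)
{
	if (bytes > quota_remaining)
		return -1;            /* would be -EDQUOT in the kernel */
	quota_remaining -= bytes;
	return 0;
}

static void quota_free(long bytes)
{
	quota_remaining += bytes;
}

static int do_work_that_may_fail(void) { return -1; /* simulate failure */ }

static int extend_dir(long bytes)
{
	int ret, did_quota = 0;

	ret = quota_alloc(bytes);
	if (ret)
		goto out;             /* nothing to undo yet */
	did_quota = 1;

	ret = do_work_that_may_fail();
	if (ret)
		goto out;             /* reservation must be returned */
out:
	if (ret < 0 && did_quota)
		quota_free(bytes);    /* mirror of the alloc above */
	return ret;
}

int main(void)
{
	extend_dir(1024);
	printf("remaining after failed extend: %ld\n", quota_remaining);
	return 0;
}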
@@ -1569,16 +1771,22 @@ do_extend:
1569 1771
1570 ocfs2_set_new_buffer_uptodate(dir, new_bh); 1772 ocfs2_set_new_buffer_uptodate(dir, new_bh);
1571 1773
1572 status = ocfs2_journal_access(handle, dir, new_bh, 1774 status = ocfs2_journal_access_db(handle, dir, new_bh,
1573 OCFS2_JOURNAL_ACCESS_CREATE); 1775 OCFS2_JOURNAL_ACCESS_CREATE);
1574 if (status < 0) { 1776 if (status < 0) {
1575 mlog_errno(status); 1777 mlog_errno(status);
1576 goto bail; 1778 goto bail;
1577 } 1779 }
1578 memset(new_bh->b_data, 0, sb->s_blocksize); 1780 memset(new_bh->b_data, 0, sb->s_blocksize);
1781
1579 de = (struct ocfs2_dir_entry *) new_bh->b_data; 1782 de = (struct ocfs2_dir_entry *) new_bh->b_data;
1580 de->inode = 0; 1783 de->inode = 0;
1581 de->rec_len = cpu_to_le16(sb->s_blocksize); 1784 if (ocfs2_dir_has_trailer(dir)) {
1785 de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb));
1786 ocfs2_init_dir_trailer(dir, new_bh);
1787 } else {
1788 de->rec_len = cpu_to_le16(sb->s_blocksize);
1789 }
1582 status = ocfs2_journal_dirty(handle, new_bh); 1790 status = ocfs2_journal_dirty(handle, new_bh);
1583 if (status < 0) { 1791 if (status < 0) {
1584 mlog_errno(status); 1792 mlog_errno(status);
@@ -1620,11 +1828,21 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
1620 unsigned int *blocks_wanted) 1828 unsigned int *blocks_wanted)
1621{ 1829{
1622 int ret; 1830 int ret;
1831 struct super_block *sb = dir->i_sb;
1623 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1832 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1624 struct ocfs2_dir_entry *de, *last_de = NULL; 1833 struct ocfs2_dir_entry *de, *last_de = NULL;
1625 char *de_buf, *limit; 1834 char *de_buf, *limit;
1626 unsigned long offset = 0; 1835 unsigned long offset = 0;
1627 unsigned int rec_len, new_rec_len; 1836 unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize;
1837
1838 /*
1839 * This calculates how many free bytes we'd have in block zero, should
1840 * this function force expansion to an extent tree.
1841 */
1842 if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
1843 free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir);
1844 else
1845 free_space = dir->i_sb->s_blocksize - i_size_read(dir);
1628 1846
1629 de_buf = di->id2.i_data.id_data; 1847 de_buf = di->id2.i_data.id_data;
1630 limit = de_buf + i_size_read(dir); 1848 limit = de_buf + i_size_read(dir);
@@ -1641,6 +1859,11 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
1641 ret = -EEXIST; 1859 ret = -EEXIST;
1642 goto out; 1860 goto out;
1643 } 1861 }
1862 /*
1863 * No need to check for a trailing dirent record here as
1864 * they're not used for inline dirs.
1865 */
1866
1644 if (ocfs2_dirent_would_fit(de, rec_len)) { 1867 if (ocfs2_dirent_would_fit(de, rec_len)) {
1645 /* Ok, we found a spot. Return this bh and let 1868 /* Ok, we found a spot. Return this bh and let
1646 * the caller actually fill it in. */ 1869 * the caller actually fill it in. */
@@ -1661,7 +1884,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
1661 * dirent can be found. 1884 * dirent can be found.
1662 */ 1885 */
1663 *blocks_wanted = 1; 1886 *blocks_wanted = 1;
1664 new_rec_len = le16_to_cpu(last_de->rec_len) + (dir->i_sb->s_blocksize - i_size_read(dir)); 1887 new_rec_len = le16_to_cpu(last_de->rec_len) + free_space;
1665 if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len))) 1888 if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len)))
1666 *blocks_wanted = 2; 1889 *blocks_wanted = 2;
1667 1890
@@ -1679,9 +1902,10 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
1679 struct ocfs2_dir_entry *de; 1902 struct ocfs2_dir_entry *de;
1680 struct super_block *sb = dir->i_sb; 1903 struct super_block *sb = dir->i_sb;
1681 int status; 1904 int status;
1905 int blocksize = dir->i_sb->s_blocksize;
1682 1906
1683 bh = ocfs2_bread(dir, 0, &status, 0); 1907 status = ocfs2_read_dir_block(dir, 0, &bh, 0);
1684 if (!bh) { 1908 if (status) {
1685 mlog_errno(status); 1909 mlog_errno(status);
1686 goto bail; 1910 goto bail;
1687 } 1911 }
@@ -1702,11 +1926,10 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
1702 status = -ENOSPC; 1926 status = -ENOSPC;
1703 goto bail; 1927 goto bail;
1704 } 1928 }
1705 bh = ocfs2_bread(dir, 1929 status = ocfs2_read_dir_block(dir,
1706 offset >> sb->s_blocksize_bits, 1930 offset >> sb->s_blocksize_bits,
1707 &status, 1931 &bh, 0);
1708 0); 1932 if (status) {
1709 if (!bh) {
1710 mlog_errno(status); 1933 mlog_errno(status);
1711 goto bail; 1934 goto bail;
1712 } 1935 }
@@ -1721,6 +1944,11 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
1721 status = -EEXIST; 1944 status = -EEXIST;
1722 goto bail; 1945 goto bail;
1723 } 1946 }
1947
1948 if (ocfs2_skip_dir_trailer(dir, de, offset % blocksize,
1949 blocksize))
1950 goto next;
1951
1724 if (ocfs2_dirent_would_fit(de, rec_len)) { 1952 if (ocfs2_dirent_would_fit(de, rec_len)) {
1725 /* Ok, we found a spot. Return this bh and let 1953 /* Ok, we found a spot. Return this bh and let
1726 * the caller actually fill it in. */ 1954 * the caller actually fill it in. */
@@ -1729,6 +1957,7 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
1729 status = 0; 1957 status = 0;
1730 goto bail; 1958 goto bail;
1731 } 1959 }
1960next:
1732 offset += le16_to_cpu(de->rec_len); 1961 offset += le16_to_cpu(de->rec_len);
1733 de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len)); 1962 de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
1734 } 1963 }
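
The dir.c hunks above thread one idea through several functions: when a directory block carries a trailer at its tail, the last usable byte moves up by the trailer size, so free-space math in block zero shrinks accordingly and dirent scans must step over the record that covers the trailer. A small sketch under an invented layout, not the on-disk format:

#include <stdio.h>
#include <stdint.h>

#define BLOCKSIZE 4096

struct dir_trailer {          /* stand-in for ocfs2_dir_block_trailer */
	uint64_t blkno;
	uint16_t rec_len;     /* covers the trailer so scans skip it */
};

static unsigned trailer_blk_off(void)
{
	return BLOCKSIZE - sizeof(struct dir_trailer);
}

/* free bytes in block 0 once an inline dir is pushed to an extent */
static unsigned free_space(unsigned i_size, int has_trailer)
{
	return (has_trailer ? trailer_blk_off() : BLOCKSIZE) - i_size;
}

int main(void)
{
	printf("trailer starts at %u\n", trailer_blk_off());
	printf("free space with trailer: %u\n", free_space(200, 1));
	printf("free space without trailer: %u\n", free_space(200, 0));
	return 0;
}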
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index ce48b9080d87..c511e2e18e9f 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -83,4 +83,6 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
83 struct buffer_head *fe_bh, 83 struct buffer_head *fe_bh,
84 struct ocfs2_alloc_context *data_ac); 84 struct ocfs2_alloc_context *data_ac);
85 85
86struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
87 void *data);
86#endif /* OCFS2_DIR_H */ 88#endif /* OCFS2_DIR_H */
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 644bee55d8ba..d07ddbe4b283 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -275,6 +275,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
275 struct list_head *iter, *head=NULL; 275 struct list_head *iter, *head=NULL;
276 u64 cookie; 276 u64 cookie;
277 u32 flags; 277 u32 flags;
278 u8 node;
278 279
279 if (!dlm_grab(dlm)) { 280 if (!dlm_grab(dlm)) {
280 dlm_error(DLM_REJECTED); 281 dlm_error(DLM_REJECTED);
@@ -286,18 +287,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
286 287
287 name = past->name; 288 name = past->name;
288 locklen = past->namelen; 289 locklen = past->namelen;
289 cookie = be64_to_cpu(past->cookie); 290 cookie = past->cookie;
290 flags = be32_to_cpu(past->flags); 291 flags = be32_to_cpu(past->flags);
292 node = past->node_idx;
291 293
292 if (locklen > DLM_LOCKID_NAME_MAX) { 294 if (locklen > DLM_LOCKID_NAME_MAX) {
293 ret = DLM_IVBUFLEN; 295 ret = DLM_IVBUFLEN;
294 mlog(ML_ERROR, "Invalid name length in proxy ast handler!\n"); 296 mlog(ML_ERROR, "Invalid name length (%d) in proxy ast "
297 "handler!\n", locklen);
295 goto leave; 298 goto leave;
296 } 299 }
297 300
298 if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) == 301 if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
299 (LKM_PUT_LVB|LKM_GET_LVB)) { 302 (LKM_PUT_LVB|LKM_GET_LVB)) {
300 mlog(ML_ERROR, "both PUT and GET lvb specified\n"); 303 mlog(ML_ERROR, "Both PUT and GET lvb specified, (0x%x)\n",
304 flags);
301 ret = DLM_BADARGS; 305 ret = DLM_BADARGS;
302 goto leave; 306 goto leave;
303 } 307 }
@@ -310,22 +314,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
310 if (past->type != DLM_AST && 314 if (past->type != DLM_AST &&
311 past->type != DLM_BAST) { 315 past->type != DLM_BAST) {
312 mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu" 316 mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu"
313 "name=%.*s\n", past->type, 317 "name=%.*s, node=%u\n", past->type,
314 dlm_get_lock_cookie_node(cookie), 318 dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
315 dlm_get_lock_cookie_seq(cookie), 319 dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
316 locklen, name); 320 locklen, name, node);
317 ret = DLM_IVLOCKID; 321 ret = DLM_IVLOCKID;
318 goto leave; 322 goto leave;
319 } 323 }
320 324
321 res = dlm_lookup_lockres(dlm, name, locklen); 325 res = dlm_lookup_lockres(dlm, name, locklen);
322 if (!res) { 326 if (!res) {
323 mlog(0, "got %sast for unknown lockres! " 327 mlog(0, "Got %sast for unknown lockres! cookie=%u:%llu, "
324 "cookie=%u:%llu, name=%.*s, namelen=%u\n", 328 "name=%.*s, node=%u\n", (past->type == DLM_AST ? "" : "b"),
325 past->type == DLM_AST ? "" : "b", 329 dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
326 dlm_get_lock_cookie_node(cookie), 330 dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
327 dlm_get_lock_cookie_seq(cookie), 331 locklen, name, node);
328 locklen, name, locklen);
329 ret = DLM_IVLOCKID; 332 ret = DLM_IVLOCKID;
330 goto leave; 333 goto leave;
331 } 334 }
@@ -337,12 +340,12 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
337 340
338 spin_lock(&res->spinlock); 341 spin_lock(&res->spinlock);
339 if (res->state & DLM_LOCK_RES_RECOVERING) { 342 if (res->state & DLM_LOCK_RES_RECOVERING) {
340 mlog(0, "responding with DLM_RECOVERING!\n"); 343 mlog(0, "Responding with DLM_RECOVERING!\n");
341 ret = DLM_RECOVERING; 344 ret = DLM_RECOVERING;
342 goto unlock_out; 345 goto unlock_out;
343 } 346 }
344 if (res->state & DLM_LOCK_RES_MIGRATING) { 347 if (res->state & DLM_LOCK_RES_MIGRATING) {
345 mlog(0, "responding with DLM_MIGRATING!\n"); 348 mlog(0, "Responding with DLM_MIGRATING!\n");
346 ret = DLM_MIGRATING; 349 ret = DLM_MIGRATING;
347 goto unlock_out; 350 goto unlock_out;
348 } 351 }
@@ -351,7 +354,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
351 lock = NULL; 354 lock = NULL;
352 list_for_each(iter, head) { 355 list_for_each(iter, head) {
353 lock = list_entry (iter, struct dlm_lock, list); 356 lock = list_entry (iter, struct dlm_lock, list);
354 if (be64_to_cpu(lock->ml.cookie) == cookie) 357 if (lock->ml.cookie == cookie)
355 goto do_ast; 358 goto do_ast;
356 } 359 }
357 360
@@ -363,15 +366,15 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
363 366
364 list_for_each(iter, head) { 367 list_for_each(iter, head) {
365 lock = list_entry (iter, struct dlm_lock, list); 368 lock = list_entry (iter, struct dlm_lock, list);
366 if (be64_to_cpu(lock->ml.cookie) == cookie) 369 if (lock->ml.cookie == cookie)
367 goto do_ast; 370 goto do_ast;
368 } 371 }
369 372
370 mlog(0, "got %sast for unknown lock! cookie=%u:%llu, " 373 mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, "
371 "name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b", 374 "node=%u\n", past->type == DLM_AST ? "" : "b",
372 dlm_get_lock_cookie_node(cookie), 375 dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
373 dlm_get_lock_cookie_seq(cookie), 376 dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
374 locklen, name, locklen); 377 locklen, name, node);
375 378
376 ret = DLM_NORMAL; 379 ret = DLM_NORMAL;
377unlock_out: 380unlock_out:
@@ -383,8 +386,8 @@ do_ast:
383 if (past->type == DLM_AST) { 386 if (past->type == DLM_AST) {
384 /* do not alter lock refcount. switching lists. */ 387 /* do not alter lock refcount. switching lists. */
385 list_move_tail(&lock->list, &res->granted); 388 list_move_tail(&lock->list, &res->granted);
386 mlog(0, "ast: adding to granted list... type=%d, " 389 mlog(0, "ast: Adding to granted list... type=%d, "
387 "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); 390 "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
388 if (lock->ml.convert_type != LKM_IVMODE) { 391 if (lock->ml.convert_type != LKM_IVMODE) {
389 lock->ml.type = lock->ml.convert_type; 392 lock->ml.type = lock->ml.convert_type;
390 lock->ml.convert_type = LKM_IVMODE; 393 lock->ml.convert_type = LKM_IVMODE;
@@ -408,7 +411,6 @@ do_ast:
408 dlm_do_local_bast(dlm, res, lock, past->blocked_type); 411 dlm_do_local_bast(dlm, res, lock, past->blocked_type);
409 412
410leave: 413leave:
411
412 if (res) 414 if (res)
413 dlm_lockres_put(res); 415 dlm_lockres_put(res);
414 416
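
The dlmast.c change above keeps the lock cookie in wire (big-endian) byte order for the whole handler, so comparisons against the stored lock->ml.cookie need no byte swap per list element; conversion to host order happens only at the print sites. A sketch of that convention, assuming glibc's <endian.h> for htobe64()/be64toh():

#include <endian.h>
#include <stdint.h>
#include <stdio.h>

struct lock { uint64_t cookie_be; };  /* stored big-endian, like ml.cookie */

static int cookie_matches(const struct lock *l, uint64_t wire_cookie_be)
{
	/* both sides keep wire byte order: plain equality, no swap */
	return l->cookie_be == wire_cookie_be;
}

int main(void)
{
	struct lock l = { .cookie_be = htobe64(0x0000000700000001ULL) };
	uint64_t from_msg = htobe64(0x0000000700000001ULL);

	if (cookie_matches(&l, from_msg))
		printf("cookie=%llu (host order, for display only)\n",
		       (unsigned long long)be64toh(from_msg));
	return 0;
}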
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index d5a86fb81a49..bb53714813ab 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -140,6 +140,7 @@ struct dlm_ctxt
140 unsigned int purge_count; 140 unsigned int purge_count;
141 spinlock_t spinlock; 141 spinlock_t spinlock;
142 spinlock_t ast_lock; 142 spinlock_t ast_lock;
143 spinlock_t track_lock;
143 char *name; 144 char *name;
144 u8 node_num; 145 u8 node_num;
145 u32 key; 146 u32 key;
@@ -316,6 +317,8 @@ struct dlm_lock_resource
316 * put on a list for the dlm thread to run. */ 317 * put on a list for the dlm thread to run. */
317 unsigned long last_used; 318 unsigned long last_used;
318 319
320 struct dlm_ctxt *dlm;
321
319 unsigned migration_pending:1; 322 unsigned migration_pending:1;
320 atomic_t asts_reserved; 323 atomic_t asts_reserved;
321 spinlock_t spinlock; 324 spinlock_t spinlock;
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 1b81dcba175d..b32f60a5acfb 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -630,43 +630,38 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
630{ 630{
631 struct debug_lockres *dl = m->private; 631 struct debug_lockres *dl = m->private;
632 struct dlm_ctxt *dlm = dl->dl_ctxt; 632 struct dlm_ctxt *dlm = dl->dl_ctxt;
633 struct dlm_lock_resource *oldres = dl->dl_res;
633 struct dlm_lock_resource *res = NULL; 634 struct dlm_lock_resource *res = NULL;
635 struct list_head *track_list;
634 636
635 spin_lock(&dlm->spinlock); 637 spin_lock(&dlm->track_lock);
638 if (oldres)
639 track_list = &oldres->tracking;
640 else
641 track_list = &dlm->tracking_list;
636 642
637 if (dl->dl_res) { 643 list_for_each_entry(res, track_list, tracking) {
638 list_for_each_entry(res, &dl->dl_res->tracking, tracking) { 644 if (&res->tracking == &dlm->tracking_list)
639 if (dl->dl_res) { 645 res = NULL;
640 dlm_lockres_put(dl->dl_res); 646 else
641 dl->dl_res = NULL;
642 }
643 if (&res->tracking == &dlm->tracking_list) {
644 mlog(0, "End of list found, %p\n", res);
645 dl = NULL;
646 break;
647 }
648 dlm_lockres_get(res); 647 dlm_lockres_get(res);
649 dl->dl_res = res; 648 break;
650 break;
651 }
652 } else {
653 if (!list_empty(&dlm->tracking_list)) {
654 list_for_each_entry(res, &dlm->tracking_list, tracking)
655 break;
656 dlm_lockres_get(res);
657 dl->dl_res = res;
658 } else
659 dl = NULL;
660 } 649 }
650 spin_unlock(&dlm->track_lock);
661 651
662 if (dl) { 652 if (oldres)
663 spin_lock(&dl->dl_res->spinlock); 653 dlm_lockres_put(oldres);
664 dump_lockres(dl->dl_res, dl->dl_buf, dl->dl_len - 1);
665 spin_unlock(&dl->dl_res->spinlock);
666 }
667 654
668 spin_unlock(&dlm->spinlock); 655 dl->dl_res = res;
656
657 if (res) {
658 spin_lock(&res->spinlock);
659 dump_lockres(res, dl->dl_buf, dl->dl_len - 1);
660 spin_unlock(&res->spinlock);
661 } else
662 dl = NULL;
669 663
664 /* passed to seq_show */
670 return dl; 665 return dl;
671} 666}
672 667
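
The lockres_seq_start() rework above boils down to one discipline: walk the tracking list under its own dedicated lock (the new track_lock), take a reference on the element you intend to return, then drop the lock before doing anything slow with it. A sketch with a pthread mutex and a hand-rolled refcount standing in for the spinlock and kref:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct res {
	int refs;
	struct res *next;         /* singly linked tracking list */
	const char *name;
};

static pthread_mutex_t track_lock = PTHREAD_MUTEX_INITIALIZER;
static struct res *tracking_list;

static void res_get(struct res *r) { r->refs++; }  /* under track_lock */
static void res_put(struct res *r) { if (--r->refs == 0) free(r); }

/* return the first tracked res with a reference held, or NULL */
static struct res *first_tracked(void)
{
	struct res *r;

	pthread_mutex_lock(&track_lock);
	r = tracking_list;
	if (r)
		res_get(r);        /* pin it before the lock is dropped */
	pthread_mutex_unlock(&track_lock);
	return r;
}

int main(void)
{
	struct res node = { .refs = 1, .next = NULL, .name = "lockres0" };

	tracking_list = &node;
	struct res *cur = first_tracked();
	if (cur) {
		printf("dumping %s\n", cur->name);
		res_put(cur);      /* refs 2 -> 1; the list still owns it */
	}
	return 0;
}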
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 63f8125824e8..d8d578f45613 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1550,6 +1550,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1550 spin_lock_init(&dlm->spinlock); 1550 spin_lock_init(&dlm->spinlock);
1551 spin_lock_init(&dlm->master_lock); 1551 spin_lock_init(&dlm->master_lock);
1552 spin_lock_init(&dlm->ast_lock); 1552 spin_lock_init(&dlm->ast_lock);
1553 spin_lock_init(&dlm->track_lock);
1553 INIT_LIST_HEAD(&dlm->list); 1554 INIT_LIST_HEAD(&dlm->list);
1554 INIT_LIST_HEAD(&dlm->dirty_list); 1555 INIT_LIST_HEAD(&dlm->dirty_list);
1555 INIT_LIST_HEAD(&dlm->reco.resources); 1556 INIT_LIST_HEAD(&dlm->reco.resources);
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 6f7a77d54020..1c9efb406a96 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -341,7 +341,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
341 inode->i_mode = mode; 341 inode->i_mode = mode;
342 inode->i_uid = current_fsuid(); 342 inode->i_uid = current_fsuid();
343 inode->i_gid = current_fsgid(); 343 inode->i_gid = current_fsgid();
344 inode->i_blocks = 0;
345 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; 344 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
346 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 345 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
347 inc_nlink(inode); 346 inc_nlink(inode);
@@ -367,7 +366,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
367 inode->i_mode = mode; 366 inode->i_mode = mode;
368 inode->i_uid = current_fsuid(); 367 inode->i_uid = current_fsuid();
369 inode->i_gid = current_fsgid(); 368 inode->i_gid = current_fsgid();
370 inode->i_blocks = 0;
371 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; 369 inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
372 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 370 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
373 371
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 44f87caf3683..54e182a27caf 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -505,8 +505,10 @@ void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
505static void dlm_lockres_release(struct kref *kref) 505static void dlm_lockres_release(struct kref *kref)
506{ 506{
507 struct dlm_lock_resource *res; 507 struct dlm_lock_resource *res;
508 struct dlm_ctxt *dlm;
508 509
509 res = container_of(kref, struct dlm_lock_resource, refs); 510 res = container_of(kref, struct dlm_lock_resource, refs);
511 dlm = res->dlm;
510 512
511 /* This should not happen -- all lockres' have a name 513 /* This should not happen -- all lockres' have a name
512 * associated with them at init time. */ 514 * associated with them at init time. */
@@ -515,6 +517,7 @@ static void dlm_lockres_release(struct kref *kref)
515 mlog(0, "destroying lockres %.*s\n", res->lockname.len, 517 mlog(0, "destroying lockres %.*s\n", res->lockname.len,
516 res->lockname.name); 518 res->lockname.name);
517 519
520 spin_lock(&dlm->track_lock);
518 if (!list_empty(&res->tracking)) 521 if (!list_empty(&res->tracking))
519 list_del_init(&res->tracking); 522 list_del_init(&res->tracking);
520 else { 523 else {
@@ -522,6 +525,9 @@ static void dlm_lockres_release(struct kref *kref)
522 res->lockname.len, res->lockname.name); 525 res->lockname.len, res->lockname.name);
523 dlm_print_one_lock_resource(res); 526 dlm_print_one_lock_resource(res);
524 } 527 }
528 spin_unlock(&dlm->track_lock);
529
530 dlm_put(dlm);
525 531
526 if (!hlist_unhashed(&res->hash_node) || 532 if (!hlist_unhashed(&res->hash_node) ||
527 !list_empty(&res->granted) || 533 !list_empty(&res->granted) ||
@@ -595,6 +601,10 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
595 res->migration_pending = 0; 601 res->migration_pending = 0;
596 res->inflight_locks = 0; 602 res->inflight_locks = 0;
597 603
604 /* put in dlm_lockres_release */
605 dlm_grab(dlm);
606 res->dlm = dlm;
607
598 kref_init(&res->refs); 608 kref_init(&res->refs);
599 609
600 /* just for consistency */ 610 /* just for consistency */
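
The res->dlm back-pointer added above only works because each resource pins its parent context at init time (the dlm_grab() call) so the release path can still take dlm->track_lock to unlink itself, then drop the pin with dlm_put(). A sketch of that lifetime rule with plain counters in place of kref and the spinlock:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct ctxt {
	int refs;
	pthread_mutex_t track_lock;
};

struct res {
	int refs;
	struct ctxt *owner;       /* pinned for the lifetime of res */
};

static void ctxt_put(struct ctxt *c) { if (--c->refs == 0) free(c); }

static struct res *res_new(struct ctxt *c)
{
	struct res *r = malloc(sizeof(*r));

	r->refs = 1;
	c->refs++;                /* the "put in release" pin */
	r->owner = c;
	return r;
}

static void res_release(struct res *r)
{
	struct ctxt *c = r->owner;

	pthread_mutex_lock(&c->track_lock);
	/* unlink from c's tracking list here */
	pthread_mutex_unlock(&c->track_lock);

	free(r);
	ctxt_put(c);              /* drop the pin taken in res_new() */
}

static void res_put(struct res *r) { if (--r->refs == 0) res_release(r); }

int main(void)
{
	struct ctxt *c = malloc(sizeof(*c));

	c->refs = 1;
	pthread_mutex_init(&c->track_lock, NULL);

	struct res *r = res_new(c);
	res_put(r);               /* releases r, then drops its ctxt pin */
	ctxt_put(c);              /* caller's own reference */
	printf("done\n");
	return 0;
}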
@@ -722,14 +732,21 @@ lookup:
722 if (tmpres) { 732 if (tmpres) {
723 int dropping_ref = 0; 733 int dropping_ref = 0;
724 734
735 spin_unlock(&dlm->spinlock);
736
725 spin_lock(&tmpres->spinlock); 737 spin_lock(&tmpres->spinlock);
738 /* We wait for the other thread that is mastering the resource */
739 if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
740 __dlm_wait_on_lockres(tmpres);
741 BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
742 }
743
726 if (tmpres->owner == dlm->node_num) { 744 if (tmpres->owner == dlm->node_num) {
727 BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF); 745 BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF);
728 dlm_lockres_grab_inflight_ref(dlm, tmpres); 746 dlm_lockres_grab_inflight_ref(dlm, tmpres);
729 } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) 747 } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF)
730 dropping_ref = 1; 748 dropping_ref = 1;
731 spin_unlock(&tmpres->spinlock); 749 spin_unlock(&tmpres->spinlock);
732 spin_unlock(&dlm->spinlock);
733 750
734 /* wait until done messaging the master, drop our ref to allow 751 /* wait until done messaging the master, drop our ref to allow
735 * the lockres to be purged, start over. */ 752 * the lockres to be purged, start over. */
@@ -2949,7 +2966,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2949 struct dlm_node_iter *iter) 2966 struct dlm_node_iter *iter)
2950{ 2967{
2951 struct dlm_migrate_request migrate; 2968 struct dlm_migrate_request migrate;
2952 int ret, status = 0; 2969 int ret, skip, status = 0;
2953 int nodenum; 2970 int nodenum;
2954 2971
2955 memset(&migrate, 0, sizeof(migrate)); 2972 memset(&migrate, 0, sizeof(migrate));
@@ -2966,12 +2983,27 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2966 nodenum == new_master) 2983 nodenum == new_master)
2967 continue; 2984 continue;
2968 2985
2986 /* We could race exit domain. If exited, skip. */
2987 spin_lock(&dlm->spinlock);
2988 skip = (!test_bit(nodenum, dlm->domain_map));
2989 spin_unlock(&dlm->spinlock);
2990 if (skip) {
2991 clear_bit(nodenum, iter->node_map);
2992 continue;
2993 }
2994
2969 ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key, 2995 ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
2970 &migrate, sizeof(migrate), nodenum, 2996 &migrate, sizeof(migrate), nodenum,
2971 &status); 2997 &status);
2972 if (ret < 0) 2998 if (ret < 0) {
2973 mlog_errno(ret); 2999 mlog(0, "migrate_request returned %d!\n", ret);
2974 else if (status < 0) { 3000 if (!dlm_is_host_down(ret)) {
3001 mlog(ML_ERROR, "unhandled error=%d!\n", ret);
3002 BUG();
3003 }
3004 clear_bit(nodenum, iter->node_map);
3005 ret = 0;
3006 } else if (status < 0) {
2975 mlog(0, "migrate request (node %u) returned %d!\n", 3007 mlog(0, "migrate request (node %u) returned %d!\n",
2976 nodenum, status); 3008 nodenum, status);
2977 ret = status; 3009 ret = status;
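
The dlm_do_migrate_request() hunk above closes a race with domain exit: before messaging a node, re-check the membership bitmap under the domain lock, and if the node already left (or the send fails with a host-down error), clear it from the iteration map and continue rather than failing the whole broadcast. A sketch with simplified bitmap helpers:

#include <pthread.h>
#include <stdio.h>

#define MAX_NODES 32

static pthread_mutex_t domain_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long domain_map = 0x5;   /* nodes 0 and 2 are alive */

static int node_in_domain(int node)
{
	int present;

	pthread_mutex_lock(&domain_lock);
	present = !!(domain_map & (1UL << node));
	pthread_mutex_unlock(&domain_lock);
	return present;
}

int main(void)
{
	unsigned long iter_map = 0x7;        /* nodes 0, 1, 2 to contact */

	for (int node = 0; node < MAX_NODES; node++) {
		if (!(iter_map & (1UL << node)))
			continue;
		if (!node_in_domain(node)) {
			iter_map &= ~(1UL << node);  /* raced exit: skip */
			printf("node %d left the domain, skipping\n", node);
			continue;
		}
		printf("sending migrate request to node %d\n", node);
	}
	return 0;
}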
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 4060bb328bc8..d1295203029f 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -181,7 +181,8 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
181 181
182 spin_lock(&res->spinlock); 182 spin_lock(&res->spinlock);
183 /* This ensures that clear refmap is sent after the set */ 183 /* This ensures that clear refmap is sent after the set */
184 __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); 184 __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_SETREF_INPROG |
185 DLM_LOCK_RES_MIGRATING));
185 spin_unlock(&res->spinlock); 186 spin_unlock(&res->spinlock);
186 187
187 /* clear our bit from the master's refmap, ignore errors */ 188 /* clear our bit from the master's refmap, ignore errors */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 6e6cc0a2e5f7..b0c4cadd4c45 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -32,6 +32,7 @@
32#include <linux/debugfs.h> 32#include <linux/debugfs.h>
33#include <linux/seq_file.h> 33#include <linux/seq_file.h>
34#include <linux/time.h> 34#include <linux/time.h>
35#include <linux/quotaops.h>
35 36
36#define MLOG_MASK_PREFIX ML_DLM_GLUE 37#define MLOG_MASK_PREFIX ML_DLM_GLUE
37#include <cluster/masklog.h> 38#include <cluster/masklog.h>
@@ -51,6 +52,7 @@
51#include "slot_map.h" 52#include "slot_map.h"
52#include "super.h" 53#include "super.h"
53#include "uptodate.h" 54#include "uptodate.h"
55#include "quota.h"
54 56
55#include "buffer_head_io.h" 57#include "buffer_head_io.h"
56 58
@@ -68,6 +70,7 @@ struct ocfs2_mask_waiter {
68static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres); 70static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
69static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres); 71static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
70static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres); 72static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres);
73static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres);
71 74
72/* 75/*
73 * Return value from ->downconvert_worker functions. 76 * Return value from ->downconvert_worker functions.
@@ -102,6 +105,7 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
102static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb, 105static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
103 struct ocfs2_lock_res *lockres); 106 struct ocfs2_lock_res *lockres);
104 107
108static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres);
105 109
106#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres) 110#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
107 111
@@ -111,8 +115,7 @@ static void ocfs2_dump_meta_lvb_info(u64 level,
111 unsigned int line, 115 unsigned int line,
112 struct ocfs2_lock_res *lockres) 116 struct ocfs2_lock_res *lockres)
113{ 117{
114 struct ocfs2_meta_lvb *lvb = 118 struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
115 (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
116 119
117 mlog(level, "LVB information for %s (called from %s:%u):\n", 120 mlog(level, "LVB information for %s (called from %s:%u):\n",
118 lockres->l_name, function, line); 121 lockres->l_name, function, line);
@@ -258,6 +261,12 @@ static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
258 .flags = 0, 261 .flags = 0,
259}; 262};
260 263
264static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = {
265 .set_lvb = ocfs2_set_qinfo_lvb,
266 .get_osb = ocfs2_get_qinfo_osb,
267 .flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB,
268};
269
261static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) 270static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
262{ 271{
263 return lockres->l_type == OCFS2_LOCK_TYPE_META || 272 return lockres->l_type == OCFS2_LOCK_TYPE_META ||
@@ -279,6 +288,13 @@ static inline struct ocfs2_dentry_lock *ocfs2_lock_res_dl(struct ocfs2_lock_res
279 return (struct ocfs2_dentry_lock *)lockres->l_priv; 288 return (struct ocfs2_dentry_lock *)lockres->l_priv;
280} 289}
281 290
291static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_res *lockres)
292{
293 BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_QINFO);
294
295 return (struct ocfs2_mem_dqinfo *)lockres->l_priv;
296}
297
282static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres) 298static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres)
283{ 299{
284 if (lockres->l_ops->get_osb) 300 if (lockres->l_ops->get_osb)
@@ -507,6 +523,13 @@ static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
507 return OCFS2_SB(inode->i_sb); 523 return OCFS2_SB(inode->i_sb);
508} 524}
509 525
526static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres)
527{
528 struct ocfs2_mem_dqinfo *info = lockres->l_priv;
529
530 return OCFS2_SB(info->dqi_gi.dqi_sb);
531}
532
510static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres) 533static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres)
511{ 534{
512 struct ocfs2_file_private *fp = lockres->l_priv; 535 struct ocfs2_file_private *fp = lockres->l_priv;
@@ -609,6 +632,17 @@ void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
609 lockres->l_flags |= OCFS2_LOCK_NOCACHE; 632 lockres->l_flags |= OCFS2_LOCK_NOCACHE;
610} 633}
611 634
635void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
636 struct ocfs2_mem_dqinfo *info)
637{
638 ocfs2_lock_res_init_once(lockres);
639 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_QINFO, info->dqi_gi.dqi_type,
640 0, lockres->l_name);
641 ocfs2_lock_res_init_common(OCFS2_SB(info->dqi_gi.dqi_sb), lockres,
642 OCFS2_LOCK_TYPE_QINFO, &ocfs2_qinfo_lops,
643 info);
644}
645
612void ocfs2_lock_res_free(struct ocfs2_lock_res *res) 646void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
613{ 647{
614 mlog_entry_void(); 648 mlog_entry_void();
@@ -1290,7 +1324,7 @@ again:
1290 goto out; 1324 goto out;
1291 } 1325 }
1292 1326
1293 mlog(0, "lock %s, successfull return from ocfs2_dlm_lock\n", 1327 mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n",
1294 lockres->l_name); 1328 lockres->l_name);
1295 1329
1296 /* At this point we've gone inside the dlm and need to 1330 /* At this point we've gone inside the dlm and need to
@@ -1829,7 +1863,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
1829 1863
1830 mlog_entry_void(); 1864 mlog_entry_void();
1831 1865
1832 lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); 1866 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
1833 1867
1834 /* 1868 /*
1835 * Invalidate the LVB of a deleted inode - this way other 1869 * Invalidate the LVB of a deleted inode - this way other
@@ -1881,7 +1915,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1881 1915
1882 mlog_meta_lvb(0, lockres); 1916 mlog_meta_lvb(0, lockres);
1883 1917
1884 lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); 1918 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
1885 1919
1886 /* We're safe here without the lockres lock... */ 1920 /* We're safe here without the lockres lock... */
1887 spin_lock(&oi->ip_lock); 1921 spin_lock(&oi->ip_lock);
@@ -1916,8 +1950,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1916static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode, 1950static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
1917 struct ocfs2_lock_res *lockres) 1951 struct ocfs2_lock_res *lockres)
1918{ 1952{
1919 struct ocfs2_meta_lvb *lvb = 1953 struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
1920 (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
1921 1954
1922 if (lvb->lvb_version == OCFS2_LVB_VERSION 1955 if (lvb->lvb_version == OCFS2_LVB_VERSION
1923 && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation) 1956 && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
@@ -2024,7 +2057,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
2024 } else { 2057 } else {
2025 /* Boo, we have to go to disk. */ 2058 /* Boo, we have to go to disk. */
2026 /* read bh, cast, ocfs2_refresh_inode */ 2059 /* read bh, cast, ocfs2_refresh_inode */
2027 status = ocfs2_read_block(inode, oi->ip_blkno, bh); 2060 status = ocfs2_read_inode_block(inode, bh);
2028 if (status < 0) { 2061 if (status < 0) {
2029 mlog_errno(status); 2062 mlog_errno(status);
2030 goto bail_refresh; 2063 goto bail_refresh;
@@ -2032,18 +2065,14 @@ static int ocfs2_inode_lock_update(struct inode *inode,
2032 fe = (struct ocfs2_dinode *) (*bh)->b_data; 2065 fe = (struct ocfs2_dinode *) (*bh)->b_data;
2033 2066
2034 /* This is a good chance to make sure we're not 2067 /* This is a good chance to make sure we're not
2035 * locking an invalid object. 2068 * locking an invalid object. ocfs2_read_inode_block()
2069 * already checked that the inode block is sane.
2036 * 2070 *
2037 * We bug on a stale inode here because we checked 2071 * We bug on a stale inode here because we checked
2038 * above whether it was wiped from disk. The wiping 2072 * above whether it was wiped from disk. The wiping
2039 * node provides a guarantee that we receive that 2073 * node provides a guarantee that we receive that
2040 * message and can mark the inode before dropping any 2074 * message and can mark the inode before dropping any
2041 * locks associated with it. */ 2075 * locks associated with it. */
2042 if (!OCFS2_IS_VALID_DINODE(fe)) {
2043 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
2044 status = -EIO;
2045 goto bail_refresh;
2046 }
2047 mlog_bug_on_msg(inode->i_generation != 2076 mlog_bug_on_msg(inode->i_generation !=
2048 le32_to_cpu(fe->i_generation), 2077 le32_to_cpu(fe->i_generation),
2049 "Invalid dinode %llu disk generation: %u " 2078 "Invalid dinode %llu disk generation: %u "
@@ -2085,7 +2114,7 @@ static int ocfs2_assign_bh(struct inode *inode,
2085 return 0; 2114 return 0;
2086 } 2115 }
2087 2116
2088 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, ret_bh); 2117 status = ocfs2_read_inode_block(inode, ret_bh);
2089 if (status < 0) 2118 if (status < 0)
2090 mlog_errno(status); 2119 mlog_errno(status);
2091 2120
@@ -2922,7 +2951,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
2922 ocfs2_dlm_dump_lksb(&lockres->l_lksb); 2951 ocfs2_dlm_dump_lksb(&lockres->l_lksb);
2923 BUG(); 2952 BUG();
2924 } 2953 }
2925 mlog(0, "lock %s, successfull return from ocfs2_dlm_unlock\n", 2954 mlog(0, "lock %s, successful return from ocfs2_dlm_unlock\n",
2926 lockres->l_name); 2955 lockres->l_name);
2927 2956
2928 ocfs2_wait_on_busy_lock(lockres); 2957 ocfs2_wait_on_busy_lock(lockres);
@@ -3449,6 +3478,117 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
3449 return UNBLOCK_CONTINUE_POST; 3478 return UNBLOCK_CONTINUE_POST;
3450} 3479}
3451 3480
3481static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
3482{
3483 struct ocfs2_qinfo_lvb *lvb;
3484 struct ocfs2_mem_dqinfo *oinfo = ocfs2_lock_res_qinfo(lockres);
3485 struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
3486 oinfo->dqi_gi.dqi_type);
3487
3488 mlog_entry_void();
3489
3490 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
3491 lvb->lvb_version = OCFS2_QINFO_LVB_VERSION;
3492 lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace);
3493 lvb->lvb_igrace = cpu_to_be32(info->dqi_igrace);
3494 lvb->lvb_syncms = cpu_to_be32(oinfo->dqi_syncms);
3495 lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks);
3496 lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk);
3497 lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry);
3498
3499 mlog_exit_void();
3500}
3501
3502void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex)
3503{
3504 struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
3505 struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
3506 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
3507
3508 mlog_entry_void();
3509 if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
3510 ocfs2_cluster_unlock(osb, lockres, level);
3511 mlog_exit_void();
3512}
3513
3514static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
3515{
3516 struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
3517 oinfo->dqi_gi.dqi_type);
3518 struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
3519 struct ocfs2_qinfo_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
3520 struct buffer_head *bh = NULL;
3521 struct ocfs2_global_disk_dqinfo *gdinfo;
3522 int status = 0;
3523
3524 if (lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) {
3525 info->dqi_bgrace = be32_to_cpu(lvb->lvb_bgrace);
3526 info->dqi_igrace = be32_to_cpu(lvb->lvb_igrace);
3527 oinfo->dqi_syncms = be32_to_cpu(lvb->lvb_syncms);
3528 oinfo->dqi_gi.dqi_blocks = be32_to_cpu(lvb->lvb_blocks);
3529 oinfo->dqi_gi.dqi_free_blk = be32_to_cpu(lvb->lvb_free_blk);
3530 oinfo->dqi_gi.dqi_free_entry =
3531 be32_to_cpu(lvb->lvb_free_entry);
3532 } else {
3533 status = ocfs2_read_quota_block(oinfo->dqi_gqinode, 0, &bh);
3534 if (status) {
3535 mlog_errno(status);
3536 goto bail;
3537 }
3538 gdinfo = (struct ocfs2_global_disk_dqinfo *)
3539 (bh->b_data + OCFS2_GLOBAL_INFO_OFF);
3540 info->dqi_bgrace = le32_to_cpu(gdinfo->dqi_bgrace);
3541 info->dqi_igrace = le32_to_cpu(gdinfo->dqi_igrace);
3542 oinfo->dqi_syncms = le32_to_cpu(gdinfo->dqi_syncms);
3543 oinfo->dqi_gi.dqi_blocks = le32_to_cpu(gdinfo->dqi_blocks);
3544 oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(gdinfo->dqi_free_blk);
3545 oinfo->dqi_gi.dqi_free_entry =
3546 le32_to_cpu(gdinfo->dqi_free_entry);
3547 brelse(bh);
3548 ocfs2_track_lock_refresh(lockres);
3549 }
3550
3551bail:
3552 return status;
3553}
3554
 3555/* Lock quota info. This function expects at least a shared lock on the quota
3556 * so that we can safely refresh quota info from disk. */
3557int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex)
3558{
3559 struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
3560 struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
3561 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
3562 int status = 0;
3563
3564 mlog_entry_void();
3565
3566 /* On RO devices, locking really isn't needed... */
3567 if (ocfs2_is_hard_readonly(osb)) {
3568 if (ex)
3569 status = -EROFS;
3570 goto bail;
3571 }
3572 if (ocfs2_mount_local(osb))
3573 goto bail;
3574
3575 status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
3576 if (status < 0) {
3577 mlog_errno(status);
3578 goto bail;
3579 }
3580 if (!ocfs2_should_refresh_lock_res(lockres))
3581 goto bail;
3582 /* OK, we have the lock but we need to refresh the quota info */
3583 status = ocfs2_refresh_qinfo(oinfo);
3584 if (status)
3585 ocfs2_qinfo_unlock(oinfo, ex);
3586 ocfs2_complete_lock_res_refresh(lockres, status);
3587bail:
3588 mlog_exit(status);
3589 return status;
3590}
3591
3452/* 3592/*
3453 * This is the filesystem locking protocol. It provides the lock handling 3593 * This is the filesystem locking protocol. It provides the lock handling
3454 * hooks for the underlying DLM. It has a maximum version number. 3594 * hooks for the underlying DLM. It has a maximum version number.
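
The quota-info locking added above rides on the DLM's lock value block: a small fixed-size blob travels with the lock, and if its version byte matches, the cached grace times and allocation counters are trusted, otherwise the info is re-read from disk and the LVB repopulated (the struct itself is declared in the dlmglue.h hunk below). A sketch of that version-checked cache, mirroring the __be32 packing and assuming glibc's <endian.h>:

#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define QINFO_LVB_VERSION 1

struct qinfo_lvb {
	uint8_t  version;
	uint8_t  reserved[3];
	uint32_t bgrace_be;
	uint32_t igrace_be;
};

struct qinfo { uint32_t bgrace, igrace; };

static void lvb_pack(struct qinfo_lvb *lvb, const struct qinfo *qi)
{
	memset(lvb, 0, sizeof(*lvb));
	lvb->version = QINFO_LVB_VERSION;
	lvb->bgrace_be = htobe32(qi->bgrace);
	lvb->igrace_be = htobe32(qi->igrace);
}

static void read_from_disk(struct qinfo *qi)
{
	qi->bgrace = 7 * 24 * 3600;          /* pretend on-disk values */
	qi->igrace = 7 * 24 * 3600;
}

static void refresh(struct qinfo *qi, struct qinfo_lvb *lvb)
{
	if (lvb->version == QINFO_LVB_VERSION) {
		qi->bgrace = be32toh(lvb->bgrace_be);   /* trust the cache */
		qi->igrace = be32toh(lvb->igrace_be);
	} else {
		read_from_disk(qi);                     /* slow path */
		lvb_pack(lvb, qi);                      /* repopulate */
	}
}

int main(void)
{
	struct qinfo qi = {0};
	struct qinfo_lvb lvb = {0};          /* stale: version 0 */

	refresh(&qi, &lvb);                  /* falls back to disk */
	printf("bgrace=%u via disk\n", qi.bgrace);
	refresh(&qi, &lvb);                  /* now served from the LVB */
	printf("bgrace=%u via lvb\n", qi.bgrace);
	return 0;
}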
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 2bb01f09c1b1..3f8d9986b8e0 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -49,6 +49,19 @@ struct ocfs2_meta_lvb {
49 __be32 lvb_reserved2; 49 __be32 lvb_reserved2;
50}; 50};
51 51
52#define OCFS2_QINFO_LVB_VERSION 1
53
54struct ocfs2_qinfo_lvb {
55 __u8 lvb_version;
56 __u8 lvb_reserved[3];
57 __be32 lvb_bgrace;
58 __be32 lvb_igrace;
59 __be32 lvb_syncms;
60 __be32 lvb_blocks;
61 __be32 lvb_free_blk;
62 __be32 lvb_free_entry;
63};
64
52/* ocfs2_inode_lock_full() 'arg_flags' flags */ 65/* ocfs2_inode_lock_full() 'arg_flags' flags */
53/* don't wait on recovery. */ 66/* don't wait on recovery. */
54#define OCFS2_META_LOCK_RECOVERY (0x01) 67#define OCFS2_META_LOCK_RECOVERY (0x01)
@@ -69,6 +82,9 @@ void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
69struct ocfs2_file_private; 82struct ocfs2_file_private;
70void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, 83void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
71 struct ocfs2_file_private *fp); 84 struct ocfs2_file_private *fp);
85struct ocfs2_mem_dqinfo;
86void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
87 struct ocfs2_mem_dqinfo *info);
72void ocfs2_lock_res_free(struct ocfs2_lock_res *res); 88void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
73int ocfs2_create_new_inode_locks(struct inode *inode); 89int ocfs2_create_new_inode_locks(struct inode *inode);
74int ocfs2_drop_inode_locks(struct inode *inode); 90int ocfs2_drop_inode_locks(struct inode *inode);
@@ -103,6 +119,9 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex);
103void ocfs2_dentry_unlock(struct dentry *dentry, int ex); 119void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
104int ocfs2_file_lock(struct file *file, int ex, int trylock); 120int ocfs2_file_lock(struct file *file, int ex, int trylock);
105void ocfs2_file_unlock(struct file *file); 121void ocfs2_file_unlock(struct file *file);
122int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex);
123void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex);
124
106 125
107void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); 126void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
108void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, 127void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 2baedac58234..f2bb1a04d253 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -293,7 +293,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
293 struct ocfs2_extent_block *eb; 293 struct ocfs2_extent_block *eb;
294 struct ocfs2_extent_list *el; 294 struct ocfs2_extent_list *el;
295 295
296 ret = ocfs2_read_block(inode, last_eb_blk, &eb_bh); 296 ret = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
297 if (ret) { 297 if (ret) {
298 mlog_errno(ret); 298 mlog_errno(ret);
299 goto out; 299 goto out;
@@ -302,12 +302,6 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
302 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 302 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
303 el = &eb->h_list; 303 el = &eb->h_list;
304 304
305 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
306 ret = -EROFS;
307 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
308 goto out;
309 }
310
311 if (el->l_tree_depth) { 305 if (el->l_tree_depth) {
312 ocfs2_error(inode->i_sb, 306 ocfs2_error(inode->i_sb,
313 "Inode %lu has non zero tree depth in " 307 "Inode %lu has non zero tree depth in "
@@ -381,23 +375,16 @@ static int ocfs2_figure_hole_clusters(struct inode *inode,
381 if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) 375 if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
382 goto no_more_extents; 376 goto no_more_extents;
383 377
384 ret = ocfs2_read_block(inode, 378 ret = ocfs2_read_extent_block(inode,
385 le64_to_cpu(eb->h_next_leaf_blk), 379 le64_to_cpu(eb->h_next_leaf_blk),
386 &next_eb_bh); 380 &next_eb_bh);
387 if (ret) { 381 if (ret) {
388 mlog_errno(ret); 382 mlog_errno(ret);
389 goto out; 383 goto out;
390 } 384 }
391 next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
392
393 if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) {
394 ret = -EROFS;
395 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb);
396 goto out;
397 }
398 385
386 next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
399 el = &next_eb->h_list; 387 el = &next_eb->h_list;
400
401 i = ocfs2_search_for_hole_index(el, v_cluster); 388 i = ocfs2_search_for_hole_index(el, v_cluster);
402 } 389 }
403 390
@@ -630,7 +617,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
630 if (ret == 0) 617 if (ret == 0)
631 goto out; 618 goto out;
632 619
633 ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); 620 ret = ocfs2_read_inode_block(inode, &di_bh);
634 if (ret) { 621 if (ret) {
635 mlog_errno(ret); 622 mlog_errno(ret);
636 goto out; 623 goto out;
@@ -819,3 +806,74 @@ out:
819 806
820 return ret; 807 return ret;
821} 808}
809
810int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
811 struct buffer_head *bhs[], int flags,
812 int (*validate)(struct super_block *sb,
813 struct buffer_head *bh))
814{
815 int rc = 0;
816 u64 p_block, p_count;
817 int i, count, done = 0;
818
819 mlog_entry("(inode = %p, v_block = %llu, nr = %d, bhs = %p, "
820 "flags = %x, validate = %p)\n",
821 inode, (unsigned long long)v_block, nr, bhs, flags,
822 validate);
823
824 if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >=
825 i_size_read(inode)) {
826 BUG_ON(!(flags & OCFS2_BH_READAHEAD));
827 goto out;
828 }
829
830 while (done < nr) {
831 down_read(&OCFS2_I(inode)->ip_alloc_sem);
832 rc = ocfs2_extent_map_get_blocks(inode, v_block + done,
833 &p_block, &p_count, NULL);
834 up_read(&OCFS2_I(inode)->ip_alloc_sem);
835 if (rc) {
836 mlog_errno(rc);
837 break;
838 }
839
840 if (!p_block) {
841 rc = -EIO;
842 mlog(ML_ERROR,
843 "Inode #%llu contains a hole at offset %llu\n",
844 (unsigned long long)OCFS2_I(inode)->ip_blkno,
845 (unsigned long long)(v_block + done) <<
846 inode->i_sb->s_blocksize_bits);
847 break;
848 }
849
850 count = nr - done;
851 if (p_count < count)
852 count = p_count;
853
854 /*
855 * If the caller passed us bhs, they should have come
856 * from a previous readahead call to this function. Thus,
857 * they should have the right b_blocknr.
858 */
859 for (i = 0; i < count; i++) {
860 if (!bhs[done + i])
861 continue;
862 BUG_ON(bhs[done + i]->b_blocknr != (p_block + i));
863 }
864
865 rc = ocfs2_read_blocks(inode, p_block, count, bhs + done,
866 flags, validate);
867 if (rc) {
868 mlog_errno(rc);
869 break;
870 }
871 done += count;
872 }
873
874out:
875 mlog_exit(rc);
876 return rc;
877}
878
879
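
The new ocfs2_read_virt_blocks() above has a simple loop shape: resolve the virtual block at the current cursor to a physical run, clamp the run to what is still wanted, read that chunk, advance. A sketch of that shape; the mapping function here is a toy where the real code walks the extent map:

#include <stdio.h>
#include <stdint.h>

/* toy mapping: virtual block v lives at physical 1000+v, in runs of 4 */
static int map_block(uint64_t v, uint64_t *p, uint64_t *count)
{
	*p = 1000 + v;
	*count = 4 - (v % 4);     /* contiguous blocks left in this run */
	return 0;
}

static int read_blocks(uint64_t p, int count)
{
	printf("reading %d block(s) at physical %llu\n",
	       count, (unsigned long long)p);
	return 0;
}

static int read_virt_blocks(uint64_t v_block, int nr)
{
	int done = 0, rc = 0;

	while (done < nr) {
		uint64_t p_block, p_count;

		rc = map_block(v_block + done, &p_block, &p_count);
		if (rc)
			break;

		int count = nr - done;
		if ((uint64_t)count > p_count)
			count = (int)p_count;   /* clamp to this run */

		rc = read_blocks(p_block, count);
		if (rc)
			break;
		done += count;
	}
	return rc;
}

int main(void)
{
	return read_virt_blocks(2, 7);   /* spans three physical runs */
}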
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index 1c4aa8b06f34..b7dd9731b462 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -57,4 +57,28 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
57 u32 *p_cluster, u32 *num_clusters, 57 u32 *p_cluster, u32 *num_clusters,
58 struct ocfs2_extent_list *el); 58 struct ocfs2_extent_list *el);
59 59
60int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
61 struct buffer_head *bhs[], int flags,
62 int (*validate)(struct super_block *sb,
63 struct buffer_head *bh));
64static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block,
65 struct buffer_head **bh,
66 int (*validate)(struct super_block *sb,
67 struct buffer_head *bh))
68{
69 int status = 0;
70
71 if (bh == NULL) {
72 printk("ocfs2: bh == NULL\n");
73 status = -EINVAL;
74 goto bail;
75 }
76
77 status = ocfs2_read_virt_blocks(inode, v_block, 1, bh, 0, validate);
78
79bail:
80 return status;
81}
82
83
60#endif /* _EXTENT_MAP_H */ 84#endif /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index e2570a3bc2b2..a5887df2cd8a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -35,6 +35,7 @@
35#include <linux/mount.h> 35#include <linux/mount.h>
36#include <linux/writeback.h> 36#include <linux/writeback.h>
37#include <linux/falloc.h> 37#include <linux/falloc.h>
38#include <linux/quotaops.h>
38 39
39#define MLOG_MASK_PREFIX ML_INODE 40#define MLOG_MASK_PREFIX ML_INODE
40#include <cluster/masklog.h> 41#include <cluster/masklog.h>
@@ -56,6 +57,8 @@
56#include "suballoc.h" 57#include "suballoc.h"
57#include "super.h" 58#include "super.h"
58#include "xattr.h" 59#include "xattr.h"
60#include "acl.h"
61#include "quota.h"
59 62
60#include "buffer_head_io.h" 63#include "buffer_head_io.h"
61 64
@@ -253,8 +256,8 @@ int ocfs2_update_inode_atime(struct inode *inode,
253 goto out; 256 goto out;
254 } 257 }
255 258
256 ret = ocfs2_journal_access(handle, inode, bh, 259 ret = ocfs2_journal_access_di(handle, inode, bh,
257 OCFS2_JOURNAL_ACCESS_WRITE); 260 OCFS2_JOURNAL_ACCESS_WRITE);
258 if (ret) { 261 if (ret) {
259 mlog_errno(ret); 262 mlog_errno(ret);
260 goto out_commit; 263 goto out_commit;
@@ -303,9 +306,9 @@ bail:
303 return status; 306 return status;
304} 307}
305 308
306static int ocfs2_simple_size_update(struct inode *inode, 309int ocfs2_simple_size_update(struct inode *inode,
307 struct buffer_head *di_bh, 310 struct buffer_head *di_bh,
308 u64 new_i_size) 311 u64 new_i_size)
309{ 312{
310 int ret; 313 int ret;
311 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 314 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -350,8 +353,8 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
350 goto out; 353 goto out;
351 } 354 }
352 355
353 status = ocfs2_journal_access(handle, inode, fe_bh, 356 status = ocfs2_journal_access_di(handle, inode, fe_bh,
354 OCFS2_JOURNAL_ACCESS_WRITE); 357 OCFS2_JOURNAL_ACCESS_WRITE);
355 if (status < 0) { 358 if (status < 0) {
356 mlog_errno(status); 359 mlog_errno(status);
357 goto out_commit; 360 goto out_commit;
@@ -401,12 +404,9 @@ static int ocfs2_truncate_file(struct inode *inode,
401 (unsigned long long)OCFS2_I(inode)->ip_blkno, 404 (unsigned long long)OCFS2_I(inode)->ip_blkno,
402 (unsigned long long)new_i_size); 405 (unsigned long long)new_i_size);
403 406
407 /* We trust di_bh because it comes from ocfs2_inode_lock(), which
408 * already validated it */
404 fe = (struct ocfs2_dinode *) di_bh->b_data; 409 fe = (struct ocfs2_dinode *) di_bh->b_data;
405 if (!OCFS2_IS_VALID_DINODE(fe)) {
406 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
407 status = -EIO;
408 goto bail;
409 }
410 410
411 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), 411 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
412 "Inode %llu, inode i_size = %lld != di " 412 "Inode %llu, inode i_size = %lld != di "
@@ -536,6 +536,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
536 enum ocfs2_alloc_restarted why; 536 enum ocfs2_alloc_restarted why;
537 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 537 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
538 struct ocfs2_extent_tree et; 538 struct ocfs2_extent_tree et;
539 int did_quota = 0;
539 540
540 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); 541 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
541 542
@@ -545,18 +546,12 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
545 */ 546 */
546 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); 547 BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
547 548
548 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); 549 status = ocfs2_read_inode_block(inode, &bh);
549 if (status < 0) { 550 if (status < 0) {
550 mlog_errno(status); 551 mlog_errno(status);
551 goto leave; 552 goto leave;
552 } 553 }
553
554 fe = (struct ocfs2_dinode *) bh->b_data; 554 fe = (struct ocfs2_dinode *) bh->b_data;
555 if (!OCFS2_IS_VALID_DINODE(fe)) {
556 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
557 status = -EIO;
558 goto leave;
559 }
560 555
561restart_all: 556restart_all:
562 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 557 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
@@ -585,11 +580,18 @@ restart_all:
585 } 580 }
586 581
587restarted_transaction: 582restarted_transaction:
583 if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb,
584 clusters_to_add))) {
585 status = -EDQUOT;
586 goto leave;
587 }
588 did_quota = 1;
589
 588 /* reserve a write to the file entry early on - that way if we 590 /* reserve a write to the file entry early on - that way if we
589 * run out of credits in the allocation path, we can still 591 * run out of credits in the allocation path, we can still
590 * update i_size. */ 592 * update i_size. */
591 status = ocfs2_journal_access(handle, inode, bh, 593 status = ocfs2_journal_access_di(handle, inode, bh,
592 OCFS2_JOURNAL_ACCESS_WRITE); 594 OCFS2_JOURNAL_ACCESS_WRITE);
593 if (status < 0) { 595 if (status < 0) {
594 mlog_errno(status); 596 mlog_errno(status);
595 goto leave; 597 goto leave;
@@ -622,6 +624,10 @@ restarted_transaction:
622 spin_lock(&OCFS2_I(inode)->ip_lock); 624 spin_lock(&OCFS2_I(inode)->ip_lock);
623 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 625 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
624 spin_unlock(&OCFS2_I(inode)->ip_lock); 626 spin_unlock(&OCFS2_I(inode)->ip_lock);
627 /* Release unused quota reservation */
628 vfs_dq_free_space(inode,
629 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
630 did_quota = 0;
625 631
626 if (why != RESTART_NONE && clusters_to_add) { 632 if (why != RESTART_NONE && clusters_to_add) {
627 if (why == RESTART_META) { 633 if (why == RESTART_META) {
@@ -654,6 +660,9 @@ restarted_transaction:
654 OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode)); 660 OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
655 661
656leave: 662leave:
663 if (status < 0 && did_quota)
664 vfs_dq_free_space(inode,
665 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
657 if (handle) { 666 if (handle) {
658 ocfs2_commit_trans(osb, handle); 667 ocfs2_commit_trans(osb, handle);
659 handle = NULL; 668 handle = NULL;
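
A second quota detail appears in __ocfs2_extend_allocation() above: the full request is reserved up front, the allocator may satisfy only part of it before restarting the transaction, and the un-granted remainder is handed back before the loop re-reserves. A sketch of that trimming, with invented helpers:

#include <stdio.h>

static long reserved;

static void quota_reserve(long n)   { reserved += n; }
static void quota_unreserve(long n) { reserved -= n; }

/* pretend the allocator can hand out at most 3 clusters per pass */
static long allocate_some(long want) { return want < 3 ? want : 3; }

int main(void)
{
	long to_add = 8;

	while (to_add) {
		quota_reserve(to_add);   /* reserve what is still wanted */
		long got = allocate_some(to_add);
		to_add -= got;
		/* granted clusters keep their reservation; the rest
		 * goes back before the next pass re-reserves it */
		quota_unreserve(to_add);
		printf("got %ld, want %ld more, %ld reserved\n",
		       got, to_add, reserved);
	}
	return 0;
}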
@@ -885,6 +894,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
885 struct ocfs2_super *osb = OCFS2_SB(sb); 894 struct ocfs2_super *osb = OCFS2_SB(sb);
886 struct buffer_head *bh = NULL; 895 struct buffer_head *bh = NULL;
887 handle_t *handle = NULL; 896 handle_t *handle = NULL;
897 int locked[MAXQUOTAS] = {0, 0};
898 int credits, qtype;
899 struct ocfs2_mem_dqinfo *oinfo;
888 900
889 mlog_entry("(0x%p, '%.*s')\n", dentry, 901 mlog_entry("(0x%p, '%.*s')\n", dentry,
890 dentry->d_name.len, dentry->d_name.name); 902 dentry->d_name.len, dentry->d_name.name);
@@ -955,11 +967,47 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
955 } 967 }
956 } 968 }
957 969
958 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 970 if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
959 if (IS_ERR(handle)) { 971 (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
960 status = PTR_ERR(handle); 972 credits = OCFS2_INODE_UPDATE_CREDITS;
961 mlog_errno(status); 973 if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
962 goto bail_unlock; 974 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
975 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
976 oinfo = sb_dqinfo(sb, USRQUOTA)->dqi_priv;
977 status = ocfs2_lock_global_qf(oinfo, 1);
978 if (status < 0)
979 goto bail_unlock;
980 credits += ocfs2_calc_qinit_credits(sb, USRQUOTA) +
981 ocfs2_calc_qdel_credits(sb, USRQUOTA);
982 locked[USRQUOTA] = 1;
983 }
984 if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
985 && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
986 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
987 oinfo = sb_dqinfo(sb, GRPQUOTA)->dqi_priv;
988 status = ocfs2_lock_global_qf(oinfo, 1);
989 if (status < 0)
990 goto bail_unlock;
991 credits += ocfs2_calc_qinit_credits(sb, GRPQUOTA) +
992 ocfs2_calc_qdel_credits(sb, GRPQUOTA);
993 locked[GRPQUOTA] = 1;
994 }
995 handle = ocfs2_start_trans(osb, credits);
996 if (IS_ERR(handle)) {
997 status = PTR_ERR(handle);
998 mlog_errno(status);
999 goto bail_unlock;
1000 }
1001 status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
1002 if (status < 0)
1003 goto bail_commit;
1004 } else {
1005 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1006 if (IS_ERR(handle)) {
1007 status = PTR_ERR(handle);
1008 mlog_errno(status);
1009 goto bail_unlock;
1010 }
963 } 1011 }
964 1012
965 /* 1013 /*
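
The setattr hunk above sizes the transaction by summing credits: start from the base inode-update cost and add the quota-init plus quota-delete cost for each quota type the ownership change will actually touch, then open the handle with the total. A sketch of that accounting; the credit constants are invented, not the ocfs2 values:

#include <stdio.h>

#define INODE_UPDATE_CREDITS 3
#define QINIT_CREDITS        8
#define QDEL_CREDITS         8

struct attr { int change_uid, change_gid; };

static int setattr_credits(const struct attr *a,
			   int usrquota_on, int grpquota_on)
{
	int credits = INODE_UPDATE_CREDITS;

	if (a->change_uid && usrquota_on)
		credits += QINIT_CREDITS + QDEL_CREDITS;
	if (a->change_gid && grpquota_on)
		credits += QINIT_CREDITS + QDEL_CREDITS;
	return credits;
}

int main(void)
{
	struct attr a = { .change_uid = 1, .change_gid = 1 };

	printf("credits: %d\n", setattr_credits(&a, 1, 0));
	return 0;
}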
@@ -982,6 +1030,12 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
982bail_commit: 1030bail_commit:
983 ocfs2_commit_trans(osb, handle); 1031 ocfs2_commit_trans(osb, handle);
984bail_unlock: 1032bail_unlock:
1033 for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
1034 if (!locked[qtype])
1035 continue;
1036 oinfo = sb_dqinfo(sb, qtype)->dqi_priv;
1037 ocfs2_unlock_global_qf(oinfo, 1);
1038 }
985 ocfs2_inode_unlock(inode, 1); 1039 ocfs2_inode_unlock(inode, 1);
986bail_unlock_rw: 1040bail_unlock_rw:
987 if (size_change) 1041 if (size_change)
@@ -989,6 +1043,12 @@ bail_unlock_rw:
989bail: 1043bail:
990 brelse(bh); 1044 brelse(bh);
991 1045
1046 if (!status && attr->ia_valid & ATTR_MODE) {
1047 status = ocfs2_acl_chmod(inode);
1048 if (status < 0)
1049 mlog_errno(status);
1050 }
1051
992 mlog_exit(status); 1052 mlog_exit(status);
993 return status; 1053 return status;
994} 1054}
@@ -1035,7 +1095,7 @@ int ocfs2_permission(struct inode *inode, int mask)
1035 goto out; 1095 goto out;
1036 } 1096 }
1037 1097
1038 ret = generic_permission(inode, mask, NULL); 1098 ret = generic_permission(inode, mask, ocfs2_check_acl);
1039 1099
1040 ocfs2_inode_unlock(inode, 0); 1100 ocfs2_inode_unlock(inode, 0);
1041out: 1101out:
@@ -1061,8 +1121,8 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
1061 goto out; 1121 goto out;
1062 } 1122 }
1063 1123
1064 ret = ocfs2_journal_access(handle, inode, bh, 1124 ret = ocfs2_journal_access_di(handle, inode, bh,
1065 OCFS2_JOURNAL_ACCESS_WRITE); 1125 OCFS2_JOURNAL_ACCESS_WRITE);
1066 if (ret < 0) { 1126 if (ret < 0) {
1067 mlog_errno(ret); 1127 mlog_errno(ret);
1068 goto out_trans; 1128 goto out_trans;
@@ -1128,9 +1188,8 @@ static int ocfs2_write_remove_suid(struct inode *inode)
1128{ 1188{
1129 int ret; 1189 int ret;
1130 struct buffer_head *bh = NULL; 1190 struct buffer_head *bh = NULL;
1131 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1132 1191
1133 ret = ocfs2_read_block(inode, oi->ip_blkno, &bh); 1192 ret = ocfs2_read_inode_block(inode, &bh);
1134 if (ret < 0) { 1193 if (ret < 0) {
1135 mlog_errno(ret); 1194 mlog_errno(ret);
1136 goto out; 1195 goto out;
@@ -1156,8 +1215,7 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode,
1156 struct buffer_head *di_bh = NULL; 1215 struct buffer_head *di_bh = NULL;
1157 1216
1158 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 1217 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
1159 ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, 1218 ret = ocfs2_read_inode_block(inode, &di_bh);
1160 &di_bh);
1161 if (ret) { 1219 if (ret) {
1162 mlog_errno(ret); 1220 mlog_errno(ret);
1163 goto out; 1221 goto out;
@@ -1226,83 +1284,6 @@ out:
1226 return ret; 1284 return ret;
1227} 1285}
1228 1286
1229static int __ocfs2_remove_inode_range(struct inode *inode,
1230 struct buffer_head *di_bh,
1231 u32 cpos, u32 phys_cpos, u32 len,
1232 struct ocfs2_cached_dealloc_ctxt *dealloc)
1233{
1234 int ret;
1235 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
1236 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1237 struct inode *tl_inode = osb->osb_tl_inode;
1238 handle_t *handle;
1239 struct ocfs2_alloc_context *meta_ac = NULL;
1240 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1241 struct ocfs2_extent_tree et;
1242
1243 ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
1244
1245 ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
1246 if (ret) {
1247 mlog_errno(ret);
1248 return ret;
1249 }
1250
1251 mutex_lock(&tl_inode->i_mutex);
1252
1253 if (ocfs2_truncate_log_needs_flush(osb)) {
1254 ret = __ocfs2_flush_truncate_log(osb);
1255 if (ret < 0) {
1256 mlog_errno(ret);
1257 goto out;
1258 }
1259 }
1260
1261 handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
1262 if (IS_ERR(handle)) {
1263 ret = PTR_ERR(handle);
1264 mlog_errno(ret);
1265 goto out;
1266 }
1267
1268 ret = ocfs2_journal_access(handle, inode, di_bh,
1269 OCFS2_JOURNAL_ACCESS_WRITE);
1270 if (ret) {
1271 mlog_errno(ret);
1272 goto out;
1273 }
1274
1275 ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
1276 dealloc);
1277 if (ret) {
1278 mlog_errno(ret);
1279 goto out_commit;
1280 }
1281
1282 OCFS2_I(inode)->ip_clusters -= len;
1283 di->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
1284
1285 ret = ocfs2_journal_dirty(handle, di_bh);
1286 if (ret) {
1287 mlog_errno(ret);
1288 goto out_commit;
1289 }
1290
1291 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
1292 if (ret)
1293 mlog_errno(ret);
1294
1295out_commit:
1296 ocfs2_commit_trans(osb, handle);
1297out:
1298 mutex_unlock(&tl_inode->i_mutex);
1299
1300 if (meta_ac)
1301 ocfs2_free_alloc_context(meta_ac);
1302
1303 return ret;
1304}
1305
1306/* 1287/*
1307 * Truncate a byte range, avoiding pages within partial clusters. This 1288 * Truncate a byte range, avoiding pages within partial clusters. This
1308 * preserves those pages for the zeroing code to write to. 1289 * preserves those pages for the zeroing code to write to.
@@ -1402,7 +1383,9 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1402 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1383 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1403 struct ocfs2_cached_dealloc_ctxt dealloc; 1384 struct ocfs2_cached_dealloc_ctxt dealloc;
1404 struct address_space *mapping = inode->i_mapping; 1385 struct address_space *mapping = inode->i_mapping;
1386 struct ocfs2_extent_tree et;
1405 1387
1388 ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
1406 ocfs2_init_dealloc_ctxt(&dealloc); 1389 ocfs2_init_dealloc_ctxt(&dealloc);
1407 1390
1408 if (byte_len == 0) 1391 if (byte_len == 0)
@@ -1458,9 +1441,9 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1458 1441
1459 /* Only do work for non-holes */ 1442 /* Only do work for non-holes */
1460 if (phys_cpos != 0) { 1443 if (phys_cpos != 0) {
1461 ret = __ocfs2_remove_inode_range(inode, di_bh, cpos, 1444 ret = ocfs2_remove_btree_range(inode, &et, cpos,
1462 phys_cpos, alloc_size, 1445 phys_cpos, alloc_size,
1463 &dealloc); 1446 &dealloc);
1464 if (ret) { 1447 if (ret) {
1465 mlog_errno(ret); 1448 mlog_errno(ret);
1466 goto out; 1449 goto out;
@@ -1622,7 +1605,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
1622 struct ocfs2_space_resv *sr) 1605 struct ocfs2_space_resv *sr)
1623{ 1606{
1624 struct inode *inode = file->f_path.dentry->d_inode; 1607 struct inode *inode = file->f_path.dentry->d_inode;
1625 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);; 1608 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1626 1609
1627 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && 1610 if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
1628 !ocfs2_writes_unwritten_extents(osb)) 1611 !ocfs2_writes_unwritten_extents(osb))
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index e92382cbca5f..172f9fbc9fc7 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -51,6 +51,9 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
51 struct ocfs2_alloc_context *data_ac, 51 struct ocfs2_alloc_context *data_ac,
52 struct ocfs2_alloc_context *meta_ac, 52 struct ocfs2_alloc_context *meta_ac,
53 enum ocfs2_alloc_restarted *reason_ret); 53 enum ocfs2_alloc_restarted *reason_ret);
54int ocfs2_simple_size_update(struct inode *inode,
55 struct buffer_head *di_bh,
56 u64 new_i_size);
54int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, 57int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
55 u64 zero_to); 58 u64 zero_to);
56int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); 59int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 7aa00d511874..229e707bc050 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -28,6 +28,7 @@
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/pagemap.h> 30#include <linux/pagemap.h>
31#include <linux/quotaops.h>
31 32
32#include <asm/byteorder.h> 33#include <asm/byteorder.h>
33 34
@@ -37,6 +38,7 @@
37#include "ocfs2.h" 38#include "ocfs2.h"
38 39
39#include "alloc.h" 40#include "alloc.h"
41#include "blockcheck.h"
40#include "dlmglue.h" 42#include "dlmglue.h"
41#include "extent_map.h" 43#include "extent_map.h"
42#include "file.h" 44#include "file.h"
@@ -214,12 +216,11 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
214 return 0; 216 return 0;
215} 217}
216 218
217int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, 219void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
218 int create_ino) 220 int create_ino)
219{ 221{
220 struct super_block *sb; 222 struct super_block *sb;
221 struct ocfs2_super *osb; 223 struct ocfs2_super *osb;
222 int status = -EINVAL;
223 int use_plocks = 1; 224 int use_plocks = 1;
224 225
225 mlog_entry("(0x%p, size:%llu)\n", inode, 226 mlog_entry("(0x%p, size:%llu)\n", inode,
@@ -232,25 +233,17 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
232 ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks()) 233 ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks())
233 use_plocks = 0; 234 use_plocks = 0;
234 235
235 /* this means that read_inode cannot create a superblock inode 236 /*
236 * today. change if needed. */ 237 * These have all been checked by ocfs2_read_inode_block() or set
237 if (!OCFS2_IS_VALID_DINODE(fe) || 238 * by ocfs2_mknod_locked(), so a failure is a code bug.
238 !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { 239 */
239 mlog(0, "Invalid dinode: i_ino=%lu, i_blkno=%llu, " 240 BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); /* This means that read_inode
240 "signature = %.*s, flags = 0x%x\n", 241 cannot create a superblock
241 inode->i_ino, 242 inode today. change if
242 (unsigned long long)le64_to_cpu(fe->i_blkno), 7, 243 that is needed. */
243 fe->i_signature, le32_to_cpu(fe->i_flags)); 244 BUG_ON(!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)));
244 goto bail; 245 BUG_ON(le32_to_cpu(fe->i_fs_generation) != osb->fs_generation);
245 }
246 246
247 if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) {
248 mlog(ML_ERROR, "file entry generation does not match "
249 "superblock! osb->fs_generation=%x, "
250 "fe->i_fs_generation=%x\n",
251 osb->fs_generation, le32_to_cpu(fe->i_fs_generation));
252 goto bail;
253 }
254 247
255 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 248 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
256 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); 249 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
@@ -284,14 +277,18 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
284 277
285 inode->i_nlink = le16_to_cpu(fe->i_links_count); 278 inode->i_nlink = le16_to_cpu(fe->i_links_count);
286 279
287 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) 280 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
288 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; 281 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
282 inode->i_flags |= S_NOQUOTA;
283 }
289 284
290 if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { 285 if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
291 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; 286 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
292 mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino); 287 mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
293 } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) { 288 } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) {
294 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; 289 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
290 } else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) {
291 inode->i_flags |= S_NOQUOTA;
295 } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) { 292 } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) {
296 mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino); 293 mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino);
297 /* we can't actually hit this as read_inode can't 294 /* we can't actually hit this as read_inode can't
@@ -354,10 +351,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
354 351
355 ocfs2_set_inode_flags(inode); 352 ocfs2_set_inode_flags(inode);
356 353
357 status = 0; 354 mlog_exit_void();
358bail:
359 mlog_exit(status);
360 return status;
361} 355}
362 356
363static int ocfs2_read_locked_inode(struct inode *inode, 357static int ocfs2_read_locked_inode(struct inode *inode,
@@ -460,11 +454,14 @@ static int ocfs2_read_locked_inode(struct inode *inode,
460 } 454 }
461 } 455 }
462 456
463 if (can_lock) 457 if (can_lock) {
464 status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh, 458 status = ocfs2_read_inode_block_full(inode, &bh,
465 OCFS2_BH_IGNORE_CACHE); 459 OCFS2_BH_IGNORE_CACHE);
466 else 460 } else {
467 status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); 461 status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
462 if (!status)
463 status = ocfs2_validate_inode_block(osb->sb, bh);
464 }
468 if (status < 0) { 465 if (status < 0) {
469 mlog_errno(status); 466 mlog_errno(status);
470 goto bail; 467 goto bail;
@@ -472,12 +469,6 @@ static int ocfs2_read_locked_inode(struct inode *inode,
472 469
473 status = -EINVAL; 470 status = -EINVAL;
474 fe = (struct ocfs2_dinode *) bh->b_data; 471 fe = (struct ocfs2_dinode *) bh->b_data;
475 if (!OCFS2_IS_VALID_DINODE(fe)) {
476 mlog(0, "Invalid dinode #%llu: signature = %.*s\n",
477 (unsigned long long)args->fi_blkno, 7,
478 fe->i_signature);
479 goto bail;
480 }
481 472
482 /* 473 /*
483 * This is a code bug. Right now the caller needs to 474 * This is a code bug. Right now the caller needs to
@@ -491,10 +482,9 @@ static int ocfs2_read_locked_inode(struct inode *inode,
491 482
492 if (S_ISCHR(le16_to_cpu(fe->i_mode)) || 483 if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
493 S_ISBLK(le16_to_cpu(fe->i_mode))) 484 S_ISBLK(le16_to_cpu(fe->i_mode)))
494 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); 485 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
495 486
496 if (ocfs2_populate_inode(inode, fe, 0) < 0) 487 ocfs2_populate_inode(inode, fe, 0);
497 goto bail;
498 488
499 BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); 489 BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
500 490
@@ -547,8 +537,8 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
547 goto out; 537 goto out;
548 } 538 }
549 539
550 status = ocfs2_journal_access(handle, inode, fe_bh, 540 status = ocfs2_journal_access_di(handle, inode, fe_bh,
551 OCFS2_JOURNAL_ACCESS_WRITE); 541 OCFS2_JOURNAL_ACCESS_WRITE);
552 if (status < 0) { 542 if (status < 0) {
553 mlog_errno(status); 543 mlog_errno(status);
554 goto out; 544 goto out;
@@ -615,7 +605,8 @@ static int ocfs2_remove_inode(struct inode *inode,
615 goto bail; 605 goto bail;
616 } 606 }
617 607
618 handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS); 608 handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS +
609 ocfs2_quota_trans_credits(inode->i_sb));
619 if (IS_ERR(handle)) { 610 if (IS_ERR(handle)) {
620 status = PTR_ERR(handle); 611 status = PTR_ERR(handle);
621 mlog_errno(status); 612 mlog_errno(status);
@@ -630,8 +621,8 @@ static int ocfs2_remove_inode(struct inode *inode,
630 } 621 }
631 622
632 /* set the inodes dtime */ 623 /* set the inodes dtime */
633 status = ocfs2_journal_access(handle, inode, di_bh, 624 status = ocfs2_journal_access_di(handle, inode, di_bh,
634 OCFS2_JOURNAL_ACCESS_WRITE); 625 OCFS2_JOURNAL_ACCESS_WRITE);
635 if (status < 0) { 626 if (status < 0) {
636 mlog_errno(status); 627 mlog_errno(status);
637 goto bail_commit; 628 goto bail_commit;
@@ -647,6 +638,7 @@ static int ocfs2_remove_inode(struct inode *inode,
647 } 638 }
648 639
649 ocfs2_remove_from_cache(inode, di_bh); 640 ocfs2_remove_from_cache(inode, di_bh);
641 vfs_dq_free_inode(inode);
650 642
651 status = ocfs2_free_dinode(handle, inode_alloc_inode, 643 status = ocfs2_free_dinode(handle, inode_alloc_inode,
652 inode_alloc_bh, di); 644 inode_alloc_bh, di);
@@ -929,7 +921,10 @@ void ocfs2_delete_inode(struct inode *inode)
929 921
930 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 922 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
931 923
932 if (is_bad_inode(inode)) { 924 /* When we fail in read_inode() we mark the inode as bad. The second
925 * test catches the case when inode allocation fails before
926 * allocating a block for the inode. */
927 if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) {
933 mlog(0, "Skipping delete of bad inode\n"); 928 mlog(0, "Skipping delete of bad inode\n");
934 goto bail; 929 goto bail;
935 } 930 }
@@ -1195,8 +1190,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
1195 mlog_entry("(inode %llu)\n", 1190 mlog_entry("(inode %llu)\n",
1196 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1191 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1197 1192
1198 status = ocfs2_journal_access(handle, inode, bh, 1193 status = ocfs2_journal_access_di(handle, inode, bh,
1199 OCFS2_JOURNAL_ACCESS_WRITE); 1194 OCFS2_JOURNAL_ACCESS_WRITE);
1200 if (status < 0) { 1195 if (status < 0) {
1201 mlog_errno(status); 1196 mlog_errno(status);
1202 goto leave; 1197 goto leave;
@@ -1264,3 +1259,89 @@ void ocfs2_refresh_inode(struct inode *inode,
1264 1259
1265 spin_unlock(&OCFS2_I(inode)->ip_lock); 1260 spin_unlock(&OCFS2_I(inode)->ip_lock);
1266} 1261}
1262
1263int ocfs2_validate_inode_block(struct super_block *sb,
1264 struct buffer_head *bh)
1265{
1266 int rc;
1267 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
1268
1269 mlog(0, "Validating dinode %llu\n",
1270 (unsigned long long)bh->b_blocknr);
1271
1272 BUG_ON(!buffer_uptodate(bh));
1273
1274 /*
1275 * If the ecc fails, we return the error but otherwise
1276 * leave the filesystem running. We know any error is
1277 * local to this block.
1278 */
1279 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check);
1280 if (rc) {
1281 mlog(ML_ERROR, "Checksum failed for dinode %llu\n",
1282 (unsigned long long)bh->b_blocknr);
1283 goto bail;
1284 }
1285
1286 /*
1287 * Errors after here are fatal.
1288 */
1289
1290 rc = -EINVAL;
1291
1292 if (!OCFS2_IS_VALID_DINODE(di)) {
1293 ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
1294 (unsigned long long)bh->b_blocknr, 7,
1295 di->i_signature);
1296 goto bail;
1297 }
1298
1299 if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
1300 ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
1301 (unsigned long long)bh->b_blocknr,
1302 (unsigned long long)le64_to_cpu(di->i_blkno));
1303 goto bail;
1304 }
1305
1306 if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
1307 ocfs2_error(sb,
1308 "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
1309 (unsigned long long)bh->b_blocknr);
1310 goto bail;
1311 }
1312
1313 if (le32_to_cpu(di->i_fs_generation) !=
1314 OCFS2_SB(sb)->fs_generation) {
1315 ocfs2_error(sb,
1316 "Invalid dinode #%llu: fs_generation is %u\n",
1317 (unsigned long long)bh->b_blocknr,
1318 le32_to_cpu(di->i_fs_generation));
1319 goto bail;
1320 }
1321
1322 rc = 0;
1323
1324bail:
1325 return rc;
1326}
1327
1328int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
1329 int flags)
1330{
1331 int rc;
1332 struct buffer_head *tmp = *bh;
1333
1334 rc = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, &tmp,
1335 flags, ocfs2_validate_inode_block);
1336
1337 /* If ocfs2_read_blocks() got us a new bh, pass it up. */
1338 if (!rc && !*bh)
1339 *bh = tmp;
1340
1341 return rc;
1342}
1343
1344int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh)
1345{
1346 return ocfs2_read_inode_block_full(inode, bh, 0);
1347}
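Taken together, ocfs2_validate_inode_block() and the two read helpers replace the open-coded checks this diff deletes elsewhere: a read now either returns a buffer whose dinode has already passed the ecc, signature, i_blkno, VALID_FL and fs_generation tests, or it fails outright. Below is a standalone sketch of that validation chain with a toy dinode in place of struct ocfs2_dinode; the field names and the fixed "INODE01" signature are simplifications.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_dinode {
    char     sig[8];
    uint64_t blkno;
    uint32_t flags;
    uint32_t fs_gen;
};
#define VALID_FL 0x1u

/* Same fail-fast ordering as ocfs2_validate_inode_block(); in the real
 * function only the leading ecc failure is non-fatal. */
static int validate_dinode(const struct toy_dinode *di, uint64_t blocknr,
                           uint32_t fs_gen)
{
    if (strcmp(di->sig, "INODE01"))   /* OCFS2_IS_VALID_DINODE() */
        return -1;
    if (di->blkno != blocknr)         /* block claims another location */
        return -1;
    if (!(di->flags & VALID_FL))      /* never marked valid on disk */
        return -1;
    if (di->fs_gen != fs_gen)         /* stale block from an older mkfs */
        return -1;
    return 0;
}

int main(void)
{
    struct toy_dinode di = { "INODE01", 42, VALID_FL, 7 };

    printf("good dinode accepted : %d\n", validate_dinode(&di, 42, 7) == 0);
    printf("wrong generation seen: %d\n", validate_dinode(&di, 42, 8) != 0);
    return 0;
}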
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 2f37af9bcc4a..eb3c302b38d3 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -128,8 +128,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
128 int sysfile_type); 128 int sysfile_type);
129int ocfs2_inode_init_private(struct inode *inode); 129int ocfs2_inode_init_private(struct inode *inode);
130int ocfs2_inode_revalidate(struct dentry *dentry); 130int ocfs2_inode_revalidate(struct dentry *dentry);
131int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, 131void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
132 int create_ino); 132 int create_ino);
133void ocfs2_read_inode(struct inode *inode); 133void ocfs2_read_inode(struct inode *inode);
134void ocfs2_read_inode2(struct inode *inode, void *opaque); 134void ocfs2_read_inode2(struct inode *inode, void *opaque);
135ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf, 135ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
@@ -142,6 +142,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
142 struct buffer_head *bh); 142 struct buffer_head *bh);
143int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb); 143int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
144int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); 144int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
145struct buffer_head *ocfs2_bread(struct inode *inode,
146 int block, int *err, int reada);
145 147
146void ocfs2_set_inode_flags(struct inode *inode); 148void ocfs2_set_inode_flags(struct inode *inode);
147void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi); 149void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi);
@@ -153,4 +155,16 @@ static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
153 return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits); 155 return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits);
154} 156}
155 157
158/* Validate that a bh contains a valid inode */
159int ocfs2_validate_inode_block(struct super_block *sb,
160 struct buffer_head *bh);
161/*
162 * Read an inode block into *bh. If *bh is NULL, a bh will be allocated.
163 * This is a cached read. The inode will be validated with
164 * ocfs2_validate_inode_block().
165 */
166int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh);
167/* The same, but can be passed OCFS2_BH_* flags */
168int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
169 int flags);
156#endif /* OCFS2_INODE_H */ 170#endif /* OCFS2_INODE_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 99fe9d584f3c..57d7d25a2b9a 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -35,6 +35,7 @@
35#include "ocfs2.h" 35#include "ocfs2.h"
36 36
37#include "alloc.h" 37#include "alloc.h"
38#include "blockcheck.h"
38#include "dir.h" 39#include "dir.h"
39#include "dlmglue.h" 40#include "dlmglue.h"
40#include "extent_map.h" 41#include "extent_map.h"
@@ -45,6 +46,7 @@
45#include "slot_map.h" 46#include "slot_map.h"
46#include "super.h" 47#include "super.h"
47#include "sysfile.h" 48#include "sysfile.h"
49#include "quota.h"
48 50
49#include "buffer_head_io.h" 51#include "buffer_head_io.h"
50 52
@@ -52,10 +54,10 @@ DEFINE_SPINLOCK(trans_inc_lock);
52 54
53static int ocfs2_force_read_journal(struct inode *inode); 55static int ocfs2_force_read_journal(struct inode *inode);
54static int ocfs2_recover_node(struct ocfs2_super *osb, 56static int ocfs2_recover_node(struct ocfs2_super *osb,
55 int node_num); 57 int node_num, int slot_num);
56static int __ocfs2_recovery_thread(void *arg); 58static int __ocfs2_recovery_thread(void *arg);
57static int ocfs2_commit_cache(struct ocfs2_super *osb); 59static int ocfs2_commit_cache(struct ocfs2_super *osb);
58static int ocfs2_wait_on_mount(struct ocfs2_super *osb); 60static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota);
59static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, 61static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
60 int dirty, int replayed); 62 int dirty, int replayed);
61static int ocfs2_trylock_journal(struct ocfs2_super *osb, 63static int ocfs2_trylock_journal(struct ocfs2_super *osb,
@@ -64,6 +66,17 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
64 int slot); 66 int slot);
65static int ocfs2_commit_thread(void *arg); 67static int ocfs2_commit_thread(void *arg);
66 68
69static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
70{
71 return __ocfs2_wait_on_mount(osb, 0);
72}
73
74static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb)
75{
76 return __ocfs2_wait_on_mount(osb, 1);
77}
78
79
67 80
68/* 81/*
69 * The recovery_list is a simple linked list of node numbers to recover. 82 * The recovery_list is a simple linked list of node numbers to recover.
@@ -256,11 +269,9 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
256 BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE); 269 BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);
257 BUG_ON(max_buffs <= 0); 270 BUG_ON(max_buffs <= 0);
258 271
259 /* JBD might support this, but our journalling code doesn't yet. */ 272 /* Nested transaction? Just return the handle... */
260 if (journal_current_handle()) { 273 if (journal_current_handle())
261 mlog(ML_ERROR, "Recursive transaction attempted!\n"); 274 return jbd2_journal_start(journal, max_buffs);
262 BUG();
263 }
264 275
265 down_read(&osb->journal->j_trans_barrier); 276 down_read(&osb->journal->j_trans_barrier);
266 277
@@ -285,16 +296,18 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
285int ocfs2_commit_trans(struct ocfs2_super *osb, 296int ocfs2_commit_trans(struct ocfs2_super *osb,
286 handle_t *handle) 297 handle_t *handle)
287{ 298{
288 int ret; 299 int ret, nested;
289 struct ocfs2_journal *journal = osb->journal; 300 struct ocfs2_journal *journal = osb->journal;
290 301
291 BUG_ON(!handle); 302 BUG_ON(!handle);
292 303
304 nested = handle->h_ref > 1;
293 ret = jbd2_journal_stop(handle); 305 ret = jbd2_journal_stop(handle);
294 if (ret < 0) 306 if (ret < 0)
295 mlog_errno(ret); 307 mlog_errno(ret);
296 308
297 up_read(&journal->j_trans_barrier); 309 if (!nested)
310 up_read(&journal->j_trans_barrier);
298 311
299 return ret; 312 return ret;
300} 313}
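These two hunks change the transaction API contract: ocfs2_start_trans() now returns the already-running handle when one is open (jbd2_journal_start() bumps handle->h_ref in that case), and ocfs2_commit_trans() releases j_trans_barrier only when the outermost stop drops the last reference. A small userspace model of that refcounting, with a file-scope variable standing in for journal_current_handle():

#include <stdio.h>

struct handle { int h_ref; };

static struct handle *current_handle;  /* journal_current_handle() stand-in */
static int barrier_readers;            /* models j_trans_barrier readers */

static struct handle *start_trans(void)
{
    static struct handle h;

    if (current_handle) {           /* nested: reuse and bump the refcount */
        current_handle->h_ref++;
        return current_handle;
    }
    barrier_readers++;              /* down_read(&journal->j_trans_barrier) */
    h.h_ref = 1;
    current_handle = &h;
    return &h;
}

static void commit_trans(struct handle *h)
{
    int nested = h->h_ref > 1;      /* tested before jbd2_journal_stop() */

    h->h_ref--;
    if (!nested) {
        barrier_readers--;          /* up_read(&journal->j_trans_barrier) */
        current_handle = NULL;
    }
}

int main(void)
{
    struct handle *outer = start_trans();
    struct handle *inner = start_trans();   /* same handle, h_ref == 2 */

    printf("same handle: %d, barrier readers: %d\n",
           inner == outer, barrier_readers);
    commit_trans(inner);                    /* barrier still held */
    commit_trans(outer);                    /* now released */
    printf("barrier readers after outer commit: %d\n", barrier_readers);
    return 0;
}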
@@ -357,10 +370,137 @@ bail:
357 return status; 370 return status;
358} 371}
359 372
360int ocfs2_journal_access(handle_t *handle, 373struct ocfs2_triggers {
361 struct inode *inode, 374 struct jbd2_buffer_trigger_type ot_triggers;
362 struct buffer_head *bh, 375 int ot_offset;
363 int type) 376};
377
378static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger_type *triggers)
379{
380 return container_of(triggers, struct ocfs2_triggers, ot_triggers);
381}
382
383static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
384 struct buffer_head *bh,
385 void *data, size_t size)
386{
387 struct ocfs2_triggers *ot = to_ocfs2_trigger(triggers);
388
389 /*
390 * We aren't guaranteed to have the superblock here, so we
391 * must unconditionally compute the ecc data.
392 * __ocfs2_journal_access() will only set the triggers if
393 * metaecc is enabled.
394 */
395 ocfs2_block_check_compute(data, size, data + ot->ot_offset);
396}
397
398/*
399 * Quota blocks have their own trigger because the struct ocfs2_block_check
400 * offset depends on the blocksize.
401 */
402static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
403 struct buffer_head *bh,
404 void *data, size_t size)
405{
406 struct ocfs2_disk_dqtrailer *dqt =
407 ocfs2_block_dqtrailer(size, data);
408
409 /*
410 * We aren't guaranteed to have the superblock here, so we
411 * must unconditionally compute the ecc data.
412 * __ocfs2_journal_access() will only set the triggers if
413 * metaecc is enabled.
414 */
415 ocfs2_block_check_compute(data, size, &dqt->dq_check);
416}
417
418/*
419 * Directory blocks also have their own trigger because the
420 * struct ocfs2_block_check offset depends on the blocksize.
421 */
422static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
423 struct buffer_head *bh,
424 void *data, size_t size)
425{
426 struct ocfs2_dir_block_trailer *trailer =
427 ocfs2_dir_trailer_from_size(size, data);
428
429 /*
430 * We aren't guaranteed to have the superblock here, so we
431 * must unconditionally compute the ecc data.
432 * __ocfs2_journal_access() will only set the triggers if
433 * metaecc is enabled.
434 */
435 ocfs2_block_check_compute(data, size, &trailer->db_check);
436}
437
438static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
439 struct buffer_head *bh)
440{
441 mlog(ML_ERROR,
442 "ocfs2_abort_trigger called by JBD2. bh = 0x%lx, "
443 "bh->b_blocknr = %llu\n",
444 (unsigned long)bh,
445 (unsigned long long)bh->b_blocknr);
446
447 /* We aren't guaranteed to have the superblock here - but if we
448 * don't, it'll just crash. */
449 ocfs2_error(bh->b_assoc_map->host->i_sb,
450 "JBD2 has aborted our journal, ocfs2 cannot continue\n");
451}
452
453static struct ocfs2_triggers di_triggers = {
454 .ot_triggers = {
455 .t_commit = ocfs2_commit_trigger,
456 .t_abort = ocfs2_abort_trigger,
457 },
458 .ot_offset = offsetof(struct ocfs2_dinode, i_check),
459};
460
461static struct ocfs2_triggers eb_triggers = {
462 .ot_triggers = {
463 .t_commit = ocfs2_commit_trigger,
464 .t_abort = ocfs2_abort_trigger,
465 },
466 .ot_offset = offsetof(struct ocfs2_extent_block, h_check),
467};
468
469static struct ocfs2_triggers gd_triggers = {
470 .ot_triggers = {
471 .t_commit = ocfs2_commit_trigger,
472 .t_abort = ocfs2_abort_trigger,
473 },
474 .ot_offset = offsetof(struct ocfs2_group_desc, bg_check),
475};
476
477static struct ocfs2_triggers db_triggers = {
478 .ot_triggers = {
479 .t_commit = ocfs2_db_commit_trigger,
480 .t_abort = ocfs2_abort_trigger,
481 },
482};
483
484static struct ocfs2_triggers xb_triggers = {
485 .ot_triggers = {
486 .t_commit = ocfs2_commit_trigger,
487 .t_abort = ocfs2_abort_trigger,
488 },
489 .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check),
490};
491
492static struct ocfs2_triggers dq_triggers = {
493 .ot_triggers = {
494 .t_commit = ocfs2_dq_commit_trigger,
495 .t_abort = ocfs2_abort_trigger,
496 },
497};
498
499static int __ocfs2_journal_access(handle_t *handle,
500 struct inode *inode,
501 struct buffer_head *bh,
502 struct ocfs2_triggers *triggers,
503 int type)
364{ 504{
365 int status; 505 int status;
366 506
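All of the trigger tables funnel into one idea: each metadata type records where its struct ocfs2_block_check lives, and jbd2 invokes t_commit on the buffer so the checksum is recomputed just before the block reaches the log. The compilable toy below illustrates the offset-driven scheme; the byte-sum stands in for ocfs2_block_check_compute() and the structs are, of course, not the on-disk ocfs2 layouts.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct block_check { uint32_t crc; };
struct trigger     { size_t check_off; };  /* like ocfs2_triggers.ot_offset */

struct toy_dinode { char body[16]; struct block_check i_check; };
struct toy_xattr  { char body[32]; struct block_check xb_check; };

static const struct trigger di_trig = { offsetof(struct toy_dinode, i_check) };
static const struct trigger xb_trig = { offsetof(struct toy_xattr, xb_check) };

static uint32_t toy_sum(const void *data, size_t len)  /* ecc stand-in */
{
    const unsigned char *p = data;
    uint32_t c = 0;

    while (len--)
        c = c * 31 + *p++;
    return c;
}

/* What a t_commit trigger does: checksum the whole block and store the
 * result at the type-specific offset carried by the trigger. */
static void commit_trigger(void *block, size_t size, const struct trigger *t)
{
    struct block_check *bc =
        (struct block_check *)((char *)block + t->check_off);

    bc->crc = 0;                    /* checksum computed over a zeroed field */
    bc->crc = toy_sum(block, size);
}

int main(void)
{
    struct toy_dinode di = { { 0 }, { 0 } };
    struct toy_xattr  xb = { { 0 }, { 0 } };

    commit_trigger(&di, sizeof(di), &di_trig);
    commit_trigger(&xb, sizeof(xb), &xb_trig);
    printf("dinode check = %#x, xattr check = %#x\n",
           di.i_check.crc, xb.xb_check.crc);
    return 0;
}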
@@ -406,6 +546,8 @@ int ocfs2_journal_access(handle_t *handle,
406 status = -EINVAL; 546 status = -EINVAL;
407 mlog(ML_ERROR, "Uknown access type!\n"); 547 mlog(ML_ERROR, "Uknown access type!\n");
408 } 548 }
549 if (!status && ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)) && triggers)
550 jbd2_journal_set_triggers(bh, &triggers->ot_triggers);
409 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); 551 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
410 552
411 if (status < 0) 553 if (status < 0)
@@ -416,6 +558,54 @@ int ocfs2_journal_access(handle_t *handle,
416 return status; 558 return status;
417} 559}
418 560
561int ocfs2_journal_access_di(handle_t *handle, struct inode *inode,
562 struct buffer_head *bh, int type)
563{
564 return __ocfs2_journal_access(handle, inode, bh, &di_triggers,
565 type);
566}
567
568int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode,
569 struct buffer_head *bh, int type)
570{
571 return __ocfs2_journal_access(handle, inode, bh, &eb_triggers,
572 type);
573}
574
575int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode,
576 struct buffer_head *bh, int type)
577{
578 return __ocfs2_journal_access(handle, inode, bh, &gd_triggers,
579 type);
580}
581
582int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
583 struct buffer_head *bh, int type)
584{
585 return __ocfs2_journal_access(handle, inode, bh, &db_triggers,
586 type);
587}
588
589int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode,
590 struct buffer_head *bh, int type)
591{
592 return __ocfs2_journal_access(handle, inode, bh, &xb_triggers,
593 type);
594}
595
596int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
597 struct buffer_head *bh, int type)
598{
599 return __ocfs2_journal_access(handle, inode, bh, &dq_triggers,
600 type);
601}
602
603int ocfs2_journal_access(handle_t *handle, struct inode *inode,
604 struct buffer_head *bh, int type)
605{
606 return __ocfs2_journal_access(handle, inode, bh, NULL, type);
607}
608
419int ocfs2_journal_dirty(handle_t *handle, 609int ocfs2_journal_dirty(handle_t *handle,
420 struct buffer_head *bh) 610 struct buffer_head *bh)
421{ 611{
@@ -434,20 +624,6 @@ int ocfs2_journal_dirty(handle_t *handle,
434 return status; 624 return status;
435} 625}
436 626
437#ifdef CONFIG_OCFS2_COMPAT_JBD
438int ocfs2_journal_dirty_data(handle_t *handle,
439 struct buffer_head *bh)
440{
441 int err = journal_dirty_data(handle, bh);
442 if (err)
443 mlog_errno(err);
444 /* TODO: When we can handle it, abort the handle and go RO on
445 * error here. */
446
447 return err;
448}
449#endif
450
451#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) 627#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
452 628
453void ocfs2_set_journal_params(struct ocfs2_super *osb) 629void ocfs2_set_journal_params(struct ocfs2_super *osb)
@@ -587,17 +763,11 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
587 mlog_entry_void(); 763 mlog_entry_void();
588 764
589 fe = (struct ocfs2_dinode *)bh->b_data; 765 fe = (struct ocfs2_dinode *)bh->b_data;
590 if (!OCFS2_IS_VALID_DINODE(fe)) { 766
591 /* This is called from startup/shutdown which will 767 /* The journal bh on the osb always comes from ocfs2_journal_init()
592 * handle the errors in a specific manner, so no need 768 * and was validated there inside ocfs2_inode_lock_full(). It's a
593 * to call ocfs2_error() here. */ 769 * code bug if we mess it up. */
594 mlog(ML_ERROR, "Journal dinode %llu has invalid " 770 BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
595 "signature: %.*s",
596 (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
597 fe->i_signature);
598 status = -EIO;
599 goto out;
600 }
601 771
602 flags = le32_to_cpu(fe->id1.journal1.ij_flags); 772 flags = le32_to_cpu(fe->id1.journal1.ij_flags);
603 if (dirty) 773 if (dirty)
@@ -609,11 +779,11 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
609 if (replayed) 779 if (replayed)
610 ocfs2_bump_recovery_generation(fe); 780 ocfs2_bump_recovery_generation(fe);
611 781
782 ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
612 status = ocfs2_write_block(osb, bh, journal->j_inode); 783 status = ocfs2_write_block(osb, bh, journal->j_inode);
613 if (status < 0) 784 if (status < 0)
614 mlog_errno(status); 785 mlog_errno(status);
615 786
616out:
617 mlog_exit(status); 787 mlog_exit(status);
618 return status; 788 return status;
619} 789}
@@ -878,6 +1048,7 @@ struct ocfs2_la_recovery_item {
878 int lri_slot; 1048 int lri_slot;
879 struct ocfs2_dinode *lri_la_dinode; 1049 struct ocfs2_dinode *lri_la_dinode;
880 struct ocfs2_dinode *lri_tl_dinode; 1050 struct ocfs2_dinode *lri_tl_dinode;
1051 struct ocfs2_quota_recovery *lri_qrec;
881}; 1052};
882 1053
883/* Does the second half of the recovery process. By this point, the 1054/* Does the second half of the recovery process. By this point, the
@@ -898,6 +1069,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
898 struct ocfs2_super *osb = journal->j_osb; 1069 struct ocfs2_super *osb = journal->j_osb;
899 struct ocfs2_dinode *la_dinode, *tl_dinode; 1070 struct ocfs2_dinode *la_dinode, *tl_dinode;
900 struct ocfs2_la_recovery_item *item, *n; 1071 struct ocfs2_la_recovery_item *item, *n;
1072 struct ocfs2_quota_recovery *qrec;
901 LIST_HEAD(tmp_la_list); 1073 LIST_HEAD(tmp_la_list);
902 1074
903 mlog_entry_void(); 1075 mlog_entry_void();
@@ -913,6 +1085,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
913 1085
914 mlog(0, "Complete recovery for slot %d\n", item->lri_slot); 1086 mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
915 1087
1088 ocfs2_wait_on_quotas(osb);
1089
916 la_dinode = item->lri_la_dinode; 1090 la_dinode = item->lri_la_dinode;
917 if (la_dinode) { 1091 if (la_dinode) {
918 mlog(0, "Clean up local alloc %llu\n", 1092 mlog(0, "Clean up local alloc %llu\n",
@@ -943,6 +1117,16 @@ void ocfs2_complete_recovery(struct work_struct *work)
943 if (ret < 0) 1117 if (ret < 0)
944 mlog_errno(ret); 1118 mlog_errno(ret);
945 1119
1120 qrec = item->lri_qrec;
1121 if (qrec) {
1122 mlog(0, "Recovering quota files");
1123 ret = ocfs2_finish_quota_recovery(osb, qrec,
1124 item->lri_slot);
1125 if (ret < 0)
1126 mlog_errno(ret);
1127 /* Recovery info is already freed now */
1128 }
1129
946 kfree(item); 1130 kfree(item);
947 } 1131 }
948 1132
@@ -956,7 +1140,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
956static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, 1140static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
957 int slot_num, 1141 int slot_num,
958 struct ocfs2_dinode *la_dinode, 1142 struct ocfs2_dinode *la_dinode,
959 struct ocfs2_dinode *tl_dinode) 1143 struct ocfs2_dinode *tl_dinode,
1144 struct ocfs2_quota_recovery *qrec)
960{ 1145{
961 struct ocfs2_la_recovery_item *item; 1146 struct ocfs2_la_recovery_item *item;
962 1147
@@ -971,6 +1156,9 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
971 if (tl_dinode) 1156 if (tl_dinode)
972 kfree(tl_dinode); 1157 kfree(tl_dinode);
973 1158
1159 if (qrec)
1160 ocfs2_free_quota_recovery(qrec);
1161
974 mlog_errno(-ENOMEM); 1162 mlog_errno(-ENOMEM);
975 return; 1163 return;
976 } 1164 }
@@ -979,6 +1167,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
979 item->lri_la_dinode = la_dinode; 1167 item->lri_la_dinode = la_dinode;
980 item->lri_slot = slot_num; 1168 item->lri_slot = slot_num;
981 item->lri_tl_dinode = tl_dinode; 1169 item->lri_tl_dinode = tl_dinode;
1170 item->lri_qrec = qrec;
982 1171
983 spin_lock(&journal->j_lock); 1172 spin_lock(&journal->j_lock);
984 list_add_tail(&item->lri_list, &journal->j_la_cleanups); 1173 list_add_tail(&item->lri_list, &journal->j_la_cleanups);
@@ -998,6 +1187,7 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
998 ocfs2_queue_recovery_completion(journal, 1187 ocfs2_queue_recovery_completion(journal,
999 osb->slot_num, 1188 osb->slot_num,
1000 osb->local_alloc_copy, 1189 osb->local_alloc_copy,
1190 NULL,
1001 NULL); 1191 NULL);
1002 ocfs2_schedule_truncate_log_flush(osb, 0); 1192 ocfs2_schedule_truncate_log_flush(osb, 0);
1003 1193
@@ -1006,11 +1196,26 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
1006 } 1196 }
1007} 1197}
1008 1198
1199void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
1200{
1201 if (osb->quota_rec) {
1202 ocfs2_queue_recovery_completion(osb->journal,
1203 osb->slot_num,
1204 NULL,
1205 NULL,
1206 osb->quota_rec);
1207 osb->quota_rec = NULL;
1208 }
1209}
1210
1009static int __ocfs2_recovery_thread(void *arg) 1211static int __ocfs2_recovery_thread(void *arg)
1010{ 1212{
1011 int status, node_num; 1213 int status, node_num, slot_num;
1012 struct ocfs2_super *osb = arg; 1214 struct ocfs2_super *osb = arg;
1013 struct ocfs2_recovery_map *rm = osb->recovery_map; 1215 struct ocfs2_recovery_map *rm = osb->recovery_map;
1216 int *rm_quota = NULL;
1217 int rm_quota_used = 0, i;
1218 struct ocfs2_quota_recovery *qrec;
1014 1219
1015 mlog_entry_void(); 1220 mlog_entry_void();
1016 1221
@@ -1019,6 +1224,11 @@ static int __ocfs2_recovery_thread(void *arg)
1019 goto bail; 1224 goto bail;
1020 } 1225 }
1021 1226
1227 rm_quota = kzalloc(osb->max_slots * sizeof(int), GFP_NOFS);
1228 if (!rm_quota) {
1229 status = -ENOMEM;
1230 goto bail;
1231 }
1022restart: 1232restart:
1023 status = ocfs2_super_lock(osb, 1); 1233 status = ocfs2_super_lock(osb, 1);
1024 if (status < 0) { 1234 if (status < 0) {
@@ -1032,8 +1242,28 @@ restart:
1032 * clear it until ocfs2_recover_node() has succeeded. */ 1242 * clear it until ocfs2_recover_node() has succeeded. */
1033 node_num = rm->rm_entries[0]; 1243 node_num = rm->rm_entries[0];
1034 spin_unlock(&osb->osb_lock); 1244 spin_unlock(&osb->osb_lock);
1035 1245 mlog(0, "checking node %d\n", node_num);
1036 status = ocfs2_recover_node(osb, node_num); 1246 slot_num = ocfs2_node_num_to_slot(osb, node_num);
1247 if (slot_num == -ENOENT) {
1248 status = 0;
1249 mlog(0, "no slot for this node, so no recovery"
1250 "required.\n");
1251 goto skip_recovery;
1252 }
1253 mlog(0, "node %d was using slot %d\n", node_num, slot_num);
1254
1255 /* Quota recovery is a bit subtle. We cannot do it
1256 * immediately because we would have to take cluster locks on
1257 * the quota files, but we also don't want to just skip it,
1258 * because quota usage would then be out of sync until some
1259 * node takes over the slot. So we remember which slots need
1260 * quota recovery and, once everything else is done, recover them. */
1261 for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++);
1262 if (i == rm_quota_used)
1263 rm_quota[rm_quota_used++] = slot_num;
1264
1265 status = ocfs2_recover_node(osb, node_num, slot_num);
1266skip_recovery:
1037 if (!status) { 1267 if (!status) {
1038 ocfs2_recovery_map_clear(osb, node_num); 1268 ocfs2_recovery_map_clear(osb, node_num);
1039 } else { 1269 } else {
@@ -1055,13 +1285,27 @@ restart:
1055 if (status < 0) 1285 if (status < 0)
1056 mlog_errno(status); 1286 mlog_errno(status);
1057 1287
1288 /* Now is the right time to recover quotas... We have to do this under
1289 * the superblock lock so that no one can start using the slot (and crash)
1290 * before we recover it */
1291 for (i = 0; i < rm_quota_used; i++) {
1292 qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
1293 if (IS_ERR(qrec)) {
1294 status = PTR_ERR(qrec);
1295 mlog_errno(status);
1296 continue;
1297 }
1298 ocfs2_queue_recovery_completion(osb->journal, rm_quota[i],
1299 NULL, NULL, qrec);
1300 }
1301
1058 ocfs2_super_unlock(osb, 1); 1302 ocfs2_super_unlock(osb, 1);
1059 1303
1060 /* We always run recovery on our own orphan dir - the dead 1304 /* We always run recovery on our own orphan dir - the dead
1061 * node(s) may have disallowed a previous inode delete. Re-processing 1305 * node(s) may have disallowed a previous inode delete. Re-processing
1062 * is therefore required. */ 1306 * is therefore required. */
1063 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, 1307 ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
1064 NULL); 1308 NULL, NULL);
1065 1309
1066bail: 1310bail:
1067 mutex_lock(&osb->recovery_lock); 1311 mutex_lock(&osb->recovery_lock);
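This hunk splits quota recovery into two phases, exactly as the comment above says: while journals are replayed the thread only records which slots will need quota recovery (deduplicated in rm_quota[]), and ocfs2_begin_quota_recovery() runs afterwards under the superblock lock. A minimal model of the collection loop and the deferred pass:

#include <stdio.h>

#define MAX_SLOTS 8

static int rm_quota[MAX_SLOTS];
static int rm_quota_used;

/* Same dedup scan as the recovery loop above: look for the slot, append
 * it only if it was not recorded yet. */
static void remember_slot(int slot_num)
{
    int i;

    for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++)
        ;
    if (i == rm_quota_used)
        rm_quota[rm_quota_used++] = slot_num;
}

int main(void)
{
    int i;

    /* phase 1: journal replay; the same slot may come up repeatedly */
    remember_slot(2);
    remember_slot(5);
    remember_slot(2);

    /* phase 2: with the super lock held, recover each slot exactly once */
    for (i = 0; i < rm_quota_used; i++)
        printf("begin quota recovery for slot %d\n", rm_quota[i]);
    return 0;
}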
@@ -1076,6 +1320,9 @@ bail:
1076 1320
1077 mutex_unlock(&osb->recovery_lock); 1321 mutex_unlock(&osb->recovery_lock);
1078 1322
1323 if (rm_quota)
1324 kfree(rm_quota);
1325
1079 mlog_exit(status); 1326 mlog_exit(status);
1080 /* no one is calling kthread_stop() for us so the kthread() api 1327 /* no one is calling kthread_stop() for us so the kthread() api
1081 * requires that we call do_exit(). And it isn't exported, but 1328 * requires that we call do_exit(). And it isn't exported, but
@@ -1135,8 +1382,7 @@ static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
1135 } 1382 }
1136 SET_INODE_JOURNAL(inode); 1383 SET_INODE_JOURNAL(inode);
1137 1384
1138 status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh, 1385 status = ocfs2_read_inode_block_full(inode, bh, OCFS2_BH_IGNORE_CACHE);
1139 OCFS2_BH_IGNORE_CACHE);
1140 if (status < 0) { 1386 if (status < 0) {
1141 mlog_errno(status); 1387 mlog_errno(status);
1142 goto bail; 1388 goto bail;
@@ -1268,6 +1514,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1268 osb->slot_recovery_generations[slot_num] = 1514 osb->slot_recovery_generations[slot_num] =
1269 ocfs2_get_recovery_generation(fe); 1515 ocfs2_get_recovery_generation(fe);
1270 1516
1517 ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
1271 status = ocfs2_write_block(osb, bh, inode); 1518 status = ocfs2_write_block(osb, bh, inode);
1272 if (status < 0) 1519 if (status < 0)
1273 mlog_errno(status); 1520 mlog_errno(status);
@@ -1304,31 +1551,19 @@ done:
1304 * far less concerning. 1551 * far less concerning.
1305 */ 1552 */
1306static int ocfs2_recover_node(struct ocfs2_super *osb, 1553static int ocfs2_recover_node(struct ocfs2_super *osb,
1307 int node_num) 1554 int node_num, int slot_num)
1308{ 1555{
1309 int status = 0; 1556 int status = 0;
1310 int slot_num;
1311 struct ocfs2_dinode *la_copy = NULL; 1557 struct ocfs2_dinode *la_copy = NULL;
1312 struct ocfs2_dinode *tl_copy = NULL; 1558 struct ocfs2_dinode *tl_copy = NULL;
1313 1559
1314 mlog_entry("(node_num=%d, osb->node_num = %d)\n", 1560 mlog_entry("(node_num=%d, slot_num=%d, osb->node_num = %d)\n",
1315 node_num, osb->node_num); 1561 node_num, slot_num, osb->node_num);
1316
1317 mlog(0, "checking node %d\n", node_num);
1318 1562
1319 /* Should not ever be called to recover ourselves -- in that 1563 /* Should not ever be called to recover ourselves -- in that
1320 * case we should've called ocfs2_journal_load instead. */ 1564 * case we should've called ocfs2_journal_load instead. */
1321 BUG_ON(osb->node_num == node_num); 1565 BUG_ON(osb->node_num == node_num);
1322 1566
1323 slot_num = ocfs2_node_num_to_slot(osb, node_num);
1324 if (slot_num == -ENOENT) {
1325 status = 0;
1326 mlog(0, "no slot for this node, so no recovery required.\n");
1327 goto done;
1328 }
1329
1330 mlog(0, "node %d was using slot %d\n", node_num, slot_num);
1331
1332 status = ocfs2_replay_journal(osb, node_num, slot_num); 1567 status = ocfs2_replay_journal(osb, node_num, slot_num);
1333 if (status < 0) { 1568 if (status < 0) {
1334 if (status == -EBUSY) { 1569 if (status == -EBUSY) {
@@ -1364,7 +1599,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
1364 1599
1365 /* This will kfree the memory pointed to by la_copy and tl_copy */ 1600 /* This will kfree the memory pointed to by la_copy and tl_copy */
1366 ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy, 1601 ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
1367 tl_copy); 1602 tl_copy, NULL);
1368 1603
1369 status = 0; 1604 status = 0;
1370done: 1605done:
@@ -1659,13 +1894,14 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
1659 return ret; 1894 return ret;
1660} 1895}
1661 1896
1662static int ocfs2_wait_on_mount(struct ocfs2_super *osb) 1897static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota)
1663{ 1898{
1664 /* This check is good because ocfs2 will wait on our recovery 1899 /* This check is good because ocfs2 will wait on our recovery
1665 * thread before changing it to something other than MOUNTED 1900 * thread before changing it to something other than MOUNTED
1666 * or DISABLED. */ 1901 * or DISABLED. */
1667 wait_event(osb->osb_mount_event, 1902 wait_event(osb->osb_mount_event,
1668 atomic_read(&osb->vol_state) == VOLUME_MOUNTED || 1903 (!quota && atomic_read(&osb->vol_state) == VOLUME_MOUNTED) ||
1904 atomic_read(&osb->vol_state) == VOLUME_MOUNTED_QUOTAS ||
1669 atomic_read(&osb->vol_state) == VOLUME_DISABLED); 1905 atomic_read(&osb->vol_state) == VOLUME_DISABLED);
1670 1906
1671 /* If there's an error on mount, then we may never get to the 1907 /* If there's an error on mount, then we may never get to the
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index d4d14e9a3cea..3c3532e1307c 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -27,12 +27,7 @@
27#define OCFS2_JOURNAL_H 27#define OCFS2_JOURNAL_H
28 28
29#include <linux/fs.h> 29#include <linux/fs.h>
30#ifndef CONFIG_OCFS2_COMPAT_JBD 30#include <linux/jbd2.h>
31# include <linux/jbd2.h>
32#else
33# include <linux/jbd.h>
34# include "ocfs2_jbd_compat.h"
35#endif
36 31
37enum ocfs2_journal_state { 32enum ocfs2_journal_state {
38 OCFS2_JOURNAL_FREE = 0, 33 OCFS2_JOURNAL_FREE = 0,
@@ -173,6 +168,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb,
173 int node_num); 168 int node_num);
174int ocfs2_mark_dead_nodes(struct ocfs2_super *osb); 169int ocfs2_mark_dead_nodes(struct ocfs2_super *osb);
175void ocfs2_complete_mount_recovery(struct ocfs2_super *osb); 170void ocfs2_complete_mount_recovery(struct ocfs2_super *osb);
171void ocfs2_complete_quota_recovery(struct ocfs2_super *osb);
176 172
177static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb) 173static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
178{ 174{
@@ -216,9 +212,12 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
216 * ocfs2_extend_trans - Extend a handle by nblocks credits. This may 212 * ocfs2_extend_trans - Extend a handle by nblocks credits. This may
217 * commit the handle to disk in the process, but will 213 * commit the handle to disk in the process, but will
218 * not release any locks taken during the transaction. 214 * not release any locks taken during the transaction.
219 * ocfs2_journal_access - Notify the handle that we want to journal this 215 * ocfs2_journal_access* - Notify the handle that we want to journal this
220 * buffer. Will have to call ocfs2_journal_dirty once 216 * buffer. Will have to call ocfs2_journal_dirty once
221 * we've actually dirtied it. Type is one of the OCFS2_JOURNAL_ACCESS_* values. 217 * we've actually dirtied it. Type is one of the OCFS2_JOURNAL_ACCESS_* values.
218 * Always call the specific flavor of
219 * ocfs2_journal_access_*() unless you intend to
220 * manage the checksum by hand.
222 * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. 221 * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data.
223 * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before 222 * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before
224 * the current handle commits. 223 * the current handle commits.
@@ -248,10 +247,29 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks);
248#define OCFS2_JOURNAL_ACCESS_WRITE 1 247#define OCFS2_JOURNAL_ACCESS_WRITE 1
249#define OCFS2_JOURNAL_ACCESS_UNDO 2 248#define OCFS2_JOURNAL_ACCESS_UNDO 2
250 249
251int ocfs2_journal_access(handle_t *handle, 250
252 struct inode *inode, 251/* ocfs2_inode */
253 struct buffer_head *bh, 252int ocfs2_journal_access_di(handle_t *handle, struct inode *inode,
254 int type); 253 struct buffer_head *bh, int type);
254/* ocfs2_extent_block */
255int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode,
256 struct buffer_head *bh, int type);
257/* ocfs2_group_desc */
258int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode,
259 struct buffer_head *bh, int type);
260/* ocfs2_xattr_block */
261int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode,
262 struct buffer_head *bh, int type);
263/* quota blocks */
264int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
265 struct buffer_head *bh, int type);
266/* dirblock */
267int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
268 struct buffer_head *bh, int type);
269/* Anything that has no ecc */
270int ocfs2_journal_access(handle_t *handle, struct inode *inode,
271 struct buffer_head *bh, int type);
272
255/* 273/*
256 * A word about the journal_access/journal_dirty "dance". It is 274 * A word about the journal_access/journal_dirty "dance". It is
257 * entirely legal to journal_access a buffer more than once (as long 275 * entirely legal to journal_access a buffer more than once (as long
@@ -273,10 +291,6 @@ int ocfs2_journal_access(handle_t *handle,
273 */ 291 */
274int ocfs2_journal_dirty(handle_t *handle, 292int ocfs2_journal_dirty(handle_t *handle,
275 struct buffer_head *bh); 293 struct buffer_head *bh);
276#ifdef CONFIG_OCFS2_COMPAT_JBD
277int ocfs2_journal_dirty_data(handle_t *handle,
278 struct buffer_head *bh);
279#endif
280 294
281/* 295/*
282 * Credit Macros: 296 * Credit Macros:
@@ -293,6 +307,37 @@ int ocfs2_journal_dirty_data(handle_t *handle,
293/* extended attribute block update */ 307/* extended attribute block update */
294#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1 308#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
295 309
310/* global quotafile inode update, data block */
311#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
312
313/*
314 * The two writes below can accidentally see the global quota info dirty
315 * due to a set_info() quotactl, so they must be prepared to write it.
316 */
317/* quota data block, global info */
318/* Write to local quota file */
319#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + 1)
320
321/* global quota data block, local quota data block, global quota inode,
322 * global quota info */
323#define OCFS2_QSYNC_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 3)
324
325static inline int ocfs2_quota_trans_credits(struct super_block *sb)
326{
327 int credits = 0;
328
329 if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA))
330 credits += OCFS2_QWRITE_CREDITS;
331 if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA))
332 credits += OCFS2_QWRITE_CREDITS;
333 return credits;
334}
335
336/* Number of credits needed for removing quota structure from file */
337int ocfs2_calc_qdel_credits(struct super_block *sb, int type);
338/* Number of credits needed for initialization of new quota structure */
339int ocfs2_calc_qinit_credits(struct super_block *sb, int type);
340
296/* group extend. inode update and last group update. */ 341/* group extend. inode update and last group update. */
297#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) 342#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
298 343
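The credit arithmetic is easier to follow with numbers plugged in. Assuming OCFS2_INODE_UPDATE_CREDITS is 1 (its value in this tree), OCFS2_QINFO_WRITE_CREDITS works out to 2 and OCFS2_QWRITE_CREDITS to 3, so a link on a filesystem with both USRQUOTA and GRPQUOTA enabled now reserves 2*1 + 1 + 2*3 = 9 credits where the old OCFS2_LINK_CREDITS constant reserved 3. A sketch of the computation:

#include <stdio.h>

#define INODE_UPDATE_CREDITS 1                          /* assumed, see above */
#define QINFO_WRITE_CREDITS (INODE_UPDATE_CREDITS + 1)  /* qf inode + block */
#define QWRITE_CREDITS      (QINFO_WRITE_CREDITS + 1)   /* + local qf block */

static int quota_trans_credits(int usrquota, int grpquota)
{
    int credits = 0;

    if (usrquota)
        credits += QWRITE_CREDITS;
    if (grpquota)
        credits += QWRITE_CREDITS;
    return credits;
}

/* ocfs2_link_credits(): two inode updates + one dir entry block + quota */
static int link_credits(int usrquota, int grpquota)
{
    return 2 * INODE_UPDATE_CREDITS + 1 +
           quota_trans_credits(usrquota, grpquota);
}

int main(void)
{
    printf("no quota: %d\n", link_credits(0, 0));  /* 3, the old constant */
    printf("usr only: %d\n", link_credits(1, 0));  /* 6 */
    printf("usr+grp : %d\n", link_credits(1, 1));  /* 9 */
    return 0;
}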
@@ -303,8 +348,11 @@ int ocfs2_journal_dirty_data(handle_t *handle,
303 * prev. group desc. if we relink. */ 348 * prev. group desc. if we relink. */
304#define OCFS2_SUBALLOC_ALLOC (3) 349#define OCFS2_SUBALLOC_ALLOC (3)
305 350
306#define OCFS2_INLINE_TO_EXTENTS_CREDITS (OCFS2_SUBALLOC_ALLOC \ 351static inline int ocfs2_inline_to_extents_credits(struct super_block *sb)
307 + OCFS2_INODE_UPDATE_CREDITS) 352{
353 return OCFS2_SUBALLOC_ALLOC + OCFS2_INODE_UPDATE_CREDITS +
354 ocfs2_quota_trans_credits(sb);
355}
308 356
309/* dinode + group descriptor update. We don't relink on free yet. */ 357/* dinode + group descriptor update. We don't relink on free yet. */
310#define OCFS2_SUBALLOC_FREE (2) 358#define OCFS2_SUBALLOC_FREE (2)
@@ -313,16 +361,23 @@ int ocfs2_journal_dirty_data(handle_t *handle,
313#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \ 361#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \
314 + OCFS2_TRUNCATE_LOG_UPDATE) 362 + OCFS2_TRUNCATE_LOG_UPDATE)
315 363
316#define OCFS2_REMOVE_EXTENT_CREDITS (OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS) 364static inline int ocfs2_remove_extent_credits(struct super_block *sb)
365{
366 return OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS +
367 ocfs2_quota_trans_credits(sb);
368}
317 369
318/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + 370/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
319 * bitmap block for the new bit) */ 371 * bitmap block for the new bit) */
320#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) 372#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
321 373
322/* parent fe, parent block, new file entry, inode alloc fe, inode alloc 374/* parent fe, parent block, new file entry, inode alloc fe, inode alloc
323 * group descriptor + mkdir/symlink blocks */ 375 * group descriptor + mkdir/symlink blocks + quota update */
324#define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC \ 376static inline int ocfs2_mknod_credits(struct super_block *sb)
325 + OCFS2_DIR_LINK_ADDITIONAL_CREDITS) 377{
378 return 3 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS +
379 ocfs2_quota_trans_credits(sb);
380}
326 381
327/* local alloc metadata change + main bitmap updates */ 382/* local alloc metadata change + main bitmap updates */
328#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \ 383#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \
@@ -332,13 +387,21 @@ int ocfs2_journal_dirty_data(handle_t *handle,
332 * for the dinode, one for the new block. */ 387 * for the dinode, one for the new block. */
333#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) 388#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
334 389
335/* file update (nlink, etc) + directory mtime/ctime + dir entry block */ 390/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
336#define OCFS2_LINK_CREDITS (2*OCFS2_INODE_UPDATE_CREDITS + 1) 391 * update on dir */
392static inline int ocfs2_link_credits(struct super_block *sb)
393{
394 return 2*OCFS2_INODE_UPDATE_CREDITS + 1 +
395 ocfs2_quota_trans_credits(sb);
396}
337 397
338/* inode + dir inode (if we unlink a dir), + dir entry block + orphan 398/* inode + dir inode (if we unlink a dir), + dir entry block + orphan
339 * dir inode link */ 399 * dir inode link */
340#define OCFS2_UNLINK_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 1 \ 400static inline int ocfs2_unlink_credits(struct super_block *sb)
341 + OCFS2_LINK_CREDITS) 401{
402 /* The quota update from ocfs2_link_credits is unused here... */
403 return 2 * OCFS2_INODE_UPDATE_CREDITS + 1 + ocfs2_link_credits(sb);
404}
342 405
343/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry + 406/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
344 * inode alloc group descriptor */ 407 * inode alloc group descriptor */
@@ -347,8 +410,10 @@ int ocfs2_journal_dirty_data(handle_t *handle,
347/* dinode update, old dir dinode update, new dir dinode update, old 410/* dinode update, old dir dinode update, new dir dinode update, old
348 * dir dir entry, new dir dir entry, dir entry update for renaming 411 * dir dir entry, new dir dir entry, dir entry update for renaming
349 * directory + target unlink */ 412 * directory + target unlink */
350#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \ 413static inline int ocfs2_rename_credits(struct super_block *sb)
351 + OCFS2_UNLINK_CREDITS) 414{
415 return 3 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_unlink_credits(sb);
416}
352 417
353/* global bitmap dinode, group desc., relinked group, 418/* global bitmap dinode, group desc., relinked group,
354 * suballocator dinode, group desc., relinked group, 419 * suballocator dinode, group desc., relinked group,
@@ -386,18 +451,19 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
386 * credit for the dinode there. */ 451 * credit for the dinode there. */
387 extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth); 452 extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
388 453
389 return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks; 454 return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks +
455 ocfs2_quota_trans_credits(sb);
390} 456}
391 457
392static inline int ocfs2_calc_symlink_credits(struct super_block *sb) 458static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
393{ 459{
394 int blocks = OCFS2_MKNOD_CREDITS; 460 int blocks = ocfs2_mknod_credits(sb);
395 461
396 /* links can be longer than one block so we may update many 462 /* links can be longer than one block so we may update many
397 * within our single allocated extent. */ 463 * within our single allocated extent. */
398 blocks += ocfs2_clusters_to_blocks(sb, 1); 464 blocks += ocfs2_clusters_to_blocks(sb, 1);
399 465
400 return blocks; 466 return blocks + ocfs2_quota_trans_credits(sb);
401} 467}
402 468
403static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb, 469static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
@@ -434,6 +500,8 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
434 /* update to the truncate log. */ 500 /* update to the truncate log. */
435 credits += OCFS2_TRUNCATE_LOG_UPDATE; 501 credits += OCFS2_TRUNCATE_LOG_UPDATE;
436 502
503 credits += ocfs2_quota_trans_credits(sb);
504
437 return credits; 505 return credits;
438} 506}
439 507
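
The journal.h hunks above convert fixed credit macros (OCFS2_MKNOD_CREDITS, OCFS2_LINK_CREDITS, and friends) into inline functions of the superblock, because the quota cost of a transaction is only known once we know which quota types the mounted volume has active. A minimal userspace sketch of that composition follows; the credit constants, the fake_sb fields, and the function bodies are illustrative stand-ins, not the kernel's definitions.

#include <stdio.h>

#define INODE_UPDATE_CREDITS 1
#define QUOTA_BLOCK_UPDATE_CREDITS 2    /* invented value */

struct fake_sb { int usrquota_on; int grpquota_on; };

/* returns 0 when no quota is active, so non-quota mounts pay nothing extra */
static int quota_trans_credits(const struct fake_sb *sb)
{
        int credits = 0;
        if (sb->usrquota_on)
                credits += QUOTA_BLOCK_UPDATE_CREDITS;
        if (sb->grpquota_on)
                credits += QUOTA_BLOCK_UPDATE_CREDITS;
        return credits;
}

/* mirrors the shape of ocfs2_link_credits(): base cost + quota cost */
static int link_credits(const struct fake_sb *sb)
{
        return 2 * INODE_UPDATE_CREDITS + 1 + quota_trans_credits(sb);
}

int main(void)
{
        struct fake_sb sb = { .usrquota_on = 1, .grpquota_on = 0 };
        printf("link credits: %d\n", link_credits(&sb));
        return 0;
}
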
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 687b28713c32..ec70cdbe77fc 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -36,6 +36,7 @@
36#include "ocfs2.h" 36#include "ocfs2.h"
37 37
38#include "alloc.h" 38#include "alloc.h"
39#include "blockcheck.h"
39#include "dlmglue.h" 40#include "dlmglue.h"
40#include "inode.h" 41#include "inode.h"
41#include "journal.h" 42#include "journal.h"
@@ -248,8 +249,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
248 goto bail; 249 goto bail;
249 } 250 }
250 251
251 status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, 252 status = ocfs2_read_inode_block_full(inode, &alloc_bh,
252 &alloc_bh, OCFS2_BH_IGNORE_CACHE); 253 OCFS2_BH_IGNORE_CACHE);
253 if (status < 0) { 254 if (status < 0) {
254 mlog_errno(status); 255 mlog_errno(status);
255 goto bail; 256 goto bail;
@@ -382,8 +383,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
382 } 383 }
383 memcpy(alloc_copy, alloc, bh->b_size); 384 memcpy(alloc_copy, alloc, bh->b_size);
384 385
385 status = ocfs2_journal_access(handle, local_alloc_inode, bh, 386 status = ocfs2_journal_access_di(handle, local_alloc_inode, bh,
386 OCFS2_JOURNAL_ACCESS_WRITE); 387 OCFS2_JOURNAL_ACCESS_WRITE);
387 if (status < 0) { 388 if (status < 0) {
388 mlog_errno(status); 389 mlog_errno(status);
389 goto out_commit; 390 goto out_commit;
@@ -459,8 +460,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
459 460
460 mutex_lock(&inode->i_mutex); 461 mutex_lock(&inode->i_mutex);
461 462
462 status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, 463 status = ocfs2_read_inode_block_full(inode, &alloc_bh,
463 &alloc_bh, OCFS2_BH_IGNORE_CACHE); 464 OCFS2_BH_IGNORE_CACHE);
464 if (status < 0) { 465 if (status < 0) {
465 mlog_errno(status); 466 mlog_errno(status);
466 goto bail; 467 goto bail;
@@ -476,6 +477,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
476 alloc = (struct ocfs2_dinode *) alloc_bh->b_data; 477 alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
477 ocfs2_clear_local_alloc(alloc); 478 ocfs2_clear_local_alloc(alloc);
478 479
480 ocfs2_compute_meta_ecc(osb->sb, alloc_bh->b_data, &alloc->i_check);
479 status = ocfs2_write_block(osb, alloc_bh, inode); 481 status = ocfs2_write_block(osb, alloc_bh, inode);
480 if (status < 0) 482 if (status < 0)
481 mlog_errno(status); 483 mlog_errno(status);
@@ -762,9 +764,9 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
762 * delete bits from it! */ 764 * delete bits from it! */
763 *num_bits = bits_wanted; 765 *num_bits = bits_wanted;
764 766
765 status = ocfs2_journal_access(handle, local_alloc_inode, 767 status = ocfs2_journal_access_di(handle, local_alloc_inode,
766 osb->local_alloc_bh, 768 osb->local_alloc_bh,
767 OCFS2_JOURNAL_ACCESS_WRITE); 769 OCFS2_JOURNAL_ACCESS_WRITE);
768 if (status < 0) { 770 if (status < 0) {
769 mlog_errno(status); 771 mlog_errno(status);
770 goto bail; 772 goto bail;
@@ -1240,9 +1242,9 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
1240 } 1242 }
1241 memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size); 1243 memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
1242 1244
1243 status = ocfs2_journal_access(handle, local_alloc_inode, 1245 status = ocfs2_journal_access_di(handle, local_alloc_inode,
1244 osb->local_alloc_bh, 1246 osb->local_alloc_bh,
1245 OCFS2_JOURNAL_ACCESS_WRITE); 1247 OCFS2_JOURNAL_ACCESS_WRITE);
1246 if (status < 0) { 1248 if (status < 0) {
1247 mlog_errno(status); 1249 mlog_errno(status);
1248 goto bail; 1250 goto bail;
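
The localalloc.c changes replace raw ocfs2_read_blocks() calls with ocfs2_read_inode_block_full() and the untyped ocfs2_journal_access() with ocfs2_journal_access_di(), funneling every inode read and write through helpers that know the block type. A freestanding sketch of the read-side wrapper idea, with invented types and a toy signature check standing in for the real validation:

#include <stdio.h>
#include <string.h>

struct fake_inode { unsigned long long ip_blkno; };
struct fake_bh    { char data[8]; };

/* low-level read, addressed by raw block number */
static int read_blocks(unsigned long long blkno, struct fake_bh *bh)
{
        (void)blkno;
        memcpy(bh->data, "OCFS2", 6);   /* pretend the disk held a dinode */
        return 0;
}

/* typed wrapper: derives the block number and validates in one place */
static int read_inode_block(struct fake_inode *inode, struct fake_bh *bh)
{
        int status = read_blocks(inode->ip_blkno, bh);
        if (status < 0)
                return status;
        /* single choke point for signature (and, later, ECC) checks */
        return strncmp(bh->data, "OCFS2", 5) ? -22 /* -EINVAL */ : 0;
}

int main(void)
{
        struct fake_inode ino = { .ip_blkno = 42 };
        struct fake_bh bh;
        printf("read status: %d\n", read_inode_block(&ino, &bh));
        return 0;
}
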
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2545e7402efe..084aba86c3b2 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -40,6 +40,7 @@
40#include <linux/types.h> 40#include <linux/types.h>
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/quotaops.h>
43 44
44#define MLOG_MASK_PREFIX ML_NAMEI 45#define MLOG_MASK_PREFIX ML_NAMEI
45#include <cluster/masklog.h> 46#include <cluster/masklog.h>
@@ -61,17 +62,18 @@
61#include "sysfile.h" 62#include "sysfile.h"
62#include "uptodate.h" 63#include "uptodate.h"
63#include "xattr.h" 64#include "xattr.h"
65#include "acl.h"
64 66
65#include "buffer_head_io.h" 67#include "buffer_head_io.h"
66 68
67static int ocfs2_mknod_locked(struct ocfs2_super *osb, 69static int ocfs2_mknod_locked(struct ocfs2_super *osb,
68 struct inode *dir, 70 struct inode *dir,
69 struct dentry *dentry, int mode, 71 struct inode *inode,
72 struct dentry *dentry,
70 dev_t dev, 73 dev_t dev,
71 struct buffer_head **new_fe_bh, 74 struct buffer_head **new_fe_bh,
72 struct buffer_head *parent_fe_bh, 75 struct buffer_head *parent_fe_bh,
73 handle_t *handle, 76 handle_t *handle,
74 struct inode **ret_inode,
75 struct ocfs2_alloc_context *inode_ac); 77 struct ocfs2_alloc_context *inode_ac);
76 78
77static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, 79static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
@@ -186,6 +188,35 @@ bail:
186 return ret; 188 return ret;
187} 189}
188 190
191static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
192{
193 struct inode *inode;
194
195 inode = new_inode(dir->i_sb);
196 if (!inode) {
197 mlog(ML_ERROR, "new_inode failed!\n");
198 return NULL;
199 }
200
201 /* populate as many fields early on as possible - many of
202 * these are used by the support functions here and in
203 * callers. */
204 if (S_ISDIR(mode))
205 inode->i_nlink = 2;
206 else
207 inode->i_nlink = 1;
208 inode->i_uid = current_fsuid();
209 if (dir->i_mode & S_ISGID) {
210 inode->i_gid = dir->i_gid;
211 if (S_ISDIR(mode))
212 mode |= S_ISGID;
213 } else
214 inode->i_gid = current_fsgid();
215 inode->i_mode = mode;
216 vfs_dq_init(inode);
217 return inode;
218}
219
189static int ocfs2_mknod(struct inode *dir, 220static int ocfs2_mknod(struct inode *dir,
190 struct dentry *dentry, 221 struct dentry *dentry,
191 int mode, 222 int mode,
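
The new ocfs2_get_init_inode() above centralizes owner and group initialization, including the classic setgid-directory rule. The same rule, restated as a runnable userspace function (fake_dir and inherit_mode are invented for illustration):

#include <stdio.h>
#include <sys/stat.h>

struct fake_dir { mode_t i_mode; unsigned i_gid; };

static mode_t inherit_mode(const struct fake_dir *dir, mode_t mode,
                           unsigned fsgid, unsigned *gid_out)
{
        if (dir->i_mode & S_ISGID) {
                *gid_out = dir->i_gid;   /* take the group from the parent */
                if (S_ISDIR(mode))
                        mode |= S_ISGID; /* new subdirectories stay setgid */
        } else {
                *gid_out = fsgid;        /* otherwise the caller's fs gid */
        }
        return mode;
}

int main(void)
{
        struct fake_dir d = { .i_mode = S_IFDIR | S_ISGID | 0775, .i_gid = 100 };
        unsigned gid;
        mode_t m = inherit_mode(&d, S_IFDIR | 0755, 1000, &gid);
        printf("mode=%o gid=%u\n", (unsigned)m, gid);
        return 0;
}
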
@@ -201,6 +232,13 @@ static int ocfs2_mknod(struct inode *dir,
201 struct inode *inode = NULL; 232 struct inode *inode = NULL;
202 struct ocfs2_alloc_context *inode_ac = NULL; 233 struct ocfs2_alloc_context *inode_ac = NULL;
203 struct ocfs2_alloc_context *data_ac = NULL; 234 struct ocfs2_alloc_context *data_ac = NULL;
235 struct ocfs2_alloc_context *xattr_ac = NULL;
236 int want_clusters = 0;
237 int xattr_credits = 0;
238 struct ocfs2_security_xattr_info si = {
239 .enable = 1,
240 };
241 int did_quota_inode = 0;
204 242
205 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, 243 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
206 (unsigned long)dev, dentry->d_name.len, 244 (unsigned long)dev, dentry->d_name.len,
@@ -250,17 +288,46 @@ static int ocfs2_mknod(struct inode *dir,
250 goto leave; 288 goto leave;
251 } 289 }
252 290
253 /* Reserve a cluster if creating an extent based directory. */ 291 inode = ocfs2_get_init_inode(dir, mode);
254 if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) { 292 if (!inode) {
255 status = ocfs2_reserve_clusters(osb, 1, &data_ac); 293 status = -ENOMEM;
256 if (status < 0) { 294 mlog_errno(status);
257 if (status != -ENOSPC) 295 goto leave;
258 mlog_errno(status); 296 }
297
298 /* get security xattr */
299 status = ocfs2_init_security_get(inode, dir, &si);
300 if (status) {
301 if (status == -EOPNOTSUPP)
302 si.enable = 0;
303 else {
304 mlog_errno(status);
259 goto leave; 305 goto leave;
260 } 306 }
261 } 307 }
262 308
263 handle = ocfs2_start_trans(osb, OCFS2_MKNOD_CREDITS); 309 /* calculate meta data/clusters for setting security and acl xattr */
310 status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode,
311 &si, &want_clusters,
312 &xattr_credits, &xattr_ac);
313 if (status < 0) {
314 mlog_errno(status);
315 goto leave;
316 }
317
318 /* Reserve a cluster if creating an extent based directory. */
319 if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb))
320 want_clusters += 1;
321
322 status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
323 if (status < 0) {
324 if (status != -ENOSPC)
325 mlog_errno(status);
326 goto leave;
327 }
328
329 handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb) +
330 xattr_credits);
264 if (IS_ERR(handle)) { 331 if (IS_ERR(handle)) {
265 status = PTR_ERR(handle); 332 status = PTR_ERR(handle);
266 handle = NULL; 333 handle = NULL;
@@ -268,10 +335,19 @@ static int ocfs2_mknod(struct inode *dir,
268 goto leave; 335 goto leave;
269 } 336 }
270 337
338 /* We don't use standard VFS wrapper because we don't want vfs_dq_init
339 * to be called. */
340 if (sb_any_quota_active(osb->sb) &&
341 osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
342 status = -EDQUOT;
343 goto leave;
344 }
345 did_quota_inode = 1;
346
271 /* do the real work now. */ 347 /* do the real work now. */
272 status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev, 348 status = ocfs2_mknod_locked(osb, dir, inode, dentry, dev,
273 &new_fe_bh, parent_fe_bh, handle, 349 &new_fe_bh, parent_fe_bh, handle,
274 &inode, inode_ac); 350 inode_ac);
275 if (status < 0) { 351 if (status < 0) {
276 mlog_errno(status); 352 mlog_errno(status);
277 goto leave; 353 goto leave;
@@ -285,8 +361,8 @@ static int ocfs2_mknod(struct inode *dir,
285 goto leave; 361 goto leave;
286 } 362 }
287 363
288 status = ocfs2_journal_access(handle, dir, parent_fe_bh, 364 status = ocfs2_journal_access_di(handle, dir, parent_fe_bh,
289 OCFS2_JOURNAL_ACCESS_WRITE); 365 OCFS2_JOURNAL_ACCESS_WRITE);
290 if (status < 0) { 366 if (status < 0) {
291 mlog_errno(status); 367 mlog_errno(status);
292 goto leave; 368 goto leave;
@@ -300,6 +376,22 @@ static int ocfs2_mknod(struct inode *dir,
300 inc_nlink(dir); 376 inc_nlink(dir);
301 } 377 }
302 378
379 status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
380 xattr_ac, data_ac);
381 if (status < 0) {
382 mlog_errno(status);
383 goto leave;
384 }
385
386 if (si.enable) {
387 status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
388 xattr_ac, data_ac);
389 if (status < 0) {
390 mlog_errno(status);
391 goto leave;
392 }
393 }
394
303 status = ocfs2_add_entry(handle, dentry, inode, 395 status = ocfs2_add_entry(handle, dentry, inode,
304 OCFS2_I(inode)->ip_blkno, parent_fe_bh, 396 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
305 de_bh); 397 de_bh);
@@ -320,6 +412,8 @@ static int ocfs2_mknod(struct inode *dir,
320 d_instantiate(dentry, inode); 412 d_instantiate(dentry, inode);
321 status = 0; 413 status = 0;
322leave: 414leave:
415 if (status < 0 && did_quota_inode)
416 vfs_dq_free_inode(inode);
323 if (handle) 417 if (handle)
324 ocfs2_commit_trans(osb, handle); 418 ocfs2_commit_trans(osb, handle);
325 419
@@ -331,9 +425,13 @@ leave:
331 brelse(new_fe_bh); 425 brelse(new_fe_bh);
332 brelse(de_bh); 426 brelse(de_bh);
333 brelse(parent_fe_bh); 427 brelse(parent_fe_bh);
428 kfree(si.name);
429 kfree(si.value);
334 430
335 if ((status < 0) && inode) 431 if ((status < 0) && inode) {
432 clear_nlink(inode);
336 iput(inode); 433 iput(inode);
434 }
337 435
338 if (inode_ac) 436 if (inode_ac)
339 ocfs2_free_alloc_context(inode_ac); 437 ocfs2_free_alloc_context(inode_ac);
@@ -341,6 +439,9 @@ leave:
341 if (data_ac) 439 if (data_ac)
342 ocfs2_free_alloc_context(data_ac); 440 ocfs2_free_alloc_context(data_ac);
343 441
442 if (xattr_ac)
443 ocfs2_free_alloc_context(xattr_ac);
444
344 mlog_exit(status); 445 mlog_exit(status);
345 446
346 return status; 447 return status;
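
The reworked ocfs2_mknod() accumulates all cluster needs (security/ACL xattrs plus the first directory extent) into want_clusters, reserves them in a single call, and starts the transaction with the base mknod credits plus the xattr credits. A condensed, self-contained sketch of that flow; every function and constant below is a stand-in with an invented body:

#include <stdio.h>

static int calc_xattr_init(int *want_clusters, int *xattr_credits)
{
        *want_clusters += 1;    /* pretend the security xattr needs a cluster */
        *xattr_credits += 3;    /* and a few extra journal credits */
        return 0;
}

static int reserve_clusters(int n)
{
        printf("reserve %d clusters in one call\n", n);
        return 0;
}

static int start_trans(int credits)
{
        printf("start handle with %d credits\n", credits);
        return 0;
}

int main(void)
{
        int want_clusters = 0, xattr_credits = 0;
        const int is_dir = 1, inline_data = 0;
        const int base_mknod_credits = 10;      /* invented stand-in */

        if (calc_xattr_init(&want_clusters, &xattr_credits) < 0)
                return 1;
        if (is_dir && !inline_data)
                want_clusters += 1;     /* first extent of the new directory */
        if (reserve_clusters(want_clusters) < 0)
                return 1;
        return start_trans(base_mknod_credits + xattr_credits);
}
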
@@ -348,12 +449,12 @@ leave:
348 449
349static int ocfs2_mknod_locked(struct ocfs2_super *osb, 450static int ocfs2_mknod_locked(struct ocfs2_super *osb,
350 struct inode *dir, 451 struct inode *dir,
351 struct dentry *dentry, int mode, 452 struct inode *inode,
453 struct dentry *dentry,
352 dev_t dev, 454 dev_t dev,
353 struct buffer_head **new_fe_bh, 455 struct buffer_head **new_fe_bh,
354 struct buffer_head *parent_fe_bh, 456 struct buffer_head *parent_fe_bh,
355 handle_t *handle, 457 handle_t *handle,
356 struct inode **ret_inode,
357 struct ocfs2_alloc_context *inode_ac) 458 struct ocfs2_alloc_context *inode_ac)
358{ 459{
359 int status = 0; 460 int status = 0;
@@ -361,14 +462,12 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
361 struct ocfs2_extent_list *fel; 462 struct ocfs2_extent_list *fel;
362 u64 fe_blkno = 0; 463 u64 fe_blkno = 0;
363 u16 suballoc_bit; 464 u16 suballoc_bit;
364 struct inode *inode = NULL;
365 465
366 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, 466 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
367 (unsigned long)dev, dentry->d_name.len, 467 inode->i_mode, (unsigned long)dev, dentry->d_name.len,
368 dentry->d_name.name); 468 dentry->d_name.name);
369 469
370 *new_fe_bh = NULL; 470 *new_fe_bh = NULL;
371 *ret_inode = NULL;
372 471
373 status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit, 472 status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
374 &fe_blkno); 473 &fe_blkno);
@@ -377,23 +476,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
377 goto leave; 476 goto leave;
378 } 477 }
379 478
380 inode = new_inode(dir->i_sb);
381 if (!inode) {
382 status = -ENOMEM;
383 mlog(ML_ERROR, "new_inode failed!\n");
384 goto leave;
385 }
386
387 /* populate as many fields early on as possible - many of 479 /* populate as many fields early on as possible - many of
388 * these are used by the support functions here and in 480 * these are used by the support functions here and in
389 * callers. */ 481 * callers. */
390 inode->i_ino = ino_from_blkno(osb->sb, fe_blkno); 482 inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
391 OCFS2_I(inode)->ip_blkno = fe_blkno; 483 OCFS2_I(inode)->ip_blkno = fe_blkno;
392 if (S_ISDIR(mode))
393 inode->i_nlink = 2;
394 else
395 inode->i_nlink = 1;
396 inode->i_mode = mode;
397 spin_lock(&osb->osb_lock); 484 spin_lock(&osb->osb_lock);
398 inode->i_generation = osb->s_next_generation++; 485 inode->i_generation = osb->s_next_generation++;
399 spin_unlock(&osb->osb_lock); 486 spin_unlock(&osb->osb_lock);
@@ -406,8 +493,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
406 } 493 }
407 ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh); 494 ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh);
408 495
409 status = ocfs2_journal_access(handle, inode, *new_fe_bh, 496 status = ocfs2_journal_access_di(handle, inode, *new_fe_bh,
410 OCFS2_JOURNAL_ACCESS_CREATE); 497 OCFS2_JOURNAL_ACCESS_CREATE);
411 if (status < 0) { 498 if (status < 0) {
412 mlog_errno(status); 499 mlog_errno(status);
413 goto leave; 500 goto leave;
@@ -421,17 +508,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
421 fe->i_blkno = cpu_to_le64(fe_blkno); 508 fe->i_blkno = cpu_to_le64(fe_blkno);
422 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); 509 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
423 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); 510 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
424 fe->i_uid = cpu_to_le32(current_fsuid()); 511 fe->i_uid = cpu_to_le32(inode->i_uid);
425 if (dir->i_mode & S_ISGID) { 512 fe->i_gid = cpu_to_le32(inode->i_gid);
426 fe->i_gid = cpu_to_le32(dir->i_gid); 513 fe->i_mode = cpu_to_le16(inode->i_mode);
427 if (S_ISDIR(mode)) 514 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
428 mode |= S_ISGID;
429 } else
430 fe->i_gid = cpu_to_le32(current_fsgid());
431 fe->i_mode = cpu_to_le16(mode);
432 if (S_ISCHR(mode) || S_ISBLK(mode))
433 fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); 515 fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
434
435 fe->i_links_count = cpu_to_le16(inode->i_nlink); 516 fe->i_links_count = cpu_to_le16(inode->i_nlink);
436 517
437 fe->i_last_eb_blk = 0; 518 fe->i_last_eb_blk = 0;
@@ -446,7 +527,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
446 /* 527 /*
447 * If supported, directories start with inline data. 528 * If supported, directories start with inline data.
448 */ 529 */
449 if (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) { 530 if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) {
450 u16 feat = le16_to_cpu(fe->i_dyn_features); 531 u16 feat = le16_to_cpu(fe->i_dyn_features);
451 532
452 fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL); 533 fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
@@ -465,15 +546,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
465 goto leave; 546 goto leave;
466 } 547 }
467 548
468 if (ocfs2_populate_inode(inode, fe, 1) < 0) { 549 ocfs2_populate_inode(inode, fe, 1);
469 mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, "
470 "i_blkno=%llu, i_ino=%lu\n",
471 (unsigned long long)(*new_fe_bh)->b_blocknr,
472 (unsigned long long)le64_to_cpu(fe->i_blkno),
473 inode->i_ino);
474 BUG();
475 }
476
477 ocfs2_inode_set_new(osb, inode); 550 ocfs2_inode_set_new(osb, inode);
478 if (!ocfs2_mount_local(osb)) { 551 if (!ocfs2_mount_local(osb)) {
479 status = ocfs2_create_new_inode_locks(inode); 552 status = ocfs2_create_new_inode_locks(inode);
@@ -484,17 +557,12 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
484 status = 0; /* error in ocfs2_create_new_inode_locks is not 557 status = 0; /* error in ocfs2_create_new_inode_locks is not
485 * critical */ 558 * critical */
486 559
487 *ret_inode = inode;
488leave: 560leave:
489 if (status < 0) { 561 if (status < 0) {
490 if (*new_fe_bh) { 562 if (*new_fe_bh) {
491 brelse(*new_fe_bh); 563 brelse(*new_fe_bh);
492 *new_fe_bh = NULL; 564 *new_fe_bh = NULL;
493 } 565 }
494 if (inode) {
495 clear_nlink(inode);
496 iput(inode);
497 }
498 } 566 }
499 567
500 mlog_exit(status); 568 mlog_exit(status);
@@ -588,7 +656,7 @@ static int ocfs2_link(struct dentry *old_dentry,
588 goto out_unlock_inode; 656 goto out_unlock_inode;
589 } 657 }
590 658
591 handle = ocfs2_start_trans(osb, OCFS2_LINK_CREDITS); 659 handle = ocfs2_start_trans(osb, ocfs2_link_credits(osb->sb));
592 if (IS_ERR(handle)) { 660 if (IS_ERR(handle)) {
593 err = PTR_ERR(handle); 661 err = PTR_ERR(handle);
594 handle = NULL; 662 handle = NULL;
@@ -596,8 +664,8 @@ static int ocfs2_link(struct dentry *old_dentry,
596 goto out_unlock_inode; 664 goto out_unlock_inode;
597 } 665 }
598 666
599 err = ocfs2_journal_access(handle, inode, fe_bh, 667 err = ocfs2_journal_access_di(handle, inode, fe_bh,
600 OCFS2_JOURNAL_ACCESS_WRITE); 668 OCFS2_JOURNAL_ACCESS_WRITE);
601 if (err < 0) { 669 if (err < 0) {
602 mlog_errno(err); 670 mlog_errno(err);
603 goto out_commit; 671 goto out_commit;
@@ -775,7 +843,7 @@ static int ocfs2_unlink(struct inode *dir,
775 } 843 }
776 } 844 }
777 845
778 handle = ocfs2_start_trans(osb, OCFS2_UNLINK_CREDITS); 846 handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb));
779 if (IS_ERR(handle)) { 847 if (IS_ERR(handle)) {
780 status = PTR_ERR(handle); 848 status = PTR_ERR(handle);
781 handle = NULL; 849 handle = NULL;
@@ -783,8 +851,8 @@ static int ocfs2_unlink(struct inode *dir,
783 goto leave; 851 goto leave;
784 } 852 }
785 853
786 status = ocfs2_journal_access(handle, inode, fe_bh, 854 status = ocfs2_journal_access_di(handle, inode, fe_bh,
787 OCFS2_JOURNAL_ACCESS_WRITE); 855 OCFS2_JOURNAL_ACCESS_WRITE);
788 if (status < 0) { 856 if (status < 0) {
789 mlog_errno(status); 857 mlog_errno(status);
790 goto leave; 858 goto leave;
@@ -1181,7 +1249,7 @@ static int ocfs2_rename(struct inode *old_dir,
1181 } 1249 }
1182 } 1250 }
1183 1251
1184 handle = ocfs2_start_trans(osb, OCFS2_RENAME_CREDITS); 1252 handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb));
1185 if (IS_ERR(handle)) { 1253 if (IS_ERR(handle)) {
1186 status = PTR_ERR(handle); 1254 status = PTR_ERR(handle);
1187 handle = NULL; 1255 handle = NULL;
@@ -1197,8 +1265,8 @@ static int ocfs2_rename(struct inode *old_dir,
1197 goto bail; 1265 goto bail;
1198 } 1266 }
1199 } 1267 }
1200 status = ocfs2_journal_access(handle, new_inode, newfe_bh, 1268 status = ocfs2_journal_access_di(handle, new_inode, newfe_bh,
1201 OCFS2_JOURNAL_ACCESS_WRITE); 1269 OCFS2_JOURNAL_ACCESS_WRITE);
1202 if (status < 0) { 1270 if (status < 0) {
1203 mlog_errno(status); 1271 mlog_errno(status);
1204 goto bail; 1272 goto bail;
@@ -1244,8 +1312,8 @@ static int ocfs2_rename(struct inode *old_dir,
1244 old_inode->i_ctime = CURRENT_TIME; 1312 old_inode->i_ctime = CURRENT_TIME;
1245 mark_inode_dirty(old_inode); 1313 mark_inode_dirty(old_inode);
1246 1314
1247 status = ocfs2_journal_access(handle, old_inode, old_inode_bh, 1315 status = ocfs2_journal_access_di(handle, old_inode, old_inode_bh,
1248 OCFS2_JOURNAL_ACCESS_WRITE); 1316 OCFS2_JOURNAL_ACCESS_WRITE);
1249 if (status >= 0) { 1317 if (status >= 0) {
1250 old_di = (struct ocfs2_dinode *) old_inode_bh->b_data; 1318 old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
1251 1319
@@ -1321,9 +1389,9 @@ static int ocfs2_rename(struct inode *old_dir,
1321 (int)old_dir_nlink, old_dir->i_nlink); 1389 (int)old_dir_nlink, old_dir->i_nlink);
1322 } else { 1390 } else {
1323 struct ocfs2_dinode *fe; 1391 struct ocfs2_dinode *fe;
1324 status = ocfs2_journal_access(handle, old_dir, 1392 status = ocfs2_journal_access_di(handle, old_dir,
1325 old_dir_bh, 1393 old_dir_bh,
1326 OCFS2_JOURNAL_ACCESS_WRITE); 1394 OCFS2_JOURNAL_ACCESS_WRITE);
1327 fe = (struct ocfs2_dinode *) old_dir_bh->b_data; 1395 fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
1328 fe->i_links_count = cpu_to_le16(old_dir->i_nlink); 1396 fe->i_links_count = cpu_to_le16(old_dir->i_nlink);
1329 status = ocfs2_journal_dirty(handle, old_dir_bh); 1397 status = ocfs2_journal_dirty(handle, old_dir_bh);
@@ -1496,6 +1564,13 @@ static int ocfs2_symlink(struct inode *dir,
1496 handle_t *handle = NULL; 1564 handle_t *handle = NULL;
1497 struct ocfs2_alloc_context *inode_ac = NULL; 1565 struct ocfs2_alloc_context *inode_ac = NULL;
1498 struct ocfs2_alloc_context *data_ac = NULL; 1566 struct ocfs2_alloc_context *data_ac = NULL;
1567 struct ocfs2_alloc_context *xattr_ac = NULL;
1568 int want_clusters = 0;
1569 int xattr_credits = 0;
1570 struct ocfs2_security_xattr_info si = {
1571 .enable = 1,
1572 };
1573 int did_quota = 0, did_quota_inode = 0;
1499 1574
1500 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, 1575 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
1501 dentry, symname, dentry->d_name.len, dentry->d_name.name); 1576 dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1542,17 +1617,46 @@ static int ocfs2_symlink(struct inode *dir,
1542 goto bail; 1617 goto bail;
1543 } 1618 }
1544 1619
1545 /* don't reserve bitmap space for fast symlinks. */ 1620 inode = ocfs2_get_init_inode(dir, S_IFLNK | S_IRWXUGO);
1546 if (l > ocfs2_fast_symlink_chars(sb)) { 1621 if (!inode) {
1547 status = ocfs2_reserve_clusters(osb, 1, &data_ac); 1622 status = -ENOMEM;
1623 mlog_errno(status);
1624 goto bail;
1625 }
1626
1627 /* get security xattr */
1628 status = ocfs2_init_security_get(inode, dir, &si);
1629 if (status) {
1630 if (status == -EOPNOTSUPP)
1631 si.enable = 0;
1632 else {
1633 mlog_errno(status);
1634 goto bail;
1635 }
1636 }
1637
1638 /* calculate meta data/clusters for setting security xattr */
1639 if (si.enable) {
1640 status = ocfs2_calc_security_init(dir, &si, &want_clusters,
1641 &xattr_credits, &xattr_ac);
1548 if (status < 0) { 1642 if (status < 0) {
1549 if (status != -ENOSPC) 1643 mlog_errno(status);
1550 mlog_errno(status);
1551 goto bail; 1644 goto bail;
1552 } 1645 }
1553 } 1646 }
1554 1647
1555 handle = ocfs2_start_trans(osb, credits); 1648 /* don't reserve bitmap space for fast symlinks. */
1649 if (l > ocfs2_fast_symlink_chars(sb))
1650 want_clusters += 1;
1651
1652 status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
1653 if (status < 0) {
1654 if (status != -ENOSPC)
1655 mlog_errno(status);
1656 goto bail;
1657 }
1658
1659 handle = ocfs2_start_trans(osb, credits + xattr_credits);
1556 if (IS_ERR(handle)) { 1660 if (IS_ERR(handle)) {
1557 status = PTR_ERR(handle); 1661 status = PTR_ERR(handle);
1558 handle = NULL; 1662 handle = NULL;
@@ -1560,10 +1664,18 @@ static int ocfs2_symlink(struct inode *dir,
1560 goto bail; 1664 goto bail;
1561 } 1665 }
1562 1666
1563 status = ocfs2_mknod_locked(osb, dir, dentry, 1667 /* We don't use standard VFS wrapper because we don't want vfs_dq_init
1564 S_IFLNK | S_IRWXUGO, 0, 1668 * to be called. */
1565 &new_fe_bh, parent_fe_bh, handle, 1669 if (sb_any_quota_active(osb->sb) &&
1566 &inode, inode_ac); 1670 osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
1671 status = -EDQUOT;
1672 goto bail;
1673 }
1674 did_quota_inode = 1;
1675
1676 status = ocfs2_mknod_locked(osb, dir, inode, dentry,
1677 0, &new_fe_bh, parent_fe_bh, handle,
1678 inode_ac);
1567 if (status < 0) { 1679 if (status < 0) {
1568 mlog_errno(status); 1680 mlog_errno(status);
1569 goto bail; 1681 goto bail;
@@ -1576,6 +1688,12 @@ static int ocfs2_symlink(struct inode *dir,
1576 u32 offset = 0; 1688 u32 offset = 0;
1577 1689
1578 inode->i_op = &ocfs2_symlink_inode_operations; 1690 inode->i_op = &ocfs2_symlink_inode_operations;
1691 if (vfs_dq_alloc_space_nodirty(inode,
1692 ocfs2_clusters_to_bytes(osb->sb, 1))) {
1693 status = -EDQUOT;
1694 goto bail;
1695 }
1696 did_quota = 1;
1579 status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0, 1697 status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
1580 new_fe_bh, 1698 new_fe_bh,
1581 handle, data_ac, NULL, 1699 handle, data_ac, NULL,
@@ -1614,6 +1732,15 @@ static int ocfs2_symlink(struct inode *dir,
1614 } 1732 }
1615 } 1733 }
1616 1734
1735 if (si.enable) {
1736 status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
1737 xattr_ac, data_ac);
1738 if (status < 0) {
1739 mlog_errno(status);
1740 goto bail;
1741 }
1742 }
1743
1617 status = ocfs2_add_entry(handle, dentry, inode, 1744 status = ocfs2_add_entry(handle, dentry, inode,
1618 le64_to_cpu(fe->i_blkno), parent_fe_bh, 1745 le64_to_cpu(fe->i_blkno), parent_fe_bh,
1619 de_bh); 1746 de_bh);
@@ -1632,6 +1759,11 @@ static int ocfs2_symlink(struct inode *dir,
1632 dentry->d_op = &ocfs2_dentry_ops; 1759 dentry->d_op = &ocfs2_dentry_ops;
1633 d_instantiate(dentry, inode); 1760 d_instantiate(dentry, inode);
1634bail: 1761bail:
1762 if (status < 0 && did_quota)
1763 vfs_dq_free_space_nodirty(inode,
1764 ocfs2_clusters_to_bytes(osb->sb, 1));
1765 if (status < 0 && did_quota_inode)
1766 vfs_dq_free_inode(inode);
1635 if (handle) 1767 if (handle)
1636 ocfs2_commit_trans(osb, handle); 1768 ocfs2_commit_trans(osb, handle);
1637 1769
@@ -1640,12 +1772,18 @@ bail:
1640 brelse(new_fe_bh); 1772 brelse(new_fe_bh);
1641 brelse(parent_fe_bh); 1773 brelse(parent_fe_bh);
1642 brelse(de_bh); 1774 brelse(de_bh);
1775 kfree(si.name);
1776 kfree(si.value);
1643 if (inode_ac) 1777 if (inode_ac)
1644 ocfs2_free_alloc_context(inode_ac); 1778 ocfs2_free_alloc_context(inode_ac);
1645 if (data_ac) 1779 if (data_ac)
1646 ocfs2_free_alloc_context(data_ac); 1780 ocfs2_free_alloc_context(data_ac);
1647 if ((status < 0) && inode) 1781 if (xattr_ac)
1782 ocfs2_free_alloc_context(xattr_ac);
1783 if ((status < 0) && inode) {
1784 clear_nlink(inode);
1648 iput(inode); 1785 iput(inode);
1786 }
1649 1787
1650 mlog_exit(status); 1788 mlog_exit(status);
1651 1789
@@ -1754,16 +1892,14 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1754 1892
1755 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 1893 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
1756 1894
1757 status = ocfs2_read_block(orphan_dir_inode, 1895 status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh);
1758 OCFS2_I(orphan_dir_inode)->ip_blkno,
1759 &orphan_dir_bh);
1760 if (status < 0) { 1896 if (status < 0) {
1761 mlog_errno(status); 1897 mlog_errno(status);
1762 goto leave; 1898 goto leave;
1763 } 1899 }
1764 1900
1765 status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh, 1901 status = ocfs2_journal_access_di(handle, orphan_dir_inode, orphan_dir_bh,
1766 OCFS2_JOURNAL_ACCESS_WRITE); 1902 OCFS2_JOURNAL_ACCESS_WRITE);
1767 if (status < 0) { 1903 if (status < 0) {
1768 mlog_errno(status); 1904 mlog_errno(status);
1769 goto leave; 1905 goto leave;
@@ -1850,8 +1986,8 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
1850 goto leave; 1986 goto leave;
1851 } 1987 }
1852 1988
1853 status = ocfs2_journal_access(handle,orphan_dir_inode, orphan_dir_bh, 1989 status = ocfs2_journal_access_di(handle,orphan_dir_inode, orphan_dir_bh,
1854 OCFS2_JOURNAL_ACCESS_WRITE); 1990 OCFS2_JOURNAL_ACCESS_WRITE);
1855 if (status < 0) { 1991 if (status < 0) {
1856 mlog_errno(status); 1992 mlog_errno(status);
1857 goto leave; 1993 goto leave;
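
A pattern worth noting across the namei.c changes: the inode (and, for long symlinks, a cluster of space) is charged to quota before the real work, guarded by the did_quota_inode/did_quota flags, and refunded on any failure path before the handle is committed. A toy restatement of the charge-then-roll-back shape; the quota calls below are invented names, not the kernel API:

#include <stdio.h>

static int quota_alloc_inode(void)  { printf("charge 1 inode to quota\n"); return 0; }
static void quota_free_inode(void)  { printf("refund 1 inode to quota\n"); }
static int do_real_work(void)       { return -1; /* simulate a failure */ }

int main(void)
{
        int did_quota_inode = 0;
        int status;

        status = quota_alloc_inode();
        if (status < 0)
                return 1;               /* the kernel returns -EDQUOT here */
        did_quota_inode = 1;

        status = do_real_work();

        /* failure path: release the charge before tearing everything down */
        if (status < 0 && did_quota_inode)
                quota_free_inode();
        return status < 0;
}
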
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 3fed9e3d8992..ad5c24a29edd 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -161,6 +161,7 @@ enum ocfs2_vol_state
161{ 161{
162 VOLUME_INIT = 0, 162 VOLUME_INIT = 0,
163 VOLUME_MOUNTED, 163 VOLUME_MOUNTED,
164 VOLUME_MOUNTED_QUOTAS,
164 VOLUME_DISMOUNTED, 165 VOLUME_DISMOUNTED,
165 VOLUME_DISABLED 166 VOLUME_DISABLED
166}; 167};
@@ -195,6 +196,9 @@ enum ocfs2_mount_options
195 OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ 196 OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
196 OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */ 197 OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
197 OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */ 198 OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */
199 OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* POSIX access control lists */
200 OCFS2_MOUNT_USRQUOTA = 1 << 9, /* We support user quotas */
201 OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */
198}; 202};
199 203
200#define OCFS2_OSB_SOFT_RO 0x0001 204#define OCFS2_OSB_SOFT_RO 0x0001
@@ -205,6 +209,7 @@ enum ocfs2_mount_options
205struct ocfs2_journal; 209struct ocfs2_journal;
206struct ocfs2_slot_info; 210struct ocfs2_slot_info;
207struct ocfs2_recovery_map; 211struct ocfs2_recovery_map;
212struct ocfs2_quota_recovery;
208struct ocfs2_super 213struct ocfs2_super
209{ 214{
210 struct task_struct *commit_task; 215 struct task_struct *commit_task;
@@ -286,10 +291,11 @@ struct ocfs2_super
286 char *local_alloc_debug_buf; 291 char *local_alloc_debug_buf;
287#endif 292#endif
288 293
289 /* Next two fields are for local node slot recovery during 294 /* Next three fields are for local node slot recovery during
290 * mount. */ 295 * mount. */
291 int dirty; 296 int dirty;
292 struct ocfs2_dinode *local_alloc_copy; 297 struct ocfs2_dinode *local_alloc_copy;
298 struct ocfs2_quota_recovery *quota_rec;
293 299
294 struct ocfs2_alloc_stats alloc_stats; 300 struct ocfs2_alloc_stats alloc_stats;
295 char dev_str[20]; /* "major,minor" of the device */ 301 char dev_str[20]; /* "major,minor" of the device */
@@ -333,6 +339,10 @@ struct ocfs2_super
333 339
334#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) 340#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
335 341
342/* Useful typedef for passing around journal access functions */
343typedef int (*ocfs2_journal_access_func)(handle_t *handle, struct inode *inode,
344 struct buffer_head *bh, int type);
345
336static inline int ocfs2_should_order_data(struct inode *inode) 346static inline int ocfs2_should_order_data(struct inode *inode)
337{ 347{
338 if (!S_ISREG(inode->i_mode)) 348 if (!S_ISREG(inode->i_mode))
@@ -376,6 +386,13 @@ static inline int ocfs2_supports_xattr(struct ocfs2_super *osb)
376 return 0; 386 return 0;
377} 387}
378 388
389static inline int ocfs2_meta_ecc(struct ocfs2_super *osb)
390{
391 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_META_ECC)
392 return 1;
393 return 0;
394}
395
379/* set / clear functions because cluster events can make these happen 396/* set / clear functions because cluster events can make these happen
380 * in parallel so we want the transitions to be atomic. this also 397 * in parallel so we want the transitions to be atomic. this also
381 * means that any future flags osb_flags must be protected by spinlock 398 * means that any future flags osb_flags must be protected by spinlock
@@ -443,39 +460,19 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
443#define OCFS2_IS_VALID_DINODE(ptr) \ 460#define OCFS2_IS_VALID_DINODE(ptr) \
444 (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) 461 (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
445 462
446#define OCFS2_RO_ON_INVALID_DINODE(__sb, __di) do { \
447 typeof(__di) ____di = (__di); \
448 ocfs2_error((__sb), \
449 "Dinode # %llu has bad signature %.*s", \
450 (unsigned long long)le64_to_cpu((____di)->i_blkno), 7, \
451 (____di)->i_signature); \
452} while (0)
453
454#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \ 463#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \
455 (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE)) 464 (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
456 465
457#define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb) do { \
458 typeof(__eb) ____eb = (__eb); \
459 ocfs2_error((__sb), \
460 "Extent Block # %llu has bad signature %.*s", \
461 (unsigned long long)le64_to_cpu((____eb)->h_blkno), 7, \
462 (____eb)->h_signature); \
463} while (0)
464
465#define OCFS2_IS_VALID_GROUP_DESC(ptr) \ 466#define OCFS2_IS_VALID_GROUP_DESC(ptr) \
466 (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE)) 467 (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
467 468
468#define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd) do { \
469 typeof(__gd) ____gd = (__gd); \
470 ocfs2_error((__sb), \
471 "Group Descriptor # %llu has bad signature %.*s", \
472 (unsigned long long)le64_to_cpu((____gd)->bg_blkno), 7, \
473 (____gd)->bg_signature); \
474} while (0)
475 469
476#define OCFS2_IS_VALID_XATTR_BLOCK(ptr) \ 470#define OCFS2_IS_VALID_XATTR_BLOCK(ptr) \
477 (!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE)) 471 (!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE))
478 472
473#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \
474 (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE))
475
479static inline unsigned long ino_from_blkno(struct super_block *sb, 476static inline unsigned long ino_from_blkno(struct super_block *sb,
480 u64 blkno) 477 u64 blkno)
481{ 478{
@@ -632,5 +629,6 @@ static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
632#define ocfs2_clear_bit ext2_clear_bit 629#define ocfs2_clear_bit ext2_clear_bit
633#define ocfs2_test_bit ext2_test_bit 630#define ocfs2_test_bit ext2_test_bit
634#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit 631#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
632#define ocfs2_find_next_bit ext2_find_next_bit
635#endif /* OCFS2_H */ 633#endif /* OCFS2_H */
636 634
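
The new ocfs2_journal_access_func typedef in ocfs2.h lets generic code hold per-block-type journal access functions behind one signature. A compilable sketch of that dispatch, with placeholder types and bodies:

#include <stdio.h>

typedef struct handle handle_t;         /* opaque, as in the kernel */
struct inode;
struct buffer_head;

typedef int (*ocfs2_journal_access_func)(handle_t *handle, struct inode *inode,
                                         struct buffer_head *bh, int type);

static int access_dinode(handle_t *h, struct inode *i,
                         struct buffer_head *b, int type)
{
        (void)h; (void)i; (void)b;
        printf("dinode access, type %d\n", type);
        return 0;
}

static int access_group_desc(handle_t *h, struct inode *i,
                             struct buffer_head *b, int type)
{
        (void)h; (void)i; (void)b;
        printf("group descriptor access, type %d\n", type);
        return 0;
}

int main(void)
{
        ocfs2_journal_access_func fns[] = { access_dinode, access_group_desc };
        for (int k = 0; k < 2; k++)
                fns[k](NULL, NULL, NULL, 1 /* e.g. a WRITE access type */);
        return 0;
}
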
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 5e0c0d0aef7d..c7ae45aaa36c 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -65,6 +65,7 @@
65#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01" 65#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01"
66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" 66#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
67#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" 67#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01"
68#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1"
68 69
69/* Compatibility flags */ 70/* Compatibility flags */
70#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ 71#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
@@ -93,8 +94,11 @@
93 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \ 94 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
94 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ 95 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
95 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ 96 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
96 | OCFS2_FEATURE_INCOMPAT_XATTR) 97 | OCFS2_FEATURE_INCOMPAT_XATTR \
97#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 98 | OCFS2_FEATURE_INCOMPAT_META_ECC)
99#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
100 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
101 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
98 102
99/* 103/*
100 * Heartbeat-only devices are missing journals and other files. The 104 * Heartbeat-only devices are missing journals and other files. The
@@ -147,6 +151,9 @@
147/* Support for extended attributes */ 151/* Support for extended attributes */
148#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 152#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200
149 153
154/* Metadata checksum and error correction */
155#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800
156
150/* 157/*
151 * backup superblock flag is used to indicate that this volume 158 * backup superblock flag is used to indicate that this volume
152 * has backup superblocks. 159 * has backup superblocks.
@@ -163,6 +170,12 @@
163 */ 170 */
164#define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001 171#define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001
165 172
173/*
174 * Maintain quota information for this filesystem
175 */
176#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002
177#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004
178
166/* The byte offset of the first backup block will be 1G. 179/* The byte offset of the first backup block will be 1G.
167 * The following will be 4G, 16G, 64G, 256G and 1T. 180 * The following will be 4G, 16G, 64G, 256G and 1T.
168 */ 181 */
@@ -192,6 +205,7 @@
192#define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */ 205#define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */
193#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ 206#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */
194#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ 207#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */
208#define OCFS2_QUOTA_FL (0x00001000) /* Quota file */
195 209
196/* 210/*
197 * Flags on ocfs2_dinode.i_dyn_features 211 * Flags on ocfs2_dinode.i_dyn_features
@@ -329,13 +343,17 @@ enum {
329#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE 343#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
330 HEARTBEAT_SYSTEM_INODE, 344 HEARTBEAT_SYSTEM_INODE,
331 GLOBAL_BITMAP_SYSTEM_INODE, 345 GLOBAL_BITMAP_SYSTEM_INODE,
332#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE 346 USER_QUOTA_SYSTEM_INODE,
347 GROUP_QUOTA_SYSTEM_INODE,
348#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE
333 ORPHAN_DIR_SYSTEM_INODE, 349 ORPHAN_DIR_SYSTEM_INODE,
334 EXTENT_ALLOC_SYSTEM_INODE, 350 EXTENT_ALLOC_SYSTEM_INODE,
335 INODE_ALLOC_SYSTEM_INODE, 351 INODE_ALLOC_SYSTEM_INODE,
336 JOURNAL_SYSTEM_INODE, 352 JOURNAL_SYSTEM_INODE,
337 LOCAL_ALLOC_SYSTEM_INODE, 353 LOCAL_ALLOC_SYSTEM_INODE,
338 TRUNCATE_LOG_SYSTEM_INODE, 354 TRUNCATE_LOG_SYSTEM_INODE,
355 LOCAL_USER_QUOTA_SYSTEM_INODE,
356 LOCAL_GROUP_QUOTA_SYSTEM_INODE,
339 NUM_SYSTEM_INODES 357 NUM_SYSTEM_INODES
340}; 358};
341 359
@@ -349,6 +367,8 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
349 [SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 }, 367 [SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 },
350 [HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 }, 368 [HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 },
351 [GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 }, 369 [GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 },
370 [USER_QUOTA_SYSTEM_INODE] = { "aquota.user", OCFS2_QUOTA_FL, S_IFREG | 0644 },
371 [GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group", OCFS2_QUOTA_FL, S_IFREG | 0644 },
352 372
353 /* Slot-specific system inodes (one copy per slot) */ 373 /* Slot-specific system inodes (one copy per slot) */
354 [ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 }, 374 [ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 },
@@ -356,7 +376,9 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
356 [INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, 376 [INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
357 [JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 }, 377 [JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 },
358 [LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 }, 378 [LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 },
359 [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 } 379 [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 },
380 [LOCAL_USER_QUOTA_SYSTEM_INODE] = { "aquota.user:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 },
381 [LOCAL_GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 },
360}; 382};
361 383
362/* Parameter passed from mount.ocfs2 to module */ 384/* Parameter passed from mount.ocfs2 to module */
@@ -410,6 +432,22 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
410#define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super)) 432#define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super))
411 433
412/* 434/*
435 * Block checking structure. This is used in metadata to validate the
436 * contents. If OCFS2_FEATURE_INCOMPAT_META_ECC is not set, it is all
437 * zeros.
438 */
439struct ocfs2_block_check {
440/*00*/ __le32 bc_crc32e; /* 802.3 Ethernet II CRC32 */
441 __le16 bc_ecc; /* Single-error-correction parity vector.
442 This is a simple Hamming code dependant
443 on the blocksize. OCFS2's maximum
444 blocksize, 4K, requires 16 parity bits,
445 so we fit in __le16. */
446 __le16 bc_reserved1;
447/*08*/
448};
449
450/*
413 * On disk extent record for OCFS2 451 * On disk extent record for OCFS2
414 * It describes a range of clusters on disk. 452 * It describes a range of clusters on disk.
415 * 453 *
@@ -496,7 +534,7 @@ struct ocfs2_truncate_log {
496struct ocfs2_extent_block 534struct ocfs2_extent_block
497{ 535{
498/*00*/ __u8 h_signature[8]; /* Signature for verification */ 536/*00*/ __u8 h_signature[8]; /* Signature for verification */
499 __le64 h_reserved1; 537 struct ocfs2_block_check h_check; /* Error checking */
500/*10*/ __le16 h_suballoc_slot; /* Slot suballocator this 538/*10*/ __le16 h_suballoc_slot; /* Slot suballocator this
501 extent_header belongs to */ 539 extent_header belongs to */
502 __le16 h_suballoc_bit; /* Bit offset in suballocator 540 __le16 h_suballoc_bit; /* Bit offset in suballocator
@@ -666,7 +704,8 @@ struct ocfs2_dinode {
666 was set in i_flags */ 704 was set in i_flags */
667 __le16 i_dyn_features; 705 __le16 i_dyn_features;
668 __le64 i_xattr_loc; 706 __le64 i_xattr_loc;
669/*80*/ __le64 i_reserved2[7]; 707/*80*/ struct ocfs2_block_check i_check; /* Error checking */
708/*88*/ __le64 i_reserved2[6];
670/*B8*/ union { 709/*B8*/ union {
671 __le64 i_pad1; /* Generic way to refer to this 710 __le64 i_pad1; /* Generic way to refer to this
672 64bit union */ 711 64bit union */
@@ -715,6 +754,34 @@ struct ocfs2_dir_entry {
715} __attribute__ ((packed)); 754} __attribute__ ((packed));
716 755
717/* 756/*
757 * Per-block record for the unindexed directory btree. This is carefully
758 * crafted so that the rec_len and name_len records of an ocfs2_dir_entry are
759 * mirrored. That way, the directory manipulation code needs a minimal amount
760 * of update.
761 *
762 * NOTE: Keep this structure aligned to a multiple of 4 bytes.
763 */
764struct ocfs2_dir_block_trailer {
765/*00*/ __le64 db_compat_inode; /* Always zero. Was inode */
766
767 __le16 db_compat_rec_len; /* Backwards compatible with
768 * ocfs2_dir_entry. */
769 __u8 db_compat_name_len; /* Always zero. Was name_len */
770 __u8 db_reserved0;
771 __le16 db_reserved1;
772 __le16 db_free_rec_len; /* Size of largest empty hole
773 * in this block. (unused) */
774/*10*/ __u8 db_signature[8]; /* Signature for verification */
775 __le64 db_reserved2;
776 __le64 db_free_next; /* Next block in list (unused) */
777/*20*/ __le64 db_blkno; /* Offset on disk, in blocks */
778 __le64 db_parent_dinode; /* dinode which owns me, in
779 blocks */
780/*30*/ struct ocfs2_block_check db_check; /* Error checking */
781/*40*/
782};
783
784/*
718 * On disk allocator group structure for OCFS2 785 * On disk allocator group structure for OCFS2
719 */ 786 */
720struct ocfs2_group_desc 787struct ocfs2_group_desc
@@ -733,7 +800,8 @@ struct ocfs2_group_desc
733/*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in 800/*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in
734 blocks */ 801 blocks */
735 __le64 bg_blkno; /* Offset on disk, in blocks */ 802 __le64 bg_blkno; /* Offset on disk, in blocks */
736/*30*/ __le64 bg_reserved2[2]; 803/*30*/ struct ocfs2_block_check bg_check; /* Error checking */
804 __le64 bg_reserved2;
737/*40*/ __u8 bg_bitmap[0]; 805/*40*/ __u8 bg_bitmap[0];
738}; 806};
739 807
@@ -776,7 +844,12 @@ struct ocfs2_xattr_header {
776 in this extent record, 844 in this extent record,
777 only valid in the first 845 only valid in the first
778 bucket. */ 846 bucket. */
779 __le64 xh_csum; 847 struct ocfs2_block_check xh_check; /* Error checking
848 (Note, this is only
849 used for xattr
850 buckets. A block uses
851 xb_check and sets
852 this field to zero.) */
780 struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */ 853 struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
781}; 854};
782 855
@@ -827,7 +900,7 @@ struct ocfs2_xattr_block {
827 block group */ 900 block group */
828 __le32 xb_fs_generation; /* Must match super block */ 901 __le32 xb_fs_generation; /* Must match super block */
829/*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */ 902/*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */
830 __le64 xb_csum; 903 struct ocfs2_block_check xb_check; /* Error checking */
831/*20*/ __le16 xb_flags; /* Indicates whether this block contains 904/*20*/ __le16 xb_flags; /* Indicates whether this block contains
832 real xattr or a xattr tree. */ 905 real xattr or a xattr tree. */
833 __le16 xb_reserved0; 906 __le16 xb_reserved0;
@@ -868,6 +941,128 @@ static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe)
868 return xe->xe_type & OCFS2_XATTR_TYPE_MASK; 941 return xe->xe_type & OCFS2_XATTR_TYPE_MASK;
869} 942}
870 943
944/*
945 * On disk structures for global quota file
946 */
947
948/* Magic numbers and known versions for global quota files */
949#define OCFS2_GLOBAL_QMAGICS {\
950 0x0cf52470, /* USRQUOTA */ \
951 0x0cf52471 /* GRPQUOTA */ \
952}
953
954#define OCFS2_GLOBAL_QVERSIONS {\
955 0, \
956 0, \
957}
958
959
960/* Each block of each quota file has a certain fixed number of bytes reserved
961 * for OCFS2 internal use at its end. OCFS2 can use it for things like
962 * checksums, etc. */
963#define OCFS2_QBLK_RESERVED_SPACE 8
964
965/* Generic header of all quota files */
966struct ocfs2_disk_dqheader {
967 __le32 dqh_magic; /* Magic number identifying file */
968 __le32 dqh_version; /* Quota format version */
969};
970
971#define OCFS2_GLOBAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader))
972
973/* Information header of global quota file (immediately follows the generic
974 * header) */
975struct ocfs2_global_disk_dqinfo {
976/*00*/ __le32 dqi_bgrace; /* Grace time for space softlimit excess */
977 __le32 dqi_igrace; /* Grace time for inode softlimit excess */
978 __le32 dqi_syncms; /* Time after which we sync local changes to
979 * global quota file */
980 __le32 dqi_blocks; /* Number of blocks in quota file */
981/*10*/ __le32 dqi_free_blk; /* First free block in quota file */
982 __le32 dqi_free_entry; /* First block with free dquot entry in quota
983 * file */
984};
985
986/* Structure with global user / group information. We reserve some space
987 * for future use. */
988struct ocfs2_global_disk_dqblk {
989/*00*/ __le32 dqb_id; /* ID the structure belongs to */
990 __le32 dqb_use_count; /* Number of nodes having reference to this structure */
991 __le64 dqb_ihardlimit; /* absolute limit on allocated inodes */
992/*10*/ __le64 dqb_isoftlimit; /* preferred inode limit */
993 __le64 dqb_curinodes; /* current # allocated inodes */
994/*20*/ __le64 dqb_bhardlimit; /* absolute limit on disk space */
995 __le64 dqb_bsoftlimit; /* preferred limit on disk space */
996/*30*/ __le64 dqb_curspace; /* current space occupied */
997 __le64 dqb_btime; /* time limit for excessive disk use */
998/*40*/ __le64 dqb_itime; /* time limit for excessive inode use */
999 __le64 dqb_pad1;
1000/*50*/ __le64 dqb_pad2;
1001};
1002
1003/*
1004 * On-disk structures for local quota file
1005 */
1006
1007/* Magic numbers and known versions for local quota files */
1008#define OCFS2_LOCAL_QMAGICS {\
1009 0x0cf524c0, /* USRQUOTA */ \
1010 0x0cf524c1 /* GRPQUOTA */ \
1011}
1012
1013#define OCFS2_LOCAL_QVERSIONS {\
1014 0, \
1015 0, \
1016}
1017
1018/* Quota flags in dqinfo header */
1019#define OLQF_CLEAN 0x0001 /* Quota file is empty (this should be after\
1020 * quota has been cleanly turned off) */
1021
1022#define OCFS2_LOCAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader))
1023
1024/* Information header of local quota file (immediately follows the generic
1025 * header) */
1026struct ocfs2_local_disk_dqinfo {
1027 __le32 dqi_flags; /* Flags for quota file */
1028 __le32 dqi_chunks; /* Number of chunks of quota structures
1029 * with a bitmap */
1030 __le32 dqi_blocks; /* Number of blocks allocated for quota file */
1031};
1032
1033/* Header of one chunk of a quota file */
1034struct ocfs2_local_disk_chunk {
1035 __le32 dqc_free; /* Number of free entries in the bitmap */
1036 u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding
1037 * chunk of quota file */
1038};
1039
1040/* One entry in local quota file */
1041struct ocfs2_local_disk_dqblk {
1042/*00*/ __le64 dqb_id; /* id this quota applies to */
1043 __le64 dqb_spacemod; /* Change in the amount of used space */
1044/*10*/ __le64 dqb_inodemod; /* Change in the amount of used inodes */
1045};
1046
1047
1048/*
1049 * The quota trailer lives at the end of each quota block.
1050 */
1051
1052struct ocfs2_disk_dqtrailer {
1053/*00*/ struct ocfs2_block_check dq_check; /* Error checking */
1054/*08*/ /* Cannot be larger than OCFS2_QBLK_RESERVED_SPACE */
1055};
1056
1057static inline struct ocfs2_disk_dqtrailer *ocfs2_block_dqtrailer(int blocksize,
1058 void *buf)
1059{
1060 char *ptr = buf;
1061 ptr += blocksize - OCFS2_QBLK_RESERVED_SPACE;
1062
1063 return (struct ocfs2_disk_dqtrailer *)ptr;
1064}
1065
871#ifdef __KERNEL__ 1066#ifdef __KERNEL__
872static inline int ocfs2_fast_symlink_chars(struct super_block *sb) 1067static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
873{ 1068{
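
ocfs2_block_dqtrailer() above is plain pointer arithmetic: the last OCFS2_QBLK_RESERVED_SPACE (8) bytes of every quota block hold the ocfs2_block_check trailer. A userspace restatement of the same computation on a 4K block, with the structures abbreviated:

#include <stdio.h>
#include <stdint.h>

#define QBLK_RESERVED_SPACE 8           /* OCFS2_QBLK_RESERVED_SPACE */

struct disk_dqtrailer {                 /* mirrors ocfs2_disk_dqtrailer */
        uint32_t crc32e;
        uint16_t ecc;
        uint16_t reserved;
};

static struct disk_dqtrailer *block_dqtrailer(int blocksize, void *buf)
{
        char *ptr = buf;
        ptr += blocksize - QBLK_RESERVED_SPACE; /* last 8 bytes of block */
        return (struct disk_dqtrailer *)ptr;
}

int main(void)
{
        static char block[4096];
        struct disk_dqtrailer *t = block_dqtrailer(sizeof(block), block);
        printf("trailer at offset %ld of %zu\n",
               (long)((char *)t - block), sizeof(block));
        return 0;
}
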
diff --git a/fs/ocfs2/ocfs2_jbd_compat.h b/fs/ocfs2/ocfs2_jbd_compat.h
deleted file mode 100644
index b91c78f8f558..000000000000
--- a/fs/ocfs2/ocfs2_jbd_compat.h
+++ /dev/null
@@ -1,82 +0,0 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * ocfs2_jbd_compat.h
5 *
6 * Compatibility defines for JBD.
7 *
8 * Copyright (C) 2008 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License version 2 as published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#ifndef OCFS2_JBD_COMPAT_H
21#define OCFS2_JBD_COMPAT_H
22
23#ifndef CONFIG_OCFS2_COMPAT_JBD
24# error Should not have been included
25#endif
26
27struct jbd2_inode {
28 unsigned int dummy;
29};
30
31#define JBD2_BARRIER JFS_BARRIER
32#define JBD2_DEFAULT_MAX_COMMIT_AGE JBD_DEFAULT_MAX_COMMIT_AGE
33
34#define jbd2_journal_ack_err journal_ack_err
35#define jbd2_journal_clear_err journal_clear_err
36#define jbd2_journal_destroy journal_destroy
37#define jbd2_journal_dirty_metadata journal_dirty_metadata
38#define jbd2_journal_errno journal_errno
39#define jbd2_journal_extend journal_extend
40#define jbd2_journal_flush journal_flush
41#define jbd2_journal_force_commit journal_force_commit
42#define jbd2_journal_get_write_access journal_get_write_access
43#define jbd2_journal_get_undo_access journal_get_undo_access
44#define jbd2_journal_init_inode journal_init_inode
45#define jbd2_journal_invalidatepage journal_invalidatepage
46#define jbd2_journal_load journal_load
47#define jbd2_journal_lock_updates journal_lock_updates
48#define jbd2_journal_restart journal_restart
49#define jbd2_journal_start journal_start
50#define jbd2_journal_start_commit journal_start_commit
51#define jbd2_journal_stop journal_stop
52#define jbd2_journal_try_to_free_buffers journal_try_to_free_buffers
53#define jbd2_journal_unlock_updates journal_unlock_updates
54#define jbd2_journal_wipe journal_wipe
55#define jbd2_log_wait_commit log_wait_commit
56
57static inline int jbd2_journal_file_inode(handle_t *handle,
58 struct jbd2_inode *inode)
59{
60 return 0;
61}
62
63static inline int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
64 loff_t new_size)
65{
66 return 0;
67}
68
69static inline void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode,
70 struct inode *inode)
71{
72 return;
73}
74
75static inline void jbd2_journal_release_jbd_inode(journal_t *journal,
76 struct jbd2_inode *jinode)
77{
78 return;
79}
80
81
82#endif /* OCFS2_JBD_COMPAT_H */
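
The deleted ocfs2_jbd_compat.h worked by plain macro aliasing: jbd2-named calls compiled down to the older JBD functions, with empty inline stubs for the few APIs JBD lacked. With ocfs2 now requiring JBD2 outright, the shim goes away. The aliasing pattern itself, as a runnable two-function illustration (names invented):

#include <stdio.h>

static int journal_start_old(int nblocks)
{
        printf("old JBD-style API, %d credits\n", nblocks);
        return 0;
}

/* what the compat header did, name-for-name, for a few dozen functions */
#define journal_start_new journal_start_old

int main(void)
{
        return journal_start_new(8);
}
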
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 82c200f7a8f1..eb6f50c9ceca 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -46,6 +46,7 @@ enum ocfs2_lock_type {
46 OCFS2_LOCK_TYPE_DENTRY, 46 OCFS2_LOCK_TYPE_DENTRY,
47 OCFS2_LOCK_TYPE_OPEN, 47 OCFS2_LOCK_TYPE_OPEN,
48 OCFS2_LOCK_TYPE_FLOCK, 48 OCFS2_LOCK_TYPE_FLOCK,
49 OCFS2_LOCK_TYPE_QINFO,
49 OCFS2_NUM_LOCK_TYPES 50 OCFS2_NUM_LOCK_TYPES
50}; 51};
51 52
@@ -77,6 +78,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
77 case OCFS2_LOCK_TYPE_FLOCK: 78 case OCFS2_LOCK_TYPE_FLOCK:
78 c = 'F'; 79 c = 'F';
79 break; 80 break;
81 case OCFS2_LOCK_TYPE_QINFO:
82 c = 'Q';
83 break;
80 default: 84 default:
81 c = '\0'; 85 c = '\0';
82 } 86 }
@@ -95,6 +99,7 @@ static char *ocfs2_lock_type_strings[] = {
95 [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", 99 [OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
96 [OCFS2_LOCK_TYPE_OPEN] = "Open", 100 [OCFS2_LOCK_TYPE_OPEN] = "Open",
97 [OCFS2_LOCK_TYPE_FLOCK] = "Flock", 101 [OCFS2_LOCK_TYPE_FLOCK] = "Flock",
102 [OCFS2_LOCK_TYPE_QINFO] = "Quota",
98}; 103};
99 104
100static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) 105static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
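
Adding OCFS2_LOCK_TYPE_QINFO touches three places that must stay in lockstep: the enum, the debug-character switch, and the string table. A compact sketch of keeping such parallel tables aligned (the array-based form is an alternative to the kernel's switch, shown for illustration):

#include <stdio.h>

enum lock_type { LT_META, LT_FLOCK, LT_QINFO, LT_NUM };

static const char lock_char[LT_NUM] = { 'M', 'F', 'Q' };
static const char *lock_name[LT_NUM] = { "Meta", "Flock", "Quota" };

int main(void)
{
        for (int t = 0; t < LT_NUM; t++)
                printf("%c = %s\n", lock_char[t], lock_name[t]);
        return 0;
}
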
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
new file mode 100644
index 000000000000..7365e2e08706
--- /dev/null
+++ b/fs/ocfs2/quota.h
@@ -0,0 +1,119 @@
1/*
2 * quota.h for OCFS2
3 *
4 * On disk quota structures for local and global quota file, in-memory
5 * structures.
6 *
7 */
8
9#ifndef _OCFS2_QUOTA_H
10#define _OCFS2_QUOTA_H
11
12#include <linux/types.h>
13#include <linux/slab.h>
14#include <linux/quota.h>
15#include <linux/list.h>
16#include <linux/dqblk_qtree.h>
17
18#include "ocfs2.h"
19
20/* Common stuff */
21/* id number of quota format */
22#define QFMT_OCFS2 3
23
24/*
25 * In-memory structures
26 */
27struct ocfs2_dquot {
28 struct dquot dq_dquot; /* Generic VFS dquot */
29 loff_t dq_local_off; /* Offset in the local quota file */
30 struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */
31 unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */
32 s64 dq_origspace; /* Last globally synced space usage */
33 s64 dq_originodes; /* Last globally synced inode usage */
34};
35
36/* Description of one chunk to recover in memory */
37struct ocfs2_recovery_chunk {
38 struct list_head rc_list; /* List of chunks */
39 int rc_chunk; /* Chunk number */
40 unsigned long *rc_bitmap; /* Bitmap of entries to recover */
41};
42
43struct ocfs2_quota_recovery {
44 struct list_head r_list[MAXQUOTAS]; /* List of chunks to recover */
45};
46
47/* In-memory structure with quota header information */
48struct ocfs2_mem_dqinfo {
49 unsigned int dqi_type; /* Quota type this structure describes */
50 unsigned int dqi_chunks; /* Number of chunks in local quota file */
51 unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */
52 unsigned int dqi_syncms; /* How often should we sync with other nodes */
53 unsigned int dqi_syncjiff; /* Precomputed dqi_syncms in jiffies */
54 struct list_head dqi_chunk; /* List of chunks */
55 struct inode *dqi_gqinode; /* Global quota file inode */
56 struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */
57 struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */
58 int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */
59 struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */
60 struct buffer_head *dqi_ibh; /* Buffer with information header */
61 struct qtree_mem_dqinfo dqi_gi; /* Info about global file */
62 struct delayed_work dqi_sync_work; /* Work for syncing dquots */
63 struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery
64 * information, in case we
65 * enable quotas on file
66 * needing it */
67};
68
69static inline struct ocfs2_dquot *OCFS2_DQUOT(struct dquot *dquot)
70{
71 return container_of(dquot, struct ocfs2_dquot, dq_dquot);
72}
73
74struct ocfs2_quota_chunk {
75 struct list_head qc_chunk; /* List of quotafile chunks */
76 int qc_num; /* Number of quota chunk */
77 struct buffer_head *qc_headerbh; /* Buffer head with chunk header */
78};
79
80extern struct kmem_cache *ocfs2_dquot_cachep;
81extern struct kmem_cache *ocfs2_qf_chunk_cachep;
82
83extern struct qtree_fmt_operations ocfs2_global_ops;
84
85struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
86 struct ocfs2_super *osb, int slot_num);
87int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
88 struct ocfs2_quota_recovery *rec,
89 int slot_num);
90void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec);
91ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
92 size_t len, loff_t off);
93ssize_t ocfs2_quota_write(struct super_block *sb, int type,
94 const char *data, size_t len, loff_t off);
95int ocfs2_global_read_info(struct super_block *sb, int type);
96int ocfs2_global_write_info(struct super_block *sb, int type);
97int ocfs2_global_read_dquot(struct dquot *dquot);
98int __ocfs2_sync_dquot(struct dquot *dquot, int freeing);
99static inline int ocfs2_sync_dquot(struct dquot *dquot)
100{
101 return __ocfs2_sync_dquot(dquot, 0);
102}
103static inline int ocfs2_global_release_dquot(struct dquot *dquot)
104{
105 return __ocfs2_sync_dquot(dquot, 1);
106}
107
108int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
109void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
110int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
111 struct buffer_head **bh);
112
113extern struct dquot_operations ocfs2_quota_operations;
114extern struct quota_format_type ocfs2_quota_format;
115
116int ocfs2_quota_setup(void);
117void ocfs2_quota_shutdown(void);
118
119#endif /* _OCFS2_QUOTA_H */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
new file mode 100644
index 000000000000..6aff8f2d3e49
--- /dev/null
+++ b/fs/ocfs2/quota_global.c
@@ -0,0 +1,1025 @@
1/*
2 * Implementation of operations over global quota file
3 */
4#include <linux/spinlock.h>
5#include <linux/fs.h>
6#include <linux/quota.h>
7#include <linux/quotaops.h>
8#include <linux/dqblk_qtree.h>
9#include <linux/jiffies.h>
10#include <linux/writeback.h>
11#include <linux/workqueue.h>
12
13#define MLOG_MASK_PREFIX ML_QUOTA
14#include <cluster/masklog.h>
15
16#include "ocfs2_fs.h"
17#include "ocfs2.h"
18#include "alloc.h"
19#include "blockcheck.h"
20#include "inode.h"
21#include "journal.h"
22#include "file.h"
23#include "sysfile.h"
24#include "dlmglue.h"
25#include "uptodate.h"
26#include "quota.h"
27
28static struct workqueue_struct *ocfs2_quota_wq = NULL;
29
30static void qsync_work_fn(struct work_struct *work);
31
32static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp)
33{
34 struct ocfs2_global_disk_dqblk *d = dp;
35 struct mem_dqblk *m = &dquot->dq_dqb;
36
37 /* Update from disk only entries not set by the admin */
38 if (!test_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags)) {
39 m->dqb_ihardlimit = le64_to_cpu(d->dqb_ihardlimit);
40 m->dqb_isoftlimit = le64_to_cpu(d->dqb_isoftlimit);
41 }
42 if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
43 m->dqb_curinodes = le64_to_cpu(d->dqb_curinodes);
44 if (!test_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags)) {
45 m->dqb_bhardlimit = le64_to_cpu(d->dqb_bhardlimit);
46 m->dqb_bsoftlimit = le64_to_cpu(d->dqb_bsoftlimit);
47 }
48 if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
49 m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
50 if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags))
51 m->dqb_btime = le64_to_cpu(d->dqb_btime);
52 if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags))
53 m->dqb_itime = le64_to_cpu(d->dqb_itime);
54 OCFS2_DQUOT(dquot)->dq_use_count = le32_to_cpu(d->dqb_use_count);
55}
56
57static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot)
58{
59 struct ocfs2_global_disk_dqblk *d = dp;
60 struct mem_dqblk *m = &dquot->dq_dqb;
61
62 d->dqb_id = cpu_to_le32(dquot->dq_id);
63 d->dqb_use_count = cpu_to_le32(OCFS2_DQUOT(dquot)->dq_use_count);
64 d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit);
65 d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit);
66 d->dqb_curinodes = cpu_to_le64(m->dqb_curinodes);
67 d->dqb_bhardlimit = cpu_to_le64(m->dqb_bhardlimit);
68 d->dqb_bsoftlimit = cpu_to_le64(m->dqb_bsoftlimit);
69 d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
70 d->dqb_btime = cpu_to_le64(m->dqb_btime);
71 d->dqb_itime = cpu_to_le64(m->dqb_itime);
72}
73
74static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
75{
76 struct ocfs2_global_disk_dqblk *d = dp;
77 struct ocfs2_mem_dqinfo *oinfo =
78 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
79
80 if (qtree_entry_unused(&oinfo->dqi_gi, dp))
81 return 0;
82 return le32_to_cpu(d->dqb_id) == dquot->dq_id;
83}
84
85struct qtree_fmt_operations ocfs2_global_ops = {
86 .mem2disk_dqblk = ocfs2_global_mem2diskdqb,
87 .disk2mem_dqblk = ocfs2_global_disk2memdqb,
88 .is_id = ocfs2_global_is_id,
89};
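/* A rough sketch of how this table is used (inferred from the calls visible
 * below, not a statement of the full quota-tree contract): the generic
 * quota-tree code stays format agnostic by calling back through these ops.
 * qtree_read_dquot() and qtree_write_dquot() walk the tree of blocks and use
 * disk2mem_dqblk/mem2disk_dqblk to convert between the little-endian on-disk
 * entry and struct mem_dqblk, while is_id tests whether a given on-disk
 * entry belongs to the dquot being looked up. */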
90
91static int ocfs2_validate_quota_block(struct super_block *sb,
92 struct buffer_head *bh)
93{
94 struct ocfs2_disk_dqtrailer *dqt =
95 ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data);
96
97 mlog(0, "Validating quota block %llu\n",
98 (unsigned long long)bh->b_blocknr);
99
100 BUG_ON(!buffer_uptodate(bh));
101
102 /*
103 * If the ecc fails, we return the error but otherwise
104 * leave the filesystem running. We know any error is
105 * local to this block.
106 */
107 return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check);
108}
109
110int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
111 struct buffer_head **bh)
112{
113 int rc = 0;
114 struct buffer_head *tmp = *bh;
115
116 rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
117 ocfs2_validate_quota_block);
118 if (rc)
119 mlog_errno(rc);
120
121 /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
122 if (!rc && !*bh)
123 *bh = tmp;
124
125 return rc;
126}
127
128static int ocfs2_get_quota_block(struct inode *inode, int block,
129 struct buffer_head **bh)
130{
131 u64 pblock, pcount;
132 int err;
133
134 down_read(&OCFS2_I(inode)->ip_alloc_sem);
135 err = ocfs2_extent_map_get_blocks(inode, block, &pblock, &pcount, NULL);
136 up_read(&OCFS2_I(inode)->ip_alloc_sem);
137 if (err) {
138 mlog_errno(err);
139 return err;
140 }
141 *bh = sb_getblk(inode->i_sb, pblock);
142 if (!*bh) {
143 err = -EIO;
144 mlog_errno(err);
145 }
 146	return err;
147}
148
 149/* Read data from the global quota file - avoid the pagecache and such
 150 * because we cannot afford to acquire the locks... We use the quota
 151 * cluster lock to serialize operations; the caller must acquire it. */
152ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
153 size_t len, loff_t off)
154{
155 struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
156 struct inode *gqinode = oinfo->dqi_gqinode;
157 loff_t i_size = i_size_read(gqinode);
158 int offset = off & (sb->s_blocksize - 1);
159 sector_t blk = off >> sb->s_blocksize_bits;
160 int err = 0;
161 struct buffer_head *bh;
162 size_t toread, tocopy;
163
164 if (off > i_size)
165 return 0;
166 if (off + len > i_size)
167 len = i_size - off;
168 toread = len;
169 while (toread > 0) {
170 tocopy = min_t(size_t, (sb->s_blocksize - offset), toread);
171 bh = NULL;
172 err = ocfs2_read_quota_block(gqinode, blk, &bh);
173 if (err) {
174 mlog_errno(err);
175 return err;
176 }
177 memcpy(data, bh->b_data + offset, tocopy);
178 brelse(bh);
179 offset = 0;
180 toread -= tocopy;
181 data += tocopy;
182 blk++;
183 }
184 return len;
185}
186
 187/* Write to the quota file (we know the transaction is already started
 188 * and has enough credits) */
189ssize_t ocfs2_quota_write(struct super_block *sb, int type,
190 const char *data, size_t len, loff_t off)
191{
192 struct mem_dqinfo *info = sb_dqinfo(sb, type);
193 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
194 struct inode *gqinode = oinfo->dqi_gqinode;
195 int offset = off & (sb->s_blocksize - 1);
196 sector_t blk = off >> sb->s_blocksize_bits;
197 int err = 0, new = 0, ja_type;
198 struct buffer_head *bh = NULL;
199 handle_t *handle = journal_current_handle();
200
201 if (!handle) {
202 mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled "
203 "because transaction was not started.\n",
204 (unsigned long long)off, (unsigned long long)len);
205 return -EIO;
206 }
207 if (len > sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset) {
208 WARN_ON(1);
209 len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset;
210 }
211
212 mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA);
213 if (gqinode->i_size < off + len) {
214 down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
215 err = ocfs2_extend_no_holes(gqinode, off + len, off);
216 up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
217 if (err < 0)
218 goto out;
219 err = ocfs2_simple_size_update(gqinode,
220 oinfo->dqi_gqi_bh,
221 off + len);
222 if (err < 0)
223 goto out;
224 new = 1;
225 }
226 /* Not rewriting whole block? */
227 if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) &&
228 !new) {
229 err = ocfs2_read_quota_block(gqinode, blk, &bh);
230 ja_type = OCFS2_JOURNAL_ACCESS_WRITE;
231 } else {
232 err = ocfs2_get_quota_block(gqinode, blk, &bh);
233 ja_type = OCFS2_JOURNAL_ACCESS_CREATE;
234 }
235 if (err) {
236 mlog_errno(err);
 237		goto out;	/* must not leak gqinode->i_mutex on error */
238 }
239 lock_buffer(bh);
240 if (new)
241 memset(bh->b_data, 0, sb->s_blocksize);
242 memcpy(bh->b_data + offset, data, len);
243 flush_dcache_page(bh->b_page);
244 set_buffer_uptodate(bh);
245 unlock_buffer(bh);
246 ocfs2_set_buffer_uptodate(gqinode, bh);
247 err = ocfs2_journal_access_dq(handle, gqinode, bh, ja_type);
248 if (err < 0) {
249 brelse(bh);
250 goto out;
251 }
252 err = ocfs2_journal_dirty(handle, bh);
253 brelse(bh);
254 if (err < 0)
255 goto out;
256out:
257 if (err) {
258 mutex_unlock(&gqinode->i_mutex);
259 mlog_errno(err);
260 return err;
261 }
262 gqinode->i_version++;
263 ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh);
264 mutex_unlock(&gqinode->i_mutex);
265 return len;
266}
267
268int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
269{
270 int status;
271 struct buffer_head *bh = NULL;
272
273 status = ocfs2_inode_lock(oinfo->dqi_gqinode, &bh, ex);
274 if (status < 0)
275 return status;
276 spin_lock(&dq_data_lock);
277 if (!oinfo->dqi_gqi_count++)
278 oinfo->dqi_gqi_bh = bh;
279 else
280 WARN_ON(bh != oinfo->dqi_gqi_bh);
281 spin_unlock(&dq_data_lock);
282 return 0;
283}
284
285void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
286{
287 ocfs2_inode_unlock(oinfo->dqi_gqinode, ex);
288 brelse(oinfo->dqi_gqi_bh);
289 spin_lock(&dq_data_lock);
290 if (!--oinfo->dqi_gqi_count)
291 oinfo->dqi_gqi_bh = NULL;
292 spin_unlock(&dq_data_lock);
293}
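/* dqi_gqi_count and dqi_gqi_bh together act as a small refcount: the first
 * ocfs2_lock_global_qf() caller caches the inode's buffer head, nested
 * callers only bump the count (and should observe the same bh, hence the
 * WARN_ON), and the final ocfs2_unlock_global_qf() clears the cached
 * pointer.  Each unlock may brelse() because every ocfs2_inode_lock() call
 * took its own reference on the buffer head. */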
294
295/* Read information header from global quota file */
296int ocfs2_global_read_info(struct super_block *sb, int type)
297{
298 struct inode *gqinode = NULL;
299 unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
300 GROUP_QUOTA_SYSTEM_INODE };
301 struct ocfs2_global_disk_dqinfo dinfo;
302 struct mem_dqinfo *info = sb_dqinfo(sb, type);
303 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
304 int status;
305
306 mlog_entry_void();
307
308 /* Read global header */
309 gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
310 OCFS2_INVALID_SLOT);
311 if (!gqinode) {
312 mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n",
313 type);
314 status = -EINVAL;
315 goto out_err;
316 }
317 oinfo->dqi_gi.dqi_sb = sb;
318 oinfo->dqi_gi.dqi_type = type;
319 ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
320 oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk);
321 oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops;
322 oinfo->dqi_gqi_bh = NULL;
323 oinfo->dqi_gqi_count = 0;
324 oinfo->dqi_gqinode = gqinode;
325 status = ocfs2_lock_global_qf(oinfo, 0);
326 if (status < 0) {
327 mlog_errno(status);
328 goto out_err;
329 }
330 status = sb->s_op->quota_read(sb, type, (char *)&dinfo,
331 sizeof(struct ocfs2_global_disk_dqinfo),
332 OCFS2_GLOBAL_INFO_OFF);
333 ocfs2_unlock_global_qf(oinfo, 0);
334 if (status != sizeof(struct ocfs2_global_disk_dqinfo)) {
335 mlog(ML_ERROR, "Cannot read global quota info (%d).\n",
336 status);
337 if (status >= 0)
338 status = -EIO;
339 mlog_errno(status);
340 goto out_err;
341 }
342 info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
343 info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
344 oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms);
345 oinfo->dqi_syncjiff = msecs_to_jiffies(oinfo->dqi_syncms);
346 oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
347 oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
348 oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
349 oinfo->dqi_gi.dqi_blocksize_bits = sb->s_blocksize_bits;
350 oinfo->dqi_gi.dqi_usable_bs = sb->s_blocksize -
351 OCFS2_QBLK_RESERVED_SPACE;
352 oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
353 INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
354 queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
355 oinfo->dqi_syncjiff);
356
357out_err:
358 mlog_exit(status);
359 return status;
360}
361
 362/* Write information to global quota file. Expects exclusive lock on quota
363 * file inode and quota info */
364static int __ocfs2_global_write_info(struct super_block *sb, int type)
365{
366 struct mem_dqinfo *info = sb_dqinfo(sb, type);
367 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
368 struct ocfs2_global_disk_dqinfo dinfo;
369 ssize_t size;
370
371 spin_lock(&dq_data_lock);
372 info->dqi_flags &= ~DQF_INFO_DIRTY;
373 dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace);
374 dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
375 spin_unlock(&dq_data_lock);
376 dinfo.dqi_syncms = cpu_to_le32(oinfo->dqi_syncms);
377 dinfo.dqi_blocks = cpu_to_le32(oinfo->dqi_gi.dqi_blocks);
378 dinfo.dqi_free_blk = cpu_to_le32(oinfo->dqi_gi.dqi_free_blk);
379 dinfo.dqi_free_entry = cpu_to_le32(oinfo->dqi_gi.dqi_free_entry);
380 size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
381 sizeof(struct ocfs2_global_disk_dqinfo),
382 OCFS2_GLOBAL_INFO_OFF);
383 if (size != sizeof(struct ocfs2_global_disk_dqinfo)) {
384 mlog(ML_ERROR, "Cannot write global quota info structure\n");
385 if (size >= 0)
386 size = -EIO;
387 return size;
388 }
389 return 0;
390}
391
392int ocfs2_global_write_info(struct super_block *sb, int type)
393{
394 int err;
395 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
396
397 err = ocfs2_qinfo_lock(info, 1);
398 if (err < 0)
399 return err;
400 err = __ocfs2_global_write_info(sb, type);
401 ocfs2_qinfo_unlock(info, 1);
402 return err;
403}
404
405/* Read in information from global quota file and acquire a reference to it.
406 * dquot_acquire() has already started the transaction and locked quota file */
407int ocfs2_global_read_dquot(struct dquot *dquot)
408{
409 int err, err2, ex = 0;
410 struct ocfs2_mem_dqinfo *info =
411 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
412
413 err = ocfs2_qinfo_lock(info, 0);
414 if (err < 0)
415 goto out;
416 err = qtree_read_dquot(&info->dqi_gi, dquot);
417 if (err < 0)
418 goto out_qlock;
419 OCFS2_DQUOT(dquot)->dq_use_count++;
420 OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
421 OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
422 if (!dquot->dq_off) { /* No real quota entry? */
423 /* Upgrade to exclusive lock for allocation */
424 err = ocfs2_qinfo_lock(info, 1);
425 if (err < 0)
426 goto out_qlock;
427 ex = 1;
428 }
429 err = qtree_write_dquot(&info->dqi_gi, dquot);
430 if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) {
431 err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type);
432 if (!err)
433 err = err2;
434 }
435out_qlock:
436 if (ex)
437 ocfs2_qinfo_unlock(info, 1);
438 ocfs2_qinfo_unlock(info, 0);
439out:
440 if (err < 0)
441 mlog_errno(err);
442 return err;
443}
444
445/* Sync local information about quota modifications with global quota file.
446 * Caller must have started the transaction and obtained exclusive lock for
447 * global quota file inode */
448int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
449{
450 int err, err2;
451 struct super_block *sb = dquot->dq_sb;
452 int type = dquot->dq_type;
453 struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
454 struct ocfs2_global_disk_dqblk dqblk;
455 s64 spacechange, inodechange;
456 time_t olditime, oldbtime;
457
458 err = sb->s_op->quota_read(sb, type, (char *)&dqblk,
459 sizeof(struct ocfs2_global_disk_dqblk),
460 dquot->dq_off);
461 if (err != sizeof(struct ocfs2_global_disk_dqblk)) {
462 if (err >= 0) {
463 mlog(ML_ERROR, "Short read from global quota file "
464 "(%u read)\n", err);
465 err = -EIO;
466 }
467 goto out;
468 }
469
 470	/* Update space and inode usage. Also get other information from
 471	 * the global quota file so that we don't overwrite any changes
 472	 * made there. */
473 spin_lock(&dq_data_lock);
474 spacechange = dquot->dq_dqb.dqb_curspace -
475 OCFS2_DQUOT(dquot)->dq_origspace;
476 inodechange = dquot->dq_dqb.dqb_curinodes -
477 OCFS2_DQUOT(dquot)->dq_originodes;
478 olditime = dquot->dq_dqb.dqb_itime;
479 oldbtime = dquot->dq_dqb.dqb_btime;
480 ocfs2_global_disk2memdqb(dquot, &dqblk);
481 mlog(0, "Syncing global dquot %u space %lld+%lld, inodes %lld+%lld\n",
482 dquot->dq_id, dquot->dq_dqb.dqb_curspace, (long long)spacechange,
483 dquot->dq_dqb.dqb_curinodes, (long long)inodechange);
484 if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
485 dquot->dq_dqb.dqb_curspace += spacechange;
486 if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
487 dquot->dq_dqb.dqb_curinodes += inodechange;
488 /* Set properly space grace time... */
489 if (dquot->dq_dqb.dqb_bsoftlimit &&
490 dquot->dq_dqb.dqb_curspace > dquot->dq_dqb.dqb_bsoftlimit) {
491 if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags) &&
492 oldbtime > 0) {
493 if (dquot->dq_dqb.dqb_btime > 0)
494 dquot->dq_dqb.dqb_btime =
495 min(dquot->dq_dqb.dqb_btime, oldbtime);
496 else
497 dquot->dq_dqb.dqb_btime = oldbtime;
498 }
499 } else {
500 dquot->dq_dqb.dqb_btime = 0;
501 clear_bit(DQ_BLKS_B, &dquot->dq_flags);
502 }
503 /* Set properly inode grace time... */
504 if (dquot->dq_dqb.dqb_isoftlimit &&
505 dquot->dq_dqb.dqb_curinodes > dquot->dq_dqb.dqb_isoftlimit) {
506 if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags) &&
507 olditime > 0) {
508 if (dquot->dq_dqb.dqb_itime > 0)
509 dquot->dq_dqb.dqb_itime =
510 min(dquot->dq_dqb.dqb_itime, olditime);
511 else
512 dquot->dq_dqb.dqb_itime = olditime;
513 }
514 } else {
515 dquot->dq_dqb.dqb_itime = 0;
516 clear_bit(DQ_INODES_B, &dquot->dq_flags);
517 }
518 /* All information is properly updated, clear the flags */
519 __clear_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
520 __clear_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
521 __clear_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
522 __clear_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
523 __clear_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
524 __clear_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
525 OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
526 OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
527 spin_unlock(&dq_data_lock);
528 err = ocfs2_qinfo_lock(info, freeing);
529 if (err < 0) {
 530		mlog(ML_ERROR, "Failed to lock quota info, losing quota write"
531 " (type=%d, id=%u)\n", dquot->dq_type,
532 (unsigned)dquot->dq_id);
533 goto out;
534 }
535 if (freeing)
536 OCFS2_DQUOT(dquot)->dq_use_count--;
537 err = qtree_write_dquot(&info->dqi_gi, dquot);
538 if (err < 0)
539 goto out_qlock;
540 if (freeing && !OCFS2_DQUOT(dquot)->dq_use_count) {
541 err = qtree_release_dquot(&info->dqi_gi, dquot);
542 if (info_dirty(sb_dqinfo(sb, type))) {
543 err2 = __ocfs2_global_write_info(sb, type);
544 if (!err)
545 err = err2;
546 }
547 }
548out_qlock:
549 ocfs2_qinfo_unlock(info, freeing);
550out:
551 if (err < 0)
552 mlog_errno(err);
553 return err;
554}
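/* Roughly, __ocfs2_sync_dquot() performs a three-way merge: the local usage
 * deltas (current usage minus the dq_origspace/dq_originodes snapshots) are
 * applied on top of the freshly re-read global entry, admin-set fields
 * flagged through DQ_LASTSET_B override the on-disk values, and grace
 * deadlines are merged by keeping the earlier non-zero one. */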
555
556/*
557 * Functions for periodic syncing of dquots with global file
558 */
559static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
560{
561 handle_t *handle;
562 struct super_block *sb = dquot->dq_sb;
563 struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
564 struct ocfs2_super *osb = OCFS2_SB(sb);
565 int status = 0;
566
567 mlog_entry("id=%u qtype=%u type=%lu device=%s\n", dquot->dq_id,
568 dquot->dq_type, type, sb->s_id);
569 if (type != dquot->dq_type)
570 goto out;
571 status = ocfs2_lock_global_qf(oinfo, 1);
572 if (status < 0)
573 goto out;
574
575 handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS);
576 if (IS_ERR(handle)) {
577 status = PTR_ERR(handle);
578 mlog_errno(status);
579 goto out_ilock;
580 }
581 mutex_lock(&sb_dqopt(sb)->dqio_mutex);
582 status = ocfs2_sync_dquot(dquot);
583 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
584 if (status < 0)
585 mlog_errno(status);
586 /* We have to write local structure as well... */
587 dquot_mark_dquot_dirty(dquot);
588 status = dquot_commit(dquot);
589 if (status < 0)
590 mlog_errno(status);
591 ocfs2_commit_trans(osb, handle);
592out_ilock:
593 ocfs2_unlock_global_qf(oinfo, 1);
594out:
595 mlog_exit(status);
596 return status;
597}
598
599static void qsync_work_fn(struct work_struct *work)
600{
601 struct ocfs2_mem_dqinfo *oinfo = container_of(work,
602 struct ocfs2_mem_dqinfo,
603 dqi_sync_work.work);
604 struct super_block *sb = oinfo->dqi_gqinode->i_sb;
605
606 dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
607 queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
608 oinfo->dqi_syncjiff);
609}
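/* Note the self-rearming pattern above: each pass over the active dquots
 * ends by re-queuing the same delayed work, so a sync runs roughly every
 * dqi_syncjiff jiffies until ocfs2_local_free_info() stops it via
 * cancel_delayed_work_sync(). */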
610
611/*
612 * Wrappers for generic quota functions
613 */
614
615static int ocfs2_write_dquot(struct dquot *dquot)
616{
617 handle_t *handle;
618 struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
619 int status = 0;
620
621 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
622
623 handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS);
624 if (IS_ERR(handle)) {
625 status = PTR_ERR(handle);
626 mlog_errno(status);
627 goto out;
628 }
629 status = dquot_commit(dquot);
630 ocfs2_commit_trans(osb, handle);
631out:
632 mlog_exit(status);
633 return status;
634}
635
636int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
637{
638 struct ocfs2_mem_dqinfo *oinfo;
639 int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
640 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
641
642 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
643 return 0;
644
645 oinfo = sb_dqinfo(sb, type)->dqi_priv;
646 /* We modify tree, leaf block, global info, local chunk header,
647 * global and local inode */
648 return oinfo->dqi_gi.dqi_qtree_depth + 2 + 1 +
649 2 * OCFS2_INODE_UPDATE_CREDITS;
650}
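/* A worked example with assumed numbers (tree depth and credit constants
 * vary by filesystem): for a global quota tree of depth 2 and an
 * OCFS2_INODE_UPDATE_CREDITS of 1, deleting a dquot reserves
 * 2 + 2 + 1 + 2 * 1 = 7 credits -- the tree path, the leaf block, the
 * global info block, the local chunk header, and one update for each of
 * the global and local inodes. */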
651
652static int ocfs2_release_dquot(struct dquot *dquot)
653{
654 handle_t *handle;
655 struct ocfs2_mem_dqinfo *oinfo =
656 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
657 struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
658 int status = 0;
659
660 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
661
662 status = ocfs2_lock_global_qf(oinfo, 1);
663 if (status < 0)
664 goto out;
665 handle = ocfs2_start_trans(osb,
666 ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_type));
667 if (IS_ERR(handle)) {
668 status = PTR_ERR(handle);
669 mlog_errno(status);
670 goto out_ilock;
671 }
672 status = dquot_release(dquot);
673 ocfs2_commit_trans(osb, handle);
674out_ilock:
675 ocfs2_unlock_global_qf(oinfo, 1);
676out:
677 mlog_exit(status);
678 return status;
679}
680
681int ocfs2_calc_qinit_credits(struct super_block *sb, int type)
682{
683 struct ocfs2_mem_dqinfo *oinfo;
684 int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
685 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
686 struct ocfs2_dinode *lfe, *gfe;
687
688 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
689 return 0;
690
691 oinfo = sb_dqinfo(sb, type)->dqi_priv;
692 gfe = (struct ocfs2_dinode *)oinfo->dqi_gqi_bh->b_data;
693 lfe = (struct ocfs2_dinode *)oinfo->dqi_lqi_bh->b_data;
694 /* We can extend local file + global file. In local file we
695 * can modify info, chunk header block and dquot block. In
696 * global file we can modify info, tree and leaf block */
697 return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) +
698 ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) +
699 3 + oinfo->dqi_gi.dqi_qtree_depth + 2;
700}
701
702static int ocfs2_acquire_dquot(struct dquot *dquot)
703{
704 handle_t *handle;
705 struct ocfs2_mem_dqinfo *oinfo =
706 sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
707 struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
708 int status = 0;
709
710 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
 711	/* We need an exclusive lock, because we're going to update the use
 712	 * count and possibly instantiate a new dquot structure */
713 status = ocfs2_lock_global_qf(oinfo, 1);
714 if (status < 0)
715 goto out;
716 handle = ocfs2_start_trans(osb,
717 ocfs2_calc_qinit_credits(dquot->dq_sb, dquot->dq_type));
718 if (IS_ERR(handle)) {
719 status = PTR_ERR(handle);
720 mlog_errno(status);
721 goto out_ilock;
722 }
723 status = dquot_acquire(dquot);
724 ocfs2_commit_trans(osb, handle);
725out_ilock:
726 ocfs2_unlock_global_qf(oinfo, 1);
727out:
728 mlog_exit(status);
729 return status;
730}
731
732static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
733{
734 unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) |
735 (1 << (DQ_LASTSET_B + QIF_BLIMITS_B)) |
736 (1 << (DQ_LASTSET_B + QIF_INODES_B)) |
737 (1 << (DQ_LASTSET_B + QIF_SPACE_B)) |
738 (1 << (DQ_LASTSET_B + QIF_BTIME_B)) |
739 (1 << (DQ_LASTSET_B + QIF_ITIME_B));
740 int sync = 0;
741 int status;
742 struct super_block *sb = dquot->dq_sb;
743 int type = dquot->dq_type;
744 struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
745 handle_t *handle;
746 struct ocfs2_super *osb = OCFS2_SB(sb);
747
748 mlog_entry("id=%u, type=%d", dquot->dq_id, type);
749 dquot_mark_dquot_dirty(dquot);
750
751 /* In case user set some limits, sync dquot immediately to global
752 * quota file so that information propagates quicker */
753 spin_lock(&dq_data_lock);
754 if (dquot->dq_flags & mask)
755 sync = 1;
756 spin_unlock(&dq_data_lock);
757 if (!sync) {
758 status = ocfs2_write_dquot(dquot);
759 goto out;
760 }
761 status = ocfs2_lock_global_qf(oinfo, 1);
762 if (status < 0)
763 goto out;
764 handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS);
765 if (IS_ERR(handle)) {
766 status = PTR_ERR(handle);
767 mlog_errno(status);
768 goto out_ilock;
769 }
770 status = ocfs2_sync_dquot(dquot);
771 if (status < 0) {
772 mlog_errno(status);
773 goto out_trans;
774 }
775 /* Now write updated local dquot structure */
776 status = dquot_commit(dquot);
777out_trans:
778 ocfs2_commit_trans(osb, handle);
779out_ilock:
780 ocfs2_unlock_global_qf(oinfo, 1);
781out:
782 mlog_exit(status);
783 return status;
784}
785
786/* This should happen only after set_dqinfo(). */
787static int ocfs2_write_info(struct super_block *sb, int type)
788{
789 handle_t *handle;
790 int status = 0;
791 struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
792
793 mlog_entry_void();
794
795 status = ocfs2_lock_global_qf(oinfo, 1);
796 if (status < 0)
797 goto out;
798 handle = ocfs2_start_trans(OCFS2_SB(sb), OCFS2_QINFO_WRITE_CREDITS);
799 if (IS_ERR(handle)) {
800 status = PTR_ERR(handle);
801 mlog_errno(status);
802 goto out_ilock;
803 }
804 status = dquot_commit_info(sb, type);
805 ocfs2_commit_trans(OCFS2_SB(sb), handle);
806out_ilock:
807 ocfs2_unlock_global_qf(oinfo, 1);
808out:
809 mlog_exit(status);
810 return status;
811}
812
 813/* This is difficult. We have to lock the quota inode and start a
 814 * transaction in this function, but we don't want to take the penalty
 815 * of an exclusive quota file lock when we are just going to use cached
 816 * structures. So we just take the read lock, check whether we have the
 817 * dquot cached, and if so, we don't have to take the write lock... */
818static int ocfs2_dquot_initialize(struct inode *inode, int type)
819{
820 handle_t *handle = NULL;
821 int status = 0;
822 struct super_block *sb = inode->i_sb;
823 struct ocfs2_mem_dqinfo *oinfo;
824 int exclusive = 0;
825 int cnt;
826 qid_t id;
827
828 mlog_entry_void();
829
830 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
831 if (type != -1 && cnt != type)
832 continue;
833 if (!sb_has_quota_active(sb, cnt))
834 continue;
835 oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
836 status = ocfs2_lock_global_qf(oinfo, 0);
837 if (status < 0)
838 goto out;
 839		/* This is just a performance optimization, not a reliable test.
 840		 * Since we hold an inode lock, no one can actually release
841 * the structure until we are finished with initialization. */
842 if (inode->i_dquot[cnt] != NODQUOT) {
843 ocfs2_unlock_global_qf(oinfo, 0);
844 continue;
845 }
846 /* When we have inode lock, we know that no dquot_release() can
847 * run and thus we can safely check whether we need to
848 * read+modify global file to get quota information or whether
849 * our node already has it. */
850 if (cnt == USRQUOTA)
851 id = inode->i_uid;
852 else if (cnt == GRPQUOTA)
853 id = inode->i_gid;
854 else
855 BUG();
856 /* Obtain exclusion from quota off... */
857 down_write(&sb_dqopt(sb)->dqptr_sem);
858 exclusive = !dquot_is_cached(sb, id, cnt);
859 up_write(&sb_dqopt(sb)->dqptr_sem);
860 if (exclusive) {
861 status = ocfs2_lock_global_qf(oinfo, 1);
862 if (status < 0) {
863 exclusive = 0;
864 mlog_errno(status);
865 goto out_ilock;
866 }
867 handle = ocfs2_start_trans(OCFS2_SB(sb),
868 ocfs2_calc_qinit_credits(sb, cnt));
869 if (IS_ERR(handle)) {
870 status = PTR_ERR(handle);
871 mlog_errno(status);
872 goto out_ilock;
873 }
874 }
875 dquot_initialize(inode, cnt);
876 if (exclusive) {
877 ocfs2_commit_trans(OCFS2_SB(sb), handle);
878 ocfs2_unlock_global_qf(oinfo, 1);
879 }
880 ocfs2_unlock_global_qf(oinfo, 0);
881 }
882 mlog_exit(0);
883 return 0;
884out_ilock:
885 if (exclusive)
886 ocfs2_unlock_global_qf(oinfo, 1);
887 ocfs2_unlock_global_qf(oinfo, 0);
888out:
889 mlog_exit(status);
890 return status;
891}
892
893static int ocfs2_dquot_drop_slow(struct inode *inode)
894{
895 int status = 0;
896 int cnt;
897 int got_lock[MAXQUOTAS] = {0, 0};
898 handle_t *handle;
899 struct super_block *sb = inode->i_sb;
900 struct ocfs2_mem_dqinfo *oinfo;
901
902 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
903 if (!sb_has_quota_active(sb, cnt))
904 continue;
905 oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
906 status = ocfs2_lock_global_qf(oinfo, 1);
907 if (status < 0)
908 goto out;
909 got_lock[cnt] = 1;
910 }
911 handle = ocfs2_start_trans(OCFS2_SB(sb),
912 ocfs2_calc_qinit_credits(sb, USRQUOTA) +
913 ocfs2_calc_qinit_credits(sb, GRPQUOTA));
914 if (IS_ERR(handle)) {
915 status = PTR_ERR(handle);
916 mlog_errno(status);
917 goto out;
918 }
919 dquot_drop(inode);
920 ocfs2_commit_trans(OCFS2_SB(sb), handle);
921out:
922 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
923 if (got_lock[cnt]) {
924 oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
925 ocfs2_unlock_global_qf(oinfo, 1);
926 }
927 return status;
928}
929
930/* See the comment before ocfs2_dquot_initialize. */
931static int ocfs2_dquot_drop(struct inode *inode)
932{
933 int status = 0;
934 struct super_block *sb = inode->i_sb;
935 struct ocfs2_mem_dqinfo *oinfo;
936 int exclusive = 0;
937 int cnt;
938 int got_lock[MAXQUOTAS] = {0, 0};
939
940 mlog_entry_void();
941 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
942 if (!sb_has_quota_active(sb, cnt))
943 continue;
944 oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
945 status = ocfs2_lock_global_qf(oinfo, 0);
946 if (status < 0)
947 goto out;
948 got_lock[cnt] = 1;
949 }
 950	/* Lock against anyone releasing references so that when we check
 951	 * we know we are not going to be the last ones to release the dquot */
952 down_write(&sb_dqopt(sb)->dqptr_sem);
953 /* Urgh, this is a terrible hack :( */
954 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
955 if (inode->i_dquot[cnt] != NODQUOT &&
956 atomic_read(&inode->i_dquot[cnt]->dq_count) > 1) {
957 exclusive = 1;
958 break;
959 }
960 }
961 if (!exclusive)
962 dquot_drop_locked(inode);
963 up_write(&sb_dqopt(sb)->dqptr_sem);
964out:
965 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
966 if (got_lock[cnt]) {
967 oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
968 ocfs2_unlock_global_qf(oinfo, 0);
969 }
 970	/* In case we bailed out because we had to do expensive locking,
 971	 * do it now... */
972 if (exclusive)
973 status = ocfs2_dquot_drop_slow(inode);
974 mlog_exit(status);
975 return status;
976}
977
978static struct dquot *ocfs2_alloc_dquot(struct super_block *sb, int type)
979{
980 struct ocfs2_dquot *dquot =
981 kmem_cache_zalloc(ocfs2_dquot_cachep, GFP_NOFS);
982
983 if (!dquot)
984 return NULL;
985 return &dquot->dq_dquot;
986}
987
988static void ocfs2_destroy_dquot(struct dquot *dquot)
989{
990 kmem_cache_free(ocfs2_dquot_cachep, dquot);
991}
992
993struct dquot_operations ocfs2_quota_operations = {
994 .initialize = ocfs2_dquot_initialize,
995 .drop = ocfs2_dquot_drop,
996 .alloc_space = dquot_alloc_space,
997 .alloc_inode = dquot_alloc_inode,
998 .free_space = dquot_free_space,
999 .free_inode = dquot_free_inode,
1000 .transfer = dquot_transfer,
1001 .write_dquot = ocfs2_write_dquot,
1002 .acquire_dquot = ocfs2_acquire_dquot,
1003 .release_dquot = ocfs2_release_dquot,
1004 .mark_dirty = ocfs2_mark_dquot_dirty,
1005 .write_info = ocfs2_write_info,
1006 .alloc_dquot = ocfs2_alloc_dquot,
1007 .destroy_dquot = ocfs2_destroy_dquot,
1008};
1009
1010int ocfs2_quota_setup(void)
1011{
1012 ocfs2_quota_wq = create_workqueue("o2quot");
1013 if (!ocfs2_quota_wq)
1014 return -ENOMEM;
1015 return 0;
1016}
1017
1018void ocfs2_quota_shutdown(void)
1019{
1020 if (ocfs2_quota_wq) {
1021 flush_workqueue(ocfs2_quota_wq);
1022 destroy_workqueue(ocfs2_quota_wq);
1023 ocfs2_quota_wq = NULL;
1024 }
1025}
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
new file mode 100644
index 000000000000..07deec5e9721
--- /dev/null
+++ b/fs/ocfs2/quota_local.c
@@ -0,0 +1,1253 @@
1/*
2 * Implementation of operations over local quota file
3 */
4
5#include <linux/fs.h>
6#include <linux/quota.h>
7#include <linux/quotaops.h>
8#include <linux/module.h>
9
10#define MLOG_MASK_PREFIX ML_QUOTA
11#include <cluster/masklog.h>
12
13#include "ocfs2_fs.h"
14#include "ocfs2.h"
15#include "inode.h"
16#include "alloc.h"
17#include "file.h"
18#include "buffer_head_io.h"
19#include "journal.h"
20#include "sysfile.h"
21#include "dlmglue.h"
22#include "quota.h"
23
24/* Number of local quota structures per block */
25static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
26{
27 return ((sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) /
28 sizeof(struct ocfs2_local_disk_dqblk));
29}
30
31/* Number of blocks with entries in one chunk */
32static inline unsigned int ol_chunk_blocks(struct super_block *sb)
33{
34 return ((sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
35 OCFS2_QBLK_RESERVED_SPACE) << 3) /
36 ol_quota_entries_per_block(sb);
37}
38
39/* Number of entries in a chunk bitmap */
40static unsigned int ol_chunk_entries(struct super_block *sb)
41{
42 return ol_chunk_blocks(sb) * ol_quota_entries_per_block(sb);
43}
44
45/* Offset of the chunk in quota file */
46static unsigned int ol_quota_chunk_block(struct super_block *sb, int c)
47{
48 /* 1 block for local quota file info, 1 block per chunk for chunk info */
49 return 1 + (ol_chunk_blocks(sb) + 1) * c;
50}
51
52static unsigned int ol_dqblk_block(struct super_block *sb, int c, int off)
53{
54 int epb = ol_quota_entries_per_block(sb);
55
56 return ol_quota_chunk_block(sb, c) + 1 + off / epb;
57}
58
59static unsigned int ol_dqblk_block_off(struct super_block *sb, int c, int off)
60{
61 int epb = ol_quota_entries_per_block(sb);
62
63 return (off % epb) * sizeof(struct ocfs2_local_disk_dqblk);
64}
65
66/* Offset of the dquot structure in the quota file */
67static loff_t ol_dqblk_off(struct super_block *sb, int c, int off)
68{
69 return (ol_dqblk_block(sb, c, off) << sb->s_blocksize_bits) +
70 ol_dqblk_block_off(sb, c, off);
71}
72
73/* Compute block number from given offset */
74static inline unsigned int ol_dqblk_file_block(struct super_block *sb, loff_t off)
75{
76 return off >> sb->s_blocksize_bits;
77}
78
79static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off)
80{
81 return off & ((1 << sb->s_blocksize_bits) - 1);
82}
83
84/* Compute offset in the chunk of a structure with the given offset */
85static int ol_dqblk_chunk_off(struct super_block *sb, int c, loff_t off)
86{
87 int epb = ol_quota_entries_per_block(sb);
88
89 return ((off >> sb->s_blocksize_bits) -
90 ol_quota_chunk_block(sb, c) - 1) * epb
91 + ((unsigned int)(off & ((1 << sb->s_blocksize_bits) - 1))) /
92 sizeof(struct ocfs2_local_disk_dqblk);
93}
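/* A worked example of the layout math above (illustrative sizes only; the
 * real values depend on sb->s_blocksize and the on-disk structure sizes):
 * with a 4096-byte block, a 24-byte struct ocfs2_local_disk_dqblk and an
 * OCFS2_QBLK_RESERVED_SPACE of 8, ol_quota_entries_per_block() gives
 * (4096 - 8) / 24 = 170 entries per block.  Chunk c then starts at file
 * block 1 + (ol_chunk_blocks(sb) + 1) * c: one info block at the front of
 * the file, plus a header block in front of each chunk's data blocks. */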
94
95/* Write bufferhead into the fs */
96static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
97 void (*modify)(struct buffer_head *, void *), void *private)
98{
99 struct super_block *sb = inode->i_sb;
100 handle_t *handle;
101 int status;
102
103 handle = ocfs2_start_trans(OCFS2_SB(sb), 1);
104 if (IS_ERR(handle)) {
105 status = PTR_ERR(handle);
106 mlog_errno(status);
107 return status;
108 }
109 status = ocfs2_journal_access_dq(handle, inode, bh,
110 OCFS2_JOURNAL_ACCESS_WRITE);
111 if (status < 0) {
112 mlog_errno(status);
113 ocfs2_commit_trans(OCFS2_SB(sb), handle);
114 return status;
115 }
116 lock_buffer(bh);
117 modify(bh, private);
118 unlock_buffer(bh);
119 status = ocfs2_journal_dirty(handle, bh);
120 if (status < 0) {
121 mlog_errno(status);
122 ocfs2_commit_trans(OCFS2_SB(sb), handle);
123 return status;
124 }
125 status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
126 if (status < 0) {
127 mlog_errno(status);
128 return status;
129 }
130 return 0;
131}
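/* ocfs2_modify_bh() is the common "tiny update" helper used below: it wraps
 * a single journalled buffer modification in its own one-credit transaction,
 * taking journal access before the edit and dirtying the buffer afterwards,
 * so callers such as olq_update_info() only supply the in-memory edit. */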
132
133/* Check whether we understand format of quota files */
134static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
135{
136 unsigned int lmagics[MAXQUOTAS] = OCFS2_LOCAL_QMAGICS;
137 unsigned int lversions[MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS;
138 unsigned int gmagics[MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS;
139 unsigned int gversions[MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS;
140 unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
141 GROUP_QUOTA_SYSTEM_INODE };
142 struct buffer_head *bh = NULL;
143 struct inode *linode = sb_dqopt(sb)->files[type];
144 struct inode *ginode = NULL;
145 struct ocfs2_disk_dqheader *dqhead;
146 int status, ret = 0;
147
148 /* First check whether we understand local quota file */
149 status = ocfs2_read_quota_block(linode, 0, &bh);
150 if (status) {
151 mlog_errno(status);
152 mlog(ML_ERROR, "failed to read quota file header (type=%d)\n",
153 type);
154 goto out_err;
155 }
156 dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data);
157 if (le32_to_cpu(dqhead->dqh_magic) != lmagics[type]) {
158 mlog(ML_ERROR, "quota file magic does not match (%u != %u),"
159 " type=%d\n", le32_to_cpu(dqhead->dqh_magic),
160 lmagics[type], type);
161 goto out_err;
162 }
163 if (le32_to_cpu(dqhead->dqh_version) != lversions[type]) {
164 mlog(ML_ERROR, "quota file version does not match (%u != %u),"
165 " type=%d\n", le32_to_cpu(dqhead->dqh_version),
166 lversions[type], type);
167 goto out_err;
168 }
169 brelse(bh);
170 bh = NULL;
171
172 /* Next check whether we understand global quota file */
173 ginode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
174 OCFS2_INVALID_SLOT);
175 if (!ginode) {
176 mlog(ML_ERROR, "cannot get global quota file inode "
177 "(type=%d)\n", type);
178 goto out_err;
179 }
180 /* Since the header is read only, we don't care about locking */
181 status = ocfs2_read_quota_block(ginode, 0, &bh);
182 if (status) {
183 mlog_errno(status);
184 mlog(ML_ERROR, "failed to read global quota file header "
185 "(type=%d)\n", type);
186 goto out_err;
187 }
188 dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data);
189 if (le32_to_cpu(dqhead->dqh_magic) != gmagics[type]) {
190 mlog(ML_ERROR, "global quota file magic does not match "
191 "(%u != %u), type=%d\n",
192 le32_to_cpu(dqhead->dqh_magic), gmagics[type], type);
193 goto out_err;
194 }
195 if (le32_to_cpu(dqhead->dqh_version) != gversions[type]) {
196 mlog(ML_ERROR, "global quota file version does not match "
197 "(%u != %u), type=%d\n",
198 le32_to_cpu(dqhead->dqh_version), gversions[type],
199 type);
200 goto out_err;
201 }
202
203 ret = 1;
204out_err:
205 brelse(bh);
206 iput(ginode);
207 return ret;
208}
209
210/* Release given list of quota file chunks */
211static void ocfs2_release_local_quota_bitmaps(struct list_head *head)
212{
213 struct ocfs2_quota_chunk *pos, *next;
214
215 list_for_each_entry_safe(pos, next, head, qc_chunk) {
216 list_del(&pos->qc_chunk);
217 brelse(pos->qc_headerbh);
218 kmem_cache_free(ocfs2_qf_chunk_cachep, pos);
219 }
220}
221
222/* Load quota bitmaps into memory */
223static int ocfs2_load_local_quota_bitmaps(struct inode *inode,
224 struct ocfs2_local_disk_dqinfo *ldinfo,
225 struct list_head *head)
226{
227 struct ocfs2_quota_chunk *newchunk;
228 int i, status;
229
230 INIT_LIST_HEAD(head);
231 for (i = 0; i < le32_to_cpu(ldinfo->dqi_chunks); i++) {
232 newchunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS);
233 if (!newchunk) {
234 ocfs2_release_local_quota_bitmaps(head);
235 return -ENOMEM;
236 }
237 newchunk->qc_num = i;
238 newchunk->qc_headerbh = NULL;
239 status = ocfs2_read_quota_block(inode,
240 ol_quota_chunk_block(inode->i_sb, i),
241 &newchunk->qc_headerbh);
242 if (status) {
243 mlog_errno(status);
244 kmem_cache_free(ocfs2_qf_chunk_cachep, newchunk);
245 ocfs2_release_local_quota_bitmaps(head);
246 return status;
247 }
248 list_add_tail(&newchunk->qc_chunk, head);
249 }
250 return 0;
251}
252
253static void olq_update_info(struct buffer_head *bh, void *private)
254{
255 struct mem_dqinfo *info = private;
256 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
257 struct ocfs2_local_disk_dqinfo *ldinfo;
258
259 ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
260 OCFS2_LOCAL_INFO_OFF);
261 spin_lock(&dq_data_lock);
262 ldinfo->dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
263 ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks);
264 ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks);
265 spin_unlock(&dq_data_lock);
266}
267
268static int ocfs2_add_recovery_chunk(struct super_block *sb,
269 struct ocfs2_local_disk_chunk *dchunk,
270 int chunk,
271 struct list_head *head)
272{
273 struct ocfs2_recovery_chunk *rc;
274
275 rc = kmalloc(sizeof(struct ocfs2_recovery_chunk), GFP_NOFS);
276 if (!rc)
277 return -ENOMEM;
278 rc->rc_chunk = chunk;
279 rc->rc_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
280 if (!rc->rc_bitmap) {
281 kfree(rc);
282 return -ENOMEM;
283 }
284 memcpy(rc->rc_bitmap, dchunk->dqc_bitmap,
285 (ol_chunk_entries(sb) + 7) >> 3);
286 list_add_tail(&rc->rc_list, head);
287 return 0;
288}
289
290static void free_recovery_list(struct list_head *head)
291{
292 struct ocfs2_recovery_chunk *next;
293 struct ocfs2_recovery_chunk *rchunk;
294
295 list_for_each_entry_safe(rchunk, next, head, rc_list) {
296 list_del(&rchunk->rc_list);
297 kfree(rchunk->rc_bitmap);
298 kfree(rchunk);
299 }
300}
301
302void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec)
303{
304 int type;
305
306 for (type = 0; type < MAXQUOTAS; type++)
307 free_recovery_list(&(rec->r_list[type]));
308 kfree(rec);
309}
310
 311/* Load the entries in our quota file that we have to recover */
312static int ocfs2_recovery_load_quota(struct inode *lqinode,
313 struct ocfs2_local_disk_dqinfo *ldinfo,
314 int type,
315 struct list_head *head)
316{
317 struct super_block *sb = lqinode->i_sb;
318 struct buffer_head *hbh;
319 struct ocfs2_local_disk_chunk *dchunk;
320 int i, chunks = le32_to_cpu(ldinfo->dqi_chunks);
321 int status = 0;
322
323 for (i = 0; i < chunks; i++) {
324 hbh = NULL;
325 status = ocfs2_read_quota_block(lqinode,
326 ol_quota_chunk_block(sb, i),
327 &hbh);
328 if (status) {
329 mlog_errno(status);
330 break;
331 }
332 dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
333 if (le32_to_cpu(dchunk->dqc_free) < ol_chunk_entries(sb))
334 status = ocfs2_add_recovery_chunk(sb, dchunk, i, head);
335 brelse(hbh);
336 if (status < 0)
337 break;
338 }
339 if (status < 0)
340 free_recovery_list(head);
341 return status;
342}
343
344static struct ocfs2_quota_recovery *ocfs2_alloc_quota_recovery(void)
345{
346 int type;
347 struct ocfs2_quota_recovery *rec;
348
349 rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS);
350 if (!rec)
351 return NULL;
352 for (type = 0; type < MAXQUOTAS; type++)
353 INIT_LIST_HEAD(&(rec->r_list[type]));
354 return rec;
355}
356
357/* Load information we need for quota recovery into memory */
358struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
359 struct ocfs2_super *osb,
360 int slot_num)
361{
362 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
363 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
364 unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
365 LOCAL_GROUP_QUOTA_SYSTEM_INODE };
366 struct super_block *sb = osb->sb;
367 struct ocfs2_local_disk_dqinfo *ldinfo;
368 struct inode *lqinode;
369 struct buffer_head *bh;
370 int type;
371 int status = 0;
372 struct ocfs2_quota_recovery *rec;
373
374 mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num);
375 rec = ocfs2_alloc_quota_recovery();
376 if (!rec)
377 return ERR_PTR(-ENOMEM);
378 /* First init... */
379
380 for (type = 0; type < MAXQUOTAS; type++) {
381 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
382 continue;
 383		/* At this point, the slot's journal is already replayed so
 384		 * we can trust the metadata and data of the quota file */
385 lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
386 if (!lqinode) {
387 status = -ENOENT;
388 goto out;
389 }
390 status = ocfs2_inode_lock_full(lqinode, NULL, 1,
391 OCFS2_META_LOCK_RECOVERY);
392 if (status < 0) {
393 mlog_errno(status);
394 goto out_put;
395 }
396 /* Now read local header */
397 bh = NULL;
398 status = ocfs2_read_quota_block(lqinode, 0, &bh);
399 if (status) {
400 mlog_errno(status);
401 mlog(ML_ERROR, "failed to read quota file info header "
402 "(slot=%d type=%d)\n", slot_num, type);
403 goto out_lock;
404 }
405 ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
406 OCFS2_LOCAL_INFO_OFF);
407 status = ocfs2_recovery_load_quota(lqinode, ldinfo, type,
408 &rec->r_list[type]);
409 brelse(bh);
410out_lock:
411 ocfs2_inode_unlock(lqinode, 1);
412out_put:
413 iput(lqinode);
414 if (status < 0)
415 break;
416 }
417out:
418 if (status < 0) {
419 ocfs2_free_quota_recovery(rec);
420 rec = ERR_PTR(status);
421 }
422 return rec;
423}
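/* Recovery is deliberately split in two phases (as the "Beginning"/
 * "Finishing" messages suggest): ocfs2_begin_quota_recovery() runs while
 * the slot is still protected by OCFS2_META_LOCK_RECOVERY and only
 * snapshots which chunk entries need work, while
 * ocfs2_finish_quota_recovery() below replays those entries later against
 * the global file, skipping files some other node is already recovering. */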
424
 425/* Sync changes in the local quota file into the global quota file and
 426 * reinitialize the local quota file.
 427 * The function expects the local quota file to be already locked and
 428 * dqonoff_mutex to be held. */
429static int ocfs2_recover_local_quota_file(struct inode *lqinode,
430 int type,
431 struct ocfs2_quota_recovery *rec)
432{
433 struct super_block *sb = lqinode->i_sb;
434 struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
435 struct ocfs2_local_disk_chunk *dchunk;
436 struct ocfs2_local_disk_dqblk *dqblk;
437 struct dquot *dquot;
438 handle_t *handle;
439 struct buffer_head *hbh = NULL, *qbh = NULL;
440 int status = 0;
441 int bit, chunk;
442 struct ocfs2_recovery_chunk *rchunk, *next;
443 qsize_t spacechange, inodechange;
444
445 mlog_entry("ino=%lu type=%u", (unsigned long)lqinode->i_ino, type);
446
447 status = ocfs2_lock_global_qf(oinfo, 1);
448 if (status < 0)
449 goto out;
450
451 list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) {
452 chunk = rchunk->rc_chunk;
453 hbh = NULL;
454 status = ocfs2_read_quota_block(lqinode,
455 ol_quota_chunk_block(sb, chunk),
456 &hbh);
457 if (status) {
458 mlog_errno(status);
459 break;
460 }
461 dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
462 for_each_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) {
463 qbh = NULL;
464 status = ocfs2_read_quota_block(lqinode,
465 ol_dqblk_block(sb, chunk, bit),
466 &qbh);
467 if (status) {
468 mlog_errno(status);
469 break;
470 }
471 dqblk = (struct ocfs2_local_disk_dqblk *)(qbh->b_data +
472 ol_dqblk_block_off(sb, chunk, bit));
473 dquot = dqget(sb, le64_to_cpu(dqblk->dqb_id), type);
474 if (!dquot) {
475 status = -EIO;
476 mlog(ML_ERROR, "Failed to get quota structure "
477 "for id %u, type %d. Cannot finish quota "
478 "file recovery.\n",
479 (unsigned)le64_to_cpu(dqblk->dqb_id),
480 type);
481 goto out_put_bh;
482 }
483 handle = ocfs2_start_trans(OCFS2_SB(sb),
484 OCFS2_QSYNC_CREDITS);
485 if (IS_ERR(handle)) {
486 status = PTR_ERR(handle);
487 mlog_errno(status);
488 goto out_put_dquot;
489 }
490 mutex_lock(&sb_dqopt(sb)->dqio_mutex);
491 spin_lock(&dq_data_lock);
492 /* Add usage from quota entry into quota changes
493 * of our node. Auxiliary variables are important
494 * due to signedness */
495 spacechange = le64_to_cpu(dqblk->dqb_spacemod);
496 inodechange = le64_to_cpu(dqblk->dqb_inodemod);
497 dquot->dq_dqb.dqb_curspace += spacechange;
498 dquot->dq_dqb.dqb_curinodes += inodechange;
499 spin_unlock(&dq_data_lock);
 500			/* We want to drop the reference held by the crashed
 501			 * node. Since we have our own reference we know the
 502			 * global structure actually won't be freed. */
503 status = ocfs2_global_release_dquot(dquot);
504 if (status < 0) {
505 mlog_errno(status);
506 goto out_commit;
507 }
508 /* Release local quota file entry */
509 status = ocfs2_journal_access_dq(handle, lqinode,
510 qbh, OCFS2_JOURNAL_ACCESS_WRITE);
511 if (status < 0) {
512 mlog_errno(status);
513 goto out_commit;
514 }
515 lock_buffer(qbh);
516 WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap));
517 ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
518 le32_add_cpu(&dchunk->dqc_free, 1);
519 unlock_buffer(qbh);
520 status = ocfs2_journal_dirty(handle, qbh);
521 if (status < 0)
522 mlog_errno(status);
523out_commit:
524 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
525 ocfs2_commit_trans(OCFS2_SB(sb), handle);
526out_put_dquot:
527 dqput(dquot);
528out_put_bh:
529 brelse(qbh);
530 if (status < 0)
531 break;
532 }
533 brelse(hbh);
534 list_del(&rchunk->rc_list);
535 kfree(rchunk->rc_bitmap);
536 kfree(rchunk);
537 if (status < 0)
538 break;
539 }
540 ocfs2_unlock_global_qf(oinfo, 1);
541out:
542 if (status < 0)
543 free_recovery_list(&(rec->r_list[type]));
544 mlog_exit(status);
545 return status;
546}
547
548/* Recover local quota files for given node different from us */
549int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
550 struct ocfs2_quota_recovery *rec,
551 int slot_num)
552{
553 unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
554 LOCAL_GROUP_QUOTA_SYSTEM_INODE };
555 struct super_block *sb = osb->sb;
556 struct ocfs2_local_disk_dqinfo *ldinfo;
557 struct buffer_head *bh;
558 handle_t *handle;
559 int type;
560 int status = 0;
561 struct inode *lqinode;
562 unsigned int flags;
563
564 mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num);
565 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
566 for (type = 0; type < MAXQUOTAS; type++) {
567 if (list_empty(&(rec->r_list[type])))
568 continue;
569 mlog(0, "Recovering quota in slot %d\n", slot_num);
570 lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
571 if (!lqinode) {
572 status = -ENOENT;
573 goto out;
574 }
575 status = ocfs2_inode_lock_full(lqinode, NULL, 1,
576 OCFS2_META_LOCK_NOQUEUE);
577 /* Someone else is holding the lock? Then he must be
578 * doing the recovery. Just skip the file... */
579 if (status == -EAGAIN) {
580 mlog(ML_NOTICE, "skipping quota recovery for slot %d "
581 "because quota file is locked.\n", slot_num);
582 status = 0;
583 goto out_put;
584 } else if (status < 0) {
585 mlog_errno(status);
586 goto out_put;
587 }
588 /* Now read local header */
589 bh = NULL;
590 status = ocfs2_read_quota_block(lqinode, 0, &bh);
591 if (status) {
592 mlog_errno(status);
593 mlog(ML_ERROR, "failed to read quota file info header "
594 "(slot=%d type=%d)\n", slot_num, type);
595 goto out_lock;
596 }
597 ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
598 OCFS2_LOCAL_INFO_OFF);
599 /* Is recovery still needed? */
600 flags = le32_to_cpu(ldinfo->dqi_flags);
601 if (!(flags & OLQF_CLEAN))
602 status = ocfs2_recover_local_quota_file(lqinode,
603 type,
604 rec);
 605		/* We don't want to mark the file as clean when it is
 606		 * actually active */
607 if (slot_num == osb->slot_num)
608 goto out_bh;
609 /* Mark quota file as clean if we are recovering quota file of
610 * some other node. */
611 handle = ocfs2_start_trans(osb, 1);
612 if (IS_ERR(handle)) {
613 status = PTR_ERR(handle);
614 mlog_errno(status);
615 goto out_bh;
616 }
617 status = ocfs2_journal_access_dq(handle, lqinode, bh,
618 OCFS2_JOURNAL_ACCESS_WRITE);
619 if (status < 0) {
620 mlog_errno(status);
621 goto out_trans;
622 }
623 lock_buffer(bh);
624 ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN);
625 unlock_buffer(bh);
626 status = ocfs2_journal_dirty(handle, bh);
627 if (status < 0)
628 mlog_errno(status);
629out_trans:
630 ocfs2_commit_trans(osb, handle);
631out_bh:
632 brelse(bh);
633out_lock:
634 ocfs2_inode_unlock(lqinode, 1);
635out_put:
636 iput(lqinode);
637 if (status < 0)
638 break;
639 }
640out:
641 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
642 kfree(rec);
643 return status;
644}
645
646/* Read information header from quota file */
647static int ocfs2_local_read_info(struct super_block *sb, int type)
648{
649 struct ocfs2_local_disk_dqinfo *ldinfo;
650 struct mem_dqinfo *info = sb_dqinfo(sb, type);
651 struct ocfs2_mem_dqinfo *oinfo;
652 struct inode *lqinode = sb_dqopt(sb)->files[type];
653 int status;
654 struct buffer_head *bh = NULL;
655 struct ocfs2_quota_recovery *rec;
656 int locked = 0;
657
658 info->dqi_maxblimit = 0x7fffffffffffffffLL;
659 info->dqi_maxilimit = 0x7fffffffffffffffLL;
660 oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS);
 661	if (!oinfo) {
 662		status = -ENOMEM;
 663		mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota info.");
 664		goto out_err;
 665	}
666 info->dqi_priv = oinfo;
667 oinfo->dqi_type = type;
668 INIT_LIST_HEAD(&oinfo->dqi_chunk);
669 oinfo->dqi_rec = NULL;
670 oinfo->dqi_lqi_bh = NULL;
671 oinfo->dqi_ibh = NULL;
672
673 status = ocfs2_global_read_info(sb, type);
674 if (status < 0)
675 goto out_err;
676
677 status = ocfs2_inode_lock(lqinode, &oinfo->dqi_lqi_bh, 1);
678 if (status < 0) {
679 mlog_errno(status);
680 goto out_err;
681 }
682 locked = 1;
683
684 /* Now read local header */
685 status = ocfs2_read_quota_block(lqinode, 0, &bh);
686 if (status) {
687 mlog_errno(status);
688 mlog(ML_ERROR, "failed to read quota file info header "
689 "(type=%d)\n", type);
690 goto out_err;
691 }
692 ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
693 OCFS2_LOCAL_INFO_OFF);
694 info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
695 oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
696 oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
697 oinfo->dqi_ibh = bh;
698
 699	/* Did we crash while using the local quota file? */
700 if (!(info->dqi_flags & OLQF_CLEAN)) {
701 rec = OCFS2_SB(sb)->quota_rec;
702 if (!rec) {
703 rec = ocfs2_alloc_quota_recovery();
704 if (!rec) {
705 status = -ENOMEM;
706 mlog_errno(status);
707 goto out_err;
708 }
709 OCFS2_SB(sb)->quota_rec = rec;
710 }
711
712 status = ocfs2_recovery_load_quota(lqinode, ldinfo, type,
713 &rec->r_list[type]);
714 if (status < 0) {
715 mlog_errno(status);
716 goto out_err;
717 }
718 }
719
720 status = ocfs2_load_local_quota_bitmaps(lqinode,
721 ldinfo,
722 &oinfo->dqi_chunk);
723 if (status < 0) {
724 mlog_errno(status);
725 goto out_err;
726 }
727
728 /* Now mark quota file as used */
729 info->dqi_flags &= ~OLQF_CLEAN;
730 status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info);
731 if (status < 0) {
732 mlog_errno(status);
733 goto out_err;
734 }
735
736 return 0;
737out_err:
738 if (oinfo) {
739 iput(oinfo->dqi_gqinode);
740 ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
741 ocfs2_lock_res_free(&oinfo->dqi_gqlock);
742 brelse(oinfo->dqi_lqi_bh);
743 if (locked)
744 ocfs2_inode_unlock(lqinode, 1);
745 ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
746 kfree(oinfo);
747 }
748 brelse(bh);
749 return -1;
750}
751
752/* Write local info to quota file */
753static int ocfs2_local_write_info(struct super_block *sb, int type)
754{
755 struct mem_dqinfo *info = sb_dqinfo(sb, type);
756 struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv)
757 ->dqi_ibh;
758 int status;
759
760 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info,
761 info);
762 if (status < 0) {
763 mlog_errno(status);
764 return -1;
765 }
766
767 return 0;
768}
769
770/* Release info from memory */
771static int ocfs2_local_free_info(struct super_block *sb, int type)
772{
773 struct mem_dqinfo *info = sb_dqinfo(sb, type);
774 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
775 struct ocfs2_quota_chunk *chunk;
776 struct ocfs2_local_disk_chunk *dchunk;
777 int mark_clean = 1, len;
778 int status;
779
780 /* At this point we know there are no more dquots, and thus
781 * even if there's a sync queued in pdflush, it won't
782 * find any dquots and will return without doing anything */
783 cancel_delayed_work_sync(&oinfo->dqi_sync_work);
784 iput(oinfo->dqi_gqinode);
785 ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
786 ocfs2_lock_res_free(&oinfo->dqi_gqlock);
787 list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) {
788 dchunk = (struct ocfs2_local_disk_chunk *)
789 (chunk->qc_headerbh->b_data);
790 if (chunk->qc_num < oinfo->dqi_chunks - 1) {
791 len = ol_chunk_entries(sb);
792 } else {
793 len = (oinfo->dqi_blocks -
794 ol_quota_chunk_block(sb, chunk->qc_num) - 1)
795 * ol_quota_entries_per_block(sb);
796 }
797 /* Not all entries free? Bug! */
798 if (le32_to_cpu(dchunk->dqc_free) != len) {
799 mlog(ML_ERROR, "releasing quota file with used "
800 "entries (type=%d)\n", type);
801 mark_clean = 0;
802 }
803 }
804 ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
805
806 /* dqonoff_mutex protects us against racing with recovery thread... */
807 if (oinfo->dqi_rec) {
808 ocfs2_free_quota_recovery(oinfo->dqi_rec);
809 mark_clean = 0;
810 }
811
812 if (!mark_clean)
813 goto out;
814
815 /* Mark local file as clean */
816 info->dqi_flags |= OLQF_CLEAN;
817 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
818 oinfo->dqi_ibh,
819 olq_update_info,
820 info);
821 if (status < 0) {
822 mlog_errno(status);
823 goto out;
824 }
825
826out:
827 ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1);
828 brelse(oinfo->dqi_ibh);
829 brelse(oinfo->dqi_lqi_bh);
830 kfree(oinfo);
831 return 0;
832}
833
834static void olq_set_dquot(struct buffer_head *bh, void *private)
835{
836 struct ocfs2_dquot *od = private;
837 struct ocfs2_local_disk_dqblk *dqblk;
838 struct super_block *sb = od->dq_dquot.dq_sb;
839
840 dqblk = (struct ocfs2_local_disk_dqblk *)(bh->b_data
841 + ol_dqblk_block_offset(sb, od->dq_local_off));
842
843 dqblk->dqb_id = cpu_to_le64(od->dq_dquot.dq_id);
844 spin_lock(&dq_data_lock);
845 dqblk->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace -
846 od->dq_origspace);
847 dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes -
848 od->dq_originodes);
849 spin_unlock(&dq_data_lock);
850 mlog(0, "Writing local dquot %u space %lld inodes %lld\n",
851 od->dq_dquot.dq_id, (long long)le64_to_cpu(dqblk->dqb_spacemod),
852 (long long)le64_to_cpu(dqblk->dqb_inodemod));
853}
854
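/*
 * Illustrative sketch, not part of the patch: olq_set_dquot() above
 * stores *deltas* (dqb_spacemod/dqb_inodemod) -- the change since the
 * dquot was read -- rather than absolute usage, which is what lets
 * crash recovery replay a node's local file into the global file
 * without double counting.  Stand-alone model; toy_* names are
 * hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

struct toy_dquot {
	int64_t curspace;	/* current usage, models dqb_curspace */
	int64_t origspace;	/* usage when the dquot was read */
};

static int64_t toy_spacemod(const struct toy_dquot *d)
{
	return d->curspace - d->origspace;	/* what dqb_spacemod records */
}

int main(void)
{
	struct toy_dquot d = { .curspace = 4096 + 512, .origspace = 4096 };

	/* Recovery adds the delta to the global usage. */
	printf("replay delta: %lld bytes\n", (long long)toy_spacemod(&d));
	return 0;
}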
855/* Write dquot to local quota file */
856static int ocfs2_local_write_dquot(struct dquot *dquot)
857{
858 struct super_block *sb = dquot->dq_sb;
859 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
860 struct buffer_head *bh = NULL;
861 int status;
862
863 status = ocfs2_read_quota_block(sb_dqopt(sb)->files[dquot->dq_type],
864 ol_dqblk_file_block(sb, od->dq_local_off),
865 &bh);
866 if (status) {
867 mlog_errno(status);
868 goto out;
869 }
870 status = ocfs2_modify_bh(sb_dqopt(sb)->files[dquot->dq_type], bh,
871 olq_set_dquot, od);
872 if (status < 0) {
873 mlog_errno(status);
874 goto out;
875 }
876out:
877 brelse(bh);
878 return status;
879}
880
881/* Find free entry in local quota file */
882static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb,
883 int type,
884 int *offset)
885{
886 struct mem_dqinfo *info = sb_dqinfo(sb, type);
887 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
888 struct ocfs2_quota_chunk *chunk;
889 struct ocfs2_local_disk_chunk *dchunk;
890 int found = 0, len;
891
892 list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) {
893 dchunk = (struct ocfs2_local_disk_chunk *)
894 chunk->qc_headerbh->b_data;
895 if (le32_to_cpu(dchunk->dqc_free) > 0) {
896 found = 1;
897 break;
898 }
899 }
900 if (!found)
901 return NULL;
902
903 if (chunk->qc_num < oinfo->dqi_chunks - 1) {
904 len = ol_chunk_entries(sb);
905 } else {
906 len = (oinfo->dqi_blocks -
907 ol_quota_chunk_block(sb, chunk->qc_num) - 1)
908 * ol_quota_entries_per_block(sb);
909 }
910
911 found = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, len, 0);
912 /* We failed? */
913 if (found == len) {
914 mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u"
915 " entries free (type=%d)\n", chunk->qc_num,
916 le32_to_cpu(dchunk->dqc_free), type);
917 return ERR_PTR(-EIO);
918 }
919 *offset = found;
920 return chunk;
921}
922
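/*
 * Illustrative sketch, not part of the patch: a minimal user-space
 * model of the free-entry scan above.  Each chunk header keeps a free
 * counter plus a bitmap; finding a slot is "first chunk with
 * dqc_free > 0, then first zero bit in its bitmap".  toy_* names and
 * sizes are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_chunk {
	uint32_t free;		/* models dchunk->dqc_free */
	uint8_t  bitmap[16];	/* models dchunk->dqc_bitmap, 128 entries */
};

static int toy_find_free(const struct toy_chunk *c, int entries)
{
	int i;

	if (!c->free)
		return -1;	/* caller moves on to the next chunk */
	for (i = 0; i < entries; i++)
		if (!(c->bitmap[i / 8] & (1 << (i % 8))))
			return i;
	return -1;		/* free count lied: corruption in ocfs2 */
}

int main(void)
{
	struct toy_chunk c = { .free = 126 };

	memset(c.bitmap, 0, sizeof(c.bitmap));
	c.bitmap[0] = 0x03;	/* entries 0 and 1 already used */
	printf("first free entry: %d\n", toy_find_free(&c, 128));	/* 2 */
	return 0;
}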
923/* Add new chunk to the local quota file */
924static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
925 struct super_block *sb,
926 int type,
927 int *offset)
928{
929 struct mem_dqinfo *info = sb_dqinfo(sb, type);
930 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
931 struct inode *lqinode = sb_dqopt(sb)->files[type];
932 struct ocfs2_quota_chunk *chunk = NULL;
933 struct ocfs2_local_disk_chunk *dchunk;
934 int status;
935 handle_t *handle;
936 struct buffer_head *bh = NULL;
937 u64 p_blkno;
938
939 /* We are protected by dqio_sem so no locking needed */
940 status = ocfs2_extend_no_holes(lqinode,
941 lqinode->i_size + 2 * sb->s_blocksize,
942 lqinode->i_size);
943 if (status < 0) {
944 mlog_errno(status);
945 goto out;
946 }
947 status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
948 lqinode->i_size + 2 * sb->s_blocksize);
949 if (status < 0) {
950 mlog_errno(status);
951 goto out;
952 }
953
954 chunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS);
955 if (!chunk) {
956 status = -ENOMEM;
957 mlog_errno(status);
958 goto out;
959 }
960
961 down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
962 status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
963 &p_blkno, NULL, NULL);
964 up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
965 if (status < 0) {
966 mlog_errno(status);
967 goto out;
968 }
969 bh = sb_getblk(sb, p_blkno);
970 if (!bh) {
971 status = -ENOMEM;
972 mlog_errno(status);
973 goto out;
974 }
975 dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
976
977 handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
978 if (IS_ERR(handle)) {
979 status = PTR_ERR(handle);
980 mlog_errno(status);
981 goto out;
982 }
983
984 status = ocfs2_journal_access_dq(handle, lqinode, bh,
985 OCFS2_JOURNAL_ACCESS_WRITE);
986 if (status < 0) {
987 mlog_errno(status);
988 goto out_trans;
989 }
990 lock_buffer(bh);
991 dchunk->dqc_free = cpu_to_le32(ol_quota_entries_per_block(sb));
992 memset(dchunk->dqc_bitmap, 0,
993 sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
994 OCFS2_QBLK_RESERVED_SPACE);
995 set_buffer_uptodate(bh);
996 unlock_buffer(bh);
997 status = ocfs2_journal_dirty(handle, bh);
998 if (status < 0) {
999 mlog_errno(status);
1000 goto out_trans;
1001 }
1002
1003 oinfo->dqi_blocks += 2;
1004 oinfo->dqi_chunks++;
1005 status = ocfs2_local_write_info(sb, type);
1006 if (status < 0) {
1007 mlog_errno(status);
1008 goto out_trans;
1009 }
1010 status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
1011 if (status < 0) {
1012 mlog_errno(status);
1013 goto out;
1014 }
1015
1016 list_add_tail(&chunk->qc_chunk, &oinfo->dqi_chunk);
1017 chunk->qc_num = list_entry(chunk->qc_chunk.prev,
1018 struct ocfs2_quota_chunk,
1019 qc_chunk)->qc_num + 1;
1020 chunk->qc_headerbh = bh;
1021 *offset = 0;
1022 return chunk;
1023out_trans:
1024 ocfs2_commit_trans(OCFS2_SB(sb), handle);
1025out:
1026 brelse(bh);
1027 kmem_cache_free(ocfs2_qf_chunk_cachep, chunk);
1028 return ERR_PTR(status);
1029}
1030
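/*
 * Illustrative sketch, not part of the patch: the chunk geometry behind
 * ocfs2_local_quota_add_chunk() and the extend path below.  A chunk is
 * one header block (bitmap + free count) followed by data blocks of
 * quota entries, so adding a chunk grows the file by two blocks (the
 * new header plus its first data block, matching dqi_blocks += 2),
 * while extending the last chunk adds one data block.  Stand-alone
 * model; the constant is hypothetical, not the real on-disk value.
 */
#include <stdio.h>

#define TOY_ENTRIES_PER_BLOCK 32   /* models ol_quota_entries_per_block() */

/* Entries usable in a chunk whose data currently spans data_blocks. */
static int toy_chunk_entries(int data_blocks)
{
	return data_blocks * TOY_ENTRIES_PER_BLOCK;
}

int main(void)
{
	int file_blocks = 1;	/* local dqinfo header block */

	file_blocks += 2;	/* add_chunk: +1 chunk header, +1 data block */
	printf("entries after add_chunk: %d\n", toy_chunk_entries(1));
	file_blocks += 1;	/* extend: one more data block */
	printf("entries after extend:   %d\n", toy_chunk_entries(2));
	printf("file blocks:            %d\n", file_blocks);
	return 0;
}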
1031/* Extend the local quota file to make room for a new entry */
1032static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1033 struct super_block *sb,
1034 int type,
1035 int *offset)
1036{
1037 struct mem_dqinfo *info = sb_dqinfo(sb, type);
1038 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
1039 struct ocfs2_quota_chunk *chunk;
1040 struct inode *lqinode = sb_dqopt(sb)->files[type];
1041 struct ocfs2_local_disk_chunk *dchunk;
1042 int epb = ol_quota_entries_per_block(sb);
1043 unsigned int chunk_blocks;
1044 int status;
1045 handle_t *handle;
1046
1047 if (list_empty(&oinfo->dqi_chunk))
1048 return ocfs2_local_quota_add_chunk(sb, type, offset);
1049 /* Is the last chunk full? */
1050 chunk = list_entry(oinfo->dqi_chunk.prev,
1051 struct ocfs2_quota_chunk, qc_chunk);
1052 chunk_blocks = oinfo->dqi_blocks -
1053 ol_quota_chunk_block(sb, chunk->qc_num) - 1;
1054 if (ol_chunk_blocks(sb) == chunk_blocks)
1055 return ocfs2_local_quota_add_chunk(sb, type, offset);
1056
1057 /* We are protected by dqio_sem so no locking needed */
1058 status = ocfs2_extend_no_holes(lqinode,
1059 lqinode->i_size + sb->s_blocksize,
1060 lqinode->i_size);
1061 if (status < 0) {
1062 mlog_errno(status);
1063 goto out;
1064 }
1065 status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
1066 lqinode->i_size + sb->s_blocksize);
1067 if (status < 0) {
1068 mlog_errno(status);
1069 goto out;
1070 }
1071 handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
1072 if (IS_ERR(handle)) {
1073 status = PTR_ERR(handle);
1074 mlog_errno(status);
1075 goto out;
1076 }
1077 status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh,
1078 OCFS2_JOURNAL_ACCESS_WRITE);
1079 if (status < 0) {
1080 mlog_errno(status);
1081 goto out_trans;
1082 }
1083
1084 dchunk = (struct ocfs2_local_disk_chunk *)chunk->qc_headerbh->b_data;
1085 lock_buffer(chunk->qc_headerbh);
1086 le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb));
1087 unlock_buffer(chunk->qc_headerbh);
1088 status = ocfs2_journal_dirty(handle, chunk->qc_headerbh);
1089 if (status < 0) {
1090 mlog_errno(status);
1091 goto out_trans;
1092 }
1093 oinfo->dqi_blocks++;
1094 status = ocfs2_local_write_info(sb, type);
1095 if (status < 0) {
1096 mlog_errno(status);
1097 goto out_trans;
1098 }
1099
1100 status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
1101 if (status < 0) {
1102 mlog_errno(status);
1103 goto out;
1104 }
1105 *offset = chunk_blocks * epb;
1106 return chunk;
1107out_trans:
1108 ocfs2_commit_trans(OCFS2_SB(sb), handle);
1109out:
1110 return ERR_PTR(status);
1111}
1112
1113static void olq_alloc_dquot(struct buffer_head *bh, void *private)
1114{
1115 int *offset = private;
1116 struct ocfs2_local_disk_chunk *dchunk;
1117
1118 dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
1119 ocfs2_set_bit(*offset, dchunk->dqc_bitmap);
1120 le32_add_cpu(&dchunk->dqc_free, -1);
1121}
1122
1123/* Create a dquot in the local file for the given id */
1124static int ocfs2_create_local_dquot(struct dquot *dquot)
1125{
1126 struct super_block *sb = dquot->dq_sb;
1127 int type = dquot->dq_type;
1128 struct inode *lqinode = sb_dqopt(sb)->files[type];
1129 struct ocfs2_quota_chunk *chunk;
1130 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
1131 int offset;
1132 int status;
1133
1134 chunk = ocfs2_find_free_entry(sb, type, &offset);
1135 if (!chunk) {
1136 chunk = ocfs2_extend_local_quota_file(sb, type, &offset);
1137 if (IS_ERR(chunk))
1138 return PTR_ERR(chunk);
1139 } else if (IS_ERR(chunk)) {
1140 return PTR_ERR(chunk);
1141 }
1142 od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset);
1143 od->dq_chunk = chunk;
1144
1145 /* Initialize dquot structure on disk */
1146 status = ocfs2_local_write_dquot(dquot);
1147 if (status < 0) {
1148 mlog_errno(status);
1149 goto out;
1150 }
1151
1152 /* Mark structure as allocated */
1153 status = ocfs2_modify_bh(lqinode, chunk->qc_headerbh, olq_alloc_dquot,
1154 &offset);
1155 if (status < 0) {
1156 mlog_errno(status);
1157 goto out;
1158 }
1159out:
1160 return status;
1161}
1162
1163/* Create entry in local file for dquot, load data from the global file */
1164static int ocfs2_local_read_dquot(struct dquot *dquot)
1165{
1166 int status;
1167
1168 mlog_entry("id=%u, type=%d\n", dquot->dq_id, dquot->dq_type);
1169
1170 status = ocfs2_global_read_dquot(dquot);
1171 if (status < 0) {
1172 mlog_errno(status);
1173 goto out_err;
1174 }
1175
1176 /* Now create entry in the local quota file */
1177 status = ocfs2_create_local_dquot(dquot);
1178 if (status < 0) {
1179 mlog_errno(status);
1180 goto out_err;
1181 }
1182 mlog_exit(0);
1183 return 0;
1184out_err:
1185 mlog_exit(status);
1186 return status;
1187}
1188
1189/* Release dquot structure from local quota file. ocfs2_release_dquot() has
1190 * already started a transaction and obtained an exclusive lock on the
1191 * global quota file. */
1192static int ocfs2_local_release_dquot(struct dquot *dquot)
1193{
1194 int status;
1195 int type = dquot->dq_type;
1196 struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
1197 struct super_block *sb = dquot->dq_sb;
1198 struct ocfs2_local_disk_chunk *dchunk;
1199 int offset;
1200 handle_t *handle = journal_current_handle();
1201
1202 BUG_ON(!handle);
1203 /* First write all local changes to global file */
1204 status = ocfs2_global_release_dquot(dquot);
1205 if (status < 0) {
1206 mlog_errno(status);
1207 goto out;
1208 }
1209
1210 status = ocfs2_journal_access_dq(handle, sb_dqopt(sb)->files[type],
1211 od->dq_chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE);
1212 if (status < 0) {
1213 mlog_errno(status);
1214 goto out;
1215 }
1216 offset = ol_dqblk_chunk_off(sb, od->dq_chunk->qc_num,
1217 od->dq_local_off);
1218 dchunk = (struct ocfs2_local_disk_chunk *)
1219 (od->dq_chunk->qc_headerbh->b_data);
1220 /* Mark structure as freed */
1221 lock_buffer(od->dq_chunk->qc_headerbh);
1222 ocfs2_clear_bit(offset, dchunk->dqc_bitmap);
1223 le32_add_cpu(&dchunk->dqc_free, 1);
1224 unlock_buffer(od->dq_chunk->qc_headerbh);
1225 status = ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
1226 if (status < 0) {
1227 mlog_errno(status);
1228 goto out;
1229 }
1230 status = 0;
1231out:
1232 /* Clear the read bit so that the next time someone uses this
1233 * dquot they read fresh info from disk and allocate a local
1234 * dquot structure */
1235 clear_bit(DQ_READ_B, &dquot->dq_flags);
1236 return status;
1237}
1238
1239static struct quota_format_ops ocfs2_format_ops = {
1240 .check_quota_file = ocfs2_local_check_quota_file,
1241 .read_file_info = ocfs2_local_read_info,
1242 .write_file_info = ocfs2_global_write_info,
1243 .free_file_info = ocfs2_local_free_info,
1244 .read_dqblk = ocfs2_local_read_dquot,
1245 .commit_dqblk = ocfs2_local_write_dquot,
1246 .release_dqblk = ocfs2_local_release_dquot,
1247};
1248
1249struct quota_format_type ocfs2_quota_format = {
1250 .qf_fmt_id = QFMT_OCFS2,
1251 .qf_ops = &ocfs2_format_ops,
1252 .qf_owner = THIS_MODULE
1253};
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index ffd48db229a7..424adaa5f900 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -106,8 +106,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
 	mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
 		   new_clusters, first_new_cluster);
 
-	ret = ocfs2_journal_access(handle, bm_inode, group_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_gd(handle, bm_inode, group_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -141,8 +141,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
 	}
 
 	/* update the inode accordingly. */
-	ret = ocfs2_journal_access(handle, bm_inode, bm_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, bm_inode, bm_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_rollback;
@@ -314,6 +314,10 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
 
 	fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
 
+	/* main_bm_bh is validated by inode read inside ocfs2_inode_lock(),
+	 * so any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+
 	if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
 	    ocfs2_group_bitmap_size(osb->sb) * 8) {
 		mlog(ML_ERROR, "The disk is too old and small. "
@@ -322,30 +326,18 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
 		goto out_unlock;
 	}
 
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(main_bm_inode->i_sb, fe);
-		ret = -EIO;
-		goto out_unlock;
-	}
-
 	first_new_cluster = le32_to_cpu(fe->i_clusters);
 	lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
 					      first_new_cluster - 1);
 
-	ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh);
+	ret = ocfs2_read_group_descriptor(main_bm_inode, fe, lgd_blkno,
+					  &group_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_unlock;
 	}
-
 	group = (struct ocfs2_group_desc *)group_bh->b_data;
 
-	ret = ocfs2_check_group_descriptor(inode->i_sb, fe, group);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_unlock;
-	}
-
 	cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
 	if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters >
 	    le16_to_cpu(fe->id2.i_chain.cl_cpg)) {
@@ -398,41 +390,16 @@ static int ocfs2_check_new_group(struct inode *inode,
 				 struct buffer_head *group_bh)
 {
 	int ret;
-	struct ocfs2_group_desc *gd;
+	struct ocfs2_group_desc *gd =
+		(struct ocfs2_group_desc *)group_bh->b_data;
 	u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc);
-	unsigned int max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) *
-				le16_to_cpu(di->id2.i_chain.cl_bpc);
-
 
-	gd = (struct ocfs2_group_desc *)group_bh->b_data;
+	ret = ocfs2_check_group_descriptor(inode->i_sb, di, group_bh);
+	if (ret)
+		goto out;
 
-	ret = -EIO;
-	if (!OCFS2_IS_VALID_GROUP_DESC(gd))
-		mlog(ML_ERROR, "Group descriptor # %llu isn't valid.\n",
-		     (unsigned long long)le64_to_cpu(gd->bg_blkno));
-	else if (di->i_blkno != gd->bg_parent_dinode)
-		mlog(ML_ERROR, "Group descriptor # %llu has bad parent "
-		     "pointer (%llu, expected %llu)\n",
-		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-		     (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
-		     (unsigned long long)le64_to_cpu(di->i_blkno));
-	else if (le16_to_cpu(gd->bg_bits) > max_bits)
-		mlog(ML_ERROR, "Group descriptor # %llu has bit count of %u\n",
-		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-		     le16_to_cpu(gd->bg_bits));
-	else if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits))
-		mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
-		     "claims that %u are free\n",
-		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-		     le16_to_cpu(gd->bg_bits),
-		     le16_to_cpu(gd->bg_free_bits_count));
-	else if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size)))
-		mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
-		     "max bitmap bits of %u\n",
-		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-		     le16_to_cpu(gd->bg_bits),
-		     8 * le16_to_cpu(gd->bg_size));
-	else if (le16_to_cpu(gd->bg_chain) != input->chain)
+	ret = -EINVAL;
+	if (le16_to_cpu(gd->bg_chain) != input->chain)
 		mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u "
 		     "while input has %u set.\n",
 		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
@@ -451,6 +418,7 @@ static int ocfs2_check_new_group(struct inode *inode,
 	else
 		ret = 0;
 
+out:
 	return ret;
 }
 
@@ -568,8 +536,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 	cl = &fe->id2.i_chain;
 	cr = &cl->cl_recs[input->chain];
 
-	ret = ocfs2_journal_access(handle, main_bm_inode, group_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_gd(handle, main_bm_inode, group_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -584,8 +552,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 		goto out_commit;
 	}
 
-	ret = ocfs2_journal_access(handle, main_bm_inode, main_bm_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, main_bm_inode, main_bm_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_commit;
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bdda2d8f8508..40661e7824e9 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -151,7 +151,7 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
 	 * this is not true, the read of -1 (UINT64_MAX) will fail.
 	 */
 	ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh,
-				OCFS2_BH_IGNORE_CACHE);
+				OCFS2_BH_IGNORE_CACHE, NULL);
 	if (ret == 0) {
 		spin_lock(&osb->osb_lock);
 		ocfs2_update_slot_info(si);
@@ -405,7 +405,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
 
 		bh = NULL; /* Acquire a fresh bh */
 		status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh,
-					   OCFS2_BH_IGNORE_CACHE);
+					   OCFS2_BH_IGNORE_CACHE, NULL);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
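/*
 * Illustrative sketch, not part of the patch: the extra NULL argument
 * above is the new per-read validation callback taken by
 * ocfs2_read_blocks(); NULL means "no validation" for raw reads like
 * the slot map.  A stand-alone model of the pattern, with hypothetical
 * toy_* names:
 */
#include <stdio.h>

struct toy_buffer { const char *data; };

typedef int (*toy_validate_t)(struct toy_buffer *bh);

static int toy_read_block(struct toy_buffer *bh, toy_validate_t validate)
{
	/* ...real code would fill bh from disk here... */
	if (validate)
		return validate(bh);	/* reject bad metadata early */
	return 0;			/* NULL: caller checks, or doesn't */
}

static int toy_check_signature(struct toy_buffer *bh)
{
	return bh->data[0] == 'G' ? 0 : -22;	/* -EINVAL */
}

int main(void)
{
	struct toy_buffer bh = { .data = "GROUP01" };

	printf("validated read: %d\n", toy_read_block(&bh, toy_check_signature));
	printf("raw read:       %d\n", toy_read_block(&bh, NULL));
	return 0;
}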
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c5ff18b46b57..a69628603e18 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -35,6 +35,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "inode.h"
 #include "journal.h"
@@ -145,62 +146,183 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
 	return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
 }
 
-/* somewhat more expensive than our other checks, so use sparingly. */
-int ocfs2_check_group_descriptor(struct super_block *sb,
-				 struct ocfs2_dinode *di,
-				 struct ocfs2_group_desc *gd)
+#define do_error(fmt, ...)						\
+	do {								\
+		if (clean_error)					\
+			mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__);	\
+		else							\
+			ocfs2_error(sb, fmt, ##__VA_ARGS__);		\
+	} while (0)
+
+static int ocfs2_validate_gd_self(struct super_block *sb,
+				  struct buffer_head *bh,
+				  int clean_error)
 {
-	unsigned int max_bits;
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 
 	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(sb, gd);
-		return -EIO;
+		do_error("Group descriptor #%llu has bad signature %.*s",
+			 (unsigned long long)bh->b_blocknr, 7,
+			 gd->bg_signature);
+		return -EINVAL;
 	}
 
+	if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
+		do_error("Group descriptor #%llu has an invalid bg_blkno "
+			 "of %llu",
+			 (unsigned long long)bh->b_blocknr,
+			 (unsigned long long)le64_to_cpu(gd->bg_blkno));
+		return -EINVAL;
+	}
+
+	if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
+		do_error("Group descriptor #%llu has an invalid "
+			 "fs_generation of #%u",
+			 (unsigned long long)bh->b_blocknr,
+			 le32_to_cpu(gd->bg_generation));
+		return -EINVAL;
+	}
+
+	if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
+		do_error("Group descriptor #%llu has bit count %u but "
+			 "claims that %u are free",
+			 (unsigned long long)bh->b_blocknr,
+			 le16_to_cpu(gd->bg_bits),
+			 le16_to_cpu(gd->bg_free_bits_count));
+		return -EINVAL;
+	}
+
+	if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
+		do_error("Group descriptor #%llu has bit count %u but "
+			 "max bitmap bits of %u",
+			 (unsigned long long)bh->b_blocknr,
+			 le16_to_cpu(gd->bg_bits),
+			 8 * le16_to_cpu(gd->bg_size));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int ocfs2_validate_gd_parent(struct super_block *sb,
+				    struct ocfs2_dinode *di,
+				    struct buffer_head *bh,
+				    int clean_error)
+{
+	unsigned int max_bits;
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+
 	if (di->i_blkno != gd->bg_parent_dinode) {
-		ocfs2_error(sb, "Group descriptor # %llu has bad parent "
-			    "pointer (%llu, expected %llu)",
-			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
-			    (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
-			    (unsigned long long)le64_to_cpu(di->i_blkno));
-		return -EIO;
+		do_error("Group descriptor #%llu has bad parent "
+			 "pointer (%llu, expected %llu)",
+			 (unsigned long long)bh->b_blocknr,
+			 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
+			 (unsigned long long)le64_to_cpu(di->i_blkno));
+		return -EINVAL;
 	}
 
 	max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
 	if (le16_to_cpu(gd->bg_bits) > max_bits) {
-		ocfs2_error(sb, "Group descriptor # %llu has bit count of %u",
-			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
-			    le16_to_cpu(gd->bg_bits));
-		return -EIO;
+		do_error("Group descriptor #%llu has bit count of %u",
+			 (unsigned long long)bh->b_blocknr,
+			 le16_to_cpu(gd->bg_bits));
+		return -EINVAL;
 	}
 
 	if (le16_to_cpu(gd->bg_chain) >=
 	    le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
-		ocfs2_error(sb, "Group descriptor # %llu has bad chain %u",
-			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
-			    le16_to_cpu(gd->bg_chain));
-		return -EIO;
+		do_error("Group descriptor #%llu has bad chain %u",
+			 (unsigned long long)bh->b_blocknr,
+			 le16_to_cpu(gd->bg_chain));
+		return -EINVAL;
 	}
 
-	if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
-		ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
-			    "claims that %u are free",
-			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
-			    le16_to_cpu(gd->bg_bits),
-			    le16_to_cpu(gd->bg_free_bits_count));
-		return -EIO;
-	}
+	return 0;
+}
 
-	if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
-		ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
-			    "max bitmap bits of %u",
-			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
-			    le16_to_cpu(gd->bg_bits),
-			    8 * le16_to_cpu(gd->bg_size));
-		return -EIO;
+#undef do_error
+
+/*
+ * This version only prints errors.  It does not fail the filesystem, and
+ * exists only for resize.
+ */
+int ocfs2_check_group_descriptor(struct super_block *sb,
+				 struct ocfs2_dinode *di,
+				 struct buffer_head *bh)
+{
+	int rc;
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
+	if (rc) {
+		mlog(ML_ERROR,
+		     "Checksum failed for group descriptor %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+	} else
+		rc = ocfs2_validate_gd_self(sb, bh, 1);
+	if (!rc)
+		rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
+
+	return rc;
+}
+
+static int ocfs2_validate_group_descriptor(struct super_block *sb,
+					   struct buffer_head *bh)
+{
+	int rc;
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+
+	mlog(0, "Validating group descriptor %llu\n",
+	     (unsigned long long)bh->b_blocknr);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
+	if (rc)
+		return rc;
+
+	/*
+	 * Errors after here are fatal.
+	 */
+
+	return ocfs2_validate_gd_self(sb, bh, 0);
+}
+
+int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
+				u64 gd_blkno, struct buffer_head **bh)
+{
+	int rc;
+	struct buffer_head *tmp = *bh;
+
+	rc = ocfs2_read_block(inode, gd_blkno, &tmp,
+			      ocfs2_validate_group_descriptor);
+	if (rc)
+		goto out;
+
+	rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
+	if (rc) {
+		brelse(tmp);
+		goto out;
 	}
 
-	return 0;
+	/* If ocfs2_read_block() got us a new bh, pass it up. */
+	if (!*bh)
+		*bh = tmp;
+
+out:
+	return rc;
 }
 
 static int ocfs2_block_group_fill(handle_t *handle,
@@ -225,10 +347,10 @@ static int ocfs2_block_group_fill(handle_t *handle,
 		goto bail;
 	}
 
-	status = ocfs2_journal_access(handle,
-				      alloc_inode,
-				      bg_bh,
-				      OCFS2_JOURNAL_ACCESS_CREATE);
+	status = ocfs2_journal_access_gd(handle,
+					 alloc_inode,
+					 bg_bh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -358,8 +480,8 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 
 	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 
-	status = ocfs2_journal_access(handle, alloc_inode,
-				      bh, OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, alloc_inode,
+					 bh, OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -441,11 +563,11 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
 	ac->ac_alloc_slot = slot;
 
 	fe = (struct ocfs2_dinode *) bh->b_data;
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
-		status = -EIO;
-		goto bail;
-	}
+
+	/* The bh was validated by the inode read inside
+	 * ocfs2_inode_lock().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+
 	if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
 		ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
 			    (unsigned long long)le64_to_cpu(fe->i_blkno));
@@ -790,10 +912,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
 	int offset, start, found, status = 0;
 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 
-	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg);
-		return -EIO;
-	}
+	/* Callers got this descriptor from
+	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
 
 	found = start = best_offset = best_size = 0;
 	bitmap = bg->bg_bitmap;
@@ -858,11 +979,9 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
 
 	mlog_entry_void();
 
-	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
-		status = -EIO;
-		goto bail;
-	}
+	/* All callers get the descriptor via
+	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
 	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
 
 	mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
@@ -871,10 +990,10 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
 	if (ocfs2_is_cluster_bitmap(alloc_inode))
 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
 
-	status = ocfs2_journal_access(handle,
-				      alloc_inode,
-				      group_bh,
-				      journal_type);
+	status = ocfs2_journal_access_gd(handle,
+					 alloc_inode,
+					 group_bh,
+					 journal_type);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -931,21 +1050,10 @@ static int ocfs2_relink_block_group(handle_t *handle,
 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
 
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
-		status = -EIO;
-		goto out;
-	}
-	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
-		status = -EIO;
-		goto out;
-	}
-	if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
-		status = -EIO;
-		goto out;
-	}
+	/* The caller got these descriptors from
+	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
 
 	mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
 	     (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
@@ -956,8 +1064,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
 	bg_ptr = le64_to_cpu(bg->bg_next_group);
 	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
 
-	status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_gd(handle, alloc_inode, prev_bg_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_rollback;
@@ -971,8 +1079,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
 		goto out_rollback;
 	}
 
-	status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_gd(handle, alloc_inode, bg_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_rollback;
@@ -986,8 +1094,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
 		goto out_rollback;
 	}
 
-	status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, alloc_inode, fe_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_rollback;
@@ -1008,7 +1116,7 @@ out_rollback:
 		bg->bg_next_group = cpu_to_le64(bg_ptr);
 		prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
 	}
-out:
+
 	mlog_exit(status);
 	return status;
 }
@@ -1138,8 +1246,8 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
 	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
 
-	ret = ocfs2_journal_access(handle, inode, di_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, inode, di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -1170,21 +1278,17 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
 	u16 found;
 	struct buffer_head *group_bh = NULL;
 	struct ocfs2_group_desc *gd;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
 	struct inode *alloc_inode = ac->ac_inode;
 
-	ret = ocfs2_read_block(alloc_inode, gd_blkno, &group_bh);
+	ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno,
+					  &group_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
 	}
 
 	gd = (struct ocfs2_group_desc *) group_bh->b_data;
-	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, gd);
-		ret = -EIO;
-		goto out;
-	}
-
 	ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
 				  ac->ac_max_block, bit_off, &found);
 	if (ret < 0) {
@@ -1241,19 +1345,14 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	     bits_wanted, chain,
 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
 
-	status = ocfs2_read_block(alloc_inode,
-				  le64_to_cpu(cl->cl_recs[chain].c_blkno),
-				  &group_bh);
+	status = ocfs2_read_group_descriptor(alloc_inode, fe,
+					     le64_to_cpu(cl->cl_recs[chain].c_blkno),
+					     &group_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 	bg = (struct ocfs2_group_desc *) group_bh->b_data;
-	status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
-	if (status) {
-		mlog_errno(status);
-		goto bail;
-	}
 
 	status = -ENOSPC;
 	/* for now, the chain search is a bit simplistic. We just use
@@ -1271,18 +1370,13 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 		next_group = le64_to_cpu(bg->bg_next_group);
 		prev_group_bh = group_bh;
 		group_bh = NULL;
-		status = ocfs2_read_block(alloc_inode,
-					  next_group, &group_bh);
+		status = ocfs2_read_group_descriptor(alloc_inode, fe,
+						     next_group, &group_bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
 		}
 		bg = (struct ocfs2_group_desc *) group_bh->b_data;
-		status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
-		if (status) {
-			mlog_errno(status);
-			goto bail;
-		}
 	}
 	if (status < 0) {
 		if (status != -ENOSPC)
@@ -1324,10 +1418,10 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 
 	/* Ok, claim our bits now: set the info on dinode, chainlist
 	 * and then the group */
-	status = ocfs2_journal_access(handle,
-				      alloc_inode,
-				      ac->ac_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle,
+					 alloc_inode,
+					 ac->ac_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1392,11 +1486,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
 	BUG_ON(!ac->ac_bh);
 
 	fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe);
-		status = -EIO;
-		goto bail;
-	}
+
+	/* The bh was validated by the inode read during
+	 * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+
 	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
 	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
 		ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
@@ -1725,19 +1819,17 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
 
 	mlog_entry_void();
 
-	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
-		status = -EIO;
-		goto bail;
-	}
+	/* The caller got this descriptor from
+	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
 
 	mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
 
 	if (ocfs2_is_cluster_bitmap(alloc_inode))
 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
 
-	status = ocfs2_journal_access(handle, alloc_inode, group_bh,
-				      journal_type);
+	status = ocfs2_journal_access_gd(handle, alloc_inode, group_bh,
+					 journal_type);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1782,29 +1874,26 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
 
 	mlog_entry_void();
 
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
-		status = -EIO;
-		goto bail;
-	}
+	/* The alloc_bh comes from ocfs2_free_dinode() or
+	 * ocfs2_free_clusters().  The callers have all locked the
+	 * allocator and gotten alloc_bh from the lock call.  This
+	 * validates the dinode buffer.  Any corruption that has happened
+	 * is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
 	BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
 
 	mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
 	     (unsigned long long)bg_blkno, start_bit);
 
-	status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh);
+	status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
+					     &group_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
-
 	group = (struct ocfs2_group_desc *) group_bh->b_data;
-	status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, group);
-	if (status) {
-		mlog_errno(status);
-		goto bail;
-	}
+
 	BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
 
 	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
@@ -1815,8 +1904,8 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
 		goto bail;
 	}
 
-	status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, alloc_inode, alloc_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
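/*
 * Illustrative sketch, not part of the patch: the
 * ocfs2_journal_access_di()/_gd() conversions throughout this file give
 * the journal a per-metadata-type hook -- in this series, plausibly so
 * block check data (see the blockcheck.h include above) can be
 * recomputed per type before a buffer hits disk.  A stand-alone model
 * of "typed access" dispatch; toy_* names are hypothetical.
 */
#include <stdio.h>

enum toy_block_type { TOY_DINODE, TOY_GROUP_DESC };

/* One place per type to recompute checksums before the block is logged. */
static void toy_journal_access(enum toy_block_type type, const char *who)
{
	switch (type) {
	case TOY_DINODE:
		printf("%s: will re-checksum as dinode\n", who);
		break;
	case TOY_GROUP_DESC:
		printf("%s: will re-checksum as group descriptor\n", who);
		break;
	}
}

#define toy_journal_access_di(who) toy_journal_access(TOY_DINODE, who)
#define toy_journal_access_gd(who) toy_journal_access(TOY_GROUP_DESC, who)

int main(void)
{
	toy_journal_access_di("ocfs2_alloc_dinode_update_counts");
	toy_journal_access_gd("ocfs2_block_group_set_bits");
	return 0;
}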
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 4df159d8f450..e3c13c77f9e8 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -164,10 +164,24 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac);
  * and return that block offset. */
 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
 
-/* somewhat more expensive than our other checks, so use sparingly. */
+/*
+ * By default, ocfs2_read_group_descriptor() calls ocfs2_error() when it
+ * finds a problem.  A caller that wants to check a group descriptor
+ * without going readonly should read the block with ocfs2_read_block[s]()
+ * and then check it with this function.  In practice, only resize does
+ * this.  Everyone else should be using ocfs2_read_group_descriptor().
+ */
 int ocfs2_check_group_descriptor(struct super_block *sb,
 				 struct ocfs2_dinode *di,
-				 struct ocfs2_group_desc *gd);
+				 struct buffer_head *bh);
+/*
+ * Read a group descriptor block into *bh.  If *bh is NULL, a bh will be
+ * allocated.  This is a cached read.  The descriptor will be validated with
+ * ocfs2_validate_group_descriptor().
+ */
+int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
+				u64 gd_blkno, struct buffer_head **bh);
+
 int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
 			  u32 clusters_to_add, u32 extents_to_split,
 			  struct ocfs2_alloc_context **data_ac,
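/*
 * Illustrative sketch, not part of the patch: typical use of the
 * ocfs2_read_group_descriptor() contract declared above -- pass a NULL
 * buffer_head pointer, get back a validated, cached read, and brelse()
 * it when done.  Kernel-style fragment, not compilable on its own; the
 * example_ function name is hypothetical and error handling is trimmed.
 */
static int example_use_group_descriptor(struct inode *alloc_inode,
					struct ocfs2_dinode *di,
					u64 gd_blkno)
{
	struct buffer_head *bh = NULL;	/* NULL: let the read allocate it */
	struct ocfs2_group_desc *gd;
	int ret;

	ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno, &bh);
	if (ret)
		return ret;	/* descriptor failed validation or I/O */

	gd = (struct ocfs2_group_desc *)bh->b_data;
	/* ... use gd; it has passed self- and parent-consistency checks ... */

	brelse(bh);
	return 0;
}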
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 304b63ac78cf..43ed11345b59 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -41,6 +41,7 @@
41#include <linux/debugfs.h> 41#include <linux/debugfs.h>
42#include <linux/mount.h> 42#include <linux/mount.h>
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/quotaops.h>
44 45
45#define MLOG_MASK_PREFIX ML_SUPER 46#define MLOG_MASK_PREFIX ML_SUPER
46#include <cluster/masklog.h> 47#include <cluster/masklog.h>
@@ -51,6 +52,7 @@
 #include "ocfs1_fs_compat.h"
 
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "export.h"
 #include "extent_map.h"
@@ -65,10 +67,13 @@
 #include "uptodate.h"
 #include "ver.h"
 #include "xattr.h"
+#include "quota.h"
 
 #include "buffer_head_io.h"
 
 static struct kmem_cache *ocfs2_inode_cachep = NULL;
+struct kmem_cache *ocfs2_dquot_cachep;
+struct kmem_cache *ocfs2_qf_chunk_cachep;
 
 /* OCFS2 needs to schedule several different types of work which
  * require cluster locking, disk I/O, recovery waits, etc. Since these
@@ -124,6 +129,9 @@ static int ocfs2_get_sector(struct super_block *sb,
 static void ocfs2_write_super(struct super_block *sb);
 static struct inode *ocfs2_alloc_inode(struct super_block *sb);
 static void ocfs2_destroy_inode(struct inode *inode);
+static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend);
+static int ocfs2_enable_quotas(struct ocfs2_super *osb);
+static void ocfs2_disable_quotas(struct ocfs2_super *osb);
 
 static const struct super_operations ocfs2_sops = {
 	.statfs		= ocfs2_statfs,
@@ -137,6 +145,8 @@ static const struct super_operations ocfs2_sops = {
 	.put_super	= ocfs2_put_super,
 	.remount_fs	= ocfs2_remount,
 	.show_options	= ocfs2_show_options,
+	.quota_read	= ocfs2_quota_read,
+	.quota_write	= ocfs2_quota_write,
 };
 
 enum {
@@ -158,6 +168,10 @@ enum {
 	Opt_user_xattr,
 	Opt_nouser_xattr,
 	Opt_inode64,
+	Opt_acl,
+	Opt_noacl,
+	Opt_usrquota,
+	Opt_grpquota,
 	Opt_err,
 };
 
@@ -180,6 +194,10 @@ static const match_table_t tokens = {
 	{Opt_user_xattr, "user_xattr"},
 	{Opt_nouser_xattr, "nouser_xattr"},
 	{Opt_inode64, "inode64"},
+	{Opt_acl, "acl"},
+	{Opt_noacl, "noacl"},
+	{Opt_usrquota, "usrquota"},
+	{Opt_grpquota, "grpquota"},
 	{Opt_err, NULL}
 };
 
@@ -221,6 +239,19 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait)
 	return 0;
 }
 
+static int ocfs2_need_system_inode(struct ocfs2_super *osb, int ino)
+{
+	if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)
+	    && (ino == USER_QUOTA_SYSTEM_INODE
+		|| ino == LOCAL_USER_QUOTA_SYSTEM_INODE))
+		return 0;
+	if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
+	    && (ino == GROUP_QUOTA_SYSTEM_INODE
+		|| ino == LOCAL_GROUP_QUOTA_SYSTEM_INODE))
+		return 0;
+	return 1;
+}
+
 static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
 {
 	struct inode *new = NULL;
@@ -247,6 +278,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
 
 	for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE;
 	     i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) {
+		if (!ocfs2_need_system_inode(osb, i))
+			continue;
 		new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
 		if (!new) {
 			ocfs2_release_system_inodes(osb);
@@ -277,6 +310,8 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
 	for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
 	     i < NUM_SYSTEM_INODES;
 	     i++) {
+		if (!ocfs2_need_system_inode(osb, i))
+			continue;
 		new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
 		if (!new) {
 			ocfs2_release_system_inodes(osb);
@@ -426,6 +461,12 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 
 	/* We're going to/from readonly mode. */
 	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
+		/* Disable quota accounting before remounting RO */
+		if (*flags & MS_RDONLY) {
+			ret = ocfs2_susp_quotas(osb, 0);
+			if (ret < 0)
+				goto out;
+		}
 		/* Lock here so the check of HARD_RO and the potential
 		 * setting of SOFT_RO is atomic. */
 		spin_lock(&osb->osb_lock);
@@ -461,11 +502,28 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 		}
 unlock_osb:
 		spin_unlock(&osb->osb_lock);
+		/* Enable quota accounting after remounting RW */
+		if (!ret && !(*flags & MS_RDONLY)) {
+			if (sb_any_quota_suspended(sb))
+				ret = ocfs2_susp_quotas(osb, 1);
+			else
+				ret = ocfs2_enable_quotas(osb);
+			if (ret < 0) {
+				/* Return back changes... */
+				spin_lock(&osb->osb_lock);
+				sb->s_flags |= MS_RDONLY;
+				osb->osb_flags |= OCFS2_OSB_SOFT_RO;
+				spin_unlock(&osb->osb_lock);
+				goto out;
+			}
+		}
 	}
 
 	if (!ret) {
 		/* Only save off the new mount options in case of a successful
 		 * remount. */
+		if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
+			parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
 		osb->s_mount_opt = parsed_options.mount_opt;
 		osb->s_atime_quantum = parsed_options.atime_quantum;
 		osb->preferred_slot = parsed_options.slot;
@@ -619,6 +677,131 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb,
619 return 0; 677 return 0;
620} 678}
621 679
680static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
681{
682 int type;
683 struct super_block *sb = osb->sb;
684 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
685 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
686 int status = 0;
687
688 for (type = 0; type < MAXQUOTAS; type++) {
689 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
690 continue;
691 if (unsuspend)
692 status = vfs_quota_enable(
693 sb_dqopt(sb)->files[type],
694 type, QFMT_OCFS2,
695 DQUOT_SUSPENDED);
696 else
697 status = vfs_quota_disable(sb, type,
698 DQUOT_SUSPENDED);
699 if (status < 0)
700 break;
701 }
702 if (status < 0)
703 mlog(ML_ERROR, "Failed to suspend/unsuspend quotas on "
704 "remount (error = %d).\n", status);
705 return status;
706}
707
708static int ocfs2_enable_quotas(struct ocfs2_super *osb)
709{
710 struct inode *inode[MAXQUOTAS] = { NULL, NULL };
711 struct super_block *sb = osb->sb;
712 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
713 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
714 unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
715 LOCAL_GROUP_QUOTA_SYSTEM_INODE };
716 int status;
717 int type;
718
719 sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE;
720 for (type = 0; type < MAXQUOTAS; type++) {
721 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
722 continue;
723 inode[type] = ocfs2_get_system_file_inode(osb, ino[type],
724 osb->slot_num);
725 if (!inode[type]) {
726 status = -ENOENT;
727 goto out_quota_off;
728 }
729 status = vfs_quota_enable(inode[type], type, QFMT_OCFS2,
730 DQUOT_USAGE_ENABLED);
731 if (status < 0)
732 goto out_quota_off;
733 }
734
735 for (type = 0; type < MAXQUOTAS; type++)
736 iput(inode[type]);
737 return 0;
738out_quota_off:
739 ocfs2_disable_quotas(osb);
740 for (type = 0; type < MAXQUOTAS; type++)
741 iput(inode[type]);
742 mlog_errno(status);
743 return status;
744}
745
746static void ocfs2_disable_quotas(struct ocfs2_super *osb)
747{
748 int type;
749 struct inode *inode;
750 struct super_block *sb = osb->sb;
751
752 /* We mostly ignore errors in this function because there's not much
753 * we can do when we see them */
754 for (type = 0; type < MAXQUOTAS; type++) {
755 if (!sb_has_quota_loaded(sb, type))
756 continue;
757 inode = igrab(sb->s_dquot.files[type]);
758 /* Turn off quotas. This will remove all dquot structures from
759 * memory and so they will be automatically synced to global
760 * quota files */
761 vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED |
762 DQUOT_LIMITS_ENABLED);
763 if (!inode)
764 continue;
765 iput(inode);
766 }
767}
768
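The three helpers above split quota state into usage tracking (tied to the mount), limit enforcement (toggled by quotactl), and a suspended state used across remounts. A condensed view of which flags each path flips, written as an illustrative comment rather than code from the patch:

/*
 * Illustrative summary (not from the patch):
 *
 *   mount rw            vfs_quota_enable(inode, type, QFMT_OCFS2,
 *                                        DQUOT_USAGE_ENABLED)
 *   quotactl Q_QUOTAON  vfs_quota_enable(..., DQUOT_LIMITS_ENABLED)
 *   remount ro          vfs_quota_disable(sb, type, DQUOT_SUSPENDED)
 *   remount rw          vfs_quota_enable(..., DQUOT_SUSPENDED)
 *   umount              vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED |
 *                                         DQUOT_LIMITS_ENABLED)
 */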
769/* Handle quota on quotactl */
770static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
771 char *path, int remount)
772{
773 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
774 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
775
776 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
777 return -EINVAL;
778
779 if (remount)
780 return 0; /* Just ignore it; it has been handled in
781 * ocfs2_remount() */
782 return vfs_quota_enable(sb_dqopt(sb)->files[type], type,
783 format_id, DQUOT_LIMITS_ENABLED);
784}
785
786/* Handle quota off quotactl */
787static int ocfs2_quota_off(struct super_block *sb, int type, int remount)
788{
789 if (remount)
790 return 0; /* Ignore now and handle later in
791 * ocfs2_remount() */
792 return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
793}
794
795static struct quotactl_ops ocfs2_quotactl_ops = {
796 .quota_on = ocfs2_quota_on,
797 .quota_off = ocfs2_quota_off,
798 .quota_sync = vfs_quota_sync,
799 .get_info = vfs_get_dqinfo,
800 .set_info = vfs_set_dqinfo,
801 .get_dqblk = vfs_get_dqblk,
802 .set_dqblk = vfs_set_dqblk,
803};
804
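The quotactl_ops table above wires ocfs2 into the quotactl(2) syscall. Because the quota files are system inodes, quota_on and quota_off only toggle limit enforcement (DQUOT_LIMITS_ENABLED); usage tracking stays bound to the mount life cycle shown earlier. A user-space sketch, assuming a hypothetical device path and uid:

#include <stdio.h>
#include <sys/types.h>
#include <sys/quota.h>

int main(void)
{
	struct dqblk dq;

	/* Query usage for uid 1000 on a hypothetical ocfs2 device. */
	if (quotactl(QCMD(Q_GETQUOTA, USRQUOTA), "/dev/sdb1", 1000,
		     (caddr_t)&dq) < 0) {
		perror("quotactl");
		return 1;
	}
	printf("space used: %llu bytes\n",
	       (unsigned long long)dq.dqb_curspace);
	return 0;
}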
622static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) 805static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
623{ 806{
624 struct dentry *root; 807 struct dentry *root;
@@ -651,12 +834,32 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
651 } 834 }
652 brelse(bh); 835 brelse(bh);
653 bh = NULL; 836 bh = NULL;
837
838 if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
839 parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
840
654 osb->s_mount_opt = parsed_options.mount_opt; 841 osb->s_mount_opt = parsed_options.mount_opt;
655 osb->s_atime_quantum = parsed_options.atime_quantum; 842 osb->s_atime_quantum = parsed_options.atime_quantum;
656 osb->preferred_slot = parsed_options.slot; 843 osb->preferred_slot = parsed_options.slot;
657 osb->osb_commit_interval = parsed_options.commit_interval; 844 osb->osb_commit_interval = parsed_options.commit_interval;
658 osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); 845 osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
659 osb->local_alloc_bits = osb->local_alloc_default_bits; 846 osb->local_alloc_bits = osb->local_alloc_default_bits;
847 if (osb->s_mount_opt & OCFS2_MOUNT_USRQUOTA &&
848 !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
849 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
850 status = -EINVAL;
851 mlog(ML_ERROR, "User quotas were requested, but this "
852 "filesystem does not have the feature enabled.\n");
853 goto read_super_error;
854 }
855 if (osb->s_mount_opt & OCFS2_MOUNT_GRPQUOTA &&
856 !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
857 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
858 status = -EINVAL;
859 mlog(ML_ERROR, "Group quotas were requested, but this "
860 "filesystem does not have the feature enabled.\n");
861 goto read_super_error;
862 }
660 863
661 status = ocfs2_verify_userspace_stack(osb, &parsed_options); 864 status = ocfs2_verify_userspace_stack(osb, &parsed_options);
662 if (status) 865 if (status)
@@ -664,6 +867,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
664 867
665 sb->s_magic = OCFS2_SUPER_MAGIC; 868 sb->s_magic = OCFS2_SUPER_MAGIC;
666 869
870 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
871 ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
872
667 /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, 873 /* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
668 * heartbeat=none */ 874 * heartbeat=none */
669 if (bdev_read_only(sb->s_bdev)) { 875 if (bdev_read_only(sb->s_bdev)) {
@@ -758,6 +964,28 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
758 atomic_set(&osb->vol_state, VOLUME_MOUNTED); 964 atomic_set(&osb->vol_state, VOLUME_MOUNTED);
759 wake_up(&osb->osb_mount_event); 965 wake_up(&osb->osb_mount_event);
760 966
967 /* Now we can initialize quotas because we can afford to wait
968 * for cluster lock recovery now. That also means that truncation
969 * log recovery can happen, but it waits for proper quota setup */
970 if (!(sb->s_flags & MS_RDONLY)) {
971 status = ocfs2_enable_quotas(osb);
972 if (status < 0) {
973 /* We have to err-out specially here because
974 * s_root is already set */
975 mlog_errno(status);
976 atomic_set(&osb->vol_state, VOLUME_DISABLED);
977 wake_up(&osb->osb_mount_event);
978 mlog_exit(status);
979 return status;
980 }
981 }
982
983 ocfs2_complete_quota_recovery(osb);
984
985 /* Now we wake up again for processes waiting for quotas */
986 atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS);
987 wake_up(&osb->osb_mount_event);
988
761 mlog_exit(status); 989 mlog_exit(status);
762 return status; 990 return status;
763 991
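Mount readiness is now signalled in two stages: VOLUME_MOUNTED wakes waiters that do not need quotas, and VOLUME_MOUNTED_QUOTAS (or VOLUME_DISABLED on failure) wakes those that do. An illustrative waiter, not part of this patch, would block on the second stage roughly like this:

/* Illustrative sketch only: wait until quotas are usable or the
 * mount has been disabled. */
wait_event(osb->osb_mount_event,
	   atomic_read(&osb->vol_state) == VOLUME_MOUNTED_QUOTAS ||
	   atomic_read(&osb->vol_state) == VOLUME_DISABLED);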
@@ -945,6 +1173,41 @@ static int ocfs2_parse_options(struct super_block *sb,
945 case Opt_inode64: 1173 case Opt_inode64:
946 mopt->mount_opt |= OCFS2_MOUNT_INODE64; 1174 mopt->mount_opt |= OCFS2_MOUNT_INODE64;
947 break; 1175 break;
1176 case Opt_usrquota:
1177 /* We check only on remount; otherwise the features
1178 * aren't initialized yet. */
1179 if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1180 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
1181 mlog(ML_ERROR, "User quota requested but "
1182 "filesystem feature is not set\n");
1183 status = 0;
1184 goto bail;
1185 }
1186 mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA;
1187 break;
1188 case Opt_grpquota:
1189 if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
1190 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
1191 mlog(ML_ERROR, "Group quota requested but "
1192 "filesystem feature is not set\n");
1193 status = 0;
1194 goto bail;
1195 }
1196 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
1197 break;
1198#ifdef CONFIG_OCFS2_FS_POSIX_ACL
1199 case Opt_acl:
1200 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
1201 break;
1202 case Opt_noacl:
1203 mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
1204 break;
1205#else
1206 case Opt_acl:
1207 case Opt_noacl:
1208 printk(KERN_INFO "ocfs2 (no)acl options not supported\n");
1209 break;
1210#endif
948 default: 1211 default:
949 mlog(ML_ERROR, 1212 mlog(ML_ERROR,
950 "Unrecognized mount option \"%s\" " 1213 "Unrecognized mount option \"%s\" "
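On remount, usrquota and grpquota are rejected unless the matching RO-compat feature is set on disk, and acl/noacl are honoured only when CONFIG_OCFS2_FS_POSIX_ACL is built in. A hypothetical initial mount passing the new options (device and mount point are illustrative, requires root):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (mount("/dev/sdb1", "/mnt", "ocfs2", 0,
		  "usrquota,grpquota,acl") < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}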
@@ -1008,6 +1271,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1008 if (osb->osb_cluster_stack[0]) 1271 if (osb->osb_cluster_stack[0])
1009 seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, 1272 seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
1010 osb->osb_cluster_stack); 1273 osb->osb_cluster_stack);
1274 if (opts & OCFS2_MOUNT_USRQUOTA)
1275 seq_printf(s, ",usrquota");
1276 if (opts & OCFS2_MOUNT_GRPQUOTA)
1277 seq_printf(s, ",grpquota");
1011 1278
1012 if (opts & OCFS2_MOUNT_NOUSERXATTR) 1279 if (opts & OCFS2_MOUNT_NOUSERXATTR)
1013 seq_printf(s, ",nouser_xattr"); 1280 seq_printf(s, ",nouser_xattr");
@@ -1017,6 +1284,13 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1017 if (opts & OCFS2_MOUNT_INODE64) 1284 if (opts & OCFS2_MOUNT_INODE64)
1018 seq_printf(s, ",inode64"); 1285 seq_printf(s, ",inode64");
1019 1286
1287#ifdef CONFIG_OCFS2_FS_POSIX_ACL
1288 if (opts & OCFS2_MOUNT_POSIX_ACL)
1289 seq_printf(s, ",acl");
1290 else
1291 seq_printf(s, ",noacl");
1292#endif
1293
1020 return 0; 1294 return 0;
1021} 1295}
1022 1296
@@ -1052,10 +1326,16 @@ static int __init ocfs2_init(void)
1052 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); 1326 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
1053 } 1327 }
1054 1328
1329 status = ocfs2_quota_setup();
1330 if (status)
1331 goto leave;
1332
1055 ocfs2_set_locking_protocol(); 1333 ocfs2_set_locking_protocol();
1056 1334
1335 status = register_quota_format(&ocfs2_quota_format);
1057leave: 1336leave:
1058 if (status < 0) { 1337 if (status < 0) {
1338 ocfs2_quota_shutdown();
1059 ocfs2_free_mem_caches(); 1339 ocfs2_free_mem_caches();
1060 exit_ocfs2_uptodate_cache(); 1340 exit_ocfs2_uptodate_cache();
1061 } 1341 }
@@ -1072,11 +1352,15 @@ static void __exit ocfs2_exit(void)
1072{ 1352{
1073 mlog_entry_void(); 1353 mlog_entry_void();
1074 1354
1355 ocfs2_quota_shutdown();
1356
1075 if (ocfs2_wq) { 1357 if (ocfs2_wq) {
1076 flush_workqueue(ocfs2_wq); 1358 flush_workqueue(ocfs2_wq);
1077 destroy_workqueue(ocfs2_wq); 1359 destroy_workqueue(ocfs2_wq);
1078 } 1360 }
1079 1361
1362 unregister_quota_format(&ocfs2_quota_format);
1363
1080 debugfs_remove(ocfs2_debugfs_root); 1364 debugfs_remove(ocfs2_debugfs_root);
1081 1365
1082 ocfs2_free_mem_caches(); 1366 ocfs2_free_mem_caches();
@@ -1192,8 +1476,27 @@ static int ocfs2_initialize_mem_caches(void)
1192 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 1476 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
1193 SLAB_MEM_SPREAD), 1477 SLAB_MEM_SPREAD),
1194 ocfs2_inode_init_once); 1478 ocfs2_inode_init_once);
1195 if (!ocfs2_inode_cachep) 1479 ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache",
1480 sizeof(struct ocfs2_dquot),
1481 0,
1482 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
1483 SLAB_MEM_SPREAD),
1484 NULL);
1485 ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache",
1486 sizeof(struct ocfs2_quota_chunk),
1487 0,
1488 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
1489 NULL);
1490 if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep ||
1491 !ocfs2_qf_chunk_cachep) {
1492 if (ocfs2_inode_cachep)
1493 kmem_cache_destroy(ocfs2_inode_cachep);
1494 if (ocfs2_dquot_cachep)
1495 kmem_cache_destroy(ocfs2_dquot_cachep);
1496 if (ocfs2_qf_chunk_cachep)
1497 kmem_cache_destroy(ocfs2_qf_chunk_cachep);
1196 return -ENOMEM; 1498 return -ENOMEM;
1499 }
1197 1500
1198 return 0; 1501 return 0;
1199} 1502}
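The two new slab caches back the quota code's per-dquot and per-chunk allocations. An illustrative consumer follows; the function names here are hypothetical, as the real allocation sites live in the quota files:

/* Sketch: typical use of the new caches. GFP_NOFS because quota
 * allocations can happen with filesystem locks held. */
static struct ocfs2_dquot *ocfs2_example_alloc_dquot(void)
{
	return kmem_cache_zalloc(ocfs2_dquot_cachep, GFP_NOFS);
}

static void ocfs2_example_free_dquot(struct ocfs2_dquot *od)
{
	kmem_cache_free(ocfs2_dquot_cachep, od);
}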
@@ -1202,8 +1505,15 @@ static void ocfs2_free_mem_caches(void)
1202{ 1505{
1203 if (ocfs2_inode_cachep) 1506 if (ocfs2_inode_cachep)
1204 kmem_cache_destroy(ocfs2_inode_cachep); 1507 kmem_cache_destroy(ocfs2_inode_cachep);
1205
1206 ocfs2_inode_cachep = NULL; 1508 ocfs2_inode_cachep = NULL;
1509
1510 if (ocfs2_dquot_cachep)
1511 kmem_cache_destroy(ocfs2_dquot_cachep);
1512 ocfs2_dquot_cachep = NULL;
1513
1514 if (ocfs2_qf_chunk_cachep)
1515 kmem_cache_destroy(ocfs2_qf_chunk_cachep);
1516 ocfs2_qf_chunk_cachep = NULL;
1207} 1517}
1208 1518
1209static int ocfs2_get_sector(struct super_block *sb, 1519static int ocfs2_get_sector(struct super_block *sb,
@@ -1303,6 +1613,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1303 osb = OCFS2_SB(sb); 1613 osb = OCFS2_SB(sb);
1304 BUG_ON(!osb); 1614 BUG_ON(!osb);
1305 1615
1616 ocfs2_disable_quotas(osb);
1617
1306 ocfs2_shutdown_local_alloc(osb); 1618 ocfs2_shutdown_local_alloc(osb);
1307 1619
1308 ocfs2_truncate_log_shutdown(osb); 1620 ocfs2_truncate_log_shutdown(osb);
@@ -1413,6 +1725,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
1413 sb->s_fs_info = osb; 1725 sb->s_fs_info = osb;
1414 sb->s_op = &ocfs2_sops; 1726 sb->s_op = &ocfs2_sops;
1415 sb->s_export_op = &ocfs2_export_ops; 1727 sb->s_export_op = &ocfs2_export_ops;
1728 sb->s_qcop = &ocfs2_quotactl_ops;
1729 sb->dq_op = &ocfs2_quota_operations;
1416 sb->s_xattr = ocfs2_xattr_handlers; 1730 sb->s_xattr = ocfs2_xattr_handlers;
1417 sb->s_time_gran = 1; 1731 sb->s_time_gran = 1;
1418 sb->s_flags |= MS_NOATIME; 1732 sb->s_flags |= MS_NOATIME;
@@ -1676,6 +1990,15 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
1676 1990
1677 if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, 1991 if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
1678 strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { 1992 strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
1993 /* We have to do a raw check of the feature here */
1994 if (le32_to_cpu(di->id2.i_super.s_feature_incompat) &
1995 OCFS2_FEATURE_INCOMPAT_META_ECC) {
1996 status = ocfs2_block_check_validate(bh->b_data,
1997 bh->b_size,
1998 &di->i_check);
1999 if (status)
2000 goto out;
2001 }
1679 status = -EINVAL; 2002 status = -EINVAL;
1680 if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) { 2003 if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
1681 mlog(ML_ERROR, "found superblock with incorrect block " 2004 mlog(ML_ERROR, "found superblock with incorrect block "
@@ -1717,6 +2040,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
1717 } 2040 }
1718 } 2041 }
1719 2042
2043out:
1720 mlog_exit(status); 2044 mlog_exit(status);
1721 return status; 2045 return status;
1722} 2046}
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index cbd03dfdc7b9..ed0a0cfd68d2 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -84,7 +84,7 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode,
84 84
85 mlog_entry_void(); 85 mlog_entry_void();
86 86
87 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh); 87 status = ocfs2_read_inode_block(inode, bh);
88 if (status < 0) { 88 if (status < 0) {
89 mlog_errno(status); 89 mlog_errno(status);
90 link = ERR_PTR(status); 90 link = ERR_PTR(status);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 74d7367ade13..e1d638af6ac3 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -35,12 +35,14 @@
35#include <linux/init.h> 35#include <linux/init.h>
36#include <linux/module.h> 36#include <linux/module.h>
37#include <linux/string.h> 37#include <linux/string.h>
38#include <linux/security.h>
38 39
39#define MLOG_MASK_PREFIX ML_XATTR 40#define MLOG_MASK_PREFIX ML_XATTR
40#include <cluster/masklog.h> 41#include <cluster/masklog.h>
41 42
42#include "ocfs2.h" 43#include "ocfs2.h"
43#include "alloc.h" 44#include "alloc.h"
45#include "blockcheck.h"
44#include "dlmglue.h" 46#include "dlmglue.h"
45#include "file.h" 47#include "file.h"
46#include "symlink.h" 48#include "symlink.h"
@@ -61,12 +63,32 @@ struct ocfs2_xattr_def_value_root {
61}; 63};
62 64
63struct ocfs2_xattr_bucket { 65struct ocfs2_xattr_bucket {
64 struct buffer_head *bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET]; 66 /* The inode these xattrs are associated with */
65 struct ocfs2_xattr_header *xh; 67 struct inode *bu_inode;
68
69 /* The actual buffers that make up the bucket */
70 struct buffer_head *bu_bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
71
72 /* How many blocks make up one bucket for this filesystem */
73 int bu_blocks;
74};
75
76struct ocfs2_xattr_set_ctxt {
77 handle_t *handle;
78 struct ocfs2_alloc_context *meta_ac;
79 struct ocfs2_alloc_context *data_ac;
80 struct ocfs2_cached_dealloc_ctxt dealloc;
66}; 81};
67 82
68#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root)) 83#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root))
69#define OCFS2_XATTR_INLINE_SIZE 80 84#define OCFS2_XATTR_INLINE_SIZE 80
85#define OCFS2_XATTR_FREE_IN_IBODY (OCFS2_MIN_XATTR_INLINE_SIZE \
86 - sizeof(struct ocfs2_xattr_header) \
87 - sizeof(__u32))
88#define OCFS2_XATTR_FREE_IN_BLOCK(ptr) ((ptr)->i_sb->s_blocksize \
89 - sizeof(struct ocfs2_xattr_block) \
90 - sizeof(struct ocfs2_xattr_header) \
91 - sizeof(__u32))
70 92
71static struct ocfs2_xattr_def_value_root def_xv = { 93static struct ocfs2_xattr_def_value_root def_xv = {
72 .xv.xr_list.l_count = cpu_to_le16(1), 94 .xv.xr_list.l_count = cpu_to_le16(1),
@@ -74,13 +96,25 @@ static struct ocfs2_xattr_def_value_root def_xv = {
74 96
75struct xattr_handler *ocfs2_xattr_handlers[] = { 97struct xattr_handler *ocfs2_xattr_handlers[] = {
76 &ocfs2_xattr_user_handler, 98 &ocfs2_xattr_user_handler,
99#ifdef CONFIG_OCFS2_FS_POSIX_ACL
100 &ocfs2_xattr_acl_access_handler,
101 &ocfs2_xattr_acl_default_handler,
102#endif
77 &ocfs2_xattr_trusted_handler, 103 &ocfs2_xattr_trusted_handler,
104 &ocfs2_xattr_security_handler,
78 NULL 105 NULL
79}; 106};
80 107
81static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { 108static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
82 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, 109 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
110#ifdef CONFIG_OCFS2_FS_POSIX_ACL
111 [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
112 = &ocfs2_xattr_acl_access_handler,
113 [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT]
114 = &ocfs2_xattr_acl_default_handler,
115#endif
83 [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, 116 [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler,
117 [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler,
84}; 118};
85 119
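With the security handler always registered, and the ACL handlers registered under CONFIG_OCFS2_FS_POSIX_ACL, the generic xattr syscalls now resolve these namespaces on ocfs2. A user-space sketch (the path and attribute name are illustrative):

#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
	char buf[256];
	ssize_t len;

	/* "security.*" names are routed to ocfs2_xattr_security_handler. */
	len = getxattr("/mnt/file", "security.selinux", buf, sizeof(buf));
	if (len < 0)
		perror("getxattr");
	else
		printf("%zd bytes of attribute data\n", len);
	return 0;
}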
86struct ocfs2_xattr_info { 120struct ocfs2_xattr_info {
@@ -98,7 +132,7 @@ struct ocfs2_xattr_search {
98 */ 132 */
99 struct buffer_head *xattr_bh; 133 struct buffer_head *xattr_bh;
100 struct ocfs2_xattr_header *header; 134 struct ocfs2_xattr_header *header;
101 struct ocfs2_xattr_bucket bucket; 135 struct ocfs2_xattr_bucket *bucket;
102 void *base; 136 void *base;
103 void *end; 137 void *end;
104 struct ocfs2_xattr_entry *here; 138 struct ocfs2_xattr_entry *here;
@@ -127,14 +161,20 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
127 size_t buffer_size); 161 size_t buffer_size);
128 162
129static int ocfs2_xattr_create_index_block(struct inode *inode, 163static int ocfs2_xattr_create_index_block(struct inode *inode,
130 struct ocfs2_xattr_search *xs); 164 struct ocfs2_xattr_search *xs,
165 struct ocfs2_xattr_set_ctxt *ctxt);
131 166
132static int ocfs2_xattr_set_entry_index_block(struct inode *inode, 167static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
133 struct ocfs2_xattr_info *xi, 168 struct ocfs2_xattr_info *xi,
134 struct ocfs2_xattr_search *xs); 169 struct ocfs2_xattr_search *xs,
170 struct ocfs2_xattr_set_ctxt *ctxt);
135 171
136static int ocfs2_delete_xattr_index_block(struct inode *inode, 172static int ocfs2_delete_xattr_index_block(struct inode *inode,
137 struct buffer_head *xb_bh); 173 struct buffer_head *xb_bh);
174static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
175 u64 src_blk, u64 last_blk, u64 to_blk,
176 unsigned int start_bucket,
177 u32 *first_hash);
138 178
139static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) 179static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
140{ 180{
@@ -154,6 +194,216 @@ static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
154 return len / sizeof(struct ocfs2_xattr_entry); 194 return len / sizeof(struct ocfs2_xattr_entry);
155} 195}
156 196
197#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr)
198#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data)
199#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0))
200
201static struct ocfs2_xattr_bucket *ocfs2_xattr_bucket_new(struct inode *inode)
202{
203 struct ocfs2_xattr_bucket *bucket;
204 int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
205
206 BUG_ON(blks > OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET);
207
208 bucket = kzalloc(sizeof(struct ocfs2_xattr_bucket), GFP_NOFS);
209 if (bucket) {
210 bucket->bu_inode = inode;
211 bucket->bu_blocks = blks;
212 }
213
214 return bucket;
215}
216
217static void ocfs2_xattr_bucket_relse(struct ocfs2_xattr_bucket *bucket)
218{
219 int i;
220
221 for (i = 0; i < bucket->bu_blocks; i++) {
222 brelse(bucket->bu_bhs[i]);
223 bucket->bu_bhs[i] = NULL;
224 }
225}
226
227static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket)
228{
229 if (bucket) {
230 ocfs2_xattr_bucket_relse(bucket);
231 bucket->bu_inode = NULL;
232 kfree(bucket);
233 }
234}
235
236/*
237 * A bucket that has never been written to disk doesn't need to be
238 * read. We just need the buffer_heads. Don't call this for
239 * buckets that are already on disk. ocfs2_read_xattr_bucket() initializes
240 * them fully.
241 */
242static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
243 u64 xb_blkno)
244{
245 int i, rc = 0;
246
247 for (i = 0; i < bucket->bu_blocks; i++) {
248 bucket->bu_bhs[i] = sb_getblk(bucket->bu_inode->i_sb,
249 xb_blkno + i);
250 if (!bucket->bu_bhs[i]) {
251 rc = -EIO;
252 mlog_errno(rc);
253 break;
254 }
255
256 if (!ocfs2_buffer_uptodate(bucket->bu_inode,
257 bucket->bu_bhs[i]))
258 ocfs2_set_new_buffer_uptodate(bucket->bu_inode,
259 bucket->bu_bhs[i]);
260 }
261
262 if (rc)
263 ocfs2_xattr_bucket_relse(bucket);
264 return rc;
265}
266
267/* Read the xattr bucket at xb_blkno */
268static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
269 u64 xb_blkno)
270{
271 int rc;
272
273 rc = ocfs2_read_blocks(bucket->bu_inode, xb_blkno,
274 bucket->bu_blocks, bucket->bu_bhs, 0,
275 NULL);
276 if (!rc) {
277 rc = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb,
278 bucket->bu_bhs,
279 bucket->bu_blocks,
280 &bucket_xh(bucket)->xh_check);
281 if (rc)
282 mlog_errno(rc);
283 }
284
285 if (rc)
286 ocfs2_xattr_bucket_relse(bucket);
287 return rc;
288}
289
290static int ocfs2_xattr_bucket_journal_access(handle_t *handle,
291 struct ocfs2_xattr_bucket *bucket,
292 int type)
293{
294 int i, rc = 0;
295
296 for (i = 0; i < bucket->bu_blocks; i++) {
297 rc = ocfs2_journal_access(handle, bucket->bu_inode,
298 bucket->bu_bhs[i], type);
299 if (rc) {
300 mlog_errno(rc);
301 break;
302 }
303 }
304
305 return rc;
306}
307
308static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle,
309 struct ocfs2_xattr_bucket *bucket)
310{
311 int i;
312
313 ocfs2_compute_meta_ecc_bhs(bucket->bu_inode->i_sb,
314 bucket->bu_bhs, bucket->bu_blocks,
315 &bucket_xh(bucket)->xh_check);
316
317 for (i = 0; i < bucket->bu_blocks; i++)
318 ocfs2_journal_dirty(handle, bucket->bu_bhs[i]);
319}
320
321static void ocfs2_xattr_bucket_copy_data(struct ocfs2_xattr_bucket *dest,
322 struct ocfs2_xattr_bucket *src)
323{
324 int i;
325 int blocksize = src->bu_inode->i_sb->s_blocksize;
326
327 BUG_ON(dest->bu_blocks != src->bu_blocks);
328 BUG_ON(dest->bu_inode != src->bu_inode);
329
330 for (i = 0; i < src->bu_blocks; i++) {
331 memcpy(bucket_block(dest, i), bucket_block(src, i),
332 blocksize);
333 }
334}
335
336static int ocfs2_validate_xattr_block(struct super_block *sb,
337 struct buffer_head *bh)
338{
339 int rc;
340 struct ocfs2_xattr_block *xb =
341 (struct ocfs2_xattr_block *)bh->b_data;
342
343 mlog(0, "Validating xattr block %llu\n",
344 (unsigned long long)bh->b_blocknr);
345
346 BUG_ON(!buffer_uptodate(bh));
347
348 /*
349 * If the ecc fails, we return the error but otherwise
350 * leave the filesystem running. We know any error is
351 * local to this block.
352 */
353 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &xb->xb_check);
354 if (rc)
355 return rc;
356
357 /*
358 * Errors after here are fatal
359 */
360
361 if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
362 ocfs2_error(sb,
363 "Extended attribute block #%llu has bad "
364 "signature %.*s",
365 (unsigned long long)bh->b_blocknr, 7,
366 xb->xb_signature);
367 return -EINVAL;
368 }
369
370 if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) {
371 ocfs2_error(sb,
372 "Extended attribute block #%llu has an "
373 "invalid xb_blkno of %llu",
374 (unsigned long long)bh->b_blocknr,
375 (unsigned long long)le64_to_cpu(xb->xb_blkno));
376 return -EINVAL;
377 }
378
379 if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) {
380 ocfs2_error(sb,
381 "Extended attribute block #%llu has an invalid "
382 "xb_fs_generation of #%u",
383 (unsigned long long)bh->b_blocknr,
384 le32_to_cpu(xb->xb_fs_generation));
385 return -EINVAL;
386 }
387
388 return 0;
389}
390
391static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
392 struct buffer_head **bh)
393{
394 int rc;
395 struct buffer_head *tmp = *bh;
396
397 rc = ocfs2_read_block(inode, xb_blkno, &tmp,
398 ocfs2_validate_xattr_block);
399
400 /* If ocfs2_read_block() got us a new bh, pass it up. */
401 if (!rc && !*bh)
402 *bh = tmp;
403
404 return rc;
405}
406
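Together these helpers give buckets a uniform lifecycle: allocate, read (with metaecc validation), take journal access on every block, modify, dirty, release. An illustrative sketch built only from the calls above ('handle' and 'blkno' are assumed to come from the caller):

/* Illustrative sketch only: a read-modify-write cycle built from the
 * bucket helpers above. */
static int ocfs2_example_touch_bucket(struct inode *inode, handle_t *handle,
				      u64 blkno)
{
	int ret;
	struct ocfs2_xattr_bucket *bucket = ocfs2_xattr_bucket_new(inode);

	if (!bucket)
		return -ENOMEM;

	ret = ocfs2_read_xattr_bucket(bucket, blkno);
	if (!ret)
		ret = ocfs2_xattr_bucket_journal_access(handle, bucket,
						OCFS2_JOURNAL_ACCESS_WRITE);
	if (!ret) {
		/* modify bucket_xh(bucket) / bucket_block(bucket, n) here */
		ocfs2_xattr_bucket_journal_dirty(handle, bucket);
	}

	ocfs2_xattr_bucket_free(bucket);
	return ret;
}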
157static inline const char *ocfs2_xattr_prefix(int name_index) 407static inline const char *ocfs2_xattr_prefix(int name_index)
158{ 408{
159 struct xattr_handler *handler = NULL; 409 struct xattr_handler *handler = NULL;
@@ -200,54 +450,163 @@ static void ocfs2_xattr_hash_entry(struct inode *inode,
200 return; 450 return;
201} 451}
202 452
453static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len)
454{
455 int size = 0;
456
457 if (value_len <= OCFS2_XATTR_INLINE_SIZE)
458 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
459 else
460 size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
461 size += sizeof(struct ocfs2_xattr_entry);
462
463 return size;
464}
465
466int ocfs2_calc_security_init(struct inode *dir,
467 struct ocfs2_security_xattr_info *si,
468 int *want_clusters,
469 int *xattr_credits,
470 struct ocfs2_alloc_context **xattr_ac)
471{
472 int ret = 0;
473 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
474 int s_size = ocfs2_xattr_entry_real_size(strlen(si->name),
475 si->value_len);
476
477 /*
478 * The maximum inline space a security xattr can take is
479 * 256(name) + 80(value) + 16(entry) = 352 bytes,
480 * so reserving one metadata block for it is enough.
481 */
482 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
483 s_size > OCFS2_XATTR_FREE_IN_IBODY) {
484 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
485 if (ret) {
486 mlog_errno(ret);
487 return ret;
488 }
489 *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
490 }
491
492 /* Reserve clusters for an xattr value that will be stored in the B-tree */
493 if (si->value_len > OCFS2_XATTR_INLINE_SIZE) {
494 int new_clusters = ocfs2_clusters_for_bytes(dir->i_sb,
495 si->value_len);
496
497 *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
498 new_clusters);
499 *want_clusters += new_clusters;
500 }
501 return ret;
502}
503
504int ocfs2_calc_xattr_init(struct inode *dir,
505 struct buffer_head *dir_bh,
506 int mode,
507 struct ocfs2_security_xattr_info *si,
508 int *want_clusters,
509 int *xattr_credits,
510 struct ocfs2_alloc_context **xattr_ac)
511{
512 int ret = 0;
513 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
514 int s_size = 0, a_size = 0, acl_len = 0, new_clusters;
515
516 if (si->enable)
517 s_size = ocfs2_xattr_entry_real_size(strlen(si->name),
518 si->value_len);
519
520 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
521 acl_len = ocfs2_xattr_get_nolock(dir, dir_bh,
522 OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT,
523 "", NULL, 0);
524 if (acl_len > 0) {
525 a_size = ocfs2_xattr_entry_real_size(0, acl_len);
526 if (S_ISDIR(mode))
527 a_size <<= 1;
528 } else if (acl_len != 0 && acl_len != -ENODATA) {
529 mlog_errno(acl_len);
530 return acl_len;
531 }
532 }
533
534 if (!(s_size + a_size))
535 return ret;
536
537 /*
538 * The maximum inline space a security xattr can take is
539 * 256(name) + 80(value) + 16(entry) = 352 bytes,
540 * and the maximum inline space an acl xattr can take is
541 * 80(value) + 16(entry) * 2(if directory) = 192 bytes.
542 * When blocksize = 512, we may need to reserve one more cluster
543 * for an xattr bucket; otherwise reserving one metadata block
544 * for them is enough.
545 */
546 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
547 (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) {
548 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
549 if (ret) {
550 mlog_errno(ret);
551 return ret;
552 }
553 *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
554 }
555
556 if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE &&
557 (s_size + a_size) > OCFS2_XATTR_FREE_IN_BLOCK(dir)) {
558 *want_clusters += 1;
559 *xattr_credits += ocfs2_blocks_per_xattr_bucket(dir->i_sb);
560 }
561
562 /*
563 * Reserve credits and clusters for xattrs whose values are large
564 * and have to be stored outside the inode
565 */
566 if (si->enable && si->value_len > OCFS2_XATTR_INLINE_SIZE) {
567 new_clusters = ocfs2_clusters_for_bytes(dir->i_sb,
568 si->value_len);
569 *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
570 new_clusters);
571 *want_clusters += new_clusters;
572 }
573 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL &&
574 acl_len > OCFS2_XATTR_INLINE_SIZE) {
575 /* a directory has two types of acls: DEFAULT and ACCESS */
576 new_clusters = (S_ISDIR(mode) ? 2 : 1) *
577 ocfs2_clusters_for_bytes(dir->i_sb, acl_len);
578 *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
579 new_clusters);
580 *want_clusters += new_clusters;
581 }
582
583 return ret;
584}
585
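Both ocfs2_calc_* helpers only accumulate reservations; the caller is expected to reserve the clusters and start a transaction with the accumulated credits before entering the xattr set paths. A sketch of the calling convention (caller-side variable names are hypothetical):

int ret, want_clusters = 0, xattr_credits = 0;
struct ocfs2_alloc_context *xattr_ac = NULL;

ret = ocfs2_calc_xattr_init(dir, dir_bh, mode, &si,
			    &want_clusters, &xattr_credits, &xattr_ac);
if (!ret) {
	/* Reserve 'want_clusters' of data and start a transaction that
	 * already includes 'xattr_credits', then call the set paths. */
}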
203static int ocfs2_xattr_extend_allocation(struct inode *inode, 586static int ocfs2_xattr_extend_allocation(struct inode *inode,
204 u32 clusters_to_add, 587 u32 clusters_to_add,
205 struct buffer_head *xattr_bh, 588 struct ocfs2_xattr_value_buf *vb,
206 struct ocfs2_xattr_value_root *xv) 589 struct ocfs2_xattr_set_ctxt *ctxt)
207{ 590{
208 int status = 0; 591 int status = 0;
209 int restart_func = 0; 592 handle_t *handle = ctxt->handle;
210 int credits = 0;
211 handle_t *handle = NULL;
212 struct ocfs2_alloc_context *data_ac = NULL;
213 struct ocfs2_alloc_context *meta_ac = NULL;
214 enum ocfs2_alloc_restarted why; 593 enum ocfs2_alloc_restarted why;
215 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 594 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
216 u32 prev_clusters, logical_start = le32_to_cpu(xv->xr_clusters); 595 u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
217 struct ocfs2_extent_tree et; 596 struct ocfs2_extent_tree et;
218 597
219 mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add); 598 mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
220 599
221 ocfs2_init_xattr_value_extent_tree(&et, inode, xattr_bh, xv); 600 ocfs2_init_xattr_value_extent_tree(&et, inode, vb);
222
223restart_all:
224
225 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
226 &data_ac, &meta_ac);
227 if (status) {
228 mlog_errno(status);
229 goto leave;
230 }
231
232 credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
233 clusters_to_add);
234 handle = ocfs2_start_trans(osb, credits);
235 if (IS_ERR(handle)) {
236 status = PTR_ERR(handle);
237 handle = NULL;
238 mlog_errno(status);
239 goto leave;
240 }
241 601
242restarted_transaction: 602 status = vb->vb_access(handle, inode, vb->vb_bh,
243 status = ocfs2_journal_access(handle, inode, xattr_bh, 603 OCFS2_JOURNAL_ACCESS_WRITE);
244 OCFS2_JOURNAL_ACCESS_WRITE);
245 if (status < 0) { 604 if (status < 0) {
246 mlog_errno(status); 605 mlog_errno(status);
247 goto leave; 606 goto leave;
248 } 607 }
249 608
250 prev_clusters = le32_to_cpu(xv->xr_clusters); 609 prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
251 status = ocfs2_add_clusters_in_btree(osb, 610 status = ocfs2_add_clusters_in_btree(osb,
252 inode, 611 inode,
253 &logical_start, 612 &logical_start,
@@ -255,157 +614,84 @@ restarted_transaction:
255 0, 614 0,
256 &et, 615 &et,
257 handle, 616 handle,
258 data_ac, 617 ctxt->data_ac,
259 meta_ac, 618 ctxt->meta_ac,
260 &why); 619 &why);
261 if ((status < 0) && (status != -EAGAIN)) { 620 if (status < 0) {
262 if (status != -ENOSPC) 621 mlog_errno(status);
263 mlog_errno(status);
264 goto leave; 622 goto leave;
265 } 623 }
266 624
267 status = ocfs2_journal_dirty(handle, xattr_bh); 625 status = ocfs2_journal_dirty(handle, vb->vb_bh);
268 if (status < 0) { 626 if (status < 0) {
269 mlog_errno(status); 627 mlog_errno(status);
270 goto leave; 628 goto leave;
271 } 629 }
272 630
273 clusters_to_add -= le32_to_cpu(xv->xr_clusters) - prev_clusters; 631 clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters;
274 632
275 if (why != RESTART_NONE && clusters_to_add) { 633 /*
276 if (why == RESTART_META) { 634 * We should have already allocated enough space before the transaction,
277 mlog(0, "restarting function.\n"); 635 * so no need to restart.
278 restart_func = 1; 636 */
279 } else { 637 BUG_ON(why != RESTART_NONE || clusters_to_add);
280 BUG_ON(why != RESTART_TRANS);
281
282 mlog(0, "restarting transaction.\n");
283 /* TODO: This can be more intelligent. */
284 credits = ocfs2_calc_extend_credits(osb->sb,
285 et.et_root_el,
286 clusters_to_add);
287 status = ocfs2_extend_trans(handle, credits);
288 if (status < 0) {
289 /* handle still has to be committed at
290 * this point. */
291 status = -ENOMEM;
292 mlog_errno(status);
293 goto leave;
294 }
295 goto restarted_transaction;
296 }
297 }
298 638
299leave: 639leave:
300 if (handle) {
301 ocfs2_commit_trans(osb, handle);
302 handle = NULL;
303 }
304 if (data_ac) {
305 ocfs2_free_alloc_context(data_ac);
306 data_ac = NULL;
307 }
308 if (meta_ac) {
309 ocfs2_free_alloc_context(meta_ac);
310 meta_ac = NULL;
311 }
312 if ((!status) && restart_func) {
313 restart_func = 0;
314 goto restart_all;
315 }
316 640
317 return status; 641 return status;
318} 642}
319 643
320static int __ocfs2_remove_xattr_range(struct inode *inode, 644static int __ocfs2_remove_xattr_range(struct inode *inode,
321 struct buffer_head *root_bh, 645 struct ocfs2_xattr_value_buf *vb,
322 struct ocfs2_xattr_value_root *xv,
323 u32 cpos, u32 phys_cpos, u32 len, 646 u32 cpos, u32 phys_cpos, u32 len,
324 struct ocfs2_cached_dealloc_ctxt *dealloc) 647 struct ocfs2_xattr_set_ctxt *ctxt)
325{ 648{
326 int ret; 649 int ret;
327 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); 650 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
328 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 651 handle_t *handle = ctxt->handle;
329 struct inode *tl_inode = osb->osb_tl_inode;
330 handle_t *handle;
331 struct ocfs2_alloc_context *meta_ac = NULL;
332 struct ocfs2_extent_tree et; 652 struct ocfs2_extent_tree et;
333 653
334 ocfs2_init_xattr_value_extent_tree(&et, inode, root_bh, xv); 654 ocfs2_init_xattr_value_extent_tree(&et, inode, vb);
335 655
336 ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); 656 ret = vb->vb_access(handle, inode, vb->vb_bh,
657 OCFS2_JOURNAL_ACCESS_WRITE);
337 if (ret) { 658 if (ret) {
338 mlog_errno(ret); 659 mlog_errno(ret);
339 return ret;
340 }
341
342 mutex_lock(&tl_inode->i_mutex);
343
344 if (ocfs2_truncate_log_needs_flush(osb)) {
345 ret = __ocfs2_flush_truncate_log(osb);
346 if (ret < 0) {
347 mlog_errno(ret);
348 goto out;
349 }
350 }
351
352 handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
353 if (IS_ERR(handle)) {
354 ret = PTR_ERR(handle);
355 mlog_errno(ret);
356 goto out; 660 goto out;
357 } 661 }
358 662
359 ret = ocfs2_journal_access(handle, inode, root_bh, 663 ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, ctxt->meta_ac,
360 OCFS2_JOURNAL_ACCESS_WRITE); 664 &ctxt->dealloc);
361 if (ret) {
362 mlog_errno(ret);
363 goto out_commit;
364 }
365
366 ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
367 dealloc);
368 if (ret) { 665 if (ret) {
369 mlog_errno(ret); 666 mlog_errno(ret);
370 goto out_commit; 667 goto out;
371 } 668 }
372 669
373 le32_add_cpu(&xv->xr_clusters, -len); 670 le32_add_cpu(&vb->vb_xv->xr_clusters, -len);
374 671
375 ret = ocfs2_journal_dirty(handle, root_bh); 672 ret = ocfs2_journal_dirty(handle, vb->vb_bh);
376 if (ret) { 673 if (ret) {
377 mlog_errno(ret); 674 mlog_errno(ret);
378 goto out_commit; 675 goto out;
379 } 676 }
380 677
381 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); 678 ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, phys_blkno, len);
382 if (ret) 679 if (ret)
383 mlog_errno(ret); 680 mlog_errno(ret);
384 681
385out_commit:
386 ocfs2_commit_trans(osb, handle);
387out: 682out:
388 mutex_unlock(&tl_inode->i_mutex);
389
390 if (meta_ac)
391 ocfs2_free_alloc_context(meta_ac);
392
393 return ret; 683 return ret;
394} 684}
395 685
396static int ocfs2_xattr_shrink_size(struct inode *inode, 686static int ocfs2_xattr_shrink_size(struct inode *inode,
397 u32 old_clusters, 687 u32 old_clusters,
398 u32 new_clusters, 688 u32 new_clusters,
399 struct buffer_head *root_bh, 689 struct ocfs2_xattr_value_buf *vb,
400 struct ocfs2_xattr_value_root *xv) 690 struct ocfs2_xattr_set_ctxt *ctxt)
401{ 691{
402 int ret = 0; 692 int ret = 0;
403 u32 trunc_len, cpos, phys_cpos, alloc_size; 693 u32 trunc_len, cpos, phys_cpos, alloc_size;
404 u64 block; 694 u64 block;
405 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
406 struct ocfs2_cached_dealloc_ctxt dealloc;
407
408 ocfs2_init_dealloc_ctxt(&dealloc);
409 695
410 if (old_clusters <= new_clusters) 696 if (old_clusters <= new_clusters)
411 return 0; 697 return 0;
@@ -414,7 +700,8 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
414 trunc_len = old_clusters - new_clusters; 700 trunc_len = old_clusters - new_clusters;
415 while (trunc_len) { 701 while (trunc_len) {
416 ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos, 702 ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
417 &alloc_size, &xv->xr_list); 703 &alloc_size,
704 &vb->vb_xv->xr_list);
418 if (ret) { 705 if (ret) {
419 mlog_errno(ret); 706 mlog_errno(ret);
420 goto out; 707 goto out;
@@ -423,9 +710,9 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
423 if (alloc_size > trunc_len) 710 if (alloc_size > trunc_len)
424 alloc_size = trunc_len; 711 alloc_size = trunc_len;
425 712
426 ret = __ocfs2_remove_xattr_range(inode, root_bh, xv, cpos, 713 ret = __ocfs2_remove_xattr_range(inode, vb, cpos,
427 phys_cpos, alloc_size, 714 phys_cpos, alloc_size,
428 &dealloc); 715 ctxt);
429 if (ret) { 716 if (ret) {
430 mlog_errno(ret); 717 mlog_errno(ret);
431 goto out; 718 goto out;
@@ -439,20 +726,17 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
439 } 726 }
440 727
441out: 728out:
442 ocfs2_schedule_truncate_log_flush(osb, 1);
443 ocfs2_run_deallocs(osb, &dealloc);
444
445 return ret; 729 return ret;
446} 730}
447 731
448static int ocfs2_xattr_value_truncate(struct inode *inode, 732static int ocfs2_xattr_value_truncate(struct inode *inode,
449 struct buffer_head *root_bh, 733 struct ocfs2_xattr_value_buf *vb,
450 struct ocfs2_xattr_value_root *xv, 734 int len,
451 int len) 735 struct ocfs2_xattr_set_ctxt *ctxt)
452{ 736{
453 int ret; 737 int ret;
454 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len); 738 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len);
455 u32 old_clusters = le32_to_cpu(xv->xr_clusters); 739 u32 old_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
456 740
457 if (new_clusters == old_clusters) 741 if (new_clusters == old_clusters)
458 return 0; 742 return 0;
@@ -460,11 +744,11 @@ static int ocfs2_xattr_value_truncate(struct inode *inode,
460 if (new_clusters > old_clusters) 744 if (new_clusters > old_clusters)
461 ret = ocfs2_xattr_extend_allocation(inode, 745 ret = ocfs2_xattr_extend_allocation(inode,
462 new_clusters - old_clusters, 746 new_clusters - old_clusters,
463 root_bh, xv); 747 vb, ctxt);
464 else 748 else
465 ret = ocfs2_xattr_shrink_size(inode, 749 ret = ocfs2_xattr_shrink_size(inode,
466 old_clusters, new_clusters, 750 old_clusters, new_clusters,
467 root_bh, xv); 751 vb, ctxt);
468 752
469 return ret; 753 return ret;
470} 754}
@@ -554,18 +838,14 @@ static int ocfs2_xattr_block_list(struct inode *inode,
554 if (!di->i_xattr_loc) 838 if (!di->i_xattr_loc)
555 return ret; 839 return ret;
556 840
557 ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh); 841 ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
842 &blk_bh);
558 if (ret < 0) { 843 if (ret < 0) {
559 mlog_errno(ret); 844 mlog_errno(ret);
560 return ret; 845 return ret;
561 } 846 }
562 847
563 xb = (struct ocfs2_xattr_block *)blk_bh->b_data; 848 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
564 if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
565 ret = -EIO;
566 goto cleanup;
567 }
568
569 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { 849 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
570 struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header; 850 struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
571 ret = ocfs2_xattr_list_entries(inode, header, 851 ret = ocfs2_xattr_list_entries(inode, header,
@@ -575,7 +855,7 @@ static int ocfs2_xattr_block_list(struct inode *inode,
575 ret = ocfs2_xattr_tree_list_index_block(inode, xt, 855 ret = ocfs2_xattr_tree_list_index_block(inode, xt,
576 buffer, buffer_size); 856 buffer, buffer_size);
577 } 857 }
578cleanup: 858
579 brelse(blk_bh); 859 brelse(blk_bh);
580 860
581 return ret; 861 return ret;
@@ -685,7 +965,7 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode,
685 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); 965 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
686 /* Copy ocfs2_xattr_value */ 966 /* Copy ocfs2_xattr_value */
687 for (i = 0; i < num_clusters * bpc; i++, blkno++) { 967 for (i = 0; i < num_clusters * bpc; i++, blkno++) {
688 ret = ocfs2_read_block(inode, blkno, &bh); 968 ret = ocfs2_read_block(inode, blkno, &bh, NULL);
689 if (ret) { 969 if (ret) {
690 mlog_errno(ret); 970 mlog_errno(ret);
691 goto out; 971 goto out;
@@ -769,7 +1049,12 @@ static int ocfs2_xattr_block_get(struct inode *inode,
769 size_t size; 1049 size_t size;
770 int ret = -ENODATA, name_offset, name_len, block_off, i; 1050 int ret = -ENODATA, name_offset, name_len, block_off, i;
771 1051
772 memset(&xs->bucket, 0, sizeof(xs->bucket)); 1052 xs->bucket = ocfs2_xattr_bucket_new(inode);
1053 if (!xs->bucket) {
1054 ret = -ENOMEM;
1055 mlog_errno(ret);
1056 goto cleanup;
1057 }
773 1058
774 ret = ocfs2_xattr_block_find(inode, name_index, name, xs); 1059 ret = ocfs2_xattr_block_find(inode, name_index, name, xs);
775 if (ret) { 1060 if (ret) {
@@ -795,11 +1080,11 @@ static int ocfs2_xattr_block_get(struct inode *inode,
795 1080
796 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { 1081 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
797 ret = ocfs2_xattr_bucket_get_name_value(inode, 1082 ret = ocfs2_xattr_bucket_get_name_value(inode,
798 xs->bucket.xh, 1083 bucket_xh(xs->bucket),
799 i, 1084 i,
800 &block_off, 1085 &block_off,
801 &name_offset); 1086 &name_offset);
802 xs->base = xs->bucket.bhs[block_off]->b_data; 1087 xs->base = bucket_block(xs->bucket, block_off);
803 } 1088 }
804 if (ocfs2_xattr_is_local(xs->here)) { 1089 if (ocfs2_xattr_is_local(xs->here)) {
805 memcpy(buffer, (void *)xs->base + 1090 memcpy(buffer, (void *)xs->base +
@@ -817,21 +1102,15 @@ static int ocfs2_xattr_block_get(struct inode *inode,
817 } 1102 }
818 ret = size; 1103 ret = size;
819cleanup: 1104cleanup:
820 for (i = 0; i < OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET; i++) 1105 ocfs2_xattr_bucket_free(xs->bucket);
821 brelse(xs->bucket.bhs[i]);
822 memset(&xs->bucket, 0, sizeof(xs->bucket));
823 1106
824 brelse(xs->xattr_bh); 1107 brelse(xs->xattr_bh);
825 xs->xattr_bh = NULL; 1108 xs->xattr_bh = NULL;
826 return ret; 1109 return ret;
827} 1110}
828 1111
829/* ocfs2_xattr_get() 1112int ocfs2_xattr_get_nolock(struct inode *inode,
830 * 1113 struct buffer_head *di_bh,
831 * Copy an extended attribute into the buffer provided.
832 * Buffer is NULL to compute the size of buffer required.
833 */
834static int ocfs2_xattr_get(struct inode *inode,
835 int name_index, 1114 int name_index,
836 const char *name, 1115 const char *name,
837 void *buffer, 1116 void *buffer,
@@ -839,7 +1118,6 @@ static int ocfs2_xattr_get(struct inode *inode,
839{ 1118{
840 int ret; 1119 int ret;
841 struct ocfs2_dinode *di = NULL; 1120 struct ocfs2_dinode *di = NULL;
842 struct buffer_head *di_bh = NULL;
843 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1121 struct ocfs2_inode_info *oi = OCFS2_I(inode);
844 struct ocfs2_xattr_search xis = { 1122 struct ocfs2_xattr_search xis = {
845 .not_found = -ENODATA, 1123 .not_found = -ENODATA,
@@ -854,11 +1132,6 @@ static int ocfs2_xattr_get(struct inode *inode,
854 if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) 1132 if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
855 ret = -ENODATA; 1133 ret = -ENODATA;
856 1134
857 ret = ocfs2_inode_lock(inode, &di_bh, 0);
858 if (ret < 0) {
859 mlog_errno(ret);
860 return ret;
861 }
862 xis.inode_bh = xbs.inode_bh = di_bh; 1135 xis.inode_bh = xbs.inode_bh = di_bh;
863 di = (struct ocfs2_dinode *)di_bh->b_data; 1136 di = (struct ocfs2_dinode *)di_bh->b_data;
864 1137
@@ -869,6 +1142,32 @@ static int ocfs2_xattr_get(struct inode *inode,
869 ret = ocfs2_xattr_block_get(inode, name_index, name, buffer, 1142 ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
870 buffer_size, &xbs); 1143 buffer_size, &xbs);
871 up_read(&oi->ip_xattr_sem); 1144 up_read(&oi->ip_xattr_sem);
1145
1146 return ret;
1147}
1148
1149/* ocfs2_xattr_get()
1150 *
1151 * Copy an extended attribute into the buffer provided.
1152 * Buffer is NULL to compute the size of buffer required.
1153 */
1154static int ocfs2_xattr_get(struct inode *inode,
1155 int name_index,
1156 const char *name,
1157 void *buffer,
1158 size_t buffer_size)
1159{
1160 int ret;
1161 struct buffer_head *di_bh = NULL;
1162
1163 ret = ocfs2_inode_lock(inode, &di_bh, 0);
1164 if (ret < 0) {
1165 mlog_errno(ret);
1166 return ret;
1167 }
1168 ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
1169 name, buffer, buffer_size);
1170
872 ocfs2_inode_unlock(inode, 0); 1171 ocfs2_inode_unlock(inode, 0);
873 1172
874 brelse(di_bh); 1173 brelse(di_bh);
@@ -877,44 +1176,36 @@ static int ocfs2_xattr_get(struct inode *inode,
877} 1176}
878 1177
879static int __ocfs2_xattr_set_value_outside(struct inode *inode, 1178static int __ocfs2_xattr_set_value_outside(struct inode *inode,
1179 handle_t *handle,
880 struct ocfs2_xattr_value_root *xv, 1180 struct ocfs2_xattr_value_root *xv,
881 const void *value, 1181 const void *value,
882 int value_len) 1182 int value_len)
883{ 1183{
884 int ret = 0, i, cp_len, credits; 1184 int ret = 0, i, cp_len;
885 u16 blocksize = inode->i_sb->s_blocksize; 1185 u16 blocksize = inode->i_sb->s_blocksize;
886 u32 p_cluster, num_clusters; 1186 u32 p_cluster, num_clusters;
887 u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); 1187 u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
888 u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len); 1188 u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len);
889 u64 blkno; 1189 u64 blkno;
890 struct buffer_head *bh = NULL; 1190 struct buffer_head *bh = NULL;
891 handle_t *handle;
892 1191
893 BUG_ON(clusters > le32_to_cpu(xv->xr_clusters)); 1192 BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
894 1193
895 credits = clusters * bpc;
896 handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), credits);
897 if (IS_ERR(handle)) {
898 ret = PTR_ERR(handle);
899 mlog_errno(ret);
900 goto out;
901 }
902
903 while (cpos < clusters) { 1194 while (cpos < clusters) {
904 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, 1195 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
905 &num_clusters, &xv->xr_list); 1196 &num_clusters, &xv->xr_list);
906 if (ret) { 1197 if (ret) {
907 mlog_errno(ret); 1198 mlog_errno(ret);
908 goto out_commit; 1199 goto out;
909 } 1200 }
910 1201
911 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); 1202 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
912 1203
913 for (i = 0; i < num_clusters * bpc; i++, blkno++) { 1204 for (i = 0; i < num_clusters * bpc; i++, blkno++) {
914 ret = ocfs2_read_block(inode, blkno, &bh); 1205 ret = ocfs2_read_block(inode, blkno, &bh, NULL);
915 if (ret) { 1206 if (ret) {
916 mlog_errno(ret); 1207 mlog_errno(ret);
917 goto out_commit; 1208 goto out;
918 } 1209 }
919 1210
920 ret = ocfs2_journal_access(handle, 1211 ret = ocfs2_journal_access(handle,
@@ -923,7 +1214,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
923 OCFS2_JOURNAL_ACCESS_WRITE); 1214 OCFS2_JOURNAL_ACCESS_WRITE);
924 if (ret < 0) { 1215 if (ret < 0) {
925 mlog_errno(ret); 1216 mlog_errno(ret);
926 goto out_commit; 1217 goto out;
927 } 1218 }
928 1219
929 cp_len = value_len > blocksize ? blocksize : value_len; 1220 cp_len = value_len > blocksize ? blocksize : value_len;
@@ -937,7 +1228,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
937 ret = ocfs2_journal_dirty(handle, bh); 1228 ret = ocfs2_journal_dirty(handle, bh);
938 if (ret < 0) { 1229 if (ret < 0) {
939 mlog_errno(ret); 1230 mlog_errno(ret);
940 goto out_commit; 1231 goto out;
941 } 1232 }
942 brelse(bh); 1233 brelse(bh);
943 bh = NULL; 1234 bh = NULL;
@@ -951,8 +1242,6 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
951 } 1242 }
952 cpos += num_clusters; 1243 cpos += num_clusters;
953 } 1244 }
954out_commit:
955 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
956out: 1245out:
957 brelse(bh); 1246 brelse(bh);
958 1247
@@ -960,28 +1249,22 @@ out:
960} 1249}
961 1250
962static int ocfs2_xattr_cleanup(struct inode *inode, 1251static int ocfs2_xattr_cleanup(struct inode *inode,
1252 handle_t *handle,
963 struct ocfs2_xattr_info *xi, 1253 struct ocfs2_xattr_info *xi,
964 struct ocfs2_xattr_search *xs, 1254 struct ocfs2_xattr_search *xs,
1255 struct ocfs2_xattr_value_buf *vb,
965 size_t offs) 1256 size_t offs)
966{ 1257{
967 handle_t *handle = NULL;
968 int ret = 0; 1258 int ret = 0;
969 size_t name_len = strlen(xi->name); 1259 size_t name_len = strlen(xi->name);
970 void *val = xs->base + offs; 1260 void *val = xs->base + offs;
971 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; 1261 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
972 1262
973 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1263 ret = vb->vb_access(handle, inode, vb->vb_bh,
974 OCFS2_XATTR_BLOCK_UPDATE_CREDITS); 1264 OCFS2_JOURNAL_ACCESS_WRITE);
975 if (IS_ERR(handle)) {
976 ret = PTR_ERR(handle);
977 mlog_errno(ret);
978 goto out;
979 }
980 ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
981 OCFS2_JOURNAL_ACCESS_WRITE);
982 if (ret) { 1265 if (ret) {
983 mlog_errno(ret); 1266 mlog_errno(ret);
984 goto out_commit; 1267 goto out;
985 } 1268 }
986 /* Decrease xattr count */ 1269 /* Decrease xattr count */
987 le16_add_cpu(&xs->header->xh_count, -1); 1270 le16_add_cpu(&xs->header->xh_count, -1);
@@ -989,35 +1272,27 @@ static int ocfs2_xattr_cleanup(struct inode *inode,
989 memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry)); 1272 memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry));
990 memset(val, 0, size); 1273 memset(val, 0, size);
991 1274
992 ret = ocfs2_journal_dirty(handle, xs->xattr_bh); 1275 ret = ocfs2_journal_dirty(handle, vb->vb_bh);
993 if (ret < 0) 1276 if (ret < 0)
994 mlog_errno(ret); 1277 mlog_errno(ret);
995out_commit:
996 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
997out: 1278out:
998 return ret; 1279 return ret;
999} 1280}
1000 1281
1001static int ocfs2_xattr_update_entry(struct inode *inode, 1282static int ocfs2_xattr_update_entry(struct inode *inode,
1283 handle_t *handle,
1002 struct ocfs2_xattr_info *xi, 1284 struct ocfs2_xattr_info *xi,
1003 struct ocfs2_xattr_search *xs, 1285 struct ocfs2_xattr_search *xs,
1286 struct ocfs2_xattr_value_buf *vb,
1004 size_t offs) 1287 size_t offs)
1005{ 1288{
1006 handle_t *handle = NULL; 1289 int ret;
1007 int ret = 0;
1008 1290
1009 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1291 ret = vb->vb_access(handle, inode, vb->vb_bh,
1010 OCFS2_XATTR_BLOCK_UPDATE_CREDITS); 1292 OCFS2_JOURNAL_ACCESS_WRITE);
1011 if (IS_ERR(handle)) {
1012 ret = PTR_ERR(handle);
1013 mlog_errno(ret);
1014 goto out;
1015 }
1016 ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
1017 OCFS2_JOURNAL_ACCESS_WRITE);
1018 if (ret) { 1293 if (ret) {
1019 mlog_errno(ret); 1294 mlog_errno(ret);
1020 goto out_commit; 1295 goto out;
1021 } 1296 }
1022 1297
1023 xs->here->xe_name_offset = cpu_to_le16(offs); 1298 xs->here->xe_name_offset = cpu_to_le16(offs);
@@ -1028,11 +1303,9 @@ static int ocfs2_xattr_update_entry(struct inode *inode,
1028 ocfs2_xattr_set_local(xs->here, 0); 1303 ocfs2_xattr_set_local(xs->here, 0);
1029 ocfs2_xattr_hash_entry(inode, xs->header, xs->here); 1304 ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
1030 1305
1031 ret = ocfs2_journal_dirty(handle, xs->xattr_bh); 1306 ret = ocfs2_journal_dirty(handle, vb->vb_bh);
1032 if (ret < 0) 1307 if (ret < 0)
1033 mlog_errno(ret); 1308 mlog_errno(ret);
1034out_commit:
1035 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
1036out: 1309out:
1037 return ret; 1310 return ret;
1038} 1311}
@@ -1045,6 +1318,8 @@ out:
1045static int ocfs2_xattr_set_value_outside(struct inode *inode, 1318static int ocfs2_xattr_set_value_outside(struct inode *inode,
1046 struct ocfs2_xattr_info *xi, 1319 struct ocfs2_xattr_info *xi,
1047 struct ocfs2_xattr_search *xs, 1320 struct ocfs2_xattr_search *xs,
1321 struct ocfs2_xattr_set_ctxt *ctxt,
1322 struct ocfs2_xattr_value_buf *vb,
1048 size_t offs) 1323 size_t offs)
1049{ 1324{
1050 size_t name_len = strlen(xi->name); 1325 size_t name_len = strlen(xi->name);
@@ -1062,20 +1337,20 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
1062 xv->xr_list.l_tree_depth = 0; 1337 xv->xr_list.l_tree_depth = 0;
1063 xv->xr_list.l_count = cpu_to_le16(1); 1338 xv->xr_list.l_count = cpu_to_le16(1);
1064 xv->xr_list.l_next_free_rec = 0; 1339 xv->xr_list.l_next_free_rec = 0;
1340 vb->vb_xv = xv;
1065 1341
1066 ret = ocfs2_xattr_value_truncate(inode, xs->xattr_bh, xv, 1342 ret = ocfs2_xattr_value_truncate(inode, vb, xi->value_len, ctxt);
1067 xi->value_len);
1068 if (ret < 0) { 1343 if (ret < 0) {
1069 mlog_errno(ret); 1344 mlog_errno(ret);
1070 return ret; 1345 return ret;
1071 } 1346 }
1072 ret = __ocfs2_xattr_set_value_outside(inode, xv, xi->value, 1347 ret = ocfs2_xattr_update_entry(inode, ctxt->handle, xi, xs, vb, offs);
1073 xi->value_len);
1074 if (ret < 0) { 1348 if (ret < 0) {
1075 mlog_errno(ret); 1349 mlog_errno(ret);
1076 return ret; 1350 return ret;
1077 } 1351 }
1078 ret = ocfs2_xattr_update_entry(inode, xi, xs, offs); 1352 ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb->vb_xv,
1353 xi->value, xi->value_len);
1079 if (ret < 0) 1354 if (ret < 0)
1080 mlog_errno(ret); 1355 mlog_errno(ret);
1081 1356
@@ -1195,6 +1470,7 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode,
1195static int ocfs2_xattr_set_entry(struct inode *inode, 1470static int ocfs2_xattr_set_entry(struct inode *inode,
1196 struct ocfs2_xattr_info *xi, 1471 struct ocfs2_xattr_info *xi,
1197 struct ocfs2_xattr_search *xs, 1472 struct ocfs2_xattr_search *xs,
1473 struct ocfs2_xattr_set_ctxt *ctxt,
1198 int flag) 1474 int flag)
1199{ 1475{
1200 struct ocfs2_xattr_entry *last; 1476 struct ocfs2_xattr_entry *last;
@@ -1202,7 +1478,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1202 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; 1478 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
1203 size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name); 1479 size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name);
1204 size_t size_l = 0; 1480 size_t size_l = 0;
1205 handle_t *handle = NULL; 1481 handle_t *handle = ctxt->handle;
1206 int free, i, ret; 1482 int free, i, ret;
1207 struct ocfs2_xattr_info xi_l = { 1483 struct ocfs2_xattr_info xi_l = {
1208 .name_index = xi->name_index, 1484 .name_index = xi->name_index,
@@ -1210,6 +1486,16 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1210 .value = xi->value, 1486 .value = xi->value,
1211 .value_len = xi->value_len, 1487 .value_len = xi->value_len,
1212 }; 1488 };
1489 struct ocfs2_xattr_value_buf vb = {
1490 .vb_bh = xs->xattr_bh,
1491 .vb_access = ocfs2_journal_access_di,
1492 };
1493
1494 if (!(flag & OCFS2_INLINE_XATTR_FL)) {
1495 BUG_ON(xs->xattr_bh == xs->inode_bh);
1496 vb.vb_access = ocfs2_journal_access_xb;
1497 } else
1498 BUG_ON(xs->xattr_bh != xs->inode_bh);
1213 1499
1214 /* Compute min_offs, last and free space. */ 1500 /* Compute min_offs, last and free space. */
1215 last = xs->header->xh_entries; 1501 last = xs->header->xh_entries;
@@ -1265,15 +1551,14 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1265 if (ocfs2_xattr_is_local(xs->here) && size == size_l) { 1551 if (ocfs2_xattr_is_local(xs->here) && size == size_l) {
1266 /* Replace existing local xattr with tree root */ 1552 /* Replace existing local xattr with tree root */
1267 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, 1553 ret = ocfs2_xattr_set_value_outside(inode, xi, xs,
1268 offs); 1554 ctxt, &vb, offs);
1269 if (ret < 0) 1555 if (ret < 0)
1270 mlog_errno(ret); 1556 mlog_errno(ret);
1271 goto out; 1557 goto out;
1272 } else if (!ocfs2_xattr_is_local(xs->here)) { 1558 } else if (!ocfs2_xattr_is_local(xs->here)) {
1273 /* For existing xattr which has value outside */ 1559 /* For existing xattr which has value outside */
1274 struct ocfs2_xattr_value_root *xv = NULL; 1560 vb.vb_xv = (struct ocfs2_xattr_value_root *)
1275 xv = (struct ocfs2_xattr_value_root *)(val + 1561 (val + OCFS2_XATTR_SIZE(name_len));
1276 OCFS2_XATTR_SIZE(name_len));
1277 1562
1278 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 1563 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
1279 /* 1564 /*
@@ -1282,27 +1567,30 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1282 * then set new value with set_value_outside(). 1567 * then set new value with set_value_outside().
1283 */ 1568 */
1284 ret = ocfs2_xattr_value_truncate(inode, 1569 ret = ocfs2_xattr_value_truncate(inode,
1285 xs->xattr_bh, 1570 &vb,
1286 xv, 1571 xi->value_len,
1287 xi->value_len); 1572 ctxt);
1288 if (ret < 0) { 1573 if (ret < 0) {
1289 mlog_errno(ret); 1574 mlog_errno(ret);
1290 goto out; 1575 goto out;
1291 } 1576 }
1292 1577
1293 ret = __ocfs2_xattr_set_value_outside(inode, 1578 ret = ocfs2_xattr_update_entry(inode,
1294 xv, 1579 handle,
1295 xi->value, 1580 xi,
1296 xi->value_len); 1581 xs,
1582 &vb,
1583 offs);
1297 if (ret < 0) { 1584 if (ret < 0) {
1298 mlog_errno(ret); 1585 mlog_errno(ret);
1299 goto out; 1586 goto out;
1300 } 1587 }
1301 1588
1302 ret = ocfs2_xattr_update_entry(inode, 1589 ret = __ocfs2_xattr_set_value_outside(inode,
1303 xi, 1590 handle,
1304 xs, 1591 vb.vb_xv,
1305 offs); 1592 xi->value,
1593 xi->value_len);
1306 if (ret < 0) 1594 if (ret < 0)
1307 mlog_errno(ret); 1595 mlog_errno(ret);
1308 goto out; 1596 goto out;
@@ -1312,44 +1600,28 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1312 * just truncate the old value to zero. 1600 * just truncate the old value to zero.
1313 */ 1601 */
1314 ret = ocfs2_xattr_value_truncate(inode, 1602 ret = ocfs2_xattr_value_truncate(inode,
1315 xs->xattr_bh, 1603 &vb,
1316 xv, 1604 0,
1317 0); 1605 ctxt);
1318 if (ret < 0) 1606 if (ret < 0)
1319 mlog_errno(ret); 1607 mlog_errno(ret);
1320 } 1608 }
1321 } 1609 }
1322 } 1610 }
1323 1611
1324 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1612 ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh,
1325 OCFS2_INODE_UPDATE_CREDITS); 1613 OCFS2_JOURNAL_ACCESS_WRITE);
1326 if (IS_ERR(handle)) {
1327 ret = PTR_ERR(handle);
1328 mlog_errno(ret);
1329 goto out;
1330 }
1331
1332 ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
1333 OCFS2_JOURNAL_ACCESS_WRITE);
1334 if (ret) { 1614 if (ret) {
1335 mlog_errno(ret); 1615 mlog_errno(ret);
1336 goto out_commit; 1616 goto out;
1337 } 1617 }
1338 1618
1339 if (!(flag & OCFS2_INLINE_XATTR_FL)) { 1619 if (!(flag & OCFS2_INLINE_XATTR_FL)) {
1340 /* set extended attribute in external block. */ 1620 ret = vb.vb_access(handle, inode, vb.vb_bh,
1341 ret = ocfs2_extend_trans(handle, 1621 OCFS2_JOURNAL_ACCESS_WRITE);
1342 OCFS2_INODE_UPDATE_CREDITS +
1343 OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
1344 if (ret) {
1345 mlog_errno(ret);
1346 goto out_commit;
1347 }
1348 ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
1349 OCFS2_JOURNAL_ACCESS_WRITE);
1350 if (ret) { 1622 if (ret) {
1351 mlog_errno(ret); 1623 mlog_errno(ret);
1352 goto out_commit; 1624 goto out;
1353 } 1625 }
1354 } 1626 }
1355 1627
@@ -1363,7 +1635,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1363 ret = ocfs2_journal_dirty(handle, xs->xattr_bh); 1635 ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
1364 if (ret < 0) { 1636 if (ret < 0) {
1365 mlog_errno(ret); 1637 mlog_errno(ret);
1366 goto out_commit; 1638 goto out;
1367 } 1639 }
1368 } 1640 }
1369 1641
@@ -1391,25 +1663,19 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1391 oi->ip_dyn_features |= flag; 1663 oi->ip_dyn_features |= flag;
1392 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 1664 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
1393 spin_unlock(&oi->ip_lock); 1665 spin_unlock(&oi->ip_lock);
1394 /* Update inode ctime */
1395 inode->i_ctime = CURRENT_TIME;
1396 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
1397 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
1398 1666
1399 ret = ocfs2_journal_dirty(handle, xs->inode_bh); 1667 ret = ocfs2_journal_dirty(handle, xs->inode_bh);
1400 if (ret < 0) 1668 if (ret < 0)
1401 mlog_errno(ret); 1669 mlog_errno(ret);
1402 1670
1403out_commit:
1404 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
1405
1406 if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { 1671 if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
1407 /* 1672 /*
1408 * Set value outside in B tree. 1673 * Set value outside in B tree.
1409 * This is the second step for value size > INLINE_SIZE. 1674 * This is the second step for value size > INLINE_SIZE.
1410 */ 1675 */
1411 size_t offs = le16_to_cpu(xs->here->xe_name_offset); 1676 size_t offs = le16_to_cpu(xs->here->xe_name_offset);
1412 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, offs); 1677 ret = ocfs2_xattr_set_value_outside(inode, xi, xs, ctxt,
1678 &vb, offs);
1413 if (ret < 0) { 1679 if (ret < 0) {
1414 int ret2; 1680 int ret2;
1415 1681
@@ -1418,41 +1684,56 @@ out_commit:
1418 * If setting the value outside failed, we have to clean 1684 * If setting the value outside failed, we have to clean
1419 * up the junk tree root we have already set locally. 1685 * up the junk tree root we have already set locally.
1420 */ 1686 */
1421 ret2 = ocfs2_xattr_cleanup(inode, xi, xs, offs); 1687 ret2 = ocfs2_xattr_cleanup(inode, ctxt->handle,
1688 xi, xs, &vb, offs);
1422 if (ret2 < 0) 1689 if (ret2 < 0)
1423 mlog_errno(ret2); 1690 mlog_errno(ret2);
1424 } 1691 }
1425 } 1692 }
1426out: 1693out:
1427 return ret; 1694 return ret;
1428
1429} 1695}
1430 1696
1431static int ocfs2_remove_value_outside(struct inode *inode, 1697static int ocfs2_remove_value_outside(struct inode *inode,
1432 struct buffer_head *bh, 1698 struct ocfs2_xattr_value_buf *vb,
1433 struct ocfs2_xattr_header *header) 1699 struct ocfs2_xattr_header *header)
1434{ 1700{
1435 int ret = 0, i; 1701 int ret = 0, i;
1702 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1703 struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
1704
1705 ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
1706
1707 ctxt.handle = ocfs2_start_trans(osb,
1708 ocfs2_remove_extent_credits(osb->sb));
1709 if (IS_ERR(ctxt.handle)) {
1710 ret = PTR_ERR(ctxt.handle);
1711 mlog_errno(ret);
1712 goto out;
1713 }
1436 1714
1437 for (i = 0; i < le16_to_cpu(header->xh_count); i++) { 1715 for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
1438 struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; 1716 struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
1439 1717
1440 if (!ocfs2_xattr_is_local(entry)) { 1718 if (!ocfs2_xattr_is_local(entry)) {
1441 struct ocfs2_xattr_value_root *xv;
1442 void *val; 1719 void *val;
1443 1720
1444 val = (void *)header + 1721 val = (void *)header +
1445 le16_to_cpu(entry->xe_name_offset); 1722 le16_to_cpu(entry->xe_name_offset);
1446 xv = (struct ocfs2_xattr_value_root *) 1723 vb->vb_xv = (struct ocfs2_xattr_value_root *)
1447 (val + OCFS2_XATTR_SIZE(entry->xe_name_len)); 1724 (val + OCFS2_XATTR_SIZE(entry->xe_name_len));
1448 ret = ocfs2_xattr_value_truncate(inode, bh, xv, 0); 1725 ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
1449 if (ret < 0) { 1726 if (ret < 0) {
1450 mlog_errno(ret); 1727 mlog_errno(ret);
1451 return ret; 1728 break;
1452 } 1729 }
1453 } 1730 }
1454 } 1731 }
1455 1732
1733 ocfs2_commit_trans(osb, ctxt.handle);
1734 ocfs2_schedule_truncate_log_flush(osb, 1);
1735 ocfs2_run_deallocs(osb, &ctxt.dealloc);
1736out:
1456 return ret; 1737 return ret;
1457} 1738}
1458 1739
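
ocfs2_remove_value_outside() above is the smallest complete user of the new set context, and its shape recurs through the rest of the patch: reserve, run one transaction, commit, then perform the deferred deallocations outside the handle. Isolated as a sketch (error paths trimmed; do_xattr_work is a hypothetical stand-in for the per-entry truncates):

	struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };

	ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
	ctxt.handle = ocfs2_start_trans(osb, credits);	/* one handle for the whole job */
	if (!IS_ERR(ctxt.handle)) {
		ret = do_xattr_work(inode, &ctxt);	/* hypothetical worker */
		ocfs2_commit_trans(osb, ctxt.handle);
		ocfs2_schedule_truncate_log_flush(osb, 1);
		ocfs2_run_deallocs(osb, &ctxt.dealloc);	/* deallocs happen outside the handle */
	}
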
@@ -1463,12 +1744,16 @@ static int ocfs2_xattr_ibody_remove(struct inode *inode,
1463 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1744 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
1464 struct ocfs2_xattr_header *header; 1745 struct ocfs2_xattr_header *header;
1465 int ret; 1746 int ret;
1747 struct ocfs2_xattr_value_buf vb = {
1748 .vb_bh = di_bh,
1749 .vb_access = ocfs2_journal_access_di,
1750 };
1466 1751
1467 header = (struct ocfs2_xattr_header *) 1752 header = (struct ocfs2_xattr_header *)
1468 ((void *)di + inode->i_sb->s_blocksize - 1753 ((void *)di + inode->i_sb->s_blocksize -
1469 le16_to_cpu(di->i_xattr_inline_size)); 1754 le16_to_cpu(di->i_xattr_inline_size));
1470 1755
1471 ret = ocfs2_remove_value_outside(inode, di_bh, header); 1756 ret = ocfs2_remove_value_outside(inode, &vb, header);
1472 1757
1473 return ret; 1758 return ret;
1474} 1759}
@@ -1478,11 +1763,15 @@ static int ocfs2_xattr_block_remove(struct inode *inode,
1478{ 1763{
1479 struct ocfs2_xattr_block *xb; 1764 struct ocfs2_xattr_block *xb;
1480 int ret = 0; 1765 int ret = 0;
1766 struct ocfs2_xattr_value_buf vb = {
1767 .vb_bh = blk_bh,
1768 .vb_access = ocfs2_journal_access_xb,
1769 };
1481 1770
1482 xb = (struct ocfs2_xattr_block *)blk_bh->b_data; 1771 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1483 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { 1772 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
1484 struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header); 1773 struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
1485 ret = ocfs2_remove_value_outside(inode, blk_bh, header); 1774 ret = ocfs2_remove_value_outside(inode, &vb, header);
1486 } else 1775 } else
1487 ret = ocfs2_delete_xattr_index_block(inode, blk_bh); 1776 ret = ocfs2_delete_xattr_index_block(inode, blk_bh);
1488 1777
@@ -1502,24 +1791,19 @@ static int ocfs2_xattr_free_block(struct inode *inode,
1502 u64 blk, bg_blkno; 1791 u64 blk, bg_blkno;
1503 u16 bit; 1792 u16 bit;
1504 1793
1505 ret = ocfs2_read_block(inode, block, &blk_bh); 1794 ret = ocfs2_read_xattr_block(inode, block, &blk_bh);
1506 if (ret < 0) { 1795 if (ret < 0) {
1507 mlog_errno(ret); 1796 mlog_errno(ret);
1508 goto out; 1797 goto out;
1509 } 1798 }
1510 1799
1511 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1512 if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
1513 ret = -EIO;
1514 goto out;
1515 }
1516
1517 ret = ocfs2_xattr_block_remove(inode, blk_bh); 1800 ret = ocfs2_xattr_block_remove(inode, blk_bh);
1518 if (ret < 0) { 1801 if (ret < 0) {
1519 mlog_errno(ret); 1802 mlog_errno(ret);
1520 goto out; 1803 goto out;
1521 } 1804 }
1522 1805
1806 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1523 blk = le64_to_cpu(xb->xb_blkno); 1807 blk = le64_to_cpu(xb->xb_blkno);
1524 bit = le16_to_cpu(xb->xb_suballoc_bit); 1808 bit = le16_to_cpu(xb->xb_suballoc_bit);
1525 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 1809 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
@@ -1606,8 +1890,8 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
1606 mlog_errno(ret); 1890 mlog_errno(ret);
1607 goto out; 1891 goto out;
1608 } 1892 }
1609 ret = ocfs2_journal_access(handle, inode, di_bh, 1893 ret = ocfs2_journal_access_di(handle, inode, di_bh,
1610 OCFS2_JOURNAL_ACCESS_WRITE); 1894 OCFS2_JOURNAL_ACCESS_WRITE);
1611 if (ret) { 1895 if (ret) {
1612 mlog_errno(ret); 1896 mlog_errno(ret);
1613 goto out_commit; 1897 goto out_commit;
@@ -1714,7 +1998,8 @@ static int ocfs2_xattr_ibody_find(struct inode *inode,
1714 */ 1998 */
1715static int ocfs2_xattr_ibody_set(struct inode *inode, 1999static int ocfs2_xattr_ibody_set(struct inode *inode,
1716 struct ocfs2_xattr_info *xi, 2000 struct ocfs2_xattr_info *xi,
1717 struct ocfs2_xattr_search *xs) 2001 struct ocfs2_xattr_search *xs,
2002 struct ocfs2_xattr_set_ctxt *ctxt)
1718{ 2003{
1719 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2004 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1720 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; 2005 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
@@ -1731,7 +2016,7 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
1731 } 2016 }
1732 } 2017 }
1733 2018
1734 ret = ocfs2_xattr_set_entry(inode, xi, xs, 2019 ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt,
1735 (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL)); 2020 (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL));
1736out: 2021out:
1737 up_write(&oi->ip_alloc_sem); 2022 up_write(&oi->ip_alloc_sem);
@@ -1758,19 +2043,15 @@ static int ocfs2_xattr_block_find(struct inode *inode,
1758 if (!di->i_xattr_loc) 2043 if (!di->i_xattr_loc)
1759 return ret; 2044 return ret;
1760 2045
1761 ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh); 2046 ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
2047 &blk_bh);
1762 if (ret < 0) { 2048 if (ret < 0) {
1763 mlog_errno(ret); 2049 mlog_errno(ret);
1764 return ret; 2050 return ret;
1765 } 2051 }
1766 2052
1767 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1768 if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
1769 ret = -EIO;
1770 goto cleanup;
1771 }
1772
1773 xs->xattr_bh = blk_bh; 2053 xs->xattr_bh = blk_bh;
2054 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1774 2055
1775 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { 2056 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
1776 xs->header = &xb->xb_attrs.xb_header; 2057 xs->header = &xb->xb_attrs.xb_header;
@@ -1804,13 +2085,13 @@ cleanup:
1804 */ 2085 */
1805static int ocfs2_xattr_block_set(struct inode *inode, 2086static int ocfs2_xattr_block_set(struct inode *inode,
1806 struct ocfs2_xattr_info *xi, 2087 struct ocfs2_xattr_info *xi,
1807 struct ocfs2_xattr_search *xs) 2088 struct ocfs2_xattr_search *xs,
2089 struct ocfs2_xattr_set_ctxt *ctxt)
1808{ 2090{
1809 struct buffer_head *new_bh = NULL; 2091 struct buffer_head *new_bh = NULL;
1810 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2092 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1811 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; 2093 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
1812 struct ocfs2_alloc_context *meta_ac = NULL; 2094 handle_t *handle = ctxt->handle;
1813 handle_t *handle = NULL;
1814 struct ocfs2_xattr_block *xblk = NULL; 2095 struct ocfs2_xattr_block *xblk = NULL;
1815 u16 suballoc_bit_start; 2096 u16 suballoc_bit_start;
1816 u32 num_got; 2097 u32 num_got;
@@ -1818,45 +2099,29 @@ static int ocfs2_xattr_block_set(struct inode *inode,
1818 int ret; 2099 int ret;
1819 2100
1820 if (!xs->xattr_bh) { 2101 if (!xs->xattr_bh) {
1821 /* 2102 ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh,
1822 * Alloc one external block for extended attribute 2103 OCFS2_JOURNAL_ACCESS_CREATE);
1823 * outside of inode.
1824 */
1825 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
1826 if (ret < 0) { 2104 if (ret < 0) {
1827 mlog_errno(ret); 2105 mlog_errno(ret);
1828 goto out; 2106 goto end;
1829 }
1830 handle = ocfs2_start_trans(osb,
1831 OCFS2_XATTR_BLOCK_CREATE_CREDITS);
1832 if (IS_ERR(handle)) {
1833 ret = PTR_ERR(handle);
1834 mlog_errno(ret);
1835 goto out;
1836 }
1837 ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
1838 OCFS2_JOURNAL_ACCESS_CREATE);
1839 if (ret < 0) {
1840 mlog_errno(ret);
1841 goto out_commit;
1842 } 2107 }
1843 2108
1844 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, 2109 ret = ocfs2_claim_metadata(osb, handle, ctxt->meta_ac, 1,
1845 &suballoc_bit_start, &num_got, 2110 &suballoc_bit_start, &num_got,
1846 &first_blkno); 2111 &first_blkno);
1847 if (ret < 0) { 2112 if (ret < 0) {
1848 mlog_errno(ret); 2113 mlog_errno(ret);
1849 goto out_commit; 2114 goto end;
1850 } 2115 }
1851 2116
1852 new_bh = sb_getblk(inode->i_sb, first_blkno); 2117 new_bh = sb_getblk(inode->i_sb, first_blkno);
1853 ocfs2_set_new_buffer_uptodate(inode, new_bh); 2118 ocfs2_set_new_buffer_uptodate(inode, new_bh);
1854 2119
1855 ret = ocfs2_journal_access(handle, inode, new_bh, 2120 ret = ocfs2_journal_access_xb(handle, inode, new_bh,
1856 OCFS2_JOURNAL_ACCESS_CREATE); 2121 OCFS2_JOURNAL_ACCESS_CREATE);
1857 if (ret < 0) { 2122 if (ret < 0) {
1858 mlog_errno(ret); 2123 mlog_errno(ret);
1859 goto out_commit; 2124 goto end;
1860 } 2125 }
1861 2126
1862 /* Initialize ocfs2_xattr_block */ 2127 /* Initialize ocfs2_xattr_block */
@@ -1874,44 +2139,555 @@ static int ocfs2_xattr_block_set(struct inode *inode,
1874 xs->end = (void *)xblk + inode->i_sb->s_blocksize; 2139 xs->end = (void *)xblk + inode->i_sb->s_blocksize;
1875 xs->here = xs->header->xh_entries; 2140 xs->here = xs->header->xh_entries;
1876 2141
1877
1878 ret = ocfs2_journal_dirty(handle, new_bh); 2142 ret = ocfs2_journal_dirty(handle, new_bh);
1879 if (ret < 0) { 2143 if (ret < 0) {
1880 mlog_errno(ret); 2144 mlog_errno(ret);
1881 goto out_commit; 2145 goto end;
1882 } 2146 }
1883 di->i_xattr_loc = cpu_to_le64(first_blkno); 2147 di->i_xattr_loc = cpu_to_le64(first_blkno);
1884 ret = ocfs2_journal_dirty(handle, xs->inode_bh); 2148 ocfs2_journal_dirty(handle, xs->inode_bh);
1885 if (ret < 0)
1886 mlog_errno(ret);
1887out_commit:
1888 ocfs2_commit_trans(osb, handle);
1889out:
1890 if (meta_ac)
1891 ocfs2_free_alloc_context(meta_ac);
1892 if (ret < 0)
1893 return ret;
1894 } else 2149 } else
1895 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; 2150 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
1896 2151
1897 if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) { 2152 if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
1898 /* Set extended attribute into external block */ 2153 /* Set extended attribute into external block */
1899 ret = ocfs2_xattr_set_entry(inode, xi, xs, OCFS2_HAS_XATTR_FL); 2154 ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt,
2155 OCFS2_HAS_XATTR_FL);
1900 if (!ret || ret != -ENOSPC) 2156 if (!ret || ret != -ENOSPC)
1901 goto end; 2157 goto end;
1902 2158
1903 ret = ocfs2_xattr_create_index_block(inode, xs); 2159 ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
1904 if (ret) 2160 if (ret)
1905 goto end; 2161 goto end;
1906 } 2162 }
1907 2163
1908 ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs); 2164 ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt);
1909 2165
1910end: 2166end:
1911 2167
1912 return ret; 2168 return ret;
1913} 2169}
1914 2170
2171/* Check whether the new xattr can be inserted into the inode. */
2172static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
2173 struct ocfs2_xattr_info *xi,
2174 struct ocfs2_xattr_search *xs)
2175{
2176 u64 value_size;
2177 struct ocfs2_xattr_entry *last;
2178 int free, i;
2179 size_t min_offs = xs->end - xs->base;
2180
2181 if (!xs->header)
2182 return 0;
2183
2184 last = xs->header->xh_entries;
2185
2186 for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) {
2187 size_t offs = le16_to_cpu(last->xe_name_offset);
2188 if (offs < min_offs)
2189 min_offs = offs;
2190 last += 1;
2191 }
2192
2193 free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
2194 if (free < 0)
2195 return 0;
2196
2197 BUG_ON(!xs->not_found);
2198
2199 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
2200 value_size = OCFS2_XATTR_ROOT_SIZE;
2201 else
2202 value_size = OCFS2_XATTR_SIZE(xi->value_len);
2203
2204 if (free >= sizeof(struct ocfs2_xattr_entry) +
2205 OCFS2_XATTR_SIZE(strlen(xi->name)) + value_size)
2206 return 1;
2207
2208 return 0;
2209}
2210
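
Restating the fit test above as one expression (assuming, as in ocfs2, that OCFS2_XATTR_SIZE() rounds up to a 4-byte boundary): the new entry fits if the slack between the entry array and the lowest name/value blob covers one entry record, the padded name, and either the padded value (stored locally) or a fixed tree-root record (stored out of line):

	size_t need = sizeof(struct ocfs2_xattr_entry) +
		      OCFS2_XATTR_SIZE(strlen(xi->name)) +
		      (xi->value_len > OCFS2_XATTR_INLINE_SIZE ?
		       OCFS2_XATTR_ROOT_SIZE :			/* value goes out of line */
		       OCFS2_XATTR_SIZE(xi->value_len));	/* value stays local */
	/* fits iff free >= need, with free computed as in the function above */
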
2211static int ocfs2_calc_xattr_set_need(struct inode *inode,
2212 struct ocfs2_dinode *di,
2213 struct ocfs2_xattr_info *xi,
2214 struct ocfs2_xattr_search *xis,
2215 struct ocfs2_xattr_search *xbs,
2216 int *clusters_need,
2217 int *meta_need,
2218 int *credits_need)
2219{
2220 int ret = 0, old_in_xb = 0;
2221 int clusters_add = 0, meta_add = 0, credits = 0;
2222 struct buffer_head *bh = NULL;
2223 struct ocfs2_xattr_block *xb = NULL;
2224 struct ocfs2_xattr_entry *xe = NULL;
2225 struct ocfs2_xattr_value_root *xv = NULL;
2226 char *base = NULL;
2227 int name_offset, name_len = 0;
2228 u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
2229 xi->value_len);
2230 u64 value_size;
2231
2232 /*
2233 * Calculate the clusters we need to write.
2234 * No matter whether we replace an old one or add a new one,
2235 * we need this for writing.
2236 */
2237 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
2238 credits += new_clusters *
2239 ocfs2_clusters_to_blocks(inode->i_sb, 1);
2240
2241 if (xis->not_found && xbs->not_found) {
2242 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2243
2244 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
2245 clusters_add += new_clusters;
2246 credits += ocfs2_calc_extend_credits(inode->i_sb,
2247 &def_xv.xv.xr_list,
2248 new_clusters);
2249 }
2250
2251 goto meta_guess;
2252 }
2253
2254 if (!xis->not_found) {
2255 xe = xis->here;
2256 name_offset = le16_to_cpu(xe->xe_name_offset);
2257 name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
2258 base = xis->base;
2259 credits += OCFS2_INODE_UPDATE_CREDITS;
2260 } else {
2261 int i, block_off = 0;
2262 xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
2263 xe = xbs->here;
2264 name_offset = le16_to_cpu(xe->xe_name_offset);
2265 name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
2266 i = xbs->here - xbs->header->xh_entries;
2267 old_in_xb = 1;
2268
2269 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
2270 ret = ocfs2_xattr_bucket_get_name_value(inode,
2271 bucket_xh(xbs->bucket),
2272 i, &block_off,
2273 &name_offset);
2274 base = bucket_block(xbs->bucket, block_off);
2275 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2276 } else {
2277 base = xbs->base;
2278 credits += OCFS2_XATTR_BLOCK_UPDATE_CREDITS;
2279 }
2280 }
2281
2282 /*
2283 * Deleting an xattr doesn't need metadata or cluster allocation,
2284 * so just calculate the credits and return.
2285 *
2286 * The credits for removing the value tree will be extended
2287 * by ocfs2_remove_extent itself.
2288 */
2289 if (!xi->value) {
2290 if (!ocfs2_xattr_is_local(xe))
2291 credits += ocfs2_remove_extent_credits(inode->i_sb);
2292
2293 goto out;
2294 }
2295
2296 /* do cluster allocation guess first. */
2297 value_size = le64_to_cpu(xe->xe_value_size);
2298
2299 if (old_in_xb) {
2300 /*
2301 * In xattr set, we always try to set the xe in the inode first,
2302 * so if it can be inserted into the inode successfully, the old
2303 * one will be removed from the xattr block, and this xattr
2304 * will be inserted into the inode as a new in-inode xattr.
2305 */
2306 if (ocfs2_xattr_can_be_in_inode(inode, xi, xis)) {
2307 clusters_add += new_clusters;
2308 credits += ocfs2_remove_extent_credits(inode->i_sb) +
2309 OCFS2_INODE_UPDATE_CREDITS;
2310 if (!ocfs2_xattr_is_local(xe))
2311 credits += ocfs2_calc_extend_credits(
2312 inode->i_sb,
2313 &def_xv.xv.xr_list,
2314 new_clusters);
2315 goto out;
2316 }
2317 }
2318
2319 if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
2320 /* the new values will be stored outside. */
2321 u32 old_clusters = 0;
2322
2323 if (!ocfs2_xattr_is_local(xe)) {
2324 old_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
2325 value_size);
2326 xv = (struct ocfs2_xattr_value_root *)
2327 (base + name_offset + name_len);
2328 value_size = OCFS2_XATTR_ROOT_SIZE;
2329 } else
2330 xv = &def_xv.xv;
2331
2332 if (old_clusters >= new_clusters) {
2333 credits += ocfs2_remove_extent_credits(inode->i_sb);
2334 goto out;
2335 } else {
2336 meta_add += ocfs2_extend_meta_needed(&xv->xr_list);
2337 clusters_add += new_clusters - old_clusters;
2338 credits += ocfs2_calc_extend_credits(inode->i_sb,
2339 &xv->xr_list,
2340 new_clusters -
2341 old_clusters);
2342 if (value_size >= OCFS2_XATTR_ROOT_SIZE)
2343 goto out;
2344 }
2345 } else {
2346 /*
2347 * Now the new value will be stored inside. So if the new
2348 * value is smaller than the size of the value root or the old
2349 * value, we don't need any allocation; otherwise we have
2350 * to guess the metadata allocation.
2351 */
2352 if ((ocfs2_xattr_is_local(xe) && value_size >= xi->value_len) ||
2353 (!ocfs2_xattr_is_local(xe) &&
2354 OCFS2_XATTR_ROOT_SIZE >= xi->value_len))
2355 goto out;
2356 }
2357
2358meta_guess:
2359 /* calculate metadata allocation. */
2360 if (di->i_xattr_loc) {
2361 if (!xbs->xattr_bh) {
2362 ret = ocfs2_read_xattr_block(inode,
2363 le64_to_cpu(di->i_xattr_loc),
2364 &bh);
2365 if (ret) {
2366 mlog_errno(ret);
2367 goto out;
2368 }
2369
2370 xb = (struct ocfs2_xattr_block *)bh->b_data;
2371 } else
2372 xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
2373
2374 /*
2375 * If there is already an xattr tree, good, we can calculate
2376 * like other b-trees. Otherwise we may have to create
2377 * a tree; the credit calculation is borrowed from
2378 * ocfs2_calc_extend_credits with root_el = NULL. And the
2379 * new tree will be cluster based, so no meta is needed.
2380 */
2381 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
2382 struct ocfs2_extent_list *el =
2383 &xb->xb_attrs.xb_root.xt_list;
2384 meta_add += ocfs2_extend_meta_needed(el);
2385 credits += ocfs2_calc_extend_credits(inode->i_sb,
2386 el, 1);
2387 } else
2388 credits += OCFS2_SUBALLOC_ALLOC + 1;
2389
2390 /*
2391 * This cluster will be used either for a new bucket or for
2392 * a new xattr block.
2393 * If the cluster size is the same as the bucket size, one
2394 * more is needed since we may need to extend the bucket
2395 * also.
2396 */
2397 clusters_add += 1;
2398 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2399 if (OCFS2_XATTR_BUCKET_SIZE ==
2400 OCFS2_SB(inode->i_sb)->s_clustersize) {
2401 credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2402 clusters_add += 1;
2403 }
2404 } else {
2405 meta_add += 1;
2406 credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
2407 }
2408out:
2409 if (clusters_need)
2410 *clusters_need = clusters_add;
2411 if (meta_need)
2412 *meta_need = meta_add;
2413 if (credits_need)
2414 *credits_need = credits;
2415 brelse(bh);
2416 return ret;
2417}
2418
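
To put numbers on the first branch of ocfs2_calc_xattr_set_need() (a brand-new xattr whose value is stored out of line), a toy walk-through with assumed 4 KB blocks and clusters; the helpers are the ones used above, the figures are illustrative only:

	u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, 10240); /* 10 KB -> 3 clusters */

	credits  = new_clusters * ocfs2_clusters_to_blocks(inode->i_sb, 1); /* value data blocks */
	credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);	/* bucket-sized metadata update */
	clusters_add = new_clusters;				/* reserved later via ctxt->data_ac */
	credits += ocfs2_calc_extend_credits(inode->i_sb, &def_xv.xv.xr_list,
					     new_clusters);	/* growing the value tree */
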
2419static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
2420 struct ocfs2_dinode *di,
2421 struct ocfs2_xattr_info *xi,
2422 struct ocfs2_xattr_search *xis,
2423 struct ocfs2_xattr_search *xbs,
2424 struct ocfs2_xattr_set_ctxt *ctxt,
2425 int *credits)
2426{
2427 int clusters_add, meta_add, ret;
2428 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2429
2430 memset(ctxt, 0, sizeof(struct ocfs2_xattr_set_ctxt));
2431
2432 ocfs2_init_dealloc_ctxt(&ctxt->dealloc);
2433
2434 ret = ocfs2_calc_xattr_set_need(inode, di, xi, xis, xbs,
2435 &clusters_add, &meta_add, credits);
2436 if (ret) {
2437 mlog_errno(ret);
2438 return ret;
2439 }
2440
2441 mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
2442 "credits = %d\n", xi->name, meta_add, clusters_add, *credits);
2443
2444 if (meta_add) {
2445 ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add,
2446 &ctxt->meta_ac);
2447 if (ret) {
2448 mlog_errno(ret);
2449 goto out;
2450 }
2451 }
2452
2453 if (clusters_add) {
2454 ret = ocfs2_reserve_clusters(osb, clusters_add, &ctxt->data_ac);
2455 if (ret)
2456 mlog_errno(ret);
2457 }
2458out:
2459 if (ret) {
2460 if (ctxt->meta_ac) {
2461 ocfs2_free_alloc_context(ctxt->meta_ac);
2462 ctxt->meta_ac = NULL;
2463 }
2464
2465 /*
2466 * We cannot have an error and a non-NULL ctxt->data_ac.
2467 */
2468 }
2469
2470 return ret;
2471}
2472
2473static int __ocfs2_xattr_set_handle(struct inode *inode,
2474 struct ocfs2_dinode *di,
2475 struct ocfs2_xattr_info *xi,
2476 struct ocfs2_xattr_search *xis,
2477 struct ocfs2_xattr_search *xbs,
2478 struct ocfs2_xattr_set_ctxt *ctxt)
2479{
2480 int ret = 0, credits, old_found;
2481
2482 if (!xi->value) {
2483 /* Remove existing extended attribute */
2484 if (!xis->not_found)
2485 ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
2486 else if (!xbs->not_found)
2487 ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
2488 } else {
2489 /* We always try to set the extended attribute in the inode first. */
2490 ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
2491 if (!ret && !xbs->not_found) {
2492 /*
2493 * If that succeeds and the extended attribute exists in an
2494 * external block, then we will remove it.
2495 */
2496 xi->value = NULL;
2497 xi->value_len = 0;
2498
2499 old_found = xis->not_found;
2500 xis->not_found = -ENODATA;
2501 ret = ocfs2_calc_xattr_set_need(inode,
2502 di,
2503 xi,
2504 xis,
2505 xbs,
2506 NULL,
2507 NULL,
2508 &credits);
2509 xis->not_found = old_found;
2510 if (ret) {
2511 mlog_errno(ret);
2512 goto out;
2513 }
2514
2515 ret = ocfs2_extend_trans(ctxt->handle, credits +
2516 ctxt->handle->h_buffer_credits);
2517 if (ret) {
2518 mlog_errno(ret);
2519 goto out;
2520 }
2521 ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
2522 } else if (ret == -ENOSPC) {
2523 if (di->i_xattr_loc && !xbs->xattr_bh) {
2524 ret = ocfs2_xattr_block_find(inode,
2525 xi->name_index,
2526 xi->name, xbs);
2527 if (ret)
2528 goto out;
2529
2530 old_found = xis->not_found;
2531 xis->not_found = -ENODATA;
2532 ret = ocfs2_calc_xattr_set_need(inode,
2533 di,
2534 xi,
2535 xis,
2536 xbs,
2537 NULL,
2538 NULL,
2539 &credits);
2540 xis->not_found = old_found;
2541 if (ret) {
2542 mlog_errno(ret);
2543 goto out;
2544 }
2545
2546 ret = ocfs2_extend_trans(ctxt->handle, credits +
2547 ctxt->handle->h_buffer_credits);
2548 if (ret) {
2549 mlog_errno(ret);
2550 goto out;
2551 }
2552 }
2553 /*
2554 * If there is no space in the inode, we will set the extended
2555 * attribute in an external block.
2556 */
2557 ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
2558 if (ret)
2559 goto out;
2560 if (!xis->not_found) {
2561 /*
2562 * If that succeeds and the extended attribute
2563 * exists in the inode, we will remove it.
2564 */
2565 xi->value = NULL;
2566 xi->value_len = 0;
2567 xbs->not_found = -ENODATA;
2568 ret = ocfs2_calc_xattr_set_need(inode,
2569 di,
2570 xi,
2571 xis,
2572 xbs,
2573 NULL,
2574 NULL,
2575 &credits);
2576 if (ret) {
2577 mlog_errno(ret);
2578 goto out;
2579 }
2580
2581 ret = ocfs2_extend_trans(ctxt->handle, credits +
2582 ctxt->handle->h_buffer_credits);
2583 if (ret) {
2584 mlog_errno(ret);
2585 goto out;
2586 }
2587 ret = ocfs2_xattr_ibody_set(inode, xi,
2588 xis, ctxt);
2589 }
2590 }
2591 }
2592
2593 if (!ret) {
2594 /* Update inode ctime. */
2595 ret = ocfs2_journal_access(ctxt->handle, inode, xis->inode_bh,
2596 OCFS2_JOURNAL_ACCESS_WRITE);
2597 if (ret) {
2598 mlog_errno(ret);
2599 goto out;
2600 }
2601
2602 inode->i_ctime = CURRENT_TIME;
2603 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
2604 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
2605 ocfs2_journal_dirty(ctxt->handle, xis->inode_bh);
2606 }
2607out:
2608 return ret;
2609}
2610
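
One pattern repeats three times in __ocfs2_xattr_set_handle() above: whenever the operation grows a second leg (removing from the block after an in-inode insert, or vice versa), the code re-prices just that leg and enlarges the running handle instead of opening a new transaction. The kernel of it, lifted from the hunks (h_buffer_credits is the JBD count of credits still unused on the handle):

	ret = ocfs2_calc_xattr_set_need(inode, di, xi, xis, xbs,
					NULL, NULL, &credits);	/* price the next leg only */
	if (!ret)
		ret = ocfs2_extend_trans(ctxt->handle,
					 credits + ctxt->handle->h_buffer_credits);
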
2611/*
2612 * This function is only called during inode creation,
2613 * to init the security/acl xattrs of the new inode.
2614 * All transaction credits have been reserved in mknod.
2615 */
2616int ocfs2_xattr_set_handle(handle_t *handle,
2617 struct inode *inode,
2618 struct buffer_head *di_bh,
2619 int name_index,
2620 const char *name,
2621 const void *value,
2622 size_t value_len,
2623 int flags,
2624 struct ocfs2_alloc_context *meta_ac,
2625 struct ocfs2_alloc_context *data_ac)
2626{
2627 struct ocfs2_dinode *di;
2628 int ret;
2629
2630 struct ocfs2_xattr_info xi = {
2631 .name_index = name_index,
2632 .name = name,
2633 .value = value,
2634 .value_len = value_len,
2635 };
2636
2637 struct ocfs2_xattr_search xis = {
2638 .not_found = -ENODATA,
2639 };
2640
2641 struct ocfs2_xattr_search xbs = {
2642 .not_found = -ENODATA,
2643 };
2644
2645 struct ocfs2_xattr_set_ctxt ctxt = {
2646 .handle = handle,
2647 .meta_ac = meta_ac,
2648 .data_ac = data_ac,
2649 };
2650
2651 if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
2652 return -EOPNOTSUPP;
2653
2654 /*
2655 * In an extreme situation, we may need an xattr bucket when
2656 * the block size is too small. We have already reserved
2657 * the credits for the bucket in mknod.
2658 */
2659 if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) {
2660 xbs.bucket = ocfs2_xattr_bucket_new(inode);
2661 if (!xbs.bucket) {
2662 mlog_errno(-ENOMEM);
2663 return -ENOMEM;
2664 }
2665 }
2666
2667 xis.inode_bh = xbs.inode_bh = di_bh;
2668 di = (struct ocfs2_dinode *)di_bh->b_data;
2669
2670 down_write(&OCFS2_I(inode)->ip_xattr_sem);
2671
2672 ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis);
2673 if (ret)
2674 goto cleanup;
2675 if (xis.not_found) {
2676 ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs);
2677 if (ret)
2678 goto cleanup;
2679 }
2680
2681 ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
2682
2683cleanup:
2684 up_write(&OCFS2_I(inode)->ip_xattr_sem);
2685 brelse(xbs.xattr_bh);
2686 ocfs2_xattr_bucket_free(xbs.bucket);
2687
2688 return ret;
2689}
2690
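
A hypothetical call site for ocfs2_xattr_set_handle(), the shape mknod-time security xattr initialization would take; the handle and both allocators come from the caller's earlier reservation, and the name_index constant and names here are illustrative only:

	ret = ocfs2_xattr_set_handle(handle, inode, new_di_bh,
				     OCFS2_XATTR_INDEX_SECURITY,	/* assumed index */
				     "selinux", value, value_len,
				     0 /* flags */, meta_ac, data_ac);
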
1915/* 2691/*
1916 * ocfs2_xattr_set() 2692 * ocfs2_xattr_set()
1917 * 2693 *
@@ -1928,8 +2704,10 @@ int ocfs2_xattr_set(struct inode *inode,
1928{ 2704{
1929 struct buffer_head *di_bh = NULL; 2705 struct buffer_head *di_bh = NULL;
1930 struct ocfs2_dinode *di; 2706 struct ocfs2_dinode *di;
1931 int ret; 2707 int ret, credits;
1932 u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); 2708 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2709 struct inode *tl_inode = osb->osb_tl_inode;
2710 struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
1933 2711
1934 struct ocfs2_xattr_info xi = { 2712 struct ocfs2_xattr_info xi = {
1935 .name_index = name_index, 2713 .name_index = name_index,
@@ -1949,10 +2727,20 @@ int ocfs2_xattr_set(struct inode *inode,
1949 if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) 2727 if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
1950 return -EOPNOTSUPP; 2728 return -EOPNOTSUPP;
1951 2729
2730 /*
2731 * Only xbs will be used on indexed trees. xis doesn't need a
2732 * bucket.
2733 */
2734 xbs.bucket = ocfs2_xattr_bucket_new(inode);
2735 if (!xbs.bucket) {
2736 mlog_errno(-ENOMEM);
2737 return -ENOMEM;
2738 }
2739
1952 ret = ocfs2_inode_lock(inode, &di_bh, 1); 2740 ret = ocfs2_inode_lock(inode, &di_bh, 1);
1953 if (ret < 0) { 2741 if (ret < 0) {
1954 mlog_errno(ret); 2742 mlog_errno(ret);
1955 return ret; 2743 goto cleanup_nolock;
1956 } 2744 }
1957 xis.inode_bh = xbs.inode_bh = di_bh; 2745 xis.inode_bh = xbs.inode_bh = di_bh;
1958 di = (struct ocfs2_dinode *)di_bh->b_data; 2746 di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -1984,55 +2772,53 @@ int ocfs2_xattr_set(struct inode *inode,
1984 goto cleanup; 2772 goto cleanup;
1985 } 2773 }
1986 2774
1987 if (!value) { 2775
1988 /* Remove existing extended attribute */ 2776 mutex_lock(&tl_inode->i_mutex);
1989 if (!xis.not_found) 2777
1990 ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); 2778 if (ocfs2_truncate_log_needs_flush(osb)) {
1991 else if (!xbs.not_found) 2779 ret = __ocfs2_flush_truncate_log(osb);
1992 ret = ocfs2_xattr_block_set(inode, &xi, &xbs); 2780 if (ret < 0) {
1993 } else { 2781 mutex_unlock(&tl_inode->i_mutex);
1994 /* We always try to set extended attribute into inode first*/ 2782 mlog_errno(ret);
1995 ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); 2783 goto cleanup;
1996 if (!ret && !xbs.not_found) {
1997 /*
1998 * If succeed and that extended attribute existing in
1999 * external block, then we will remove it.
2000 */
2001 xi.value = NULL;
2002 xi.value_len = 0;
2003 ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
2004 } else if (ret == -ENOSPC) {
2005 if (di->i_xattr_loc && !xbs.xattr_bh) {
2006 ret = ocfs2_xattr_block_find(inode, name_index,
2007 name, &xbs);
2008 if (ret)
2009 goto cleanup;
2010 }
2011 /*
2012 * If no space in inode, we will set extended attribute
2013 * into external block.
2014 */
2015 ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
2016 if (ret)
2017 goto cleanup;
2018 if (!xis.not_found) {
2019 /*
2020 * If succeed and that extended attribute
2021 * existing in inode, we will remove it.
2022 */
2023 xi.value = NULL;
2024 xi.value_len = 0;
2025 ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
2026 }
2027 } 2784 }
2028 } 2785 }
2786 mutex_unlock(&tl_inode->i_mutex);
2787
2788 ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis,
2789 &xbs, &ctxt, &credits);
2790 if (ret) {
2791 mlog_errno(ret);
2792 goto cleanup;
2793 }
2794
2795 /* We need to update the inode's ctime field, so add a credit for it. */
2796 credits += OCFS2_INODE_UPDATE_CREDITS;
2797 ctxt.handle = ocfs2_start_trans(osb, credits);
2798 if (IS_ERR(ctxt.handle)) {
2799 ret = PTR_ERR(ctxt.handle);
2800 mlog_errno(ret);
2801 goto cleanup;
2802 }
2803
2804 ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
2805
2806 ocfs2_commit_trans(osb, ctxt.handle);
2807
2808 if (ctxt.data_ac)
2809 ocfs2_free_alloc_context(ctxt.data_ac);
2810 if (ctxt.meta_ac)
2811 ocfs2_free_alloc_context(ctxt.meta_ac);
2812 if (ocfs2_dealloc_has_cluster(&ctxt.dealloc))
2813 ocfs2_schedule_truncate_log_flush(osb, 1);
2814 ocfs2_run_deallocs(osb, &ctxt.dealloc);
2029cleanup: 2815cleanup:
2030 up_write(&OCFS2_I(inode)->ip_xattr_sem); 2816 up_write(&OCFS2_I(inode)->ip_xattr_sem);
2031 ocfs2_inode_unlock(inode, 1); 2817 ocfs2_inode_unlock(inode, 1);
2818cleanup_nolock:
2032 brelse(di_bh); 2819 brelse(di_bh);
2033 brelse(xbs.xattr_bh); 2820 brelse(xbs.xattr_bh);
2034 for (i = 0; i < blk_per_bucket; i++) 2821 ocfs2_xattr_bucket_free(xbs.bucket);
2035 brelse(xbs.bucket.bhs[i]);
2036 2822
2037 return ret; 2823 return ret;
2038} 2824}
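
Also new in ocfs2_xattr_set() above is the truncate-log guard taken before the transaction is sized, so a later ocfs2_run_deallocs() cannot find the log full. Isolated, with the locking kept:

	mutex_lock(&tl_inode->i_mutex);
	if (ocfs2_truncate_log_needs_flush(osb)) {
		ret = __ocfs2_flush_truncate_log(osb);	/* make room for our deallocs */
		if (ret < 0)
			mlog_errno(ret);
	}
	mutex_unlock(&tl_inode->i_mutex);
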
@@ -2107,7 +2893,7 @@ typedef int (xattr_bucket_func)(struct inode *inode,
2107 void *para); 2893 void *para);
2108 2894
2109static int ocfs2_find_xe_in_bucket(struct inode *inode, 2895static int ocfs2_find_xe_in_bucket(struct inode *inode,
2110 struct buffer_head *header_bh, 2896 struct ocfs2_xattr_bucket *bucket,
2111 int name_index, 2897 int name_index,
2112 const char *name, 2898 const char *name,
2113 u32 name_hash, 2899 u32 name_hash,
@@ -2115,11 +2901,9 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
2115 int *found) 2901 int *found)
2116{ 2902{
2117 int i, ret = 0, cmp = 1, block_off, new_offset; 2903 int i, ret = 0, cmp = 1, block_off, new_offset;
2118 struct ocfs2_xattr_header *xh = 2904 struct ocfs2_xattr_header *xh = bucket_xh(bucket);
2119 (struct ocfs2_xattr_header *)header_bh->b_data;
2120 size_t name_len = strlen(name); 2905 size_t name_len = strlen(name);
2121 struct ocfs2_xattr_entry *xe = NULL; 2906 struct ocfs2_xattr_entry *xe = NULL;
2122 struct buffer_head *name_bh = NULL;
2123 char *xe_name; 2907 char *xe_name;
2124 2908
2125 /* 2909 /*
@@ -2150,19 +2934,9 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
2150 break; 2934 break;
2151 } 2935 }
2152 2936
2153 ret = ocfs2_read_block(inode, header_bh->b_blocknr + block_off,
2154 &name_bh);
2155 if (ret) {
2156 mlog_errno(ret);
2157 break;
2158 }
2159 xe_name = name_bh->b_data + new_offset;
2160 2937
2161 cmp = memcmp(name, xe_name, name_len); 2938 xe_name = bucket_block(bucket, block_off) + new_offset;
2162 brelse(name_bh); 2939 if (!memcmp(name, xe_name, name_len)) {
2163 name_bh = NULL;
2164
2165 if (cmp == 0) {
2166 *xe_index = i; 2940 *xe_index = i;
2167 *found = 1; 2941 *found = 1;
2168 ret = 0; 2942 ret = 0;
@@ -2192,39 +2966,42 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
2192 struct ocfs2_xattr_search *xs) 2966 struct ocfs2_xattr_search *xs)
2193{ 2967{
2194 int ret, found = 0; 2968 int ret, found = 0;
2195 struct buffer_head *bh = NULL;
2196 struct buffer_head *lower_bh = NULL;
2197 struct ocfs2_xattr_header *xh = NULL; 2969 struct ocfs2_xattr_header *xh = NULL;
2198 struct ocfs2_xattr_entry *xe = NULL; 2970 struct ocfs2_xattr_entry *xe = NULL;
2199 u16 index = 0; 2971 u16 index = 0;
2200 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); 2972 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2201 int low_bucket = 0, bucket, high_bucket; 2973 int low_bucket = 0, bucket, high_bucket;
2974 struct ocfs2_xattr_bucket *search;
2202 u32 last_hash; 2975 u32 last_hash;
2203 u64 blkno; 2976 u64 blkno, lower_blkno = 0;
2204 2977
2205 ret = ocfs2_read_block(inode, p_blkno, &bh); 2978 search = ocfs2_xattr_bucket_new(inode);
2979 if (!search) {
2980 ret = -ENOMEM;
2981 mlog_errno(ret);
2982 goto out;
2983 }
2984
2985 ret = ocfs2_read_xattr_bucket(search, p_blkno);
2206 if (ret) { 2986 if (ret) {
2207 mlog_errno(ret); 2987 mlog_errno(ret);
2208 goto out; 2988 goto out;
2209 } 2989 }
2210 2990
2211 xh = (struct ocfs2_xattr_header *)bh->b_data; 2991 xh = bucket_xh(search);
2212 high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1; 2992 high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1;
2213
2214 while (low_bucket <= high_bucket) { 2993 while (low_bucket <= high_bucket) {
2215 brelse(bh); 2994 ocfs2_xattr_bucket_relse(search);
2216 bh = NULL;
2217 bucket = (low_bucket + high_bucket) / 2;
2218 2995
2996 bucket = (low_bucket + high_bucket) / 2;
2219 blkno = p_blkno + bucket * blk_per_bucket; 2997 blkno = p_blkno + bucket * blk_per_bucket;
2220 2998 ret = ocfs2_read_xattr_bucket(search, blkno);
2221 ret = ocfs2_read_block(inode, blkno, &bh);
2222 if (ret) { 2999 if (ret) {
2223 mlog_errno(ret); 3000 mlog_errno(ret);
2224 goto out; 3001 goto out;
2225 } 3002 }
2226 3003
2227 xh = (struct ocfs2_xattr_header *)bh->b_data; 3004 xh = bucket_xh(search);
2228 xe = &xh->xh_entries[0]; 3005 xe = &xh->xh_entries[0];
2229 if (name_hash < le32_to_cpu(xe->xe_name_hash)) { 3006 if (name_hash < le32_to_cpu(xe->xe_name_hash)) {
2230 high_bucket = bucket - 1; 3007 high_bucket = bucket - 1;
@@ -2241,10 +3018,8 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
2241 3018
2242 last_hash = le32_to_cpu(xe->xe_name_hash); 3019 last_hash = le32_to_cpu(xe->xe_name_hash);
2243 3020
2244 /* record lower_bh which may be the insert place. */ 3021 /* record lower_blkno which may be the insert place. */
2245 brelse(lower_bh); 3022 lower_blkno = blkno;
2246 lower_bh = bh;
2247 bh = NULL;
2248 3023
2249 if (name_hash > le32_to_cpu(xe->xe_name_hash)) { 3024 if (name_hash > le32_to_cpu(xe->xe_name_hash)) {
2250 low_bucket = bucket + 1; 3025 low_bucket = bucket + 1;
@@ -2252,7 +3027,7 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
2252 } 3027 }
2253 3028
2254 /* the searched xattr should reside in this bucket if it exists. */ 3029 /* the searched xattr should reside in this bucket if it exists. */
2255 ret = ocfs2_find_xe_in_bucket(inode, lower_bh, 3030 ret = ocfs2_find_xe_in_bucket(inode, search,
2256 name_index, name, name_hash, 3031 name_index, name, name_hash,
2257 &index, &found); 3032 &index, &found);
2258 if (ret) { 3033 if (ret) {
@@ -2267,46 +3042,29 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
2267 * When the xattr's hash value falls in the gap between 2 buckets, 3042 * When the xattr's hash value falls in the gap between 2 buckets,
2268 * we will always set it to the previous bucket. 3043 * we will always set it to the previous bucket.
2269 */ 3044 */
2270 if (!lower_bh) { 3045 if (!lower_blkno)
2271 /* 3046 lower_blkno = p_blkno;
2272 * We can't find any bucket whose first name_hash is less 3047
2273 * than the find name_hash. 3048 /* This should be in cache - we just read it during the search */
2274 */ 3049 ret = ocfs2_read_xattr_bucket(xs->bucket, lower_blkno);
2275 BUG_ON(bh->b_blocknr != p_blkno); 3050 if (ret) {
2276 lower_bh = bh; 3051 mlog_errno(ret);
2277 bh = NULL; 3052 goto out;
2278 } 3053 }
2279 xs->bucket.bhs[0] = lower_bh;
2280 xs->bucket.xh = (struct ocfs2_xattr_header *)
2281 xs->bucket.bhs[0]->b_data;
2282 lower_bh = NULL;
2283 3054
2284 xs->header = xs->bucket.xh; 3055 xs->header = bucket_xh(xs->bucket);
2285 xs->base = xs->bucket.bhs[0]->b_data; 3056 xs->base = bucket_block(xs->bucket, 0);
2286 xs->end = xs->base + inode->i_sb->s_blocksize; 3057 xs->end = xs->base + inode->i_sb->s_blocksize;
2287 3058
2288 if (found) { 3059 if (found) {
2289 /*
2290 * If we have found the xattr enty, read all the blocks in
2291 * this bucket.
2292 */
2293 ret = ocfs2_read_blocks(inode, xs->bucket.bhs[0]->b_blocknr + 1,
2294 blk_per_bucket - 1, &xs->bucket.bhs[1],
2295 0);
2296 if (ret) {
2297 mlog_errno(ret);
2298 goto out;
2299 }
2300
2301 xs->here = &xs->header->xh_entries[index]; 3060 xs->here = &xs->header->xh_entries[index];
2302 mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name, 3061 mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name,
2303 (unsigned long long)xs->bucket.bhs[0]->b_blocknr, index); 3062 (unsigned long long)bucket_blkno(xs->bucket), index);
2304 } else 3063 } else
2305 ret = -ENODATA; 3064 ret = -ENODATA;
2306 3065
2307out: 3066out:
2308 brelse(bh); 3067 ocfs2_xattr_bucket_free(search);
2309 brelse(lower_bh);
2310 return ret; 3068 return ret;
2311} 3069}
2312 3070
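
A self-contained toy model of the bucket search above: buckets are sorted by the hash of their first entry, and the loop keeps the last bucket whose first hash is <= the lookup hash (lower_blkno in the real code), which is also the insert target when the hash falls in a gap. This sketch deliberately drops the real code's extra probe of each bucket's last entry:

	#include <stdint.h>

	/* Return the index of the bucket that must hold (or receive) hash h. */
	static int find_bucket(const uint32_t *first_hash, int nbuckets, uint32_t h)
	{
		int low = 0, high = nbuckets - 1, lower = 0;

		while (low <= high) {
			int mid = (low + high) / 2;

			if (h < first_hash[mid]) {
				high = mid - 1;		/* h sorts before this bucket */
			} else {
				lower = mid;		/* candidate home for h */
				if (h == first_hash[mid])
					break;		/* must live in this bucket */
				low = mid + 1;
			}
		}
		return lower;	/* falls back to bucket 0, like the p_blkno fallback above */
	}
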
@@ -2357,53 +3115,50 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
2357 xattr_bucket_func *func, 3115 xattr_bucket_func *func,
2358 void *para) 3116 void *para)
2359{ 3117{
2360 int i, j, ret = 0; 3118 int i, ret = 0;
2361 int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2362 u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)); 3119 u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
2363 u32 num_buckets = clusters * bpc; 3120 u32 num_buckets = clusters * bpc;
2364 struct ocfs2_xattr_bucket bucket; 3121 struct ocfs2_xattr_bucket *bucket;
2365 3122
2366 memset(&bucket, 0, sizeof(bucket)); 3123 bucket = ocfs2_xattr_bucket_new(inode);
3124 if (!bucket) {
3125 mlog_errno(-ENOMEM);
3126 return -ENOMEM;
3127 }
2367 3128
2368 mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n", 3129 mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n",
2369 clusters, (unsigned long long)blkno); 3130 clusters, (unsigned long long)blkno);
2370 3131
2371 for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) { 3132 for (i = 0; i < num_buckets; i++, blkno += bucket->bu_blocks) {
2372 ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, 3133 ret = ocfs2_read_xattr_bucket(bucket, blkno);
2373 bucket.bhs, 0);
2374 if (ret) { 3134 if (ret) {
2375 mlog_errno(ret); 3135 mlog_errno(ret);
2376 goto out; 3136 break;
2377 } 3137 }
2378 3138
2379 bucket.xh = (struct ocfs2_xattr_header *)bucket.bhs[0]->b_data;
2380 /* 3139 /*
2381 * The real bucket num in this series of blocks is stored 3140 * The real bucket num in this series of blocks is stored
2382 * in the 1st bucket. 3141 * in the 1st bucket.
2383 */ 3142 */
2384 if (i == 0) 3143 if (i == 0)
2385 num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets); 3144 num_buckets = le16_to_cpu(bucket_xh(bucket)->xh_num_buckets);
2386 3145
2387 mlog(0, "iterating xattr bucket %llu, first hash %u\n", 3146 mlog(0, "iterating xattr bucket %llu, first hash %u\n",
2388 (unsigned long long)blkno, 3147 (unsigned long long)blkno,
2389 le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash)); 3148 le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash));
2390 if (func) { 3149 if (func) {
2391 ret = func(inode, &bucket, para); 3150 ret = func(inode, bucket, para);
2392 if (ret) { 3151 if (ret)
2393 mlog_errno(ret); 3152 mlog_errno(ret);
2394 break; 3153 /* Fall through to bucket_relse() */
2395 }
2396 } 3154 }
2397 3155
2398 for (j = 0; j < blk_per_bucket; j++) 3156 ocfs2_xattr_bucket_relse(bucket);
2399 brelse(bucket.bhs[j]); 3157 if (ret)
2400 memset(&bucket, 0, sizeof(bucket)); 3158 break;
2401 } 3159 }
2402 3160
2403out: 3161 ocfs2_xattr_bucket_free(bucket);
2404 for (j = 0; j < blk_per_bucket; j++)
2405 brelse(bucket.bhs[j]);
2406
2407 return ret; 3162 return ret;
2408} 3163}
2409 3164
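
For orientation, a minimal xattr_bucket_func callback matching the typedef shown earlier in this diff; the entry-counting callback itself is hypothetical:

	static int count_bucket_entries(struct inode *inode,
					struct ocfs2_xattr_bucket *bucket,
					void *para)
	{
		int *total = para;

		*total += le16_to_cpu(bucket_xh(bucket)->xh_count);
		return 0;	/* non-zero makes ocfs2_iterate_xattr_buckets() stop */
	}
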
@@ -2441,21 +3196,21 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
2441 int i, block_off, new_offset; 3196 int i, block_off, new_offset;
2442 const char *prefix, *name; 3197 const char *prefix, *name;
2443 3198
2444 for (i = 0 ; i < le16_to_cpu(bucket->xh->xh_count); i++) { 3199 for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) {
2445 struct ocfs2_xattr_entry *entry = &bucket->xh->xh_entries[i]; 3200 struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i];
2446 type = ocfs2_xattr_get_type(entry); 3201 type = ocfs2_xattr_get_type(entry);
2447 prefix = ocfs2_xattr_prefix(type); 3202 prefix = ocfs2_xattr_prefix(type);
2448 3203
2449 if (prefix) { 3204 if (prefix) {
2450 ret = ocfs2_xattr_bucket_get_name_value(inode, 3205 ret = ocfs2_xattr_bucket_get_name_value(inode,
2451 bucket->xh, 3206 bucket_xh(bucket),
2452 i, 3207 i,
2453 &block_off, 3208 &block_off,
2454 &new_offset); 3209 &new_offset);
2455 if (ret) 3210 if (ret)
2456 break; 3211 break;
2457 3212
2458 name = (const char *)bucket->bhs[block_off]->b_data + 3213 name = (const char *)bucket_block(bucket, block_off) +
2459 new_offset; 3214 new_offset;
2460 ret = ocfs2_xattr_list_entry(xl->buffer, 3215 ret = ocfs2_xattr_list_entry(xl->buffer,
2461 xl->buffer_size, 3216 xl->buffer_size,
@@ -2540,32 +3295,34 @@ static void swap_xe(void *a, void *b, int size)
2540/* 3295/*
2541 * When the ocfs2_xattr_block is filled up, a new bucket will be created 3296 * When the ocfs2_xattr_block is filled up, a new bucket will be created
2542 * and all the xattr entries will be moved to the new bucket. 3297 * and all the xattr entries will be moved to the new bucket.
3298 * The header goes at the start of the bucket, and the names+values are
3299 * filled from the end. This is why *target starts as the last buffer.
2543 * Note: we need to sort the entries since they are not saved in order 3300 * Note: we need to sort the entries since they are not saved in order
2544 * in the ocfs2_xattr_block. 3301 * in the ocfs2_xattr_block.
2545 */ 3302 */
2546static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode, 3303static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
2547 struct buffer_head *xb_bh, 3304 struct buffer_head *xb_bh,
2548 struct buffer_head *xh_bh, 3305 struct ocfs2_xattr_bucket *bucket)
2549 struct buffer_head *data_bh)
2550{ 3306{
2551 int i, blocksize = inode->i_sb->s_blocksize; 3307 int i, blocksize = inode->i_sb->s_blocksize;
3308 int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2552 u16 offset, size, off_change; 3309 u16 offset, size, off_change;
2553 struct ocfs2_xattr_entry *xe; 3310 struct ocfs2_xattr_entry *xe;
2554 struct ocfs2_xattr_block *xb = 3311 struct ocfs2_xattr_block *xb =
2555 (struct ocfs2_xattr_block *)xb_bh->b_data; 3312 (struct ocfs2_xattr_block *)xb_bh->b_data;
2556 struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header; 3313 struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header;
2557 struct ocfs2_xattr_header *xh = 3314 struct ocfs2_xattr_header *xh = bucket_xh(bucket);
2558 (struct ocfs2_xattr_header *)xh_bh->b_data;
2559 u16 count = le16_to_cpu(xb_xh->xh_count); 3315 u16 count = le16_to_cpu(xb_xh->xh_count);
2560 char *target = xh_bh->b_data, *src = xb_bh->b_data; 3316 char *src = xb_bh->b_data;
3317 char *target = bucket_block(bucket, blks - 1);
2561 3318
2562 mlog(0, "cp xattr from block %llu to bucket %llu\n", 3319 mlog(0, "cp xattr from block %llu to bucket %llu\n",
2563 (unsigned long long)xb_bh->b_blocknr, 3320 (unsigned long long)xb_bh->b_blocknr,
2564 (unsigned long long)xh_bh->b_blocknr); 3321 (unsigned long long)bucket_blkno(bucket));
3322
3323 for (i = 0; i < blks; i++)
3324 memset(bucket_block(bucket, i), 0, blocksize);
2565 3325
2566 memset(xh_bh->b_data, 0, blocksize);
2567 if (data_bh)
2568 memset(data_bh->b_data, 0, blocksize);
2569 /* 3326 /*
2570 * Since the xe_name_offset is based on ocfs2_xattr_header, 3327 * Since the xe_name_offset is based on ocfs2_xattr_header,
2571 * there is an offset change corresponding to the change of 3328 * there is an offset change corresponding to the change of
@@ -2577,8 +3334,6 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
2577 size = blocksize - offset; 3334 size = blocksize - offset;
2578 3335
2579 /* copy all the names and values. */ 3336 /* copy all the names and values. */
2580 if (data_bh)
2581 target = data_bh->b_data;
2582 memcpy(target + offset, src + offset, size); 3337 memcpy(target + offset, src + offset, size);
2583 3338
2584 /* Init new header now. */ 3339 /* Init new header now. */
@@ -2588,7 +3343,7 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
2588 xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size); 3343 xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size);
2589 3344
2590 /* copy all the entries. */ 3345 /* copy all the entries. */
2591 target = xh_bh->b_data; 3346 target = bucket_block(bucket, 0);
2592 offset = offsetof(struct ocfs2_xattr_header, xh_entries); 3347 offset = offsetof(struct ocfs2_xattr_header, xh_entries);
2593 size = count * sizeof(struct ocfs2_xattr_entry); 3348 size = count * sizeof(struct ocfs2_xattr_entry);
2594 memcpy(target + offset, (char *)xb_xh + offset, size); 3349 memcpy(target + offset, (char *)xb_xh + offset, size);
@@ -2614,73 +3369,47 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
2614 * While if the entry is in index b-tree, "bucket" indicates the 3369 * While if the entry is in index b-tree, "bucket" indicates the
2615 * real place of the xattr. 3370 * real place of the xattr.
2616 */ 3371 */
2617static int ocfs2_xattr_update_xattr_search(struct inode *inode, 3372static void ocfs2_xattr_update_xattr_search(struct inode *inode,
2618 struct ocfs2_xattr_search *xs, 3373 struct ocfs2_xattr_search *xs,
2619 struct buffer_head *old_bh, 3374 struct buffer_head *old_bh)
2620 struct buffer_head *new_bh)
2621{ 3375{
2622 int ret = 0;
2623 char *buf = old_bh->b_data; 3376 char *buf = old_bh->b_data;
2624 struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf; 3377 struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf;
2625 struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header; 3378 struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header;
2626 int i, blocksize = inode->i_sb->s_blocksize; 3379 int i;
2627 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2628
2629 xs->bucket.bhs[0] = new_bh;
2630 get_bh(new_bh);
2631 xs->bucket.xh = (struct ocfs2_xattr_header *)xs->bucket.bhs[0]->b_data;
2632 xs->header = xs->bucket.xh;
2633 3380
2634 xs->base = new_bh->b_data; 3381 xs->header = bucket_xh(xs->bucket);
3382 xs->base = bucket_block(xs->bucket, 0);
2635 xs->end = xs->base + inode->i_sb->s_blocksize; 3383 xs->end = xs->base + inode->i_sb->s_blocksize;
2636 3384
2637 if (!xs->not_found) { 3385 if (xs->not_found)
2638 if (OCFS2_XATTR_BUCKET_SIZE != blocksize) { 3386 return;
2639 ret = ocfs2_read_blocks(inode,
2640 xs->bucket.bhs[0]->b_blocknr + 1,
2641 blk_per_bucket - 1, &xs->bucket.bhs[1],
2642 0);
2643 if (ret) {
2644 mlog_errno(ret);
2645 return ret;
2646 }
2647
2648 }
2649 i = xs->here - old_xh->xh_entries;
2650 xs->here = &xs->header->xh_entries[i];
2651 }
2652 3387
2653 return ret; 3388 i = xs->here - old_xh->xh_entries;
3389 xs->here = &xs->header->xh_entries[i];
2654} 3390}
2655 3391
2656static int ocfs2_xattr_create_index_block(struct inode *inode, 3392static int ocfs2_xattr_create_index_block(struct inode *inode,
2657 struct ocfs2_xattr_search *xs) 3393 struct ocfs2_xattr_search *xs,
3394 struct ocfs2_xattr_set_ctxt *ctxt)
2658{ 3395{
2659 int ret, credits = OCFS2_SUBALLOC_ALLOC; 3396 int ret;
2660 u32 bit_off, len; 3397 u32 bit_off, len;
2661 u64 blkno; 3398 u64 blkno;
2662 handle_t *handle; 3399 handle_t *handle = ctxt->handle;
2663 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3400 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2664 struct ocfs2_inode_info *oi = OCFS2_I(inode); 3401 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2665 struct ocfs2_alloc_context *data_ac;
2666 struct buffer_head *xh_bh = NULL, *data_bh = NULL;
2667 struct buffer_head *xb_bh = xs->xattr_bh; 3402 struct buffer_head *xb_bh = xs->xattr_bh;
2668 struct ocfs2_xattr_block *xb = 3403 struct ocfs2_xattr_block *xb =
2669 (struct ocfs2_xattr_block *)xb_bh->b_data; 3404 (struct ocfs2_xattr_block *)xb_bh->b_data;
2670 struct ocfs2_xattr_tree_root *xr; 3405 struct ocfs2_xattr_tree_root *xr;
2671 u16 xb_flags = le16_to_cpu(xb->xb_flags); 3406 u16 xb_flags = le16_to_cpu(xb->xb_flags);
2672 u16 bpb = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2673 3407
2674 mlog(0, "create xattr index block for %llu\n", 3408 mlog(0, "create xattr index block for %llu\n",
2675 (unsigned long long)xb_bh->b_blocknr); 3409 (unsigned long long)xb_bh->b_blocknr);
2676 3410
2677 BUG_ON(xb_flags & OCFS2_XATTR_INDEXED); 3411 BUG_ON(xb_flags & OCFS2_XATTR_INDEXED);
2678 3412 BUG_ON(!xs->bucket);
2679 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
2680 if (ret) {
2681 mlog_errno(ret);
2682 goto out;
2683 }
2684 3413
2685 /* 3414 /*
2686 * XXX: 3415 * XXX:
@@ -2689,29 +3418,18 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
2689 */ 3418 */
2690 down_write(&oi->ip_alloc_sem); 3419 down_write(&oi->ip_alloc_sem);
2691 3420
2692 /* 3421 ret = ocfs2_journal_access_xb(handle, inode, xb_bh,
2693 * 3 more credits, one for xattr block update, one for the 1st block 3422 OCFS2_JOURNAL_ACCESS_WRITE);
2694 * of the new xattr bucket and one for the value/data.
2695 */
2696 credits += 3;
2697 handle = ocfs2_start_trans(osb, credits);
2698 if (IS_ERR(handle)) {
2699 ret = PTR_ERR(handle);
2700 mlog_errno(ret);
2701 goto out_sem;
2702 }
2703
2704 ret = ocfs2_journal_access(handle, inode, xb_bh,
2705 OCFS2_JOURNAL_ACCESS_WRITE);
2706 if (ret) { 3423 if (ret) {
2707 mlog_errno(ret); 3424 mlog_errno(ret);
2708 goto out_commit; 3425 goto out;
2709 } 3426 }
2710 3427
2711 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len); 3428 ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac,
3429 1, 1, &bit_off, &len);
2712 if (ret) { 3430 if (ret) {
2713 mlog_errno(ret); 3431 mlog_errno(ret);
2714 goto out_commit; 3432 goto out;
2715 } 3433 }
2716 3434
2717 /* 3435 /*
@@ -2724,51 +3442,23 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
2724 mlog(0, "allocate 1 cluster from %llu to xattr block\n", 3442 mlog(0, "allocate 1 cluster from %llu to xattr block\n",
2725 (unsigned long long)blkno); 3443 (unsigned long long)blkno);
2726 3444
2727 xh_bh = sb_getblk(inode->i_sb, blkno); 3445 ret = ocfs2_init_xattr_bucket(xs->bucket, blkno);
2728 if (!xh_bh) { 3446 if (ret) {
2729 ret = -EIO;
2730 mlog_errno(ret); 3447 mlog_errno(ret);
2731 goto out_commit; 3448 goto out;
2732 } 3449 }
2733 3450
2734 ocfs2_set_new_buffer_uptodate(inode, xh_bh); 3451 ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
2735 3452 OCFS2_JOURNAL_ACCESS_CREATE);
2736 ret = ocfs2_journal_access(handle, inode, xh_bh,
2737 OCFS2_JOURNAL_ACCESS_CREATE);
2738 if (ret) { 3453 if (ret) {
2739 mlog_errno(ret); 3454 mlog_errno(ret);
2740 goto out_commit; 3455 goto out;
2741 }
2742
2743 if (bpb > 1) {
2744 data_bh = sb_getblk(inode->i_sb, blkno + bpb - 1);
2745 if (!data_bh) {
2746 ret = -EIO;
2747 mlog_errno(ret);
2748 goto out_commit;
2749 }
2750
2751 ocfs2_set_new_buffer_uptodate(inode, data_bh);
2752
2753 ret = ocfs2_journal_access(handle, inode, data_bh,
2754 OCFS2_JOURNAL_ACCESS_CREATE);
2755 if (ret) {
2756 mlog_errno(ret);
2757 goto out_commit;
2758 }
2759 } 3456 }
2760 3457
2761 ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xh_bh, data_bh); 3458 ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xs->bucket);
3459 ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
2762 3460
2763 ocfs2_journal_dirty(handle, xh_bh); 3461 ocfs2_xattr_update_xattr_search(inode, xs, xb_bh);
2764 if (data_bh)
2765 ocfs2_journal_dirty(handle, data_bh);
2766
2767 ret = ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh);
2768 if (ret) {
2769 mlog_errno(ret);
2770 goto out_commit;
2771 }
2772 3462
2773 /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */ 3463 /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */
2774 memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize - 3464 memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
@@ -2787,24 +3477,10 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
2787 3477
2788 xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED); 3478 xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED);
2789 3479
2790 ret = ocfs2_journal_dirty(handle, xb_bh); 3480 ocfs2_journal_dirty(handle, xb_bh);
2791 if (ret) {
2792 mlog_errno(ret);
2793 goto out_commit;
2794 }
2795
2796out_commit:
2797 ocfs2_commit_trans(osb, handle);
2798
2799out_sem:
2800 up_write(&oi->ip_alloc_sem);
2801 3481
2802out: 3482out:
2803 if (data_ac) 3483 up_write(&oi->ip_alloc_sem);
2804 ocfs2_free_alloc_context(data_ac);
2805
2806 brelse(xh_bh);
2807 brelse(data_bh);
2808 3484
2809 return ret; 3485 return ret;
2810} 3486}
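
A minimal user-space model of the ocfs2_xattr_set_ctxt refactor visible in the hunk above: the caller now owns the transaction handle and the allocator reservations, and helpers consume them through one context struct instead of starting and committing their own transaction. All types and functions below are stand-ins for illustration, not the kernel API.

#include <stdio.h>
#include <stdlib.h>

struct handle { int credits; };
struct alloc_ctx { int reserved; };

struct set_ctxt {
        struct handle *handle;      /* owned by the caller */
        struct alloc_ctx *data_ac;  /* cluster reservation */
        struct alloc_ctx *meta_ac;  /* metadata reservation */
};

/* Helper only consumes resources from the context; no start/commit. */
static int create_index_block(struct set_ctxt *ctxt)
{
        if (ctxt->handle->credits < 3 || ctxt->data_ac->reserved < 1)
                return -1;              /* caller under-provisioned */
        ctxt->handle->credits -= 3;     /* xattr block + bucket + value */
        ctxt->data_ac->reserved -= 1;   /* one cluster claimed */
        return 0;
}

int main(void)
{
        struct handle h = { .credits = 8 };
        struct alloc_ctx d = { .reserved = 1 }, m = { .reserved = 1 };
        struct set_ctxt ctxt = { &h, &d, &m };

        if (create_index_block(&ctxt))  /* caller commits afterwards */
                return EXIT_FAILURE;
        printf("credits left: %d\n", h.credits);
        return 0;
}

Centralizing the handle is why the error paths above can collapse from out_commit/out_sem/out down to a single out label.
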
@@ -2829,29 +3505,18 @@ static int cmp_xe_offset(const void *a, const void *b)
2829 * so that we can spare some space for insertion. 3505 * so that we can spare some space for insertion.
2830 */ 3506 */
2831static int ocfs2_defrag_xattr_bucket(struct inode *inode, 3507static int ocfs2_defrag_xattr_bucket(struct inode *inode,
3508 handle_t *handle,
2832 struct ocfs2_xattr_bucket *bucket) 3509 struct ocfs2_xattr_bucket *bucket)
2833{ 3510{
2834 int ret, i; 3511 int ret, i;
2835 size_t end, offset, len, value_len; 3512 size_t end, offset, len, value_len;
2836 struct ocfs2_xattr_header *xh; 3513 struct ocfs2_xattr_header *xh;
2837 char *entries, *buf, *bucket_buf = NULL; 3514 char *entries, *buf, *bucket_buf = NULL;
2838 u64 blkno = bucket->bhs[0]->b_blocknr; 3515 u64 blkno = bucket_blkno(bucket);
2839 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
2840 u16 xh_free_start; 3516 u16 xh_free_start;
2841 size_t blocksize = inode->i_sb->s_blocksize; 3517 size_t blocksize = inode->i_sb->s_blocksize;
2842 handle_t *handle;
2843 struct buffer_head **bhs;
2844 struct ocfs2_xattr_entry *xe; 3518 struct ocfs2_xattr_entry *xe;
2845 3519
2846 bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
2847 GFP_NOFS);
2848 if (!bhs)
2849 return -ENOMEM;
2850
2851 ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, bhs, 0);
2852 if (ret)
2853 goto out;
2854
2855 /* 3520 /*
2856 * In order to make the operation more efficient and generic, 3521 * In order to make the operation more efficient and generic,
2857 * we copy all the blocks into contiguous memory and do the 3522 * we copy all the blocks into contiguous memory and do the
@@ -2865,26 +3530,16 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
2865 } 3530 }
2866 3531
2867 buf = bucket_buf; 3532 buf = bucket_buf;
2868 for (i = 0; i < blk_per_bucket; i++, buf += blocksize) 3533 for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize)
2869 memcpy(buf, bhs[i]->b_data, blocksize); 3534 memcpy(buf, bucket_block(bucket, i), blocksize);
2870 3535
2871 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), blk_per_bucket); 3536 ret = ocfs2_xattr_bucket_journal_access(handle, bucket,
2872 if (IS_ERR(handle)) { 3537 OCFS2_JOURNAL_ACCESS_WRITE);
2873 ret = PTR_ERR(handle); 3538 if (ret < 0) {
2874 handle = NULL;
2875 mlog_errno(ret); 3539 mlog_errno(ret);
2876 goto out; 3540 goto out;
2877 } 3541 }
2878 3542
2879 for (i = 0; i < blk_per_bucket; i++) {
2880 ret = ocfs2_journal_access(handle, inode, bhs[i],
2881 OCFS2_JOURNAL_ACCESS_WRITE);
2882 if (ret < 0) {
2883 mlog_errno(ret);
2884 goto commit;
2885 }
2886 }
2887
2888 xh = (struct ocfs2_xattr_header *)bucket_buf; 3543 xh = (struct ocfs2_xattr_header *)bucket_buf;
2889 entries = (char *)xh->xh_entries; 3544 entries = (char *)xh->xh_entries;
2890 xh_free_start = le16_to_cpu(xh->xh_free_start); 3545 xh_free_start = le16_to_cpu(xh->xh_free_start);
@@ -2940,7 +3595,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
2940 "bucket %llu\n", (unsigned long long)blkno); 3595 "bucket %llu\n", (unsigned long long)blkno);
2941 3596
2942 if (xh_free_start == end) 3597 if (xh_free_start == end)
2943 goto commit; 3598 goto out;
2944 3599
2945 memset(bucket_buf + xh_free_start, 0, end - xh_free_start); 3600 memset(bucket_buf + xh_free_start, 0, end - xh_free_start);
2946 xh->xh_free_start = cpu_to_le16(end); 3601 xh->xh_free_start = cpu_to_le16(end);
@@ -2951,169 +3606,94 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
2951 cmp_xe, swap_xe); 3606 cmp_xe, swap_xe);
2952 3607
2953 buf = bucket_buf; 3608 buf = bucket_buf;
2954 for (i = 0; i < blk_per_bucket; i++, buf += blocksize) { 3609 for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize)
2955 memcpy(bhs[i]->b_data, buf, blocksize); 3610 memcpy(bucket_block(bucket, i), buf, blocksize);
2956 ocfs2_journal_dirty(handle, bhs[i]); 3611 ocfs2_xattr_bucket_journal_dirty(handle, bucket);
2957 }
2958 3612
2959commit:
2960 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
2961out: 3613out:
2962
2963 if (bhs) {
2964 for (i = 0; i < blk_per_bucket; i++)
2965 brelse(bhs[i]);
2966 }
2967 kfree(bhs);
2968
2969 kfree(bucket_buf); 3614 kfree(bucket_buf);
2970 return ret; 3615 return ret;
2971} 3616}
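
A user-space sketch of the defrag idea in ocfs2_defrag_xattr_bucket above: entries are packed from the front of the bucket, name/value payloads from the back, and compaction slides each payload toward the end so the free gap in the middle becomes contiguous. The layout and field names below are simplified stand-ins.

#include <stdio.h>
#include <string.h>

#define BUCKET_SIZE 64

struct ent { unsigned short off, len; };

static void compact(char *buf, struct ent *ents, int count)
{
        unsigned short end = BUCKET_SIZE;
        /* Assume ents[] is already sorted by descending offset, the
         * way the kernel sorts with cmp_xe_offset before compacting. */
        for (int i = 0; i < count; i++) {
                end -= ents[i].len;
                /* Regions may overlap, so memmove, not memcpy. */
                memmove(buf + end, buf + ents[i].off, ents[i].len);
                ents[i].off = end;
        }
}

int main(void)
{
        char buf[BUCKET_SIZE] = {0};
        struct ent ents[2] = { { 40, 5 }, { 20, 5 } };

        memcpy(buf + 40, "world", 5);
        memcpy(buf + 20, "hello", 5);
        compact(buf, ents, 2);
        printf("free space now ends at %u\n", ents[1].off);
        return 0;
}

Doing this on a contiguous scratch buffer, as the kernel code does, keeps the per-block buffer_heads untouched until the compacted image is copied back.
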
2972 3617
2973/* 3618/*
2974 * Move half of the xattr buckets in the previous cluster to this new 3619 * prev_blkno points to the start of an existing extent. new_blkno
2975 * cluster. We only touch the last cluster of the previous extent record. 3620 * points to a newly allocated extent. Because we know each of our
3621 * clusters contains more than one bucket, we can easily split one cluster
3622 * at a bucket boundary. So we take the last cluster of the existing
3623 * extent and split it down the middle. We move the last half of the
3624 * buckets in the last cluster of the existing extent over to the new
3625 * extent.
3626 *
3627 * first_bh is the buffer at prev_blkno so we can update the existing
3628 * extent's bucket count. header_bh is the bucket where we were hoping
3629 * to insert our xattr. If the bucket move places the target in the new
3630 * extent, we'll update first_bh and header_bh after modifying the old
3631 * extent.
2976 * 3632 *
2977 * first_bh is the first buffer_head of a series of buckets in the same 3633 * first_hash will be set as the 1st xe's name_hash in the new extent.
2978 * extent rec and header_bh is the header of one bucket in this cluster.
2979 * They will be updated if we move the data header_bh contains to the new
2980 * cluster. first_hash will be set as the 1st xe's name_hash of the new cluster.
2981 */ 3634 */
2982static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode, 3635static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
2983 handle_t *handle, 3636 handle_t *handle,
2984 struct buffer_head **first_bh, 3637 struct ocfs2_xattr_bucket *first,
2985 struct buffer_head **header_bh, 3638 struct ocfs2_xattr_bucket *target,
2986 u64 new_blkno, 3639 u64 new_blkno,
2987 u64 prev_blkno,
2988 u32 num_clusters, 3640 u32 num_clusters,
2989 u32 *first_hash) 3641 u32 *first_hash)
2990{ 3642{
2991 int i, ret, credits; 3643 int ret;
2992 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3644 struct super_block *sb = inode->i_sb;
2993 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); 3645 int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(sb);
2994 int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); 3646 int num_buckets = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
2995 int blocksize = inode->i_sb->s_blocksize; 3647 int to_move = num_buckets / 2;
2996 struct buffer_head *old_bh, *new_bh, *prev_bh, *new_first_bh = NULL; 3648 u64 src_blkno;
2997 struct ocfs2_xattr_header *new_xh; 3649 u64 last_cluster_blkno = bucket_blkno(first) +
2998 struct ocfs2_xattr_header *xh = 3650 ((num_clusters - 1) * ocfs2_clusters_to_blocks(sb, 1));
2999 (struct ocfs2_xattr_header *)((*first_bh)->b_data);
3000
3001 BUG_ON(le16_to_cpu(xh->xh_num_buckets) < num_buckets);
3002 BUG_ON(OCFS2_XATTR_BUCKET_SIZE == osb->s_clustersize);
3003
3004 prev_bh = *first_bh;
3005 get_bh(prev_bh);
3006 xh = (struct ocfs2_xattr_header *)prev_bh->b_data;
3007 3651
3008 prev_blkno += (num_clusters - 1) * bpc + bpc / 2; 3652 BUG_ON(le16_to_cpu(bucket_xh(first)->xh_num_buckets) < num_buckets);
3653 BUG_ON(OCFS2_XATTR_BUCKET_SIZE == OCFS2_SB(sb)->s_clustersize);
3009 3654
3010 mlog(0, "move half of xattrs in cluster %llu to %llu\n", 3655 mlog(0, "move half of xattrs in cluster %llu to %llu\n",
3011 (unsigned long long)prev_blkno, (unsigned long long)new_blkno); 3656 (unsigned long long)last_cluster_blkno, (unsigned long long)new_blkno);
3012 3657
3013 /* 3658 ret = ocfs2_mv_xattr_buckets(inode, handle, bucket_blkno(first),
3014 * We need to update the 1st half of the new cluster and 3659 last_cluster_blkno, new_blkno,
3015 * 1 more for the update of the 1st bucket of the previous 3660 to_move, first_hash);
3016 * extent record.
3017 */
3018 credits = bpc / 2 + 1;
3019 ret = ocfs2_extend_trans(handle, credits);
3020 if (ret) { 3661 if (ret) {
3021 mlog_errno(ret); 3662 mlog_errno(ret);
3022 goto out; 3663 goto out;
3023 } 3664 }
3024 3665
3025 ret = ocfs2_journal_access(handle, inode, prev_bh, 3666 /* This is the first bucket that got moved */
3026 OCFS2_JOURNAL_ACCESS_WRITE); 3667 src_blkno = last_cluster_blkno + (to_move * blks_per_bucket);
3027 if (ret) {
3028 mlog_errno(ret);
3029 goto out;
3030 }
3031 3668
3032 for (i = 0; i < bpc / 2; i++, prev_blkno++, new_blkno++) { 3669 /*
3033 old_bh = new_bh = NULL; 3670 * If the target bucket was part of the moved buckets, we need to
3034 new_bh = sb_getblk(inode->i_sb, new_blkno); 3671 * update first and target.
3035 if (!new_bh) { 3672 */
3036 ret = -EIO; 3673 if (bucket_blkno(target) >= src_blkno) {
3037 mlog_errno(ret); 3674 /* Find the block for the new target bucket */
3038 goto out; 3675 src_blkno = new_blkno +
3039 } 3676 (bucket_blkno(target) - src_blkno);
3040 3677
3041 ocfs2_set_new_buffer_uptodate(inode, new_bh); 3678 ocfs2_xattr_bucket_relse(first);
3679 ocfs2_xattr_bucket_relse(target);
3042 3680
3043 ret = ocfs2_journal_access(handle, inode, new_bh, 3681 /*
3044 OCFS2_JOURNAL_ACCESS_CREATE); 3682 * These shouldn't fail - the buffers are in the
3045 if (ret < 0) { 3683 * journal from ocfs2_cp_xattr_bucket().
3684 */
3685 ret = ocfs2_read_xattr_bucket(first, new_blkno);
3686 if (ret) {
3046 mlog_errno(ret); 3687 mlog_errno(ret);
3047 brelse(new_bh);
3048 goto out; 3688 goto out;
3049 } 3689 }
3050 3690 ret = ocfs2_read_xattr_bucket(target, src_blkno);
3051 ret = ocfs2_read_block(inode, prev_blkno, &old_bh); 3691 if (ret)
3052 if (ret < 0) {
3053 mlog_errno(ret); 3692 mlog_errno(ret);
3054 brelse(new_bh);
3055 goto out;
3056 }
3057 3693
3058 memcpy(new_bh->b_data, old_bh->b_data, blocksize);
3059
3060 if (i == 0) {
3061 new_xh = (struct ocfs2_xattr_header *)new_bh->b_data;
3062 new_xh->xh_num_buckets = cpu_to_le16(num_buckets / 2);
3063
3064 if (first_hash)
3065 *first_hash = le32_to_cpu(
3066 new_xh->xh_entries[0].xe_name_hash);
3067 new_first_bh = new_bh;
3068 get_bh(new_first_bh);
3069 }
3070
3071 ocfs2_journal_dirty(handle, new_bh);
3072
3073 if (*header_bh == old_bh) {
3074 brelse(*header_bh);
3075 *header_bh = new_bh;
3076 get_bh(*header_bh);
3077
3078 brelse(*first_bh);
3079 *first_bh = new_first_bh;
3080 get_bh(*first_bh);
3081 }
3082 brelse(new_bh);
3083 brelse(old_bh);
3084 } 3694 }
3085 3695
3086 le16_add_cpu(&xh->xh_num_buckets, -(num_buckets / 2));
3087
3088 ocfs2_journal_dirty(handle, prev_bh);
3089out: 3696out:
3090 brelse(prev_bh);
3091 brelse(new_first_bh);
3092 return ret;
3093}
3094
3095static int ocfs2_read_xattr_bucket(struct inode *inode,
3096 u64 blkno,
3097 struct buffer_head **bhs,
3098 int new)
3099{
3100 int ret = 0;
3101 u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3102
3103 if (!new)
3104 return ocfs2_read_blocks(inode, blkno,
3105 blk_per_bucket, bhs, 0);
3106
3107 for (i = 0; i < blk_per_bucket; i++) {
3108 bhs[i] = sb_getblk(inode->i_sb, blkno + i);
3109 if (bhs[i] == NULL) {
3110 ret = -EIO;
3111 mlog_errno(ret);
3112 break;
3113 }
3114 ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
3115 }
3116
3117 return ret; 3697 return ret;
3118} 3698}
3119 3699
@@ -3178,8 +3758,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
3178{ 3758{
3179 int ret, i; 3759 int ret, i;
3180 int count, start, len, name_value_len = 0, xe_len, name_offset = 0; 3760 int count, start, len, name_value_len = 0, xe_len, name_offset = 0;
3181 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); 3761 struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
3182 struct buffer_head **s_bhs, **t_bhs = NULL;
3183 struct ocfs2_xattr_header *xh; 3762 struct ocfs2_xattr_header *xh;
3184 struct ocfs2_xattr_entry *xe; 3763 struct ocfs2_xattr_entry *xe;
3185 int blocksize = inode->i_sb->s_blocksize; 3764 int blocksize = inode->i_sb->s_blocksize;
@@ -3187,47 +3766,52 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
3187 mlog(0, "move some of xattrs from bucket %llu to %llu\n", 3766 mlog(0, "move some of xattrs from bucket %llu to %llu\n",
3188 (unsigned long long)blk, (unsigned long long)new_blk); 3767 (unsigned long long)blk, (unsigned long long)new_blk);
3189 3768
3190 s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS); 3769 s_bucket = ocfs2_xattr_bucket_new(inode);
3191 if (!s_bhs) 3770 t_bucket = ocfs2_xattr_bucket_new(inode);
3192 return -ENOMEM; 3771 if (!s_bucket || !t_bucket) {
3193 3772 ret = -ENOMEM;
3194 ret = ocfs2_read_xattr_bucket(inode, blk, s_bhs, 0);
3195 if (ret) {
3196 mlog_errno(ret); 3773 mlog_errno(ret);
3197 goto out; 3774 goto out;
3198 } 3775 }
3199 3776
3200 ret = ocfs2_journal_access(handle, inode, s_bhs[0], 3777 ret = ocfs2_read_xattr_bucket(s_bucket, blk);
3201 OCFS2_JOURNAL_ACCESS_WRITE);
3202 if (ret) { 3778 if (ret) {
3203 mlog_errno(ret); 3779 mlog_errno(ret);
3204 goto out; 3780 goto out;
3205 } 3781 }
3206 3782
3207 t_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS); 3783 ret = ocfs2_xattr_bucket_journal_access(handle, s_bucket,
3208 if (!t_bhs) { 3784 OCFS2_JOURNAL_ACCESS_WRITE);
3209 ret = -ENOMEM; 3785 if (ret) {
3786 mlog_errno(ret);
3210 goto out; 3787 goto out;
3211 } 3788 }
3212 3789
3213 ret = ocfs2_read_xattr_bucket(inode, new_blk, t_bhs, new_bucket_head); 3790 /*
3791 * Even if !new_bucket_head, we're overwriting t_bucket. Thus,
3792 * there's no need to read it.
3793 */
3794 ret = ocfs2_init_xattr_bucket(t_bucket, new_blk);
3214 if (ret) { 3795 if (ret) {
3215 mlog_errno(ret); 3796 mlog_errno(ret);
3216 goto out; 3797 goto out;
3217 } 3798 }
3218 3799
3219 for (i = 0; i < blk_per_bucket; i++) { 3800 /*
3220 ret = ocfs2_journal_access(handle, inode, t_bhs[i], 3801 * Hey, if we're overwriting t_bucket, what difference does
3221 new_bucket_head ? 3802 * ACCESS_CREATE vs ACCESS_WRITE make? See the comment in the
3222 OCFS2_JOURNAL_ACCESS_CREATE : 3803 * same part of ocfs2_cp_xattr_bucket().
3223 OCFS2_JOURNAL_ACCESS_WRITE); 3804 */
3224 if (ret) { 3805 ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket,
3225 mlog_errno(ret); 3806 new_bucket_head ?
3226 goto out; 3807 OCFS2_JOURNAL_ACCESS_CREATE :
3227 } 3808 OCFS2_JOURNAL_ACCESS_WRITE);
3809 if (ret) {
3810 mlog_errno(ret);
3811 goto out;
3228 } 3812 }
3229 3813
3230 xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data; 3814 xh = bucket_xh(s_bucket);
3231 count = le16_to_cpu(xh->xh_count); 3815 count = le16_to_cpu(xh->xh_count);
3232 start = ocfs2_xattr_find_divide_pos(xh); 3816 start = ocfs2_xattr_find_divide_pos(xh);
3233 3817
@@ -3239,10 +3823,10 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
3239 * The hash value is set as one larger than 3823 * The hash value is set as one larger than
3240 * that of the last entry in the previous bucket. 3824 * that of the last entry in the previous bucket.
3241 */ 3825 */
3242 for (i = 0; i < blk_per_bucket; i++) 3826 for (i = 0; i < t_bucket->bu_blocks; i++)
3243 memset(t_bhs[i]->b_data, 0, blocksize); 3827 memset(bucket_block(t_bucket, i), 0, blocksize);
3244 3828
3245 xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data; 3829 xh = bucket_xh(t_bucket);
3246 xh->xh_free_start = cpu_to_le16(blocksize); 3830 xh->xh_free_start = cpu_to_le16(blocksize);
3247 xh->xh_entries[0].xe_name_hash = xe->xe_name_hash; 3831 xh->xh_entries[0].xe_name_hash = xe->xe_name_hash;
3248 le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1); 3832 le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1);
@@ -3251,11 +3835,10 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
3251 } 3835 }
3252 3836
3253 /* copy the whole bucket to the new first. */ 3837 /* copy the whole bucket to the new first. */
3254 for (i = 0; i < blk_per_bucket; i++) 3838 ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket);
3255 memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
3256 3839
3257 /* update the new bucket. */ 3840 /* update the new bucket. */
3258 xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data; 3841 xh = bucket_xh(t_bucket);
3259 3842
3260 /* 3843 /*
3261 * Calculate the total name/value len and xh_free_start for 3844 * Calculate the total name/value len and xh_free_start for
@@ -3319,11 +3902,7 @@ set_num_buckets:
3319 else 3902 else
3320 xh->xh_num_buckets = 0; 3903 xh->xh_num_buckets = 0;
3321 3904
3322 for (i = 0; i < blk_per_bucket; i++) { 3905 ocfs2_xattr_bucket_journal_dirty(handle, t_bucket);
3323 ocfs2_journal_dirty(handle, t_bhs[i]);
3324 if (ret)
3325 mlog_errno(ret);
3326 }
3327 3906
3328 /* store the first_hash of the new bucket. */ 3907 /* store the first_hash of the new bucket. */
3329 if (first_hash) 3908 if (first_hash)
@@ -3337,29 +3916,18 @@ set_num_buckets:
3337 if (start == count) 3916 if (start == count)
3338 goto out; 3917 goto out;
3339 3918
3340 xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data; 3919 xh = bucket_xh(s_bucket);
3341 memset(&xh->xh_entries[start], 0, 3920 memset(&xh->xh_entries[start], 0,
3342 sizeof(struct ocfs2_xattr_entry) * (count - start)); 3921 sizeof(struct ocfs2_xattr_entry) * (count - start));
3343 xh->xh_count = cpu_to_le16(start); 3922 xh->xh_count = cpu_to_le16(start);
3344 xh->xh_free_start = cpu_to_le16(name_offset); 3923 xh->xh_free_start = cpu_to_le16(name_offset);
3345 xh->xh_name_value_len = cpu_to_le16(name_value_len); 3924 xh->xh_name_value_len = cpu_to_le16(name_value_len);
3346 3925
3347 ocfs2_journal_dirty(handle, s_bhs[0]); 3926 ocfs2_xattr_bucket_journal_dirty(handle, s_bucket);
3348 if (ret)
3349 mlog_errno(ret);
3350 3927
3351out: 3928out:
3352 if (s_bhs) { 3929 ocfs2_xattr_bucket_free(s_bucket);
3353 for (i = 0; i < blk_per_bucket; i++) 3930 ocfs2_xattr_bucket_free(t_bucket);
3354 brelse(s_bhs[i]);
3355 }
3356 kfree(s_bhs);
3357
3358 if (t_bhs) {
3359 for (i = 0; i < blk_per_bucket; i++)
3360 brelse(t_bhs[i]);
3361 }
3362 kfree(t_bhs);
3363 3931
3364 return ret; 3932 return ret;
3365} 3933}
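
A user-space sketch of the split performed by ocfs2_divide_xattr_bucket above: entries sorted by name_hash are cut at a divide position, the tail moves to the new bucket, and the source bucket's count shrinks while the moved slots are zeroed. Field names are illustrative stand-ins.

#include <stdio.h>
#include <string.h>

struct xentry { unsigned int hash; };

struct xbucket {
        int count;
        struct xentry entries[8];
};

static void divide(struct xbucket *s, struct xbucket *t, int start)
{
        int moved = s->count - start;

        /* Tail of the source becomes the head of the target... */
        memcpy(t->entries, &s->entries[start],
               moved * sizeof(struct xentry));
        t->count = moved;

        /* ...and is zeroed out of the source, like the memset above. */
        memset(&s->entries[start], 0, moved * sizeof(struct xentry));
        s->count = start;
}

int main(void)
{
        struct xbucket s = { 4, { {1}, {2}, {7}, {9} } }, t = {0};

        divide(&s, &t, 2);
        printf("old=%d new=%d first_hash=%u\n", s.count, t.count,
               t.entries[0].hash);
        return 0;
}

Keeping the entries hash-sorted is what allows the new bucket's first_hash to be reported back to the caller for the extent map.
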
@@ -3376,10 +3944,8 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
3376 u64 t_blkno, 3944 u64 t_blkno,
3377 int t_is_new) 3945 int t_is_new)
3378{ 3946{
3379 int ret, i; 3947 int ret;
3380 int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); 3948 struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
3381 int blocksize = inode->i_sb->s_blocksize;
3382 struct buffer_head **s_bhs, **t_bhs = NULL;
3383 3949
3384 BUG_ON(s_blkno == t_blkno); 3950 BUG_ON(s_blkno == t_blkno);
3385 3951
@@ -3387,92 +3953,115 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
3387 (unsigned long long)s_blkno, (unsigned long long)t_blkno, 3953 (unsigned long long)s_blkno, (unsigned long long)t_blkno,
3388 t_is_new); 3954 t_is_new);
3389 3955
3390 s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, 3956 s_bucket = ocfs2_xattr_bucket_new(inode);
3391 GFP_NOFS); 3957 t_bucket = ocfs2_xattr_bucket_new(inode);
3392 if (!s_bhs) 3958 if (!s_bucket || !t_bucket) {
3393 return -ENOMEM; 3959 ret = -ENOMEM;
3960 mlog_errno(ret);
3961 goto out;
3962 }
3394 3963
3395 ret = ocfs2_read_xattr_bucket(inode, s_blkno, s_bhs, 0); 3964 ret = ocfs2_read_xattr_bucket(s_bucket, s_blkno);
3396 if (ret) 3965 if (ret)
3397 goto out; 3966 goto out;
3398 3967
3399 t_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, 3968 /*
3400 GFP_NOFS); 3969 * Even if !t_is_new, we're overwriting t_bucket. Thus,
3401 if (!t_bhs) { 3970 * there's no need to read it.
3402 ret = -ENOMEM; 3971 */
3972 ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno);
3973 if (ret)
3403 goto out; 3974 goto out;
3404 }
3405 3975
3406 ret = ocfs2_read_xattr_bucket(inode, t_blkno, t_bhs, t_is_new); 3976 /*
3977 * Hey, if we're overwriting t_bucket, what difference does
3978 * ACCESS_CREATE vs ACCESS_WRITE make? Well, if we allocated a new
3979 * cluster to fill, we came here from
3980 * ocfs2_mv_xattr_buckets(), and it is really new -
3981 * ACCESS_CREATE is required. But we also might have moved data
3982 * out of t_bucket before extending back into it.
3983 * ocfs2_add_new_xattr_bucket() can do this - its call to
3984 * ocfs2_add_new_xattr_cluster() may have created a new extent
3985 * and copied out the end of the old extent. Then it re-extends
3986 * the old extent back to create space for new xattrs. That's
3987 * how we get here, and the bucket isn't really new.
3988 */
3989 ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket,
3990 t_is_new ?
3991 OCFS2_JOURNAL_ACCESS_CREATE :
3992 OCFS2_JOURNAL_ACCESS_WRITE);
3407 if (ret) 3993 if (ret)
3408 goto out; 3994 goto out;
3409 3995
3410 for (i = 0; i < blk_per_bucket; i++) { 3996 ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket);
3411 ret = ocfs2_journal_access(handle, inode, t_bhs[i], 3997 ocfs2_xattr_bucket_journal_dirty(handle, t_bucket);
3412 t_is_new ?
3413 OCFS2_JOURNAL_ACCESS_CREATE :
3414 OCFS2_JOURNAL_ACCESS_WRITE);
3415 if (ret)
3416 goto out;
3417 }
3418
3419 for (i = 0; i < blk_per_bucket; i++) {
3420 memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
3421 ocfs2_journal_dirty(handle, t_bhs[i]);
3422 }
3423 3998
3424out: 3999out:
3425 if (s_bhs) { 4000 ocfs2_xattr_bucket_free(t_bucket);
3426 for (i = 0; i < blk_per_bucket; i++) 4001 ocfs2_xattr_bucket_free(s_bucket);
3427 brelse(s_bhs[i]);
3428 }
3429 kfree(s_bhs);
3430
3431 if (t_bhs) {
3432 for (i = 0; i < blk_per_bucket; i++)
3433 brelse(t_bhs[i]);
3434 }
3435 kfree(t_bhs);
3436 4002
3437 return ret; 4003 return ret;
3438} 4004}
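
A user-space sketch of the per-block copy that ocfs2_xattr_bucket_copy_data performs in the hunk above: a bucket spans several blocks, so copying it means one memcpy per block into the destination's buffers. The bucket layout below is a stand-in with assumed geometry.

#include <stdio.h>
#include <string.h>

#define BLOCKS 4
#define BLKSZ  8

struct bucket { char blk[BLOCKS][BLKSZ]; };

static void bucket_copy_data(struct bucket *dst, const struct bucket *src)
{
        for (int i = 0; i < BLOCKS; i++)    /* one copy per block */
                memcpy(dst->blk[i], src->blk[i], BLKSZ);
}

int main(void)
{
        struct bucket s, t;

        memset(&s, 'a', sizeof(s));
        memset(&t, 0, sizeof(t));
        bucket_copy_data(&t, &s);
        printf("copied: %.*s\n", BLKSZ, t.blk[BLOCKS - 1]);
        return 0;
}

Hiding the per-block loop inside the bucket helper is the whole point of the refactor: callers no longer carry blk_per_bucket arrays of buffer_heads around.
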
3439 4005
3440/* 4006/*
3441 * Copy one xattr cluster from src_blk to to_blk. 4007 * src_blk points to the start of an existing extent. last_blk points to
3442 * The to_blk will become the first bucket header of the cluster, so its 4008 * last cluster in that extent. to_blk points to a newly allocated
3443 * xh_num_buckets will be initialized as the bucket num in the cluster. 4009 * extent. We copy the buckets from the cluster at last_blk to the new
4010 * extent. If start_bucket is non-zero, we skip that many buckets before
4011 * we start copying. The new extent's xh_num_buckets gets set to the
4012 * number of buckets we copied. The old extent's xh_num_buckets shrinks
4013 * by the same amount.
3444 */ 4014 */
3445static int ocfs2_cp_xattr_cluster(struct inode *inode, 4015static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
3446 handle_t *handle, 4016 u64 src_blk, u64 last_blk, u64 to_blk,
3447 struct buffer_head *first_bh, 4017 unsigned int start_bucket,
3448 u64 src_blk,
3449 u64 to_blk,
3450 u32 *first_hash) 4018 u32 *first_hash)
3451{ 4019{
3452 int i, ret, credits; 4020 int i, ret, credits;
3453 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 4021 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3454 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); 4022 int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3455 int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); 4023 int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
3456 struct buffer_head *bh = NULL; 4024 struct ocfs2_xattr_bucket *old_first, *new_first;
3457 struct ocfs2_xattr_header *xh; 4025
3458 u64 to_blk_start = to_blk; 4026 mlog(0, "mv xattrs from cluster %llu to %llu\n",
4027 (unsigned long long)last_blk, (unsigned long long)to_blk);
4028
4029 BUG_ON(start_bucket >= num_buckets);
4030 if (start_bucket) {
4031 num_buckets -= start_bucket;
4032 last_blk += (start_bucket * blks_per_bucket);
4033 }
4034
4035 /* The first bucket of the original extent */
4036 old_first = ocfs2_xattr_bucket_new(inode);
4037 /* The first bucket of the new extent */
4038 new_first = ocfs2_xattr_bucket_new(inode);
4039 if (!old_first || !new_first) {
4040 ret = -ENOMEM;
4041 mlog_errno(ret);
4042 goto out;
4043 }
3459 4044
3460 mlog(0, "cp xattrs from cluster %llu to %llu\n", 4045 ret = ocfs2_read_xattr_bucket(old_first, src_blk);
3461 (unsigned long long)src_blk, (unsigned long long)to_blk); 4046 if (ret) {
4047 mlog_errno(ret);
4048 goto out;
4049 }
3462 4050
3463 /* 4051 /*
3464 * We need to update the new cluster and 1 more for the update of 4052 * We need to update the first bucket of the old extent and all
3465 * the 1st bucket of the previous extent rec. 4053 * the buckets going to the new extent.
3466 */ 4054 */
3467 credits = bpc + 1; 4055 credits = ((num_buckets + 1) * blks_per_bucket) +
4056 handle->h_buffer_credits;
3468 ret = ocfs2_extend_trans(handle, credits); 4057 ret = ocfs2_extend_trans(handle, credits);
3469 if (ret) { 4058 if (ret) {
3470 mlog_errno(ret); 4059 mlog_errno(ret);
3471 goto out; 4060 goto out;
3472 } 4061 }
3473 4062
3474 ret = ocfs2_journal_access(handle, inode, first_bh, 4063 ret = ocfs2_xattr_bucket_journal_access(handle, old_first,
3475 OCFS2_JOURNAL_ACCESS_WRITE); 4064 OCFS2_JOURNAL_ACCESS_WRITE);
3476 if (ret) { 4065 if (ret) {
3477 mlog_errno(ret); 4066 mlog_errno(ret);
3478 goto out; 4067 goto out;
@@ -3480,45 +4069,45 @@ static int ocfs2_cp_xattr_cluster(struct inode *inode,
3480 4069
3481 for (i = 0; i < num_buckets; i++) { 4070 for (i = 0; i < num_buckets; i++) {
3482 ret = ocfs2_cp_xattr_bucket(inode, handle, 4071 ret = ocfs2_cp_xattr_bucket(inode, handle,
3483 src_blk, to_blk, 1); 4072 last_blk + (i * blks_per_bucket),
4073 to_blk + (i * blks_per_bucket),
4074 1);
3484 if (ret) { 4075 if (ret) {
3485 mlog_errno(ret); 4076 mlog_errno(ret);
3486 goto out; 4077 goto out;
3487 } 4078 }
3488
3489 src_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3490 to_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3491 } 4079 }
3492 4080
3493 /* update the old bucket header. */ 4081 /*
3494 xh = (struct ocfs2_xattr_header *)first_bh->b_data; 4082 * Get the new bucket ready before we dirty anything
3495 le16_add_cpu(&xh->xh_num_buckets, -num_buckets); 4083 * (This actually shouldn't fail, because we already dirtied
3496 4084 * it once in ocfs2_cp_xattr_bucket()).
3497 ocfs2_journal_dirty(handle, first_bh); 4085 */
3498 4086 ret = ocfs2_read_xattr_bucket(new_first, to_blk);
3499 /* update the new bucket header. */ 4087 if (ret) {
3500 ret = ocfs2_read_block(inode, to_blk_start, &bh);
3501 if (ret < 0) {
3502 mlog_errno(ret); 4088 mlog_errno(ret);
3503 goto out; 4089 goto out;
3504 } 4090 }
3505 4091 ret = ocfs2_xattr_bucket_journal_access(handle, new_first,
3506 ret = ocfs2_journal_access(handle, inode, bh, 4092 OCFS2_JOURNAL_ACCESS_WRITE);
3507 OCFS2_JOURNAL_ACCESS_WRITE);
3508 if (ret) { 4093 if (ret) {
3509 mlog_errno(ret); 4094 mlog_errno(ret);
3510 goto out; 4095 goto out;
3511 } 4096 }
3512 4097
3513 xh = (struct ocfs2_xattr_header *)bh->b_data; 4098 /* Now update the headers */
3514 xh->xh_num_buckets = cpu_to_le16(num_buckets); 4099 le16_add_cpu(&bucket_xh(old_first)->xh_num_buckets, -num_buckets);
4100 ocfs2_xattr_bucket_journal_dirty(handle, old_first);
3515 4101
3516 ocfs2_journal_dirty(handle, bh); 4102 bucket_xh(new_first)->xh_num_buckets = cpu_to_le16(num_buckets);
4103 ocfs2_xattr_bucket_journal_dirty(handle, new_first);
3517 4104
3518 if (first_hash) 4105 if (first_hash)
3519 *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); 4106 *first_hash = le32_to_cpu(bucket_xh(new_first)->xh_entries[0].xe_name_hash);
4107
3520out: 4108out:
3521 brelse(bh); 4109 ocfs2_xattr_bucket_free(new_first);
4110 ocfs2_xattr_bucket_free(old_first);
3522 return ret; 4111 return ret;
3523} 4112}
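
A user-space sketch of the credit accounting in ocfs2_mv_xattr_buckets above: the move dirties every bucket landing in the new extent plus a first-bucket header for each extent, so the transaction is extended to cover those blocks on top of whatever credits the handle already holds. The numbers below are illustrative only.

#include <stdio.h>

int main(void)
{
        int blks_per_bucket = 2;        /* assumed geometry */
        int num_buckets = 4;            /* buckets being moved */
        int h_buffer_credits = 5;       /* credits already in the handle */

        /* (moved buckets + one first-bucket) worth of blocks, added to
         * what the handle already has, mirroring the code above. */
        int credits = (num_buckets + 1) * blks_per_bucket +
                      h_buffer_credits;

        printf("extend transaction to %d credits\n", credits);
        return 0;
}

Adding h_buffer_credits matters because the handle is shared with the caller now; extending to an absolute count would silently drop credits the caller still needs.
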
3524 4113
@@ -3534,7 +4123,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
3534 u32 *first_hash) 4123 u32 *first_hash)
3535{ 4124{
3536 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); 4125 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3537 int ret, credits = 2 * blk_per_bucket; 4126 int ret, credits = 2 * blk_per_bucket + handle->h_buffer_credits;
3538 4127
3539 BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize); 4128 BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
3540 4129
@@ -3577,43 +4166,49 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
3577 */ 4166 */
3578static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode, 4167static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
3579 handle_t *handle, 4168 handle_t *handle,
3580 struct buffer_head **first_bh, 4169 struct ocfs2_xattr_bucket *first,
3581 struct buffer_head **header_bh, 4170 struct ocfs2_xattr_bucket *target,
3582 u64 new_blk, 4171 u64 new_blk,
3583 u64 prev_blk,
3584 u32 prev_clusters, 4172 u32 prev_clusters,
3585 u32 *v_start, 4173 u32 *v_start,
3586 int *extend) 4174 int *extend)
3587{ 4175{
3588 int ret = 0; 4176 int ret;
3589 int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
3590 4177
3591 mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n", 4178 mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n",
3592 (unsigned long long)prev_blk, prev_clusters, 4179 (unsigned long long)bucket_blkno(first), prev_clusters,
3593 (unsigned long long)new_blk); 4180 (unsigned long long)new_blk);
3594 4181
3595 if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) 4182 if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) {
3596 ret = ocfs2_mv_xattr_bucket_cross_cluster(inode, 4183 ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
3597 handle, 4184 handle,
3598 first_bh, 4185 first, target,
3599 header_bh,
3600 new_blk, 4186 new_blk,
3601 prev_blk,
3602 prev_clusters, 4187 prev_clusters,
3603 v_start); 4188 v_start);
3604 else { 4189 if (ret)
3605 u64 last_blk = prev_blk + bpc * (prev_clusters - 1); 4190 mlog_errno(ret);
3606 4191 } else {
3607 if (prev_clusters > 1 && (*header_bh)->b_blocknr != last_blk) 4192 /* The start of the last cluster in the first extent */
3608 ret = ocfs2_cp_xattr_cluster(inode, handle, *first_bh, 4193 u64 last_blk = bucket_blkno(first) +
3609 last_blk, new_blk, 4194 ((prev_clusters - 1) *
4195 ocfs2_clusters_to_blocks(inode->i_sb, 1));
4196
4197 if (prev_clusters > 1 && bucket_blkno(target) != last_blk) {
4198 ret = ocfs2_mv_xattr_buckets(inode, handle,
4199 bucket_blkno(first),
4200 last_blk, new_blk, 0,
3610 v_start); 4201 v_start);
3611 else { 4202 if (ret)
4203 mlog_errno(ret);
4204 } else {
3612 ret = ocfs2_divide_xattr_cluster(inode, handle, 4205 ret = ocfs2_divide_xattr_cluster(inode, handle,
3613 last_blk, new_blk, 4206 last_blk, new_blk,
3614 v_start); 4207 v_start);
4208 if (ret)
4209 mlog_errno(ret);
3615 4210
3616 if ((*header_bh)->b_blocknr == last_blk && extend) 4211 if ((bucket_blkno(target) == last_blk) && extend)
3617 *extend = 0; 4212 *extend = 0;
3618 } 4213 }
3619 } 4214 }
@@ -3639,56 +4234,37 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
3639 */ 4234 */
3640static int ocfs2_add_new_xattr_cluster(struct inode *inode, 4235static int ocfs2_add_new_xattr_cluster(struct inode *inode,
3641 struct buffer_head *root_bh, 4236 struct buffer_head *root_bh,
3642 struct buffer_head **first_bh, 4237 struct ocfs2_xattr_bucket *first,
3643 struct buffer_head **header_bh, 4238 struct ocfs2_xattr_bucket *target,
3644 u32 *num_clusters, 4239 u32 *num_clusters,
3645 u32 prev_cpos, 4240 u32 prev_cpos,
3646 u64 prev_blkno, 4241 int *extend,
3647 int *extend) 4242 struct ocfs2_xattr_set_ctxt *ctxt)
3648{ 4243{
3649 int ret, credits; 4244 int ret;
3650 u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); 4245 u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
3651 u32 prev_clusters = *num_clusters; 4246 u32 prev_clusters = *num_clusters;
3652 u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0; 4247 u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0;
3653 u64 block; 4248 u64 block;
3654 handle_t *handle = NULL; 4249 handle_t *handle = ctxt->handle;
3655 struct ocfs2_alloc_context *data_ac = NULL;
3656 struct ocfs2_alloc_context *meta_ac = NULL;
3657 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 4250 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3658 struct ocfs2_extent_tree et; 4251 struct ocfs2_extent_tree et;
3659 4252
3660 mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, " 4253 mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
3661 "previous xattr blkno = %llu\n", 4254 "previous xattr blkno = %llu\n",
3662 (unsigned long long)OCFS2_I(inode)->ip_blkno, 4255 (unsigned long long)OCFS2_I(inode)->ip_blkno,
3663 prev_cpos, (unsigned long long)prev_blkno); 4256 prev_cpos, (unsigned long long)bucket_blkno(first));
3664 4257
3665 ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh); 4258 ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
3666 4259
3667 ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, 4260 ret = ocfs2_journal_access_xb(handle, inode, root_bh,
3668 &data_ac, &meta_ac); 4261 OCFS2_JOURNAL_ACCESS_WRITE);
3669 if (ret) {
3670 mlog_errno(ret);
3671 goto leave;
3672 }
3673
3674 credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
3675 clusters_to_add);
3676 handle = ocfs2_start_trans(osb, credits);
3677 if (IS_ERR(handle)) {
3678 ret = PTR_ERR(handle);
3679 handle = NULL;
3680 mlog_errno(ret);
3681 goto leave;
3682 }
3683
3684 ret = ocfs2_journal_access(handle, inode, root_bh,
3685 OCFS2_JOURNAL_ACCESS_WRITE);
3686 if (ret < 0) { 4262 if (ret < 0) {
3687 mlog_errno(ret); 4263 mlog_errno(ret);
3688 goto leave; 4264 goto leave;
3689 } 4265 }
3690 4266
3691 ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 4267 ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 1,
3692 clusters_to_add, &bit_off, &num_bits); 4268 clusters_to_add, &bit_off, &num_bits);
3693 if (ret < 0) { 4269 if (ret < 0) {
3694 if (ret != -ENOSPC) 4270 if (ret != -ENOSPC)
@@ -3702,7 +4278,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
3702 mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n", 4278 mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n",
3703 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); 4279 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
3704 4280
3705 if (prev_blkno + prev_clusters * bpc == block && 4281 if (bucket_blkno(first) + (prev_clusters * bpc) == block &&
3706 (prev_clusters + num_bits) << osb->s_clustersize_bits <= 4282 (prev_clusters + num_bits) << osb->s_clustersize_bits <=
3707 OCFS2_MAX_XATTR_TREE_LEAF_SIZE) { 4283 OCFS2_MAX_XATTR_TREE_LEAF_SIZE) {
3708 /* 4284 /*
@@ -3721,10 +4297,9 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
3721 } else { 4297 } else {
3722 ret = ocfs2_adjust_xattr_cross_cluster(inode, 4298 ret = ocfs2_adjust_xattr_cross_cluster(inode,
3723 handle, 4299 handle,
3724 first_bh, 4300 first,
3725 header_bh, 4301 target,
3726 block, 4302 block,
3727 prev_blkno,
3728 prev_clusters, 4303 prev_clusters,
3729 &v_start, 4304 &v_start,
3730 extend); 4305 extend);
@@ -3734,149 +4309,137 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
3734 } 4309 }
3735 } 4310 }
3736 4311
3737 if (handle->h_buffer_credits < credits) {
3738 /*
3739 * The journal has been restarted before, and don't
3740 * have enough space for the insertion, so extend it
3741 * here.
3742 */
3743 ret = ocfs2_extend_trans(handle, credits);
3744 if (ret) {
3745 mlog_errno(ret);
3746 goto leave;
3747 }
3748 }
3749 mlog(0, "Insert %u clusters at block %llu for xattr at %u\n", 4312 mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
3750 num_bits, (unsigned long long)block, v_start); 4313 num_bits, (unsigned long long)block, v_start);
3751 ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block, 4314 ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block,
3752 num_bits, 0, meta_ac); 4315 num_bits, 0, ctxt->meta_ac);
3753 if (ret < 0) { 4316 if (ret < 0) {
3754 mlog_errno(ret); 4317 mlog_errno(ret);
3755 goto leave; 4318 goto leave;
3756 } 4319 }
3757 4320
3758 ret = ocfs2_journal_dirty(handle, root_bh); 4321 ret = ocfs2_journal_dirty(handle, root_bh);
3759 if (ret < 0) { 4322 if (ret < 0)
3760 mlog_errno(ret); 4323 mlog_errno(ret);
3761 goto leave;
3762 }
3763 4324
3764leave: 4325leave:
3765 if (handle)
3766 ocfs2_commit_trans(osb, handle);
3767 if (data_ac)
3768 ocfs2_free_alloc_context(data_ac);
3769 if (meta_ac)
3770 ocfs2_free_alloc_context(meta_ac);
3771
3772 return ret; 4326 return ret;
3773} 4327}
3774 4328
3775/* 4329/*
3776 * Extend a new xattr bucket and move xattrs to the end one by one until 4330 * We are given an extent. 'first' is the bucket at the very front of
3777 * we meet start_bh. Only move half of the xattrs to the bucket after it. 4331 * the extent. The extent has space for an additional bucket past
4332 * bucket_xh(first)->xh_num_buckets. 'target_blkno' is the block number
4333 * of the target bucket. We wish to shift every bucket past the target
4334 * down one, filling in that additional space. When we get back to the
4335 * target, we split the target between itself and the now-empty bucket
4336 * at target+1 (aka, target_blkno + blks_per_bucket).
3778 */ 4337 */
3779static int ocfs2_extend_xattr_bucket(struct inode *inode, 4338static int ocfs2_extend_xattr_bucket(struct inode *inode,
3780 struct buffer_head *first_bh, 4339 handle_t *handle,
3781 struct buffer_head *start_bh, 4340 struct ocfs2_xattr_bucket *first,
4341 u64 target_blk,
3782 u32 num_clusters) 4342 u32 num_clusters)
3783{ 4343{
3784 int ret, credits; 4344 int ret, credits;
3785 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 4345 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3786 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); 4346 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
3787 u64 start_blk = start_bh->b_blocknr, end_blk; 4347 u64 end_blk;
3788 u32 num_buckets = num_clusters * ocfs2_xattr_buckets_per_cluster(osb); 4348 u16 new_bucket = le16_to_cpu(bucket_xh(first)->xh_num_buckets);
3789 handle_t *handle;
3790 struct ocfs2_xattr_header *first_xh =
3791 (struct ocfs2_xattr_header *)first_bh->b_data;
3792 u16 bucket = le16_to_cpu(first_xh->xh_num_buckets);
3793 4349
3794 mlog(0, "extend xattr bucket in %llu, xattr extend rec starting " 4350 mlog(0, "extend xattr bucket in %llu, xattr extend rec starting "
3795 "from %llu, len = %u\n", (unsigned long long)start_blk, 4351 "from %llu, len = %u\n", (unsigned long long)target_blk,
3796 (unsigned long long)first_bh->b_blocknr, num_clusters); 4352 (unsigned long long)bucket_blkno(first), num_clusters);
3797 4353
3798 BUG_ON(bucket >= num_buckets); 4354 /* The extent must have room for an additional bucket */
4355 BUG_ON(new_bucket >=
4356 (num_clusters * ocfs2_xattr_buckets_per_cluster(osb)));
3799 4357
3800 end_blk = first_bh->b_blocknr + (bucket - 1) * blk_per_bucket; 4358 /* end_blk points to the last existing bucket */
4359 end_blk = bucket_blkno(first) + ((new_bucket - 1) * blk_per_bucket);
3801 4360
3802 /* 4361 /*
3803 * We will touch all the buckets after the start_bh(include it). 4362 * end_blk is the start of the last existing bucket.
3804 * Add one more bucket and modify the first_bh. 4363 * Thus, (end_blk - target_blk) covers the target bucket and
4364 * every bucket after it up to, but not including, the last
4365 * existing bucket. Then we add the last existing bucket, the
4366 * new bucket, and the first bucket (3 * blk_per_bucket).
3805 */ 4367 */
3806 credits = end_blk - start_blk + 2 * blk_per_bucket + 1; 4368 credits = (end_blk - target_blk) + (3 * blk_per_bucket) +
3807 handle = ocfs2_start_trans(osb, credits); 4369 handle->h_buffer_credits;
3808 if (IS_ERR(handle)) { 4370 ret = ocfs2_extend_trans(handle, credits);
3809 ret = PTR_ERR(handle); 4371 if (ret) {
3810 handle = NULL;
3811 mlog_errno(ret); 4372 mlog_errno(ret);
3812 goto out; 4373 goto out;
3813 } 4374 }
3814 4375
3815 ret = ocfs2_journal_access(handle, inode, first_bh, 4376 ret = ocfs2_xattr_bucket_journal_access(handle, first,
3816 OCFS2_JOURNAL_ACCESS_WRITE); 4377 OCFS2_JOURNAL_ACCESS_WRITE);
3817 if (ret) { 4378 if (ret) {
3818 mlog_errno(ret); 4379 mlog_errno(ret);
3819 goto commit; 4380 goto out;
3820 } 4381 }
3821 4382
3822 while (end_blk != start_blk) { 4383 while (end_blk != target_blk) {
3823 ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk, 4384 ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk,
3824 end_blk + blk_per_bucket, 0); 4385 end_blk + blk_per_bucket, 0);
3825 if (ret) 4386 if (ret)
3826 goto commit; 4387 goto out;
3827 end_blk -= blk_per_bucket; 4388 end_blk -= blk_per_bucket;
3828 } 4389 }
3829 4390
3830 /* Move half of the xattr in start_blk to the next bucket. */ 4391 /* Move half of the xattr in target_blkno to the next bucket. */
3831 ret = ocfs2_divide_xattr_bucket(inode, handle, start_blk, 4392 ret = ocfs2_divide_xattr_bucket(inode, handle, target_blk,
3832 start_blk + blk_per_bucket, NULL, 0); 4393 target_blk + blk_per_bucket, NULL, 0);
3833 4394
3834 le16_add_cpu(&first_xh->xh_num_buckets, 1); 4395 le16_add_cpu(&bucket_xh(first)->xh_num_buckets, 1);
3835 ocfs2_journal_dirty(handle, first_bh); 4396 ocfs2_xattr_bucket_journal_dirty(handle, first);
3836 4397
3837commit:
3838 ocfs2_commit_trans(osb, handle);
3839out: 4398out:
3840 return ret; 4399 return ret;
3841} 4400}
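
A user-space sketch of the shift-down loop in ocfs2_extend_xattr_bucket above: starting from the last existing bucket, each bucket is copied one slot toward the end until the target is reached, leaving an empty slot at target+1 for the split. Array slots stand in for on-disk buckets.

#include <stdio.h>
#include <string.h>

int main(void)
{
        char buckets[8][8] = { "A", "B", "C", "D" };
        int num = 4, target = 1;        /* insert next to bucket "B" */

        /* Walk backwards so each copy lands in a free or stale slot. */
        for (int end = num - 1; end != target; end--)
                memcpy(buckets[end + 1], buckets[end], sizeof(buckets[0]));

        /* target+1 is now free; the real code splits half of the
         * target's xattrs into it via ocfs2_divide_xattr_bucket. */
        memcpy(buckets[target + 1], "B2", 3);
        for (int i = 0; i <= num; i++)
                printf("%s ", buckets[i]);
        printf("\n");
        return 0;
}

Walking from end_blk back toward the target is also why the credit estimate above is proportional to (end_blk - target_blk): every bucket in that span gets journaled.
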
3842 4401
3843/* 4402/*
3844 * Add new xattr bucket in an extent record and adjust the buckets accordingly. 4403 * Add new xattr bucket in an extent record and adjust the buckets
3845 * xb_bh is the ocfs2_xattr_block. 4404 * accordingly. xb_bh is the ocfs2_xattr_block, and target is the
3846 * We will move all the buckets starting from header_bh to the next place. As 4405 * bucket we want to insert into.
3847 * for this one, half of its xattrs will be moved to the next one. 4407 * In the easy case, we will move all the buckets after target down by
4407 * In the easy case, we will move all the buckets after target down by
4408 * one. Half of target's xattrs will be moved to the next bucket.
3848 * 4409 *
3849 * We will allocate a new cluster if current cluster is full and adjust 4410 * If current cluster is full, we'll allocate a new one. This may not
3850 * header_bh and first_bh if the insert place is moved to the new cluster. 4411 * be contiguous. The underlying calls will make sure that there is
4412 * space for the insert, shifting buckets around if necessary.
4413 * 'target' may be moved by those calls.
3851 */ 4414 */
3852static int ocfs2_add_new_xattr_bucket(struct inode *inode, 4415static int ocfs2_add_new_xattr_bucket(struct inode *inode,
3853 struct buffer_head *xb_bh, 4416 struct buffer_head *xb_bh,
3854 struct buffer_head *header_bh) 4417 struct ocfs2_xattr_bucket *target,
4418 struct ocfs2_xattr_set_ctxt *ctxt)
3855{ 4419{
3856 struct ocfs2_xattr_header *first_xh = NULL;
3857 struct buffer_head *first_bh = NULL;
3858 struct ocfs2_xattr_block *xb = 4420 struct ocfs2_xattr_block *xb =
3859 (struct ocfs2_xattr_block *)xb_bh->b_data; 4421 (struct ocfs2_xattr_block *)xb_bh->b_data;
3860 struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root; 4422 struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
3861 struct ocfs2_extent_list *el = &xb_root->xt_list; 4423 struct ocfs2_extent_list *el = &xb_root->xt_list;
3862 struct ocfs2_xattr_header *xh = 4424 u32 name_hash =
3863 (struct ocfs2_xattr_header *)header_bh->b_data; 4425 le32_to_cpu(bucket_xh(target)->xh_entries[0].xe_name_hash);
3864 u32 name_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); 4426 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3865 struct super_block *sb = inode->i_sb;
3866 struct ocfs2_super *osb = OCFS2_SB(sb);
3867 int ret, num_buckets, extend = 1; 4427 int ret, num_buckets, extend = 1;
3868 u64 p_blkno; 4428 u64 p_blkno;
3869 u32 e_cpos, num_clusters; 4429 u32 e_cpos, num_clusters;
4430 /* The bucket at the front of the extent */
4431 struct ocfs2_xattr_bucket *first;
3870 4432
3871 mlog(0, "Add new xattr bucket starting form %llu\n", 4433 mlog(0, "Add new xattr bucket starting from %llu\n",
3872 (unsigned long long)header_bh->b_blocknr); 4434 (unsigned long long)bucket_blkno(target));
3873 4435
3874 /* 4436 /* The first bucket of the original extent */
3875 * Add a reference for header_bh here because it may be 4437 first = ocfs2_xattr_bucket_new(inode);
3876 * changed in ocfs2_add_new_xattr_cluster and we need 4438 if (!first) {
3877 * to free it in the end. 4439 ret = -ENOMEM;
3878 */ 4440 mlog_errno(ret);
3879 get_bh(header_bh); 4441 goto out;
4442 }
3880 4443
3881 ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos, 4444 ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos,
3882 &num_clusters, el); 4445 &num_clusters, el);
@@ -3885,40 +4448,45 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode,
3885 goto out; 4448 goto out;
3886 } 4449 }
3887 4450
3888 ret = ocfs2_read_block(inode, p_blkno, &first_bh); 4451 ret = ocfs2_read_xattr_bucket(first, p_blkno);
3889 if (ret) { 4452 if (ret) {
3890 mlog_errno(ret); 4453 mlog_errno(ret);
3891 goto out; 4454 goto out;
3892 } 4455 }
3893 4456
3894 num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters; 4457 num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters;
3895 first_xh = (struct ocfs2_xattr_header *)first_bh->b_data; 4458 if (num_buckets == le16_to_cpu(bucket_xh(first)->xh_num_buckets)) {
3896 4459 /*
3897 if (num_buckets == le16_to_cpu(first_xh->xh_num_buckets)) { 4460 * This can move first+target if the target bucket moves
4461 * to the new extent.
4462 */
3898 ret = ocfs2_add_new_xattr_cluster(inode, 4463 ret = ocfs2_add_new_xattr_cluster(inode,
3899 xb_bh, 4464 xb_bh,
3900 &first_bh, 4465 first,
3901 &header_bh, 4466 target,
3902 &num_clusters, 4467 &num_clusters,
3903 e_cpos, 4468 e_cpos,
3904 p_blkno, 4469 &extend,
3905 &extend); 4470 ctxt);
3906 if (ret) { 4471 if (ret) {
3907 mlog_errno(ret); 4472 mlog_errno(ret);
3908 goto out; 4473 goto out;
3909 } 4474 }
3910 } 4475 }
3911 4476
3912 if (extend) 4477 if (extend) {
3913 ret = ocfs2_extend_xattr_bucket(inode, 4478 ret = ocfs2_extend_xattr_bucket(inode,
3914 first_bh, 4479 ctxt->handle,
3915 header_bh, 4480 first,
4481 bucket_blkno(target),
3916 num_clusters); 4482 num_clusters);
3917 if (ret) 4483 if (ret)
3918 mlog_errno(ret); 4484 mlog_errno(ret);
4485 }
4486
3919out: 4487out:
3920 brelse(first_bh); 4488 ocfs2_xattr_bucket_free(first);
3921 brelse(header_bh); 4489
3922 return ret; 4490 return ret;
3923} 4491}
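
A user-space sketch of the top-level flow in ocfs2_add_new_xattr_bucket above: if the extent is already full of buckets, grow it by a cluster first (which may relocate the target); unless that call placed the target at the end of the new extent, shift buckets to open a slot next to the target. All names below are stand-ins.

#include <stdio.h>

struct extent { int num_buckets, capacity; };

static int add_new_cluster(struct extent *e, int *extend)
{
        e->capacity += 4;       /* assumed buckets per cluster */
        /* In the real code, *extend is cleared only when the target
         * bucket ended up as the last bucket of the new extent. */
        (void)extend;
        return 0;
}

static int add_new_bucket(struct extent *e)
{
        int extend = 1;

        if (e->num_buckets == e->capacity && add_new_cluster(e, &extend))
                return -1;
        if (extend)
                e->num_buckets++;   /* shift + split opens the slot */
        return 0;
}

int main(void)
{
        struct extent e = { 4, 4 };

        if (!add_new_bucket(&e))
                printf("buckets=%d capacity=%d\n", e.num_buckets,
                       e.capacity);
        return 0;
}
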
3924 4492
@@ -3929,7 +4497,7 @@ static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
3929 int block_off = offs >> inode->i_sb->s_blocksize_bits; 4497 int block_off = offs >> inode->i_sb->s_blocksize_bits;
3930 4498
3931 offs = offs % inode->i_sb->s_blocksize; 4499 offs = offs % inode->i_sb->s_blocksize;
3932 return bucket->bhs[block_off]->b_data + offs; 4500 return bucket_block(bucket, block_off) + offs;
3933} 4501}
3934 4502
3935/* 4503/*
@@ -3984,7 +4552,7 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
3984 xe->xe_value_size = 0; 4552 xe->xe_value_size = 0;
3985 4553
3986 val = ocfs2_xattr_bucket_get_val(inode, 4554 val = ocfs2_xattr_bucket_get_val(inode,
3987 &xs->bucket, offs); 4555 xs->bucket, offs);
3988 memset(val + OCFS2_XATTR_SIZE(name_len), 0, 4556 memset(val + OCFS2_XATTR_SIZE(name_len), 0,
3989 size - OCFS2_XATTR_SIZE(name_len)); 4557 size - OCFS2_XATTR_SIZE(name_len));
3990 if (OCFS2_XATTR_SIZE(xi->value_len) > 0) 4558 if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
@@ -4062,8 +4630,7 @@ set_new_name_value:
4062 xh->xh_free_start = cpu_to_le16(offs); 4630 xh->xh_free_start = cpu_to_le16(offs);
4063 } 4631 }
4064 4632
4065 val = ocfs2_xattr_bucket_get_val(inode, 4633 val = ocfs2_xattr_bucket_get_val(inode, xs->bucket, offs - size);
4066 &xs->bucket, offs - size);
4067 xe->xe_name_offset = cpu_to_le16(offs - size); 4634 xe->xe_name_offset = cpu_to_le16(offs - size);
4068 4635
4069 memset(val, 0, size); 4636 memset(val, 0, size);
@@ -4079,125 +4646,45 @@ set_new_name_value:
4079 return; 4646 return;
4080} 4647}
4081 4648
4082static int ocfs2_xattr_bucket_handle_journal(struct inode *inode,
4083 handle_t *handle,
4084 struct ocfs2_xattr_search *xs,
4085 struct buffer_head **bhs,
4086 u16 bh_num)
4087{
4088 int ret = 0, off, block_off;
4089 struct ocfs2_xattr_entry *xe = xs->here;
4090
4091 /*
4092 * First calculate all the blocks we should journal_access
4093 * and journal_dirty. The first block should always be touched.
4094 */
4095 ret = ocfs2_journal_dirty(handle, bhs[0]);
4096 if (ret)
4097 mlog_errno(ret);
4098
4099 /* calc the data. */
4100 off = le16_to_cpu(xe->xe_name_offset);
4101 block_off = off >> inode->i_sb->s_blocksize_bits;
4102 ret = ocfs2_journal_dirty(handle, bhs[block_off]);
4103 if (ret)
4104 mlog_errno(ret);
4105
4106 return ret;
4107}
4108
4109/* 4649/*
4110 * Set the xattr entry in the specified bucket. 4650 * Set the xattr entry in the specified bucket.
4111 * The bucket is indicated by xs->bucket and it should have enough 4651 * The bucket is indicated by xs->bucket and it should have enough
4112 * space for the xattr insertion. 4652 * space for the xattr insertion.
4113 */ 4653 */
4114static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode, 4654static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
4655 handle_t *handle,
4115 struct ocfs2_xattr_info *xi, 4656 struct ocfs2_xattr_info *xi,
4116 struct ocfs2_xattr_search *xs, 4657 struct ocfs2_xattr_search *xs,
4117 u32 name_hash, 4658 u32 name_hash,
4118 int local) 4659 int local)
4119{ 4660{
4120 int i, ret; 4661 int ret;
4121 handle_t *handle = NULL; 4662 u64 blkno;
4122 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
4123 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4124 4663
4125 mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n", 4664 mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
4126 (unsigned long)xi->value_len, xi->name_index, 4665 (unsigned long)xi->value_len, xi->name_index,
4127 (unsigned long long)xs->bucket.bhs[0]->b_blocknr); 4666 (unsigned long long)bucket_blkno(xs->bucket));
4128 4667
4129 if (!xs->bucket.bhs[1]) { 4668 if (!xs->bucket->bu_bhs[1]) {
4130 ret = ocfs2_read_blocks(inode, 4669 blkno = bucket_blkno(xs->bucket);
4131 xs->bucket.bhs[0]->b_blocknr + 1, 4670 ocfs2_xattr_bucket_relse(xs->bucket);
4132 blk_per_bucket - 1, &xs->bucket.bhs[1], 4671 ret = ocfs2_read_xattr_bucket(xs->bucket, blkno);
4133 0);
4134 if (ret) { 4672 if (ret) {
4135 mlog_errno(ret); 4673 mlog_errno(ret);
4136 goto out; 4674 goto out;
4137 } 4675 }
4138 } 4676 }
4139 4677
4140 handle = ocfs2_start_trans(osb, blk_per_bucket); 4678 ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
4141 if (IS_ERR(handle)) { 4679 OCFS2_JOURNAL_ACCESS_WRITE);
4142 ret = PTR_ERR(handle); 4680 if (ret < 0) {
4143 handle = NULL;
4144 mlog_errno(ret); 4681 mlog_errno(ret);
4145 goto out; 4682 goto out;
4146 } 4683 }
4147 4684
4148 for (i = 0; i < blk_per_bucket; i++) {
4149 ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[i],
4150 OCFS2_JOURNAL_ACCESS_WRITE);
4151 if (ret < 0) {
4152 mlog_errno(ret);
4153 goto out;
4154 }
4155 }
4156
4157 ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local); 4685 ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
4686 ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
4158 4687
4159 /*Only dirty the blocks we have touched in set xattr. */
4160 ret = ocfs2_xattr_bucket_handle_journal(inode, handle, xs,
4161 xs->bucket.bhs, blk_per_bucket);
4162 if (ret)
4163 mlog_errno(ret);
4164out:
4165 ocfs2_commit_trans(osb, handle);
4166
4167 return ret;
4168}
4169
4170static int ocfs2_xattr_value_update_size(struct inode *inode,
4171 struct buffer_head *xe_bh,
4172 struct ocfs2_xattr_entry *xe,
4173 u64 new_size)
4174{
4175 int ret;
4176 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
4177 handle_t *handle = NULL;
4178
4179 handle = ocfs2_start_trans(osb, 1);
4180 if (IS_ERR(handle)) {
4181 ret = -ENOMEM;
4182 mlog_errno(ret);
4183 goto out;
4184 }
4185
4186 ret = ocfs2_journal_access(handle, inode, xe_bh,
4187 OCFS2_JOURNAL_ACCESS_WRITE);
4188 if (ret < 0) {
4189 mlog_errno(ret);
4190 goto out_commit;
4191 }
4192
4193 xe->xe_value_size = cpu_to_le64(new_size);
4194
4195 ret = ocfs2_journal_dirty(handle, xe_bh);
4196 if (ret < 0)
4197 mlog_errno(ret);
4198
4199out_commit:
4200 ocfs2_commit_trans(osb, handle);
4201out: 4688out:
4202 return ret; 4689 return ret;
4203} 4690}
@@ -4210,18 +4697,19 @@ out:
4210 * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed. 4697 * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed.
4211 */ 4698 */
4212static int ocfs2_xattr_bucket_value_truncate(struct inode *inode, 4699static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
4213 struct buffer_head *header_bh, 4700 struct ocfs2_xattr_bucket *bucket,
4214 int xe_off, 4701 int xe_off,
4215 int len) 4702 int len,
4703 struct ocfs2_xattr_set_ctxt *ctxt)
4216{ 4704{
4217 int ret, offset; 4705 int ret, offset;
4218 u64 value_blk; 4706 u64 value_blk;
4219 struct buffer_head *value_bh = NULL;
4220 struct ocfs2_xattr_value_root *xv;
4221 struct ocfs2_xattr_entry *xe; 4707 struct ocfs2_xattr_entry *xe;
4222 struct ocfs2_xattr_header *xh = 4708 struct ocfs2_xattr_header *xh = bucket_xh(bucket);
4223 (struct ocfs2_xattr_header *)header_bh->b_data;
4224 size_t blocksize = inode->i_sb->s_blocksize; 4709 size_t blocksize = inode->i_sb->s_blocksize;
4710 struct ocfs2_xattr_value_buf vb = {
4711 .vb_access = ocfs2_journal_access,
4712 };
4225 4713
4226 xe = &xh->xh_entries[xe_off]; 4714 xe = &xh->xh_entries[xe_off];
4227 4715
@@ -4234,49 +4722,58 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
4234 4722
4235 /* We don't allow ocfs2_xattr_value to be stored in different block. */ 4723 /* We don't allow ocfs2_xattr_value to be stored in different block. */
4236 BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize); 4724 BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize);
4237 value_blk += header_bh->b_blocknr;
4238 4725
4239 ret = ocfs2_read_block(inode, value_blk, &value_bh); 4726 vb.vb_bh = bucket->bu_bhs[value_blk];
4240 if (ret) { 4727 BUG_ON(!vb.vb_bh);
4241 mlog_errno(ret);
4242 goto out;
4243 }
4244 4728
4245 xv = (struct ocfs2_xattr_value_root *) 4729 vb.vb_xv = (struct ocfs2_xattr_value_root *)
4246 (value_bh->b_data + offset % blocksize); 4730 (vb.vb_bh->b_data + offset % blocksize);
4247 4731
4248 mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n", 4732 ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket,
4249 xe_off, (unsigned long long)header_bh->b_blocknr, len); 4733 OCFS2_JOURNAL_ACCESS_WRITE);
4250 ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len);
4251 if (ret) { 4734 if (ret) {
4252 mlog_errno(ret); 4735 mlog_errno(ret);
4253 goto out; 4736 goto out;
4254 } 4737 }
4255 4738
4256 ret = ocfs2_xattr_value_update_size(inode, header_bh, xe, len); 4739 /*
4740 * From here on out we have to dirty the bucket. The generic
4741 * value calls only modify one of the bucket's bhs, but we need
4742 * to send the bucket at once. So if they error, they *could* have
4743 * modified something. We have to assume they did, and dirty
4744 * the whole bucket. This leaves us in a consistent state.
4745 */
4746 mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
4747 xe_off, (unsigned long long)bucket_blkno(bucket), len);
4748 ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt);
4257 if (ret) { 4749 if (ret) {
4258 mlog_errno(ret); 4750 mlog_errno(ret);
4259 goto out; 4751 goto out_dirty;
4260 } 4752 }
4261 4753
4754 xe->xe_value_size = cpu_to_le64(len);
4755
4756out_dirty:
4757 ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket);
4758
4262out: 4759out:
4263 brelse(value_bh);
4264 return ret; 4760 return ret;
4265} 4761}
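
The ocfs2_xattr_value_buf used here (declared in the xattr.h hunk further down) is what lets ocfs2_xattr_value_truncate() stop caring where a value root lives: the generic code journals through vb_access and reads the root through vb_xv, whether the backing bh belongs to an inode, an external block, or a bucket. A condensed sketch of the bucket case as wired up above; the final vb_access call illustrates the indirection and is an assumption about how the generic helper uses the struct:

	struct ocfs2_xattr_value_buf vb = {
		.vb_access = ocfs2_journal_access,
		.vb_bh = bucket->bu_bhs[value_blk],	/* bh inside the bucket */
	};

	vb.vb_xv = (struct ocfs2_xattr_value_root *)
		(vb.vb_bh->b_data + offset % blocksize);

	/* Generic value code can then journal without knowing the container: */
	ret = vb.vb_access(ctxt->handle, inode, vb.vb_bh,
			   OCFS2_JOURNAL_ACCESS_WRITE);
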
4266 4762
4267static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode, 4763static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
4268 struct ocfs2_xattr_search *xs, 4764 struct ocfs2_xattr_search *xs,
4269 int len) 4765 int len,
4766 struct ocfs2_xattr_set_ctxt *ctxt)
4270{ 4767{
4271 int ret, offset; 4768 int ret, offset;
4272 struct ocfs2_xattr_entry *xe = xs->here; 4769 struct ocfs2_xattr_entry *xe = xs->here;
4273 struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base; 4770 struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
4274 4771
4275 BUG_ON(!xs->bucket.bhs[0] || !xe || ocfs2_xattr_is_local(xe)); 4772 BUG_ON(!xs->bucket->bu_bhs[0] || !xe || ocfs2_xattr_is_local(xe));
4276 4773
4277 offset = xe - xh->xh_entries; 4774 offset = xe - xh->xh_entries;
4278 ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket.bhs[0], 4775 ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket,
4279 offset, len); 4776 offset, len, ctxt);
4280 if (ret) 4777 if (ret)
4281 mlog_errno(ret); 4778 mlog_errno(ret);
4282 4779
@@ -4284,6 +4781,7 @@ static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
4284} 4781}
4285 4782
4286static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode, 4783static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
4784 handle_t *handle,
4287 struct ocfs2_xattr_search *xs, 4785 struct ocfs2_xattr_search *xs,
4288 char *val, 4786 char *val,
4289 int value_len) 4787 int value_len)
@@ -4299,7 +4797,8 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
4299 4797
4300 xv = (struct ocfs2_xattr_value_root *)(xs->base + offset); 4798 xv = (struct ocfs2_xattr_value_root *)(xs->base + offset);
4301 4799
4302 return __ocfs2_xattr_set_value_outside(inode, xv, val, value_len); 4800 return __ocfs2_xattr_set_value_outside(inode, handle,
4801 xv, val, value_len);
4303} 4802}
4304 4803
4305static int ocfs2_rm_xattr_cluster(struct inode *inode, 4804static int ocfs2_rm_xattr_cluster(struct inode *inode,
@@ -4343,15 +4842,15 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
4343 } 4842 }
4344 } 4843 }
4345 4844
4346 handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS); 4845 handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
4347 if (IS_ERR(handle)) { 4846 if (IS_ERR(handle)) {
4348 ret = -ENOMEM; 4847 ret = -ENOMEM;
4349 mlog_errno(ret); 4848 mlog_errno(ret);
4350 goto out; 4849 goto out;
4351 } 4850 }
4352 4851
4353 ret = ocfs2_journal_access(handle, inode, root_bh, 4852 ret = ocfs2_journal_access_xb(handle, inode, root_bh,
4354 OCFS2_JOURNAL_ACCESS_WRITE); 4853 OCFS2_JOURNAL_ACCESS_WRITE);
4355 if (ret) { 4854 if (ret) {
4356 mlog_errno(ret); 4855 mlog_errno(ret);
4357 goto out_commit; 4856 goto out_commit;
@@ -4392,26 +4891,19 @@ out:
4392} 4891}
4393 4892
4394static void ocfs2_xattr_bucket_remove_xs(struct inode *inode, 4893static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
4894 handle_t *handle,
4395 struct ocfs2_xattr_search *xs) 4895 struct ocfs2_xattr_search *xs)
4396{ 4896{
4397 handle_t *handle = NULL; 4897 struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
4398 struct ocfs2_xattr_header *xh = xs->bucket.xh;
4399 struct ocfs2_xattr_entry *last = &xh->xh_entries[ 4898 struct ocfs2_xattr_entry *last = &xh->xh_entries[
4400 le16_to_cpu(xh->xh_count) - 1]; 4899 le16_to_cpu(xh->xh_count) - 1];
4401 int ret = 0; 4900 int ret = 0;
4402 4901
4403 handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1); 4902 ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
4404 if (IS_ERR(handle)) { 4903 OCFS2_JOURNAL_ACCESS_WRITE);
4405 ret = PTR_ERR(handle);
4406 mlog_errno(ret);
4407 return;
4408 }
4409
4410 ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[0],
4411 OCFS2_JOURNAL_ACCESS_WRITE);
4412 if (ret) { 4904 if (ret) {
4413 mlog_errno(ret); 4905 mlog_errno(ret);
4414 goto out_commit; 4906 return;
4415 } 4907 }
4416 4908
4417 /* Remove the old entry. */ 4909 /* Remove the old entry. */
@@ -4420,11 +4912,7 @@ static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
4420 memset(last, 0, sizeof(struct ocfs2_xattr_entry)); 4912 memset(last, 0, sizeof(struct ocfs2_xattr_entry));
4421 le16_add_cpu(&xh->xh_count, -1); 4913 le16_add_cpu(&xh->xh_count, -1);
4422 4914
4423 ret = ocfs2_journal_dirty(handle, xs->bucket.bhs[0]); 4915 ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
4424 if (ret < 0)
4425 mlog_errno(ret);
4426out_commit:
4427 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
4428} 4916}
4429 4917
4430/* 4918/*
@@ -4440,7 +4928,8 @@ out_commit:
4440 */ 4928 */
4441static int ocfs2_xattr_set_in_bucket(struct inode *inode, 4929static int ocfs2_xattr_set_in_bucket(struct inode *inode,
4442 struct ocfs2_xattr_info *xi, 4930 struct ocfs2_xattr_info *xi,
4443 struct ocfs2_xattr_search *xs) 4931 struct ocfs2_xattr_search *xs,
4932 struct ocfs2_xattr_set_ctxt *ctxt)
4444{ 4933{
4445 int ret, local = 1; 4934 int ret, local = 1;
4446 size_t value_len; 4935 size_t value_len;
@@ -4468,7 +4957,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
4468 value_len = 0; 4957 value_len = 0;
4469 4958
4470 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs, 4959 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
4471 value_len); 4960 value_len,
4961 ctxt);
4472 if (ret) 4962 if (ret)
4473 goto out; 4963 goto out;
4474 4964
@@ -4488,7 +4978,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
4488 xi->value_len = OCFS2_XATTR_ROOT_SIZE; 4978 xi->value_len = OCFS2_XATTR_ROOT_SIZE;
4489 } 4979 }
4490 4980
4491 ret = ocfs2_xattr_set_entry_in_bucket(inode, xi, xs, name_hash, local); 4981 ret = ocfs2_xattr_set_entry_in_bucket(inode, ctxt->handle, xi, xs,
4982 name_hash, local);
4492 if (ret) { 4983 if (ret) {
4493 mlog_errno(ret); 4984 mlog_errno(ret);
4494 goto out; 4985 goto out;
@@ -4499,7 +4990,7 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
4499 4990
4500 /* allocate the space now for the outside block storage. */ 4991 /* allocate the space now for the outside block storage. */
4501 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs, 4992 ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
4502 value_len); 4993 value_len, ctxt);
4503 if (ret) { 4994 if (ret) {
4504 mlog_errno(ret); 4995 mlog_errno(ret);
4505 4996
@@ -4509,13 +5000,14 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
4509 * storage and we have allocated xattr already, 5000 * storage and we have allocated xattr already,
4510 * so need to remove it. 5001 * so need to remove it.
4511 */ 5002 */
4512 ocfs2_xattr_bucket_remove_xs(inode, xs); 5003 ocfs2_xattr_bucket_remove_xs(inode, ctxt->handle, xs);
4513 } 5004 }
4514 goto out; 5005 goto out;
4515 } 5006 }
4516 5007
4517set_value_outside: 5008set_value_outside:
4518 ret = ocfs2_xattr_bucket_set_value_outside(inode, xs, val, value_len); 5009 ret = ocfs2_xattr_bucket_set_value_outside(inode, ctxt->handle,
5010 xs, val, value_len);
4519out: 5011out:
4520 return ret; 5012 return ret;
4521} 5013}
@@ -4530,7 +5022,7 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
4530 struct ocfs2_xattr_bucket *bucket, 5022 struct ocfs2_xattr_bucket *bucket,
4531 const char *name) 5023 const char *name)
4532{ 5024{
4533 struct ocfs2_xattr_header *xh = bucket->xh; 5025 struct ocfs2_xattr_header *xh = bucket_xh(bucket);
4534 u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name)); 5026 u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
4535 5027
4536 if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash)) 5028 if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash))
@@ -4540,7 +5032,7 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
4540 xh->xh_entries[0].xe_name_hash) { 5032 xh->xh_entries[0].xe_name_hash) {
4541 mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, " 5033 mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, "
4542 "hash = %u\n", 5034 "hash = %u\n",
4543 (unsigned long long)bucket->bhs[0]->b_blocknr, 5035 (unsigned long long)bucket_blkno(bucket),
4544 le32_to_cpu(xh->xh_entries[0].xe_name_hash)); 5036 le32_to_cpu(xh->xh_entries[0].xe_name_hash));
4545 return -ENOSPC; 5037 return -ENOSPC;
4546 } 5038 }
@@ -4550,16 +5042,16 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
4550 5042
4551static int ocfs2_xattr_set_entry_index_block(struct inode *inode, 5043static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
4552 struct ocfs2_xattr_info *xi, 5044 struct ocfs2_xattr_info *xi,
4553 struct ocfs2_xattr_search *xs) 5045 struct ocfs2_xattr_search *xs,
5046 struct ocfs2_xattr_set_ctxt *ctxt)
4554{ 5047{
4555 struct ocfs2_xattr_header *xh; 5048 struct ocfs2_xattr_header *xh;
4556 struct ocfs2_xattr_entry *xe; 5049 struct ocfs2_xattr_entry *xe;
4557 u16 count, header_size, xh_free_start; 5050 u16 count, header_size, xh_free_start;
4558 int i, free, max_free, need, old; 5051 int free, max_free, need, old;
4559 size_t value_size = 0, name_len = strlen(xi->name); 5052 size_t value_size = 0, name_len = strlen(xi->name);
4560 size_t blocksize = inode->i_sb->s_blocksize; 5053 size_t blocksize = inode->i_sb->s_blocksize;
4561 int ret, allocation = 0; 5054 int ret, allocation = 0;
4562 u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
4563 5055
4564 mlog_entry("Set xattr %s in xattr index block\n", xi->name); 5056 mlog_entry("Set xattr %s in xattr index block\n", xi->name);
4565 5057
@@ -4574,7 +5066,7 @@ try_again:
4574 5066
4575 mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size " 5067 mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
4576 "of %u which exceed block size\n", 5068 "of %u which exceed block size\n",
4577 (unsigned long long)xs->bucket.bhs[0]->b_blocknr, 5069 (unsigned long long)bucket_blkno(xs->bucket),
4578 header_size); 5070 header_size);
4579 5071
4580 if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) 5072 if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE)
@@ -4614,11 +5106,13 @@ try_again:
4614 mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, " 5106 mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
4615 "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len =" 5107 "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
4616 " %u\n", xs->not_found, 5108 " %u\n", xs->not_found,
4617 (unsigned long long)xs->bucket.bhs[0]->b_blocknr, 5109 (unsigned long long)bucket_blkno(xs->bucket),
4618 free, need, max_free, le16_to_cpu(xh->xh_free_start), 5110 free, need, max_free, le16_to_cpu(xh->xh_free_start),
4619 le16_to_cpu(xh->xh_name_value_len)); 5111 le16_to_cpu(xh->xh_name_value_len));
4620 5112
4621 if (free < need || count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) { 5113 if (free < need ||
5114 (xs->not_found &&
5115 count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb))) {
4622 if (need <= max_free && 5116 if (need <= max_free &&
4623 count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) { 5117 count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
4624 /* 5118 /*
@@ -4626,7 +5120,8 @@ try_again:
4626 * name/value will be moved, the xe shouldn't be changed 5120 * name/value will be moved, the xe shouldn't be changed
4627 * in xs. 5121 * in xs.
4628 */ 5122 */
4629 ret = ocfs2_defrag_xattr_bucket(inode, &xs->bucket); 5123 ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
5124 xs->bucket);
4630 if (ret) { 5125 if (ret) {
4631 mlog_errno(ret); 5126 mlog_errno(ret);
4632 goto out; 5127 goto out;
@@ -4658,7 +5153,7 @@ try_again:
4658 * add a new bucket for the insert. 5153 * add a new bucket for the insert.
4659 */ 5154 */
4660 ret = ocfs2_check_xattr_bucket_collision(inode, 5155 ret = ocfs2_check_xattr_bucket_collision(inode,
4661 &xs->bucket, 5156 xs->bucket,
4662 xi->name); 5157 xi->name);
4663 if (ret) { 5158 if (ret) {
4664 mlog_errno(ret); 5159 mlog_errno(ret);
@@ -4667,17 +5162,21 @@ try_again:
4667 5162
4668 ret = ocfs2_add_new_xattr_bucket(inode, 5163 ret = ocfs2_add_new_xattr_bucket(inode,
4669 xs->xattr_bh, 5164 xs->xattr_bh,
4670 xs->bucket.bhs[0]); 5165 xs->bucket,
5166 ctxt);
4671 if (ret) { 5167 if (ret) {
4672 mlog_errno(ret); 5168 mlog_errno(ret);
4673 goto out; 5169 goto out;
4674 } 5170 }
4675 5171
4676 for (i = 0; i < blk_per_bucket; i++) 5172 /*
4677 brelse(xs->bucket.bhs[i]); 5173 * ocfs2_add_new_xattr_bucket() will have updated
4678 5174 * xs->bucket if it moved, but it will not have updated
4679 memset(&xs->bucket, 0, sizeof(xs->bucket)); 5175 * any of the other search fields. Thus, we drop it and
4680 5176 * re-search. Everything should be cached, so it'll be
5177 * quick.
5178 */
5179 ocfs2_xattr_bucket_relse(xs->bucket);
4681 ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh, 5180 ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
4682 xi->name_index, 5181 xi->name_index,
4683 xi->name, xs); 5182 xi->name, xs);
@@ -4689,7 +5188,7 @@ try_again:
4689 } 5188 }
4690 5189
4691xattr_set: 5190xattr_set:
4692 ret = ocfs2_xattr_set_in_bucket(inode, xi, xs); 5191 ret = ocfs2_xattr_set_in_bucket(inode, xi, xs, ctxt);
4693out: 5192out:
4694 mlog_exit(ret); 5193 mlog_exit(ret);
4695 return ret; 5194 return ret;
@@ -4700,24 +5199,41 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
4700 void *para) 5199 void *para)
4701{ 5200{
4702 int ret = 0; 5201 int ret = 0;
4703 struct ocfs2_xattr_header *xh = bucket->xh; 5202 struct ocfs2_xattr_header *xh = bucket_xh(bucket);
4704 u16 i; 5203 u16 i;
4705 struct ocfs2_xattr_entry *xe; 5204 struct ocfs2_xattr_entry *xe;
5205 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5206 struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,};
5207 int credits = ocfs2_remove_extent_credits(osb->sb) +
5208 ocfs2_blocks_per_xattr_bucket(inode->i_sb);
5209
5210
5211 ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
4706 5212
4707 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { 5213 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
4708 xe = &xh->xh_entries[i]; 5214 xe = &xh->xh_entries[i];
4709 if (ocfs2_xattr_is_local(xe)) 5215 if (ocfs2_xattr_is_local(xe))
4710 continue; 5216 continue;
4711 5217
4712 ret = ocfs2_xattr_bucket_value_truncate(inode, 5218 ctxt.handle = ocfs2_start_trans(osb, credits);
4713 bucket->bhs[0], 5219 if (IS_ERR(ctxt.handle)) {
4714 i, 0); 5220 ret = PTR_ERR(ctxt.handle);
5221 mlog_errno(ret);
5222 break;
5223 }
5224
5225 ret = ocfs2_xattr_bucket_value_truncate(inode, bucket,
5226 i, 0, &ctxt);
5227
5228 ocfs2_commit_trans(osb, ctxt.handle);
4715 if (ret) { 5229 if (ret) {
4716 mlog_errno(ret); 5230 mlog_errno(ret);
4717 break; 5231 break;
4718 } 5232 }
4719 } 5233 }
4720 5234
5235 ocfs2_schedule_truncate_log_flush(osb, 1);
5236 ocfs2_run_deallocs(osb, &ctxt.dealloc);
4721 return ret; 5237 return ret;
4722} 5238}
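
Two patterns in this hunk deserve a note. First, each loop iteration now gets its own transaction, sized up front for the worst case: one extent removal plus redirtying every block of the bucket. Second, freed clusters are not returned to the allocator inside the loop; they are queued on the context's dealloc list and released once, after the last entry is truncated. In outline, using only the calls shown above:

	struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
	int credits = ocfs2_remove_extent_credits(osb->sb) +
		      ocfs2_blocks_per_xattr_bucket(inode->i_sb);

	ocfs2_init_dealloc_ctxt(&ctxt.dealloc);

	/* ...one ocfs2_start_trans()/ocfs2_commit_trans() pair per
	 * non-local entry, each truncate queueing clusters on
	 * ctxt.dealloc... */

	ocfs2_schedule_truncate_log_flush(osb, 1);
	ocfs2_run_deallocs(osb, &ctxt.dealloc);
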
4723 5239
@@ -4768,6 +5284,74 @@ out:
4768} 5284}
4769 5285
4770/* 5286/*
5287 * 'security' attributes support
5288 */
5289static size_t ocfs2_xattr_security_list(struct inode *inode, char *list,
5290 size_t list_size, const char *name,
5291 size_t name_len)
5292{
5293 const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
5294 const size_t total_len = prefix_len + name_len + 1;
5295
5296 if (list && total_len <= list_size) {
5297 memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
5298 memcpy(list + prefix_len, name, name_len);
5299 list[prefix_len + name_len] = '\0';
5300 }
5301 return total_len;
5302}
5303
5304static int ocfs2_xattr_security_get(struct inode *inode, const char *name,
5305 void *buffer, size_t size)
5306{
5307 if (strcmp(name, "") == 0)
5308 return -EINVAL;
5309 return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_SECURITY, name,
5310 buffer, size);
5311}
5312
5313static int ocfs2_xattr_security_set(struct inode *inode, const char *name,
5314 const void *value, size_t size, int flags)
5315{
5316 if (strcmp(name, "") == 0)
5317 return -EINVAL;
5318
5319 return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, name, value,
5320 size, flags);
5321}
5322
5323int ocfs2_init_security_get(struct inode *inode,
5324 struct inode *dir,
5325 struct ocfs2_security_xattr_info *si)
5326{
5327 /* check whether ocfs2 support feature xattr */
5328 if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb)))
5329 return -EOPNOTSUPP;
5330 return security_inode_init_security(inode, dir, &si->name, &si->value,
5331 &si->value_len);
5332}
5333
5334int ocfs2_init_security_set(handle_t *handle,
5335 struct inode *inode,
5336 struct buffer_head *di_bh,
5337 struct ocfs2_security_xattr_info *si,
5338 struct ocfs2_alloc_context *xattr_ac,
5339 struct ocfs2_alloc_context *data_ac)
5340{
5341 return ocfs2_xattr_set_handle(handle, inode, di_bh,
5342 OCFS2_XATTR_INDEX_SECURITY,
5343 si->name, si->value, si->value_len, 0,
5344 xattr_ac, data_ac);
5345}
5346
5347struct xattr_handler ocfs2_xattr_security_handler = {
5348 .prefix = XATTR_SECURITY_PREFIX,
5349 .list = ocfs2_xattr_security_list,
5350 .get = ocfs2_xattr_security_get,
5351 .set = ocfs2_xattr_security_set,
5352};
5353
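
The two exported helpers above give inode creation a path into the LSM: ocfs2_init_security_get() asks security_inode_init_security() for the attribute's name/value pair, and ocfs2_init_security_set() writes it under the caller's transaction and allocation reservations. A hypothetical call sequence from a create path follows; error handling is trimmed, the si.enable initialization is an assumed convention, and the kfree() calls rest on the usual rule that the LSM kmallocs the name and value buffers:

	struct ocfs2_security_xattr_info si = { .enable = 1, };
	int ret;

	ret = ocfs2_init_security_get(inode, dir, &si);
	if (!ret) {
		ret = ocfs2_init_security_set(handle, inode, di_bh, &si,
					      xattr_ac, data_ac);
		kfree(si.name);
		kfree(si.value);
	}
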
5354/*
4771 * 'trusted' attributes support 5355 * 'trusted' attributes support
4772 */ 5356 */
4773static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list, 5357static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 1d8314c7656d..5a1ebc789f7e 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -30,13 +30,58 @@ enum ocfs2_xattr_type {
30 OCFS2_XATTR_MAX 30 OCFS2_XATTR_MAX
31}; 31};
32 32
33struct ocfs2_security_xattr_info {
34 int enable;
35 char *name;
36 void *value;
37 size_t value_len;
38};
39
33extern struct xattr_handler ocfs2_xattr_user_handler; 40extern struct xattr_handler ocfs2_xattr_user_handler;
34extern struct xattr_handler ocfs2_xattr_trusted_handler; 41extern struct xattr_handler ocfs2_xattr_trusted_handler;
42extern struct xattr_handler ocfs2_xattr_security_handler;
43#ifdef CONFIG_OCFS2_FS_POSIX_ACL
44extern struct xattr_handler ocfs2_xattr_acl_access_handler;
45extern struct xattr_handler ocfs2_xattr_acl_default_handler;
46#endif
35extern struct xattr_handler *ocfs2_xattr_handlers[]; 47extern struct xattr_handler *ocfs2_xattr_handlers[];
36 48
37ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); 49ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
50int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
51 const char *, void *, size_t);
38int ocfs2_xattr_set(struct inode *, int, const char *, const void *, 52int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
39 size_t, int); 53 size_t, int);
54int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *,
55 int, const char *, const void *, size_t, int,
56 struct ocfs2_alloc_context *,
57 struct ocfs2_alloc_context *);
40int ocfs2_xattr_remove(struct inode *, struct buffer_head *); 58int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
59int ocfs2_init_security_get(struct inode *, struct inode *,
60 struct ocfs2_security_xattr_info *);
61int ocfs2_init_security_set(handle_t *, struct inode *,
62 struct buffer_head *,
63 struct ocfs2_security_xattr_info *,
64 struct ocfs2_alloc_context *,
65 struct ocfs2_alloc_context *);
66int ocfs2_calc_security_init(struct inode *,
67 struct ocfs2_security_xattr_info *,
68 int *, int *, struct ocfs2_alloc_context **);
69int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *,
70 int, struct ocfs2_security_xattr_info *,
71 int *, int *, struct ocfs2_alloc_context **);
72
73/*
74 * xattrs can live inside an inode, as part of an external xattr block,
75 * or inside an xattr bucket, which is the leaf of a tree rooted in an
76 * xattr block. Some of the xattr calls, especially the value setting
77 * functions, want to treat each of these locations as equal. Let's wrap
78 * them in a structure that we can pass around instead of raw buffer_heads.
79 */
80struct ocfs2_xattr_value_buf {
81 struct buffer_head *vb_bh;
82 ocfs2_journal_access_func vb_access;
83 struct ocfs2_xattr_value_root *vb_xv;
84};
85
41 86
42#endif /* OCFS2_XATTR_H */ 87#endif /* OCFS2_XATTR_H */
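
With ocfs2_xattr_security_handler exported here, and the ACL handlers guarded by CONFIG_OCFS2_FS_POSIX_ACL, the registration table the VFS walks for prefix dispatch would plausibly look like the sketch below. This is the standard xattr_handler plumbing, not code quoted from the patch:

	struct xattr_handler *ocfs2_xattr_handlers[] = {
		&ocfs2_xattr_user_handler,
	#ifdef CONFIG_OCFS2_FS_POSIX_ACL
		&ocfs2_xattr_acl_access_handler,
		&ocfs2_xattr_acl_default_handler,
	#endif
		&ocfs2_xattr_trusted_handler,
		&ocfs2_xattr_security_handler,
		NULL
	};
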
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 6afe57c84f84..633e9dc972bb 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -39,7 +39,6 @@ struct inode *omfs_new_inode(struct inode *dir, int mode)
39 inode->i_mode = mode; 39 inode->i_mode = mode;
40 inode->i_uid = current_fsuid(); 40 inode->i_uid = current_fsuid();
41 inode->i_gid = current_fsgid(); 41 inode->i_gid = current_fsgid();
42 inode->i_blocks = 0;
43 inode->i_mapping->a_ops = &omfs_aops; 42 inode->i_mapping->a_ops = &omfs_aops;
44 43
45 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 44 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/open.c b/fs/open.c
index c0a426d5766c..d882fd2351d6 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -272,6 +272,8 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
272 goto put_write_and_out; 272 goto put_write_and_out;
273 273
274 error = locks_verify_truncate(inode, NULL, length); 274 error = locks_verify_truncate(inode, NULL, length);
275 if (!error)
276 error = security_path_truncate(&path, length, 0);
275 if (!error) { 277 if (!error) {
276 DQUOT_INIT(inode); 278 DQUOT_INIT(inode);
277 error = do_truncate(path.dentry, length, 0, NULL); 279 error = do_truncate(path.dentry, length, 0, NULL);
@@ -329,6 +331,9 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
329 331
330 error = locks_verify_truncate(inode, file, length); 332 error = locks_verify_truncate(inode, file, length);
331 if (!error) 333 if (!error)
334 error = security_path_truncate(&file->f_path, length,
335 ATTR_MTIME|ATTR_CTIME);
336 if (!error)
332 error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file); 337 error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
333out_putf: 338out_putf:
334 fput(file); 339 fput(file);
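
Both truncate entry points now run the same gauntlet before touching the file: lock verification, then the new pathname-based LSM hook, then do_truncate(). Condensed from the two hunks (the ftruncate flavor shown; the sys_truncate path passes 0 for the time attributes):

	error = locks_verify_truncate(inode, file, length);
	if (!error)
		error = security_path_truncate(&file->f_path, length,
					       ATTR_MTIME|ATTR_CTIME);
	if (!error)
		error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME,
				    file);
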
@@ -407,7 +412,7 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len)
407 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) 412 if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
408 goto out_fput; 413 goto out_fput;
409 414
410 if (inode->i_op && inode->i_op->fallocate) 415 if (inode->i_op->fallocate)
411 ret = inode->i_op->fallocate(inode, mode, offset, len); 416 ret = inode->i_op->fallocate(inode, mode, offset, len);
412 else 417 else
413 ret = -EOPNOTSUPP; 418 ret = -EOPNOTSUPP;
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index d41bdc784de4..ffcd04f0012c 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -256,9 +256,6 @@ found:
256 break; 256 break;
257 } 257 }
258 258
259 inode->i_gid = 0;
260 inode->i_uid = 0;
261
262 d_add(dentry, inode); 259 d_add(dentry, inode);
263 return NULL; 260 return NULL;
264} 261}
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 6d5b213b8a9b..6d720243f5f4 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -334,6 +334,7 @@ void delete_partition(struct gendisk *disk, int partno)
334 334
335 blk_free_devt(part_devt(part)); 335 blk_free_devt(part_devt(part));
336 rcu_assign_pointer(ptbl->part[partno], NULL); 336 rcu_assign_pointer(ptbl->part[partno], NULL);
337 rcu_assign_pointer(ptbl->last_lookup, NULL);
337 kobject_put(part->holder_dir); 338 kobject_put(part->holder_dir);
338 device_del(part_to_dev(part)); 339 device_del(part_to_dev(part));
339 340
@@ -384,9 +385,9 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
384 385
385 dname = dev_name(ddev); 386 dname = dev_name(ddev);
386 if (isdigit(dname[strlen(dname) - 1])) 387 if (isdigit(dname[strlen(dname) - 1]))
387 snprintf(pdev->bus_id, BUS_ID_SIZE, "%sp%d", dname, partno); 388 dev_set_name(pdev, "%sp%d", dname, partno);
388 else 389 else
389 snprintf(pdev->bus_id, BUS_ID_SIZE, "%s%d", dname, partno); 390 dev_set_name(pdev, "%s%d", dname, partno);
390 391
391 device_initialize(pdev); 392 device_initialize(pdev);
392 pdev->class = &block_class; 393 pdev->class = &block_class;
@@ -447,16 +448,11 @@ void register_disk(struct gendisk *disk)
447 struct block_device *bdev; 448 struct block_device *bdev;
448 struct disk_part_iter piter; 449 struct disk_part_iter piter;
449 struct hd_struct *part; 450 struct hd_struct *part;
450 char *s;
451 int err; 451 int err;
452 452
453 ddev->parent = disk->driverfs_dev; 453 ddev->parent = disk->driverfs_dev;
454 454
455 strlcpy(ddev->bus_id, disk->disk_name, BUS_ID_SIZE); 455 dev_set_name(ddev, disk->disk_name);
456 /* ewww... some of these buggers have / in the name... */
457 s = strchr(ddev->bus_id, '/');
458 if (s)
459 *s = '!';
460 456
461 /* delay uevents, until we scanned partition table */ 457 /* delay uevents, until we scanned partition table */
462 ddev->uevent_suppress = 1; 458 ddev->uevent_suppress = 1;
diff --git a/fs/pipe.c b/fs/pipe.c
index aaf797bd57b9..891697112f66 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1016,10 +1016,7 @@ int do_pipe_flags(int *fd, int flags)
1016 goto err_fdr; 1016 goto err_fdr;
1017 fdw = error; 1017 fdw = error;
1018 1018
1019 error = audit_fd_pair(fdr, fdw); 1019 audit_fd_pair(fdr, fdw);
1020 if (error < 0)
1021 goto err_fdw;
1022
1023 fd_install(fdr, fr); 1020 fd_install(fdr, fr);
1024 fd_install(fdw, fw); 1021 fd_install(fdw, fw);
1025 fd[0] = fdr; 1022 fd[0] = fdr;
@@ -1027,8 +1024,6 @@ int do_pipe_flags(int *fd, int flags)
1027 1024
1028 return 0; 1025 return 0;
1029 1026
1030 err_fdw:
1031 put_unused_fd(fdw);
1032 err_fdr: 1027 err_fdr:
1033 put_unused_fd(fdr); 1028 put_unused_fd(fdr);
1034 err_read_pipe: 1029 err_read_pipe:
diff --git a/fs/proc/base.c b/fs/proc/base.c
index cad92c1ac2b3..0c9de19a1633 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -65,6 +65,7 @@
65#include <linux/mm.h> 65#include <linux/mm.h>
66#include <linux/rcupdate.h> 66#include <linux/rcupdate.h>
67#include <linux/kallsyms.h> 67#include <linux/kallsyms.h>
68#include <linux/stacktrace.h>
68#include <linux/resource.h> 69#include <linux/resource.h>
69#include <linux/module.h> 70#include <linux/module.h>
70#include <linux/mount.h> 71#include <linux/mount.h>
@@ -109,25 +110,22 @@ struct pid_entry {
109 .op = OP, \ 110 .op = OP, \
110} 111}
111 112
112#define DIR(NAME, MODE, OTYPE) \ 113#define DIR(NAME, MODE, iops, fops) \
113 NOD(NAME, (S_IFDIR|(MODE)), \ 114 NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} )
114 &proc_##OTYPE##_inode_operations, &proc_##OTYPE##_operations, \ 115#define LNK(NAME, get_link) \
115 {} )
116#define LNK(NAME, OTYPE) \
117 NOD(NAME, (S_IFLNK|S_IRWXUGO), \ 116 NOD(NAME, (S_IFLNK|S_IRWXUGO), \
118 &proc_pid_link_inode_operations, NULL, \ 117 &proc_pid_link_inode_operations, NULL, \
119 { .proc_get_link = &proc_##OTYPE##_link } ) 118 { .proc_get_link = get_link } )
120#define REG(NAME, MODE, OTYPE) \ 119#define REG(NAME, MODE, fops) \
121 NOD(NAME, (S_IFREG|(MODE)), NULL, \ 120 NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
122 &proc_##OTYPE##_operations, {}) 121#define INF(NAME, MODE, read) \
123#define INF(NAME, MODE, OTYPE) \
124 NOD(NAME, (S_IFREG|(MODE)), \ 122 NOD(NAME, (S_IFREG|(MODE)), \
125 NULL, &proc_info_file_operations, \ 123 NULL, &proc_info_file_operations, \
126 { .proc_read = &proc_##OTYPE } ) 124 { .proc_read = read } )
127#define ONE(NAME, MODE, OTYPE) \ 125#define ONE(NAME, MODE, show) \
128 NOD(NAME, (S_IFREG|(MODE)), \ 126 NOD(NAME, (S_IFREG|(MODE)), \
129 NULL, &proc_single_file_operations, \ 127 NULL, &proc_single_file_operations, \
130 { .proc_show = &proc_##OTYPE } ) 128 { .proc_show = show } )
131 129
132/* 130/*
133 * Count the number of hardlinks for the pid_entry table, excluding the . 131 * Count the number of hardlinks for the pid_entry table, excluding the .
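
The macro rework above trades token pasting for explicit identifiers: DIR/LNK/REG/INF/ONE callers now spell out the full operations symbol, which is why the big tgid/tid tables below change on every line, and why those symbols finally show up under grep and ctags. A self-contained userspace model of the difference; every name in it is illustrative, not a kernel symbol:

	#include <stdio.h>

	struct file_operations { const char *name; };
	static const struct file_operations proc_maps_operations = { "maps" };

	/* Old style: paste the short name into an identifier. */
	#define REG_OLD(NAME, OTYPE)	{ NAME, &proc_##OTYPE##_operations }
	/* New style: the caller writes the full identifier. */
	#define REG_NEW(NAME, fops)	{ NAME, &fops }

	struct pid_entry {
		const char *name;
		const struct file_operations *fops;
	};

	static const struct pid_entry tbl[] = {
		REG_OLD("maps", maps),		/* -> &proc_maps_operations */
		REG_NEW("maps", proc_maps_operations),	/* same, but greppable */
	};

	int main(void)
	{
		printf("%s %s\n", tbl[0].fops->name, tbl[1].fops->name);
		return 0;
	}
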
@@ -308,9 +306,9 @@ static int proc_pid_auxv(struct task_struct *task, char *buffer)
308 struct mm_struct *mm = get_task_mm(task); 306 struct mm_struct *mm = get_task_mm(task);
309 if (mm) { 307 if (mm) {
310 unsigned int nwords = 0; 308 unsigned int nwords = 0;
311 do 309 do {
312 nwords += 2; 310 nwords += 2;
313 while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ 311 } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
314 res = nwords * sizeof(mm->saved_auxv[0]); 312 res = nwords * sizeof(mm->saved_auxv[0]);
315 if (res > PAGE_SIZE) 313 if (res > PAGE_SIZE)
316 res = PAGE_SIZE; 314 res = PAGE_SIZE;
@@ -340,6 +338,37 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
340} 338}
341#endif /* CONFIG_KALLSYMS */ 339#endif /* CONFIG_KALLSYMS */
342 340
341#ifdef CONFIG_STACKTRACE
342
343#define MAX_STACK_TRACE_DEPTH 64
344
345static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
346 struct pid *pid, struct task_struct *task)
347{
348 struct stack_trace trace;
349 unsigned long *entries;
350 int i;
351
352 entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL);
353 if (!entries)
354 return -ENOMEM;
355
356 trace.nr_entries = 0;
357 trace.max_entries = MAX_STACK_TRACE_DEPTH;
358 trace.entries = entries;
359 trace.skip = 0;
360 save_stack_trace_tsk(task, &trace);
361
362 for (i = 0; i < trace.nr_entries; i++) {
363 seq_printf(m, "[<%p>] %pS\n",
364 (void *)entries[i], (void *)entries[i]);
365 }
366 kfree(entries);
367
368 return 0;
369}
370#endif
371
343#ifdef CONFIG_SCHEDSTATS 372#ifdef CONFIG_SCHEDSTATS
344/* 373/*
345 * Provides /proc/PID/schedstat 374 * Provides /proc/PID/schedstat
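
proc_pid_stack() above saves up to 64 frames with save_stack_trace_tsk() and prints each one through the %pS printk extension, so every line of the new /proc/PID/stack file reads like "[<address>] symbol+offset/size". A small userspace reader, assuming a kernel built with CONFIG_STACKTRACE; the file is created 0400, so read your own processes or run as root:

	#include <stdio.h>

	int main(void)
	{
		char line[256];
		FILE *f = fopen("/proc/self/stack", "r");

		if (!f) {
			perror("fopen");
			return 1;
		}
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
		return 0;
	}
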
@@ -1186,8 +1215,6 @@ static int sched_show(struct seq_file *m, void *v)
1186 struct inode *inode = m->private; 1215 struct inode *inode = m->private;
1187 struct task_struct *p; 1216 struct task_struct *p;
1188 1217
1189 WARN_ON(!inode);
1190
1191 p = get_proc_task(inode); 1218 p = get_proc_task(inode);
1192 if (!p) 1219 if (!p)
1193 return -ESRCH; 1220 return -ESRCH;
@@ -1205,8 +1232,6 @@ sched_write(struct file *file, const char __user *buf,
1205 struct inode *inode = file->f_path.dentry->d_inode; 1232 struct inode *inode = file->f_path.dentry->d_inode;
1206 struct task_struct *p; 1233 struct task_struct *p;
1207 1234
1208 WARN_ON(!inode);
1209
1210 p = get_proc_task(inode); 1235 p = get_proc_task(inode);
1211 if (!p) 1236 if (!p)
1212 return -ESRCH; 1237 return -ESRCH;
@@ -1426,8 +1451,6 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
1426 if (!ei->pid) 1451 if (!ei->pid)
1427 goto out_unlock; 1452 goto out_unlock;
1428 1453
1429 inode->i_uid = 0;
1430 inode->i_gid = 0;
1431 if (task_dumpable(task)) { 1454 if (task_dumpable(task)) {
1432 rcu_read_lock(); 1455 rcu_read_lock();
1433 cred = __task_cred(task); 1456 cred = __task_cred(task);
@@ -1976,13 +1999,11 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
1976 const struct pid_entry *ents, 1999 const struct pid_entry *ents,
1977 unsigned int nents) 2000 unsigned int nents)
1978{ 2001{
1979 struct inode *inode;
1980 struct dentry *error; 2002 struct dentry *error;
1981 struct task_struct *task = get_proc_task(dir); 2003 struct task_struct *task = get_proc_task(dir);
1982 const struct pid_entry *p, *last; 2004 const struct pid_entry *p, *last;
1983 2005
1984 error = ERR_PTR(-ENOENT); 2006 error = ERR_PTR(-ENOENT);
1985 inode = NULL;
1986 2007
1987 if (!task) 2008 if (!task)
1988 goto out_no_task; 2009 goto out_no_task;
@@ -2138,12 +2159,12 @@ static const struct file_operations proc_pid_attr_operations = {
2138}; 2159};
2139 2160
2140static const struct pid_entry attr_dir_stuff[] = { 2161static const struct pid_entry attr_dir_stuff[] = {
2141 REG("current", S_IRUGO|S_IWUGO, pid_attr), 2162 REG("current", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2142 REG("prev", S_IRUGO, pid_attr), 2163 REG("prev", S_IRUGO, proc_pid_attr_operations),
2143 REG("exec", S_IRUGO|S_IWUGO, pid_attr), 2164 REG("exec", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2144 REG("fscreate", S_IRUGO|S_IWUGO, pid_attr), 2165 REG("fscreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2145 REG("keycreate", S_IRUGO|S_IWUGO, pid_attr), 2166 REG("keycreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2146 REG("sockcreate", S_IRUGO|S_IWUGO, pid_attr), 2167 REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2147}; 2168};
2148 2169
2149static int proc_attr_dir_readdir(struct file * filp, 2170static int proc_attr_dir_readdir(struct file * filp,
@@ -2349,8 +2370,6 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
2349 if (!ei->pid) 2370 if (!ei->pid)
2350 goto out_iput; 2371 goto out_iput;
2351 2372
2352 inode->i_uid = 0;
2353 inode->i_gid = 0;
2354 inode->i_mode = p->mode; 2373 inode->i_mode = p->mode;
2355 if (S_ISDIR(inode->i_mode)) 2374 if (S_ISDIR(inode->i_mode))
2356 inode->i_nlink = 2; 2375 inode->i_nlink = 2;
@@ -2465,74 +2484,77 @@ static const struct file_operations proc_task_operations;
2465static const struct inode_operations proc_task_inode_operations; 2484static const struct inode_operations proc_task_inode_operations;
2466 2485
2467static const struct pid_entry tgid_base_stuff[] = { 2486static const struct pid_entry tgid_base_stuff[] = {
2468 DIR("task", S_IRUGO|S_IXUGO, task), 2487 DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
2469 DIR("fd", S_IRUSR|S_IXUSR, fd), 2488 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
2470 DIR("fdinfo", S_IRUSR|S_IXUSR, fdinfo), 2489 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
2471#ifdef CONFIG_NET 2490#ifdef CONFIG_NET
2472 DIR("net", S_IRUGO|S_IXUGO, net), 2491 DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
2473#endif 2492#endif
2474 REG("environ", S_IRUSR, environ), 2493 REG("environ", S_IRUSR, proc_environ_operations),
2475 INF("auxv", S_IRUSR, pid_auxv), 2494 INF("auxv", S_IRUSR, proc_pid_auxv),
2476 ONE("status", S_IRUGO, pid_status), 2495 ONE("status", S_IRUGO, proc_pid_status),
2477 ONE("personality", S_IRUSR, pid_personality), 2496 ONE("personality", S_IRUSR, proc_pid_personality),
2478 INF("limits", S_IRUSR, pid_limits), 2497 INF("limits", S_IRUSR, proc_pid_limits),
2479#ifdef CONFIG_SCHED_DEBUG 2498#ifdef CONFIG_SCHED_DEBUG
2480 REG("sched", S_IRUGO|S_IWUSR, pid_sched), 2499 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2481#endif 2500#endif
2482#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2501#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2483 INF("syscall", S_IRUSR, pid_syscall), 2502 INF("syscall", S_IRUSR, proc_pid_syscall),
2484#endif 2503#endif
2485 INF("cmdline", S_IRUGO, pid_cmdline), 2504 INF("cmdline", S_IRUGO, proc_pid_cmdline),
2486 ONE("stat", S_IRUGO, tgid_stat), 2505 ONE("stat", S_IRUGO, proc_tgid_stat),
2487 ONE("statm", S_IRUGO, pid_statm), 2506 ONE("statm", S_IRUGO, proc_pid_statm),
2488 REG("maps", S_IRUGO, maps), 2507 REG("maps", S_IRUGO, proc_maps_operations),
2489#ifdef CONFIG_NUMA 2508#ifdef CONFIG_NUMA
2490 REG("numa_maps", S_IRUGO, numa_maps), 2509 REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
2491#endif 2510#endif
2492 REG("mem", S_IRUSR|S_IWUSR, mem), 2511 REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
2493 LNK("cwd", cwd), 2512 LNK("cwd", proc_cwd_link),
2494 LNK("root", root), 2513 LNK("root", proc_root_link),
2495 LNK("exe", exe), 2514 LNK("exe", proc_exe_link),
2496 REG("mounts", S_IRUGO, mounts), 2515 REG("mounts", S_IRUGO, proc_mounts_operations),
2497 REG("mountinfo", S_IRUGO, mountinfo), 2516 REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
2498 REG("mountstats", S_IRUSR, mountstats), 2517 REG("mountstats", S_IRUSR, proc_mountstats_operations),
2499#ifdef CONFIG_PROC_PAGE_MONITOR 2518#ifdef CONFIG_PROC_PAGE_MONITOR
2500 REG("clear_refs", S_IWUSR, clear_refs), 2519 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
2501 REG("smaps", S_IRUGO, smaps), 2520 REG("smaps", S_IRUGO, proc_smaps_operations),
2502 REG("pagemap", S_IRUSR, pagemap), 2521 REG("pagemap", S_IRUSR, proc_pagemap_operations),
2503#endif 2522#endif
2504#ifdef CONFIG_SECURITY 2523#ifdef CONFIG_SECURITY
2505 DIR("attr", S_IRUGO|S_IXUGO, attr_dir), 2524 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
2506#endif 2525#endif
2507#ifdef CONFIG_KALLSYMS 2526#ifdef CONFIG_KALLSYMS
2508 INF("wchan", S_IRUGO, pid_wchan), 2527 INF("wchan", S_IRUGO, proc_pid_wchan),
2528#endif
2529#ifdef CONFIG_STACKTRACE
2530 ONE("stack", S_IRUSR, proc_pid_stack),
2509#endif 2531#endif
2510#ifdef CONFIG_SCHEDSTATS 2532#ifdef CONFIG_SCHEDSTATS
2511 INF("schedstat", S_IRUGO, pid_schedstat), 2533 INF("schedstat", S_IRUGO, proc_pid_schedstat),
2512#endif 2534#endif
2513#ifdef CONFIG_LATENCYTOP 2535#ifdef CONFIG_LATENCYTOP
2514 REG("latency", S_IRUGO, lstats), 2536 REG("latency", S_IRUGO, proc_lstats_operations),
2515#endif 2537#endif
2516#ifdef CONFIG_PROC_PID_CPUSET 2538#ifdef CONFIG_PROC_PID_CPUSET
2517 REG("cpuset", S_IRUGO, cpuset), 2539 REG("cpuset", S_IRUGO, proc_cpuset_operations),
2518#endif 2540#endif
2519#ifdef CONFIG_CGROUPS 2541#ifdef CONFIG_CGROUPS
2520 REG("cgroup", S_IRUGO, cgroup), 2542 REG("cgroup", S_IRUGO, proc_cgroup_operations),
2521#endif 2543#endif
2522 INF("oom_score", S_IRUGO, oom_score), 2544 INF("oom_score", S_IRUGO, proc_oom_score),
2523 REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), 2545 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
2524#ifdef CONFIG_AUDITSYSCALL 2546#ifdef CONFIG_AUDITSYSCALL
2525 REG("loginuid", S_IWUSR|S_IRUGO, loginuid), 2547 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
2526 REG("sessionid", S_IRUGO, sessionid), 2548 REG("sessionid", S_IRUGO, proc_sessionid_operations),
2527#endif 2549#endif
2528#ifdef CONFIG_FAULT_INJECTION 2550#ifdef CONFIG_FAULT_INJECTION
2529 REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), 2551 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
2530#endif 2552#endif
2531#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) 2553#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
2532 REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter), 2554 REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
2533#endif 2555#endif
2534#ifdef CONFIG_TASK_IO_ACCOUNTING 2556#ifdef CONFIG_TASK_IO_ACCOUNTING
2535 INF("io", S_IRUGO, tgid_io_accounting), 2557 INF("io", S_IRUGO, proc_tgid_io_accounting),
2536#endif 2558#endif
2537}; 2559};
2538 2560
@@ -2805,66 +2827,69 @@ out_no_task:
2805 * Tasks 2827 * Tasks
2806 */ 2828 */
2807static const struct pid_entry tid_base_stuff[] = { 2829static const struct pid_entry tid_base_stuff[] = {
2808 DIR("fd", S_IRUSR|S_IXUSR, fd), 2830 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
2809 DIR("fdinfo", S_IRUSR|S_IXUSR, fdinfo), 2831 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fd_operations),
2810 REG("environ", S_IRUSR, environ), 2832 REG("environ", S_IRUSR, proc_environ_operations),
2811 INF("auxv", S_IRUSR, pid_auxv), 2833 INF("auxv", S_IRUSR, proc_pid_auxv),
2812 ONE("status", S_IRUGO, pid_status), 2834 ONE("status", S_IRUGO, proc_pid_status),
2813 ONE("personality", S_IRUSR, pid_personality), 2835 ONE("personality", S_IRUSR, proc_pid_personality),
2814 INF("limits", S_IRUSR, pid_limits), 2836 INF("limits", S_IRUSR, proc_pid_limits),
2815#ifdef CONFIG_SCHED_DEBUG 2837#ifdef CONFIG_SCHED_DEBUG
2816 REG("sched", S_IRUGO|S_IWUSR, pid_sched), 2838 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2817#endif 2839#endif
2818#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2840#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2819 INF("syscall", S_IRUSR, pid_syscall), 2841 INF("syscall", S_IRUSR, proc_pid_syscall),
2820#endif 2842#endif
2821 INF("cmdline", S_IRUGO, pid_cmdline), 2843 INF("cmdline", S_IRUGO, proc_pid_cmdline),
2822 ONE("stat", S_IRUGO, tid_stat), 2844 ONE("stat", S_IRUGO, proc_tid_stat),
2823 ONE("statm", S_IRUGO, pid_statm), 2845 ONE("statm", S_IRUGO, proc_pid_statm),
2824 REG("maps", S_IRUGO, maps), 2846 REG("maps", S_IRUGO, proc_maps_operations),
2825#ifdef CONFIG_NUMA 2847#ifdef CONFIG_NUMA
2826 REG("numa_maps", S_IRUGO, numa_maps), 2848 REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
2827#endif 2849#endif
2828 REG("mem", S_IRUSR|S_IWUSR, mem), 2850 REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
2829 LNK("cwd", cwd), 2851 LNK("cwd", proc_cwd_link),
2830 LNK("root", root), 2852 LNK("root", proc_root_link),
2831 LNK("exe", exe), 2853 LNK("exe", proc_exe_link),
2832 REG("mounts", S_IRUGO, mounts), 2854 REG("mounts", S_IRUGO, proc_mounts_operations),
2833 REG("mountinfo", S_IRUGO, mountinfo), 2855 REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
2834#ifdef CONFIG_PROC_PAGE_MONITOR 2856#ifdef CONFIG_PROC_PAGE_MONITOR
2835 REG("clear_refs", S_IWUSR, clear_refs), 2857 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
2836 REG("smaps", S_IRUGO, smaps), 2858 REG("smaps", S_IRUGO, proc_smaps_operations),
2837 REG("pagemap", S_IRUSR, pagemap), 2859 REG("pagemap", S_IRUSR, proc_pagemap_operations),
2838#endif 2860#endif
2839#ifdef CONFIG_SECURITY 2861#ifdef CONFIG_SECURITY
2840 DIR("attr", S_IRUGO|S_IXUGO, attr_dir), 2862 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
2841#endif 2863#endif
2842#ifdef CONFIG_KALLSYMS 2864#ifdef CONFIG_KALLSYMS
2843 INF("wchan", S_IRUGO, pid_wchan), 2865 INF("wchan", S_IRUGO, proc_pid_wchan),
2866#endif
2867#ifdef CONFIG_STACKTRACE
2868 ONE("stack", S_IRUSR, proc_pid_stack),
2844#endif 2869#endif
2845#ifdef CONFIG_SCHEDSTATS 2870#ifdef CONFIG_SCHEDSTATS
2846 INF("schedstat", S_IRUGO, pid_schedstat), 2871 INF("schedstat", S_IRUGO, proc_pid_schedstat),
2847#endif 2872#endif
2848#ifdef CONFIG_LATENCYTOP 2873#ifdef CONFIG_LATENCYTOP
2849 REG("latency", S_IRUGO, lstats), 2874 REG("latency", S_IRUGO, proc_lstats_operations),
2850#endif 2875#endif
2851#ifdef CONFIG_PROC_PID_CPUSET 2876#ifdef CONFIG_PROC_PID_CPUSET
2852 REG("cpuset", S_IRUGO, cpuset), 2877 REG("cpuset", S_IRUGO, proc_cpuset_operations),
2853#endif 2878#endif
2854#ifdef CONFIG_CGROUPS 2879#ifdef CONFIG_CGROUPS
2855 REG("cgroup", S_IRUGO, cgroup), 2880 REG("cgroup", S_IRUGO, proc_cgroup_operations),
2856#endif 2881#endif
2857 INF("oom_score", S_IRUGO, oom_score), 2882 INF("oom_score", S_IRUGO, proc_oom_score),
2858 REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), 2883 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
2859#ifdef CONFIG_AUDITSYSCALL 2884#ifdef CONFIG_AUDITSYSCALL
2860 REG("loginuid", S_IWUSR|S_IRUGO, loginuid), 2885 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
2861 REG("sessionid", S_IRUSR, sessionid), 2886 REG("sessionid", S_IRUSR, proc_sessionid_operations),
2862#endif 2887#endif
2863#ifdef CONFIG_FAULT_INJECTION 2888#ifdef CONFIG_FAULT_INJECTION
2864 REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), 2889 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
2865#endif 2890#endif
2866#ifdef CONFIG_TASK_IO_ACCOUNTING 2891#ifdef CONFIG_TASK_IO_ACCOUNTING
2867 INF("io", S_IRUGO, tid_io_accounting), 2892 INF("io", S_IRUGO, proc_tid_io_accounting),
2868#endif 2893#endif
2869}; 2894};
2870 2895
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 60a359b35582..db7fa5cab988 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -14,7 +14,6 @@
14#include <linux/stat.h> 14#include <linux/stat.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/mount.h> 16#include <linux/mount.h>
17#include <linux/smp_lock.h>
18#include <linux/init.h> 17#include <linux/init.h>
19#include <linux/idr.h> 18#include <linux/idr.h>
20#include <linux/namei.h> 19#include <linux/namei.h>
@@ -379,7 +378,6 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
379 struct inode *inode = NULL; 378 struct inode *inode = NULL;
380 int error = -ENOENT; 379 int error = -ENOENT;
381 380
382 lock_kernel();
383 spin_lock(&proc_subdir_lock); 381 spin_lock(&proc_subdir_lock);
384 for (de = de->subdir; de ; de = de->next) { 382 for (de = de->subdir; de ; de = de->next) {
385 if (de->namelen != dentry->d_name.len) 383 if (de->namelen != dentry->d_name.len)
@@ -397,7 +395,6 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
397 } 395 }
398 spin_unlock(&proc_subdir_lock); 396 spin_unlock(&proc_subdir_lock);
399out_unlock: 397out_unlock:
400 unlock_kernel();
401 398
402 if (inode) { 399 if (inode) {
403 dentry->d_op = &proc_dentry_operations; 400 dentry->d_op = &proc_dentry_operations;
@@ -432,8 +429,6 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
432 struct inode *inode = filp->f_path.dentry->d_inode; 429 struct inode *inode = filp->f_path.dentry->d_inode;
433 int ret = 0; 430 int ret = 0;
434 431
435 lock_kernel();
436
437 ino = inode->i_ino; 432 ino = inode->i_ino;
438 i = filp->f_pos; 433 i = filp->f_pos;
439 switch (i) { 434 switch (i) {
@@ -487,7 +482,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
487 spin_unlock(&proc_subdir_lock); 482 spin_unlock(&proc_subdir_lock);
488 } 483 }
489 ret = 1; 484 ret = 1;
490out: unlock_kernel(); 485out:
491 return ret; 486 return ret;
492} 487}
493 488
@@ -504,6 +499,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
504 * the /proc directory. 499 * the /proc directory.
505 */ 500 */
506static const struct file_operations proc_dir_operations = { 501static const struct file_operations proc_dir_operations = {
502 .llseek = generic_file_llseek,
507 .read = generic_read_dir, 503 .read = generic_read_dir,
508 .readdir = proc_readdir, 504 .readdir = proc_readdir,
509}; 505};
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 2543fd00c658..3e76bb9b3ad6 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -35,16 +35,13 @@ struct proc_dir_entry *de_get(struct proc_dir_entry *de)
35 */ 35 */
36void de_put(struct proc_dir_entry *de) 36void de_put(struct proc_dir_entry *de)
37{ 37{
38 lock_kernel();
39 if (!atomic_read(&de->count)) { 38 if (!atomic_read(&de->count)) {
40 printk("de_put: entry %s already free!\n", de->name); 39 printk("de_put: entry %s already free!\n", de->name);
41 unlock_kernel();
42 return; 40 return;
43 } 41 }
44 42
45 if (atomic_dec_and_test(&de->count)) 43 if (atomic_dec_and_test(&de->count))
46 free_proc_entry(de); 44 free_proc_entry(de);
47 unlock_kernel();
48} 45}
49 46
50/* 47/*
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 3e8aeb8b61ce..cd53ff838498 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -41,8 +41,6 @@ do { \
41 (vmi)->used = 0; \ 41 (vmi)->used = 0; \
42 (vmi)->largest_chunk = 0; \ 42 (vmi)->largest_chunk = 0; \
43} while(0) 43} while(0)
44
45extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *);
46#endif 44#endif
47 45
48extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, 46extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns,
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index b1675c4e66da..43d23948384a 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -74,6 +74,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
74 "LowTotal: %8lu kB\n" 74 "LowTotal: %8lu kB\n"
75 "LowFree: %8lu kB\n" 75 "LowFree: %8lu kB\n"
76#endif 76#endif
77#ifndef CONFIG_MMU
78 "MmapCopy: %8lu kB\n"
79#endif
77 "SwapTotal: %8lu kB\n" 80 "SwapTotal: %8lu kB\n"
78 "SwapFree: %8lu kB\n" 81 "SwapFree: %8lu kB\n"
79 "Dirty: %8lu kB\n" 82 "Dirty: %8lu kB\n"
@@ -116,6 +119,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
116 K(i.totalram-i.totalhigh), 119 K(i.totalram-i.totalhigh),
117 K(i.freeram-i.freehigh), 120 K(i.freeram-i.freehigh),
118#endif 121#endif
122#ifndef CONFIG_MMU
123 K((unsigned long) atomic_read(&mmap_pages_allocated)),
124#endif
119 K(i.totalswap), 125 K(i.totalswap),
120 K(i.freeswap), 126 K(i.freeswap),
121 K(global_page_state(NR_FILE_DIRTY)), 127 K(global_page_state(NR_FILE_DIRTY)),
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 3f87d2632947..b446d7ad0b0d 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -33,33 +33,33 @@
33#include "internal.h" 33#include "internal.h"
34 34
35/* 35/*
36 * display a single VMA to a sequenced file 36 * display a single region to a sequenced file
37 */ 37 */
38int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) 38static int nommu_region_show(struct seq_file *m, struct vm_region *region)
39{ 39{
40 unsigned long ino = 0; 40 unsigned long ino = 0;
41 struct file *file; 41 struct file *file;
42 dev_t dev = 0; 42 dev_t dev = 0;
43 int flags, len; 43 int flags, len;
44 44
45 flags = vma->vm_flags; 45 flags = region->vm_flags;
46 file = vma->vm_file; 46 file = region->vm_file;
47 47
48 if (file) { 48 if (file) {
49 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 49 struct inode *inode = region->vm_file->f_path.dentry->d_inode;
50 dev = inode->i_sb->s_dev; 50 dev = inode->i_sb->s_dev;
51 ino = inode->i_ino; 51 ino = inode->i_ino;
52 } 52 }
53 53
54 seq_printf(m, 54 seq_printf(m,
55 "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", 55 "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
56 vma->vm_start, 56 region->vm_start,
57 vma->vm_end, 57 region->vm_end,
58 flags & VM_READ ? 'r' : '-', 58 flags & VM_READ ? 'r' : '-',
59 flags & VM_WRITE ? 'w' : '-', 59 flags & VM_WRITE ? 'w' : '-',
60 flags & VM_EXEC ? 'x' : '-', 60 flags & VM_EXEC ? 'x' : '-',
61 flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', 61 flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
62 ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, 62 ((loff_t)region->vm_pgoff) << PAGE_SHIFT,
63 MAJOR(dev), MINOR(dev), ino, &len); 63 MAJOR(dev), MINOR(dev), ino, &len);
64 64
65 if (file) { 65 if (file) {
@@ -75,61 +75,54 @@ int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
75} 75}
76 76
77/* 77/*
78 * display a list of all the VMAs the kernel knows about 78 * display a list of all the REGIONs the kernel knows about
 79 * - nommu kernels have a single flat list 79 * - nommu kernels have a single flat list
80 */ 80 */
81static int nommu_vma_list_show(struct seq_file *m, void *v) 81static int nommu_region_list_show(struct seq_file *m, void *_p)
82{ 82{
83 struct vm_area_struct *vma; 83 struct rb_node *p = _p;
84 84
85 vma = rb_entry((struct rb_node *) v, struct vm_area_struct, vm_rb); 85 return nommu_region_show(m, rb_entry(p, struct vm_region, vm_rb));
86 return nommu_vma_show(m, vma);
87} 86}
88 87
89static void *nommu_vma_list_start(struct seq_file *m, loff_t *_pos) 88static void *nommu_region_list_start(struct seq_file *m, loff_t *_pos)
90{ 89{
91 struct rb_node *_rb; 90 struct rb_node *p;
92 loff_t pos = *_pos; 91 loff_t pos = *_pos;
93 void *next = NULL;
94 92
95 down_read(&nommu_vma_sem); 93 down_read(&nommu_region_sem);
96 94
97 for (_rb = rb_first(&nommu_vma_tree); _rb; _rb = rb_next(_rb)) { 95 for (p = rb_first(&nommu_region_tree); p; p = rb_next(p))
98 if (pos == 0) { 96 if (pos-- == 0)
99 next = _rb; 97 return p;
100 break; 98 return NULL;
101 }
102 pos--;
103 }
104
105 return next;
106} 99}
107 100
108static void nommu_vma_list_stop(struct seq_file *m, void *v) 101static void nommu_region_list_stop(struct seq_file *m, void *v)
109{ 102{
110 up_read(&nommu_vma_sem); 103 up_read(&nommu_region_sem);
111} 104}
112 105
113static void *nommu_vma_list_next(struct seq_file *m, void *v, loff_t *pos) 106static void *nommu_region_list_next(struct seq_file *m, void *v, loff_t *pos)
114{ 107{
115 (*pos)++; 108 (*pos)++;
116 return rb_next((struct rb_node *) v); 109 return rb_next((struct rb_node *) v);
117} 110}
118 111
119static const struct seq_operations proc_nommu_vma_list_seqop = { 112static struct seq_operations proc_nommu_region_list_seqop = {
120 .start = nommu_vma_list_start, 113 .start = nommu_region_list_start,
121 .next = nommu_vma_list_next, 114 .next = nommu_region_list_next,
122 .stop = nommu_vma_list_stop, 115 .stop = nommu_region_list_stop,
123 .show = nommu_vma_list_show 116 .show = nommu_region_list_show
124}; 117};
125 118
126static int proc_nommu_vma_list_open(struct inode *inode, struct file *file) 119static int proc_nommu_region_list_open(struct inode *inode, struct file *file)
127{ 120{
128 return seq_open(file, &proc_nommu_vma_list_seqop); 121 return seq_open(file, &proc_nommu_region_list_seqop);
129} 122}
130 123
131static const struct file_operations proc_nommu_vma_list_operations = { 124static const struct file_operations proc_nommu_region_list_operations = {
132 .open = proc_nommu_vma_list_open, 125 .open = proc_nommu_region_list_open,
133 .read = seq_read, 126 .read = seq_read,
134 .llseek = seq_lseek, 127 .llseek = seq_lseek,
135 .release = seq_release, 128 .release = seq_release,
@@ -137,7 +130,7 @@ static const struct file_operations proc_nommu_vma_list_operations = {
137 130
138static int __init proc_nommu_init(void) 131static int __init proc_nommu_init(void)
139{ 132{
140 proc_create("maps", S_IRUGO, NULL, &proc_nommu_vma_list_operations); 133 proc_create("maps", S_IRUGO, NULL, &proc_nommu_region_list_operations);
141 return 0; 134 return 0;
142} 135}
143 136
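
The rewritten ->start callback is the compact seq_file idiom for positioning an iterator in an rbtree: take the semaphore, then walk forward while decrementing the requested position, returning the node where it hits zero. ->stop releases the semaphore, so the lock is held across one read batch but never between reads. A userspace model of just the decrement-and-test walk (illustrative):

	#include <stdio.h>

	static int demo[] = { 10, 20, 30, 40 };
	#define N (sizeof(demo) / sizeof(demo[0]))

	/* Visit exactly pos elements, then return the next one
	 * (or NULL past the end), as nommu_region_list_start() does. */
	static int *walk_to(long pos)
	{
		for (unsigned int i = 0; i < N; i++)
			if (pos-- == 0)
				return &demo[i];
		return NULL;
	}

	int main(void)
	{
		int *p = walk_to(2);
		printf("%d\n", p ? *p : -1);	/* prints 30 */
		return 0;
	}
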
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 7bc296f424ae..04d1270f1c38 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -18,7 +18,6 @@
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/bitops.h> 20#include <linux/bitops.h>
21#include <linux/smp_lock.h>
22#include <linux/mount.h> 21#include <linux/mount.h>
23#include <linux/nsproxy.h> 22#include <linux/nsproxy.h>
24#include <net/net_namespace.h> 23#include <net/net_namespace.h>
@@ -172,6 +171,7 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent,
172} 171}
173 172
174const struct file_operations proc_net_operations = { 173const struct file_operations proc_net_operations = {
174 .llseek = generic_file_llseek,
175 .read = generic_read_dir, 175 .read = generic_read_dir,
176 .readdir = proc_tgid_net_readdir, 176 .readdir = proc_tgid_net_readdir,
177}; 177};
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 06ed10b7da9e..94fcfff6863a 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -31,7 +31,6 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */
 	inode->i_mode = table->mode;
-	inode->i_uid = inode->i_gid = 0;
 	if (!table->child) {
 		inode->i_mode |= S_IFREG;
 		inode->i_op = &proc_sys_inode_operations;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 7761602af9de..f6299a25594e 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -16,7 +16,6 @@
 #include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/bitops.h>
-#include <linux/smp_lock.h>
 #include <linux/mount.h>
 #include <linux/pid_namespace.h>
 
@@ -162,17 +161,12 @@ static int proc_root_readdir(struct file * filp,
 	unsigned int nr = filp->f_pos;
 	int ret;
 
-	lock_kernel();
-
 	if (nr < FIRST_PROCESS_ENTRY) {
 		int error = proc_readdir(filp, dirent, filldir);
-		if (error <= 0) {
-			unlock_kernel();
+		if (error <= 0)
 			return error;
-		}
 		filp->f_pos = FIRST_PROCESS_ENTRY;
 	}
-	unlock_kernel();
 
 	ret = proc_pid_readdir(filp, dirent, filldir);
 	return ret;
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 81904f07679d..f75efa22df5e 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -9,6 +9,7 @@
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/time.h>
+#include <linux/irqnr.h>
 #include <asm/cputime.h>
 
 #ifndef arch_irq_stat_cpu
@@ -44,10 +45,9 @@ static int show_stat(struct seq_file *p, void *v)
 		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
-
-		for_each_irq_nr(j)
+		for_each_irq_nr(j) {
 			sum += kstat_irqs_cpu(j, i);
-
+		}
 		sum += arch_irq_stat_cpu(i);
 	}
 	sum += arch_irq_stat();
@@ -92,7 +92,6 @@ static int show_stat(struct seq_file *p, void *v)
 	/* sum again ? it could be updated? */
 	for_each_irq_nr(j) {
 		per_irq_sum = 0;
-
 		for_each_possible_cpu(i)
 			per_irq_sum += kstat_irqs_cpu(j, i);
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3a8bdd7f5756..94063840832a 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -396,7 +396,9 @@ static int show_smap(struct seq_file *m, void *v)
396 "Private_Clean: %8lu kB\n" 396 "Private_Clean: %8lu kB\n"
397 "Private_Dirty: %8lu kB\n" 397 "Private_Dirty: %8lu kB\n"
398 "Referenced: %8lu kB\n" 398 "Referenced: %8lu kB\n"
399 "Swap: %8lu kB\n", 399 "Swap: %8lu kB\n"
400 "KernelPageSize: %8lu kB\n"
401 "MMUPageSize: %8lu kB\n",
400 (vma->vm_end - vma->vm_start) >> 10, 402 (vma->vm_end - vma->vm_start) >> 10,
401 mss.resident >> 10, 403 mss.resident >> 10,
402 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), 404 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
@@ -405,7 +407,9 @@ static int show_smap(struct seq_file *m, void *v)
 		   mss.private_clean >> 10,
 		   mss.private_dirty >> 10,
 		   mss.referenced >> 10,
-		   mss.swap >> 10);
+		   mss.swap >> 10,
+		   vma_kernel_pagesize(vma) >> 10,
+		   vma_mmu_pagesize(vma) >> 10);
 
 	if (m->count < m->size) /* vma is copied successfully */
 		m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
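Editor's note: after this change each /proc/pid/smaps record gains two lines. An illustrative tail of one record (the kB values are hypothetical; on most configurations both new fields simply report the base page size, and they only diverge where the MMU backs the VMA with a different page size, e.g. hugepage mappings):

	Referenced:           12 kB
	Swap:                  0 kB
	KernelPageSize:        4 kB
	MMUPageSize:           4 kB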
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 219bd79ea894..343ea1216bc8 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -9,31 +9,38 @@
 
 /*
  * Logic: we've got two memory sums for each process, "shared", and
- * "non-shared". Shared memory may get counted more then once, for
+ * "non-shared". Shared memory may get counted more than once, for
  * each process that owns it. Non-shared memory is counted
  * accurately.
  */
 void task_mem(struct seq_file *m, struct mm_struct *mm)
 {
-	struct vm_list_struct *vml;
-	unsigned long bytes = 0, sbytes = 0, slack = 0;
+	struct vm_area_struct *vma;
+	struct vm_region *region;
+	struct rb_node *p;
+	unsigned long bytes = 0, sbytes = 0, slack = 0, size;
 
 	down_read(&mm->mmap_sem);
-	for (vml = mm->context.vmlist; vml; vml = vml->next) {
-		if (!vml->vma)
-			continue;
+	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
+		vma = rb_entry(p, struct vm_area_struct, vm_rb);
+
+		bytes += kobjsize(vma);
+
+		region = vma->vm_region;
+		if (region) {
+			size = kobjsize(region);
+			size += region->vm_end - region->vm_start;
+		} else {
+			size = vma->vm_end - vma->vm_start;
+		}
 
-		bytes += kobjsize(vml);
 		if (atomic_read(&mm->mm_count) > 1 ||
-		    atomic_read(&vml->vma->vm_usage) > 1
-		    ) {
-			sbytes += kobjsize((void *) vml->vma->vm_start);
-			sbytes += kobjsize(vml->vma);
+		    vma->vm_flags & VM_MAYSHARE) {
+			sbytes += size;
 		} else {
-			bytes += kobjsize((void *) vml->vma->vm_start);
-			bytes += kobjsize(vml->vma);
-			slack += kobjsize((void *) vml->vma->vm_start) -
-				(vml->vma->vm_end - vml->vma->vm_start);
+			bytes += size;
+			if (region)
+				slack = region->vm_end - vma->vm_end;
 		}
 	}
 
@@ -70,13 +77,14 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 
 unsigned long task_vsize(struct mm_struct *mm)
 {
-	struct vm_list_struct *tbp;
+	struct vm_area_struct *vma;
+	struct rb_node *p;
 	unsigned long vsize = 0;
 
 	down_read(&mm->mmap_sem);
-	for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) {
-		if (tbp->vma)
-			vsize += kobjsize((void *) tbp->vma->vm_start);
+	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
+		vma = rb_entry(p, struct vm_area_struct, vm_rb);
+		vsize += vma->vm_end - vma->vm_start;
 	}
 	up_read(&mm->mmap_sem);
 	return vsize;
@@ -85,15 +93,19 @@ unsigned long task_vsize(struct mm_struct *mm)
 int task_statm(struct mm_struct *mm, int *shared, int *text,
 	       int *data, int *resident)
 {
-	struct vm_list_struct *tbp;
+	struct vm_area_struct *vma;
+	struct vm_region *region;
+	struct rb_node *p;
 	int size = kobjsize(mm);
 
 	down_read(&mm->mmap_sem);
-	for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) {
-		size += kobjsize(tbp);
-		if (tbp->vma) {
-			size += kobjsize(tbp->vma);
-			size += kobjsize((void *) tbp->vma->vm_start);
+	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
+		vma = rb_entry(p, struct vm_area_struct, vm_rb);
+		size += kobjsize(vma);
+		region = vma->vm_region;
+		if (region) {
+			size += kobjsize(region);
+			size += region->vm_end - region->vm_start;
 		}
 	}
 
@@ -105,20 +117,62 @@ int task_statm(struct mm_struct *mm, int *shared, int *text,
 }
 
 /*
+ * display a single VMA to a sequenced file
+ */
+static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
+{
+	unsigned long ino = 0;
+	struct file *file;
+	dev_t dev = 0;
+	int flags, len;
+
+	flags = vma->vm_flags;
+	file = vma->vm_file;
+
+	if (file) {
+		struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+		dev = inode->i_sb->s_dev;
+		ino = inode->i_ino;
+	}
+
+	seq_printf(m,
+		   "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n",
+		   vma->vm_start,
+		   vma->vm_end,
+		   flags & VM_READ ? 'r' : '-',
+		   flags & VM_WRITE ? 'w' : '-',
+		   flags & VM_EXEC ? 'x' : '-',
+		   flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
+		   vma->vm_pgoff << PAGE_SHIFT,
+		   MAJOR(dev), MINOR(dev), ino, &len);
+
+	if (file) {
+		len = 25 + sizeof(void *) * 6 - len;
+		if (len < 1)
+			len = 1;
+		seq_printf(m, "%*c", len, ' ');
+		seq_path(m, &file->f_path, "");
+	}
+
+	seq_putc(m, '\n');
+	return 0;
+}
+
+/*
  * display mapping lines for a particular process's /proc/pid/maps
  */
-static int show_map(struct seq_file *m, void *_vml)
+static int show_map(struct seq_file *m, void *_p)
 {
-	struct vm_list_struct *vml = _vml;
+	struct rb_node *p = _p;
 
-	return nommu_vma_show(m, vml->vma);
+	return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb));
 }
 
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
 	struct proc_maps_private *priv = m->private;
-	struct vm_list_struct *vml;
 	struct mm_struct *mm;
+	struct rb_node *p;
 	loff_t n = *pos;
 
 	/* pin the task and mm whilst we play with them */
@@ -134,9 +188,9 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 	}
 
 	/* start from the Nth VMA */
-	for (vml = mm->context.vmlist; vml; vml = vml->next)
+	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
 		if (n-- == 0)
-			return vml;
+			return p;
 	return NULL;
 }
 
@@ -152,12 +206,12 @@ static void m_stop(struct seq_file *m, void *_vml)
 	}
 }
 
-static void *m_next(struct seq_file *m, void *_vml, loff_t *pos)
+static void *m_next(struct seq_file *m, void *_p, loff_t *pos)
 {
-	struct vm_list_struct *vml = _vml;
+	struct rb_node *p = _p;
 
 	(*pos)++;
-	return vml ? vml->next : NULL;
+	return p ? rb_next(p) : NULL;
 }
 
 static const struct seq_operations proc_pid_maps_ops = {
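Editor's note: the conversion above repeatedly uses rb_entry(p, struct vm_area_struct, vm_rb) to get from the rb_node embedded in a VMA back to the VMA itself; rb_entry() is just container_of(). A self-contained sketch with an illustrative structure (vma_like is a hypothetical stand-in, not a kernel type):

	#include <linux/rbtree.h>

	struct vma_like {			/* stand-in for vm_area_struct */
		unsigned long vm_start, vm_end;
		struct rb_node vm_rb;		/* node embedded in the object */
	};

	static unsigned long vma_length(struct rb_node *p)
	{
		/* rb_entry() subtracts offsetof(struct vma_like, vm_rb) from p */
		struct vma_like *vma = rb_entry(p, struct vma_like, vm_rb);

		return vma->vm_end - vma->vm_start;
	}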
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 03ec59504906..5edcc3f92ba7 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -47,8 +47,6 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
 
 	offset = (unsigned long)(*ppos % PAGE_SIZE);
 	pfn = (unsigned long)(*ppos / PAGE_SIZE);
-	if (pfn > saved_max_pfn)
-		return -EINVAL;
 
 	do {
 		if (count > (PAGE_SIZE - offset))
diff --git a/fs/quota.c b/fs/quota.c
index b7fe44e01618..4a8c94f05f76 100644
--- a/fs/quota.c
+++ b/fs/quota.c
@@ -73,7 +73,7 @@ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid
 	case Q_SETQUOTA:
 	case Q_GETQUOTA:
 		/* This is just informative test so we are satisfied without a lock */
-		if (!sb_has_quota_enabled(sb, type))
+		if (!sb_has_quota_active(sb, type))
 			return -ESRCH;
 	}
 
@@ -160,6 +160,9 @@ static void quota_sync_sb(struct super_block *sb, int type)
 	int cnt;
 
 	sb->s_qcop->quota_sync(sb, type);
+
+	if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)
+		return;
 	/* This is not very clever (and fast) but currently I don't know about
 	 * any other simple way of getting quota data to disk and we must get
 	 * them there for userspace to be visible... */
@@ -175,7 +178,7 @@ static void quota_sync_sb(struct super_block *sb, int type)
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (type != -1 && cnt != type)
 			continue;
-		if (!sb_has_quota_enabled(sb, cnt))
+		if (!sb_has_quota_active(sb, cnt))
 			continue;
 		mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, I_MUTEX_QUOTA);
 		truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
@@ -201,7 +204,7 @@ restart:
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (type != -1 && type != cnt)
 			continue;
-		if (!sb_has_quota_enabled(sb, cnt))
+		if (!sb_has_quota_active(sb, cnt))
 			continue;
 		if (!info_dirty(&sb_dqopt(sb)->info[cnt]) &&
 		    list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list))
@@ -245,7 +248,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void
 	__u32 fmt;
 
 	down_read(&sb_dqopt(sb)->dqptr_sem);
-	if (!sb_has_quota_enabled(sb, type)) {
+	if (!sb_has_quota_active(sb, type)) {
 		up_read(&sb_dqopt(sb)->dqptr_sem);
 		return -ESRCH;
 	}
diff --git a/fs/quota_tree.c b/fs/quota_tree.c
new file mode 100644
index 000000000000..953404c95b17
--- /dev/null
+++ b/fs/quota_tree.c
@@ -0,0 +1,645 @@
1/*
2 * vfsv0 quota IO operations on file
3 */
4
5#include <linux/errno.h>
6#include <linux/fs.h>
7#include <linux/mount.h>
8#include <linux/dqblk_v2.h>
9#include <linux/kernel.h>
10#include <linux/init.h>
11#include <linux/module.h>
12#include <linux/slab.h>
13#include <linux/quotaops.h>
14
15#include <asm/byteorder.h>
16
17#include "quota_tree.h"
18
19MODULE_AUTHOR("Jan Kara");
20MODULE_DESCRIPTION("Quota trie support");
21MODULE_LICENSE("GPL");
22
23#define __QUOTA_QT_PARANOIA
24
25typedef char *dqbuf_t;
26
27static int get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth)
28{
29 unsigned int epb = info->dqi_usable_bs >> 2;
30
31 depth = info->dqi_qtree_depth - depth - 1;
32 while (depth--)
33 id /= epb;
34 return id % epb;
35}
36
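Editor's worked example (not part of the file): with the v2 format's 1024-byte blocks, epb = 1024 >> 2 = 256 references per tree block, and for a 4-level tree get_index() reduces to picking one byte of the 32-bit id per level:

	get_index(info, id, 0) == (id >> 24) & 0xff
	get_index(info, id, 1) == (id >> 16) & 0xff
	get_index(info, id, 2) == (id >>  8) & 0xff
	get_index(info, id, 3) ==  id        & 0xff

i.e. exactly the GETIDINDEX() macro that this patch deletes from quota_v2.c.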
37/* Number of entries in one block */
38static inline int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info)
39{
40 return (info->dqi_usable_bs - sizeof(struct qt_disk_dqdbheader))
41 / info->dqi_entry_size;
42}
43
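Editor's note: concretely, for the v2 format below (1024-byte usable blocks, 48-byte on-disk entries) this works out to

	(1024 - sizeof(struct qt_disk_dqdbheader)) / 48 = (1024 - 16) / 48 = 21

entries per data block -- the "exactly 21 quota-entries" promised by the header comment in quota_tree.h.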
44static dqbuf_t getdqbuf(size_t size)
45{
46 dqbuf_t buf = kmalloc(size, GFP_NOFS);
47 if (!buf)
48 printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n");
49 return buf;
50}
51
52static inline void freedqbuf(dqbuf_t buf)
53{
54 kfree(buf);
55}
56
57static inline ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf)
58{
59 struct super_block *sb = info->dqi_sb;
60
61 memset(buf, 0, info->dqi_usable_bs);
62 return sb->s_op->quota_read(sb, info->dqi_type, (char *)buf,
63 info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
64}
65
66static inline ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf)
67{
68 struct super_block *sb = info->dqi_sb;
69
70 return sb->s_op->quota_write(sb, info->dqi_type, (char *)buf,
71 info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
72}
73
74/* Remove empty block from list and return it */
75static int get_free_dqblk(struct qtree_mem_dqinfo *info)
76{
77 dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
78 struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
79 int ret, blk;
80
81 if (!buf)
82 return -ENOMEM;
83 if (info->dqi_free_blk) {
84 blk = info->dqi_free_blk;
85 ret = read_blk(info, blk, buf);
86 if (ret < 0)
87 goto out_buf;
88 info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
89 }
90 else {
91 memset(buf, 0, info->dqi_usable_bs);
92 /* Assure block allocation... */
93 ret = write_blk(info, info->dqi_blocks, buf);
94 if (ret < 0)
95 goto out_buf;
96 blk = info->dqi_blocks++;
97 }
98 mark_info_dirty(info->dqi_sb, info->dqi_type);
99 ret = blk;
100out_buf:
101 freedqbuf(buf);
102 return ret;
103}
104
105/* Insert empty block to the list */
106static int put_free_dqblk(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
107{
108 struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
109 int err;
110
111 dh->dqdh_next_free = cpu_to_le32(info->dqi_free_blk);
112 dh->dqdh_prev_free = cpu_to_le32(0);
113 dh->dqdh_entries = cpu_to_le16(0);
114 err = write_blk(info, blk, buf);
115 if (err < 0)
116 return err;
117 info->dqi_free_blk = blk;
118 mark_info_dirty(info->dqi_sb, info->dqi_type);
119 return 0;
120}
121
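Editor's sketch of the structure the two helpers above maintain -- a LIFO list of wholly free blocks threaded through the on-disk headers (block numbers illustrative):

	info->dqi_free_blk --> [blk 7: dqdh_next_free = 3] --> [blk 3: dqdh_next_free = 0]

get_free_dqblk() pops the head, or extends the file at dqi_blocks when the list is empty; put_free_dqblk() pushes a freshly emptied block back on and marks the quota info dirty.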
122/* Remove given block from the list of blocks with free entries */
123static int remove_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
124{
125 dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs);
126 struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
127 uint nextblk = le32_to_cpu(dh->dqdh_next_free);
128 uint prevblk = le32_to_cpu(dh->dqdh_prev_free);
129 int err;
130
131 if (!tmpbuf)
132 return -ENOMEM;
133 if (nextblk) {
134 err = read_blk(info, nextblk, tmpbuf);
135 if (err < 0)
136 goto out_buf;
137 ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free =
138 dh->dqdh_prev_free;
139 err = write_blk(info, nextblk, tmpbuf);
140 if (err < 0)
141 goto out_buf;
142 }
143 if (prevblk) {
144 err = read_blk(info, prevblk, tmpbuf);
145 if (err < 0)
146 goto out_buf;
147 ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_next_free =
148 dh->dqdh_next_free;
149 err = write_blk(info, prevblk, tmpbuf);
150 if (err < 0)
151 goto out_buf;
152 } else {
153 info->dqi_free_entry = nextblk;
154 mark_info_dirty(info->dqi_sb, info->dqi_type);
155 }
156 freedqbuf(tmpbuf);
157 dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
158 /* No matter whether the write succeeds, the block is out of the list */
159 if (write_blk(info, blk, buf) < 0)
160 printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk);
161 return 0;
162out_buf:
163 freedqbuf(tmpbuf);
164 return err;
165}
166
167/* Insert given block to the beginning of list with free entries */
168static int insert_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
169{
170 dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs);
171 struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
172 int err;
173
174 if (!tmpbuf)
175 return -ENOMEM;
176 dh->dqdh_next_free = cpu_to_le32(info->dqi_free_entry);
177 dh->dqdh_prev_free = cpu_to_le32(0);
178 err = write_blk(info, blk, buf);
179 if (err < 0)
180 goto out_buf;
181 if (info->dqi_free_entry) {
182 err = read_blk(info, info->dqi_free_entry, tmpbuf);
183 if (err < 0)
184 goto out_buf;
185 ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free =
186 cpu_to_le32(blk);
187 err = write_blk(info, info->dqi_free_entry, tmpbuf);
188 if (err < 0)
189 goto out_buf;
190 }
191 freedqbuf(tmpbuf);
192 info->dqi_free_entry = blk;
193 mark_info_dirty(info->dqi_sb, info->dqi_type);
194 return 0;
195out_buf:
196 freedqbuf(tmpbuf);
197 return err;
198}
199
200/* Is the entry in the block free? */
201int qtree_entry_unused(struct qtree_mem_dqinfo *info, char *disk)
202{
203 int i;
204
205 for (i = 0; i < info->dqi_entry_size; i++)
206 if (disk[i])
207 return 0;
208 return 1;
209}
210EXPORT_SYMBOL(qtree_entry_unused);
211
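Editor's note: "unused" here means literally all-zero bytes, so a live dquot whose on-disk image happened to be all zeroes would be indistinguishable from a free slot. The v2 callbacks later in this patch (v2_mem2diskdqb()/v2_disk2memdqb() in quota_v2.c) escape that case by storing dqb_itime = 1 on disk and mapping it back to 0 on read.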
212/* Find space for dquot */
213static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
214 struct dquot *dquot, int *err)
215{
216 uint blk, i;
217 struct qt_disk_dqdbheader *dh;
218 dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
219 char *ddquot;
220
221 *err = 0;
222 if (!buf) {
223 *err = -ENOMEM;
224 return 0;
225 }
226 dh = (struct qt_disk_dqdbheader *)buf;
227 if (info->dqi_free_entry) {
228 blk = info->dqi_free_entry;
229 *err = read_blk(info, blk, buf);
230 if (*err < 0)
231 goto out_buf;
232 } else {
233 blk = get_free_dqblk(info);
234 if ((int)blk < 0) {
235 *err = blk;
236 freedqbuf(buf);
237 return 0;
238 }
239 memset(buf, 0, info->dqi_usable_bs);
240 /* This is enough as block is already zeroed and entry list is empty... */
241 info->dqi_free_entry = blk;
242 mark_info_dirty(dquot->dq_sb, dquot->dq_type);
243 }
244 /* Block will be full? */
245 if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) {
246 *err = remove_free_dqentry(info, buf, blk);
247 if (*err < 0) {
248 printk(KERN_ERR "VFS: find_free_dqentry(): Can't "
249 "remove block (%u) from entry free list.\n",
250 blk);
251 goto out_buf;
252 }
253 }
254 le16_add_cpu(&dh->dqdh_entries, 1);
255 /* Find free structure in block */
256 for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader);
257 i < qtree_dqstr_in_blk(info) && !qtree_entry_unused(info, ddquot);
258 i++, ddquot += info->dqi_entry_size);
259#ifdef __QUOTA_QT_PARANOIA
260 if (i == qtree_dqstr_in_blk(info)) {
261 printk(KERN_ERR "VFS: find_free_dqentry(): Data block full "
262 "but it shouldn't.\n");
263 *err = -EIO;
264 goto out_buf;
265 }
266#endif
267 *err = write_blk(info, blk, buf);
268 if (*err < 0) {
269 printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota "
270 "data block %u.\n", blk);
271 goto out_buf;
272 }
273 dquot->dq_off = (blk << info->dqi_blocksize_bits) +
274 sizeof(struct qt_disk_dqdbheader) +
275 i * info->dqi_entry_size;
276 freedqbuf(buf);
277 return blk;
278out_buf:
279 freedqbuf(buf);
280 return 0;
281}
282
283/* Insert reference to structure into the trie */
284static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
285 uint *treeblk, int depth)
286{
287 dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
288 int ret = 0, newson = 0, newact = 0;
289 __le32 *ref;
290 uint newblk;
291
292 if (!buf)
293 return -ENOMEM;
294 if (!*treeblk) {
295 ret = get_free_dqblk(info);
296 if (ret < 0)
297 goto out_buf;
298 *treeblk = ret;
299 memset(buf, 0, info->dqi_usable_bs);
300 newact = 1;
301 } else {
302 ret = read_blk(info, *treeblk, buf);
303 if (ret < 0) {
304 printk(KERN_ERR "VFS: Can't read tree quota block "
305 "%u.\n", *treeblk);
306 goto out_buf;
307 }
308 }
309 ref = (__le32 *)buf;
310 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
311 if (!newblk)
312 newson = 1;
313 if (depth == info->dqi_qtree_depth - 1) {
314#ifdef __QUOTA_QT_PARANOIA
315 if (newblk) {
316 printk(KERN_ERR "VFS: Inserting already present quota "
317 "entry (block %u).\n",
318 le32_to_cpu(ref[get_index(info,
319 dquot->dq_id, depth)]));
320 ret = -EIO;
321 goto out_buf;
322 }
323#endif
324 newblk = find_free_dqentry(info, dquot, &ret);
325 } else {
326 ret = do_insert_tree(info, dquot, &newblk, depth+1);
327 }
328 if (newson && ret >= 0) {
329 ref[get_index(info, dquot->dq_id, depth)] =
330 cpu_to_le32(newblk);
331 ret = write_blk(info, *treeblk, buf);
332 } else if (newact && ret < 0) {
333 put_free_dqblk(info, buf, *treeblk);
334 }
335out_buf:
336 freedqbuf(buf);
337 return ret;
338}
339
340/* Wrapper for inserting quota structure into tree */
341static inline int dq_insert_tree(struct qtree_mem_dqinfo *info,
342 struct dquot *dquot)
343{
344 int tmp = QT_TREEOFF;
345 return do_insert_tree(info, dquot, &tmp, 0);
346}
347
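Editor's sketch of one insert, for id 0x00010203 in a 4-level tree (path bytes 0x00, 0x01, 0x02, 0x03 per the get_index() example earlier):

	root block (QT_TREEOFF)
	  ref[0x00] --> interior block
	    ref[0x01] --> interior block
	      ref[0x02] --> interior block
	        ref[0x03] --> data block holding the dquot entry

do_insert_tree() allocates missing blocks on the way down (newact), lets find_free_dqentry() place the entry at the leaf, and only writes the new reference once the deeper level succeeded; on failure a freshly allocated block is handed back via put_free_dqblk().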
348/*
349 * We don't have to be afraid of deadlocks as we never have quotas on quota files...
350 */
351int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
352{
353 int type = dquot->dq_type;
354 struct super_block *sb = dquot->dq_sb;
355 ssize_t ret;
356 dqbuf_t ddquot = getdqbuf(info->dqi_entry_size);
357
358 if (!ddquot)
359 return -ENOMEM;
360
361 /* dq_off is guarded by dqio_mutex */
362 if (!dquot->dq_off) {
363 ret = dq_insert_tree(info, dquot);
364 if (ret < 0) {
365 printk(KERN_ERR "VFS: Error %zd occurred while "
366 "creating quota.\n", ret);
367 freedqbuf(ddquot);
368 return ret;
369 }
370 }
371 spin_lock(&dq_data_lock);
372 info->dqi_ops->mem2disk_dqblk(ddquot, dquot);
373 spin_unlock(&dq_data_lock);
374 ret = sb->s_op->quota_write(sb, type, (char *)ddquot,
375 info->dqi_entry_size, dquot->dq_off);
376 if (ret != info->dqi_entry_size) {
377 printk(KERN_WARNING "VFS: dquota write failed on dev %s\n",
378 sb->s_id);
379 if (ret >= 0)
380 ret = -ENOSPC;
381 } else {
382 ret = 0;
383 }
384 dqstats.writes++;
385 freedqbuf(ddquot);
386
387 return ret;
388}
389EXPORT_SYMBOL(qtree_write_dquot);
390
391/* Free dquot entry in data block */
392static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
393 uint blk)
394{
395 struct qt_disk_dqdbheader *dh;
396 dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
397 int ret = 0;
398
399 if (!buf)
400 return -ENOMEM;
401 if (dquot->dq_off >> info->dqi_blocksize_bits != blk) {
402 printk(KERN_ERR "VFS: Quota structure has offset to other "
403 "block (%u) than it should (%u).\n", blk,
404 (uint)(dquot->dq_off >> info->dqi_blocksize_bits));
405 goto out_buf;
406 }
407 ret = read_blk(info, blk, buf);
408 if (ret < 0) {
409 printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
410 goto out_buf;
411 }
412 dh = (struct qt_disk_dqdbheader *)buf;
413 le16_add_cpu(&dh->dqdh_entries, -1);
414 if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */
415 ret = remove_free_dqentry(info, buf, blk);
416 if (ret >= 0)
417 ret = put_free_dqblk(info, buf, blk);
418 if (ret < 0) {
419 printk(KERN_ERR "VFS: Can't move quota data block (%u) "
420 "to free list.\n", blk);
421 goto out_buf;
422 }
423 } else {
424 memset(buf +
425 (dquot->dq_off & ((1 << info->dqi_blocksize_bits) - 1)),
426 0, info->dqi_entry_size);
427 if (le16_to_cpu(dh->dqdh_entries) ==
428 qtree_dqstr_in_blk(info) - 1) {
429 /* Insert will write block itself */
430 ret = insert_free_dqentry(info, buf, blk);
431 if (ret < 0) {
432 printk(KERN_ERR "VFS: Can't insert quota data "
433 "block (%u) to free entry list.\n", blk);
434 goto out_buf;
435 }
436 } else {
437 ret = write_blk(info, blk, buf);
438 if (ret < 0) {
439 printk(KERN_ERR "VFS: Can't write quota data "
440 "block %u\n", blk);
441 goto out_buf;
442 }
443 }
444 }
445 dquot->dq_off = 0; /* Quota is now unattached */
446out_buf:
447 freedqbuf(buf);
448 return ret;
449}
450
451/* Remove reference to dquot from tree */
452static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
453 uint *blk, int depth)
454{
455 dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
456 int ret = 0;
457 uint newblk;
458 __le32 *ref = (__le32 *)buf;
459
460 if (!buf)
461 return -ENOMEM;
462 ret = read_blk(info, *blk, buf);
463 if (ret < 0) {
464 printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
465 goto out_buf;
466 }
467 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
468 if (depth == info->dqi_qtree_depth - 1) {
469 ret = free_dqentry(info, dquot, newblk);
470 newblk = 0;
471 } else {
472 ret = remove_tree(info, dquot, &newblk, depth+1);
473 }
474 if (ret >= 0 && !newblk) {
475 int i;
476 ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0);
477 /* Block got empty? */
478 for (i = 0;
479 i < (info->dqi_usable_bs >> 2) && !ref[i];
480 i++);
481 /* Don't put the root block into the free block list */
482 if (i == (info->dqi_usable_bs >> 2)
483 && *blk != QT_TREEOFF) {
484 put_free_dqblk(info, buf, *blk);
485 *blk = 0;
486 } else {
487 ret = write_blk(info, *blk, buf);
488 if (ret < 0)
489 printk(KERN_ERR "VFS: Can't write quota tree "
490 "block %u.\n", *blk);
491 }
492 }
493out_buf:
494 freedqbuf(buf);
495 return ret;
496}
497
498/* Delete dquot from tree */
499int qtree_delete_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
500{
501 uint tmp = QT_TREEOFF;
502
503 if (!dquot->dq_off) /* Even not allocated? */
504 return 0;
505 return remove_tree(info, dquot, &tmp, 0);
506}
507EXPORT_SYMBOL(qtree_delete_dquot);
508
509/* Find entry in block */
510static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
511 struct dquot *dquot, uint blk)
512{
513 dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
514 loff_t ret = 0;
515 int i;
516 char *ddquot;
517
518 if (!buf)
519 return -ENOMEM;
520 ret = read_blk(info, blk, buf);
521 if (ret < 0) {
522 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
523 goto out_buf;
524 }
525 for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader);
526 i < qtree_dqstr_in_blk(info) && !info->dqi_ops->is_id(ddquot, dquot);
527 i++, ddquot += info->dqi_entry_size);
528 if (i == qtree_dqstr_in_blk(info)) {
529 printk(KERN_ERR "VFS: Quota for id %u referenced "
530 "but not present.\n", dquot->dq_id);
531 ret = -EIO;
532 goto out_buf;
533 } else {
534 ret = (blk << info->dqi_blocksize_bits) + sizeof(struct
535 qt_disk_dqdbheader) + i * info->dqi_entry_size;
536 }
537out_buf:
538 freedqbuf(buf);
539 return ret;
540}
541
542/* Find entry for given id in the tree */
543static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
544 struct dquot *dquot, uint blk, int depth)
545{
546 dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
547 loff_t ret = 0;
548 __le32 *ref = (__le32 *)buf;
549
550 if (!buf)
551 return -ENOMEM;
552 ret = read_blk(info, blk, buf);
553 if (ret < 0) {
554 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
555 goto out_buf;
556 }
557 ret = 0;
558 blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
559 if (!blk) /* No reference? */
560 goto out_buf;
561 if (depth < info->dqi_qtree_depth - 1)
562 ret = find_tree_dqentry(info, dquot, blk, depth+1);
563 else
564 ret = find_block_dqentry(info, dquot, blk);
565out_buf:
566 freedqbuf(buf);
567 return ret;
568}
569
570/* Find entry for given id in the tree - wrapper function */
571static inline loff_t find_dqentry(struct qtree_mem_dqinfo *info,
572 struct dquot *dquot)
573{
574 return find_tree_dqentry(info, dquot, QT_TREEOFF, 0);
575}
576
577int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
578{
579 int type = dquot->dq_type;
580 struct super_block *sb = dquot->dq_sb;
581 loff_t offset;
582 dqbuf_t ddquot;
583 int ret = 0;
584
585#ifdef __QUOTA_QT_PARANOIA
586 /* Invalidated quota? */
587 if (!sb_dqopt(dquot->dq_sb)->files[type]) {
588 printk(KERN_ERR "VFS: Quota invalidated while reading!\n");
589 return -EIO;
590 }
591#endif
592 /* Do we know offset of the dquot entry in the quota file? */
593 if (!dquot->dq_off) {
594 offset = find_dqentry(info, dquot);
595 if (offset <= 0) { /* Entry not present? */
596 if (offset < 0)
597 printk(KERN_ERR "VFS: Can't read quota "
598 "structure for id %u.\n", dquot->dq_id);
599 dquot->dq_off = 0;
600 set_bit(DQ_FAKE_B, &dquot->dq_flags);
601 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
602 ret = offset;
603 goto out;
604 }
605 dquot->dq_off = offset;
606 }
607 ddquot = getdqbuf(info->dqi_entry_size);
608 if (!ddquot)
609 return -ENOMEM;
610 ret = sb->s_op->quota_read(sb, type, (char *)ddquot,
611 info->dqi_entry_size, dquot->dq_off);
612 if (ret != info->dqi_entry_size) {
613 if (ret >= 0)
614 ret = -EIO;
615 printk(KERN_ERR "VFS: Error while reading quota "
616 "structure for id %u.\n", dquot->dq_id);
617 set_bit(DQ_FAKE_B, &dquot->dq_flags);
618 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
619 freedqbuf(ddquot);
620 goto out;
621 }
622 spin_lock(&dq_data_lock);
623 info->dqi_ops->disk2mem_dqblk(dquot, ddquot);
624 if (!dquot->dq_dqb.dqb_bhardlimit &&
625 !dquot->dq_dqb.dqb_bsoftlimit &&
626 !dquot->dq_dqb.dqb_ihardlimit &&
627 !dquot->dq_dqb.dqb_isoftlimit)
628 set_bit(DQ_FAKE_B, &dquot->dq_flags);
629 spin_unlock(&dq_data_lock);
630 freedqbuf(ddquot);
631out:
632 dqstats.reads++;
633 return ret;
634}
635EXPORT_SYMBOL(qtree_read_dquot);
636
637/* Check whether dquot should not be deleted. We know we are
638 * the only one operating on dquot (thanks to dq_lock) */
639int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
640{
641 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace))
642 return qtree_delete_dquot(info, dquot);
643 return 0;
644}
645EXPORT_SYMBOL(qtree_release_dquot);
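Editor's note: a condensed sketch (not from the patch) of how a format driver is expected to plug into these helpers; quota_v2.c below does exactly this in v2_read_file_info() and its callbacks. All my_* names are hypothetical, and a real driver would also load dqi_blocks/dqi_free_blk/dqi_free_entry from its on-disk info block:

	static void my_mem2diskdqb(void *dp, struct dquot *dquot) { /* pack entry */ }
	static void my_disk2memdqb(struct dquot *dquot, void *dp) { /* unpack entry */ }
	static int my_is_id(void *dp, struct dquot *dquot) { return 0; /* match id */ }

	static struct qtree_fmt_operations my_qtree_ops = {
		.mem2disk_dqblk	= my_mem2diskdqb,
		.disk2mem_dqblk	= my_disk2memdqb,
		.is_id		= my_is_id,
	};

	static int my_read_file_info(struct super_block *sb, int type)
	{
		struct qtree_mem_dqinfo *qinfo = kmalloc(sizeof(*qinfo), GFP_NOFS);

		if (!qinfo)
			return -1;
		qinfo->dqi_sb = sb;
		qinfo->dqi_type = type;
		qinfo->dqi_blocksize_bits = 10;		/* 1024-byte blocks */
		qinfo->dqi_usable_bs = 1024;
		qinfo->dqi_qtree_depth = 4;
		qinfo->dqi_entry_size = 48;		/* on-disk dquot size */
		qinfo->dqi_ops = &my_qtree_ops;
		sb_dqinfo(sb, type)->dqi_priv = qinfo;
		/* quota_operations then route through the generic helpers,
		 * e.g. qtree_read_dquot(sb_dqinfo(sb, type)->dqi_priv, dquot),
		 * just as v2_read_dquot() does below. */
		return 0;
	}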
diff --git a/fs/quota_tree.h b/fs/quota_tree.h
new file mode 100644
index 000000000000..a1ab8db81a51
--- /dev/null
+++ b/fs/quota_tree.h
@@ -0,0 +1,25 @@
1/*
2 * Definitions of structures for vfsv0 quota format
3 */
4
5#ifndef _LINUX_QUOTA_TREE_H
6#define _LINUX_QUOTA_TREE_H
7
8#include <linux/types.h>
9#include <linux/quota.h>
10
11/*
12 * Structure of header of block with quota structures. It is padded to 16 bytes so
13 * there will be space for exactly 21 quota-entries in a block
14 */
15struct qt_disk_dqdbheader {
16 __le32 dqdh_next_free; /* Number of next block with free entry */
17 __le32 dqdh_prev_free; /* Number of previous block with free entry */
18 __le16 dqdh_entries; /* Number of valid entries in block */
19 __le16 dqdh_pad1;
20 __le32 dqdh_pad2;
21};
22
23#define QT_TREEOFF 1 /* Offset of tree in file in blocks */
24
25#endif /* _LINUX_QUOTA_TREE_H */
diff --git a/fs/quota_v1.c b/fs/quota_v1.c
index 5ae15b13eeb0..b4af1c69ad16 100644
--- a/fs/quota_v1.c
+++ b/fs/quota_v1.c
@@ -3,25 +3,39 @@
 #include <linux/quota.h>
 #include <linux/quotaops.h>
 #include <linux/dqblk_v1.h>
-#include <linux/quotaio_v1.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/module.h>
 
 #include <asm/byteorder.h>
 
+#include "quotaio_v1.h"
+
 MODULE_AUTHOR("Jan Kara");
 MODULE_DESCRIPTION("Old quota format support");
 MODULE_LICENSE("GPL");
 
+#define QUOTABLOCK_BITS 10
+#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
+
+static inline qsize_t v1_stoqb(qsize_t space)
+{
+	return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS;
+}
+
+static inline qsize_t v1_qbtos(qsize_t blocks)
+{
+	return blocks << QUOTABLOCK_BITS;
+}
+
 static void v1_disk2mem_dqblk(struct mem_dqblk *m, struct v1_disk_dqblk *d)
 {
 	m->dqb_ihardlimit = d->dqb_ihardlimit;
 	m->dqb_isoftlimit = d->dqb_isoftlimit;
 	m->dqb_curinodes = d->dqb_curinodes;
-	m->dqb_bhardlimit = d->dqb_bhardlimit;
-	m->dqb_bsoftlimit = d->dqb_bsoftlimit;
-	m->dqb_curspace = ((qsize_t)d->dqb_curblocks) << QUOTABLOCK_BITS;
+	m->dqb_bhardlimit = v1_qbtos(d->dqb_bhardlimit);
+	m->dqb_bsoftlimit = v1_qbtos(d->dqb_bsoftlimit);
+	m->dqb_curspace = v1_qbtos(d->dqb_curblocks);
 	m->dqb_itime = d->dqb_itime;
 	m->dqb_btime = d->dqb_btime;
 }
@@ -31,9 +45,9 @@ static void v1_mem2disk_dqblk(struct v1_disk_dqblk *d, struct mem_dqblk *m)
 	d->dqb_ihardlimit = m->dqb_ihardlimit;
 	d->dqb_isoftlimit = m->dqb_isoftlimit;
 	d->dqb_curinodes = m->dqb_curinodes;
-	d->dqb_bhardlimit = m->dqb_bhardlimit;
-	d->dqb_bsoftlimit = m->dqb_bsoftlimit;
-	d->dqb_curblocks = toqb(m->dqb_curspace);
+	d->dqb_bhardlimit = v1_stoqb(m->dqb_bhardlimit);
+	d->dqb_bsoftlimit = v1_stoqb(m->dqb_bsoftlimit);
+	d->dqb_curblocks = v1_stoqb(m->dqb_curspace);
 	d->dqb_itime = m->dqb_itime;
 	d->dqb_btime = m->dqb_btime;
 }
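Editor's worked example for the new helpers: with QUOTABLOCK_BITS = 10 a quota block is 1 KiB, so

	v1_stoqb(1025) = (1025 + 1023) >> 10 = 2	(round up to whole blocks)
	v1_qbtos(2)    = 2 << 10 = 2048			(back to bytes)

which is why the hunks above now convert the limits explicitly: the in-memory mem_dqblk limits are kept as byte counts in this kernel, while the v1 on-disk format keeps them in 1 KiB blocks.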
diff --git a/fs/quota_v2.c b/fs/quota_v2.c
index b53827dc02d9..b618b563635c 100644
--- a/fs/quota_v2.c
+++ b/fs/quota_v2.c
@@ -6,7 +6,6 @@
 #include <linux/fs.h>
 #include <linux/mount.h>
 #include <linux/dqblk_v2.h>
-#include <linux/quotaio_v2.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/module.h>
@@ -15,16 +14,37 @@
 
 #include <asm/byteorder.h>
 
+#include "quota_tree.h"
+#include "quotaio_v2.h"
+
 MODULE_AUTHOR("Jan Kara");
 MODULE_DESCRIPTION("Quota format v2 support");
 MODULE_LICENSE("GPL");
 
 #define __QUOTA_V2_PARANOIA
 
-typedef char *dqbuf_t;
+static void v2_mem2diskdqb(void *dp, struct dquot *dquot);
+static void v2_disk2memdqb(struct dquot *dquot, void *dp);
+static int v2_is_id(void *dp, struct dquot *dquot);
+
+static struct qtree_fmt_operations v2_qtree_ops = {
+	.mem2disk_dqblk = v2_mem2diskdqb,
+	.disk2mem_dqblk = v2_disk2memdqb,
+	.is_id = v2_is_id,
+};
+
+#define QUOTABLOCK_BITS 10
+#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
 
-#define GETIDINDEX(id, depth) (((id) >> ((V2_DQTREEDEPTH-(depth)-1)*8)) & 0xff)
-#define GETENTRIES(buf) ((struct v2_disk_dqblk *)(((char *)buf)+sizeof(struct v2_disk_dqdbheader)))
+static inline qsize_t v2_stoqb(qsize_t space)
+{
+	return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS;
+}
+
+static inline qsize_t v2_qbtos(qsize_t blocks)
+{
+	return blocks << QUOTABLOCK_BITS;
+}
 
 /* Check whether given file is really vfsv0 quotafile */
 static int v2_check_quota_file(struct super_block *sb, int type)
@@ -50,7 +70,8 @@ static int v2_check_quota_file(struct super_block *sb, int type)
 static int v2_read_file_info(struct super_block *sb, int type)
 {
 	struct v2_disk_dqinfo dinfo;
-	struct mem_dqinfo *info = sb_dqopt(sb)->info+type;
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct qtree_mem_dqinfo *qinfo;
 	ssize_t size;
 
 	size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
@@ -60,15 +81,29 @@ static int v2_read_file_info(struct super_block *sb, int type)
 			sb->s_id);
 		return -1;
 	}
+	info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS);
+	if (!info->dqi_priv) {
+		printk(KERN_WARNING
+		       "Not enough memory for quota information structure.\n");
+		return -1;
+	}
+	qinfo = info->dqi_priv;
 	/* limits are stored as unsigned 32-bit data */
 	info->dqi_maxblimit = 0xffffffff;
 	info->dqi_maxilimit = 0xffffffff;
 	info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
 	info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
 	info->dqi_flags = le32_to_cpu(dinfo.dqi_flags);
-	info->u.v2_i.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
-	info->u.v2_i.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
-	info->u.v2_i.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
+	qinfo->dqi_sb = sb;
+	qinfo->dqi_type = type;
+	qinfo->dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
+	qinfo->dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
+	qinfo->dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
+	qinfo->dqi_blocksize_bits = V2_DQBLKSIZE_BITS;
+	qinfo->dqi_usable_bs = 1 << V2_DQBLKSIZE_BITS;
+	qinfo->dqi_qtree_depth = qtree_depth(qinfo);
+	qinfo->dqi_entry_size = sizeof(struct v2_disk_dqblk);
+	qinfo->dqi_ops = &v2_qtree_ops;
 	return 0;
 }
74 109
@@ -76,7 +111,8 @@ static int v2_read_file_info(struct super_block *sb, int type)
 static int v2_write_file_info(struct super_block *sb, int type)
 {
 	struct v2_disk_dqinfo dinfo;
-	struct mem_dqinfo *info = sb_dqopt(sb)->info+type;
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct qtree_mem_dqinfo *qinfo = info->dqi_priv;
 	ssize_t size;
 
 	spin_lock(&dq_data_lock);
@@ -85,9 +121,9 @@ static int v2_write_file_info(struct super_block *sb, int type)
 	dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
 	dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
 	spin_unlock(&dq_data_lock);
-	dinfo.dqi_blocks = cpu_to_le32(info->u.v2_i.dqi_blocks);
-	dinfo.dqi_free_blk = cpu_to_le32(info->u.v2_i.dqi_free_blk);
-	dinfo.dqi_free_entry = cpu_to_le32(info->u.v2_i.dqi_free_entry);
+	dinfo.dqi_blocks = cpu_to_le32(qinfo->dqi_blocks);
+	dinfo.dqi_free_blk = cpu_to_le32(qinfo->dqi_free_blk);
+	dinfo.dqi_free_entry = cpu_to_le32(qinfo->dqi_free_entry);
 	size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
 			sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
 	if (size != sizeof(struct v2_disk_dqinfo)) {
@@ -98,574 +134,75 @@ static int v2_write_file_info(struct super_block *sb, int type)
 	return 0;
 }
 
-static void disk2memdqb(struct mem_dqblk *m, struct v2_disk_dqblk *d)
+static void v2_disk2memdqb(struct dquot *dquot, void *dp)
 {
+	struct v2_disk_dqblk *d = dp, empty;
+	struct mem_dqblk *m = &dquot->dq_dqb;
+
 	m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit);
 	m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit);
 	m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes);
 	m->dqb_itime = le64_to_cpu(d->dqb_itime);
-	m->dqb_bhardlimit = le32_to_cpu(d->dqb_bhardlimit);
-	m->dqb_bsoftlimit = le32_to_cpu(d->dqb_bsoftlimit);
+	m->dqb_bhardlimit = v2_qbtos(le32_to_cpu(d->dqb_bhardlimit));
+	m->dqb_bsoftlimit = v2_qbtos(le32_to_cpu(d->dqb_bsoftlimit));
 	m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
 	m->dqb_btime = le64_to_cpu(d->dqb_btime);
+	/* We need to escape back all-zero structure */
+	memset(&empty, 0, sizeof(struct v2_disk_dqblk));
+	empty.dqb_itime = cpu_to_le64(1);
+	if (!memcmp(&empty, dp, sizeof(struct v2_disk_dqblk)))
+		m->dqb_itime = 0;
 }
 
-static void mem2diskdqb(struct v2_disk_dqblk *d, struct mem_dqblk *m, qid_t id)
+static void v2_mem2diskdqb(void *dp, struct dquot *dquot)
 {
+	struct v2_disk_dqblk *d = dp;
+	struct mem_dqblk *m = &dquot->dq_dqb;
+	struct qtree_mem_dqinfo *info =
+			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+
 	d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit);
 	d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit);
 	d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes);
 	d->dqb_itime = cpu_to_le64(m->dqb_itime);
-	d->dqb_bhardlimit = cpu_to_le32(m->dqb_bhardlimit);
-	d->dqb_bsoftlimit = cpu_to_le32(m->dqb_bsoftlimit);
+	d->dqb_bhardlimit = cpu_to_le32(v2_stoqb(m->dqb_bhardlimit));
+	d->dqb_bsoftlimit = cpu_to_le32(v2_stoqb(m->dqb_bsoftlimit));
 	d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
 	d->dqb_btime = cpu_to_le64(m->dqb_btime);
-	d->dqb_id = cpu_to_le32(id);
+	d->dqb_id = cpu_to_le32(dquot->dq_id);
+	if (qtree_entry_unused(info, dp))
+		d->dqb_itime = cpu_to_le64(1);
 }
 
-static dqbuf_t getdqbuf(void)
-{
-	dqbuf_t buf = kmalloc(V2_DQBLKSIZE, GFP_NOFS);
-	if (!buf)
-		printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n");
-	return buf;
-}
-
-static inline void freedqbuf(dqbuf_t buf)
-{
-	kfree(buf);
-}
-
-static inline ssize_t read_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf)
-{
-	memset(buf, 0, V2_DQBLKSIZE);
-	return sb->s_op->quota_read(sb, type, (char *)buf,
-		V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS);
-}
-
-static inline ssize_t write_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf)
-{
-	return sb->s_op->quota_write(sb, type, (char *)buf,
-		V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS);
-}
-
-/* Remove empty block from list and return it */
-static int get_free_dqblk(struct super_block *sb, int type)
-{
-	dqbuf_t buf = getdqbuf();
-	struct mem_dqinfo *info = sb_dqinfo(sb, type);
-	struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
-	int ret, blk;
-
-	if (!buf)
-		return -ENOMEM;
-	if (info->u.v2_i.dqi_free_blk) {
-		blk = info->u.v2_i.dqi_free_blk;
-		if ((ret = read_blk(sb, type, blk, buf)) < 0)
-			goto out_buf;
-		info->u.v2_i.dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
-	}
-	else {
-		memset(buf, 0, V2_DQBLKSIZE);
-		/* Assure block allocation... */
-		if ((ret = write_blk(sb, type, info->u.v2_i.dqi_blocks, buf)) < 0)
-			goto out_buf;
-		blk = info->u.v2_i.dqi_blocks++;
-	}
-	mark_info_dirty(sb, type);
-	ret = blk;
-out_buf:
-	freedqbuf(buf);
-	return ret;
-}
-
-/* Insert empty block to the list */
-static int put_free_dqblk(struct super_block *sb, int type, dqbuf_t buf, uint blk)
-{
-	struct mem_dqinfo *info = sb_dqinfo(sb, type);
-	struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
-	int err;
-
-	dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_blk);
-	dh->dqdh_prev_free = cpu_to_le32(0);
-	dh->dqdh_entries = cpu_to_le16(0);
-	info->u.v2_i.dqi_free_blk = blk;
-	mark_info_dirty(sb, type);
-	/* Some strange block. We had better leave it... */
-	if ((err = write_blk(sb, type, blk, buf)) < 0)
-		return err;
-	return 0;
-}
 
+static int v2_is_id(void *dp, struct dquot *dquot)
+{
+	struct v2_disk_dqblk *d = dp;
+	struct qtree_mem_dqinfo *info =
+			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+
+	if (qtree_entry_unused(info, dp))
+		return 0;
+	return le32_to_cpu(d->dqb_id) == dquot->dq_id;
+}
+
-/* Remove given block from the list of blocks with free entries */
-static int remove_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk)
-{
-	dqbuf_t tmpbuf = getdqbuf();
-	struct mem_dqinfo *info = sb_dqinfo(sb, type);
-	struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
-	uint nextblk = le32_to_cpu(dh->dqdh_next_free), prevblk = le32_to_cpu(dh->dqdh_prev_free);
-	int err;
-
-	if (!tmpbuf)
-		return -ENOMEM;
-	if (nextblk) {
-		if ((err = read_blk(sb, type, nextblk, tmpbuf)) < 0)
-			goto out_buf;
-		((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = dh->dqdh_prev_free;
-		if ((err = write_blk(sb, type, nextblk, tmpbuf)) < 0)
-			goto out_buf;
-	}
-	if (prevblk) {
-		if ((err = read_blk(sb, type, prevblk, tmpbuf)) < 0)
-			goto out_buf;
-		((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_next_free = dh->dqdh_next_free;
-		if ((err = write_blk(sb, type, prevblk, tmpbuf)) < 0)
-			goto out_buf;
-	}
-	else {
-		info->u.v2_i.dqi_free_entry = nextblk;
-		mark_info_dirty(sb, type);
-	}
-	freedqbuf(tmpbuf);
-	dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
-	/* No matter whether write succeeds block is out of list */
-	if (write_blk(sb, type, blk, buf) < 0)
-		printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk);
-	return 0;
-out_buf:
-	freedqbuf(tmpbuf);
-	return err;
-}
-
-/* Insert given block to the beginning of list with free entries */
-static int insert_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk)
-{
-	dqbuf_t tmpbuf = getdqbuf();
-	struct mem_dqinfo *info = sb_dqinfo(sb, type);
-	struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
-	int err;
-
-	dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_entry);
-	dh->dqdh_prev_free = cpu_to_le32(0);
-	if ((err = write_blk(sb, type, blk, buf)) < 0)
-		goto out_buf;
-	if (info->u.v2_i.dqi_free_entry) {
-		if ((err = read_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0)
-			goto out_buf;
-		((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = cpu_to_le32(blk);
-		if ((err = write_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0)
-			goto out_buf;
-	}
-	freedqbuf(tmpbuf);
-	info->u.v2_i.dqi_free_entry = blk;
-	mark_info_dirty(sb, type);
-	return 0;
-out_buf:
-	freedqbuf(tmpbuf);
-	return err;
-}
-
-/* Find space for dquot */
-static uint find_free_dqentry(struct dquot *dquot, int *err)
-{
-	struct super_block *sb = dquot->dq_sb;
-	struct mem_dqinfo *info = sb_dqopt(sb)->info+dquot->dq_type;
-	uint blk, i;
-	struct v2_disk_dqdbheader *dh;
-	struct v2_disk_dqblk *ddquot;
-	struct v2_disk_dqblk fakedquot;
-	dqbuf_t buf;
-
-	*err = 0;
-	if (!(buf = getdqbuf())) {
-		*err = -ENOMEM;
-		return 0;
-	}
-	dh = (struct v2_disk_dqdbheader *)buf;
-	ddquot = GETENTRIES(buf);
-	if (info->u.v2_i.dqi_free_entry) {
-		blk = info->u.v2_i.dqi_free_entry;
-		if ((*err = read_blk(sb, dquot->dq_type, blk, buf)) < 0)
-			goto out_buf;
-	}
-	else {
-		blk = get_free_dqblk(sb, dquot->dq_type);
-		if ((int)blk < 0) {
-			*err = blk;
-			freedqbuf(buf);
-			return 0;
-		}
-		memset(buf, 0, V2_DQBLKSIZE);
-		/* This is enough as block is already zeroed and entry list is empty... */
-		info->u.v2_i.dqi_free_entry = blk;
-		mark_info_dirty(sb, dquot->dq_type);
-	}
-	if (le16_to_cpu(dh->dqdh_entries)+1 >= V2_DQSTRINBLK) /* Block will be full? */
-		if ((*err = remove_free_dqentry(sb, dquot->dq_type, buf, blk)) < 0) {
-			printk(KERN_ERR "VFS: find_free_dqentry(): Can't remove block (%u) from entry free list.\n", blk);
-			goto out_buf;
-		}
-	le16_add_cpu(&dh->dqdh_entries, 1);
-	memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk));
-	/* Find free structure in block */
-	for (i = 0; i < V2_DQSTRINBLK && memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)); i++);
-#ifdef __QUOTA_V2_PARANOIA
-	if (i == V2_DQSTRINBLK) {
-		printk(KERN_ERR "VFS: find_free_dqentry(): Data block full but it shouldn't.\n");
-		*err = -EIO;
-		goto out_buf;
-	}
-#endif
-	if ((*err = write_blk(sb, dquot->dq_type, blk, buf)) < 0) {
-		printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota data block %u.\n", blk);
-		goto out_buf;
-	}
-	dquot->dq_off = (blk<<V2_DQBLKSIZE_BITS)+sizeof(struct v2_disk_dqdbheader)+i*sizeof(struct v2_disk_dqblk);
-	freedqbuf(buf);
-	return blk;
-out_buf:
-	freedqbuf(buf);
-	return 0;
-}
-
-/* Insert reference to structure into the trie */
-static int do_insert_tree(struct dquot *dquot, uint *treeblk, int depth)
-{
-	struct super_block *sb = dquot->dq_sb;
-	dqbuf_t buf;
-	int ret = 0, newson = 0, newact = 0;
-	__le32 *ref;
-	uint newblk;
-
-	if (!(buf = getdqbuf()))
-		return -ENOMEM;
-	if (!*treeblk) {
-		ret = get_free_dqblk(sb, dquot->dq_type);
-		if (ret < 0)
-			goto out_buf;
-		*treeblk = ret;
-		memset(buf, 0, V2_DQBLKSIZE);
-		newact = 1;
-	}
-	else {
-		if ((ret = read_blk(sb, dquot->dq_type, *treeblk, buf)) < 0) {
-			printk(KERN_ERR "VFS: Can't read tree quota block %u.\n", *treeblk);
-			goto out_buf;
-		}
-	}
-	ref = (__le32 *)buf;
-	newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
-	if (!newblk)
-		newson = 1;
-	if (depth == V2_DQTREEDEPTH-1) {
-#ifdef __QUOTA_V2_PARANOIA
-		if (newblk) {
-			printk(KERN_ERR "VFS: Inserting already present quota entry (block %u).\n", le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]));
-			ret = -EIO;
-			goto out_buf;
-		}
-#endif
-		newblk = find_free_dqentry(dquot, &ret);
-	}
-	else
-		ret = do_insert_tree(dquot, &newblk, depth+1);
-	if (newson && ret >= 0) {
-		ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(newblk);
-		ret = write_blk(sb, dquot->dq_type, *treeblk, buf);
-	}
-	else if (newact && ret < 0)
-		put_free_dqblk(sb, dquot->dq_type, buf, *treeblk);
-out_buf:
-	freedqbuf(buf);
-	return ret;
-}
 
+static int v2_read_dquot(struct dquot *dquot)
+{
+	return qtree_read_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
+}
+
-/* Wrapper for inserting quota structure into tree */
-static inline int dq_insert_tree(struct dquot *dquot)
-{
-	int tmp = V2_DQTREEOFF;
-	return do_insert_tree(dquot, &tmp, 0);
-}
-
-/*
- * We don't have to be afraid of deadlocks as we never have quotas on quota files...
- */
 static int v2_write_dquot(struct dquot *dquot)
 {
-	int type = dquot->dq_type;
-	ssize_t ret;
-	struct v2_disk_dqblk ddquot, empty;
-
-	/* dq_off is guarded by dqio_mutex */
-	if (!dquot->dq_off)
-		if ((ret = dq_insert_tree(dquot)) < 0) {
-			printk(KERN_ERR "VFS: Error %zd occurred while creating quota.\n", ret);
-			return ret;
-		}
-	spin_lock(&dq_data_lock);
-	mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id);
-	/* Argh... We may need to write structure full of zeroes but that would be
-	 * treated as an empty place by the rest of the code. Format change would
-	 * be definitely cleaner but the problems probably are not worth it */
-	memset(&empty, 0, sizeof(struct v2_disk_dqblk));
-	if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk)))
-		ddquot.dqb_itime = cpu_to_le64(1);
-	spin_unlock(&dq_data_lock);
-	ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type,
-		(char *)&ddquot, sizeof(struct v2_disk_dqblk), dquot->dq_off);
-	if (ret != sizeof(struct v2_disk_dqblk)) {
-		printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", dquot->dq_sb->s_id);
-		if (ret >= 0)
-			ret = -ENOSPC;
-	}
-	else
-		ret = 0;
-	dqstats.writes++;
-
-	return ret;
+	return qtree_write_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
 }
 
+static int v2_release_dquot(struct dquot *dquot)
-/* Free dquot entry in data block */
-static int free_dqentry(struct dquot *dquot, uint blk)
-{
-	struct super_block *sb = dquot->dq_sb;
-	int type = dquot->dq_type;
-	struct v2_disk_dqdbheader *dh;
-	dqbuf_t buf = getdqbuf();
-	int ret = 0;
-
-	if (!buf)
-		return -ENOMEM;
-	if (dquot->dq_off >> V2_DQBLKSIZE_BITS != blk) {
-		printk(KERN_ERR "VFS: Quota structure has offset to other "
-			"block (%u) than it should (%u).\n", blk,
-			(uint)(dquot->dq_off >> V2_DQBLKSIZE_BITS));
-		goto out_buf;
-	}
-	if ((ret = read_blk(sb, type, blk, buf)) < 0) {
-		printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
-		goto out_buf;
-	}
-	dh = (struct v2_disk_dqdbheader *)buf;
-	le16_add_cpu(&dh->dqdh_entries, -1);
-	if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */
-		if ((ret = remove_free_dqentry(sb, type, buf, blk)) < 0 ||
-		    (ret = put_free_dqblk(sb, type, buf, blk)) < 0) {
-			printk(KERN_ERR "VFS: Can't move quota data block (%u) "
-				"to free list.\n", blk);
-			goto out_buf;
-		}
-	}
-	else {
-		memset(buf+(dquot->dq_off & ((1 << V2_DQBLKSIZE_BITS)-1)), 0,
-			sizeof(struct v2_disk_dqblk));
-		if (le16_to_cpu(dh->dqdh_entries) == V2_DQSTRINBLK-1) {
-			/* Insert will write block itself */
-			if ((ret = insert_free_dqentry(sb, type, buf, blk)) < 0) {
-				printk(KERN_ERR "VFS: Can't insert quota data block (%u) to free entry list.\n", blk);
-				goto out_buf;
-			}
-		}
-		else
-			if ((ret = write_blk(sb, type, blk, buf)) < 0) {
-				printk(KERN_ERR "VFS: Can't write quota data "
-					"block %u\n", blk);
-				goto out_buf;
-			}
-	}
-	dquot->dq_off = 0;	/* Quota is now unattached */
-out_buf:
-	freedqbuf(buf);
-	return ret;
-}
-
-/* Remove reference to dquot from tree */
-static int remove_tree(struct dquot *dquot, uint *blk, int depth)
-{
-	struct super_block *sb = dquot->dq_sb;
-	int type = dquot->dq_type;
-	dqbuf_t buf = getdqbuf();
-	int ret = 0;
-	uint newblk;
-	__le32 *ref = (__le32 *)buf;
-
-	if (!buf)
-		return -ENOMEM;
-	if ((ret = read_blk(sb, type, *blk, buf)) < 0) {
-		printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
-		goto out_buf;
-	}
-	newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
-	if (depth == V2_DQTREEDEPTH-1) {
-		ret = free_dqentry(dquot, newblk);
-		newblk = 0;
-	}
-	else
-		ret = remove_tree(dquot, &newblk, depth+1);
-	if (ret >= 0 && !newblk) {
-		int i;
-		ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(0);
-		for (i = 0; i < V2_DQBLKSIZE && !buf[i]; i++); /* Block got empty? */
-		/* Don't put the root block into the free block list */
-		if (i == V2_DQBLKSIZE && *blk != V2_DQTREEOFF) {
-			put_free_dqblk(sb, type, buf, *blk);
-			*blk = 0;
-		}
-		else
-			if ((ret = write_blk(sb, type, *blk, buf)) < 0)
-				printk(KERN_ERR "VFS: Can't write quota tree "
-					"block %u.\n", *blk);
-	}
-out_buf:
-	freedqbuf(buf);
-	return ret;
-}
-
-/* Delete dquot from tree */
-static int v2_delete_dquot(struct dquot *dquot)
-{
-	uint tmp = V2_DQTREEOFF;
-
-	if (!dquot->dq_off) /* Even not allocated? */
-		return 0;
-	return remove_tree(dquot, &tmp, 0);
-}
-
-/* Find entry in block */
-static loff_t find_block_dqentry(struct dquot *dquot, uint blk)
-{
-	dqbuf_t buf = getdqbuf();
-	loff_t ret = 0;
-	int i;
-	struct v2_disk_dqblk *ddquot = GETENTRIES(buf);
-
-	if (!buf)
-		return -ENOMEM;
-	if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) {
-		printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
-		goto out_buf;
-	}
-	if (dquot->dq_id)
-		for (i = 0; i < V2_DQSTRINBLK &&
-		     le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id; i++);
-	else { /* ID 0 as a bit more complicated searching... */
-		struct v2_disk_dqblk fakedquot;
-
-		memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk));
-		for (i = 0; i < V2_DQSTRINBLK; i++)
-			if (!le32_to_cpu(ddquot[i].dqb_id) &&
-			    memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)))
-				break;
-	}
-	if (i == V2_DQSTRINBLK) {
-		printk(KERN_ERR "VFS: Quota for id %u referenced "
-			"but not present.\n", dquot->dq_id);
-		ret = -EIO;
-		goto out_buf;
-	}
-	else
-		ret = (blk << V2_DQBLKSIZE_BITS) + sizeof(struct
-			v2_disk_dqdbheader) + i * sizeof(struct v2_disk_dqblk);
571out_buf:
572 freedqbuf(buf);
573 return ret;
574}
575
576/* Find entry for given id in the tree */
577static loff_t find_tree_dqentry(struct dquot *dquot, uint blk, int depth)
578{
579 dqbuf_t buf = getdqbuf();
580 loff_t ret = 0;
581 __le32 *ref = (__le32 *)buf;
582
583 if (!buf)
584 return -ENOMEM;
585 if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) {
586 printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
587 goto out_buf;
588 }
589 ret = 0;
590 blk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
591 if (!blk) /* No reference? */
592 goto out_buf;
593 if (depth < V2_DQTREEDEPTH-1)
594 ret = find_tree_dqentry(dquot, blk, depth+1);
595 else
596 ret = find_block_dqentry(dquot, blk);
597out_buf:
598 freedqbuf(buf);
599 return ret;
600}
601
602/* Find entry for given id in the tree - wrapper function */
603static inline loff_t find_dqentry(struct dquot *dquot)
604{
605 return find_tree_dqentry(dquot, V2_DQTREEOFF, 0);
606}
607
608static int v2_read_dquot(struct dquot *dquot)
609{ 199{
610 int type = dquot->dq_type; 200 return qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
611 loff_t offset;
612 struct v2_disk_dqblk ddquot, empty;
613 int ret = 0;
614
615#ifdef __QUOTA_V2_PARANOIA
616 /* Invalidated quota? */
617 if (!dquot->dq_sb || !sb_dqopt(dquot->dq_sb)->files[type]) {
618 printk(KERN_ERR "VFS: Quota invalidated while reading!\n");
619 return -EIO;
620 }
621#endif
622 offset = find_dqentry(dquot);
623 if (offset <= 0) { /* Entry not present? */
624 if (offset < 0)
625 printk(KERN_ERR "VFS: Can't read quota "
626 "structure for id %u.\n", dquot->dq_id);
627 dquot->dq_off = 0;
628 set_bit(DQ_FAKE_B, &dquot->dq_flags);
629 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
630 ret = offset;
631 }
632 else {
633 dquot->dq_off = offset;
634 if ((ret = dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type,
635 (char *)&ddquot, sizeof(struct v2_disk_dqblk), offset))
636 != sizeof(struct v2_disk_dqblk)) {
637 if (ret >= 0)
638 ret = -EIO;
639 printk(KERN_ERR "VFS: Error while reading quota "
640 "structure for id %u.\n", dquot->dq_id);
641 memset(&ddquot, 0, sizeof(struct v2_disk_dqblk));
642 }
643 else {
644 ret = 0;
645 /* We need to escape back all-zero structure */
646 memset(&empty, 0, sizeof(struct v2_disk_dqblk));
647 empty.dqb_itime = cpu_to_le64(1);
648 if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk)))
649 ddquot.dqb_itime = 0;
650 }
651 disk2memdqb(&dquot->dq_dqb, &ddquot);
652 if (!dquot->dq_dqb.dqb_bhardlimit &&
653 !dquot->dq_dqb.dqb_bsoftlimit &&
654 !dquot->dq_dqb.dqb_ihardlimit &&
655 !dquot->dq_dqb.dqb_isoftlimit)
656 set_bit(DQ_FAKE_B, &dquot->dq_flags);
657 }
658 dqstats.reads++;
659
660 return ret;
661} 201}
662 202
663/* Check whether dquot should not be deleted. We know we are 203static int v2_free_file_info(struct super_block *sb, int type)
664 * the only one operating on dquot (thanks to dq_lock) */
665static int v2_release_dquot(struct dquot *dquot)
666{ 204{
667 if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace)) 205 kfree(sb_dqinfo(sb, type)->dqi_priv);
668 return v2_delete_dquot(dquot);
669 return 0; 206 return 0;
670} 207}
671 208
@@ -673,7 +210,7 @@ static struct quota_format_ops v2_format_ops = {
673 .check_quota_file = v2_check_quota_file, 210 .check_quota_file = v2_check_quota_file,
674 .read_file_info = v2_read_file_info, 211 .read_file_info = v2_read_file_info,
675 .write_file_info = v2_write_file_info, 212 .write_file_info = v2_write_file_info,
676 .free_file_info = NULL, 213 .free_file_info = v2_free_file_info,
677 .read_dqblk = v2_read_dquot, 214 .read_dqblk = v2_read_dquot,
678 .commit_dqblk = v2_write_dquot, 215 .commit_dqblk = v2_write_dquot,
679 .release_dqblk = v2_release_dquot, 216 .release_dqblk = v2_release_dquot,
diff --git a/fs/quotaio_v1.h b/fs/quotaio_v1.h
new file mode 100644
index 000000000000..746654b5de70
--- /dev/null
+++ b/fs/quotaio_v1.h
@@ -0,0 +1,33 @@
1#ifndef _LINUX_QUOTAIO_V1_H
2#define _LINUX_QUOTAIO_V1_H
3
4#include <linux/types.h>
5
6/*
7 * The following constants define the amount of time given a user
8 * before the soft limits are treated as hard limits (usually resulting
9 * in an allocation failure). The timer is started when the user crosses
10 * their soft limit and is reset when they go below their soft limit.
11 */
12#define MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */
13#define MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */
14
15/*
16 * The following structure defines the format of the disk quota file
17 * (as it appears on disk) - the file is an array of these structures
18 * indexed by user or group number.
19 */
20struct v1_disk_dqblk {
21 __u32 dqb_bhardlimit; /* absolute limit on disk blks alloc */
22 __u32 dqb_bsoftlimit; /* preferred limit on disk blks */
23 __u32 dqb_curblocks; /* current block count */
24 __u32 dqb_ihardlimit; /* absolute limit on allocated inodes */
25 __u32 dqb_isoftlimit; /* preferred inode limit */
26 __u32 dqb_curinodes; /* current # allocated inodes */
27 time_t dqb_btime; /* time limit for excessive disk use */
28 time_t dqb_itime; /* time limit for excessive inode use */
29};
30
31#define v1_dqoff(UID) ((loff_t)((UID) * sizeof (struct v1_disk_dqblk)))
32
33#endif /* _LINUX_QUOTAIO_V1_H */
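The v1 format above is simply a flat on-disk array indexed by quota id, so locating a record is pure pointer arithmetic via v1_dqoff(). A minimal userspace sketch of that lookup (illustrative only: the file name "quota.user" and the id are made up, and the struct mirrors the header above, including the arch-dependent time_t fields):

#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <fcntl.h>
#include <time.h>

struct v1_disk_dqblk {
	uint32_t dqb_bhardlimit;	/* absolute limit on disk blks alloc */
	uint32_t dqb_bsoftlimit;	/* preferred limit on disk blks */
	uint32_t dqb_curblocks;		/* current block count */
	uint32_t dqb_ihardlimit;	/* absolute limit on allocated inodes */
	uint32_t dqb_isoftlimit;	/* preferred inode limit */
	uint32_t dqb_curinodes;		/* current # allocated inodes */
	time_t dqb_btime;		/* time limit for excessive disk use */
	time_t dqb_itime;		/* time limit for excessive inode use */
};

#define v1_dqoff(UID) ((off_t)((UID) * sizeof(struct v1_disk_dqblk)))

int main(void)
{
	struct v1_disk_dqblk dq;
	unsigned int uid = 1000;			/* example id */
	int fd = open("quota.user", O_RDONLY);		/* hypothetical path */

	if (fd < 0)
		return 1;
	/* the record for 'uid' sits at a fixed offset in the flat array */
	if (pread(fd, &dq, sizeof(dq), v1_dqoff(uid)) != sizeof(dq))
		return 1;
	printf("uid %u: %u blocks used, soft limit %u\n",
	       uid, (unsigned)dq.dqb_curblocks, (unsigned)dq.dqb_bsoftlimit);
	close(fd);
	return 0;
}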
diff --git a/fs/quotaio_v2.h b/fs/quotaio_v2.h
new file mode 100644
index 000000000000..530fe580685c
--- /dev/null
+++ b/fs/quotaio_v2.h
@@ -0,0 +1,60 @@
1/*
2 * Definitions of structures for vfsv0 quota format
3 */
4
5#ifndef _LINUX_QUOTAIO_V2_H
6#define _LINUX_QUOTAIO_V2_H
7
8#include <linux/types.h>
9#include <linux/quota.h>
10
11/*
12 * Definitions of magics and versions of current quota files
13 */
14#define V2_INITQMAGICS {\
15 0xd9c01f11, /* USRQUOTA */\
16 0xd9c01927 /* GRPQUOTA */\
17}
18
19#define V2_INITQVERSIONS {\
20 0, /* USRQUOTA */\
21 0 /* GRPQUOTA */\
22}
23
24/* First generic header */
25struct v2_disk_dqheader {
26 __le32 dqh_magic; /* Magic number identifying file */
27 __le32 dqh_version; /* File version */
28};
29
30/*
31 * The following structure defines the format of the disk quota file
32 * (as it appears on disk) - the file is a radix tree whose leaves point
33 * to blocks of these structures.
34 */
35struct v2_disk_dqblk {
36 __le32 dqb_id; /* id this quota applies to */
37 __le32 dqb_ihardlimit; /* absolute limit on allocated inodes */
38 __le32 dqb_isoftlimit; /* preferred inode limit */
39 __le32 dqb_curinodes; /* current # allocated inodes */
40 __le32 dqb_bhardlimit; /* absolute limit on disk space (in QUOTABLOCK_SIZE) */
41 __le32 dqb_bsoftlimit; /* preferred limit on disk space (in QUOTABLOCK_SIZE) */
42 __le64 dqb_curspace; /* current space occupied (in bytes) */
43 __le64 dqb_btime; /* time limit for excessive disk use */
44 __le64 dqb_itime; /* time limit for excessive inode use */
45};
46
47/* Header with type and version specific information */
48struct v2_disk_dqinfo {
49 __le32 dqi_bgrace; /* Time before block soft limit becomes hard limit */
50 __le32 dqi_igrace; /* Time before inode soft limit becomes hard limit */
51 __le32 dqi_flags; /* Flags for quotafile (DQF_*) */
52 __le32 dqi_blocks; /* Number of blocks in file */
53 __le32 dqi_free_blk; /* Number of first free block in the list */
54 __le32 dqi_free_entry; /* Number of block with at least one free entry */
55};
56
57#define V2_DQINFOOFF sizeof(struct v2_disk_dqheader) /* Offset of info header in file */
58#define V2_DQBLKSIZE_BITS 10 /* Size of leaf block in tree */
59
60#endif /* _LINUX_QUOTAIO_V2_H */
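Unlike v1, the v2 file is a radix tree: each level of the tree consumes one byte of the 32-bit id, giving up to 256 references per 1K tree block across four levels. A small sketch of that index computation, mirroring the GETIDINDEX() macro used by the removed quota_v2.c code above (V2_DQTREEDEPTH is 4 in this format; the example id is arbitrary):

#include <stdio.h>
#include <stdint.h>

#define V2_DQTREEDEPTH 4
#define GETIDINDEX(id, depth) \
	(((id) >> ((V2_DQTREEDEPTH - (depth) - 1) * 8)) & 0xff)

int main(void)
{
	uint32_t id = 123456;	/* example quota id */
	int depth;

	/* walk from the root: each level picks one of 256 slots */
	for (depth = 0; depth < V2_DQTREEDEPTH; depth++)
		printf("level %d -> slot %u\n", depth,
		       (unsigned)GETIDINDEX(id, depth));
	return 0;
}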
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 76acdbc34611..b9b567a28376 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -262,11 +262,11 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
 	ret = -ENOMEM;
 	pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL);
 	if (!pages)
-		goto out;
+		goto out_free;
 
 	nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages);
 	if (nr != lpages)
-		goto out; /* leave if some pages were missing */
+		goto out_free_pages; /* leave if some pages were missing */
 
 	/* check the pages for physical adjacency */
 	ptr = pages;
@@ -274,19 +274,18 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
 	page++;
 	for (loop = lpages; loop > 1; loop--)
 		if (*ptr++ != page++)
-			goto out;
+			goto out_free_pages;
 
 	/* okay - all conditions fulfilled */
 	ret = (unsigned long) page_address(pages[0]);
 
- out:
-	if (pages) {
-		ptr = pages;
-		for (loop = lpages; loop > 0; loop--)
-			put_page(*ptr++);
-		kfree(pages);
-	}
-
+out_free_pages:
+	ptr = pages;
+	for (loop = nr; loop > 0; loop--)
+		put_page(*ptr++);
+out_free:
+	kfree(pages);
+out:
 	return ret;
 }
 
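The fix above replaces one catch-all label with an ordered unwind ladder, and drops only the nr page references actually obtained rather than all lpages slots. A generic, purely illustrative sketch of the idiom (names are made up, not from the kernel): each failure point jumps to the label that releases exactly what has been acquired so far.

#include <stdio.h>
#include <stdlib.h>

static int do_work(void)
{
	int ret = -1;
	char *a, *b;

	a = malloc(64);
	if (!a)
		goto out;		/* nothing acquired yet */
	b = malloc(64);
	if (!b)
		goto out_free_a;	/* only 'a' needs freeing */

	ret = 0;			/* success: fall through the ladder */

	free(b);
out_free_a:
	free(a);
out:
	return ret;
}

int main(void)
{
	return do_work();
}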
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a83a3518ae33..b7e6ac706b87 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -57,7 +57,6 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
 		inode->i_mode = mode;
 		inode->i_uid = current_fsuid();
 		inode->i_gid = current_fsgid();
-		inode->i_blocks = 0;
 		inode->i_mapping->a_ops = &ramfs_aops;
 		inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
 		mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
diff --git a/fs/read_write.c b/fs/read_write.c
index 969a6d9c020b..5cc6924eb158 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -50,6 +50,14 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
 		offset += inode->i_size;
 		break;
 	case SEEK_CUR:
+		/*
+		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
+		 * position-querying operation. Avoid rewriting the "same"
+		 * f_pos value back to the file because a concurrent read(),
+		 * write() or lseek() might have altered it
+		 */
+		if (offset == 0)
+			return file->f_pos;
 		offset += file->f_pos;
 		break;
 	}
@@ -105,6 +113,10 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
 		offset += i_size_read(file->f_path.dentry->d_inode);
 		break;
 	case SEEK_CUR:
+		if (offset == 0) {
+			retval = file->f_pos;
+			goto out;
+		}
 		offset += file->f_pos;
 	}
 	retval = -EINVAL;
@@ -115,6 +127,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
 		}
 		retval = offset;
 	}
+out:
 	unlock_kernel();
 	return retval;
 }
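The hunks above treat lseek(fd, 0, SEEK_CUR) as a pure position query, so nothing is written back to f_pos that a concurrent read(), write() or lseek() might race with. A small userspace demonstration of that idiom (the file path is just an example):

#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>

int main(void)
{
	char buf[16];
	off_t pos;
	int fd = open("/etc/hostname", O_RDONLY);	/* any readable file */

	if (fd < 0)
		return 1;
	if (read(fd, buf, sizeof(buf)) < 0)
		return 1;
	pos = lseek(fd, 0, SEEK_CUR);	/* query position, changes nothing */
	printf("current offset: %lld\n", (long long)pos);
	close(fd);
	return 0;
}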
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 6c4c2c69449f..55fce92cdf18 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1753,6 +1753,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 				   struct inode *inode)
 {
 	struct super_block *sb;
+	struct reiserfs_iget_args args;
 	INITIALIZE_PATH(path_to_key);
 	struct cpu_key key;
 	struct item_head ih;
@@ -1780,6 +1781,20 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 		err = -ENOMEM;
 		goto out_bad_inode;
 	}
+	args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
+	if (old_format_only(sb))
+		make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
+				  TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
+	else
+		make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
+				  TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
+	memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
+	args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
+	if (insert_inode_locked4(inode, args.objectid,
+				 reiserfs_find_actor, &args) < 0) {
+		err = -EINVAL;
+		goto out_bad_inode;
+	}
 	if (old_format_only(sb))
 		/* not a perfect generation count, as object ids can be reused, but
 		 ** this is as good as reiserfs can do right now.
@@ -1825,13 +1840,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 	reiserfs_init_acl_default(inode);
 	reiserfs_init_xattr_rwsem(inode);
 
-	if (old_format_only(sb))
-		make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
-				  TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
-	else
-		make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
-				  TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
-
 	/* key to search for correct place for new stat data */
 	_make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id),
 		      le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET,
@@ -1859,13 +1867,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 	} else {
 		inode2sd(&sd, inode, inode->i_size);
 	}
-	// these do not go to on-disk stat data
-	inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
-
 	// store in in-core inode the key of stat data and version all
 	// object items will have (directory items will have old offset
 	// format, other new objects will consist of new items)
-	memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
 	if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode))
 		set_inode_item_key_version(inode, KEY_FORMAT_3_5);
 	else
@@ -1929,7 +1933,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 		reiserfs_mark_inode_private(inode);
 	}
 
-	insert_inode_hash(inode);
 	reiserfs_update_sd(th, inode);
 	reiserfs_check_path(&path_to_key);
 
@@ -1956,6 +1959,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
       out_inserted_sd:
 	inode->i_nlink = 0;
 	th->t_trans_id = 0;	/* so the caller can't use this handle later */
+	unlock_new_inode(inode); /* OK to do even if we hadn't locked it */
 
 	/* If we were inheriting an ACL, we need to release the lock so that
 	 * iput doesn't deadlock in reiserfs_delete_xattrs. The locking
@@ -2556,7 +2560,7 @@ static int reiserfs_write_begin(struct file *file,
 	}
 
 	index = pos >> PAGE_CACHE_SHIFT;
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
 	*pagep = page;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 4f322e5ed840..738967f6c8ee 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -646,6 +646,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
 		err = journal_end(&th, dir->i_sb, jbegin_count);
 		if (err)
 			retval = err;
+		unlock_new_inode(inode);
 		iput(inode);
 		goto out_failed;
 	}
@@ -653,6 +654,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	reiserfs_update_inode_transaction(dir);
 
 	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
 	retval = journal_end(&th, dir->i_sb, jbegin_count);
 
       out_failed:
@@ -727,11 +729,13 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
 		err = journal_end(&th, dir->i_sb, jbegin_count);
 		if (err)
 			retval = err;
+		unlock_new_inode(inode);
 		iput(inode);
 		goto out_failed;
 	}
 
 	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
 	retval = journal_end(&th, dir->i_sb, jbegin_count);
 
       out_failed:
@@ -812,6 +816,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 		err = journal_end(&th, dir->i_sb, jbegin_count);
 		if (err)
 			retval = err;
+		unlock_new_inode(inode);
 		iput(inode);
 		goto out_failed;
 	}
@@ -819,6 +824,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	reiserfs_update_sd(&th, dir);
 
 	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
 	retval = journal_end(&th, dir->i_sb, jbegin_count);
       out_failed:
 	if (locked)
@@ -1096,11 +1102,13 @@ static int reiserfs_symlink(struct inode *parent_dir,
 		err = journal_end(&th, parent_dir->i_sb, jbegin_count);
 		if (err)
 			retval = err;
+		unlock_new_inode(inode);
 		iput(inode);
 		goto out_failed;
 	}
 
 	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
 	retval = journal_end(&th, parent_dir->i_sb, jbegin_count);
       out_failed:
 	reiserfs_write_unlock(parent_dir->i_sb);
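The pattern these reiserfs hunks converge on is: the inode is hashed early in the locked I_NEW state (insert_inode_locked4() in inode.c above), so concurrent lookups can find it but must wait, and every exit path then calls unlock_new_inode() before iput() or after d_instantiate(). A loose userspace analogue of that publish-then-unlock protocol, using pthreads (purely illustrative; names are invented, not kernel APIs):

#include <pthread.h>
#include <stdio.h>

struct obj {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int new;		/* analogue of the I_NEW state bit */
	int value;
};

static struct obj slot = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 1, 0
};

static void unlock_new_obj(struct obj *o)	/* cf. unlock_new_inode() */
{
	pthread_mutex_lock(&o->lock);
	o->new = 0;
	pthread_cond_broadcast(&o->cond);
	pthread_mutex_unlock(&o->lock);
}

static void *lookup(void *arg)
{
	struct obj *o = &slot;

	pthread_mutex_lock(&o->lock);
	while (o->new)			/* cf. waiting on an I_NEW inode */
		pthread_cond_wait(&o->cond, &o->lock);
	printf("lookup sees fully initialised value %d\n", o->value);
	pthread_mutex_unlock(&o->lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, lookup, NULL);
	slot.value = 42;		/* creator initialises... */
	unlock_new_obj(&slot);		/* ...then publishes */
	pthread_join(t, NULL);
	return 0;
}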
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 663a91f5dce8..f3c820b75829 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -83,7 +83,7 @@ static void reiserfs_write_super(struct super_block *s)
 	reiserfs_sync_fs(s, 1);
 }
 
-static void reiserfs_write_super_lockfs(struct super_block *s)
+static int reiserfs_freeze(struct super_block *s)
 {
 	struct reiserfs_transaction_handle th;
 	reiserfs_write_lock(s);
@@ -101,11 +101,13 @@ static void reiserfs_write_super_lockfs(struct super_block *s)
 	}
 	s->s_dirt = 0;
 	reiserfs_write_unlock(s);
+	return 0;
 }
 
-static void reiserfs_unlockfs(struct super_block *s)
+static int reiserfs_unfreeze(struct super_block *s)
 {
 	reiserfs_allow_writes(s);
+	return 0;
 }
 
 extern const struct in_core_key MAX_IN_CORE_KEY;
@@ -613,8 +615,8 @@ static const struct super_operations reiserfs_sops = {
 	.put_super = reiserfs_put_super,
 	.write_super = reiserfs_write_super,
 	.sync_fs = reiserfs_sync_fs,
-	.write_super_lockfs = reiserfs_write_super_lockfs,
-	.unlockfs = reiserfs_unlockfs,
+	.freeze_fs = reiserfs_freeze,
+	.unfreeze_fs = reiserfs_unfreeze,
 	.statfs = reiserfs_statfs,
 	.remount_fs = reiserfs_remount,
 	.show_options = generic_show_options,
@@ -649,6 +651,8 @@ static struct dquot_operations reiserfs_quota_operations = {
 	.release_dquot = reiserfs_release_dquot,
 	.mark_dirty = reiserfs_mark_dquot_dirty,
 	.write_info = reiserfs_write_info,
+	.alloc_dquot = dquot_alloc,
+	.destroy_dquot = dquot_destroy,
 };
 
 static struct quotactl_ops reiserfs_qctl_operations = {
@@ -994,8 +998,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
 			if (c == 'u' || c == 'g') {
 				int qtype = c == 'u' ? USRQUOTA : GRPQUOTA;
 
-				if ((sb_any_quota_enabled(s) ||
-				     sb_any_quota_suspended(s)) &&
+				if (sb_any_quota_loaded(s) &&
 				    (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) {
 					reiserfs_warning(s,
 						"reiserfs_parse_options: cannot change journaled quota options when quota turned on.");
@@ -1041,8 +1044,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
 				"reiserfs_parse_options: unknown quota format specified.");
 				return 0;
 			}
-			if ((sb_any_quota_enabled(s) ||
-			     sb_any_quota_suspended(s)) &&
+			if (sb_any_quota_loaded(s) &&
 			    *qfmt != REISERFS_SB(s)->s_jquota_fmt) {
 				reiserfs_warning(s,
 					"reiserfs_parse_options: cannot change journaled quota options when quota turned on.");
@@ -1067,7 +1069,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
 	}
 	/* This checking is not precise wrt the quota type but for our purposes it is sufficient */
 	if (!(*mount_options & (1 << REISERFS_QUOTA))
-	    && sb_any_quota_enabled(s)) {
+	    && sb_any_quota_loaded(s)) {
 		reiserfs_warning(s,
 			"reiserfs_parse_options: quota options must be present when quota is turned on.");
 		return 0;
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index 60d2f822e87b..98a232f7196b 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -490,7 +490,7 @@ static mode_t romfs_modemap[] =
 static struct inode *
 romfs_iget(struct super_block *sb, unsigned long ino)
 {
-	int nextfh;
+	int nextfh, ret;
 	struct romfs_inode ri;
 	struct inode *i;
 
@@ -524,14 +524,13 @@ romfs_iget(struct super_block *sb, unsigned long ino)
 	i->i_size = be32_to_cpu(ri.size);
 	i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
 	i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
-	i->i_uid = i->i_gid = 0;
 
 	/* Precalculate the data offset */
-	ino = romfs_strnlen(i, ino+ROMFH_SIZE, ROMFS_MAXFN);
-	if (ino >= 0)
-		ino = ((ROMFH_SIZE+ino+1+ROMFH_PAD)&ROMFH_MASK);
+	ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN);
+	if (ret >= 0)
+		ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK;
 	else
 		ino = 0;
 
 	ROMFS_I(i)->i_metasize = ino;
 	ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK);
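The romfs change exists because 'ino' is an unsigned long, so the old 'if (ino >= 0)' test was always true and could never catch a negative error return from romfs_strnlen(); the fix routes the return value through a signed 'ret'. A minimal demonstration of that class of bug (illustrative, with an invented helper):

#include <stdio.h>

static int might_fail(void)
{
	return -1;	/* error */
}

int main(void)
{
	unsigned long ino = might_fail();	/* -1 wraps to a huge value */
	int ret = might_fail();

	if (ino >= 0)	/* always true for an unsigned type */
		printf("unsigned check: error missed (ino=%lu)\n", ino);
	if (ret < 0)
		printf("signed check: error caught (ret=%d)\n", ret);
	return 0;
}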
diff --git a/fs/select.c b/fs/select.c
index 87df51eadcf2..08b91beed806 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -109,11 +109,11 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 void poll_initwait(struct poll_wqueues *pwq)
 {
 	init_poll_funcptr(&pwq->pt, __pollwait);
+	pwq->polling_task = current;
 	pwq->error = 0;
 	pwq->table = NULL;
 	pwq->inline_index = 0;
 }
-
 EXPORT_SYMBOL(poll_initwait);
 
 static void free_poll_entry(struct poll_table_entry *entry)
@@ -142,12 +142,10 @@ void poll_freewait(struct poll_wqueues *pwq)
 		free_page((unsigned long) old);
 	}
 }
-
 EXPORT_SYMBOL(poll_freewait);
 
-static struct poll_table_entry *poll_get_entry(poll_table *_p)
+static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
 {
-	struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt);
 	struct poll_table_page *table = p->table;
 
 	if (p->inline_index < N_INLINE_POLL_ENTRIES)
@@ -159,7 +157,6 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p)
 		new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
 		if (!new_table) {
 			p->error = -ENOMEM;
-			__set_current_state(TASK_RUNNING);
 			return NULL;
 		}
 		new_table->entry = new_table->entries;
@@ -171,20 +168,75 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p)
 	return table->entry++;
 }
 
+static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	struct poll_wqueues *pwq = wait->private;
+	DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
+
+	/*
+	 * Although this function is called under waitqueue lock, LOCK
+	 * doesn't imply write barrier and the users expect write
+	 * barrier semantics on wakeup functions.  The following
+	 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
+	 * and is paired with set_mb() in poll_schedule_timeout.
+	 */
+	smp_wmb();
+	pwq->triggered = 1;
+
+	/*
+	 * Perform the default wake up operation using a dummy
+	 * waitqueue.
+	 *
+	 * TODO: This is hacky but there currently is no interface to
+	 * pass in @sync.  @sync is scheduled to be removed and once
+	 * that happens, wake_up_process() can be used directly.
+	 */
+	return default_wake_function(&dummy_wait, mode, sync, key);
+}
+
 /* Add a new entry */
 static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 				poll_table *p)
 {
-	struct poll_table_entry *entry = poll_get_entry(p);
+	struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
+	struct poll_table_entry *entry = poll_get_entry(pwq);
 	if (!entry)
 		return;
 	get_file(filp);
 	entry->filp = filp;
 	entry->wait_address = wait_address;
-	init_waitqueue_entry(&entry->wait, current);
+	init_waitqueue_func_entry(&entry->wait, pollwake);
+	entry->wait.private = pwq;
 	add_wait_queue(wait_address, &entry->wait);
 }
 
+int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
+			  ktime_t *expires, unsigned long slack)
+{
+	int rc = -EINTR;
+
+	set_current_state(state);
+	if (!pwq->triggered)
+		rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
+	__set_current_state(TASK_RUNNING);
+
+	/*
+	 * Prepare for the next iteration.
+	 *
+	 * The following set_mb() serves two purposes.  First, it's
+	 * the counterpart rmb of the wmb in pollwake() such that data
+	 * written before wake up is always visible after wake up.
+	 * Second, the full barrier guarantees that triggered clearing
+	 * doesn't pass event check of the next iteration.  Note that
+	 * this problem doesn't exist for the first iteration as
+	 * add_wait_queue() has full barrier semantics.
+	 */
+	set_mb(pwq->triggered, 0);
+
+	return rc;
+}
+EXPORT_SYMBOL(poll_schedule_timeout);
+
 /**
  * poll_select_set_timeout - helper function to setup the timeout value
  * @to: pointer to timespec variable for the final timeout
@@ -340,8 +392,6 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 	for (;;) {
 		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
 
-		set_current_state(TASK_INTERRUPTIBLE);
-
 		inp = fds->in; outp = fds->out; exp = fds->ex;
 		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
 
@@ -411,10 +461,10 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 			to = &expire;
 		}
 
-		if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+		if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
+					   to, slack))
 			timed_out = 1;
 	}
-	__set_current_state(TASK_RUNNING);
 
 	poll_freewait(&table);
 
@@ -666,7 +716,6 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
 	for (;;) {
 		struct poll_list *walk;
 
-		set_current_state(TASK_INTERRUPTIBLE);
 		for (walk = list; walk != NULL; walk = walk->next) {
 			struct pollfd * pfd, * pfd_end;
 
@@ -709,10 +758,9 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
 			to = &expire;
 		}
 
-		if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
 			timed_out = 1;
 	}
-	__set_current_state(TASK_RUNNING);
 	return count;
 }
 
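The handshake added above relies on pairing a write barrier in pollwake() with the full barrier of set_mb() in poll_schedule_timeout(): event data is published before the ->triggered flag, and the flag is cleared with a full barrier before the next event scan. A compressed userspace sketch of the same discipline using C11 atomics in place of the kernel barrier primitives (illustrative only; single-threaded here, names invented):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int triggered;
static int event_data;	/* data the waker publishes before the flag */

static void pollwake_sketch(void)
{
	event_data = 42;				/* write data... */
	atomic_store_explicit(&triggered, 1,
			      memory_order_release);	/* ...then the flag */
}

static int poll_schedule_sketch(void)
{
	if (!atomic_load_explicit(&triggered, memory_order_acquire))
		return -1;	/* would sleep (schedule_hrtimeout_range) */
	/* counterpart of set_mb(pwq->triggered, 0): clear with a full
	 * barrier so the clear cannot pass the next event check */
	atomic_exchange(&triggered, 0);
	return event_data;	/* visible thanks to the release/acquire pair */
}

int main(void)
{
	pollwake_sketch();
	printf("poller saw %d\n", poll_schedule_sketch());
	return 0;
}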
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 16c211558c22..b569ff1c4dc8 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -389,8 +389,14 @@ char *mangle_path(char *s, char *p, char *esc)
 }
 EXPORT_SYMBOL(mangle_path);
 
-/*
- * return the absolute path of 'dentry' residing in mount 'mnt'.
+/**
+ * seq_path - seq_file interface to print a pathname
+ * @m: the seq_file handle
+ * @path: the struct path to print
+ * @esc: set of characters to escape in the output
+ *
+ * return the absolute path of 'path', as represented by the
+ * dentry / mnt pair in the path parameter.
  */
 int seq_path(struct seq_file *m, struct path *path, char *esc)
 {
@@ -462,7 +468,8 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc)
 	return -1;
 }
 
-int seq_bitmap(struct seq_file *m, unsigned long *bits, unsigned int nr_bits)
+int seq_bitmap(struct seq_file *m, const unsigned long *bits,
+		unsigned int nr_bits)
 {
 	if (m->count < m->size) {
 		int len = bitmap_scnprintf(m->buf + m->count,
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index e4f8d51a5553..92d5e8ffb639 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -297,7 +297,7 @@ static int smb_write_begin(struct file *file, struct address_space *mapping,
 			struct page **pagep, void **fsdata)
 {
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-	*pagep = __grab_cache_page(mapping, index);
+	*pagep = grab_cache_page_write_begin(mapping, index, flags);
 	if (!*pagep)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/splice.c b/fs/splice.c
index 1abab5cee4ba..a54b3e3f10a7 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -21,6 +21,7 @@
 #include <linux/file.h>
 #include <linux/pagemap.h>
 #include <linux/splice.h>
+#include <linux/memcontrol.h>
 #include <linux/mm_inline.h>
 #include <linux/swap.h>
 #include <linux/writeback.h>
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
new file mode 100644
index 000000000000..8258cf9a0317
--- /dev/null
+++ b/fs/squashfs/Makefile
@@ -0,0 +1,8 @@
1#
2# Makefile for the linux squashfs routines.
3#
4
5obj-$(CONFIG_SQUASHFS) += squashfs.o
6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
7squashfs-y += namei.o super.o symlink.o
8#squashfs-y += squashfs2_0.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
new file mode 100644
index 000000000000..c837dfc2b3c6
--- /dev/null
+++ b/fs/squashfs/block.c
@@ -0,0 +1,274 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * block.c
22 */
23
24/*
25 * This file implements the low-level routines to read and decompress
26 * datablocks and metadata blocks.
27 */
28
29#include <linux/fs.h>
30#include <linux/vfs.h>
31#include <linux/slab.h>
32#include <linux/mutex.h>
33#include <linux/string.h>
34#include <linux/buffer_head.h>
35#include <linux/zlib.h>
36
37#include "squashfs_fs.h"
38#include "squashfs_fs_sb.h"
39#include "squashfs_fs_i.h"
40#include "squashfs.h"
41
42/*
43 * Read the metadata block length, this is stored in the first two
44 * bytes of the metadata block.
45 */
46static struct buffer_head *get_block_length(struct super_block *sb,
47 u64 *cur_index, int *offset, int *length)
48{
49 struct squashfs_sb_info *msblk = sb->s_fs_info;
50 struct buffer_head *bh;
51
52 bh = sb_bread(sb, *cur_index);
53 if (bh == NULL)
54 return NULL;
55
56 if (msblk->devblksize - *offset == 1) {
57 *length = (unsigned char) bh->b_data[*offset];
58 put_bh(bh);
59 bh = sb_bread(sb, ++(*cur_index));
60 if (bh == NULL)
61 return NULL;
62 *length |= (unsigned char) bh->b_data[0] << 8;
63 *offset = 1;
64 } else {
65 *length = (unsigned char) bh->b_data[*offset] |
66 (unsigned char) bh->b_data[*offset + 1] << 8;
67 *offset += 2;
68 }
69
70 return bh;
71}
72
73
74/*
75 * Read and decompress a metadata block or datablock. Length is non-zero
76 * if a datablock is being read (the size is stored elsewhere in the
77 * filesystem), otherwise the length is obtained from the first two bytes of
78 * the metadata block. A bit in the length field indicates if the block
79 * is stored uncompressed in the filesystem (usually because compression
80 * generated a larger block - this does occasionally happen with zlib).
81 */
82int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
83 int length, u64 *next_index, int srclength)
84{
85 struct squashfs_sb_info *msblk = sb->s_fs_info;
86 struct buffer_head **bh;
87 int offset = index & ((1 << msblk->devblksize_log2) - 1);
88 u64 cur_index = index >> msblk->devblksize_log2;
89 int bytes, compressed, b = 0, k = 0, page = 0, avail;
90
91
92 bh = kcalloc((msblk->block_size >> msblk->devblksize_log2) + 1,
93 sizeof(*bh), GFP_KERNEL);
94 if (bh == NULL)
95 return -ENOMEM;
96
97 if (length) {
98 /*
99 * Datablock.
100 */
101 bytes = -offset;
102 compressed = SQUASHFS_COMPRESSED_BLOCK(length);
103 length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length);
104 if (next_index)
105 *next_index = index + length;
106
107 TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n",
108 index, compressed ? "" : "un", length, srclength);
109
110 if (length < 0 || length > srclength ||
111 (index + length) > msblk->bytes_used)
112 goto read_failure;
113
114 for (b = 0; bytes < length; b++, cur_index++) {
115 bh[b] = sb_getblk(sb, cur_index);
116 if (bh[b] == NULL)
117 goto block_release;
118 bytes += msblk->devblksize;
119 }
120 ll_rw_block(READ, b, bh);
121 } else {
122 /*
123 * Metadata block.
124 */
125 if ((index + 2) > msblk->bytes_used)
126 goto read_failure;
127
128 bh[0] = get_block_length(sb, &cur_index, &offset, &length);
129 if (bh[0] == NULL)
130 goto read_failure;
131 b = 1;
132
133 bytes = msblk->devblksize - offset;
134 compressed = SQUASHFS_COMPRESSED(length);
135 length = SQUASHFS_COMPRESSED_SIZE(length);
136 if (next_index)
137 *next_index = index + length + 2;
138
139 TRACE("Block @ 0x%llx, %scompressed size %d\n", index,
140 compressed ? "" : "un", length);
141
142 if (length < 0 || length > srclength ||
143 (index + length) > msblk->bytes_used)
144 goto block_release;
145
146 for (; bytes < length; b++) {
147 bh[b] = sb_getblk(sb, ++cur_index);
148 if (bh[b] == NULL)
149 goto block_release;
150 bytes += msblk->devblksize;
151 }
152 ll_rw_block(READ, b - 1, bh + 1);
153 }
154
155 if (compressed) {
156 int zlib_err = 0, zlib_init = 0;
157
158 /*
159 * Uncompress block.
160 */
161
162 mutex_lock(&msblk->read_data_mutex);
163
164 msblk->stream.avail_out = 0;
165 msblk->stream.avail_in = 0;
166
167 bytes = length;
168 do {
169 if (msblk->stream.avail_in == 0 && k < b) {
170 avail = min(bytes, msblk->devblksize - offset);
171 bytes -= avail;
172 wait_on_buffer(bh[k]);
173 if (!buffer_uptodate(bh[k]))
174 goto release_mutex;
175
176 if (avail == 0) {
177 offset = 0;
178 put_bh(bh[k++]);
179 continue;
180 }
181
182 msblk->stream.next_in = bh[k]->b_data + offset;
183 msblk->stream.avail_in = avail;
184 offset = 0;
185 }
186
187 if (msblk->stream.avail_out == 0) {
188 msblk->stream.next_out = buffer[page++];
189 msblk->stream.avail_out = PAGE_CACHE_SIZE;
190 }
191
192 if (!zlib_init) {
193 zlib_err = zlib_inflateInit(&msblk->stream);
194 if (zlib_err != Z_OK) {
195 ERROR("zlib_inflateInit returned"
196 " unexpected result 0x%x,"
197 " srclength %d\n", zlib_err,
198 srclength);
199 goto release_mutex;
200 }
201 zlib_init = 1;
202 }
203
204 zlib_err = zlib_inflate(&msblk->stream, Z_NO_FLUSH);
205
206 if (msblk->stream.avail_in == 0 && k < b)
207 put_bh(bh[k++]);
208 } while (zlib_err == Z_OK);
209
210 if (zlib_err != Z_STREAM_END) {
211 ERROR("zlib_inflate returned unexpected result"
212 " 0x%x, srclength %d, avail_in %d,"
213 " avail_out %d\n", zlib_err, srclength,
214 msblk->stream.avail_in,
215 msblk->stream.avail_out);
216 goto release_mutex;
217 }
218
219 zlib_err = zlib_inflateEnd(&msblk->stream);
220 if (zlib_err != Z_OK) {
221 ERROR("zlib_inflateEnd returned unexpected result 0x%x,"
222 " srclength %d\n", zlib_err, srclength);
223 goto release_mutex;
224 }
225 length = msblk->stream.total_out;
226 mutex_unlock(&msblk->read_data_mutex);
227 } else {
228 /*
229 * Block is uncompressed.
230 */
231 int i, in, pg_offset = 0;
232
233 for (i = 0; i < b; i++) {
234 wait_on_buffer(bh[i]);
235 if (!buffer_uptodate(bh[i]))
236 goto block_release;
237 }
238
239 for (bytes = length; k < b; k++) {
240 in = min(bytes, msblk->devblksize - offset);
241 bytes -= in;
242 while (in) {
243 if (pg_offset == PAGE_CACHE_SIZE) {
244 page++;
245 pg_offset = 0;
246 }
247 avail = min_t(int, in, PAGE_CACHE_SIZE -
248 pg_offset);
249 memcpy(buffer[page] + pg_offset,
250 bh[k]->b_data + offset, avail);
251 in -= avail;
252 pg_offset += avail;
253 offset += avail;
254 }
255 offset = 0;
256 put_bh(bh[k]);
257 }
258 }
259
260 kfree(bh);
261 return length;
262
263release_mutex:
264 mutex_unlock(&msblk->read_data_mutex);
265
266block_release:
267 for (; k < b; k++)
268 put_bh(bh[k]);
269
270read_failure:
271 ERROR("sb_bread failed reading block 0x%llx\n", cur_index);
272 kfree(bh);
273 return -EIO;
274}
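The length word decoded by get_block_length() above is a little-endian 16-bit value whose top bit flags an uncompressed metadata block, with the remaining bits giving the on-disk size; data blocks use an analogous flag bit at 1 << 24 (SQUASHFS_COMPRESSED_BIT_BLOCK, referenced in squashfs_read_table() above). A small sketch of the metadata case, with the bit definition mirroring squashfs_fs.h:

#include <stdio.h>

#define SQUASHFS_COMPRESSED_BIT (1 << 15)

int main(void)
{
	unsigned char hdr[2] = { 0x34, 0x80 };	/* example on-disk bytes */
	int length = hdr[0] | (hdr[1] << 8);	/* as in get_block_length() */
	int compressed = !(length & SQUASHFS_COMPRESSED_BIT);

	length &= ~SQUASHFS_COMPRESSED_BIT;
	printf("%scompressed metadata block, %d bytes\n",
	       compressed ? "" : "un", length);
	return 0;
}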
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
new file mode 100644
index 000000000000..f29eda16d25e
--- /dev/null
+++ b/fs/squashfs/cache.c
@@ -0,0 +1,412 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * cache.c
22 */
23
24/*
25 * Blocks in Squashfs are compressed. To avoid repeatedly decompressing
26 * recently accessed data Squashfs uses two small metadata and fragment caches.
27 *
28 * This file implements a generic cache implementation used for both caches,
29 * plus functions layered on top of the generic cache implementation to
30 * access the metadata and fragment caches.
31 *
32 * To avoid out-of-memory and fragmentation issues with vmalloc the cache
33 * uses sequences of kmalloced PAGE_CACHE_SIZE buffers.
34 *
35 * It should be noted that the cache is not used for file datablocks, these
36 * are decompressed and cached in the page-cache in the normal way. The
37 * cache is only used to temporarily cache fragment and metadata blocks
38 * which have been read as a result of a metadata (i.e. inode or
39 * directory) or fragment access. Because metadata and fragments are packed
40 * together into blocks (to gain greater compression) the read of a particular
41 * piece of metadata or fragment will retrieve other metadata/fragments which
42 * have been packed with it; because of locality-of-reference these may be read
43 * in the near future. Temporarily caching them ensures they are available for
44 * near future access without requiring an additional read and decompress.
45 */
46
47#include <linux/fs.h>
48#include <linux/vfs.h>
49#include <linux/slab.h>
50#include <linux/vmalloc.h>
51#include <linux/sched.h>
52#include <linux/spinlock.h>
53#include <linux/wait.h>
54#include <linux/zlib.h>
55#include <linux/pagemap.h>
56
57#include "squashfs_fs.h"
58#include "squashfs_fs_sb.h"
59#include "squashfs_fs_i.h"
60#include "squashfs.h"
61
62/*
63 * Look-up block in cache, and increment usage count. If not in cache, read
64 * and decompress it from disk.
65 */
66struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
67 struct squashfs_cache *cache, u64 block, int length)
68{
69 int i, n;
70 struct squashfs_cache_entry *entry;
71
72 spin_lock(&cache->lock);
73
74 while (1) {
75 for (i = 0; i < cache->entries; i++)
76 if (cache->entry[i].block == block)
77 break;
78
79 if (i == cache->entries) {
80 /*
81 * Block not in cache, if all cache entries are used
82 * go to sleep waiting for one to become available.
83 */
84 if (cache->unused == 0) {
85 cache->num_waiters++;
86 spin_unlock(&cache->lock);
87 wait_event(cache->wait_queue, cache->unused);
88 spin_lock(&cache->lock);
89 cache->num_waiters--;
90 continue;
91 }
92
93 /*
94 * At least one unused cache entry. A simple
95 * round-robin strategy is used to choose the entry to
96 * be evicted from the cache.
97 */
98 i = cache->next_blk;
99 for (n = 0; n < cache->entries; n++) {
100 if (cache->entry[i].refcount == 0)
101 break;
102 i = (i + 1) % cache->entries;
103 }
104
105 cache->next_blk = (i + 1) % cache->entries;
106 entry = &cache->entry[i];
107
108 /*
109 * Initialise chosen cache entry, and fill it in from
110 * disk.
111 */
112 cache->unused--;
113 entry->block = block;
114 entry->refcount = 1;
115 entry->pending = 1;
116 entry->num_waiters = 0;
117 entry->error = 0;
118 spin_unlock(&cache->lock);
119
120 entry->length = squashfs_read_data(sb, entry->data,
121 block, length, &entry->next_index,
122 cache->block_size);
123
124 spin_lock(&cache->lock);
125
126 if (entry->length < 0)
127 entry->error = entry->length;
128
129 entry->pending = 0;
130
131 /*
132 * While filling this entry one or more other processes
133 * have looked it up in the cache, and have slept
134 * waiting for it to become available.
135 */
136 if (entry->num_waiters) {
137 spin_unlock(&cache->lock);
138 wake_up_all(&entry->wait_queue);
139 } else
140 spin_unlock(&cache->lock);
141
142 goto out;
143 }
144
145 /*
146 * Block already in cache. Increment refcount so it doesn't
147 * get reused until we're finished with it, if it was
148 * previously unused there's one less cache entry available
149 * for reuse.
150 */
151 entry = &cache->entry[i];
152 if (entry->refcount == 0)
153 cache->unused--;
154 entry->refcount++;
155
156 /*
157 * If the entry is currently being filled in by another process
158 * go to sleep waiting for it to become available.
159 */
160 if (entry->pending) {
161 entry->num_waiters++;
162 spin_unlock(&cache->lock);
163 wait_event(entry->wait_queue, !entry->pending);
164 } else
165 spin_unlock(&cache->lock);
166
167 goto out;
168 }
169
170out:
171 TRACE("Got %s %d, start block %lld, refcount %d, error %d\n",
172 cache->name, i, entry->block, entry->refcount, entry->error);
173
174 if (entry->error)
175 ERROR("Unable to read %s cache entry [%llx]\n", cache->name,
176 block);
177 return entry;
178}
179
180
181/*
182 * Release cache entry, once usage count is zero it can be reused.
183 */
184void squashfs_cache_put(struct squashfs_cache_entry *entry)
185{
186 struct squashfs_cache *cache = entry->cache;
187
188 spin_lock(&cache->lock);
189 entry->refcount--;
190 if (entry->refcount == 0) {
191 cache->unused++;
192 /*
193 * If there are any processes waiting for a block to become
194 * available, wake one up.
195 */
196 if (cache->num_waiters) {
197 spin_unlock(&cache->lock);
198 wake_up(&cache->wait_queue);
199 return;
200 }
201 }
202 spin_unlock(&cache->lock);
203}
204
205/*
206 * Delete cache reclaiming all kmalloced buffers.
207 */
208void squashfs_cache_delete(struct squashfs_cache *cache)
209{
210 int i, j;
211
212 if (cache == NULL)
213 return;
214
215 for (i = 0; i < cache->entries; i++) {
216 if (cache->entry[i].data) {
217 for (j = 0; j < cache->pages; j++)
218 kfree(cache->entry[i].data[j]);
219 kfree(cache->entry[i].data);
220 }
221 }
222
223 kfree(cache->entry);
224 kfree(cache);
225}
226
227
228/*
229 * Initialise cache allocating the specified number of entries, each of
230 * size block_size. To avoid vmalloc fragmentation issues each entry
231 * is allocated as a sequence of kmalloced PAGE_CACHE_SIZE buffers.
232 */
233struct squashfs_cache *squashfs_cache_init(char *name, int entries,
234 int block_size)
235{
236 int i, j;
237 struct squashfs_cache *cache = kzalloc(sizeof(*cache), GFP_KERNEL);
238
239 if (cache == NULL) {
240 ERROR("Failed to allocate %s cache\n", name);
241 return NULL;
242 }
243
244 cache->entry = kcalloc(entries, sizeof(*(cache->entry)), GFP_KERNEL);
245 if (cache->entry == NULL) {
246 ERROR("Failed to allocate %s cache\n", name);
247 goto cleanup;
248 }
249
250 cache->next_blk = 0;
251 cache->unused = entries;
252 cache->entries = entries;
253 cache->block_size = block_size;
254 cache->pages = block_size >> PAGE_CACHE_SHIFT;
255 cache->name = name;
256 cache->num_waiters = 0;
257 spin_lock_init(&cache->lock);
258 init_waitqueue_head(&cache->wait_queue);
259
260 for (i = 0; i < entries; i++) {
261 struct squashfs_cache_entry *entry = &cache->entry[i];
262
263 init_waitqueue_head(&cache->entry[i].wait_queue);
264 entry->cache = cache;
265 entry->block = SQUASHFS_INVALID_BLK;
266 entry->data = kcalloc(cache->pages, sizeof(void *), GFP_KERNEL);
267 if (entry->data == NULL) {
268 ERROR("Failed to allocate %s cache entry\n", name);
269 goto cleanup;
270 }
271
272 for (j = 0; j < cache->pages; j++) {
273 entry->data[j] = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
274 if (entry->data[j] == NULL) {
275 ERROR("Failed to allocate %s buffer\n", name);
276 goto cleanup;
277 }
278 }
279 }
280
281 return cache;
282
283cleanup:
284 squashfs_cache_delete(cache);
285 return NULL;
286}
287
288
289/*
290 * Copy up to length bytes from cache entry to buffer starting at offset bytes
291 * into the cache entry. If there aren't length bytes then copy the number of
292 * bytes available. In all cases return the number of bytes copied.
293 */
294int squashfs_copy_data(void *buffer, struct squashfs_cache_entry *entry,
295 int offset, int length)
296{
297 int remaining = length;
298
299 if (length == 0)
300 return 0;
301 else if (buffer == NULL)
302 return min(length, entry->length - offset);
303
304 while (offset < entry->length) {
305 void *buff = entry->data[offset / PAGE_CACHE_SIZE]
306 + (offset % PAGE_CACHE_SIZE);
307 int bytes = min_t(int, entry->length - offset,
308 PAGE_CACHE_SIZE - (offset % PAGE_CACHE_SIZE));
309
310 if (bytes >= remaining) {
311 memcpy(buffer, buff, remaining);
312 remaining = 0;
313 break;
314 }
315
316 memcpy(buffer, buff, bytes);
317 buffer += bytes;
318 remaining -= bytes;
319 offset += bytes;
320 }
321
322 return length - remaining;
323}
324
325
326/*
327 * Read length bytes from metadata position <block, offset> (block is the
328 * start of the compressed block on disk, and offset is the offset into
329 * the block once decompressed). Data is packed into consecutive blocks,
330 * and length bytes may require reading more than one block.
331 */
332int squashfs_read_metadata(struct super_block *sb, void *buffer,
333 u64 *block, int *offset, int length)
334{
335 struct squashfs_sb_info *msblk = sb->s_fs_info;
336 int bytes, copied = length;
337 struct squashfs_cache_entry *entry;
338
339 TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset);
340
341 while (length) {
342 entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0);
343 if (entry->error)
344 return entry->error;
345 else if (*offset >= entry->length)
346 return -EIO;
347
348 bytes = squashfs_copy_data(buffer, entry, *offset, length);
349 if (buffer)
350 buffer += bytes;
351 length -= bytes;
352 *offset += bytes;
353
354 if (*offset == entry->length) {
355 *block = entry->next_index;
356 *offset = 0;
357 }
358
359 squashfs_cache_put(entry);
360 }
361
362 return copied;
363}
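Note the cursor semantics: <*block, *offset> is advanced in place, so consecutive calls walk the packed metadata stream, crossing compressed-block boundaries via next_index. A hedged caller sketch (variables hypothetical, error handling trimmed), matching the pattern the directory code below uses:

	u64 block = start_block;	/* assumed starting position */
	int offset = start_offset;
	struct squashfs_dir_header dirh;

	err = squashfs_read_metadata(sb, &dirh, &block, &offset,
			sizeof(dirh));
	if (err < 0)
		return err;
	/* The cursor now points just past the header, ready for the
	 * first directory entry. */
	err = squashfs_read_metadata(sb, dire, &block, &offset,
			sizeof(*dire));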
364
365
366/*
 367 * Look up in the fragment cache the fragment located at <start_block> in the
368 * filesystem. If necessary read and decompress it from disk.
369 */
370struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *sb,
371 u64 start_block, int length)
372{
373 struct squashfs_sb_info *msblk = sb->s_fs_info;
374
375 return squashfs_cache_get(sb, msblk->fragment_cache, start_block,
376 length);
377}
378
379
380/*
381 * Read and decompress the datablock located at <start_block> in the
382 * filesystem. The cache is used here to avoid duplicating locking and
383 * read/decompress code.
384 */
385struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *sb,
386 u64 start_block, int length)
387{
388 struct squashfs_sb_info *msblk = sb->s_fs_info;
389
390 return squashfs_cache_get(sb, msblk->read_page, start_block, length);
391}
392
393
394/*
395 * Read a filesystem table (uncompressed sequence of bytes) from disk
396 */
397int squashfs_read_table(struct super_block *sb, void *buffer, u64 block,
398 int length)
399{
400 int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
401 int i, res;
402 void **data = kcalloc(pages, sizeof(void *), GFP_KERNEL);
403 if (data == NULL)
404 return -ENOMEM;
405
406 for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE)
407 data[i] = buffer;
408 res = squashfs_read_data(sb, data, block, length |
409 SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length);
410 kfree(data);
411 return res;
412}
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
new file mode 100644
index 000000000000..566b0eaed868
--- /dev/null
+++ b/fs/squashfs/dir.c
@@ -0,0 +1,235 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * dir.c
22 */
23
24/*
25 * This file implements code to read directories from disk.
26 *
27 * See namei.c for a description of directory organisation on disk.
28 */
29
30#include <linux/fs.h>
31#include <linux/vfs.h>
32#include <linux/slab.h>
33#include <linux/zlib.h>
34
35#include "squashfs_fs.h"
36#include "squashfs_fs_sb.h"
37#include "squashfs_fs_i.h"
38#include "squashfs.h"
39
40static const unsigned char squashfs_filetype_table[] = {
41 DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_FIFO, DT_SOCK
42};
43
44/*
45 * Lookup offset (f_pos) in the directory index, returning the
46 * metadata block containing it.
47 *
48 * If we get an error reading the index then return the part of the index
49 * (if any) we have managed to read - the index isn't essential, just
50 * quicker.
51 */
52static int get_dir_index_using_offset(struct super_block *sb,
53 u64 *next_block, int *next_offset, u64 index_start, int index_offset,
54 int i_count, u64 f_pos)
55{
56 struct squashfs_sb_info *msblk = sb->s_fs_info;
57 int err, i, index, length = 0;
58 struct squashfs_dir_index dir_index;
59
60 TRACE("Entered get_dir_index_using_offset, i_count %d, f_pos %lld\n",
61 i_count, f_pos);
62
63 /*
64 * Translate from external f_pos to the internal f_pos. This
65 * is offset by 3 because we invent "." and ".." entries which are
66 * not actually stored in the directory.
67 */
68 if (f_pos < 3)
69 return f_pos;
70 f_pos -= 3;
71
72 for (i = 0; i < i_count; i++) {
73 err = squashfs_read_metadata(sb, &dir_index, &index_start,
74 &index_offset, sizeof(dir_index));
75 if (err < 0)
76 break;
77
78 index = le32_to_cpu(dir_index.index);
79 if (index > f_pos)
80 /*
81 * Found the index we're looking for.
82 */
83 break;
84
85 err = squashfs_read_metadata(sb, NULL, &index_start,
86 &index_offset, le32_to_cpu(dir_index.size) + 1);
87 if (err < 0)
88 break;
89
90 length = index;
91 *next_block = le32_to_cpu(dir_index.start_block) +
92 msblk->directory_table;
93 }
94
95 *next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE;
96
97 /*
98 * Translate back from internal f_pos to external f_pos.
99 */
100 return length + 3;
101}
102
103
104static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
105{
106 struct inode *inode = file->f_dentry->d_inode;
107 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
108 u64 block = squashfs_i(inode)->start + msblk->directory_table;
109 int offset = squashfs_i(inode)->offset, length = 0, dir_count, size,
110 type, err;
111 unsigned int inode_number;
112 struct squashfs_dir_header dirh;
113 struct squashfs_dir_entry *dire;
114
115 TRACE("Entered squashfs_readdir [%llx:%x]\n", block, offset);
116
117 dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL);
118 if (dire == NULL) {
119 ERROR("Failed to allocate squashfs_dir_entry\n");
120 goto finish;
121 }
122
123 /*
124 * Return "." and ".." entries as the first two filenames in the
125 * directory. To maximise compression these two entries are not
126 * stored in the directory, and so we invent them here.
127 *
128 * It also means that the external f_pos is offset by 3 from the
129 * on-disk directory f_pos.
130 */
131 while (file->f_pos < 3) {
132 char *name;
133 int i_ino;
134
135 if (file->f_pos == 0) {
136 name = ".";
137 size = 1;
138 i_ino = inode->i_ino;
139 } else {
140 name = "..";
141 size = 2;
142 i_ino = squashfs_i(inode)->parent;
143 }
144
145 TRACE("Calling filldir(%p, %s, %d, %lld, %d, %d)\n",
146 dirent, name, size, file->f_pos, i_ino,
147 squashfs_filetype_table[1]);
148
149 if (filldir(dirent, name, size, file->f_pos, i_ino,
150 squashfs_filetype_table[1]) < 0) {
151 TRACE("Filldir returned less than 0\n");
152 goto finish;
153 }
154
155 file->f_pos += size;
156 }
157
158 length = get_dir_index_using_offset(inode->i_sb, &block, &offset,
159 squashfs_i(inode)->dir_idx_start,
160 squashfs_i(inode)->dir_idx_offset,
161 squashfs_i(inode)->dir_idx_cnt,
162 file->f_pos);
163
164 while (length < i_size_read(inode)) {
165 /*
166 * Read directory header
167 */
168 err = squashfs_read_metadata(inode->i_sb, &dirh, &block,
169 &offset, sizeof(dirh));
170 if (err < 0)
171 goto failed_read;
172
173 length += sizeof(dirh);
174
175 dir_count = le32_to_cpu(dirh.count) + 1;
176 while (dir_count--) {
177 /*
178 * Read directory entry.
179 */
180 err = squashfs_read_metadata(inode->i_sb, dire, &block,
181 &offset, sizeof(*dire));
182 if (err < 0)
183 goto failed_read;
184
185 size = le16_to_cpu(dire->size) + 1;
186
187 err = squashfs_read_metadata(inode->i_sb, dire->name,
188 &block, &offset, size);
189 if (err < 0)
190 goto failed_read;
191
192 length += sizeof(*dire) + size;
193
194 if (file->f_pos >= length)
195 continue;
196
197 dire->name[size] = '\0';
198 inode_number = le32_to_cpu(dirh.inode_number) +
199 ((short) le16_to_cpu(dire->inode_number));
200 type = le16_to_cpu(dire->type);
201
202 TRACE("Calling filldir(%p, %s, %d, %lld, %x:%x, %d, %d)"
203 "\n", dirent, dire->name, size,
204 file->f_pos,
205 le32_to_cpu(dirh.start_block),
206 le16_to_cpu(dire->offset),
207 inode_number,
208 squashfs_filetype_table[type]);
209
210 if (filldir(dirent, dire->name, size, file->f_pos,
211 inode_number,
212 squashfs_filetype_table[type]) < 0) {
213 TRACE("Filldir returned less than 0\n");
214 goto finish;
215 }
216
217 file->f_pos = length;
218 }
219 }
220
221finish:
222 kfree(dire);
223 return 0;
224
225failed_read:
226 ERROR("Unable to read directory block [%llx:%x]\n", block, offset);
227 kfree(dire);
228 return 0;
229}
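To make the offset-by-3 convention concrete: readdir starts at external f_pos 0 and emits "." (size 1, advancing f_pos to 1), then ".." (size 2, advancing it to 3). Only from f_pos 3 onwards do positions correspond to on-disk directory data, which is why get_dir_index_using_offset() subtracts 3 on entry and adds it back on return.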
230
231
232const struct file_operations squashfs_dir_ops = {
233 .read = generic_read_dir,
234 .readdir = squashfs_readdir
235};
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
new file mode 100644
index 000000000000..69e971d5ddc1
--- /dev/null
+++ b/fs/squashfs/export.c
@@ -0,0 +1,155 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * export.c
22 */
23
24/*
25 * This file implements code to make Squashfs filesystems exportable (NFS etc.)
26 *
27 * The export code uses an inode lookup table to map inode numbers passed in
28 * filehandles to an inode location on disk. This table is stored compressed
 29 * into metadata blocks. A second index table is used to locate these. For
 30 * speed of access (and because it is small) this second index table is read
 31 * at mount time and cached in memory.
32 *
33 * The inode lookup table is used only by the export code, inode disk
34 * locations are directly encoded in directories, enabling direct access
35 * without an intermediate lookup for all operations except the export ops.
36 */
37
38#include <linux/fs.h>
39#include <linux/vfs.h>
40#include <linux/dcache.h>
41#include <linux/exportfs.h>
42#include <linux/zlib.h>
43
44#include "squashfs_fs.h"
45#include "squashfs_fs_sb.h"
46#include "squashfs_fs_i.h"
47#include "squashfs.h"
48
49/*
50 * Look-up inode number (ino) in table, returning the inode location.
51 */
52static long long squashfs_inode_lookup(struct super_block *sb, int ino_num)
53{
54 struct squashfs_sb_info *msblk = sb->s_fs_info;
55 int blk = SQUASHFS_LOOKUP_BLOCK(ino_num - 1);
56 int offset = SQUASHFS_LOOKUP_BLOCK_OFFSET(ino_num - 1);
57 u64 start = le64_to_cpu(msblk->inode_lookup_table[blk]);
58 __le64 ino;
59 int err;
60
61 TRACE("Entered squashfs_inode_lookup, inode_number = %d\n", ino_num);
62
63 err = squashfs_read_metadata(sb, &ino, &start, &offset, sizeof(ino));
64 if (err < 0)
65 return err;
66
67 TRACE("squashfs_inode_lookup, inode = 0x%llx\n",
68 (u64) le64_to_cpu(ino));
69
70 return le64_to_cpu(ino);
71}
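SQUASHFS_LOOKUP_BLOCK() and SQUASHFS_LOOKUP_BLOCK_OFFSET() are not visible in this hunk. Assuming the conventional split (8-byte __le64 entries packed into 8 KiB metadata blocks), the indexing arithmetic reduces to the following runnable sketch:

	#include <stdio.h>

	#define METADATA_SIZE 8192	/* SQUASHFS_METADATA_SIZE */

	int main(void)
	{
		unsigned int ino_num = 2000;	/* 1-based inode number */
		unsigned int per_blk = METADATA_SIZE / 8;	/* 1024 entries */

		/* Mirrors the assumed SQUASHFS_LOOKUP_BLOCK{,_OFFSET} macros:
		 * inode 2000 lands in index block 1 at byte offset 7800. */
		printf("index block %u, byte offset %u\n",
		       (ino_num - 1) / per_blk, ((ino_num - 1) % per_blk) * 8);
		return 0;
	}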
72
73
74static struct dentry *squashfs_export_iget(struct super_block *sb,
75 unsigned int ino_num)
76{
77 long long ino;
78 struct dentry *dentry = ERR_PTR(-ENOENT);
79
80 TRACE("Entered squashfs_export_iget\n");
81
82 ino = squashfs_inode_lookup(sb, ino_num);
83 if (ino >= 0)
84 dentry = d_obtain_alias(squashfs_iget(sb, ino, ino_num));
85
86 return dentry;
87}
88
89
90static struct dentry *squashfs_fh_to_dentry(struct super_block *sb,
91 struct fid *fid, int fh_len, int fh_type)
92{
93 if ((fh_type != FILEID_INO32_GEN && fh_type != FILEID_INO32_GEN_PARENT)
94 || fh_len < 2)
95 return NULL;
96
97 return squashfs_export_iget(sb, fid->i32.ino);
98}
99
100
101static struct dentry *squashfs_fh_to_parent(struct super_block *sb,
102 struct fid *fid, int fh_len, int fh_type)
103{
104 if (fh_type != FILEID_INO32_GEN_PARENT || fh_len < 4)
105 return NULL;
106
107 return squashfs_export_iget(sb, fid->i32.parent_ino);
108}
109
110
111static struct dentry *squashfs_get_parent(struct dentry *child)
112{
113 struct inode *inode = child->d_inode;
114 unsigned int parent_ino = squashfs_i(inode)->parent;
115
116 return squashfs_export_iget(inode->i_sb, parent_ino);
117}
118
119
120/*
121 * Read uncompressed inode lookup table indexes off disk into memory
122 */
123__le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
124 u64 lookup_table_start, unsigned int inodes)
125{
126 unsigned int length = SQUASHFS_LOOKUP_BLOCK_BYTES(inodes);
127 __le64 *inode_lookup_table;
128 int err;
129
130 TRACE("In read_inode_lookup_table, length %d\n", length);
131
132 /* Allocate inode lookup table indexes */
133 inode_lookup_table = kmalloc(length, GFP_KERNEL);
134 if (inode_lookup_table == NULL) {
135 ERROR("Failed to allocate inode lookup table\n");
136 return ERR_PTR(-ENOMEM);
137 }
138
139 err = squashfs_read_table(sb, inode_lookup_table, lookup_table_start,
140 length);
141 if (err < 0) {
142 ERROR("unable to read inode lookup table\n");
143 kfree(inode_lookup_table);
144 return ERR_PTR(err);
145 }
146
147 return inode_lookup_table;
148}
149
150
151const struct export_operations squashfs_export_ops = {
152 .fh_to_dentry = squashfs_fh_to_dentry,
153 .fh_to_parent = squashfs_fh_to_parent,
154 .get_parent = squashfs_get_parent
155};
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
new file mode 100644
index 000000000000..717767d831df
--- /dev/null
+++ b/fs/squashfs/file.c
@@ -0,0 +1,502 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * file.c
22 */
23
24/*
25 * This file contains code for handling regular files. A regular file
26 * consists of a sequence of contiguous compressed blocks, and/or a
27 * compressed fragment block (tail-end packed block). The compressed size
28 * of each datablock is stored in a block list contained within the
29 * file inode (itself stored in one or more compressed metadata blocks).
30 *
31 * To speed up access to datablocks when reading 'large' files (256 Mbytes or
32 * larger), the code implements an index cache that caches the mapping from
33 * block index to datablock location on disk.
34 *
35 * The index cache allows Squashfs to handle large files (up to 1.75 TiB) while
36 * retaining a simple and space-efficient block list on disk. The cache
37 * is split into slots, caching up to eight 224 GiB files (128 KiB blocks).
38 * Larger files use multiple slots, with 1.75 TiB files using all 8 slots.
39 * The index cache is designed to be memory efficient, and by default uses
40 * 16 KiB.
41 */
42
43#include <linux/fs.h>
44#include <linux/vfs.h>
45#include <linux/kernel.h>
46#include <linux/slab.h>
47#include <linux/string.h>
48#include <linux/pagemap.h>
49#include <linux/mutex.h>
50#include <linux/zlib.h>
51
52#include "squashfs_fs.h"
53#include "squashfs_fs_sb.h"
54#include "squashfs_fs_i.h"
55#include "squashfs.h"
56
57/*
58 * Locate cache slot in range [offset, index] for specified inode. If
 59 * there's more than one, return the slot closest to index.
60 */
61static struct meta_index *locate_meta_index(struct inode *inode, int offset,
62 int index)
63{
64 struct meta_index *meta = NULL;
65 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
66 int i;
67
68 mutex_lock(&msblk->meta_index_mutex);
69
70 TRACE("locate_meta_index: index %d, offset %d\n", index, offset);
71
72 if (msblk->meta_index == NULL)
73 goto not_allocated;
74
75 for (i = 0; i < SQUASHFS_META_SLOTS; i++) {
76 if (msblk->meta_index[i].inode_number == inode->i_ino &&
77 msblk->meta_index[i].offset >= offset &&
78 msblk->meta_index[i].offset <= index &&
79 msblk->meta_index[i].locked == 0) {
80 TRACE("locate_meta_index: entry %d, offset %d\n", i,
81 msblk->meta_index[i].offset);
82 meta = &msblk->meta_index[i];
83 offset = meta->offset;
84 }
85 }
86
87 if (meta)
88 meta->locked = 1;
89
90not_allocated:
91 mutex_unlock(&msblk->meta_index_mutex);
92
93 return meta;
94}
95
96
97/*
98 * Find and initialise an empty cache slot for index offset.
99 */
100static struct meta_index *empty_meta_index(struct inode *inode, int offset,
101 int skip)
102{
103 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
104 struct meta_index *meta = NULL;
105 int i;
106
107 mutex_lock(&msblk->meta_index_mutex);
108
109 TRACE("empty_meta_index: offset %d, skip %d\n", offset, skip);
110
111 if (msblk->meta_index == NULL) {
112 /*
113 * First time cache index has been used, allocate and
114 * initialise. The cache index could be allocated at
115 * mount time but doing it here means it is allocated only
116 * if a 'large' file is read.
117 */
118 msblk->meta_index = kcalloc(SQUASHFS_META_SLOTS,
119 sizeof(*(msblk->meta_index)), GFP_KERNEL);
120 if (msblk->meta_index == NULL) {
121 ERROR("Failed to allocate meta_index\n");
122 goto failed;
123 }
124 for (i = 0; i < SQUASHFS_META_SLOTS; i++) {
125 msblk->meta_index[i].inode_number = 0;
126 msblk->meta_index[i].locked = 0;
127 }
128 msblk->next_meta_index = 0;
129 }
130
131 for (i = SQUASHFS_META_SLOTS; i &&
132 msblk->meta_index[msblk->next_meta_index].locked; i--)
133 msblk->next_meta_index = (msblk->next_meta_index + 1) %
134 SQUASHFS_META_SLOTS;
135
136 if (i == 0) {
137 TRACE("empty_meta_index: failed!\n");
138 goto failed;
139 }
140
141 TRACE("empty_meta_index: returned meta entry %d, %p\n",
142 msblk->next_meta_index,
143 &msblk->meta_index[msblk->next_meta_index]);
144
145 meta = &msblk->meta_index[msblk->next_meta_index];
146 msblk->next_meta_index = (msblk->next_meta_index + 1) %
147 SQUASHFS_META_SLOTS;
148
149 meta->inode_number = inode->i_ino;
150 meta->offset = offset;
151 meta->skip = skip;
152 meta->entries = 0;
153 meta->locked = 1;
154
155failed:
156 mutex_unlock(&msblk->meta_index_mutex);
157 return meta;
158}
159
160
161static void release_meta_index(struct inode *inode, struct meta_index *meta)
162{
163 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
164 mutex_lock(&msblk->meta_index_mutex);
165 meta->locked = 0;
166 mutex_unlock(&msblk->meta_index_mutex);
167}
168
169
170/*
171 * Read the next n blocks from the block list, starting from
172 * metadata block <start_block, offset>.
173 */
174static long long read_indexes(struct super_block *sb, int n,
175 u64 *start_block, int *offset)
176{
177 int err, i;
178 long long block = 0;
179 __le32 *blist = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
180
181 if (blist == NULL) {
182 ERROR("read_indexes: Failed to allocate block_list\n");
183 return -ENOMEM;
184 }
185
186 while (n) {
187 int blocks = min_t(int, n, PAGE_CACHE_SIZE >> 2);
188
189 err = squashfs_read_metadata(sb, blist, start_block,
190 offset, blocks << 2);
191 if (err < 0) {
192 ERROR("read_indexes: reading block [%llx:%x]\n",
193 *start_block, *offset);
194 goto failure;
195 }
196
197 for (i = 0; i < blocks; i++) {
198 int size = le32_to_cpu(blist[i]);
199 block += SQUASHFS_COMPRESSED_SIZE_BLOCK(size);
200 }
201 n -= blocks;
202 }
203
204 kfree(blist);
205 return block;
206
207failure:
208 kfree(blist);
209 return err;
210}
211
212
213/*
 214 * Each cache index slot has SQUASHFS_META_ENTRIES entries, each of which
215 * can cache one index -> datablock/blocklist-block mapping. We wish
216 * to distribute these over the length of the file, entry[0] maps index x,
217 * entry[1] maps index x + skip, entry[2] maps index x + 2 * skip, and so on.
218 * The larger the file, the greater the skip factor. The skip factor is
219 * limited to the size of the metadata cache (SQUASHFS_CACHED_BLKS) to ensure
220 * the number of metadata blocks that need to be read fits into the cache.
221 * If the skip factor is limited in this way then the file will use multiple
222 * slots.
223 */
224static inline int calculate_skip(int blocks)
225{
226 int skip = blocks / ((SQUASHFS_META_ENTRIES + 1)
227 * SQUASHFS_META_INDEXES);
228 return min(SQUASHFS_CACHED_BLKS - 1, skip + 1);
229}
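A worked example, assuming the conventional constants (SQUASHFS_META_ENTRIES = 127, SQUASHFS_META_INDEXES = 2048 and SQUASHFS_CACHED_BLKS = 8, none of which appear in this hunk):

	/* A 4 GiB file of 128 KiB blocks has 32768 blocks:
	 *   skip = 32768 / ((127 + 1) * 2048) = 0, clamped to
	 *   min(8 - 1, 0 + 1) = 1,
	 * so each slot entry covers 2048 consecutive blocks (256 MiB).
	 * Files of roughly 32 GiB and above push skip to 2 and beyond;
	 * around 192 GiB the min() caps it at 7, and still-larger files
	 * simply consume additional slots. */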
230
231
232/*
233 * Search and grow the index cache for the specified inode, returning the
234 * on-disk locations of the datablock and block list metadata block
235 * <index_block, index_offset> for index (scaled to nearest cache index).
236 */
237static int fill_meta_index(struct inode *inode, int index,
238 u64 *index_block, int *index_offset, u64 *data_block)
239{
240 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
241 int skip = calculate_skip(i_size_read(inode) >> msblk->block_log);
242 int offset = 0;
243 struct meta_index *meta;
244 struct meta_entry *meta_entry;
245 u64 cur_index_block = squashfs_i(inode)->block_list_start;
246 int cur_offset = squashfs_i(inode)->offset;
247 u64 cur_data_block = squashfs_i(inode)->start;
248 int err, i;
249
250 /*
251 * Scale index to cache index (cache slot entry)
252 */
253 index /= SQUASHFS_META_INDEXES * skip;
254
255 while (offset < index) {
256 meta = locate_meta_index(inode, offset + 1, index);
257
258 if (meta == NULL) {
259 meta = empty_meta_index(inode, offset + 1, skip);
260 if (meta == NULL)
261 goto all_done;
262 } else {
263 offset = index < meta->offset + meta->entries ? index :
264 meta->offset + meta->entries - 1;
265 meta_entry = &meta->meta_entry[offset - meta->offset];
266 cur_index_block = meta_entry->index_block +
267 msblk->inode_table;
268 cur_offset = meta_entry->offset;
269 cur_data_block = meta_entry->data_block;
270 TRACE("get_meta_index: offset %d, meta->offset %d, "
271 "meta->entries %d\n", offset, meta->offset,
272 meta->entries);
273 TRACE("get_meta_index: index_block 0x%llx, offset 0x%x"
274 " data_block 0x%llx\n", cur_index_block,
275 cur_offset, cur_data_block);
276 }
277
278 /*
279 * If necessary grow cache slot by reading block list. Cache
280 * slot is extended up to index or to the end of the slot, in
281 * which case further slots will be used.
282 */
283 for (i = meta->offset + meta->entries; i <= index &&
284 i < meta->offset + SQUASHFS_META_ENTRIES; i++) {
285 int blocks = skip * SQUASHFS_META_INDEXES;
286 long long res = read_indexes(inode->i_sb, blocks,
287 &cur_index_block, &cur_offset);
288
289 if (res < 0) {
290 if (meta->entries == 0)
291 /*
292 * Don't leave an empty slot on read
293 * error allocated to this inode...
294 */
295 meta->inode_number = 0;
296 err = res;
297 goto failed;
298 }
299
300 cur_data_block += res;
301 meta_entry = &meta->meta_entry[i - meta->offset];
302 meta_entry->index_block = cur_index_block -
303 msblk->inode_table;
304 meta_entry->offset = cur_offset;
305 meta_entry->data_block = cur_data_block;
306 meta->entries++;
307 offset++;
308 }
309
310 TRACE("get_meta_index: meta->offset %d, meta->entries %d\n",
311 meta->offset, meta->entries);
312
313 release_meta_index(inode, meta);
314 }
315
316all_done:
317 *index_block = cur_index_block;
318 *index_offset = cur_offset;
319 *data_block = cur_data_block;
320
321 /*
322 * Scale cache index (cache slot entry) to index
323 */
324 return offset * SQUASHFS_META_INDEXES * skip;
325
326failed:
327 release_meta_index(inode, meta);
328 return err;
329}
330
331
332/*
333 * Get the on-disk location and compressed size of the datablock
334 * specified by index. Fill_meta_index() does most of the work.
335 */
336static int read_blocklist(struct inode *inode, int index, u64 *block)
337{
338 u64 start;
339 long long blks;
340 int offset;
341 __le32 size;
342 int res = fill_meta_index(inode, index, &start, &offset, block);
343
344 TRACE("read_blocklist: res %d, index %d, start 0x%llx, offset"
345 " 0x%x, block 0x%llx\n", res, index, start, offset,
346 *block);
347
348 if (res < 0)
349 return res;
350
351 /*
 352	 * res contains the index of the mapping returned by fill_meta_index();
 353	 * this will likely be less than the desired index (because the
354 * meta_index cache works at a higher granularity). Read any
355 * extra block indexes needed.
356 */
357 if (res < index) {
358 blks = read_indexes(inode->i_sb, index - res, &start, &offset);
359 if (blks < 0)
360 return (int) blks;
361 *block += blks;
362 }
363
364 /*
365 * Read length of block specified by index.
366 */
367 res = squashfs_read_metadata(inode->i_sb, &size, &start, &offset,
368 sizeof(size));
369 if (res < 0)
370 return res;
371 return le32_to_cpu(size);
372}
373
374
375static int squashfs_readpage(struct file *file, struct page *page)
376{
377 struct inode *inode = page->mapping->host;
378 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
379 int bytes, i, offset = 0, sparse = 0;
380 struct squashfs_cache_entry *buffer = NULL;
381 void *pageaddr;
382
383 int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
384 int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT);
385 int start_index = page->index & ~mask;
386 int end_index = start_index | mask;
387 int file_end = i_size_read(inode) >> msblk->block_log;
388
389 TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n",
390 page->index, squashfs_i(inode)->start);
391
392 if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
393 PAGE_CACHE_SHIFT))
394 goto out;
395
396 if (index < file_end || squashfs_i(inode)->fragment_block ==
397 SQUASHFS_INVALID_BLK) {
398 /*
399 * Reading a datablock from disk. Need to read block list
400 * to get location and block size.
401 */
402 u64 block = 0;
403 int bsize = read_blocklist(inode, index, &block);
404 if (bsize < 0)
405 goto error_out;
406
407 if (bsize == 0) { /* hole */
408 bytes = index == file_end ?
409 (i_size_read(inode) & (msblk->block_size - 1)) :
410 msblk->block_size;
411 sparse = 1;
412 } else {
413 /*
414 * Read and decompress datablock.
415 */
416 buffer = squashfs_get_datablock(inode->i_sb,
417 block, bsize);
418 if (buffer->error) {
419 ERROR("Unable to read page, block %llx, size %x"
420 "\n", block, bsize);
421 squashfs_cache_put(buffer);
422 goto error_out;
423 }
424 bytes = buffer->length;
425 }
426 } else {
427 /*
428 * Datablock is stored inside a fragment (tail-end packed
429 * block).
430 */
431 buffer = squashfs_get_fragment(inode->i_sb,
432 squashfs_i(inode)->fragment_block,
433 squashfs_i(inode)->fragment_size);
434
435 if (buffer->error) {
436 ERROR("Unable to read page, block %llx, size %x\n",
437 squashfs_i(inode)->fragment_block,
438 squashfs_i(inode)->fragment_size);
439 squashfs_cache_put(buffer);
440 goto error_out;
441 }
442 bytes = i_size_read(inode) & (msblk->block_size - 1);
443 offset = squashfs_i(inode)->fragment_offset;
444 }
445
446 /*
447 * Loop copying datablock into pages. As the datablock likely covers
448 * many PAGE_CACHE_SIZE pages (default block size is 128 KiB) explicitly
449 * grab the pages from the page cache, except for the page that we've
450 * been called to fill.
451 */
452 for (i = start_index; i <= end_index && bytes > 0; i++,
453 bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) {
454 struct page *push_page;
455 int avail = sparse ? 0 : min_t(int, bytes, PAGE_CACHE_SIZE);
456
457 TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail);
458
459 push_page = (i == page->index) ? page :
460 grab_cache_page_nowait(page->mapping, i);
461
462 if (!push_page)
463 continue;
464
465 if (PageUptodate(push_page))
466 goto skip_page;
467
468 pageaddr = kmap_atomic(push_page, KM_USER0);
469 squashfs_copy_data(pageaddr, buffer, offset, avail);
470 memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail);
471 kunmap_atomic(pageaddr, KM_USER0);
472 flush_dcache_page(push_page);
473 SetPageUptodate(push_page);
474skip_page:
475 unlock_page(push_page);
476 if (i != page->index)
477 page_cache_release(push_page);
478 }
479
480 if (!sparse)
481 squashfs_cache_put(buffer);
482
483 return 0;
484
485error_out:
486 SetPageError(page);
487out:
488 pageaddr = kmap_atomic(page, KM_USER0);
489 memset(pageaddr, 0, PAGE_CACHE_SIZE);
490 kunmap_atomic(pageaddr, KM_USER0);
491 flush_dcache_page(page);
492 if (!PageError(page))
493 SetPageUptodate(page);
494 unlock_page(page);
495
496 return 0;
497}
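The shift/mask bookkeeping at the top of squashfs_readpage() is easiest to check with numbers. Assuming 4 KiB pages (PAGE_CACHE_SHIFT = 12) and the default 128 KiB block (block_log = 17):

	/* One datablock spans 1 << (17 - 12) = 32 pages, so:
	 *   mask        = 31
	 *   page 100 -> index = 100 >> 5 = 3	(its datablock)
	 *   start_index = 100 & ~31 = 96
	 *   end_index   = 96 | 31 = 127
	 * and a single decompress fills pages 96..127 of the mapping. */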
498
499
500const struct address_space_operations squashfs_aops = {
501 .readpage = squashfs_readpage
502};
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
new file mode 100644
index 000000000000..b5a2c15bbbc7
--- /dev/null
+++ b/fs/squashfs/fragment.c
@@ -0,0 +1,98 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * fragment.c
22 */
23
24/*
25 * This file implements code to handle compressed fragments (tail-end packed
26 * datablocks).
27 *
28 * Regular files contain a fragment index which is mapped to a fragment
29 * location on disk and compressed size using a fragment lookup table.
30 * Like everything in Squashfs this fragment lookup table is itself stored
31 * compressed into metadata blocks. A second index table is used to locate
 32 * these. For speed of access (and because it is small) this second index
 33 * table is read at mount time and cached in memory.
34 */
35
36#include <linux/fs.h>
37#include <linux/vfs.h>
38#include <linux/slab.h>
39#include <linux/zlib.h>
40
41#include "squashfs_fs.h"
42#include "squashfs_fs_sb.h"
43#include "squashfs_fs_i.h"
44#include "squashfs.h"
45
46/*
 47 * Look up a fragment using the fragment index table. Return the on-disk
 48 * location of the fragment and its compressed size.
49 */
50int squashfs_frag_lookup(struct super_block *sb, unsigned int fragment,
51 u64 *fragment_block)
52{
53 struct squashfs_sb_info *msblk = sb->s_fs_info;
54 int block = SQUASHFS_FRAGMENT_INDEX(fragment);
55 int offset = SQUASHFS_FRAGMENT_INDEX_OFFSET(fragment);
56 u64 start_block = le64_to_cpu(msblk->fragment_index[block]);
57 struct squashfs_fragment_entry fragment_entry;
58 int size;
59
60 size = squashfs_read_metadata(sb, &fragment_entry, &start_block,
61 &offset, sizeof(fragment_entry));
62 if (size < 0)
63 return size;
64
65 *fragment_block = le64_to_cpu(fragment_entry.start_block);
66 size = le32_to_cpu(fragment_entry.size);
67
68 return size;
69}
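The SQUASHFS_FRAGMENT_INDEX{,_OFFSET} macros follow the same two-level pattern as the inode lookup table above. Assuming the usual 16-byte squashfs_fragment_entry (a __le64 start block, a __le32 size, plus padding), 512 entries fit in one 8 KiB metadata block, so fragment 700, say, resolves to index block 700 / 512 = 1 at byte offset (700 % 512) * 16 = 3008.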
70
71
72/*
73 * Read the uncompressed fragment lookup table indexes off disk into memory
74 */
75__le64 *squashfs_read_fragment_index_table(struct super_block *sb,
76 u64 fragment_table_start, unsigned int fragments)
77{
78 unsigned int length = SQUASHFS_FRAGMENT_INDEX_BYTES(fragments);
79 __le64 *fragment_index;
80 int err;
81
82 /* Allocate fragment lookup table indexes */
83 fragment_index = kmalloc(length, GFP_KERNEL);
84 if (fragment_index == NULL) {
85 ERROR("Failed to allocate fragment index table\n");
86 return ERR_PTR(-ENOMEM);
87 }
88
89 err = squashfs_read_table(sb, fragment_index, fragment_table_start,
90 length);
91 if (err < 0) {
92 ERROR("unable to read fragment index table\n");
93 kfree(fragment_index);
94 return ERR_PTR(err);
95 }
96
97 return fragment_index;
98}
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
new file mode 100644
index 000000000000..3795b837ba28
--- /dev/null
+++ b/fs/squashfs/id.c
@@ -0,0 +1,94 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * id.c
22 */
23
24/*
25 * This file implements code to handle uids and gids.
26 *
27 * For space efficiency regular files store uid and gid indexes, which are
28 * converted to 32-bit uids/gids using an id look up table. This table is
29 * stored compressed into metadata blocks. A second index table is used to
 30 * locate these. For speed of access (and because it is small) this second
 31 * index table is read at mount time and cached in memory.
32 */
33
34#include <linux/fs.h>
35#include <linux/vfs.h>
36#include <linux/slab.h>
37#include <linux/zlib.h>
38
39#include "squashfs_fs.h"
40#include "squashfs_fs_sb.h"
41#include "squashfs_fs_i.h"
42#include "squashfs.h"
43
44/*
 45 * Map a uid/gid index to a real 32-bit uid/gid using the id lookup table.
46 */
47int squashfs_get_id(struct super_block *sb, unsigned int index,
48 unsigned int *id)
49{
50 struct squashfs_sb_info *msblk = sb->s_fs_info;
51 int block = SQUASHFS_ID_BLOCK(index);
52 int offset = SQUASHFS_ID_BLOCK_OFFSET(index);
53 u64 start_block = le64_to_cpu(msblk->id_table[block]);
54 __le32 disk_id;
55 int err;
56
57 err = squashfs_read_metadata(sb, &disk_id, &start_block, &offset,
58 sizeof(disk_id));
59 if (err < 0)
60 return err;
61
62 *id = le32_to_cpu(disk_id);
63 return 0;
64}
65
66
67/*
68 * Read uncompressed id lookup table indexes from disk into memory
69 */
70__le64 *squashfs_read_id_index_table(struct super_block *sb,
71 u64 id_table_start, unsigned short no_ids)
72{
73 unsigned int length = SQUASHFS_ID_BLOCK_BYTES(no_ids);
74 __le64 *id_table;
75 int err;
76
77 TRACE("In read_id_index_table, length %d\n", length);
78
79 /* Allocate id lookup table indexes */
80 id_table = kmalloc(length, GFP_KERNEL);
81 if (id_table == NULL) {
82 ERROR("Failed to allocate id index table\n");
83 return ERR_PTR(-ENOMEM);
84 }
85
86 err = squashfs_read_table(sb, id_table, id_table_start, length);
87 if (err < 0) {
88 ERROR("unable to read id index table\n");
89 kfree(id_table);
90 return ERR_PTR(err);
91 }
92
93 return id_table;
94}
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
new file mode 100644
index 000000000000..7a63398bb855
--- /dev/null
+++ b/fs/squashfs/inode.c
@@ -0,0 +1,346 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * inode.c
22 */
23
24/*
25 * This file implements code to create and read inodes from disk.
26 *
 27 * Inodes in Squashfs are identified by a 48-bit inode number which encodes the
28 * location of the compressed metadata block containing the inode, and the byte
29 * offset into that block where the inode is placed (<block, offset>).
30 *
31 * To maximise compression there are different inodes for each file type
32 * (regular file, directory, device, etc.), the inode contents and length
33 * varying with the type.
34 *
35 * To further maximise compression, two types of regular file inode and
36 * directory inode are defined: inodes optimised for frequently occurring
37 * regular files and directories, and extended types where extra
38 * information has to be stored.
39 */
40
41#include <linux/fs.h>
42#include <linux/vfs.h>
43#include <linux/zlib.h>
44
45#include "squashfs_fs.h"
46#include "squashfs_fs_sb.h"
47#include "squashfs_fs_i.h"
48#include "squashfs.h"
49
50/*
51 * Initialise VFS inode with the base inode information common to all
52 * Squashfs inode types. Sqsh_ino contains the unswapped base inode
53 * off disk.
54 */
55static int squashfs_new_inode(struct super_block *sb, struct inode *inode,
56 struct squashfs_base_inode *sqsh_ino)
57{
58 int err;
59
60 err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &inode->i_uid);
61 if (err)
62 return err;
63
64 err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->guid), &inode->i_gid);
65 if (err)
66 return err;
67
68 inode->i_ino = le32_to_cpu(sqsh_ino->inode_number);
69 inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime);
70 inode->i_atime.tv_sec = inode->i_mtime.tv_sec;
71 inode->i_ctime.tv_sec = inode->i_mtime.tv_sec;
72 inode->i_mode = le16_to_cpu(sqsh_ino->mode);
73 inode->i_size = 0;
74
75 return err;
76}
77
78
79struct inode *squashfs_iget(struct super_block *sb, long long ino,
80 unsigned int ino_number)
81{
82 struct inode *inode = iget_locked(sb, ino_number);
83 int err;
84
85 TRACE("Entered squashfs_iget\n");
86
87 if (!inode)
88 return ERR_PTR(-ENOMEM);
89 if (!(inode->i_state & I_NEW))
90 return inode;
91
92 err = squashfs_read_inode(inode, ino);
93 if (err) {
94 iget_failed(inode);
95 return ERR_PTR(err);
96 }
97
98 unlock_new_inode(inode);
99 return inode;
100}
101
102
103/*
104 * Initialise VFS inode by reading inode from inode table (compressed
 105 * metadata). The format and amount of data read depend on the inode type.
106 */
107int squashfs_read_inode(struct inode *inode, long long ino)
108{
109 struct super_block *sb = inode->i_sb;
110 struct squashfs_sb_info *msblk = sb->s_fs_info;
111 u64 block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table;
112 int err, type, offset = SQUASHFS_INODE_OFFSET(ino);
113 union squashfs_inode squashfs_ino;
114 struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base;
115
116 TRACE("Entered squashfs_read_inode\n");
117
118 /*
119 * Read inode base common to all inode types.
120 */
121 err = squashfs_read_metadata(sb, sqshb_ino, &block,
122 &offset, sizeof(*sqshb_ino));
123 if (err < 0)
124 goto failed_read;
125
126 err = squashfs_new_inode(sb, inode, sqshb_ino);
127 if (err)
128 goto failed_read;
129
130 block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table;
131 offset = SQUASHFS_INODE_OFFSET(ino);
132
133 type = le16_to_cpu(sqshb_ino->inode_type);
134 switch (type) {
135 case SQUASHFS_REG_TYPE: {
 136		unsigned int frag_offset, frag; int frag_size;	/* may be -ve */
137 u64 frag_blk;
138 struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg;
139
140 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
141 sizeof(*sqsh_ino));
142 if (err < 0)
143 goto failed_read;
144
145 frag = le32_to_cpu(sqsh_ino->fragment);
146 if (frag != SQUASHFS_INVALID_FRAG) {
147 frag_offset = le32_to_cpu(sqsh_ino->offset);
148 frag_size = squashfs_frag_lookup(sb, frag, &frag_blk);
149 if (frag_size < 0) {
150 err = frag_size;
151 goto failed_read;
152 }
153 } else {
154 frag_blk = SQUASHFS_INVALID_BLK;
155 frag_size = 0;
156 frag_offset = 0;
157 }
158
159 inode->i_nlink = 1;
160 inode->i_size = le32_to_cpu(sqsh_ino->file_size);
161 inode->i_fop = &generic_ro_fops;
162 inode->i_mode |= S_IFREG;
163 inode->i_blocks = ((inode->i_size - 1) >> 9) + 1;
164 squashfs_i(inode)->fragment_block = frag_blk;
165 squashfs_i(inode)->fragment_size = frag_size;
166 squashfs_i(inode)->fragment_offset = frag_offset;
167 squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block);
168 squashfs_i(inode)->block_list_start = block;
169 squashfs_i(inode)->offset = offset;
170 inode->i_data.a_ops = &squashfs_aops;
171
172 TRACE("File inode %x:%x, start_block %llx, block_list_start "
173 "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino),
174 offset, squashfs_i(inode)->start, block, offset);
175 break;
176 }
177 case SQUASHFS_LREG_TYPE: {
 178		unsigned int frag_offset, frag; int frag_size;	/* may be -ve */
179 u64 frag_blk;
180 struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg;
181
182 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
183 sizeof(*sqsh_ino));
184 if (err < 0)
185 goto failed_read;
186
187 frag = le32_to_cpu(sqsh_ino->fragment);
188 if (frag != SQUASHFS_INVALID_FRAG) {
189 frag_offset = le32_to_cpu(sqsh_ino->offset);
190 frag_size = squashfs_frag_lookup(sb, frag, &frag_blk);
191 if (frag_size < 0) {
192 err = frag_size;
193 goto failed_read;
194 }
195 } else {
196 frag_blk = SQUASHFS_INVALID_BLK;
197 frag_size = 0;
198 frag_offset = 0;
199 }
200
201 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
202 inode->i_size = le64_to_cpu(sqsh_ino->file_size);
203 inode->i_fop = &generic_ro_fops;
204 inode->i_mode |= S_IFREG;
205 inode->i_blocks = ((inode->i_size -
206 le64_to_cpu(sqsh_ino->sparse) - 1) >> 9) + 1;
207
208 squashfs_i(inode)->fragment_block = frag_blk;
209 squashfs_i(inode)->fragment_size = frag_size;
210 squashfs_i(inode)->fragment_offset = frag_offset;
211 squashfs_i(inode)->start = le64_to_cpu(sqsh_ino->start_block);
212 squashfs_i(inode)->block_list_start = block;
213 squashfs_i(inode)->offset = offset;
214 inode->i_data.a_ops = &squashfs_aops;
215
216 TRACE("File inode %x:%x, start_block %llx, block_list_start "
217 "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino),
218 offset, squashfs_i(inode)->start, block, offset);
219 break;
220 }
221 case SQUASHFS_DIR_TYPE: {
222 struct squashfs_dir_inode *sqsh_ino = &squashfs_ino.dir;
223
224 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
225 sizeof(*sqsh_ino));
226 if (err < 0)
227 goto failed_read;
228
229 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
230 inode->i_size = le16_to_cpu(sqsh_ino->file_size);
231 inode->i_op = &squashfs_dir_inode_ops;
232 inode->i_fop = &squashfs_dir_ops;
233 inode->i_mode |= S_IFDIR;
234 squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block);
235 squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset);
236 squashfs_i(inode)->dir_idx_cnt = 0;
237 squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode);
238
239 TRACE("Directory inode %x:%x, start_block %llx, offset %x\n",
240 SQUASHFS_INODE_BLK(ino), offset,
241 squashfs_i(inode)->start,
242 le16_to_cpu(sqsh_ino->offset));
243 break;
244 }
245 case SQUASHFS_LDIR_TYPE: {
246 struct squashfs_ldir_inode *sqsh_ino = &squashfs_ino.ldir;
247
248 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
249 sizeof(*sqsh_ino));
250 if (err < 0)
251 goto failed_read;
252
253 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
254 inode->i_size = le32_to_cpu(sqsh_ino->file_size);
255 inode->i_op = &squashfs_dir_inode_ops;
256 inode->i_fop = &squashfs_dir_ops;
257 inode->i_mode |= S_IFDIR;
258 squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block);
259 squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset);
260 squashfs_i(inode)->dir_idx_start = block;
261 squashfs_i(inode)->dir_idx_offset = offset;
262 squashfs_i(inode)->dir_idx_cnt = le16_to_cpu(sqsh_ino->i_count);
263 squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode);
264
265 TRACE("Long directory inode %x:%x, start_block %llx, offset "
266 "%x\n", SQUASHFS_INODE_BLK(ino), offset,
267 squashfs_i(inode)->start,
268 le16_to_cpu(sqsh_ino->offset));
269 break;
270 }
271 case SQUASHFS_SYMLINK_TYPE:
272 case SQUASHFS_LSYMLINK_TYPE: {
273 struct squashfs_symlink_inode *sqsh_ino = &squashfs_ino.symlink;
274
275 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
276 sizeof(*sqsh_ino));
277 if (err < 0)
278 goto failed_read;
279
280 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
281 inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
282 inode->i_op = &page_symlink_inode_operations;
283 inode->i_data.a_ops = &squashfs_symlink_aops;
284 inode->i_mode |= S_IFLNK;
285 squashfs_i(inode)->start = block;
286 squashfs_i(inode)->offset = offset;
287
288 TRACE("Symbolic link inode %x:%x, start_block %llx, offset "
289 "%x\n", SQUASHFS_INODE_BLK(ino), offset,
290 block, offset);
291 break;
292 }
293 case SQUASHFS_BLKDEV_TYPE:
294 case SQUASHFS_CHRDEV_TYPE:
295 case SQUASHFS_LBLKDEV_TYPE:
296 case SQUASHFS_LCHRDEV_TYPE: {
297 struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev;
298 unsigned int rdev;
299
300 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
301 sizeof(*sqsh_ino));
302 if (err < 0)
303 goto failed_read;
304
305 if (type == SQUASHFS_CHRDEV_TYPE)
306 inode->i_mode |= S_IFCHR;
307 else
308 inode->i_mode |= S_IFBLK;
309 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
310 rdev = le32_to_cpu(sqsh_ino->rdev);
311 init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
312
313 TRACE("Device inode %x:%x, rdev %x\n",
314 SQUASHFS_INODE_BLK(ino), offset, rdev);
315 break;
316 }
317 case SQUASHFS_FIFO_TYPE:
318 case SQUASHFS_SOCKET_TYPE:
319 case SQUASHFS_LFIFO_TYPE:
320 case SQUASHFS_LSOCKET_TYPE: {
321 struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc;
322
323 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
324 sizeof(*sqsh_ino));
325 if (err < 0)
326 goto failed_read;
327
328 if (type == SQUASHFS_FIFO_TYPE)
329 inode->i_mode |= S_IFIFO;
330 else
331 inode->i_mode |= S_IFSOCK;
332 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
333 init_special_inode(inode, inode->i_mode, 0);
334 break;
335 }
336 default:
337 ERROR("Unknown inode type %d in squashfs_iget!\n", type);
338 return -EINVAL;
339 }
340
341 return 0;
342
343failed_read:
344 ERROR("Unable to read inode 0x%llx\n", ino);
345 return err;
346}
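SQUASHFS_INODE_BLK() and SQUASHFS_INODE_OFFSET() are defined beyond the part of squashfs_fs.h included in this section. Assuming the conventional packing (byte offset in the low 16 bits, metadata block start above them), the 48-bit encoding round-trips as this runnable sketch shows:

	#include <stdio.h>

	int main(void)
	{
		/* Assumed layout, mirroring SQUASHFS_MKINODE(blk, off):
		 * low 16 bits = offset into the decompressed block,
		 * upper bits  = block start within the inode table. */
		long long blk = 0x1a000, off = 0x1c4;
		long long ino = (blk << 16) | off;

		printf("ino 0x%llx -> blk 0x%llx, off 0x%llx\n",
		       ino, ino >> 16, ino & 0xffff);
		return 0;
	}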
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
new file mode 100644
index 000000000000..9e398653b22b
--- /dev/null
+++ b/fs/squashfs/namei.c
@@ -0,0 +1,242 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * namei.c
22 */
23
24/*
25 * This file implements code to do filename lookup in directories.
26 *
27 * Like inodes, directories are packed into compressed metadata blocks, stored
28 * in a directory table. Directories are accessed using the start address of
29 * the metablock containing the directory and the offset into the
30 * decompressed block (<block, offset>).
31 *
32 * Directories are organised in a slightly complex way, and are not simply
33 * a list of file names. The organisation takes advantage of the
34 * fact that (in most cases) the inodes of the files will be in the same
35 * compressed metadata block, and therefore, can share the start block.
36 * Directories are therefore organised in a two level list, a directory
37 * header containing the shared start block value, and a sequence of directory
 38 * entries, each of which shares that start block. A new directory header
39 * is written once/if the inode start block changes. The directory
40 * header/directory entry list is repeated as many times as necessary.
41 *
42 * Directories are sorted, and can contain a directory index to speed up
43 * file lookup. Directory indexes store one entry per metablock, each entry
44 * storing the index/filename mapping to the first directory header
45 * in each metadata block. Directories are sorted in alphabetical order,
46 * and at lookup the index is scanned linearly looking for the first filename
47 * alphabetically larger than the filename being looked up. At this point the
48 * location of the metadata block the filename is in has been found.
 49 * The general idea of the index is to ensure that only one metadata block
 50 * needs to be decompressed per lookup, irrespective of the directory length.
51 * This scheme has the advantage that it doesn't require extra memory overhead
52 * and doesn't require much extra storage on disk.
53 */
54
55#include <linux/fs.h>
56#include <linux/vfs.h>
57#include <linux/slab.h>
58#include <linux/string.h>
59#include <linux/dcache.h>
60#include <linux/zlib.h>
61
62#include "squashfs_fs.h"
63#include "squashfs_fs_sb.h"
64#include "squashfs_fs_i.h"
65#include "squashfs.h"
66
67/*
68 * Lookup name in the directory index, returning the location of the metadata
69 * block containing it, and the directory index this represents.
70 *
71 * If we get an error reading the index then return the part of the index
72 * (if any) we have managed to read - the index isn't essential, just
73 * quicker.
74 */
75static int get_dir_index_using_name(struct super_block *sb,
76 u64 *next_block, int *next_offset, u64 index_start,
77 int index_offset, int i_count, const char *name,
78 int len)
79{
80 struct squashfs_sb_info *msblk = sb->s_fs_info;
81 int i, size, length = 0, err;
82 struct squashfs_dir_index *index;
83 char *str;
84
85 TRACE("Entered get_dir_index_using_name, i_count %d\n", i_count);
86
87 index = kmalloc(sizeof(*index) + SQUASHFS_NAME_LEN * 2 + 2, GFP_KERNEL);
88 if (index == NULL) {
89 ERROR("Failed to allocate squashfs_dir_index\n");
90 goto out;
91 }
92
93 str = &index->name[SQUASHFS_NAME_LEN + 1];
94 strncpy(str, name, len);
95 str[len] = '\0';
96
97 for (i = 0; i < i_count; i++) {
98 err = squashfs_read_metadata(sb, index, &index_start,
99 &index_offset, sizeof(*index));
100 if (err < 0)
101 break;
102
103
104 size = le32_to_cpu(index->size) + 1;
105
106 err = squashfs_read_metadata(sb, index->name, &index_start,
107 &index_offset, size);
108 if (err < 0)
109 break;
110
111 index->name[size] = '\0';
112
113 if (strcmp(index->name, str) > 0)
114 break;
115
116 length = le32_to_cpu(index->index);
117 *next_block = le32_to_cpu(index->start_block) +
118 msblk->directory_table;
119 }
120
121 *next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE;
122 kfree(index);
123
124out:
125 /*
126 * Return index (f_pos) of the looked up metadata block. Translate
127 * from internal f_pos to external f_pos which is offset by 3 because
128 * we invent "." and ".." entries which are not actually stored in the
129 * directory.
130 */
131 return length + 3;
132}
133
134
135static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry,
136 struct nameidata *nd)
137{
138 const unsigned char *name = dentry->d_name.name;
139 int len = dentry->d_name.len;
140 struct inode *inode = NULL;
141 struct squashfs_sb_info *msblk = dir->i_sb->s_fs_info;
142 struct squashfs_dir_header dirh;
143 struct squashfs_dir_entry *dire;
144 u64 block = squashfs_i(dir)->start + msblk->directory_table;
145 int offset = squashfs_i(dir)->offset;
146 int err, length = 0, dir_count, size;
147
148 TRACE("Entered squashfs_lookup [%llx:%x]\n", block, offset);
149
150 dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL);
151 if (dire == NULL) {
152 ERROR("Failed to allocate squashfs_dir_entry\n");
153 return ERR_PTR(-ENOMEM);
154 }
155
156 if (len > SQUASHFS_NAME_LEN) {
157 err = -ENAMETOOLONG;
158 goto failed;
159 }
160
161 length = get_dir_index_using_name(dir->i_sb, &block, &offset,
162 squashfs_i(dir)->dir_idx_start,
163 squashfs_i(dir)->dir_idx_offset,
164 squashfs_i(dir)->dir_idx_cnt, name, len);
165
166 while (length < i_size_read(dir)) {
167 /*
168 * Read directory header.
169 */
170 err = squashfs_read_metadata(dir->i_sb, &dirh, &block,
171 &offset, sizeof(dirh));
172 if (err < 0)
173 goto read_failure;
174
175 length += sizeof(dirh);
176
177 dir_count = le32_to_cpu(dirh.count) + 1;
178 while (dir_count--) {
179 /*
180 * Read directory entry.
181 */
182 err = squashfs_read_metadata(dir->i_sb, dire, &block,
183 &offset, sizeof(*dire));
184 if (err < 0)
185 goto read_failure;
186
187 size = le16_to_cpu(dire->size) + 1;
188
189 err = squashfs_read_metadata(dir->i_sb, dire->name,
190 &block, &offset, size);
191 if (err < 0)
192 goto read_failure;
193
194 length += sizeof(*dire) + size;
195
196 if (name[0] < dire->name[0])
197 goto exit_lookup;
198
199 if (len == size && !strncmp(name, dire->name, len)) {
200 unsigned int blk, off, ino_num;
201 long long ino;
202 blk = le32_to_cpu(dirh.start_block);
203 off = le16_to_cpu(dire->offset);
204 ino_num = le32_to_cpu(dirh.inode_number) +
205 (short) le16_to_cpu(dire->inode_number);
206 ino = SQUASHFS_MKINODE(blk, off);
207
208 TRACE("calling squashfs_iget for directory "
209 "entry %s, inode %x:%x, %d\n", name,
210 blk, off, ino_num);
211
212 inode = squashfs_iget(dir->i_sb, ino, ino_num);
213 if (IS_ERR(inode)) {
214 err = PTR_ERR(inode);
215 goto failed;
216 }
217
218 goto exit_lookup;
219 }
220 }
221 }
222
223exit_lookup:
224 kfree(dire);
225 if (inode)
226 return d_splice_alias(inode, dentry);
227 d_add(dentry, inode);
228 return ERR_PTR(0);
229
230read_failure:
231 ERROR("Unable to read directory block [%llx:%x]\n",
232 squashfs_i(dir)->start + msblk->directory_table,
233 squashfs_i(dir)->offset);
234failed:
235 kfree(dire);
236 return ERR_PTR(err);
237}
238
239
240const struct inode_operations squashfs_dir_inode_ops = {
241 .lookup = squashfs_lookup
242};
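A concrete illustration of the two-level layout described above: a directory of 100 files whose inodes all land in one metadata block is encoded as one header (carrying the shared start block and a base inode number) followed by 100 entries, each needing only a 16-bit offset into that block and a signed 16-bit delta from the base inode number; the (short) cast on dire->inode_number in the lookup code is what unpacks that delta.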
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
new file mode 100644
index 000000000000..6b2515d027d5
--- /dev/null
+++ b/fs/squashfs/squashfs.h
@@ -0,0 +1,90 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * squashfs.h
22 */
23
24#define TRACE(s, args...) pr_debug("SQUASHFS: "s, ## args)
25
26#define ERROR(s, args...) pr_err("SQUASHFS error: "s, ## args)
27
28#define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args)
29
30static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
31{
32 return list_entry(inode, struct squashfs_inode_info, vfs_inode);
33}
34
35/* block.c */
36extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *,
37 int);
38
39/* cache.c */
40extern struct squashfs_cache *squashfs_cache_init(char *, int, int);
41extern void squashfs_cache_delete(struct squashfs_cache *);
42extern struct squashfs_cache_entry *squashfs_cache_get(struct super_block *,
43 struct squashfs_cache *, u64, int);
44extern void squashfs_cache_put(struct squashfs_cache_entry *);
45extern int squashfs_copy_data(void *, struct squashfs_cache_entry *, int, int);
46extern int squashfs_read_metadata(struct super_block *, void *, u64 *,
47 int *, int);
48extern struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *,
49 u64, int);
50extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *,
51 u64, int);
52extern int squashfs_read_table(struct super_block *, void *, u64, int);
53
54/* export.c */
55extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64,
56 unsigned int);
57
58/* fragment.c */
59extern int squashfs_frag_lookup(struct super_block *, unsigned int, u64 *);
60extern __le64 *squashfs_read_fragment_index_table(struct super_block *,
61 u64, unsigned int);
62
63/* id.c */
64extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *);
65extern __le64 *squashfs_read_id_index_table(struct super_block *, u64,
66 unsigned short);
67
68/* inode.c */
69extern struct inode *squashfs_iget(struct super_block *, long long,
70 unsigned int);
71extern int squashfs_read_inode(struct inode *, long long);
72
73/*
74 * Inode and file operations
75 */
76
77/* dir.c */
78extern const struct file_operations squashfs_dir_ops;
79
80/* export.c */
81extern const struct export_operations squashfs_export_ops;
82
83/* file.c */
84extern const struct address_space_operations squashfs_aops;
85
86/* namei.c */
87extern const struct inode_operations squashfs_dir_inode_ops;
88
89/* symlink.c */
90extern const struct address_space_operations squashfs_symlink_aops;
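
squashfs_i() above maps a VFS inode back to its enclosing squashfs_inode_info with the list_entry()/container_of() pattern: subtract the offset of the embedded member from the member's address. A reduced userspace sketch of the same arithmetic (stub types, not the kernel's):

#include <stddef.h>

struct vfs_inode_stub { int i_mode; };

struct info_stub {
	unsigned long long start;
	struct vfs_inode_stub vfs_inode;	/* embedded member */
};

/* Recover the container from a pointer to its embedded member. */
static struct info_stub *info_of(struct vfs_inode_stub *inode)
{
	return (struct info_stub *)
		((char *)inode - offsetof(struct info_stub, vfs_inode));
}
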
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
new file mode 100644
index 000000000000..6840da1bf21e
--- /dev/null
+++ b/fs/squashfs/squashfs_fs.h
@@ -0,0 +1,381 @@
1#ifndef SQUASHFS_FS
2#define SQUASHFS_FS
3/*
4 * Squashfs
5 *
6 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
7 * Phillip Lougher <phillip@lougher.demon.co.uk>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version 2,
12 * or (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 *
23 * squashfs_fs.h
24 */
25
26#define SQUASHFS_CACHED_FRAGMENTS CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE
27#define SQUASHFS_MAJOR 4
28#define SQUASHFS_MINOR 0
29#define SQUASHFS_MAGIC 0x73717368
30#define SQUASHFS_START 0
31
32/* size of metadata (inode and directory) blocks */
33#define SQUASHFS_METADATA_SIZE 8192
34#define SQUASHFS_METADATA_LOG 13
35
36/* default size of data blocks */
37#define SQUASHFS_FILE_SIZE 131072
38#define SQUASHFS_FILE_LOG 17
39
40#define SQUASHFS_FILE_MAX_SIZE 1048576
41#define SQUASHFS_FILE_MAX_LOG 20
42
43/* Max number of uids and gids */
44#define SQUASHFS_IDS 65536
45
46/* Max length of filename (not 255) */
47#define SQUASHFS_NAME_LEN 256
48
49#define SQUASHFS_INVALID_FRAG (0xffffffffU)
50#define SQUASHFS_INVALID_BLK (-1LL)
51
52/* Filesystem flags */
53#define SQUASHFS_NOI 0
54#define SQUASHFS_NOD 1
55#define SQUASHFS_NOF 3
56#define SQUASHFS_NO_FRAG 4
57#define SQUASHFS_ALWAYS_FRAG 5
58#define SQUASHFS_DUPLICATE 6
59#define SQUASHFS_EXPORT 7
60
61#define SQUASHFS_BIT(flag, bit) ((flag >> bit) & 1)
62
63#define SQUASHFS_UNCOMPRESSED_INODES(flags) SQUASHFS_BIT(flags, \
64 SQUASHFS_NOI)
65
66#define SQUASHFS_UNCOMPRESSED_DATA(flags) SQUASHFS_BIT(flags, \
67 SQUASHFS_NOD)
68
69#define SQUASHFS_UNCOMPRESSED_FRAGMENTS(flags) SQUASHFS_BIT(flags, \
70 SQUASHFS_NOF)
71
72#define SQUASHFS_NO_FRAGMENTS(flags) SQUASHFS_BIT(flags, \
73 SQUASHFS_NO_FRAG)
74
75#define SQUASHFS_ALWAYS_FRAGMENTS(flags) SQUASHFS_BIT(flags, \
76 SQUASHFS_ALWAYS_FRAG)
77
78#define SQUASHFS_DUPLICATES(flags) SQUASHFS_BIT(flags, \
79 SQUASHFS_DUPLICATE)
80
81#define SQUASHFS_EXPORTABLE(flags) SQUASHFS_BIT(flags, \
82 SQUASHFS_EXPORT)
83
84/* Max number of types and file types */
85#define SQUASHFS_DIR_TYPE 1
86#define SQUASHFS_REG_TYPE 2
87#define SQUASHFS_SYMLINK_TYPE 3
88#define SQUASHFS_BLKDEV_TYPE 4
89#define SQUASHFS_CHRDEV_TYPE 5
90#define SQUASHFS_FIFO_TYPE 6
91#define SQUASHFS_SOCKET_TYPE 7
92#define SQUASHFS_LDIR_TYPE 8
93#define SQUASHFS_LREG_TYPE 9
94#define SQUASHFS_LSYMLINK_TYPE 10
95#define SQUASHFS_LBLKDEV_TYPE 11
96#define SQUASHFS_LCHRDEV_TYPE 12
97#define SQUASHFS_LFIFO_TYPE 13
98#define SQUASHFS_LSOCKET_TYPE 14
99
100/* Flag whether block is compressed or uncompressed, bit is set if block is
101 * uncompressed */
102#define SQUASHFS_COMPRESSED_BIT (1 << 15)
103
104#define SQUASHFS_COMPRESSED_SIZE(B) (((B) & ~SQUASHFS_COMPRESSED_BIT) ? \
105 (B) & ~SQUASHFS_COMPRESSED_BIT : SQUASHFS_COMPRESSED_BIT)
106
107#define SQUASHFS_COMPRESSED(B) (!((B) & SQUASHFS_COMPRESSED_BIT))
108
109#define SQUASHFS_COMPRESSED_BIT_BLOCK (1 << 24)
110
111#define SQUASHFS_COMPRESSED_SIZE_BLOCK(B) ((B) & \
112 ~SQUASHFS_COMPRESSED_BIT_BLOCK)
113
114#define SQUASHFS_COMPRESSED_BLOCK(B) (!((B) & SQUASHFS_COMPRESSED_BIT_BLOCK))
115
116/*
117 * Inode number ops. Inodes consist of a compressed block number, and an
118 * uncompressed offset within that block
119 */
120#define SQUASHFS_INODE_BLK(A) ((unsigned int) ((A) >> 16))
121
122#define SQUASHFS_INODE_OFFSET(A) ((unsigned int) ((A) & 0xffff))
123
124#define SQUASHFS_MKINODE(A, B) ((long long)(((long long) (A)\
125 << 16) + (B)))
126
127/* Translate between VFS mode and squashfs mode */
128#define SQUASHFS_MODE(A) ((A) & 0xfff)
129
130/* fragment and fragment table defines */
131#define SQUASHFS_FRAGMENT_BYTES(A) \
132 ((A) * sizeof(struct squashfs_fragment_entry))
133
134#define SQUASHFS_FRAGMENT_INDEX(A) (SQUASHFS_FRAGMENT_BYTES(A) / \
135 SQUASHFS_METADATA_SIZE)
136
137#define SQUASHFS_FRAGMENT_INDEX_OFFSET(A) (SQUASHFS_FRAGMENT_BYTES(A) % \
138 SQUASHFS_METADATA_SIZE)
139
140#define SQUASHFS_FRAGMENT_INDEXES(A) ((SQUASHFS_FRAGMENT_BYTES(A) + \
141 SQUASHFS_METADATA_SIZE - 1) / \
142 SQUASHFS_METADATA_SIZE)
143
144#define SQUASHFS_FRAGMENT_INDEX_BYTES(A) (SQUASHFS_FRAGMENT_INDEXES(A) *\
145 sizeof(u64))
146
147/* inode lookup table defines */
148#define SQUASHFS_LOOKUP_BYTES(A) ((A) * sizeof(u64))
149
150#define SQUASHFS_LOOKUP_BLOCK(A) (SQUASHFS_LOOKUP_BYTES(A) / \
151 SQUASHFS_METADATA_SIZE)
152
153#define SQUASHFS_LOOKUP_BLOCK_OFFSET(A) (SQUASHFS_LOOKUP_BYTES(A) % \
154 SQUASHFS_METADATA_SIZE)
155
156#define SQUASHFS_LOOKUP_BLOCKS(A) ((SQUASHFS_LOOKUP_BYTES(A) + \
157 SQUASHFS_METADATA_SIZE - 1) / \
158 SQUASHFS_METADATA_SIZE)
159
160#define SQUASHFS_LOOKUP_BLOCK_BYTES(A) (SQUASHFS_LOOKUP_BLOCKS(A) *\
161 sizeof(u64))
162
163/* uid/gid lookup table defines */
164#define SQUASHFS_ID_BYTES(A) ((A) * sizeof(unsigned int))
165
166#define SQUASHFS_ID_BLOCK(A) (SQUASHFS_ID_BYTES(A) / \
167 SQUASHFS_METADATA_SIZE)
168
169#define SQUASHFS_ID_BLOCK_OFFSET(A) (SQUASHFS_ID_BYTES(A) % \
170 SQUASHFS_METADATA_SIZE)
171
172#define SQUASHFS_ID_BLOCKS(A) ((SQUASHFS_ID_BYTES(A) + \
173 SQUASHFS_METADATA_SIZE - 1) / \
174 SQUASHFS_METADATA_SIZE)
175
176#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\
177 sizeof(u64))
178
179/* cached data constants for filesystem */
180#define SQUASHFS_CACHED_BLKS 8
181
182#define SQUASHFS_MAX_FILE_SIZE_LOG 64
183
184#define SQUASHFS_MAX_FILE_SIZE (1LL << \
185 (SQUASHFS_MAX_FILE_SIZE_LOG - 2))
186
187#define SQUASHFS_MARKER_BYTE 0xff
188
189/* meta index cache */
190#define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int))
191#define SQUASHFS_META_ENTRIES 127
192#define SQUASHFS_META_SLOTS 8
193
194struct meta_entry {
195 u64 data_block;
196 unsigned int index_block;
197 unsigned short offset;
198 unsigned short pad;
199};
200
201struct meta_index {
202 unsigned int inode_number;
203 unsigned int offset;
204 unsigned short entries;
205 unsigned short skip;
206 unsigned short locked;
207 unsigned short pad;
208 struct meta_entry meta_entry[SQUASHFS_META_ENTRIES];
209};
210
211
212/*
213 * definitions for structures on disk
214 */
215#define ZLIB_COMPRESSION 1
216
217struct squashfs_super_block {
218 __le32 s_magic;
219 __le32 inodes;
220 __le32 mkfs_time;
221 __le32 block_size;
222 __le32 fragments;
223 __le16 compression;
224 __le16 block_log;
225 __le16 flags;
226 __le16 no_ids;
227 __le16 s_major;
228 __le16 s_minor;
229 __le64 root_inode;
230 __le64 bytes_used;
231 __le64 id_table_start;
232 __le64 xattr_table_start;
233 __le64 inode_table_start;
234 __le64 directory_table_start;
235 __le64 fragment_table_start;
236 __le64 lookup_table_start;
237};
238
239struct squashfs_dir_index {
240 __le32 index;
241 __le32 start_block;
242 __le32 size;
243 unsigned char name[0];
244};
245
246struct squashfs_base_inode {
247 __le16 inode_type;
248 __le16 mode;
249 __le16 uid;
250 __le16 guid;
251 __le32 mtime;
252 __le32 inode_number;
253};
254
255struct squashfs_ipc_inode {
256 __le16 inode_type;
257 __le16 mode;
258 __le16 uid;
259 __le16 guid;
260 __le32 mtime;
261 __le32 inode_number;
262 __le32 nlink;
263};
264
265struct squashfs_dev_inode {
266 __le16 inode_type;
267 __le16 mode;
268 __le16 uid;
269 __le16 guid;
270 __le32 mtime;
271 __le32 inode_number;
272 __le32 nlink;
273 __le32 rdev;
274};
275
276struct squashfs_symlink_inode {
277 __le16 inode_type;
278 __le16 mode;
279 __le16 uid;
280 __le16 guid;
281 __le32 mtime;
282 __le32 inode_number;
283 __le32 nlink;
284 __le32 symlink_size;
285 char symlink[0];
286};
287
288struct squashfs_reg_inode {
289 __le16 inode_type;
290 __le16 mode;
291 __le16 uid;
292 __le16 guid;
293 __le32 mtime;
294 __le32 inode_number;
295 __le32 start_block;
296 __le32 fragment;
297 __le32 offset;
298 __le32 file_size;
299 __le16 block_list[0];
300};
301
302struct squashfs_lreg_inode {
303 __le16 inode_type;
304 __le16 mode;
305 __le16 uid;
306 __le16 guid;
307 __le32 mtime;
308 __le32 inode_number;
309 __le64 start_block;
310 __le64 file_size;
311 __le64 sparse;
312 __le32 nlink;
313 __le32 fragment;
314 __le32 offset;
315 __le32 xattr;
316 __le16 block_list[0];
317};
318
319struct squashfs_dir_inode {
320 __le16 inode_type;
321 __le16 mode;
322 __le16 uid;
323 __le16 guid;
324 __le32 mtime;
325 __le32 inode_number;
326 __le32 start_block;
327 __le32 nlink;
328 __le16 file_size;
329 __le16 offset;
330 __le32 parent_inode;
331};
332
333struct squashfs_ldir_inode {
334 __le16 inode_type;
335 __le16 mode;
336 __le16 uid;
337 __le16 guid;
338 __le32 mtime;
339 __le32 inode_number;
340 __le32 nlink;
341 __le32 file_size;
342 __le32 start_block;
343 __le32 parent_inode;
344 __le16 i_count;
345 __le16 offset;
346 __le32 xattr;
347 struct squashfs_dir_index index[0];
348};
349
350union squashfs_inode {
351 struct squashfs_base_inode base;
352 struct squashfs_dev_inode dev;
353 struct squashfs_symlink_inode symlink;
354 struct squashfs_reg_inode reg;
355 struct squashfs_lreg_inode lreg;
356 struct squashfs_dir_inode dir;
357 struct squashfs_ldir_inode ldir;
358 struct squashfs_ipc_inode ipc;
359};
360
361struct squashfs_dir_entry {
362 __le16 offset;
363 __le16 inode_number;
364 __le16 type;
365 __le16 size;
366 char name[0];
367};
368
369struct squashfs_dir_header {
370 __le32 count;
371 __le32 start_block;
372 __le32 inode_number;
373};
374
375struct squashfs_fragment_entry {
376 __le64 start_block;
377 __le32 size;
378 unsigned int unused;
379};
380
381#endif
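
The inode number macros above pack a metadata block address into the upper bits of a 64-bit value and the byte offset within the uncompressed block into the low 16 bits; 16 bits suffice because a metadata block is at most SQUASHFS_METADATA_SIZE (8K) when uncompressed. A small self-checking sketch of the round trip, assuming nothing beyond the shifts shown in the header:

#include <assert.h>

static long long mk_inode(unsigned int blk, unsigned int off)
{
	return ((long long)blk << 16) + off;	/* SQUASHFS_MKINODE */
}

int main(void)
{
	long long ino = mk_inode(0x123456, 0x0abc);

	assert((unsigned int)(ino >> 16) == 0x123456);	/* ..._INODE_BLK */
	assert((unsigned int)(ino & 0xffff) == 0x0abc);	/* ..._INODE_OFFSET */
	return 0;
}
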
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
new file mode 100644
index 000000000000..fbfca30c0c68
--- /dev/null
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -0,0 +1,45 @@
1#ifndef SQUASHFS_FS_I
2#define SQUASHFS_FS_I
3/*
4 * Squashfs
5 *
6 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
7 * Phillip Lougher <phillip@lougher.demon.co.uk>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version 2,
12 * or (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 *
23 * squashfs_fs_i.h
24 */
25
26struct squashfs_inode_info {
27 u64 start;
28 int offset;
29 union {
30 struct {
31 u64 fragment_block;
32 int fragment_size;
33 int fragment_offset;
34 u64 block_list_start;
35 };
36 struct {
37 u64 dir_idx_start;
38 int dir_idx_offset;
39 int dir_idx_cnt;
40 int parent;
41 };
42 };
43 struct inode vfs_inode;
44};
45#endif
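
The anonymous union above lets an inode carry either the regular-file fields (fragment_*, block_list_start) or the directory fields (dir_idx_*, parent) in the same storage, since an inode is only ever one type; start and offset sit outside the union because every inode needs its metadata location. A reduced sketch of the layout idea, using C11/GNU C anonymous members as the kernel header does (hypothetical field subset):

struct demo_inode_info {
	unsigned long long start;	/* metadata block holding the inode */
	int offset;			/* byte offset within that block */
	union {
		struct {		/* regular files only */
			unsigned long long fragment_block;
			int fragment_offset;
		};
		struct {		/* directories only */
			unsigned long long dir_idx_start;
			int parent;
		};
	};
};
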
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
new file mode 100644
index 000000000000..c8c65614dd1c
--- /dev/null
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -0,0 +1,76 @@
1#ifndef SQUASHFS_FS_SB
2#define SQUASHFS_FS_SB
3/*
4 * Squashfs
5 *
6 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
7 * Phillip Lougher <phillip@lougher.demon.co.uk>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version 2,
12 * or (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
22 *
23 * squashfs_fs_sb.h
24 */
25
26#include "squashfs_fs.h"
27
28struct squashfs_cache {
29 char *name;
30 int entries;
31 int next_blk;
32 int num_waiters;
33 int unused;
34 int block_size;
35 int pages;
36 spinlock_t lock;
37 wait_queue_head_t wait_queue;
38 struct squashfs_cache_entry *entry;
39};
40
41struct squashfs_cache_entry {
42 u64 block;
43 int length;
44 int refcount;
45 u64 next_index;
46 int pending;
47 int error;
48 int num_waiters;
49 wait_queue_head_t wait_queue;
50 struct squashfs_cache *cache;
51 void **data;
52};
53
54struct squashfs_sb_info {
55 int devblksize;
56 int devblksize_log2;
57 struct squashfs_cache *block_cache;
58 struct squashfs_cache *fragment_cache;
59 struct squashfs_cache *read_page;
60 int next_meta_index;
61 __le64 *id_table;
62 __le64 *fragment_index;
63 unsigned int *fragment_index_2;
64 struct mutex read_data_mutex;
65 struct mutex meta_index_mutex;
66 struct meta_index *meta_index;
67 z_stream stream;
68 __le64 *inode_lookup_table;
69 u64 inode_table;
70 u64 directory_table;
71 unsigned int block_size;
72 unsigned short block_log;
73 long long bytes_used;
74 unsigned int inodes;
75};
76#endif
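
struct squashfs_cache_entry above combines a refcount with a wait queue: a reader either finds the block cached and takes a reference, or claims a slot and fills it while concurrent readers sleep until the read completes; dropping the last reference makes the slot reusable. A miniature of that get/put discipline, with pthreads standing in for the kernel primitives (hit path only; slot claiming and disk I/O elided):

#include <stddef.h>
#include <pthread.h>

#define SLOTS 8

struct cache_entry {
	unsigned long long block;
	int refcount;
	int pending;		/* block read still in flight */
};

struct cache {
	pthread_mutex_t lock;
	pthread_cond_t ready;
	struct cache_entry entry[SLOTS];
};

static struct cache_entry *cache_get(struct cache *c, unsigned long long blk)
{
	int i;

	pthread_mutex_lock(&c->lock);
	for (i = 0; i < SLOTS; i++) {
		if (c->entry[i].block == blk) {
			c->entry[i].refcount++;
			while (c->entry[i].pending)	/* filler not done */
				pthread_cond_wait(&c->ready, &c->lock);
			pthread_mutex_unlock(&c->lock);
			return &c->entry[i];
		}
	}
	pthread_mutex_unlock(&c->lock);
	return NULL;		/* miss: caller would claim and fill a slot */
}

static void cache_put(struct cache *c, struct cache_entry *e)
{
	pthread_mutex_lock(&c->lock);
	e->refcount--;		/* slot becomes reclaimable at zero */
	pthread_mutex_unlock(&c->lock);
}
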
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
new file mode 100644
index 000000000000..a0466d7467b2
--- /dev/null
+++ b/fs/squashfs/super.c
@@ -0,0 +1,440 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * super.c
22 */
23
24/*
25 * This file implements code to read the superblock, read and initialise
26 * in-memory structures at mount time, and all the VFS glue code to register
27 * the filesystem.
28 */
29
30#include <linux/fs.h>
31#include <linux/vfs.h>
32#include <linux/slab.h>
33#include <linux/mutex.h>
34#include <linux/pagemap.h>
35#include <linux/init.h>
36#include <linux/module.h>
37#include <linux/zlib.h>
38
39#include "squashfs_fs.h"
40#include "squashfs_fs_sb.h"
41#include "squashfs_fs_i.h"
42#include "squashfs.h"
43
44static struct file_system_type squashfs_fs_type;
45static struct super_operations squashfs_super_ops;
46
47static int supported_squashfs_filesystem(short major, short minor, short comp)
48{
49 if (major < SQUASHFS_MAJOR) {
50 ERROR("Major/Minor mismatch, older Squashfs %d.%d "
51 "filesystems are unsupported\n", major, minor);
52 return -EINVAL;
53 } else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) {
54 ERROR("Major/Minor mismatch, trying to mount newer "
55 "%d.%d filesystem\n", major, minor);
56 ERROR("Please update your kernel\n");
57 return -EINVAL;
58 }
59
60 if (comp != ZLIB_COMPRESSION)
61 return -EINVAL;
62
63 return 0;
64}
65
66
67static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
68{
69 struct squashfs_sb_info *msblk;
70 struct squashfs_super_block *sblk = NULL;
71 char b[BDEVNAME_SIZE];
72 struct inode *root;
73 long long root_inode;
74 unsigned short flags;
75 unsigned int fragments;
76 u64 lookup_table_start;
77 int err;
78
79	TRACE("Entered squashfs_fill_super\n");
80
81 sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL);
82 if (sb->s_fs_info == NULL) {
83 ERROR("Failed to allocate squashfs_sb_info\n");
84 return -ENOMEM;
85 }
86 msblk = sb->s_fs_info;
87
88 msblk->stream.workspace = kmalloc(zlib_inflate_workspacesize(),
89 GFP_KERNEL);
90 if (msblk->stream.workspace == NULL) {
91 ERROR("Failed to allocate zlib workspace\n");
92 goto failure;
93 }
94
95 sblk = kzalloc(sizeof(*sblk), GFP_KERNEL);
96 if (sblk == NULL) {
97 ERROR("Failed to allocate squashfs_super_block\n");
98 goto failure;
99 }
100
101 msblk->devblksize = sb_min_blocksize(sb, BLOCK_SIZE);
102 msblk->devblksize_log2 = ffz(~msblk->devblksize);
103
104 mutex_init(&msblk->read_data_mutex);
105 mutex_init(&msblk->meta_index_mutex);
106
107 /*
108 * msblk->bytes_used is checked in squashfs_read_table to ensure reads
109 * are not beyond filesystem end. But as we're using
110 * squashfs_read_table here to read the superblock (including the value
111 * of bytes_used) we need to set it to an initial sensible dummy value
112 */
113 msblk->bytes_used = sizeof(*sblk);
114 err = squashfs_read_table(sb, sblk, SQUASHFS_START, sizeof(*sblk));
115
116 if (err < 0) {
117 ERROR("unable to read squashfs_super_block\n");
118 goto failed_mount;
119 }
120
121 /* Check it is a SQUASHFS superblock */
122 sb->s_magic = le32_to_cpu(sblk->s_magic);
123 if (sb->s_magic != SQUASHFS_MAGIC) {
124 if (!silent)
125 ERROR("Can't find a SQUASHFS superblock on %s\n",
126 bdevname(sb->s_bdev, b));
127 err = -EINVAL;
128 goto failed_mount;
129 }
130
131 /* Check the MAJOR & MINOR versions and compression type */
132 err = supported_squashfs_filesystem(le16_to_cpu(sblk->s_major),
133 le16_to_cpu(sblk->s_minor),
134 le16_to_cpu(sblk->compression));
135 if (err < 0)
136 goto failed_mount;
137
138 err = -EINVAL;
139
140 /*
141	 * Check if there are xattrs in the filesystem.  These are not
142 * supported in this version, so warn that they will be ignored.
143 */
144 if (le64_to_cpu(sblk->xattr_table_start) != SQUASHFS_INVALID_BLK)
145 ERROR("Xattrs in filesystem, these will be ignored\n");
146
147 /* Check the filesystem does not extend beyond the end of the
148 block device */
149 msblk->bytes_used = le64_to_cpu(sblk->bytes_used);
150 if (msblk->bytes_used < 0 || msblk->bytes_used >
151 i_size_read(sb->s_bdev->bd_inode))
152 goto failed_mount;
153
154 /* Check block size for sanity */
155 msblk->block_size = le32_to_cpu(sblk->block_size);
156 if (msblk->block_size > SQUASHFS_FILE_MAX_SIZE)
157 goto failed_mount;
158
159 msblk->block_log = le16_to_cpu(sblk->block_log);
160 if (msblk->block_log > SQUASHFS_FILE_MAX_LOG)
161 goto failed_mount;
162
163 /* Check the root inode for sanity */
164 root_inode = le64_to_cpu(sblk->root_inode);
165 if (SQUASHFS_INODE_OFFSET(root_inode) > SQUASHFS_METADATA_SIZE)
166 goto failed_mount;
167
168 msblk->inode_table = le64_to_cpu(sblk->inode_table_start);
169 msblk->directory_table = le64_to_cpu(sblk->directory_table_start);
170 msblk->inodes = le32_to_cpu(sblk->inodes);
171 flags = le16_to_cpu(sblk->flags);
172
173 TRACE("Found valid superblock on %s\n", bdevname(sb->s_bdev, b));
174 TRACE("Inodes are %scompressed\n", SQUASHFS_UNCOMPRESSED_INODES(flags)
175 ? "un" : "");
176 TRACE("Data is %scompressed\n", SQUASHFS_UNCOMPRESSED_DATA(flags)
177 ? "un" : "");
178 TRACE("Filesystem size %lld bytes\n", msblk->bytes_used);
179 TRACE("Block size %d\n", msblk->block_size);
180 TRACE("Number of inodes %d\n", msblk->inodes);
181 TRACE("Number of fragments %d\n", le32_to_cpu(sblk->fragments));
182 TRACE("Number of ids %d\n", le16_to_cpu(sblk->no_ids));
183 TRACE("sblk->inode_table_start %llx\n", msblk->inode_table);
184 TRACE("sblk->directory_table_start %llx\n", msblk->directory_table);
185 TRACE("sblk->fragment_table_start %llx\n",
186 (u64) le64_to_cpu(sblk->fragment_table_start));
187 TRACE("sblk->id_table_start %llx\n",
188 (u64) le64_to_cpu(sblk->id_table_start));
189
190 sb->s_maxbytes = MAX_LFS_FILESIZE;
191 sb->s_flags |= MS_RDONLY;
192 sb->s_op = &squashfs_super_ops;
193
194 err = -ENOMEM;
195
196 msblk->block_cache = squashfs_cache_init("metadata",
197 SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE);
198 if (msblk->block_cache == NULL)
199 goto failed_mount;
200
201 /* Allocate read_page block */
202 msblk->read_page = squashfs_cache_init("data", 1, msblk->block_size);
203 if (msblk->read_page == NULL) {
204 ERROR("Failed to allocate read_page block\n");
205 goto failed_mount;
206 }
207
208 /* Allocate and read id index table */
209 msblk->id_table = squashfs_read_id_index_table(sb,
210 le64_to_cpu(sblk->id_table_start), le16_to_cpu(sblk->no_ids));
211 if (IS_ERR(msblk->id_table)) {
212 err = PTR_ERR(msblk->id_table);
213 msblk->id_table = NULL;
214 goto failed_mount;
215 }
216
217 fragments = le32_to_cpu(sblk->fragments);
218 if (fragments == 0)
219 goto allocate_lookup_table;
220
221 msblk->fragment_cache = squashfs_cache_init("fragment",
222 SQUASHFS_CACHED_FRAGMENTS, msblk->block_size);
223 if (msblk->fragment_cache == NULL) {
224 err = -ENOMEM;
225 goto failed_mount;
226 }
227
228 /* Allocate and read fragment index table */
229 msblk->fragment_index = squashfs_read_fragment_index_table(sb,
230 le64_to_cpu(sblk->fragment_table_start), fragments);
231 if (IS_ERR(msblk->fragment_index)) {
232 err = PTR_ERR(msblk->fragment_index);
233 msblk->fragment_index = NULL;
234 goto failed_mount;
235 }
236
237allocate_lookup_table:
238 lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
239 if (lookup_table_start == SQUASHFS_INVALID_BLK)
240 goto allocate_root;
241
242 /* Allocate and read inode lookup table */
243 msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb,
244 lookup_table_start, msblk->inodes);
245 if (IS_ERR(msblk->inode_lookup_table)) {
246 err = PTR_ERR(msblk->inode_lookup_table);
247 msblk->inode_lookup_table = NULL;
248 goto failed_mount;
249 }
250
251 sb->s_export_op = &squashfs_export_ops;
252
253allocate_root:
254 root = new_inode(sb);
255 if (!root) {
256 err = -ENOMEM;
257 goto failed_mount;
258 }
259
260 err = squashfs_read_inode(root, root_inode);
261 if (err) {
262 iget_failed(root);
263 goto failed_mount;
264 }
265 insert_inode_hash(root);
266
267 sb->s_root = d_alloc_root(root);
268 if (sb->s_root == NULL) {
269 ERROR("Root inode create failed\n");
270 err = -ENOMEM;
271 iput(root);
272 goto failed_mount;
273 }
274
275 TRACE("Leaving squashfs_fill_super\n");
276 kfree(sblk);
277 return 0;
278
279failed_mount:
280 squashfs_cache_delete(msblk->block_cache);
281 squashfs_cache_delete(msblk->fragment_cache);
282 squashfs_cache_delete(msblk->read_page);
283 kfree(msblk->inode_lookup_table);
284 kfree(msblk->fragment_index);
285 kfree(msblk->id_table);
286 kfree(msblk->stream.workspace);
287 kfree(sb->s_fs_info);
288 sb->s_fs_info = NULL;
289 kfree(sblk);
290 return err;
291
292failure:
293 kfree(msblk->stream.workspace);
294 kfree(sb->s_fs_info);
295 sb->s_fs_info = NULL;
296 return -ENOMEM;
297}
298
299
300static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
301{
302 struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info;
303
304 TRACE("Entered squashfs_statfs\n");
305
306 buf->f_type = SQUASHFS_MAGIC;
307 buf->f_bsize = msblk->block_size;
308 buf->f_blocks = ((msblk->bytes_used - 1) >> msblk->block_log) + 1;
309 buf->f_bfree = buf->f_bavail = 0;
310 buf->f_files = msblk->inodes;
311 buf->f_ffree = 0;
312 buf->f_namelen = SQUASHFS_NAME_LEN;
313
314 return 0;
315}
316
317
318static int squashfs_remount(struct super_block *sb, int *flags, char *data)
319{
320 *flags |= MS_RDONLY;
321 return 0;
322}
323
324
325static void squashfs_put_super(struct super_block *sb)
326{
327 if (sb->s_fs_info) {
328 struct squashfs_sb_info *sbi = sb->s_fs_info;
329 squashfs_cache_delete(sbi->block_cache);
330 squashfs_cache_delete(sbi->fragment_cache);
331 squashfs_cache_delete(sbi->read_page);
332 kfree(sbi->id_table);
333 kfree(sbi->fragment_index);
334 kfree(sbi->meta_index);
335 kfree(sbi->stream.workspace);
336 kfree(sb->s_fs_info);
337 sb->s_fs_info = NULL;
338 }
339}
340
341
342static int squashfs_get_sb(struct file_system_type *fs_type, int flags,
343 const char *dev_name, void *data,
344 struct vfsmount *mnt)
345{
346 return get_sb_bdev(fs_type, flags, dev_name, data, squashfs_fill_super,
347 mnt);
348}
349
350
351static struct kmem_cache *squashfs_inode_cachep;
352
353
354static void init_once(void *foo)
355{
356 struct squashfs_inode_info *ei = foo;
357
358 inode_init_once(&ei->vfs_inode);
359}
360
361
362static int __init init_inodecache(void)
363{
364 squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache",
365 sizeof(struct squashfs_inode_info), 0,
366 SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once);
367
368 return squashfs_inode_cachep ? 0 : -ENOMEM;
369}
370
371
372static void destroy_inodecache(void)
373{
374 kmem_cache_destroy(squashfs_inode_cachep);
375}
376
377
378static int __init init_squashfs_fs(void)
379{
380 int err = init_inodecache();
381
382 if (err)
383 return err;
384
385 err = register_filesystem(&squashfs_fs_type);
386 if (err) {
387 destroy_inodecache();
388 return err;
389 }
390
391 printk(KERN_INFO "squashfs: version 4.0 (2009/01/03) "
392 "Phillip Lougher\n");
393
394 return 0;
395}
396
397
398static void __exit exit_squashfs_fs(void)
399{
400 unregister_filesystem(&squashfs_fs_type);
401 destroy_inodecache();
402}
403
404
405static struct inode *squashfs_alloc_inode(struct super_block *sb)
406{
407 struct squashfs_inode_info *ei =
408 kmem_cache_alloc(squashfs_inode_cachep, GFP_KERNEL);
409
410 return ei ? &ei->vfs_inode : NULL;
411}
412
413
414static void squashfs_destroy_inode(struct inode *inode)
415{
416 kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode));
417}
418
419
420static struct file_system_type squashfs_fs_type = {
421 .owner = THIS_MODULE,
422 .name = "squashfs",
423 .get_sb = squashfs_get_sb,
424 .kill_sb = kill_block_super,
425 .fs_flags = FS_REQUIRES_DEV
426};
427
428static struct super_operations squashfs_super_ops = {
429 .alloc_inode = squashfs_alloc_inode,
430 .destroy_inode = squashfs_destroy_inode,
431 .statfs = squashfs_statfs,
432 .put_super = squashfs_put_super,
433 .remount_fs = squashfs_remount
434};
435
436module_init(init_squashfs_fs);
437module_exit(exit_squashfs_fs);
438MODULE_DESCRIPTION("squashfs 4.0, a compressed read-only filesystem");
439MODULE_AUTHOR("Phillip Lougher <phillip@lougher.demon.co.uk>");
440MODULE_LICENSE("GPL");
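
One detail of the mount path above: devblksize_log2 is computed as ffz(~devblksize), the index of the first zero bit of the complement, which equals log2 of a power-of-two block size. A self-checking sketch of why that works:

#include <assert.h>

/* Index of the first zero bit, counting from bit 0 (the ffz() contract). */
static int first_zero_bit(unsigned int v)
{
	int n = 0;

	while (v & 1u) {
		v >>= 1;
		n++;
	}
	return n;
}

int main(void)
{
	/*
	 * For a power-of-two x, ~x has ones in every position below the
	 * set bit, so its first zero bit lands exactly at log2(x).
	 */
	assert(first_zero_bit(~4096u) == 12);
	assert(first_zero_bit(~512u) == 9);
	return 0;
}
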
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
new file mode 100644
index 000000000000..83d87880aac8
--- /dev/null
+++ b/fs/squashfs/symlink.c
@@ -0,0 +1,118 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * symlink.c
22 */
23
24/*
25 * This file implements code to handle symbolic links.
26 *
27 * The data contents of symbolic links are stored inside the symbolic
28 * link inode within the inode table. This allows the normally small symbolic
29 * link to be compressed as part of the inode table, achieving much greater
30 * compression than if the symbolic link was compressed individually.
31 */
32
33#include <linux/fs.h>
34#include <linux/vfs.h>
35#include <linux/kernel.h>
36#include <linux/slab.h>
37#include <linux/string.h>
38#include <linux/pagemap.h>
39#include <linux/zlib.h>
40
41#include "squashfs_fs.h"
42#include "squashfs_fs_sb.h"
43#include "squashfs_fs_i.h"
44#include "squashfs.h"
45
46static int squashfs_symlink_readpage(struct file *file, struct page *page)
47{
48 struct inode *inode = page->mapping->host;
49 struct super_block *sb = inode->i_sb;
50 struct squashfs_sb_info *msblk = sb->s_fs_info;
51 int index = page->index << PAGE_CACHE_SHIFT;
52 u64 block = squashfs_i(inode)->start;
53 int offset = squashfs_i(inode)->offset;
54 int length = min_t(int, i_size_read(inode) - index, PAGE_CACHE_SIZE);
55 int bytes, copied;
56 void *pageaddr;
57 struct squashfs_cache_entry *entry;
58
59 TRACE("Entered squashfs_symlink_readpage, page index %ld, start block "
60 "%llx, offset %x\n", page->index, block, offset);
61
62 /*
63 * Skip index bytes into symlink metadata.
64 */
65 if (index) {
66 bytes = squashfs_read_metadata(sb, NULL, &block, &offset,
67 index);
68 if (bytes < 0) {
69 ERROR("Unable to read symlink [%llx:%x]\n",
70 squashfs_i(inode)->start,
71 squashfs_i(inode)->offset);
72 goto error_out;
73 }
74 }
75
76 /*
77 * Read length bytes from symlink metadata. Squashfs_read_metadata
78 * is not used here because it can sleep and we want to use
79 * kmap_atomic to map the page. Instead call the underlying
80 * squashfs_cache_get routine. As length bytes may overlap metadata
81 * blocks, we may need to call squashfs_cache_get multiple times.
82 */
83 for (bytes = 0; bytes < length; offset = 0, bytes += copied) {
84 entry = squashfs_cache_get(sb, msblk->block_cache, block, 0);
85 if (entry->error) {
86 ERROR("Unable to read symlink [%llx:%x]\n",
87 squashfs_i(inode)->start,
88 squashfs_i(inode)->offset);
89 squashfs_cache_put(entry);
90 goto error_out;
91 }
92
93 pageaddr = kmap_atomic(page, KM_USER0);
94 copied = squashfs_copy_data(pageaddr + bytes, entry, offset,
95 length - bytes);
96 if (copied == length - bytes)
97 memset(pageaddr + length, 0, PAGE_CACHE_SIZE - length);
98 else
99 block = entry->next_index;
100 kunmap_atomic(pageaddr, KM_USER0);
101 squashfs_cache_put(entry);
102 }
103
104 flush_dcache_page(page);
105 SetPageUptodate(page);
106 unlock_page(page);
107 return 0;
108
109error_out:
110 SetPageError(page);
111 unlock_page(page);
112 return 0;
113}
114
115
116const struct address_space_operations squashfs_symlink_aops = {
117 .readpage = squashfs_symlink_readpage
118};
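
The copy loop in squashfs_symlink_readpage() above may cross metadata block boundaries: each squashfs_cache_get() yields one block, squashfs_copy_data() copies what that block holds, and the intra-block offset drops to zero for every block after the first. A reduced sketch of that spanning copy, with plain buffers in place of cache entries:

#include <string.h>

/*
 * Copy `length` bytes that may straddle fixed-size source blocks,
 * starting `offset` bytes into the first block.
 */
static void copy_spanning(char *dst, const char *const *blocks, int blksz,
			  int offset, int length)
{
	int bytes = 0, blk = 0;

	while (bytes < length) {
		int avail = blksz - offset;
		int n = length - bytes < avail ? length - bytes : avail;

		memcpy(dst + bytes, blocks[blk] + offset, n);
		bytes += n;
		blk++;
		offset = 0;	/* later blocks are consumed from the start */
	}
}
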
diff --git a/fs/stat.c b/fs/stat.c
index 7c46fbeb8b76..7e12a6f82795 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -305,7 +305,7 @@ asmlinkage long sys_readlinkat(int dfd, const char __user *pathname,
305 struct inode *inode = path.dentry->d_inode; 305 struct inode *inode = path.dentry->d_inode;
306 306
307 error = -EINVAL; 307 error = -EINVAL;
308 if (inode->i_op && inode->i_op->readlink) { 308 if (inode->i_op->readlink) {
309 error = security_inode_readlink(path.dentry); 309 error = security_inode_readlink(path.dentry);
310 if (!error) { 310 if (!error) {
311 touch_atime(path.mnt, path.dentry); 311 touch_atime(path.mnt, path.dentry);
diff --git a/fs/super.c b/fs/super.c
index ddba069d7a99..ed080c417167 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -38,6 +38,7 @@
38#include <linux/kobject.h> 38#include <linux/kobject.h>
39#include <linux/mutex.h> 39#include <linux/mutex.h>
40#include <linux/file.h> 40#include <linux/file.h>
41#include <linux/async.h>
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42#include "internal.h" 43#include "internal.h"
43 44
@@ -71,6 +72,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
71 INIT_HLIST_HEAD(&s->s_anon); 72 INIT_HLIST_HEAD(&s->s_anon);
72 INIT_LIST_HEAD(&s->s_inodes); 73 INIT_LIST_HEAD(&s->s_inodes);
73 INIT_LIST_HEAD(&s->s_dentry_lru); 74 INIT_LIST_HEAD(&s->s_dentry_lru);
75 INIT_LIST_HEAD(&s->s_async_list);
74 init_rwsem(&s->s_umount); 76 init_rwsem(&s->s_umount);
75 mutex_init(&s->s_lock); 77 mutex_init(&s->s_lock);
76 lockdep_set_class(&s->s_umount, &type->s_umount_key); 78 lockdep_set_class(&s->s_umount, &type->s_umount_key);
@@ -289,11 +291,18 @@ void generic_shutdown_super(struct super_block *sb)
289{ 291{
290 const struct super_operations *sop = sb->s_op; 292 const struct super_operations *sop = sb->s_op;
291 293
294
292 if (sb->s_root) { 295 if (sb->s_root) {
293 shrink_dcache_for_umount(sb); 296 shrink_dcache_for_umount(sb);
294 fsync_super(sb); 297 fsync_super(sb);
295 lock_super(sb); 298 lock_super(sb);
296 sb->s_flags &= ~MS_ACTIVE; 299 sb->s_flags &= ~MS_ACTIVE;
300
301 /*
302 * wait for asynchronous fs operations to finish before going further
303 */
304 async_synchronize_full_special(&sb->s_async_list);
305
297 /* bad name - it should be evict_inodes() */ 306 /* bad name - it should be evict_inodes() */
298 invalidate_inodes(sb); 307 invalidate_inodes(sb);
299 lock_kernel(); 308 lock_kernel();
@@ -461,6 +470,7 @@ restart:
461 sb->s_count++; 470 sb->s_count++;
462 spin_unlock(&sb_lock); 471 spin_unlock(&sb_lock);
463 down_read(&sb->s_umount); 472 down_read(&sb->s_umount);
473 async_synchronize_full_special(&sb->s_async_list);
464 if (sb->s_root && (wait || sb->s_dirt)) 474 if (sb->s_root && (wait || sb->s_dirt))
465 sb->s_op->sync_fs(sb, wait); 475 sb->s_op->sync_fs(sb, wait);
466 up_read(&sb->s_umount); 476 up_read(&sb->s_umount);
@@ -800,6 +810,7 @@ int get_sb_bdev(struct file_system_type *fs_type,
800 } 810 }
801 811
802 s->s_flags |= MS_ACTIVE; 812 s->s_flags |= MS_ACTIVE;
813 bdev->bd_super = s;
803 } 814 }
804 815
805 return simple_set_mnt(mnt, s); 816 return simple_set_mnt(mnt, s);
@@ -819,6 +830,7 @@ void kill_block_super(struct super_block *sb)
819 struct block_device *bdev = sb->s_bdev; 830 struct block_device *bdev = sb->s_bdev;
820 fmode_t mode = sb->s_mode; 831 fmode_t mode = sb->s_mode;
821 832
833 bdev->bd_super = 0;
822 generic_shutdown_super(sb); 834 generic_shutdown_super(sb);
823 sync_blockdev(bdev); 835 sync_blockdev(bdev);
824 close_bdev_exclusive(bdev, mode); 836 close_bdev_exclusive(bdev, mode);
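
The fs/super.c hunks above wire the superblock into the new async infrastructure: both umount and sync must drain s_async_list before touching inodes, and get_sb_bdev()/kill_block_super() now maintain a bdev->bd_super back-pointer. A miniature of the "wait until outstanding work drains" idea, with pthreads and a completion counter standing in for the kernel's async machinery:

#include <pthread.h>

struct async_domain {
	pthread_mutex_t lock;
	pthread_cond_t idle;
	int pending;		/* queued but unfinished work items */
};

static void async_done(struct async_domain *d)
{
	pthread_mutex_lock(&d->lock);
	if (--d->pending == 0)
		pthread_cond_broadcast(&d->idle);
	pthread_mutex_unlock(&d->lock);
}

/*
 * Analogue of async_synchronize_full_special(): block the caller until
 * every item outstanding against this domain has completed.
 */
static void async_synchronize(struct async_domain *d)
{
	pthread_mutex_lock(&d->lock);
	while (d->pending > 0)
		pthread_cond_wait(&d->idle, &d->lock);
	pthread_mutex_unlock(&d->lock);
}
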
diff --git a/fs/sync.c b/fs/sync.c
index 2967562d416f..ac02b56548bc 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -75,14 +75,39 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
75 return ret; 75 return ret;
76} 76}
77 77
78long do_fsync(struct file *file, int datasync) 78/**
79 * vfs_fsync - perform a fsync or fdatasync on a file
80 * @file: file to sync
81 * @dentry: dentry of @file
 82 * @datasync: only perform a fdatasync operation
83 *
84 * Write back data and metadata for @file to disk. If @datasync is
85 * set only metadata needed to access modified file data is written.
86 *
87 * In case this function is called from nfsd @file may be %NULL and
88 * only @dentry is set. This can only happen when the filesystem
89 * implements the export_operations API.
90 */
91int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
79{ 92{
80 int ret; 93 const struct file_operations *fop;
81 int err; 94 struct address_space *mapping;
82 struct address_space *mapping = file->f_mapping; 95 int err, ret;
96
97 /*
98 * Get mapping and operations from the file in case we have
99 * as file, or get the default values for them in case we
100 * don't have a struct file available. Damn nfsd..
101 */
102 if (file) {
103 mapping = file->f_mapping;
104 fop = file->f_op;
105 } else {
106 mapping = dentry->d_inode->i_mapping;
107 fop = dentry->d_inode->i_fop;
108 }
83 109
84 if (!file->f_op || !file->f_op->fsync) { 110 if (!fop || !fop->fsync) {
85 /* Why? We can still call filemap_fdatawrite */
86 ret = -EINVAL; 111 ret = -EINVAL;
87 goto out; 112 goto out;
88 } 113 }
@@ -94,7 +119,7 @@ long do_fsync(struct file *file, int datasync)
94 * livelocks in fsync_buffers_list(). 119 * livelocks in fsync_buffers_list().
95 */ 120 */
96 mutex_lock(&mapping->host->i_mutex); 121 mutex_lock(&mapping->host->i_mutex);
97 err = file->f_op->fsync(file, file->f_path.dentry, datasync); 122 err = fop->fsync(file, dentry, datasync);
98 if (!ret) 123 if (!ret)
99 ret = err; 124 ret = err;
100 mutex_unlock(&mapping->host->i_mutex); 125 mutex_unlock(&mapping->host->i_mutex);
@@ -104,15 +129,16 @@ long do_fsync(struct file *file, int datasync)
104out: 129out:
105 return ret; 130 return ret;
106} 131}
132EXPORT_SYMBOL(vfs_fsync);
107 133
108static long __do_fsync(unsigned int fd, int datasync) 134static int do_fsync(unsigned int fd, int datasync)
109{ 135{
110 struct file *file; 136 struct file *file;
111 int ret = -EBADF; 137 int ret = -EBADF;
112 138
113 file = fget(fd); 139 file = fget(fd);
114 if (file) { 140 if (file) {
115 ret = do_fsync(file, datasync); 141 ret = vfs_fsync(file, file->f_path.dentry, datasync);
116 fput(file); 142 fput(file);
117 } 143 }
118 return ret; 144 return ret;
@@ -120,12 +146,12 @@ static long __do_fsync(unsigned int fd, int datasync)
120 146
121asmlinkage long sys_fsync(unsigned int fd) 147asmlinkage long sys_fsync(unsigned int fd)
122{ 148{
123 return __do_fsync(fd, 0); 149 return do_fsync(fd, 0);
124} 150}
125 151
126asmlinkage long sys_fdatasync(unsigned int fd) 152asmlinkage long sys_fdatasync(unsigned int fd)
127{ 153{
128 return __do_fsync(fd, 1); 154 return do_fsync(fd, 1);
129} 155}
130 156
131/* 157/*
@@ -269,7 +295,7 @@ int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
269 295
270 if (flags & SYNC_FILE_RANGE_WRITE) { 296 if (flags & SYNC_FILE_RANGE_WRITE) {
271 ret = __filemap_fdatawrite_range(mapping, offset, endbyte, 297 ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
272 WB_SYNC_NONE); 298 WB_SYNC_ALL);
273 if (ret < 0) 299 if (ret < 0)
274 goto out; 300 goto out;
275 } 301 }
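
vfs_fsync() above exists so nfsd can sync a file it never opened: when the struct file is NULL, the mapping and fsync method come from the dentry's inode instead. A reduced sketch of that fallback selection (stub types and a toy error code, not the kernel's):

#include <stddef.h>

struct mapping_stub { int dirty; };

struct inode_stub {
	struct mapping_stub *i_mapping;
	int (*fsync)(void);
};

struct file_stub {
	struct mapping_stub *f_mapping;
	int (*fsync)(void);
};

/* Prefer the struct file when present; fall back to the inode (nfsd). */
static int sketch_vfs_fsync(struct file_stub *file, struct inode_stub *inode)
{
	struct mapping_stub *mapping = file ? file->f_mapping
					    : inode->i_mapping;
	int (*fsync)(void) = file ? file->fsync : inode->fsync;

	if (!fsync)
		return -1;	/* the kernel returns -EINVAL here */
	(void)mapping;		/* writeback bookkeeping elided */
	return fsync();
}
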
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index eb53c632f856..dfa3d94cfc74 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -107,8 +107,6 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
107static inline void set_default_inode_attr(struct inode * inode, mode_t mode) 107static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
108{ 108{
109 inode->i_mode = mode; 109 inode->i_mode = mode;
110 inode->i_uid = 0;
111 inode->i_gid = 0;
112 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 110 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
113} 111}
114 112
@@ -149,7 +147,6 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
149{ 147{
150 struct bin_attribute *bin_attr; 148 struct bin_attribute *bin_attr;
151 149
152 inode->i_blocks = 0;
153 inode->i_mapping->a_ops = &sysfs_aops; 150 inode->i_mapping->a_ops = &sysfs_aops;
154 inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; 151 inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
155 inode->i_op = &sysfs_inode_operations; 152 inode->i_op = &sysfs_inode_operations;
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index df0d435baa48..3d81bf58dae2 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -27,6 +27,7 @@
27#include <linux/init.h> 27#include <linux/init.h>
28#include <linux/buffer_head.h> 28#include <linux/buffer_head.h>
29#include <linux/vfs.h> 29#include <linux/vfs.h>
30#include <linux/namei.h>
30#include <asm/byteorder.h> 31#include <asm/byteorder.h>
31#include "sysv.h" 32#include "sysv.h"
32 33
@@ -163,8 +164,11 @@ void sysv_set_inode(struct inode *inode, dev_t rdev)
163 if (inode->i_blocks) { 164 if (inode->i_blocks) {
164 inode->i_op = &sysv_symlink_inode_operations; 165 inode->i_op = &sysv_symlink_inode_operations;
165 inode->i_mapping->a_ops = &sysv_aops; 166 inode->i_mapping->a_ops = &sysv_aops;
166 } else 167 } else {
167 inode->i_op = &sysv_fast_symlink_inode_operations; 168 inode->i_op = &sysv_fast_symlink_inode_operations;
169 nd_terminate_link(SYSV_I(inode)->i_data, inode->i_size,
170 sizeof(SYSV_I(inode)->i_data) - 1);
171 }
168 } else 172 } else
169 init_special_inode(inode, inode->i_mode, rdev); 173 init_special_inode(inode, inode->i_mode, rdev);
170} 174}
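
The sysv change above NUL-terminates fast symlinks whose body lives directly in the inode's i_data, since the on-disk bytes carry no terminator. nd_terminate_link() amounts to writing '\0' at min(len, maxlen); a one-function sketch of that bound:

/* Terminate an in-inode link body at min(len, maxlen). */
static void terminate_link(char *name, unsigned int len, unsigned int maxlen)
{
	name[len < maxlen ? len : maxlen] = '\0';
}
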
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index 91ceeda7e5bf..e35b54d5059d 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -40,7 +40,7 @@ config UBIFS_FS_ZLIB
40 depends on UBIFS_FS 40 depends on UBIFS_FS
41 default y 41 default y
42 help 42 help
43 Zlib copresses better then LZO but it is slower. Say 'Y' if unsure. 43 Zlib compresses better than LZO but it is slower. Say 'Y' if unsure.
44 44
45# Debugging-related stuff 45# Debugging-related stuff
46config UBIFS_FS_DEBUG 46config UBIFS_FS_DEBUG
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 4a18f084cc42..175f9c590b77 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -32,18 +32,15 @@
32 32
33#include "ubifs.h" 33#include "ubifs.h"
34#include <linux/writeback.h> 34#include <linux/writeback.h>
35#include <asm/div64.h> 35#include <linux/math64.h>
36 36
37/* 37/*
38 * When pessimistic budget calculations say that there is no enough space, 38 * When pessimistic budget calculations say that there is no enough space,
39 * UBIFS starts writing back dirty inodes and pages, doing garbage collection, 39 * UBIFS starts writing back dirty inodes and pages, doing garbage collection,
40 * or committing. The below constants define maximum number of times UBIFS 40 * or committing. The below constant defines maximum number of times UBIFS
41 * repeats the operations. 41 * repeats the operations.
42 */ 42 */
43#define MAX_SHRINK_RETRIES 8 43#define MAX_MKSPC_RETRIES 3
44#define MAX_GC_RETRIES 4
45#define MAX_CMT_RETRIES 2
46#define MAX_NOSPC_RETRIES 1
47 44
48/* 45/*
49 * The below constant defines amount of dirty pages which should be written 46 * The below constant defines amount of dirty pages which should be written
@@ -52,30 +49,6 @@
52#define NR_TO_WRITE 16 49#define NR_TO_WRITE 16
53 50
54/** 51/**
55 * struct retries_info - information about re-tries while making free space.
56 * @prev_liability: previous liability
57 * @shrink_cnt: how many times the liability was shrinked
58 * @shrink_retries: count of liability shrink re-tries (increased when
59 * liability does not shrink)
60 * @try_gc: GC should be tried first
61 * @gc_retries: how many times GC was run
62 * @cmt_retries: how many times commit has been done
63 * @nospc_retries: how many times GC returned %-ENOSPC
64 *
65 * Since we consider budgeting to be the fast-path, and this structure has to
66 * be allocated on stack and zeroed out, we make it smaller using bit-fields.
67 */
68struct retries_info {
69 long long prev_liability;
70 unsigned int shrink_cnt;
71 unsigned int shrink_retries:5;
72 unsigned int try_gc:1;
73 unsigned int gc_retries:4;
74 unsigned int cmt_retries:3;
75 unsigned int nospc_retries:1;
76};
77
78/**
79 * shrink_liability - write-back some dirty pages/inodes. 52 * shrink_liability - write-back some dirty pages/inodes.
80 * @c: UBIFS file-system description object 53 * @c: UBIFS file-system description object
81 * @nr_to_write: how many dirty pages to write-back 54 * @nr_to_write: how many dirty pages to write-back
@@ -147,13 +120,29 @@ static int run_gc(struct ubifs_info *c)
147} 120}
148 121
149/** 122/**
123 * get_liability - calculate current liability.
124 * @c: UBIFS file-system description object
125 *
126 * This function calculates and returns current UBIFS liability, i.e. the
127 * amount of bytes UBIFS has "promised" to write to the media.
128 */
129static long long get_liability(struct ubifs_info *c)
130{
131 long long liab;
132
133 spin_lock(&c->space_lock);
134 liab = c->budg_idx_growth + c->budg_data_growth + c->budg_dd_growth;
135 spin_unlock(&c->space_lock);
136 return liab;
137}
138
139/**
150 * make_free_space - make more free space on the file-system. 140 * make_free_space - make more free space on the file-system.
151 * @c: UBIFS file-system description object 141 * @c: UBIFS file-system description object
152 * @ri: information about previous invocations of this function
153 * 142 *
154 * This function is called when an operation cannot be budgeted because there 143 * This function is called when an operation cannot be budgeted because there
155 * is supposedly no free space. But in most cases there is some free space: 144 * is supposedly no free space. But in most cases there is some free space:
156 * o budgeting is pessimistic, so it always budgets more then it is actually 145 * o budgeting is pessimistic, so it always budgets more than it is actually
157 * needed, so shrinking the liability is one way to make free space - the 146 * needed, so shrinking the liability is one way to make free space - the
158 * cached data will take less space than it was budgeted for; 147 * cached data will take less space than it was budgeted for;
159 * o GC may turn some dark space into free space (budgeting treats dark space 148 * o GC may turn some dark space into free space (budgeting treats dark space
@@ -165,87 +154,42 @@ static int run_gc(struct ubifs_info *c)
165 * Returns %-ENOSPC if it couldn't do more free space, and other negative error 154 * Returns %-ENOSPC if it couldn't do more free space, and other negative error
166 * codes on failures. 155 * codes on failures.
167 */ 156 */
168static int make_free_space(struct ubifs_info *c, struct retries_info *ri) 157static int make_free_space(struct ubifs_info *c)
169{ 158{
170 int err; 159 int err, retries = 0;
171 160 long long liab1, liab2;
172 /*
173 * If we have some dirty pages and inodes (liability), try to write
174 * them back unless this was tried too many times without effect
175 * already.
176 */
177 if (ri->shrink_retries < MAX_SHRINK_RETRIES && !ri->try_gc) {
178 long long liability;
179
180 spin_lock(&c->space_lock);
181 liability = c->budg_idx_growth + c->budg_data_growth +
182 c->budg_dd_growth;
183 spin_unlock(&c->space_lock);
184 161
185 if (ri->prev_liability >= liability) { 162 do {
186 /* Liability does not shrink, next time try GC then */ 163 liab1 = get_liability(c);
187 ri->shrink_retries += 1; 164 /*
188 if (ri->gc_retries < MAX_GC_RETRIES) 165 * We probably have some dirty pages or inodes (liability), try
189 ri->try_gc = 1; 166 * to write them back.
190 dbg_budg("liability did not shrink: retries %d of %d", 167 */
191 ri->shrink_retries, MAX_SHRINK_RETRIES); 168 dbg_budg("liability %lld, run write-back", liab1);
192 } 169 shrink_liability(c, NR_TO_WRITE);
193 170
194 dbg_budg("force write-back (count %d)", ri->shrink_cnt); 171 liab2 = get_liability(c);
195 shrink_liability(c, NR_TO_WRITE + ri->shrink_cnt); 172 if (liab2 < liab1)
173 return -EAGAIN;
196 174
197 ri->prev_liability = liability; 175 dbg_budg("new liability %lld (not shrinked)", liab2);
198 ri->shrink_cnt += 1;
199 return -EAGAIN;
200 }
201 176
202 /* 177 /* Liability did not shrink again, try GC */
203 * Try to run garbage collector unless it was already tried too many 178 dbg_budg("Run GC");
204 * times.
205 */
206 if (ri->gc_retries < MAX_GC_RETRIES) {
207 ri->gc_retries += 1;
208 dbg_budg("run GC, retries %d of %d",
209 ri->gc_retries, MAX_GC_RETRIES);
210
211 ri->try_gc = 0;
212 err = run_gc(c); 179 err = run_gc(c);
213 if (!err) 180 if (!err)
214 return -EAGAIN; 181 return -EAGAIN;
215 182
216 if (err == -EAGAIN) { 183 if (err != -EAGAIN && err != -ENOSPC)
217 dbg_budg("GC asked to commit"); 184 /* Some real error happened */
218 err = ubifs_run_commit(c);
219 if (err)
220 return err;
221 return -EAGAIN;
222 }
223
224 if (err != -ENOSPC)
225 return err;
226
227 /*
228 * GC could not make any progress. If this is the first time,
229 * then it makes sense to try to commit, because it might make
230 * some dirty space.
231 */
232 dbg_budg("GC returned -ENOSPC, retries %d",
233 ri->nospc_retries);
234 if (ri->nospc_retries >= MAX_NOSPC_RETRIES)
235 return err; 185 return err;
236 ri->nospc_retries += 1;
237 }
238 186
239 /* Neither GC nor write-back helped, try to commit */ 187 dbg_budg("Run commit (retries %d)", retries);
240 if (ri->cmt_retries < MAX_CMT_RETRIES) {
241 ri->cmt_retries += 1;
242 dbg_budg("run commit, retries %d of %d",
243 ri->cmt_retries, MAX_CMT_RETRIES);
244 err = ubifs_run_commit(c); 188 err = ubifs_run_commit(c);
245 if (err) 189 if (err)
246 return err; 190 return err;
247 return -EAGAIN; 191 } while (retries++ < MAX_MKSPC_RETRIES);
248 } 192
249 return -ENOSPC; 193 return -ENOSPC;
250} 194}
251 195
@@ -258,8 +202,8 @@ static int make_free_space(struct ubifs_info *c, struct retries_info *ri)
258 */ 202 */
259int ubifs_calc_min_idx_lebs(struct ubifs_info *c) 203int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
260{ 204{
261 int ret; 205 int idx_lebs, eff_leb_size = c->leb_size - c->max_idx_node_sz;
262 uint64_t idx_size; 206 long long idx_size;
263 207
264 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; 208 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
265 209
@@ -271,23 +215,16 @@ int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
271 * pair, nor similarly the two variables for the new index size, so we 215 * pair, nor similarly the two variables for the new index size, so we
272 * have to do this costly 64-bit division on fast-path. 216 * have to do this costly 64-bit division on fast-path.
273 */ 217 */
274 if (do_div(idx_size, c->leb_size - c->max_idx_node_sz)) 218 idx_size += eff_leb_size - 1;
275 ret = idx_size + 1; 219 idx_lebs = div_u64(idx_size, eff_leb_size);
276 else
277 ret = idx_size;
278 /* 220 /*
279 * The index head is not available for the in-the-gaps method, so add an 221 * The index head is not available for the in-the-gaps method, so add an
280 * extra LEB to compensate. 222 * extra LEB to compensate.
281 */ 223 */
282 ret += 1; 224 idx_lebs += 1;
283 /* 225 if (idx_lebs < MIN_INDEX_LEBS)
284 * At present the index needs at least 2 LEBs: one for the index head 226 idx_lebs = MIN_INDEX_LEBS;
285 * and one for in-the-gaps method (which currently does not cater for 227 return idx_lebs;
286 * the index head and so excludes it from consideration).
287 */
288 if (ret < 2)
289 ret = 2;
290 return ret;
291} 228}
292 229
293/** 230/**
@@ -530,8 +467,7 @@ static int calc_dd_growth(const struct ubifs_info *c,
530int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req) 467int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
531{ 468{
532 int uninitialized_var(cmt_retries), uninitialized_var(wb_retries); 469 int uninitialized_var(cmt_retries), uninitialized_var(wb_retries);
533 int err, idx_growth, data_growth, dd_growth; 470 int err, idx_growth, data_growth, dd_growth, retried = 0;
534 struct retries_info ri;
535 471
536 ubifs_assert(req->new_page <= 1); 472 ubifs_assert(req->new_page <= 1);
537 ubifs_assert(req->dirtied_page <= 1); 473 ubifs_assert(req->dirtied_page <= 1);
@@ -549,7 +485,6 @@ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
549 if (!data_growth && !dd_growth) 485 if (!data_growth && !dd_growth)
550 return 0; 486 return 0;
551 idx_growth = calc_idx_growth(c, req); 487 idx_growth = calc_idx_growth(c, req);
552 memset(&ri, 0, sizeof(struct retries_info));
553 488
554again: 489again:
555 spin_lock(&c->space_lock); 490 spin_lock(&c->space_lock);
@@ -587,12 +522,17 @@ again:
587 return err; 522 return err;
588 } 523 }
589 524
590 err = make_free_space(c, &ri); 525 err = make_free_space(c);
526 cond_resched();
591 if (err == -EAGAIN) { 527 if (err == -EAGAIN) {
592 dbg_budg("try again"); 528 dbg_budg("try again");
593 cond_resched();
594 goto again; 529 goto again;
595 } else if (err == -ENOSPC) { 530 } else if (err == -ENOSPC) {
531 if (!retried) {
532 retried = 1;
533 dbg_budg("-ENOSPC, but anyway try once again");
534 goto again;
535 }
596 dbg_budg("FS is full, -ENOSPC"); 536 dbg_budg("FS is full, -ENOSPC");
597 c->nospace = 1; 537 c->nospace = 1;
598 if (can_use_rp(c) || c->rp_size == 0) 538 if (can_use_rp(c) || c->rp_size == 0)
@@ -666,7 +606,7 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
666 * @c: UBIFS file-system description object 606 * @c: UBIFS file-system description object
667 * 607 *
668 * This function converts budget which was allocated for a new page of data to 608 * This function converts budget which was allocated for a new page of data to
669 * the budget of changing an existing page of data. The latter is smaller then 609 * the budget of changing an existing page of data. The latter is smaller than
670 * the former, so this function only does simple re-calculation and does not 610 * the former, so this function only does simple re-calculation and does not
671 * involve any write-back. 611 * involve any write-back.
672 */ 612 */
@@ -712,9 +652,9 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
712 * user-space. User-space application tend to expect that if the file-system 652 * user-space. User-space application tend to expect that if the file-system
713 * (e.g., via the 'statfs()' call) reports that it has N bytes available, they 653 * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
714 * are able to write a file of size N. UBIFS attaches node headers to each data 654 * are able to write a file of size N. UBIFS attaches node headers to each data
715 * node and it has to write indexind nodes as well. This introduces additional 655 * node and it has to write indexing nodes as well. This introduces additional
716 * overhead, and UBIFS it has to report sligtly less free space to meet the 656 * overhead, and UBIFS has to report slightly less free space to meet the above
717 * above expectetion. 657 * expectations.
718 * 658 *
719 * This function assumes free space is made up of uncompressed data nodes and 659 * This function assumes free space is made up of uncompressed data nodes and
720 * full index nodes (one per data node, tripled because we always allow enough 660 * full index nodes (one per data node, tripled because we always allow enough
@@ -723,7 +663,7 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
723 * Note, the calculation is pessimistic, which means that most of the time 663 * Note, the calculation is pessimistic, which means that most of the time
724 * UBIFS reports less space than it actually has. 664 * UBIFS reports less space than it actually has.
725 */ 665 */
726long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free) 666long long ubifs_reported_space(const struct ubifs_info *c, long long free)
727{ 667{
728 int divisor, factor, f; 668 int divisor, factor, f;
729 669
@@ -737,7 +677,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
737 * of data nodes, f - fanout. Because effective UBIFS fanout is twice 677 * of data nodes, f - fanout. Because effective UBIFS fanout is twice
738 * as less than maximum fanout, we assume that each data node 678 * as less than maximum fanout, we assume that each data node
739 * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes. 679 * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes.
740 * Note, the multiplier 3 is because UBIFS reseves thrice as more space 680 * Note, the multiplier 3 is because UBIFS reserves thrice as more space
741 * for the index. 681 * for the index.
742 */ 682 */
743 f = c->fanout > 3 ? c->fanout >> 1 : 2; 683 f = c->fanout > 3 ? c->fanout >> 1 : 2;
@@ -745,8 +685,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
745 divisor = UBIFS_MAX_DATA_NODE_SZ; 685 divisor = UBIFS_MAX_DATA_NODE_SZ;
746 divisor += (c->max_idx_node_sz * 3) / (f - 1); 686 divisor += (c->max_idx_node_sz * 3) / (f - 1);
747 free *= factor; 687 free *= factor;
748 do_div(free, divisor); 688 return div_u64(free, divisor);
749 return free;
750} 689}
751 690
752/** 691/**
@@ -756,10 +695,10 @@ long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
756 * This function calculates amount of free space to report to user-space. 695 * This function calculates amount of free space to report to user-space.
757 * 696 *
758 * Because UBIFS may introduce substantial overhead (the index, node headers, 697 * Because UBIFS may introduce substantial overhead (the index, node headers,
759 * alighment, wastage at the end of eraseblocks, etc), it cannot report real 698 * alignment, wastage at the end of eraseblocks, etc), it cannot report real
760 * amount of free flash space it has (well, because not all dirty space is 699 * amount of free flash space it has (well, because not all dirty space is
761 * reclamable, UBIFS does not actually know the real amount). If UBIFS did so, 700 * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so,
762 * it would bread user expectetion about what free space is. Users seem to 701 * it would break user expectations about what free space is. Users seem to
763 * be accustomed to assuming that if the file-system reports N bytes of free space, 702 * be accustomed to assuming that if the file-system reports N bytes of free space,
764 * they would be able to fit a file of N bytes to the FS. This almost works for 703 * they would be able to fit a file of N bytes to the FS. This almost works for
765 * traditional file-systems, because they have way less overhead than UBIFS. 704 * traditional file-systems, because they have way less overhead than UBIFS.
@@ -771,18 +710,9 @@ long long ubifs_get_free_space(struct ubifs_info *c)
771 long long available, outstanding, free; 710 long long available, outstanding, free;
772 711
773 spin_lock(&c->space_lock); 712 spin_lock(&c->space_lock);
774 min_idx_lebs = ubifs_calc_min_idx_lebs(c); 713 min_idx_lebs = c->min_idx_lebs;
714 ubifs_assert(min_idx_lebs == ubifs_calc_min_idx_lebs(c));
775 outstanding = c->budg_data_growth + c->budg_dd_growth; 715 outstanding = c->budg_data_growth + c->budg_dd_growth;
776
777 /*
778 * Force the amount available to the total size reported if the used
779 * space is zero.
780 */
781 if (c->lst.total_used <= UBIFS_INO_NODE_SZ && !outstanding) {
782 spin_unlock(&c->space_lock);
783 return (long long)c->block_cnt << UBIFS_BLOCK_SHIFT;
784 }
785
786 available = ubifs_calc_available(c, min_idx_lebs); 716 available = ubifs_calc_available(c, min_idx_lebs);
787 717
788 /* 718 /*
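For reference, the free-space arithmetic the budget.c hunks above converge on: reported space is free * factor / divisor, where factor is the UBIFS block size and divisor is the worst-case on-flash cost of one block (the maximum data node size plus the tripled indexing overhead at half the maximum fanout). The change also swaps the open-coded do_div() sequence, which stores the quotient back into its argument, for div_u64(), which simply returns it. A minimal user-space sketch of the calculation; the 4096/4144 constants stand in for UBIFS_BLOCK_SIZE and UBIFS_MAX_DATA_NODE_SZ and should be read as assumptions, not authoritative values:

    #include <stdio.h>

    #define BLOCK_SIZE       4096   /* stand-in for UBIFS_BLOCK_SIZE */
    #define MAX_DATA_NODE_SZ 4144   /* stand-in for UBIFS_MAX_DATA_NODE_SZ */

    static long long reported_space(long long free, int fanout,
                                    int max_idx_node_sz)
    {
            int f = fanout > 3 ? fanout >> 1 : 2;   /* effective fanout */
            int divisor = MAX_DATA_NODE_SZ;

            /* each data node drags 3 index-node shares behind it */
            divisor += (max_idx_node_sz * 3) / (f - 1);
            return free * BLOCK_SIZE / divisor;     /* div_u64() in-kernel */
    }

    int main(void)
    {
            /* 1 MiB raw free space, fanout 8, 512-byte index nodes */
            printf("%lld\n", reported_space(1 << 20, 8, 512));
            return 0;
    }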
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index b49884c8c10e..f3a7945527fb 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -470,12 +470,12 @@ int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot)
470{ 470{
471 struct ubifs_idx_node *idx; 471 struct ubifs_idx_node *idx;
472 int lnum, offs, len, err = 0; 472 int lnum, offs, len, err = 0;
473 struct ubifs_debug_info *d = c->dbg;
473 474
474 c->old_zroot = *zroot; 475 d->old_zroot = *zroot;
475 476 lnum = d->old_zroot.lnum;
476 lnum = c->old_zroot.lnum; 477 offs = d->old_zroot.offs;
477 offs = c->old_zroot.offs; 478 len = d->old_zroot.len;
478 len = c->old_zroot.len;
479 479
480 idx = kmalloc(c->max_idx_node_sz, GFP_NOFS); 480 idx = kmalloc(c->max_idx_node_sz, GFP_NOFS);
481 if (!idx) 481 if (!idx)
@@ -485,8 +485,8 @@ int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot)
485 if (err) 485 if (err)
486 goto out; 486 goto out;
487 487
488 c->old_zroot_level = le16_to_cpu(idx->level); 488 d->old_zroot_level = le16_to_cpu(idx->level);
489 c->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum); 489 d->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum);
490out: 490out:
491 kfree(idx); 491 kfree(idx);
492 return err; 492 return err;
@@ -509,6 +509,7 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
509{ 509{
510 int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt; 510 int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt;
511 int first = 1, iip; 511 int first = 1, iip;
512 struct ubifs_debug_info *d = c->dbg;
512 union ubifs_key lower_key, upper_key, l_key, u_key; 513 union ubifs_key lower_key, upper_key, l_key, u_key;
513 unsigned long long uninitialized_var(last_sqnum); 514 unsigned long long uninitialized_var(last_sqnum);
514 struct ubifs_idx_node *idx; 515 struct ubifs_idx_node *idx;
@@ -525,9 +526,9 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
525 UBIFS_IDX_NODE_SZ; 526 UBIFS_IDX_NODE_SZ;
526 527
527 /* Start at the old zroot */ 528 /* Start at the old zroot */
528 lnum = c->old_zroot.lnum; 529 lnum = d->old_zroot.lnum;
529 offs = c->old_zroot.offs; 530 offs = d->old_zroot.offs;
530 len = c->old_zroot.len; 531 len = d->old_zroot.len;
531 iip = 0; 532 iip = 0;
532 533
533 /* 534 /*
@@ -560,11 +561,11 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
560 if (first) { 561 if (first) {
561 first = 0; 562 first = 0;
562 /* Check root level and sqnum */ 563 /* Check root level and sqnum */
563 if (le16_to_cpu(idx->level) != c->old_zroot_level) { 564 if (le16_to_cpu(idx->level) != d->old_zroot_level) {
564 err = 2; 565 err = 2;
565 goto out_dump; 566 goto out_dump;
566 } 567 }
567 if (le64_to_cpu(idx->ch.sqnum) != c->old_zroot_sqnum) { 568 if (le64_to_cpu(idx->ch.sqnum) != d->old_zroot_sqnum) {
568 err = 3; 569 err = 3;
569 goto out_dump; 570 goto out_dump;
570 } 571 }
diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c
index a0ada596b17c..11e4132f314a 100644
--- a/fs/ubifs/compress.c
+++ b/fs/ubifs/compress.c
@@ -33,7 +33,7 @@
33/* Fake description object for the "none" compressor */ 33/* Fake description object for the "none" compressor */
34static struct ubifs_compressor none_compr = { 34static struct ubifs_compressor none_compr = {
35 .compr_type = UBIFS_COMPR_NONE, 35 .compr_type = UBIFS_COMPR_NONE,
36 .name = "no compression", 36 .name = "none",
37 .capi_name = "", 37 .capi_name = "",
38}; 38};
39 39
@@ -43,13 +43,13 @@ static DEFINE_MUTEX(lzo_mutex);
43static struct ubifs_compressor lzo_compr = { 43static struct ubifs_compressor lzo_compr = {
44 .compr_type = UBIFS_COMPR_LZO, 44 .compr_type = UBIFS_COMPR_LZO,
45 .comp_mutex = &lzo_mutex, 45 .comp_mutex = &lzo_mutex,
46 .name = "LZO", 46 .name = "lzo",
47 .capi_name = "lzo", 47 .capi_name = "lzo",
48}; 48};
49#else 49#else
50static struct ubifs_compressor lzo_compr = { 50static struct ubifs_compressor lzo_compr = {
51 .compr_type = UBIFS_COMPR_LZO, 51 .compr_type = UBIFS_COMPR_LZO,
52 .name = "LZO", 52 .name = "lzo",
53}; 53};
54#endif 54#endif
55 55
@@ -108,7 +108,7 @@ void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
108 if (compr->comp_mutex) 108 if (compr->comp_mutex)
109 mutex_lock(compr->comp_mutex); 109 mutex_lock(compr->comp_mutex);
110 err = crypto_comp_compress(compr->cc, in_buf, in_len, out_buf, 110 err = crypto_comp_compress(compr->cc, in_buf, in_len, out_buf,
111 out_len); 111 (unsigned int *)out_len);
112 if (compr->comp_mutex) 112 if (compr->comp_mutex)
113 mutex_unlock(compr->comp_mutex); 113 mutex_unlock(compr->comp_mutex);
114 if (unlikely(err)) { 114 if (unlikely(err)) {
@@ -119,10 +119,10 @@ void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
119 } 119 }
120 120
121 /* 121 /*
122 * Presently, we just require that compression results in less data, 122 * If the data compressed only slightly, it is better to leave it
123 * rather than any defined minimum compression ratio or amount. 123 * uncompressed to improve read speed.
124 */ 124 */
125 if (ALIGN(*out_len, 8) >= ALIGN(in_len, 8)) 125 if (in_len - *out_len < UBIFS_MIN_COMPRESS_DIFF)
126 goto no_compr; 126 goto no_compr;
127 127
128 return; 128 return;
@@ -172,7 +172,7 @@ int ubifs_decompress(const void *in_buf, int in_len, void *out_buf,
172 if (compr->decomp_mutex) 172 if (compr->decomp_mutex)
173 mutex_lock(compr->decomp_mutex); 173 mutex_lock(compr->decomp_mutex);
174 err = crypto_comp_decompress(compr->cc, in_buf, in_len, out_buf, 174 err = crypto_comp_decompress(compr->cc, in_buf, in_len, out_buf,
175 out_len); 175 (unsigned int *)out_len);
176 if (compr->decomp_mutex) 176 if (compr->decomp_mutex)
177 mutex_unlock(compr->decomp_mutex); 177 mutex_unlock(compr->decomp_mutex);
178 if (err) 178 if (err)
@@ -244,7 +244,7 @@ out_lzo:
244/** 244/**
245 * ubifs_compressors_exit - de-initialize UBIFS compressors. 245 * ubifs_compressors_exit - de-initialize UBIFS compressors.
246 */ 246 */
247void __exit ubifs_compressors_exit(void) 247void ubifs_compressors_exit(void)
248{ 248{
249 compr_exit(&lzo_compr); 249 compr_exit(&lzo_compr);
250 compr_exit(&zlib_compr); 250 compr_exit(&zlib_compr);
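The ubifs_compress() hunk above replaces the old rule, keep the result whenever it is smaller after 8-byte alignment, with a fixed minimum saving so that marginally compressible data stays uncompressed and reads stay fast. A side-by-side sketch of the two tests; the 64-byte value for UBIFS_MIN_COMPRESS_DIFF is an assumption taken from this kernel's ubifs.h:

    #define UBIFS_MIN_COMPRESS_DIFF 64              /* assumed threshold */
    #define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

    static int old_keep_compressed(int in_len, int out_len)
    {
            return ALIGN(out_len, 8) < ALIGN(in_len, 8);
    }

    static int new_keep_compressed(int in_len, int out_len)
    {
            return in_len - out_len >= UBIFS_MIN_COMPRESS_DIFF;
    }

    /* in_len = 4096, out_len = 4060: the old test keeps the compressed
     * form for a 36-byte saving, the new test stores the data as-is. */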
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 510ffa0bbda4..792c5a16c182 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -32,6 +32,8 @@
32#include "ubifs.h" 32#include "ubifs.h"
33#include <linux/module.h> 33#include <linux/module.h>
34#include <linux/moduleparam.h> 34#include <linux/moduleparam.h>
35#include <linux/debugfs.h>
36#include <linux/math64.h>
35 37
36#ifdef CONFIG_UBIFS_FS_DEBUG 38#ifdef CONFIG_UBIFS_FS_DEBUG
37 39
@@ -596,7 +598,9 @@ void dbg_dump_budg(struct ubifs_info *c)
596 struct rb_node *rb; 598 struct rb_node *rb;
597 struct ubifs_bud *bud; 599 struct ubifs_bud *bud;
598 struct ubifs_gced_idx_leb *idx_gc; 600 struct ubifs_gced_idx_leb *idx_gc;
601 long long available, outstanding, free;
599 602
603 ubifs_assert(spin_is_locked(&c->space_lock));
600 spin_lock(&dbg_lock); 604 spin_lock(&dbg_lock);
601 printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, " 605 printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, "
602 "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid, 606 "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid,
@@ -629,6 +633,17 @@ void dbg_dump_budg(struct ubifs_info *c)
629 printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n", 633 printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n",
630 idx_gc->lnum, idx_gc->unmap); 634 idx_gc->lnum, idx_gc->unmap);
631 printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state); 635 printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state);
636
637 /* Print budgeting predictions */
638 available = ubifs_calc_available(c, c->min_idx_lebs);
639 outstanding = c->budg_data_growth + c->budg_dd_growth;
640 if (available > outstanding)
641 free = ubifs_reported_space(c, available - outstanding);
642 else
643 free = 0;
644 printk(KERN_DEBUG "Budgeting predictions:\n");
645 printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n",
646 available, outstanding, free);
632 spin_unlock(&dbg_lock); 647 spin_unlock(&dbg_lock);
633} 648}
634 649
@@ -645,7 +660,8 @@ void dbg_dump_lprops(struct ubifs_info *c)
645 struct ubifs_lprops lp; 660 struct ubifs_lprops lp;
646 struct ubifs_lp_stats lst; 661 struct ubifs_lp_stats lst;
647 662
648 printk(KERN_DEBUG "(pid %d) Dumping LEB properties\n", current->pid); 663 printk(KERN_DEBUG "(pid %d) start dumping LEB properties\n",
664 current->pid);
649 ubifs_get_lp_stats(c, &lst); 665 ubifs_get_lp_stats(c, &lst);
650 dbg_dump_lstats(&lst); 666 dbg_dump_lstats(&lst);
651 667
@@ -656,6 +672,8 @@ void dbg_dump_lprops(struct ubifs_info *c)
656 672
657 dbg_dump_lprop(c, &lp); 673 dbg_dump_lprop(c, &lp);
658 } 674 }
675 printk(KERN_DEBUG "(pid %d) finish dumping LEB properties\n",
676 current->pid);
659} 677}
660 678
661void dbg_dump_lpt_info(struct ubifs_info *c) 679void dbg_dump_lpt_info(struct ubifs_info *c)
@@ -663,6 +681,7 @@ void dbg_dump_lpt_info(struct ubifs_info *c)
663 int i; 681 int i;
664 682
665 spin_lock(&dbg_lock); 683 spin_lock(&dbg_lock);
684 printk(KERN_DEBUG "(pid %d) dumping LPT information\n", current->pid);
666 printk(KERN_DEBUG "\tlpt_sz: %lld\n", c->lpt_sz); 685 printk(KERN_DEBUG "\tlpt_sz: %lld\n", c->lpt_sz);
667 printk(KERN_DEBUG "\tpnode_sz: %d\n", c->pnode_sz); 686 printk(KERN_DEBUG "\tpnode_sz: %d\n", c->pnode_sz);
668 printk(KERN_DEBUG "\tnnode_sz: %d\n", c->nnode_sz); 687 printk(KERN_DEBUG "\tnnode_sz: %d\n", c->nnode_sz);
@@ -684,7 +703,8 @@ void dbg_dump_lpt_info(struct ubifs_info *c)
684 printk(KERN_DEBUG "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs); 703 printk(KERN_DEBUG "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs);
685 printk(KERN_DEBUG "\tLPT head is at %d:%d\n", 704 printk(KERN_DEBUG "\tLPT head is at %d:%d\n",
686 c->nhead_lnum, c->nhead_offs); 705 c->nhead_lnum, c->nhead_offs);
687 printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n", c->ltab_lnum, c->ltab_offs); 706 printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n",
707 c->ltab_lnum, c->ltab_offs);
688 if (c->big_lpt) 708 if (c->big_lpt)
689 printk(KERN_DEBUG "\tLPT lsave is at %d:%d\n", 709 printk(KERN_DEBUG "\tLPT lsave is at %d:%d\n",
690 c->lsave_lnum, c->lsave_offs); 710 c->lsave_lnum, c->lsave_offs);
@@ -703,9 +723,9 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
703 if (dbg_failure_mode) 723 if (dbg_failure_mode)
704 return; 724 return;
705 725
706 printk(KERN_DEBUG "(pid %d) Dumping LEB %d\n", current->pid, lnum); 726 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
707 727 current->pid, lnum);
708 sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); 728 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
709 if (IS_ERR(sleb)) { 729 if (IS_ERR(sleb)) {
710 ubifs_err("scan error %d", (int)PTR_ERR(sleb)); 730 ubifs_err("scan error %d", (int)PTR_ERR(sleb));
711 return; 731 return;
@@ -721,6 +741,8 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
721 dbg_dump_node(c, snod->node); 741 dbg_dump_node(c, snod->node);
722 } 742 }
723 743
744 printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
745 current->pid, lnum);
724 ubifs_scan_destroy(sleb); 746 ubifs_scan_destroy(sleb);
725 return; 747 return;
726} 748}
@@ -768,7 +790,7 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
768{ 790{
769 int i; 791 int i;
770 792
771 printk(KERN_DEBUG "(pid %d) Dumping heap cat %d (%d elements)\n", 793 printk(KERN_DEBUG "(pid %d) start dumping heap cat %d (%d elements)\n",
772 current->pid, cat, heap->cnt); 794 current->pid, cat, heap->cnt);
773 for (i = 0; i < heap->cnt; i++) { 795 for (i = 0; i < heap->cnt; i++) {
774 struct ubifs_lprops *lprops = heap->arr[i]; 796 struct ubifs_lprops *lprops = heap->arr[i];
@@ -777,6 +799,7 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
777 "flags %d\n", i, lprops->lnum, lprops->hpos, 799 "flags %d\n", i, lprops->lnum, lprops->hpos,
778 lprops->free, lprops->dirty, lprops->flags); 800 lprops->free, lprops->dirty, lprops->flags);
779 } 801 }
802 printk(KERN_DEBUG "(pid %d) finish dumping heap\n", current->pid);
780} 803}
781 804
782void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, 805void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
@@ -784,7 +807,7 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
784{ 807{
785 int i; 808 int i;
786 809
787 printk(KERN_DEBUG "(pid %d) Dumping pnode:\n", current->pid); 810 printk(KERN_DEBUG "(pid %d) dumping pnode:\n", current->pid);
788 printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n", 811 printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n",
789 (size_t)pnode, (size_t)parent, (size_t)pnode->cnext); 812 (size_t)pnode, (size_t)parent, (size_t)pnode->cnext);
790 printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n", 813 printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n",
@@ -803,7 +826,7 @@ void dbg_dump_tnc(struct ubifs_info *c)
803 int level; 826 int level;
804 827
805 printk(KERN_DEBUG "\n"); 828 printk(KERN_DEBUG "\n");
806 printk(KERN_DEBUG "(pid %d) Dumping the TNC tree\n", current->pid); 829 printk(KERN_DEBUG "(pid %d) start dumping TNC tree\n", current->pid);
807 znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); 830 znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
808 level = znode->level; 831 level = znode->level;
809 printk(KERN_DEBUG "== Level %d ==\n", level); 832 printk(KERN_DEBUG "== Level %d ==\n", level);
@@ -815,8 +838,7 @@ void dbg_dump_tnc(struct ubifs_info *c)
815 dbg_dump_znode(c, znode); 838 dbg_dump_znode(c, znode);
816 znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode); 839 znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode);
817 } 840 }
818 841 printk(KERN_DEBUG "(pid %d) finish dumping TNC tree\n", current->pid);
819 printk(KERN_DEBUG "\n");
820} 842}
821 843
822static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode, 844static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode,
@@ -992,8 +1014,8 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
992 zbr1->offs, DBGKEY(&key)); 1014 zbr1->offs, DBGKEY(&key));
993 dbg_err("but it should have key %s according to tnc", 1015 dbg_err("but it should have key %s according to tnc",
994 DBGKEY(&zbr1->key)); 1016 DBGKEY(&zbr1->key));
995 dbg_dump_node(c, dent1); 1017 dbg_dump_node(c, dent1);
996 goto out_free; 1018 goto out_free;
997 } 1019 }
998 1020
999 key_read(c, &dent2->key, &key); 1021 key_read(c, &dent2->key, &key);
@@ -1002,8 +1024,8 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
1002 zbr1->offs, DBGKEY(&key)); 1024 zbr1->offs, DBGKEY(&key));
1003 dbg_err("but it should have key %s according to tnc", 1025 dbg_err("but it should have key %s according to tnc",
1004 DBGKEY(&zbr2->key)); 1026 DBGKEY(&zbr2->key));
1005 dbg_dump_node(c, dent2); 1027 dbg_dump_node(c, dent2);
1006 goto out_free; 1028 goto out_free;
1007 } 1029 }
1008 1030
1009 nlen1 = le16_to_cpu(dent1->nlen); 1031 nlen1 = le16_to_cpu(dent1->nlen);
@@ -1020,9 +1042,9 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
1020 dbg_err("bad order of colliding key %s", 1042 dbg_err("bad order of colliding key %s",
1021 DBGKEY(&key)); 1043 DBGKEY(&key));
1022 1044
1023 dbg_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs); 1045 ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
1024 dbg_dump_node(c, dent1); 1046 dbg_dump_node(c, dent1);
1025 dbg_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs); 1047 ubifs_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs);
1026 dbg_dump_node(c, dent2); 1048 dbg_dump_node(c, dent2);
1027 1049
1028out_free: 1050out_free:
@@ -2097,13 +2119,13 @@ static int simple_rand(void)
2097 return (next >> 16) & 32767; 2119 return (next >> 16) & 32767;
2098} 2120}
2099 2121
2100void dbg_failure_mode_registration(struct ubifs_info *c) 2122static void failure_mode_init(struct ubifs_info *c)
2101{ 2123{
2102 struct failure_mode_info *fmi; 2124 struct failure_mode_info *fmi;
2103 2125
2104 fmi = kmalloc(sizeof(struct failure_mode_info), GFP_NOFS); 2126 fmi = kmalloc(sizeof(struct failure_mode_info), GFP_NOFS);
2105 if (!fmi) { 2127 if (!fmi) {
2106 dbg_err("Failed to register failure mode - no memory"); 2128 ubifs_err("Failed to register failure mode - no memory");
2107 return; 2129 return;
2108 } 2130 }
2109 fmi->c = c; 2131 fmi->c = c;
@@ -2112,7 +2134,7 @@ void dbg_failure_mode_registration(struct ubifs_info *c)
2112 spin_unlock(&fmi_lock); 2134 spin_unlock(&fmi_lock);
2113} 2135}
2114 2136
2115void dbg_failure_mode_deregistration(struct ubifs_info *c) 2137static void failure_mode_exit(struct ubifs_info *c)
2116{ 2138{
2117 struct failure_mode_info *fmi, *tmp; 2139 struct failure_mode_info *fmi, *tmp;
2118 2140
@@ -2146,42 +2168,44 @@ static int in_failure_mode(struct ubi_volume_desc *desc)
2146 struct ubifs_info *c = dbg_find_info(desc); 2168 struct ubifs_info *c = dbg_find_info(desc);
2147 2169
2148 if (c && dbg_failure_mode) 2170 if (c && dbg_failure_mode)
2149 return c->failure_mode; 2171 return c->dbg->failure_mode;
2150 return 0; 2172 return 0;
2151} 2173}
2152 2174
2153static int do_fail(struct ubi_volume_desc *desc, int lnum, int write) 2175static int do_fail(struct ubi_volume_desc *desc, int lnum, int write)
2154{ 2176{
2155 struct ubifs_info *c = dbg_find_info(desc); 2177 struct ubifs_info *c = dbg_find_info(desc);
2178 struct ubifs_debug_info *d;
2156 2179
2157 if (!c || !dbg_failure_mode) 2180 if (!c || !dbg_failure_mode)
2158 return 0; 2181 return 0;
2159 if (c->failure_mode) 2182 d = c->dbg;
2183 if (d->failure_mode)
2160 return 1; 2184 return 1;
2161 if (!c->fail_cnt) { 2185 if (!d->fail_cnt) {
2162 /* First call - decide delay to failure */ 2186 /* First call - decide delay to failure */
2163 if (chance(1, 2)) { 2187 if (chance(1, 2)) {
2164 unsigned int delay = 1 << (simple_rand() >> 11); 2188 unsigned int delay = 1 << (simple_rand() >> 11);
2165 2189
2166 if (chance(1, 2)) { 2190 if (chance(1, 2)) {
2167 c->fail_delay = 1; 2191 d->fail_delay = 1;
2168 c->fail_timeout = jiffies + 2192 d->fail_timeout = jiffies +
2169 msecs_to_jiffies(delay); 2193 msecs_to_jiffies(delay);
2170 dbg_rcvry("failing after %ums", delay); 2194 dbg_rcvry("failing after %ums", delay);
2171 } else { 2195 } else {
2172 c->fail_delay = 2; 2196 d->fail_delay = 2;
2173 c->fail_cnt_max = delay; 2197 d->fail_cnt_max = delay;
2174 dbg_rcvry("failing after %u calls", delay); 2198 dbg_rcvry("failing after %u calls", delay);
2175 } 2199 }
2176 } 2200 }
2177 c->fail_cnt += 1; 2201 d->fail_cnt += 1;
2178 } 2202 }
2179 /* Determine if failure delay has expired */ 2203 /* Determine if failure delay has expired */
2180 if (c->fail_delay == 1) { 2204 if (d->fail_delay == 1) {
2181 if (time_before(jiffies, c->fail_timeout)) 2205 if (time_before(jiffies, d->fail_timeout))
2182 return 0; 2206 return 0;
2183 } else if (c->fail_delay == 2) 2207 } else if (d->fail_delay == 2)
2184 if (c->fail_cnt++ < c->fail_cnt_max) 2208 if (d->fail_cnt++ < d->fail_cnt_max)
2185 return 0; 2209 return 0;
2186 if (lnum == UBIFS_SB_LNUM) { 2210 if (lnum == UBIFS_SB_LNUM) {
2187 if (write) { 2211 if (write) {
@@ -2239,7 +2263,7 @@ static int do_fail(struct ubi_volume_desc *desc, int lnum, int write)
2239 dbg_rcvry("failing in bud LEB %d commit not running", lnum); 2263 dbg_rcvry("failing in bud LEB %d commit not running", lnum);
2240 } 2264 }
2241 ubifs_err("*** SETTING FAILURE MODE ON (LEB %d) ***", lnum); 2265 ubifs_err("*** SETTING FAILURE MODE ON (LEB %d) ***", lnum);
2242 c->failure_mode = 1; 2266 d->failure_mode = 1;
2243 dump_stack(); 2267 dump_stack();
2244 return 1; 2268 return 1;
2245} 2269}
@@ -2344,4 +2368,181 @@ int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype)
2344 return 0; 2368 return 0;
2345} 2369}
2346 2370
2371/**
2372 * ubifs_debugging_init - initialize UBIFS debugging.
2373 * @c: UBIFS file-system description object
2374 *
2375 * This function initializes debugging-related data for the file system.
2376 * Returns zero in case of success and a negative error code in case of
2377 * failure.
2378 */
2379int ubifs_debugging_init(struct ubifs_info *c)
2380{
2381 c->dbg = kzalloc(sizeof(struct ubifs_debug_info), GFP_KERNEL);
2382 if (!c->dbg)
2383 return -ENOMEM;
2384
2385 c->dbg->buf = vmalloc(c->leb_size);
2386 if (!c->dbg->buf)
2387 goto out;
2388
2389 failure_mode_init(c);
2390 return 0;
2391
2392out:
2393 kfree(c->dbg);
2394 return -ENOMEM;
2395}
2396
2397/**
2398 * ubifs_debugging_exit - free debugging data.
2399 * @c: UBIFS file-system description object
2400 */
2401void ubifs_debugging_exit(struct ubifs_info *c)
2402{
2403 failure_mode_exit(c);
2404 vfree(c->dbg->buf);
2405 kfree(c->dbg);
2406}
2407
2408/*
2409 * Root directory for UBIFS entries in debugfs. Contains sub-directories which
2410 * hold the files specific to particular file-system mounts.
2411 */
2412static struct dentry *debugfs_rootdir;
2413
2414/**
2415 * dbg_debugfs_init - initialize debugfs file-system.
2416 *
2417 * UBIFS uses the debugfs file-system to expose various debugging knobs to
2418 * user-space. This function creates the "ubifs" directory in the debugfs
2419 * file-system. Returns zero in case of success and a negative error code in
2420 * case of failure.
2421 */
2422int dbg_debugfs_init(void)
2423{
2424 debugfs_rootdir = debugfs_create_dir("ubifs", NULL);
2425 if (IS_ERR(debugfs_rootdir)) {
2426 int err = PTR_ERR(debugfs_rootdir);
2427 ubifs_err("cannot create \"ubifs\" debugfs directory, "
2428 "error %d\n", err);
2429 return err;
2430 }
2431
2432 return 0;
2433}
2434
2435/**
2436 * dbg_debugfs_exit - remove the "ubifs" directory from debugfs file-system.
2437 */
2438void dbg_debugfs_exit(void)
2439{
2440 debugfs_remove(debugfs_rootdir);
2441}
2442
2443static int open_debugfs_file(struct inode *inode, struct file *file)
2444{
2445 file->private_data = inode->i_private;
2446 return 0;
2447}
2448
2449static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
2450 size_t count, loff_t *ppos)
2451{
2452 struct ubifs_info *c = file->private_data;
2453 struct ubifs_debug_info *d = c->dbg;
2454
2455 if (file->f_path.dentry == d->dump_lprops)
2456 dbg_dump_lprops(c);
2457 else if (file->f_path.dentry == d->dump_budg) {
2458 spin_lock(&c->space_lock);
2459 dbg_dump_budg(c);
2460 spin_unlock(&c->space_lock);
2461 } else if (file->f_path.dentry == d->dump_tnc) {
2462 mutex_lock(&c->tnc_mutex);
2463 dbg_dump_tnc(c);
2464 mutex_unlock(&c->tnc_mutex);
2465 } else
2466 return -EINVAL;
2467
2468 *ppos += count;
2469 return count;
2470}
2471
2472static const struct file_operations debugfs_fops = {
2473 .open = open_debugfs_file,
2474 .write = write_debugfs_file,
2475 .owner = THIS_MODULE,
2476};
2477
2478/**
2479 * dbg_debugfs_init_fs - initialize debugfs for UBIFS instance.
2480 * @c: UBIFS file-system description object
2481 *
2482 * This function creates all debugfs files for this instance of UBIFS. Returns
2483 * zero in case of success and a negative error code in case of failure.
2484 *
2485 * Note, the only reason we have not merged this function with the
2486 * 'ubifs_debugging_init()' function is because it is better to initialize
2487 * debugfs interfaces at the very end of the mount process, and remove them at
2488 * the very beginning of the un-mount process.
2489 */
2490int dbg_debugfs_init_fs(struct ubifs_info *c)
2491{
2492 int err;
2493 const char *fname;
2494 struct dentry *dent;
2495 struct ubifs_debug_info *d = c->dbg;
2496
2497 sprintf(d->debugfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id);
2498 d->debugfs_dir = debugfs_create_dir(d->debugfs_dir_name,
2499 debugfs_rootdir);
2500 if (IS_ERR(d->debugfs_dir)) {
2501 err = PTR_ERR(d->debugfs_dir);
2502 ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
2503 d->debugfs_dir_name, err);
2504 goto out;
2505 }
2506
2507 fname = "dump_lprops";
2508 dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c,
2509 &debugfs_fops);
2510 if (IS_ERR(dent))
2511 goto out_remove;
2512 d->dump_lprops = dent;
2513
2514 fname = "dump_budg";
2515 dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c,
2516 &debugfs_fops);
2517 if (IS_ERR(dent))
2518 goto out_remove;
2519 d->dump_budg = dent;
2520
2521 fname = "dump_tnc";
2522 dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c,
2523 &debugfs_fops);
2524 if (IS_ERR(dent))
2525 goto out_remove;
2526 d->dump_tnc = dent;
2527
2528 return 0;
2529
2530out_remove:
2531 err = PTR_ERR(dent);
2532 ubifs_err("cannot create \"%s\" debugfs file, error %d\n",
2533 fname, err);
2534 debugfs_remove_recursive(d->debugfs_dir);
2535out:
2536 return err;
2537}
2538
2539/**
2540 * dbg_debugfs_exit_fs - remove all debugfs files.
2541 * @c: UBIFS file-system description object
2542 */
2543void dbg_debugfs_exit_fs(struct ubifs_info *c)
2544{
2545 debugfs_remove_recursive(c->dbg->debugfs_dir);
2546}
2547
2347#endif /* CONFIG_UBIFS_FS_DEBUG */ 2548#endif /* CONFIG_UBIFS_FS_DEBUG */
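The additions to debug.c wire up three write-only debugfs knobs; writing any byte to one of them triggers the matching dump under the proper lock (note how write_debugfs_file() takes c->space_lock before dbg_dump_budg(), satisfying the new spin_is_locked() assertion). A hypothetical user-space trigger, assuming debugfs is mounted at /sys/kernel/debug and the file-system sits on UBI device 0, volume 0, matching the "ubi%d_%d" directory name built in dbg_debugfs_init_fs():

    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/sys/kernel/debug/ubifs/ubi0_0/dump_budg",
                          O_WRONLY);

            if (fd < 0)
                    return 1;
            write(fd, "1", 1);      /* content is ignored, any write fires */
            close(fd);
            return 0;
    }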
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 33d6b95071e4..9820d6999f7e 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -25,7 +25,56 @@
25 25
26#ifdef CONFIG_UBIFS_FS_DEBUG 26#ifdef CONFIG_UBIFS_FS_DEBUG
27 27
28#define UBIFS_DBG(op) op 28/**
29 * struct ubifs_debug_info - per-FS debugging information.
30 * @buf: a buffer of LEB size, used for various purposes
31 * @old_zroot: old index root - used by 'dbg_check_old_index()'
32 * @old_zroot_level: old index root level - used by 'dbg_check_old_index()'
33 * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()'
34 * @failure_mode: failure mode for recovery testing
35 * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls
36 * @fail_timeout: time in jiffies when delay of failure mode expires
37 * @fail_cnt: current number of calls to failure mode I/O functions
38 * @fail_cnt_max: number of calls by which to delay failure mode
39 * @chk_lpt_sz: used by LPT tree size checker
40 * @chk_lpt_sz2: used by LPT tree size checker
41 * @chk_lpt_wastage: used by LPT tree size checker
42 * @chk_lpt_lebs: used by LPT tree size checker
43 * @new_nhead_offs: used by LPT tree size checker
44 * @new_ihead_lnum: used by debugging to check ihead_lnum
45 * @new_ihead_offs: used by debugging to check ihead_offs
46 *
47 * @debugfs_dir_name: name of debugfs directory containing this file-system's
48 *                    files
49 * @debugfs_dir: direntry object of the file-system debugfs directory
50 * @dump_lprops: "dump lprops" debugfs knob
51 * @dump_budg: "dump budgeting information" debugfs knob
52 * @dump_tnc: "dump TNC" debugfs knob
53 */
54struct ubifs_debug_info {
55 void *buf;
56 struct ubifs_zbranch old_zroot;
57 int old_zroot_level;
58 unsigned long long old_zroot_sqnum;
59 int failure_mode;
60 int fail_delay;
61 unsigned long fail_timeout;
62 unsigned int fail_cnt;
63 unsigned int fail_cnt_max;
64 long long chk_lpt_sz;
65 long long chk_lpt_sz2;
66 long long chk_lpt_wastage;
67 int chk_lpt_lebs;
68 int new_nhead_offs;
69 int new_ihead_lnum;
70 int new_ihead_offs;
71
72 char debugfs_dir_name[100];
73 struct dentry *debugfs_dir;
74 struct dentry *dump_lprops;
75 struct dentry *dump_budg;
76 struct dentry *dump_tnc;
77};
29 78
30#define ubifs_assert(expr) do { \ 79#define ubifs_assert(expr) do { \
31 if (unlikely(!(expr))) { \ 80 if (unlikely(!(expr))) { \
@@ -211,14 +260,18 @@ extern unsigned int ubifs_msg_flags;
211extern unsigned int ubifs_chk_flags; 260extern unsigned int ubifs_chk_flags;
212extern unsigned int ubifs_tst_flags; 261extern unsigned int ubifs_tst_flags;
213 262
214/* Dump functions */ 263int ubifs_debugging_init(struct ubifs_info *c);
264void ubifs_debugging_exit(struct ubifs_info *c);
215 265
266/* Dump functions */
216const char *dbg_ntype(int type); 267const char *dbg_ntype(int type);
217const char *dbg_cstate(int cmt_state); 268const char *dbg_cstate(int cmt_state);
218const char *dbg_get_key_dump(const struct ubifs_info *c, 269const char *dbg_get_key_dump(const struct ubifs_info *c,
219 const union ubifs_key *key); 270 const union ubifs_key *key);
220void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode); 271void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode);
221void dbg_dump_node(const struct ubifs_info *c, const void *node); 272void dbg_dump_node(const struct ubifs_info *c, const void *node);
273void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum,
274 int offs);
222void dbg_dump_budget_req(const struct ubifs_budget_req *req); 275void dbg_dump_budget_req(const struct ubifs_budget_req *req);
223void dbg_dump_lstats(const struct ubifs_lp_stats *lst); 276void dbg_dump_lstats(const struct ubifs_lp_stats *lst);
224void dbg_dump_budg(struct ubifs_info *c); 277void dbg_dump_budg(struct ubifs_info *c);
@@ -233,9 +286,9 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
233 struct ubifs_nnode *parent, int iip); 286 struct ubifs_nnode *parent, int iip);
234void dbg_dump_tnc(struct ubifs_info *c); 287void dbg_dump_tnc(struct ubifs_info *c);
235void dbg_dump_index(struct ubifs_info *c); 288void dbg_dump_index(struct ubifs_info *c);
289void dbg_dump_lpt_lebs(const struct ubifs_info *c);
236 290
237/* Checking helper functions */ 291/* Checking helper functions */
238
239typedef int (*dbg_leaf_callback)(struct ubifs_info *c, 292typedef int (*dbg_leaf_callback)(struct ubifs_info *c,
240 struct ubifs_zbranch *zbr, void *priv); 293 struct ubifs_zbranch *zbr, void *priv);
241typedef int (*dbg_znode_callback)(struct ubifs_info *c, 294typedef int (*dbg_znode_callback)(struct ubifs_info *c,
@@ -274,9 +327,6 @@ int dbg_force_in_the_gaps(void);
274 327
275#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY) 328#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY)
276 329
277void dbg_failure_mode_registration(struct ubifs_info *c);
278void dbg_failure_mode_deregistration(struct ubifs_info *c);
279
280#ifndef UBIFS_DBG_PRESERVE_UBI 330#ifndef UBIFS_DBG_PRESERVE_UBI
281 331
282#define ubi_leb_read dbg_leb_read 332#define ubi_leb_read dbg_leb_read
@@ -318,9 +368,13 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
318 return dbg_leb_change(desc, lnum, buf, len, UBI_UNKNOWN); 368 return dbg_leb_change(desc, lnum, buf, len, UBI_UNKNOWN);
319} 369}
320 370
321#else /* !CONFIG_UBIFS_FS_DEBUG */ 371/* Debugfs-related stuff */
372int dbg_debugfs_init(void);
373void dbg_debugfs_exit(void);
374int dbg_debugfs_init_fs(struct ubifs_info *c);
375void dbg_debugfs_exit_fs(struct ubifs_info *c);
322 376
323#define UBIFS_DBG(op) 377#else /* !CONFIG_UBIFS_FS_DEBUG */
324 378
325/* Use "if (0)" to make compiler check arguments even if debugging is off */ 379/* Use "if (0)" to make compiler check arguments even if debugging is off */
326#define ubifs_assert(expr) do { \ 380#define ubifs_assert(expr) do { \
@@ -360,23 +414,28 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
360#define DBGKEY(key) ((char *)(key)) 414#define DBGKEY(key) ((char *)(key))
361#define DBGKEY1(key) ((char *)(key)) 415#define DBGKEY1(key) ((char *)(key))
362 416
363#define dbg_ntype(type) "" 417#define ubifs_debugging_init(c) 0
364#define dbg_cstate(cmt_state) "" 418#define ubifs_debugging_exit(c) ({})
365#define dbg_get_key_dump(c, key) ({}) 419
366#define dbg_dump_inode(c, inode) ({}) 420#define dbg_ntype(type) ""
367#define dbg_dump_node(c, node) ({}) 421#define dbg_cstate(cmt_state) ""
368#define dbg_dump_budget_req(req) ({}) 422#define dbg_get_key_dump(c, key) ({})
369#define dbg_dump_lstats(lst) ({}) 423#define dbg_dump_inode(c, inode) ({})
370#define dbg_dump_budg(c) ({}) 424#define dbg_dump_node(c, node) ({})
371#define dbg_dump_lprop(c, lp) ({}) 425#define dbg_dump_lpt_node(c, node, lnum, offs) ({})
372#define dbg_dump_lprops(c) ({}) 426#define dbg_dump_budget_req(req) ({})
373#define dbg_dump_lpt_info(c) ({}) 427#define dbg_dump_lstats(lst) ({})
374#define dbg_dump_leb(c, lnum) ({}) 428#define dbg_dump_budg(c) ({})
375#define dbg_dump_znode(c, znode) ({}) 429#define dbg_dump_lprop(c, lp) ({})
376#define dbg_dump_heap(c, heap, cat) ({}) 430#define dbg_dump_lprops(c) ({})
377#define dbg_dump_pnode(c, pnode, parent, iip) ({}) 431#define dbg_dump_lpt_info(c) ({})
378#define dbg_dump_tnc(c) ({}) 432#define dbg_dump_leb(c, lnum) ({})
379#define dbg_dump_index(c) ({}) 433#define dbg_dump_znode(c, znode) ({})
434#define dbg_dump_heap(c, heap, cat) ({})
435#define dbg_dump_pnode(c, pnode, parent, iip) ({})
436#define dbg_dump_tnc(c) ({})
437#define dbg_dump_index(c) ({})
438#define dbg_dump_lpt_lebs(c) ({})
380 439
381#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0 440#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0
382#define dbg_old_index_check_init(c, zroot) 0 441#define dbg_old_index_check_init(c, zroot) 0
@@ -396,9 +455,11 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
396#define dbg_force_in_the_gaps_enabled 0 455#define dbg_force_in_the_gaps_enabled 0
397#define dbg_force_in_the_gaps() 0 456#define dbg_force_in_the_gaps() 0
398#define dbg_failure_mode 0 457#define dbg_failure_mode 0
399#define dbg_failure_mode_registration(c) ({})
400#define dbg_failure_mode_deregistration(c) ({})
401 458
402#endif /* !CONFIG_UBIFS_FS_DEBUG */ 459#define dbg_debugfs_init() 0
460#define dbg_debugfs_exit()
461#define dbg_debugfs_init_fs(c) 0
462#define dbg_debugfs_exit_fs(c) 0
403 463
464#endif /* !CONFIG_UBIFS_FS_DEBUG */
404#endif /* !__UBIFS_DEBUG_H__ */ 465#endif /* !__UBIFS_DEBUG_H__ */
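Two C idioms carry the !CONFIG_UBIFS_FS_DEBUG half of the header above: '({})' is a GCC statement expression that expands to nothing yet remains usable where an expression is expected, and 'if (0)' keeps a macro argument parsed and type-checked while the optimizer discards the code. A minimal sketch of the pattern, not the header's exact text:

    #define dbg_dump_budg(c) ({})   /* no-op that is still an expression */

    /* "expr" is type-checked at compile time but never evaluated */
    #define ubifs_assert(expr) do { \
            if (0 && (expr))        \
                    ;               \
    } while (0)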
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 2624411d9758..bf37374567fa 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -72,8 +72,8 @@ static int read_block(struct inode *inode, void *addr, unsigned int block,
72 return err; 72 return err;
73 } 73 }
74 74
75 ubifs_assert(le64_to_cpu(dn->ch.sqnum) > ubifs_inode(inode)->creat_sqnum); 75 ubifs_assert(le64_to_cpu(dn->ch.sqnum) >
76 76 ubifs_inode(inode)->creat_sqnum);
77 len = le32_to_cpu(dn->size); 77 len = le32_to_cpu(dn->size);
78 if (len <= 0 || len > UBIFS_BLOCK_SIZE) 78 if (len <= 0 || len > UBIFS_BLOCK_SIZE)
79 goto dump; 79 goto dump;
@@ -219,7 +219,8 @@ static void release_existing_page_budget(struct ubifs_info *c)
219} 219}
220 220
221static int write_begin_slow(struct address_space *mapping, 221static int write_begin_slow(struct address_space *mapping,
222 loff_t pos, unsigned len, struct page **pagep) 222 loff_t pos, unsigned len, struct page **pagep,
223 unsigned flags)
223{ 224{
224 struct inode *inode = mapping->host; 225 struct inode *inode = mapping->host;
225 struct ubifs_info *c = inode->i_sb->s_fs_info; 226 struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -247,14 +248,14 @@ static int write_begin_slow(struct address_space *mapping,
247 if (unlikely(err)) 248 if (unlikely(err))
248 return err; 249 return err;
249 250
250 page = __grab_cache_page(mapping, index); 251 page = grab_cache_page_write_begin(mapping, index, flags);
251 if (unlikely(!page)) { 252 if (unlikely(!page)) {
252 ubifs_release_budget(c, &req); 253 ubifs_release_budget(c, &req);
253 return -ENOMEM; 254 return -ENOMEM;
254 } 255 }
255 256
256 if (!PageUptodate(page)) { 257 if (!PageUptodate(page)) {
257 if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) 258 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
258 SetPageChecked(page); 259 SetPageChecked(page);
259 else { 260 else {
260 err = do_readpage(page); 261 err = do_readpage(page);
@@ -438,13 +439,13 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
438 return -EROFS; 439 return -EROFS;
439 440
440 /* Try out the fast-path part first */ 441 /* Try out the fast-path part first */
441 page = __grab_cache_page(mapping, index); 442 page = grab_cache_page_write_begin(mapping, index, flags);
442 if (unlikely(!page)) 443 if (unlikely(!page))
443 return -ENOMEM; 444 return -ENOMEM;
444 445
445 if (!PageUptodate(page)) { 446 if (!PageUptodate(page)) {
446 /* The page is not loaded from the flash */ 447 /* The page is not loaded from the flash */
447 if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) 448 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
448 /* 449 /*
449 * We change whole page so no need to load it. But we 450 * We change whole page so no need to load it. But we
450 * have to set the @PG_checked flag to make the further 451 * have to set the @PG_checked flag to make the further
@@ -483,7 +484,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
483 unlock_page(page); 484 unlock_page(page);
484 page_cache_release(page); 485 page_cache_release(page);
485 486
486 return write_begin_slow(mapping, pos, len, pagep); 487 return write_begin_slow(mapping, pos, len, pagep, flags);
487 } 488 }
488 489
489 /* 490 /*
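The file.c hunks do two things: they switch to grab_cache_page_write_begin(), which passes the write_begin() flags (e.g. AOP_FLAG_NOFS) down to page allocation, and they flip PAGE_CACHE_MASK to ~PAGE_CACHE_MASK in the full-page write test. The flip is a genuine bug fix: PAGE_CACHE_MASK keeps the page-aligned part of an offset and is therefore non-zero for every position past the first page, whereas ~PAGE_CACHE_MASK keeps the offset within the page. A sketch assuming 4 KiB pages:

    #define PAGE_CACHE_SIZE 4096UL                  /* assumed page size */
    #define PAGE_CACHE_MASK (~(PAGE_CACHE_SIZE - 1))

    static int page_aligned(unsigned long long pos)
    {
            return !(pos & ~PAGE_CACHE_MASK);
    }

    /* pos = 0x2000, a page-aligned offset:
     *   pos &  PAGE_CACHE_MASK == 0x2000 -> old test !(...) is false and
     *                                       the aligned write is missed
     *   pos & ~PAGE_CACHE_MASK == 0      -> new test !(...) is true      */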
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 0bef6501d58a..9832f9abe28e 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -45,7 +45,7 @@
45#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ 45#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ
46 46
47/* 47/*
48 * GC may need to move more then one LEB to make progress. The below constants 48 * GC may need to move more than one LEB to make progress. The below constants
49 * define "soft" and "hard" limits on the number of LEBs the garbage collector 49 * define "soft" and "hard" limits on the number of LEBs the garbage collector
50 * may move. 50 * may move.
51 */ 51 */
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 5e82cffe9695..6db7a6be6c97 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -154,6 +154,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
154 case FS_IOC_GETFLAGS: 154 case FS_IOC_GETFLAGS:
155 flags = ubifs2ioctl(ubifs_inode(inode)->flags); 155 flags = ubifs2ioctl(ubifs_inode(inode)->flags);
156 156
157 dbg_gen("get flags: %#x, i_flags %#x", flags, inode->i_flags);
157 return put_user(flags, (int __user *) arg); 158 return put_user(flags, (int __user *) arg);
158 159
159 case FS_IOC_SETFLAGS: { 160 case FS_IOC_SETFLAGS: {
@@ -176,6 +177,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
176 err = mnt_want_write(file->f_path.mnt); 177 err = mnt_want_write(file->f_path.mnt);
177 if (err) 178 if (err)
178 return err; 179 return err;
180 dbg_gen("set flags: %#x, i_flags %#x", flags, inode->i_flags);
179 err = setflags(inode, flags); 181 err = setflags(inode, flags);
180 mnt_drop_write(file->f_path.mnt); 182 mnt_drop_write(file->f_path.mnt);
181 return err; 183 return err;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index f91b745908ea..9b7c54e0cd2a 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -191,7 +191,7 @@ again:
191 if (wbuf->lnum != -1 && avail >= len) { 191 if (wbuf->lnum != -1 && avail >= len) {
192 /* 192 /*
193 * Someone else has switched the journal head and we have 193 * Someone else has switched the journal head and we have
194 * enough space now. This happens when more then one process is 194 * enough space now. This happens when more than one process is
195 * trying to write to the same journal head at the same time. 195 * trying to write to the same journal head at the same time.
196 */ 196 */
197 dbg_jnl("return LEB %d back, already have LEB %d:%d", 197 dbg_jnl("return LEB %d back, already have LEB %d:%d",
@@ -704,7 +704,7 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
704 data->size = cpu_to_le32(len); 704 data->size = cpu_to_le32(len);
705 zero_data_node_unused(data); 705 zero_data_node_unused(data);
706 706
707 if (!(ui->flags && UBIFS_COMPR_FL)) 707 if (!(ui->flags & UBIFS_COMPR_FL))
708 /* Compression is disabled for this inode */ 708 /* Compression is disabled for this inode */
709 compr_type = UBIFS_COMPR_NONE; 709 compr_type = UBIFS_COMPR_NONE;
710 else 710 else
@@ -1220,7 +1220,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
1220 data_key_init(c, &key, inum, blk); 1220 data_key_init(c, &key, inum, blk);
1221 1221
1222 bit = old_size & (UBIFS_BLOCK_SIZE - 1); 1222 bit = old_size & (UBIFS_BLOCK_SIZE - 1);
1223 blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0: 1); 1223 blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0 : 1);
1224 data_key_init(c, &to_key, inum, blk); 1224 data_key_init(c, &to_key, inum, blk);
1225 1225
1226 err = ubifs_tnc_remove_range(c, &key, &to_key); 1226 err = ubifs_tnc_remove_range(c, &key, &to_key);
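The one-character journal.c fix above is easy to miss: '&&' is a logical AND that is true whenever both operands are non-zero, so the old test reduced to 'ui->flags != 0' and never looked at the compression bit itself, while '&' tests the actual bit. Illustrated with an assumed flag layout (UBIFS_COMPR_FL as bit 0):

    #define UBIFS_COMPR_FL 0x01     /* assumed bit value */

    static int compression_disabled(unsigned int flags)
    {
            return !(flags & UBIFS_COMPR_FL);   /* was: flags && ... */
    }

    /* For flags = 0x04 (some unrelated flag set): the old logical test
     * gave !(0x04 && 0x01) == 0, so compression looked enabled; the
     * bitwise test gives !(0x04 & 0x01) == 1, correctly disabled. */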
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 3f1f16bc25c9..efb3430a2581 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -38,6 +38,22 @@
38#define __UBIFS_KEY_H__ 38#define __UBIFS_KEY_H__
39 39
40/** 40/**
41 * key_mask_hash - mask a valid hash value.
42 * @val: value to be masked
43 *
44 * We use hash values as offsets in directories, so values %0 and %1 are
45 * reserved for "." and "..". %2 is reserved for "end of readdir" marker. This
46 * function makes sure the reserved values are not used.
47 */
48static inline uint32_t key_mask_hash(uint32_t hash)
49{
50 hash &= UBIFS_S_KEY_HASH_MASK;
51 if (unlikely(hash <= 2))
52 hash += 3;
53 return hash;
54}
55
56/**
41 * key_r5_hash - R5 hash function (borrowed from reiserfs). 57 * key_r5_hash - R5 hash function (borrowed from reiserfs).
42 * @s: direntry name 58 * @s: direntry name
43 * @len: name length 59 * @len: name length
@@ -54,16 +70,7 @@ static inline uint32_t key_r5_hash(const char *s, int len)
54 str++; 70 str++;
55 } 71 }
56 72
57 a &= UBIFS_S_KEY_HASH_MASK; 73 return key_mask_hash(a);
58
59 /*
60 * We use hash values as offset in directories, so values %0 and %1 are
61 * reserved for "." and "..". %2 is reserved for "end of readdir"
62 * marker.
63 */
64 if (unlikely(a >= 0 && a <= 2))
65 a += 3;
66 return a;
67} 74}
68 75
69/** 76/**
@@ -77,10 +84,7 @@ static inline uint32_t key_test_hash(const char *str, int len)
77 84
78 len = min_t(uint32_t, len, 4); 85 len = min_t(uint32_t, len, 4);
79 memcpy(&a, str, len); 86 memcpy(&a, str, len);
80 a &= UBIFS_S_KEY_HASH_MASK; 87 return key_mask_hash(a);
81 if (unlikely(a >= 0 && a <= 2))
82 a += 3;
83 return a;
84} 88}
85 89
86/** 90/**
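key_mask_hash() centralizes what key_r5_hash() and key_test_hash() used to open-code: directory-entry hashes double as readdir offsets, and offsets 0, 1 and 2 are reserved for '.', '..' and the end-of-readdir marker, so any name hashing below 3 is bumped past them. The rewrite also drops the 'a >= 0' half of the old test, which was always true for an unsigned value. A standalone sketch; the 29-bit mask value is an assumption:

    #include <stdint.h>

    #define UBIFS_S_KEY_HASH_MASK 0x1fffffff        /* assumed 29-bit mask */

    static uint32_t key_mask_hash(uint32_t hash)
    {
            hash &= UBIFS_S_KEY_HASH_MASK;
            if (hash <= 2)
                    hash += 3;      /* skip ".", ".." and EOF markers */
            return hash;
    }

    /* key_mask_hash(0) == 3, key_mask_hash(2) == 5; anything >= 3 passes
     * through unchanged after masking. */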
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index f27176e9b70d..dfd2bcece27a 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -520,13 +520,13 @@ static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops)
520 * @flags: new flags 520 * @flags: new flags
521 * @idx_gc_cnt: change to the count of idx_gc list 521 * @idx_gc_cnt: change to the count of idx_gc list
522 * 522 *
523 * This function changes LEB properties. This function does not change a LEB 523 * This function changes LEB properties (@free, @dirty or @flag). However, the
524 * property (@free, @dirty or @flag) if the value passed is %LPROPS_NC. 524 * property which has the %LPROPS_NC value is not changed. Returns a pointer to
525 * the updated LEB properties on success and a negative error code on failure.
525 * 526 *
526 * This function returns a pointer to the updated LEB properties on success 527 * Note, the LEB properties may have had to be copied (due to COW) and
527 * and a negative error code on failure. N.B. the LEB properties may have had to 528 * consequently the pointer returned may not be the same as the pointer
528 * be copied (due to COW) and consequently the pointer returned may not be the 529 * passed.
529 * same as the pointer passed.
530 */ 530 */
531const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, 531const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
532 const struct ubifs_lprops *lp, 532 const struct ubifs_lprops *lp,
@@ -1088,7 +1088,7 @@ static int scan_check_cb(struct ubifs_info *c,
1088 } 1088 }
1089 } 1089 }
1090 1090
1091 sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); 1091 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
1092 if (IS_ERR(sleb)) { 1092 if (IS_ERR(sleb)) {
1093 /* 1093 /*
1094 * After an unclean unmount, empty and freeable LEBs 1094 * After an unclean unmount, empty and freeable LEBs
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index db8bd0e518b2..b2792e84d245 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -36,15 +36,16 @@
36 * can be written into a single eraseblock. In that case, garbage collection 36 * can be written into a single eraseblock. In that case, garbage collection
37 * consists of just writing the whole table, which therefore makes all other 37 * consists of just writing the whole table, which therefore makes all other
38 * eraseblocks reusable. In the case of the big model, dirty eraseblocks are 38 * eraseblocks reusable. In the case of the big model, dirty eraseblocks are
39 * selected for garbage collection, which consists are marking the nodes in 39 * selected for garbage collection, which consists of marking the clean nodes in
40 * that LEB as dirty, and then only the dirty nodes are written out. Also, in 40 * that LEB as dirty, and then only the dirty nodes are written out. Also, in
41 * the case of the big model, a table of LEB numbers is saved so that the entire 41 * the case of the big model, a table of LEB numbers is saved so that the entire
42 * LPT does not have to be scanned looking for empty eraseblocks when UBIFS is first 42 * LPT does not have to be scanned looking for empty eraseblocks when UBIFS is first
43 * mounted. 43 * mounted.
44 */ 44 */
45 45
46#include <linux/crc16.h>
47#include "ubifs.h" 46#include "ubifs.h"
47#include <linux/crc16.h>
48#include <linux/math64.h>
48 49
49/** 50/**
50 * do_calc_lpt_geom - calculate sizes for the LPT area. 51 * do_calc_lpt_geom - calculate sizes for the LPT area.
@@ -135,15 +136,13 @@ static void do_calc_lpt_geom(struct ubifs_info *c)
135int ubifs_calc_lpt_geom(struct ubifs_info *c) 136int ubifs_calc_lpt_geom(struct ubifs_info *c)
136{ 137{
137 int lebs_needed; 138 int lebs_needed;
138 uint64_t sz; 139 long long sz;
139 140
140 do_calc_lpt_geom(c); 141 do_calc_lpt_geom(c);
141 142
142 /* Verify that lpt_lebs is big enough */ 143 /* Verify that lpt_lebs is big enough */
143 sz = c->lpt_sz * 2; /* Must have at least 2 times the size */ 144 sz = c->lpt_sz * 2; /* Must have at least 2 times the size */
144 sz += c->leb_size - 1; 145 lebs_needed = div_u64(sz + c->leb_size - 1, c->leb_size);
145 do_div(sz, c->leb_size);
146 lebs_needed = sz;
147 if (lebs_needed > c->lpt_lebs) { 146 if (lebs_needed > c->lpt_lebs) {
148 ubifs_err("too few LPT LEBs"); 147 ubifs_err("too few LPT LEBs");
149 return -EINVAL; 148 return -EINVAL;
@@ -156,7 +155,6 @@ int ubifs_calc_lpt_geom(struct ubifs_info *c)
156 } 155 }
157 156
158 c->check_lpt_free = c->big_lpt; 157 c->check_lpt_free = c->big_lpt;
159
160 return 0; 158 return 0;
161} 159}
162 160
@@ -176,7 +174,7 @@ static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs,
176 int *big_lpt) 174 int *big_lpt)
177{ 175{
178 int i, lebs_needed; 176 int i, lebs_needed;
179 uint64_t sz; 177 long long sz;
180 178
181 /* Start by assuming the minimum number of LPT LEBs */ 179 /* Start by assuming the minimum number of LPT LEBs */
182 c->lpt_lebs = UBIFS_MIN_LPT_LEBS; 180 c->lpt_lebs = UBIFS_MIN_LPT_LEBS;
@@ -203,9 +201,7 @@ static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs,
203 /* Now check there are enough LPT LEBs */ 201 /* Now check there are enough LPT LEBs */
204 for (i = 0; i < 64 ; i++) { 202 for (i = 0; i < 64 ; i++) {
205 sz = c->lpt_sz * 4; /* Allow 4 times the size */ 203 sz = c->lpt_sz * 4; /* Allow 4 times the size */
206 sz += c->leb_size - 1; 204 lebs_needed = div_u64(sz + c->leb_size - 1, c->leb_size);
207 do_div(sz, c->leb_size);
208 lebs_needed = sz;
209 if (lebs_needed > c->lpt_lebs) { 205 if (lebs_needed > c->lpt_lebs) {
210 /* Not enough LPT LEBs so try again with more */ 206 /* Not enough LPT LEBs so try again with more */
211 c->lpt_lebs = lebs_needed; 207 c->lpt_lebs = lebs_needed;
@@ -558,7 +554,7 @@ static int calc_nnode_num(int row, int col)
558 * This function calculates and returns the nnode number based on the parent's 554 * This function calculates and returns the nnode number based on the parent's
559 * nnode number and the index in parent. 555 * nnode number and the index in parent.
560 */ 556 */
561static int calc_nnode_num_from_parent(struct ubifs_info *c, 557static int calc_nnode_num_from_parent(const struct ubifs_info *c,
562 struct ubifs_nnode *parent, int iip) 558 struct ubifs_nnode *parent, int iip)
563{ 559{
564 int num, shft; 560 int num, shft;
@@ -583,7 +579,7 @@ static int calc_nnode_num_from_parent(struct ubifs_info *c,
583 * This function calculates and returns the pnode number based on the parent's 579 * This function calculates and returns the pnode number based on the parent's
584 * nnode number and the index in parent. 580 * nnode number and the index in parent.
585 */ 581 */
586static int calc_pnode_num_from_parent(struct ubifs_info *c, 582static int calc_pnode_num_from_parent(const struct ubifs_info *c,
587 struct ubifs_nnode *parent, int iip) 583 struct ubifs_nnode *parent, int iip)
588{ 584{
589 int i, n = c->lpt_hght - 1, pnum = parent->num, num = 0; 585 int i, n = c->lpt_hght - 1, pnum = parent->num, num = 0;
@@ -966,7 +962,7 @@ static int check_lpt_type(uint8_t **addr, int *pos, int type)
966 * 962 *
967 * This function returns %0 on success and a negative error code on failure. 963 * This function returns %0 on success and a negative error code on failure.
968 */ 964 */
969static int unpack_pnode(struct ubifs_info *c, void *buf, 965static int unpack_pnode(const struct ubifs_info *c, void *buf,
970 struct ubifs_pnode *pnode) 966 struct ubifs_pnode *pnode)
971{ 967{
972 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; 968 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
@@ -996,15 +992,15 @@ static int unpack_pnode(struct ubifs_info *c, void *buf,
996} 992}
997 993
998/** 994/**
999 * unpack_nnode - unpack a nnode. 995 * ubifs_unpack_nnode - unpack a nnode.
1000 * @c: UBIFS file-system description object 996 * @c: UBIFS file-system description object
1001 * @buf: buffer containing packed nnode to unpack 997 * @buf: buffer containing packed nnode to unpack
1002 * @nnode: nnode structure to fill 998 * @nnode: nnode structure to fill
1003 * 999 *
1004 * This function returns %0 on success and a negative error code on failure. 1000 * This function returns %0 on success and a negative error code on failure.
1005 */ 1001 */
1006static int unpack_nnode(struct ubifs_info *c, void *buf, 1002int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf,
1007 struct ubifs_nnode *nnode) 1003 struct ubifs_nnode *nnode)
1008{ 1004{
1009 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; 1005 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
1010 int i, pos = 0, err; 1006 int i, pos = 0, err;
@@ -1036,7 +1032,7 @@ static int unpack_nnode(struct ubifs_info *c, void *buf,
1036 * 1032 *
1037 * This function returns %0 on success and a negative error code on failure. 1033 * This function returns %0 on success and a negative error code on failure.
1038 */ 1034 */
1039static int unpack_ltab(struct ubifs_info *c, void *buf) 1035static int unpack_ltab(const struct ubifs_info *c, void *buf)
1040{ 1036{
1041 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; 1037 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
1042 int i, pos = 0, err; 1038 int i, pos = 0, err;
@@ -1068,7 +1064,7 @@ static int unpack_ltab(struct ubifs_info *c, void *buf)
1068 * 1064 *
1069 * This function returns %0 on success and a negative error code on failure. 1065 * This function returns %0 on success and a negative error code on failure.
1070 */ 1066 */
1071static int unpack_lsave(struct ubifs_info *c, void *buf) 1067static int unpack_lsave(const struct ubifs_info *c, void *buf)
1072{ 1068{
1073 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; 1069 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
1074 int i, pos = 0, err; 1070 int i, pos = 0, err;
@@ -1096,7 +1092,7 @@ static int unpack_lsave(struct ubifs_info *c, void *buf)
1096 * 1092 *
1097 * This function returns %0 on success and a negative error code on failure. 1093 * This function returns %0 on success and a negative error code on failure.
1098 */ 1094 */
1099static int validate_nnode(struct ubifs_info *c, struct ubifs_nnode *nnode, 1095static int validate_nnode(const struct ubifs_info *c, struct ubifs_nnode *nnode,
1100 struct ubifs_nnode *parent, int iip) 1096 struct ubifs_nnode *parent, int iip)
1101{ 1097{
1102 int i, lvl, max_offs; 1098 int i, lvl, max_offs;
@@ -1140,7 +1136,7 @@ static int validate_nnode(struct ubifs_info *c, struct ubifs_nnode *nnode,
1140 * 1136 *
1141 * This function returns %0 on success and a negative error code on failure. 1137 * This function returns %0 on success and a negative error code on failure.
1142 */ 1138 */
1143static int validate_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, 1139static int validate_pnode(const struct ubifs_info *c, struct ubifs_pnode *pnode,
1144 struct ubifs_nnode *parent, int iip) 1140 struct ubifs_nnode *parent, int iip)
1145{ 1141{
1146 int i; 1142 int i;
@@ -1174,7 +1170,8 @@ static int validate_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
1174 * This function calculates the LEB numbers for the LEB properties it contains 1170 * This function calculates the LEB numbers for the LEB properties it contains
1175 * based on the pnode number. 1171 * based on the pnode number.
1176 */ 1172 */
1177static void set_pnode_lnum(struct ubifs_info *c, struct ubifs_pnode *pnode) 1173static void set_pnode_lnum(const struct ubifs_info *c,
1174 struct ubifs_pnode *pnode)
1178{ 1175{
1179 int i, lnum; 1176 int i, lnum;
1180 1177
@@ -1227,7 +1224,7 @@ int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
1227 err = ubi_read(c->ubi, lnum, buf, offs, c->nnode_sz); 1224 err = ubi_read(c->ubi, lnum, buf, offs, c->nnode_sz);
1228 if (err) 1225 if (err)
1229 goto out; 1226 goto out;
1230 err = unpack_nnode(c, buf, nnode); 1227 err = ubifs_unpack_nnode(c, buf, nnode);
1231 if (err) 1228 if (err)
1232 goto out; 1229 goto out;
1233 } 1230 }
@@ -1816,7 +1813,7 @@ static struct ubifs_nnode *scan_get_nnode(struct ubifs_info *c,
1816 c->nnode_sz); 1813 c->nnode_sz);
1817 if (err) 1814 if (err)
1818 return ERR_PTR(err); 1815 return ERR_PTR(err);
1819 err = unpack_nnode(c, buf, nnode); 1816 err = ubifs_unpack_nnode(c, buf, nnode);
1820 if (err) 1817 if (err)
1821 return ERR_PTR(err); 1818 return ERR_PTR(err);
1822 } 1819 }
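For illustration, the unpack_* and validate_* helpers changed in the hunks above all share one access pattern: skip the UBIFS_LPT_CRC_BYTES prefix, then pull bit-packed fields out of the buffer while advancing a bit position. A minimal userspace sketch of that pattern follows; the 2-byte CRC size and the field widths are assumptions for illustration, not the on-flash format.

#include <stdint.h>
#include <stdio.h>

#define LPT_CRC_BYTES 2 /* stands in for UBIFS_LPT_CRC_BYTES (assumed: 2) */

/* Read 'nrbits' bits at bit offset *pos from addr, LSB first, and advance
 * *pos -- the same contract the kernel's LPT unpack helpers rely on. */
static uint32_t unpack_bits(const uint8_t *addr, int *pos, int nrbits)
{
        uint32_t val = 0;
        int i;

        for (i = 0; i < nrbits; i++, (*pos)++)
                val |= (uint32_t)((addr[*pos >> 3] >> (*pos & 7)) & 1) << i;
        return val;
}

int main(void)
{
        uint8_t buf[8] = { 0, 0, 0x2b, 0 }; /* 2 CRC bytes, then payload */
        const uint8_t *addr = buf + LPT_CRC_BYTES;
        int pos = 0;
        uint32_t type, num;

        /* e.g. a 4-bit type field followed by a 6-bit number field */
        type = unpack_bits(addr, &pos, 4);
        num = unpack_bits(addr, &pos, 6);
        printf("type=%u num=%u\n", type, num);
        return 0;
}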
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index a41434b42785..96ca95707175 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -320,6 +320,8 @@ no_space:
320 dbg_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, " 320 dbg_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, "
321 "done_lsave %d", lnum, offs, len, done_ltab, done_lsave); 321 "done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
322 dbg_dump_lpt_info(c); 322 dbg_dump_lpt_info(c);
323 dbg_dump_lpt_lebs(c);
324 dump_stack();
323 return err; 325 return err;
324} 326}
325 327
@@ -546,8 +548,10 @@ static int write_cnodes(struct ubifs_info *c)
546no_space: 548no_space:
547 ubifs_err("LPT out of space mismatch"); 549 ubifs_err("LPT out of space mismatch");
548 dbg_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab " 550 dbg_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab "
549 "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave); 551 "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
550 dbg_dump_lpt_info(c); 552 dbg_dump_lpt_info(c);
553 dbg_dump_lpt_lebs(c);
554 dump_stack();
551 return err; 555 return err;
552} 556}
553 557
@@ -749,7 +753,7 @@ static void lpt_tgc_start(struct ubifs_info *c)
749 * LPT trivial garbage collection is where a LPT LEB contains only dirty and 753 * LPT trivial garbage collection is where a LPT LEB contains only dirty and
750 * free space and so may be reused as soon as the next commit is completed. 754 * free space and so may be reused as soon as the next commit is completed.
751 * This function is called after the commit is completed (master node has been 755 * This function is called after the commit is completed (master node has been
752 * written) and unmaps LPT LEBs that were marked for trivial GC. 756 * written) and un-maps LPT LEBs that were marked for trivial GC.
753 */ 757 */
754static int lpt_tgc_end(struct ubifs_info *c) 758static int lpt_tgc_end(struct ubifs_info *c)
755{ 759{
@@ -1025,7 +1029,7 @@ static int make_node_dirty(struct ubifs_info *c, int node_type, int node_num,
1025 * @c: UBIFS file-system description object 1029 * @c: UBIFS file-system description object
1026 * @node_type: LPT node type 1030 * @node_type: LPT node type
1027 */ 1031 */
1028static int get_lpt_node_len(struct ubifs_info *c, int node_type) 1032static int get_lpt_node_len(const struct ubifs_info *c, int node_type)
1029{ 1033{
1030 switch (node_type) { 1034 switch (node_type) {
1031 case UBIFS_LPT_NNODE: 1035 case UBIFS_LPT_NNODE:
@@ -1046,7 +1050,7 @@ static int get_lpt_node_len(struct ubifs_info *c, int node_type)
1046 * @buf: buffer 1050 * @buf: buffer
1047 * @len: length of buffer 1051 * @len: length of buffer
1048 */ 1052 */
1049static int get_pad_len(struct ubifs_info *c, uint8_t *buf, int len) 1053static int get_pad_len(const struct ubifs_info *c, uint8_t *buf, int len)
1050{ 1054{
1051 int offs, pad_len; 1055 int offs, pad_len;
1052 1056
@@ -1063,7 +1067,8 @@ static int get_pad_len(struct ubifs_info *c, uint8_t *buf, int len)
1063 * @buf: buffer 1067 * @buf: buffer
1064 * @node_num: node number is returned here 1068 * @node_num: node number is returned here
1065 */ 1069 */
1066static int get_lpt_node_type(struct ubifs_info *c, uint8_t *buf, int *node_num) 1070static int get_lpt_node_type(const struct ubifs_info *c, uint8_t *buf,
1071 int *node_num)
1067{ 1072{
1068 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; 1073 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
1069 int pos = 0, node_type; 1074 int pos = 0, node_type;
@@ -1081,7 +1086,7 @@ static int get_lpt_node_type(struct ubifs_info *c, uint8_t *buf, int *node_num)
1081 * 1086 *
1082 * This function returns %1 if the buffer contains a node or %0 if it does not. 1087 * This function returns %1 if the buffer contains a node or %0 if it does not.
1083 */ 1088 */
1084static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len) 1089static int is_a_node(const struct ubifs_info *c, uint8_t *buf, int len)
1085{ 1090{
1086 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; 1091 uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
1087 int pos = 0, node_type, node_len; 1092 int pos = 0, node_type, node_len;
@@ -1105,7 +1110,6 @@ static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len)
1105 return 1; 1110 return 1;
1106} 1111}
1107 1112
1108
1109/** 1113/**
1110 * lpt_gc_lnum - garbage collect a LPT LEB. 1114 * lpt_gc_lnum - garbage collect a LPT LEB.
1111 * @c: UBIFS file-system description object 1115 * @c: UBIFS file-system description object
@@ -1463,7 +1467,7 @@ void ubifs_lpt_free(struct ubifs_info *c, int wr_only)
1463#ifdef CONFIG_UBIFS_FS_DEBUG 1467#ifdef CONFIG_UBIFS_FS_DEBUG
1464 1468
1465/** 1469/**
1466 * dbg_is_all_ff - determine if a buffer contains only 0xff bytes. 1470 * dbg_is_all_ff - determine if a buffer contains only 0xFF bytes.
1467 * @buf: buffer 1471 * @buf: buffer
1468 * @len: buffer length 1472 * @len: buffer length
1469 */ 1473 */
@@ -1488,7 +1492,7 @@ static int dbg_is_nnode_dirty(struct ubifs_info *c, int lnum, int offs)
1488 struct ubifs_nnode *nnode; 1492 struct ubifs_nnode *nnode;
1489 int hght; 1493 int hght;
1490 1494
1491 /* Entire tree is in memory so first_nnode / next_nnode are ok */ 1495 /* Entire tree is in memory so first_nnode / next_nnode are OK */
1492 nnode = first_nnode(c, &hght); 1496 nnode = first_nnode(c, &hght);
1493 for (; nnode; nnode = next_nnode(c, nnode, &hght)) { 1497 for (; nnode; nnode = next_nnode(c, nnode, &hght)) {
1494 struct ubifs_nbranch *branch; 1498 struct ubifs_nbranch *branch;
@@ -1602,7 +1606,10 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
1602{ 1606{
1603 int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len; 1607 int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len;
1604 int ret; 1608 int ret;
1605 void *buf = c->dbg_buf; 1609 void *buf = c->dbg->buf;
1610
1611 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
1612 return 0;
1606 1613
1607 dbg_lp("LEB %d", lnum); 1614 dbg_lp("LEB %d", lnum);
1608 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); 1615 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
@@ -1704,6 +1711,9 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
1704 long long free = 0; 1711 long long free = 0;
1705 int i; 1712 int i;
1706 1713
1714 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
1715 return 0;
1716
1707 for (i = 0; i < c->lpt_lebs; i++) { 1717 for (i = 0; i < c->lpt_lebs; i++) {
1708 if (c->ltab[i].tgc || c->ltab[i].cmt) 1718 if (c->ltab[i].tgc || c->ltab[i].cmt)
1709 continue; 1719 continue;
@@ -1716,6 +1726,8 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
1716 dbg_err("LPT space error: free %lld lpt_sz %lld", 1726 dbg_err("LPT space error: free %lld lpt_sz %lld",
1717 free, c->lpt_sz); 1727 free, c->lpt_sz);
1718 dbg_dump_lpt_info(c); 1728 dbg_dump_lpt_info(c);
1729 dbg_dump_lpt_lebs(c);
1730 dump_stack();
1719 return -EINVAL; 1731 return -EINVAL;
1720 } 1732 }
1721 return 0; 1733 return 0;
@@ -1731,15 +1743,19 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
1731 */ 1743 */
1732int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) 1744int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
1733{ 1745{
1746 struct ubifs_debug_info *d = c->dbg;
1734 long long chk_lpt_sz, lpt_sz; 1747 long long chk_lpt_sz, lpt_sz;
1735 int err = 0; 1748 int err = 0;
1736 1749
1750 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
1751 return 0;
1752
1737 switch (action) { 1753 switch (action) {
1738 case 0: 1754 case 0:
1739 c->chk_lpt_sz = 0; 1755 d->chk_lpt_sz = 0;
1740 c->chk_lpt_sz2 = 0; 1756 d->chk_lpt_sz2 = 0;
1741 c->chk_lpt_lebs = 0; 1757 d->chk_lpt_lebs = 0;
1742 c->chk_lpt_wastage = 0; 1758 d->chk_lpt_wastage = 0;
1743 if (c->dirty_pn_cnt > c->pnode_cnt) { 1759 if (c->dirty_pn_cnt > c->pnode_cnt) {
1744 dbg_err("dirty pnodes %d exceed max %d", 1760 dbg_err("dirty pnodes %d exceed max %d",
1745 c->dirty_pn_cnt, c->pnode_cnt); 1761 c->dirty_pn_cnt, c->pnode_cnt);
@@ -1752,35 +1768,35 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
1752 } 1768 }
1753 return err; 1769 return err;
1754 case 1: 1770 case 1:
1755 c->chk_lpt_sz += len; 1771 d->chk_lpt_sz += len;
1756 return 0; 1772 return 0;
1757 case 2: 1773 case 2:
1758 c->chk_lpt_sz += len; 1774 d->chk_lpt_sz += len;
1759 c->chk_lpt_wastage += len; 1775 d->chk_lpt_wastage += len;
1760 c->chk_lpt_lebs += 1; 1776 d->chk_lpt_lebs += 1;
1761 return 0; 1777 return 0;
1762 case 3: 1778 case 3:
1763 chk_lpt_sz = c->leb_size; 1779 chk_lpt_sz = c->leb_size;
1764 chk_lpt_sz *= c->chk_lpt_lebs; 1780 chk_lpt_sz *= d->chk_lpt_lebs;
1765 chk_lpt_sz += len - c->nhead_offs; 1781 chk_lpt_sz += len - c->nhead_offs;
1766 if (c->chk_lpt_sz != chk_lpt_sz) { 1782 if (d->chk_lpt_sz != chk_lpt_sz) {
1767 dbg_err("LPT wrote %lld but space used was %lld", 1783 dbg_err("LPT wrote %lld but space used was %lld",
1768 c->chk_lpt_sz, chk_lpt_sz); 1784 d->chk_lpt_sz, chk_lpt_sz);
1769 err = -EINVAL; 1785 err = -EINVAL;
1770 } 1786 }
1771 if (c->chk_lpt_sz > c->lpt_sz) { 1787 if (d->chk_lpt_sz > c->lpt_sz) {
1772 dbg_err("LPT wrote %lld but lpt_sz is %lld", 1788 dbg_err("LPT wrote %lld but lpt_sz is %lld",
1773 c->chk_lpt_sz, c->lpt_sz); 1789 d->chk_lpt_sz, c->lpt_sz);
1774 err = -EINVAL; 1790 err = -EINVAL;
1775 } 1791 }
1776 if (c->chk_lpt_sz2 && c->chk_lpt_sz != c->chk_lpt_sz2) { 1792 if (d->chk_lpt_sz2 && d->chk_lpt_sz != d->chk_lpt_sz2) {
1777 dbg_err("LPT layout size %lld but wrote %lld", 1793 dbg_err("LPT layout size %lld but wrote %lld",
1778 c->chk_lpt_sz, c->chk_lpt_sz2); 1794 d->chk_lpt_sz, d->chk_lpt_sz2);
1779 err = -EINVAL; 1795 err = -EINVAL;
1780 } 1796 }
1781 if (c->chk_lpt_sz2 && c->new_nhead_offs != len) { 1797 if (d->chk_lpt_sz2 && d->new_nhead_offs != len) {
1782 dbg_err("LPT new nhead offs: expected %d was %d", 1798 dbg_err("LPT new nhead offs: expected %d was %d",
1783 c->new_nhead_offs, len); 1799 d->new_nhead_offs, len);
1784 err = -EINVAL; 1800 err = -EINVAL;
1785 } 1801 }
1786 lpt_sz = (long long)c->pnode_cnt * c->pnode_sz; 1802 lpt_sz = (long long)c->pnode_cnt * c->pnode_sz;
@@ -1788,26 +1804,146 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
1788 lpt_sz += c->ltab_sz; 1804 lpt_sz += c->ltab_sz;
1789 if (c->big_lpt) 1805 if (c->big_lpt)
1790 lpt_sz += c->lsave_sz; 1806 lpt_sz += c->lsave_sz;
1791 if (c->chk_lpt_sz - c->chk_lpt_wastage > lpt_sz) { 1807 if (d->chk_lpt_sz - d->chk_lpt_wastage > lpt_sz) {
1792 dbg_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld", 1808 dbg_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld",
1793 c->chk_lpt_sz, c->chk_lpt_wastage, lpt_sz); 1809 d->chk_lpt_sz, d->chk_lpt_wastage, lpt_sz);
1794 err = -EINVAL; 1810 err = -EINVAL;
1795 } 1811 }
1796 if (err) 1812 if (err) {
1797 dbg_dump_lpt_info(c); 1813 dbg_dump_lpt_info(c);
1798 c->chk_lpt_sz2 = c->chk_lpt_sz; 1814 dbg_dump_lpt_lebs(c);
1799 c->chk_lpt_sz = 0; 1815 dump_stack();
1800 c->chk_lpt_wastage = 0; 1816 }
1801 c->chk_lpt_lebs = 0; 1817 d->chk_lpt_sz2 = d->chk_lpt_sz;
1802 c->new_nhead_offs = len; 1818 d->chk_lpt_sz = 0;
1819 d->chk_lpt_wastage = 0;
1820 d->chk_lpt_lebs = 0;
1821 d->new_nhead_offs = len;
1803 return err; 1822 return err;
1804 case 4: 1823 case 4:
1805 c->chk_lpt_sz += len; 1824 d->chk_lpt_sz += len;
1806 c->chk_lpt_wastage += len; 1825 d->chk_lpt_wastage += len;
1807 return 0; 1826 return 0;
1808 default: 1827 default:
1809 return -EINVAL; 1828 return -EINVAL;
1810 } 1829 }
1811} 1830}
1812 1831
1832/**
1833 * dbg_dump_lpt_leb - dump an LPT LEB.
1834 * @c: UBIFS file-system description object
1835 * @lnum: LEB number to dump
1836 *
 1837 * This function dumps an LEB from the LPT area. Nodes in this area are very
 1838 * different from nodes in the main area (e.g., they do not have common headers,
1839 * they do not have 8-byte alignments, etc), so we have a separate function to
1840 * dump LPT area LEBs. Note, LPT has to be locked by the caller.
1841 */
1842static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1843{
1844 int err, len = c->leb_size, node_type, node_num, node_len, offs;
1845 void *buf = c->dbg->buf;
1846
1847 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
1848 current->pid, lnum);
1849 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
1850 if (err) {
1851 ubifs_err("cannot read LEB %d, error %d", lnum, err);
1852 return;
1853 }
1854 while (1) {
1855 offs = c->leb_size - len;
1856 if (!is_a_node(c, buf, len)) {
1857 int pad_len;
1858
1859 pad_len = get_pad_len(c, buf, len);
1860 if (pad_len) {
1861 printk(KERN_DEBUG "LEB %d:%d, pad %d bytes\n",
1862 lnum, offs, pad_len);
1863 buf += pad_len;
1864 len -= pad_len;
1865 continue;
1866 }
1867 if (len)
1868 printk(KERN_DEBUG "LEB %d:%d, free %d bytes\n",
1869 lnum, offs, len);
1870 break;
1871 }
1872
1873 node_type = get_lpt_node_type(c, buf, &node_num);
1874 switch (node_type) {
1875 case UBIFS_LPT_PNODE:
1876 {
1877 node_len = c->pnode_sz;
1878 if (c->big_lpt)
1879 printk(KERN_DEBUG "LEB %d:%d, pnode num %d\n",
1880 lnum, offs, node_num);
1881 else
1882 printk(KERN_DEBUG "LEB %d:%d, pnode\n",
1883 lnum, offs);
1884 break;
1885 }
1886 case UBIFS_LPT_NNODE:
1887 {
1888 int i;
1889 struct ubifs_nnode nnode;
1890
1891 node_len = c->nnode_sz;
1892 if (c->big_lpt)
1893 printk(KERN_DEBUG "LEB %d:%d, nnode num %d, ",
1894 lnum, offs, node_num);
1895 else
1896 printk(KERN_DEBUG "LEB %d:%d, nnode, ",
1897 lnum, offs);
1898 err = ubifs_unpack_nnode(c, buf, &nnode);
1899 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1900 printk("%d:%d", nnode.nbranch[i].lnum,
1901 nnode.nbranch[i].offs);
1902 if (i != UBIFS_LPT_FANOUT - 1)
1903 printk(", ");
1904 }
1905 printk("\n");
1906 break;
1907 }
1908 case UBIFS_LPT_LTAB:
1909 node_len = c->ltab_sz;
1910 printk(KERN_DEBUG "LEB %d:%d, ltab\n",
1911 lnum, offs);
1912 break;
1913 case UBIFS_LPT_LSAVE:
1914 node_len = c->lsave_sz;
 1915 printk(KERN_DEBUG "LEB %d:%d, lsave\n", lnum, offs);
1916 break;
1917 default:
1918 ubifs_err("LPT node type %d not recognized", node_type);
1919 return;
1920 }
1921
1922 buf += node_len;
1923 len -= node_len;
1924 }
1925
1926 printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
1927 current->pid, lnum);
1928}
1929
1930/**
1931 * dbg_dump_lpt_lebs - dump LPT lebs.
1932 * @c: UBIFS file-system description object
1933 *
1934 * This function dumps all LPT LEBs. The caller has to make sure the LPT is
1935 * locked.
1936 */
1937void dbg_dump_lpt_lebs(const struct ubifs_info *c)
1938{
1939 int i;
1940
1941 printk(KERN_DEBUG "(pid %d) start dumping all LPT LEBs\n",
1942 current->pid);
1943 for (i = 0; i < c->lpt_lebs; i++)
1944 dump_lpt_leb(c, i + c->lpt_first);
1945 printk(KERN_DEBUG "(pid %d) finish dumping all LPT LEBs\n",
1946 current->pid);
1947}
1948
1813#endif /* CONFIG_UBIFS_FS_DEBUG */ 1949#endif /* CONFIG_UBIFS_FS_DEBUG */
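The new dump_lpt_leb() above is a cursor walk over one LEB: anything that is not a node must be padding or trailing free space, and otherwise the node type decides how far the cursor advances. The same loop shape in a self-contained userspace sketch; the record layout here is invented for illustration, not the LPT format.

#include <stdio.h>

/* Toy layout, invented: each record is [type][len][payload]; type 0x00 is
 * padding (len covers the whole pad record) and 0xFF marks unwritten free
 * space running to the end of the buffer. */
static void walk(const unsigned char *buf, int size)
{
        int offs = 0;

        while (offs < size) {
                int type = buf[offs];
                int len;

                if (type == 0xFF) {                     /* free space */
                        printf("offs %d: free %d bytes\n", offs, size - offs);
                        break;
                }
                len = buf[offs + 1];
                if (type == 0x00)                       /* padding */
                        printf("offs %d: pad %d bytes\n", offs, len);
                else                                    /* a real node */
                        printf("offs %d: node type %d, len %d\n",
                               offs, type, len);
                offs += len;                            /* advance cursor */
        }
}

int main(void)
{
        unsigned char leb[16] = {
                0x01, 0x04, 0xaa, 0xbb,                 /* node, 4 bytes */
                0x00, 0x04, 0x00, 0x00,                 /* 4 pad bytes   */
                0x02, 0x03, 0xcc,                       /* node, 3 bytes */
                0xFF, 0xFF, 0xFF, 0xFF, 0xFF,           /* free space    */
        };

        walk(leb, (int)sizeof(leb));
        return 0;
}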
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 9bd5a43d4526..9e6f403f170e 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -899,7 +899,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
899 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { 899 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
900 struct ubifs_scan_leb *sleb; 900 struct ubifs_scan_leb *sleb;
901 901
902 sleb = ubifs_scan(c, lnum, 0, c->dbg_buf); 902 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
903 if (IS_ERR(sleb)) { 903 if (IS_ERR(sleb)) {
904 err = PTR_ERR(sleb); 904 err = PTR_ERR(sleb);
905 break; 905 break;
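The c->dbg_buf to c->dbg->buf switch here, as in the other files of this patch, comes from gathering all debugging state into a separately allocated struct ubifs_debug_info, so the main description object carries only a pointer. A schematic of that refactor, with an illustrative field set (not the kernel's actual struct):

#include <stdlib.h>

struct debug_info {             /* stands in for struct ubifs_debug_info */
        void *buf;              /* LEB-sized scratch buffer for checks   */
        long long chk_lpt_sz;   /* ... plus the other chk_* state        */
};

struct fs_info {                /* stands in for struct ubifs_info */
        struct debug_info *dbg; /* allocated only when debugging is on */
        int leb_size;
};

static int debugging_init(struct fs_info *c)
{
        c->dbg = calloc(1, sizeof(*c->dbg));
        if (!c->dbg)
                return -1;
        c->dbg->buf = malloc(c->leb_size);
        if (!c->dbg->buf) {
                free(c->dbg);
                c->dbg = NULL;
                return -1;
        }
        return 0;
}

static void debugging_exit(struct fs_info *c)
{
        free(c->dbg->buf);
        free(c->dbg);
        c->dbg = NULL;
}

int main(void)
{
        struct fs_info c = { .dbg = NULL, .leb_size = 128 * 1024 };

        if (debugging_init(&c) == 0)
                debugging_exit(&c);
        return 0;
}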
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 21f7d047c306..ce42a7b0ca5a 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -144,7 +144,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
144 /* 144 /*
145 * If the replay order was perfect the dirty space would now be 145 * If the replay order was perfect the dirty space would now be
 146 * zero. The order is not perfect because the journal heads 146 * zero. The order is not perfect because the journal heads
 147 * race with eachother. This is not a problem but is does mean 147 * race with each other. This is not a problem but it does mean
148 * that the dirty space may temporarily exceed c->leb_size 148 * that the dirty space may temporarily exceed c->leb_size
149 * during the replay. 149 * during the replay.
150 */ 150 */
@@ -656,7 +656,7 @@ out_dump:
656 * @dirty: amount of dirty space from padding and deletion nodes 656 * @dirty: amount of dirty space from padding and deletion nodes
657 * 657 *
658 * This function inserts a reference node to the replay tree and returns zero 658 * This function inserts a reference node to the replay tree and returns zero
659 * in case of success ort a negative error code in case of failure. 659 * in case of success or a negative error code in case of failure.
660 */ 660 */
661static int insert_ref_node(struct ubifs_info *c, int lnum, int offs, 661static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
662 unsigned long long sqnum, int free, int dirty) 662 unsigned long long sqnum, int free, int dirty)
@@ -883,7 +883,7 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
 883 * This means that we reached the end of the log and now 883 * This means that we reached the end of the log and now
884 * look to the older log data, which was already 884 * look to the older log data, which was already
885 * committed but the eraseblock was not erased (UBIFS 885 * committed but the eraseblock was not erased (UBIFS
886 * only unmaps it). So this basically means we have to 886 * only un-maps it). So this basically means we have to
887 * exit with "end of log" code. 887 * exit with "end of log" code.
888 */ 888 */
889 err = 1; 889 err = 1;
@@ -1062,6 +1062,15 @@ int ubifs_replay_journal(struct ubifs_info *c)
1062 if (err) 1062 if (err)
1063 goto out; 1063 goto out;
1064 1064
1065 /*
1066 * UBIFS budgeting calculations use @c->budg_uncommitted_idx variable
1067 * to roughly estimate index growth. Things like @c->min_idx_lebs
1068 * depend on it. This means we have to initialize it to make sure
1069 * budgeting works properly.
1070 */
1071 c->budg_uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt);
1072 c->budg_uncommitted_idx *= c->max_idx_node_sz;
1073
1065 ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery); 1074 ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery);
1066 dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, " 1075 dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, "
1067 "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum, 1076 "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum,
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 0f392351dc5a..e070c643d1bb 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -28,6 +28,7 @@
28 28
29#include "ubifs.h" 29#include "ubifs.h"
30#include <linux/random.h> 30#include <linux/random.h>
31#include <linux/math64.h>
31 32
32/* 33/*
33 * Default journal size in logical eraseblocks as a percent of total 34 * Default journal size in logical eraseblocks as a percent of total
@@ -80,7 +81,7 @@ static int create_default_filesystem(struct ubifs_info *c)
80 int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first; 81 int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first;
81 int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0; 82 int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0;
82 int min_leb_cnt = UBIFS_MIN_LEB_CNT; 83 int min_leb_cnt = UBIFS_MIN_LEB_CNT;
83 uint64_t tmp64, main_bytes; 84 long long tmp64, main_bytes;
84 __le64 tmp_le64; 85 __le64 tmp_le64;
85 86
 86 /* Some functions called from here depend on the @c->key_len field */ 87 /* Some functions called from here depend on the @c->key_len field */
@@ -160,7 +161,7 @@ static int create_default_filesystem(struct ubifs_info *c)
160 if (!sup) 161 if (!sup)
161 return -ENOMEM; 162 return -ENOMEM;
162 163
163 tmp64 = (uint64_t)max_buds * c->leb_size; 164 tmp64 = (long long)max_buds * c->leb_size;
164 if (big_lpt) 165 if (big_lpt)
165 sup_flags |= UBIFS_FLG_BIGLPT; 166 sup_flags |= UBIFS_FLG_BIGLPT;
166 167
@@ -179,14 +180,16 @@ static int create_default_filesystem(struct ubifs_info *c)
179 sup->fanout = cpu_to_le32(DEFAULT_FANOUT); 180 sup->fanout = cpu_to_le32(DEFAULT_FANOUT);
180 sup->lsave_cnt = cpu_to_le32(c->lsave_cnt); 181 sup->lsave_cnt = cpu_to_le32(c->lsave_cnt);
181 sup->fmt_version = cpu_to_le32(UBIFS_FORMAT_VERSION); 182 sup->fmt_version = cpu_to_le32(UBIFS_FORMAT_VERSION);
182 sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO);
183 sup->time_gran = cpu_to_le32(DEFAULT_TIME_GRAN); 183 sup->time_gran = cpu_to_le32(DEFAULT_TIME_GRAN);
184 if (c->mount_opts.override_compr)
185 sup->default_compr = cpu_to_le16(c->mount_opts.compr_type);
186 else
187 sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO);
184 188
185 generate_random_uuid(sup->uuid); 189 generate_random_uuid(sup->uuid);
186 190
187 main_bytes = (uint64_t)main_lebs * c->leb_size; 191 main_bytes = (long long)main_lebs * c->leb_size;
188 tmp64 = main_bytes * DEFAULT_RP_PERCENT; 192 tmp64 = div_u64(main_bytes * DEFAULT_RP_PERCENT, 100);
189 do_div(tmp64, 100);
190 if (tmp64 > DEFAULT_MAX_RP_SIZE) 193 if (tmp64 > DEFAULT_MAX_RP_SIZE)
191 tmp64 = DEFAULT_MAX_RP_SIZE; 194 tmp64 = DEFAULT_MAX_RP_SIZE;
192 sup->rp_size = cpu_to_le64(tmp64); 195 sup->rp_size = cpu_to_le64(tmp64);
@@ -582,16 +585,15 @@ int ubifs_read_superblock(struct ubifs_info *c)
582 c->jhead_cnt = le32_to_cpu(sup->jhead_cnt) + NONDATA_JHEADS_CNT; 585 c->jhead_cnt = le32_to_cpu(sup->jhead_cnt) + NONDATA_JHEADS_CNT;
583 c->fanout = le32_to_cpu(sup->fanout); 586 c->fanout = le32_to_cpu(sup->fanout);
584 c->lsave_cnt = le32_to_cpu(sup->lsave_cnt); 587 c->lsave_cnt = le32_to_cpu(sup->lsave_cnt);
585 c->default_compr = le16_to_cpu(sup->default_compr);
586 c->rp_size = le64_to_cpu(sup->rp_size); 588 c->rp_size = le64_to_cpu(sup->rp_size);
587 c->rp_uid = le32_to_cpu(sup->rp_uid); 589 c->rp_uid = le32_to_cpu(sup->rp_uid);
588 c->rp_gid = le32_to_cpu(sup->rp_gid); 590 c->rp_gid = le32_to_cpu(sup->rp_gid);
589 sup_flags = le32_to_cpu(sup->flags); 591 sup_flags = le32_to_cpu(sup->flags);
592 if (!c->mount_opts.override_compr)
593 c->default_compr = le16_to_cpu(sup->default_compr);
590 594
591 c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran); 595 c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran);
592
593 memcpy(&c->uuid, &sup->uuid, 16); 596 memcpy(&c->uuid, &sup->uuid, 16);
594
595 c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT); 597 c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT);
596 598
597 /* Automatically increase file system size to the maximum size */ 599 /* Automatically increase file system size to the maximum size */
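The sb.c changes above swap do_div() for div_u64() without changing the formula: the default reserved-pool size is DEFAULT_RP_PERCENT of the main area, capped at DEFAULT_MAX_RP_SIZE. Restated in userspace C; the constant values and geometry below are assumptions for illustration:

#include <stdio.h>

#define DEFAULT_RP_PERCENT 5                      /* assumed value       */
#define DEFAULT_MAX_RP_SIZE (5LL * 1024 * 1024)   /* assumed: 5 MiB cap  */

int main(void)
{
        long long main_lebs = 2000, leb_size = 129024; /* example geometry */
        long long main_bytes = main_lebs * leb_size;
        long long rp_size = main_bytes * DEFAULT_RP_PERCENT / 100;

        if (rp_size > DEFAULT_MAX_RP_SIZE)
                rp_size = DEFAULT_MAX_RP_SIZE;
        printf("rp_size = %lld bytes\n", rp_size); /* capped: 5242880 */
        return 0;
}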
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index f248533841a2..e7bab52a1410 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -151,7 +151,7 @@ static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention)
151 * @contention: if any contention, this is set to %1 151 * @contention: if any contention, this is set to %1
152 * 152 *
153 * This function walks the list of mounted UBIFS file-systems and frees clean 153 * This function walks the list of mounted UBIFS file-systems and frees clean
154 * znodes which are older then @age, until at least @nr znodes are freed. 154 * znodes which are older than @age, until at least @nr znodes are freed.
155 * Returns the number of freed znodes. 155 * Returns the number of freed znodes.
156 */ 156 */
157static int shrink_tnc_trees(int nr, int age, int *contention) 157static int shrink_tnc_trees(int nr, int age, int *contention)
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index d80b2aef42b6..89556ee72518 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -34,6 +34,8 @@
34#include <linux/parser.h> 34#include <linux/parser.h>
35#include <linux/seq_file.h> 35#include <linux/seq_file.h>
36#include <linux/mount.h> 36#include <linux/mount.h>
37#include <linux/math64.h>
38#include <linux/writeback.h>
37#include "ubifs.h" 39#include "ubifs.h"
38 40
39/* 41/*
@@ -417,39 +419,61 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
417 else if (c->mount_opts.chk_data_crc == 1) 419 else if (c->mount_opts.chk_data_crc == 1)
418 seq_printf(s, ",no_chk_data_crc"); 420 seq_printf(s, ",no_chk_data_crc");
419 421
422 if (c->mount_opts.override_compr) {
423 seq_printf(s, ",compr=");
424 seq_printf(s, ubifs_compr_name(c->mount_opts.compr_type));
425 }
426
420 return 0; 427 return 0;
421} 428}
422 429
423static int ubifs_sync_fs(struct super_block *sb, int wait) 430static int ubifs_sync_fs(struct super_block *sb, int wait)
424{ 431{
432 int i, err;
425 struct ubifs_info *c = sb->s_fs_info; 433 struct ubifs_info *c = sb->s_fs_info;
426 int i, ret = 0, err; 434 struct writeback_control wbc = {
427 long long bud_bytes; 435 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
436 .range_start = 0,
437 .range_end = LLONG_MAX,
438 .nr_to_write = LONG_MAX,
439 };
428 440
429 if (c->jheads) { 441 /*
430 for (i = 0; i < c->jhead_cnt; i++) { 442 * Note by akpm about WB_SYNC_NONE used above: zero @wait is just an
431 err = ubifs_wbuf_sync(&c->jheads[i].wbuf); 443 * advisory thing to help the file system shove lots of data into the
432 if (err && !ret) 444 * queues. If some gets missed then it'll be picked up on the second
433 ret = err; 445 * '->sync_fs()' call, with non-zero @wait.
434 } 446 */
435 447
436 /* Commit the journal unless it has too little data */ 448 if (sb->s_flags & MS_RDONLY)
437 spin_lock(&c->buds_lock); 449 return 0;
438 bud_bytes = c->bud_bytes; 450
439 spin_unlock(&c->buds_lock); 451 /*
440 if (bud_bytes > c->leb_size) { 452 * Synchronize write buffers, because 'ubifs_run_commit()' does not
441 err = ubifs_run_commit(c); 453 * do this if it waits for an already running commit.
442 if (err) 454 */
443 return err; 455 for (i = 0; i < c->jhead_cnt; i++) {
444 } 456 err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
457 if (err)
458 return err;
445 } 459 }
446 460
447 /* 461 /*
448 * We ought to call sync for c->ubi but it does not have one. If it had 462 * VFS calls '->sync_fs()' before synchronizing all dirty inodes and
449 * it would in turn call mtd->sync, however mtd operations are 463 * pages, so synchronize them first, then commit the journal. Strictly
450 * synchronous anyway, so we don't lose any sleep here. 464 * speaking, it is not necessary to commit the journal here,
465 * synchronizing write-buffers would be enough. But committing makes
466 * UBIFS free space predictions much more accurate, so we want to let
467 * the user be able to get more accurate results of 'statfs()' after
468 * they synchronize the file system.
451 */ 469 */
452 return ret; 470 generic_sync_sb_inodes(sb, &wbc);
471
472 err = ubifs_run_commit(c);
473 if (err)
474 return err;
475
476 return ubi_sync(c->vi.ubi_num);
453} 477}
454 478
455/** 479/**
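The rewritten ubifs_sync_fs() above establishes a fixed ordering: journal write-buffers, then dirty pages via generic_sync_sb_inodes(), then a journal commit, then a device flush via ubi_sync(). The control flow, reduced to stubs so the order stands out (a sketch, not the kernel code):

#include <stdio.h>

static int sync_wbufs(void)   { puts("1. sync journal write-buffers");    return 0; }
static int sync_inodes(void)  { puts("2. write back dirty pages/inodes"); return 0; }
static int run_commit(void)   { puts("3. commit the journal");            return 0; }
static int flush_device(void) { puts("4. flush the underlying device");   return 0; }

/* Mirrors the order in ubifs_sync_fs(); a read-only FS returns early. */
static int sync_fs(int read_only)
{
        int err;

        if (read_only)
                return 0;
        if ((err = sync_wbufs()))
                return err;
        sync_inodes();
        if ((err = run_commit()))
                return err;
        return flush_device();
}

int main(void)
{
        return sync_fs(0);
}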
@@ -596,7 +620,7 @@ static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad)
596} 620}
597 621
598/* 622/*
599 * init_constants_late - initialize UBIFS constants. 623 * init_constants_sb - initialize UBIFS constants.
600 * @c: UBIFS file-system description object 624 * @c: UBIFS file-system description object
601 * 625 *
602 * This is a helper function which initializes various UBIFS constants after 626 * This is a helper function which initializes various UBIFS constants after
@@ -604,10 +628,10 @@ static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad)
604 * makes sure they are all right. Returns zero in case of success and a 628 * makes sure they are all right. Returns zero in case of success and a
605 * negative error code in case of failure. 629 * negative error code in case of failure.
606 */ 630 */
607static int init_constants_late(struct ubifs_info *c) 631static int init_constants_sb(struct ubifs_info *c)
608{ 632{
609 int tmp, err; 633 int tmp, err;
610 uint64_t tmp64; 634 long long tmp64;
611 635
612 c->main_bytes = (long long)c->main_lebs * c->leb_size; 636 c->main_bytes = (long long)c->main_lebs * c->leb_size;
613 c->max_znode_sz = sizeof(struct ubifs_znode) + 637 c->max_znode_sz = sizeof(struct ubifs_znode) +
@@ -634,9 +658,8 @@ static int init_constants_late(struct ubifs_info *c)
634 * Make sure that the log is large enough to fit reference nodes for 658 * Make sure that the log is large enough to fit reference nodes for
635 * all buds plus one reserved LEB. 659 * all buds plus one reserved LEB.
636 */ 660 */
637 tmp64 = c->max_bud_bytes; 661 tmp64 = c->max_bud_bytes + c->leb_size - 1;
638 tmp = do_div(tmp64, c->leb_size); 662 c->max_bud_cnt = div_u64(tmp64, c->leb_size);
639 c->max_bud_cnt = tmp64 + !!tmp;
640 tmp = (c->ref_node_alsz * c->max_bud_cnt + c->leb_size - 1); 663 tmp = (c->ref_node_alsz * c->max_bud_cnt + c->leb_size - 1);
641 tmp /= c->leb_size; 664 tmp /= c->leb_size;
642 tmp += 1; 665 tmp += 1;
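The two-line hunk above is the standard ceiling-division rewrite: the old code took the quotient and added one when do_div() returned a remainder, while the new code adds leb_size - 1 before dividing. Both count how many LEBs max_bud_bytes can span, as this small check shows (values are arbitrary examples):

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint64_t max_bud_bytes = 1000000, leb_size = 129024;

        /* old: quotient plus one if a remainder exists */
        uint64_t old_cnt = max_bud_bytes / leb_size +
                           !!(max_bud_bytes % leb_size);
        /* new: single division after rounding up */
        uint64_t new_cnt = (max_bud_bytes + leb_size - 1) / leb_size;

        assert(old_cnt == new_cnt);     /* 8 for these values */
        return 0;
}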
@@ -672,7 +695,7 @@ static int init_constants_late(struct ubifs_info *c)
672 * Consequently, if the journal is too small, UBIFS will treat it as 695 * Consequently, if the journal is too small, UBIFS will treat it as
673 * always full. 696 * always full.
674 */ 697 */
675 tmp64 = (uint64_t)(c->jhead_cnt + 1) * c->leb_size + 1; 698 tmp64 = (long long)(c->jhead_cnt + 1) * c->leb_size + 1;
676 if (c->bg_bud_bytes < tmp64) 699 if (c->bg_bud_bytes < tmp64)
677 c->bg_bud_bytes = tmp64; 700 c->bg_bud_bytes = tmp64;
678 if (c->max_bud_bytes < tmp64 + c->leb_size) 701 if (c->max_bud_bytes < tmp64 + c->leb_size)
@@ -682,6 +705,21 @@ static int init_constants_late(struct ubifs_info *c)
682 if (err) 705 if (err)
683 return err; 706 return err;
684 707
708 return 0;
709}
710
711/*
712 * init_constants_master - initialize UBIFS constants.
713 * @c: UBIFS file-system description object
714 *
715 * This is a helper function which initializes various UBIFS constants after
716 * the master node has been read. It also checks various UBIFS parameters and
717 * makes sure they are all right.
718 */
719static void init_constants_master(struct ubifs_info *c)
720{
721 long long tmp64;
722
685 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 723 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
686 724
687 /* 725 /*
@@ -690,14 +728,13 @@ static int init_constants_late(struct ubifs_info *c)
690 * necessary to report something for the 'statfs()' call. 728 * necessary to report something for the 'statfs()' call.
691 * 729 *
692 * Subtract the LEB reserved for GC, the LEB which is reserved for 730 * Subtract the LEB reserved for GC, the LEB which is reserved for
693 * deletions, and assume only one journal head is available. 731 * deletions, minimum LEBs for the index, and assume only one journal
732 * head is available.
694 */ 733 */
695 tmp64 = c->main_lebs - 2 - c->jhead_cnt + 1; 734 tmp64 = c->main_lebs - 1 - 1 - MIN_INDEX_LEBS - c->jhead_cnt + 1;
696 tmp64 *= (uint64_t)c->leb_size - c->leb_overhead; 735 tmp64 *= (long long)c->leb_size - c->leb_overhead;
697 tmp64 = ubifs_reported_space(c, tmp64); 736 tmp64 = ubifs_reported_space(c, tmp64);
698 c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT; 737 c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT;
699
700 return 0;
701} 738}
702 739
703/** 740/**
@@ -878,6 +915,7 @@ static int check_volume_empty(struct ubifs_info *c)
878 * Opt_no_bulk_read: disable bulk-reads 915 * Opt_no_bulk_read: disable bulk-reads
879 * Opt_chk_data_crc: check CRCs when reading data nodes 916 * Opt_chk_data_crc: check CRCs when reading data nodes
880 * Opt_no_chk_data_crc: do not check CRCs when reading data nodes 917 * Opt_no_chk_data_crc: do not check CRCs when reading data nodes
918 * Opt_override_compr: override default compressor
881 * Opt_err: just end of array marker 919 * Opt_err: just end of array marker
882 */ 920 */
883enum { 921enum {
@@ -887,6 +925,7 @@ enum {
887 Opt_no_bulk_read, 925 Opt_no_bulk_read,
888 Opt_chk_data_crc, 926 Opt_chk_data_crc,
889 Opt_no_chk_data_crc, 927 Opt_no_chk_data_crc,
928 Opt_override_compr,
890 Opt_err, 929 Opt_err,
891}; 930};
892 931
@@ -897,6 +936,7 @@ static const match_table_t tokens = {
897 {Opt_no_bulk_read, "no_bulk_read"}, 936 {Opt_no_bulk_read, "no_bulk_read"},
898 {Opt_chk_data_crc, "chk_data_crc"}, 937 {Opt_chk_data_crc, "chk_data_crc"},
899 {Opt_no_chk_data_crc, "no_chk_data_crc"}, 938 {Opt_no_chk_data_crc, "no_chk_data_crc"},
939 {Opt_override_compr, "compr=%s"},
900 {Opt_err, NULL}, 940 {Opt_err, NULL},
901}; 941};
902 942
@@ -950,6 +990,28 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
950 c->mount_opts.chk_data_crc = 1; 990 c->mount_opts.chk_data_crc = 1;
951 c->no_chk_data_crc = 1; 991 c->no_chk_data_crc = 1;
952 break; 992 break;
993 case Opt_override_compr:
994 {
995 char *name = match_strdup(&args[0]);
996
997 if (!name)
998 return -ENOMEM;
999 if (!strcmp(name, "none"))
1000 c->mount_opts.compr_type = UBIFS_COMPR_NONE;
1001 else if (!strcmp(name, "lzo"))
1002 c->mount_opts.compr_type = UBIFS_COMPR_LZO;
1003 else if (!strcmp(name, "zlib"))
1004 c->mount_opts.compr_type = UBIFS_COMPR_ZLIB;
1005 else {
1006 ubifs_err("unknown compressor \"%s\"", name);
1007 kfree(name);
1008 return -EINVAL;
1009 }
1010 kfree(name);
1011 c->mount_opts.override_compr = 1;
1012 c->default_compr = c->mount_opts.compr_type;
1013 break;
1014 }
953 default: 1015 default:
954 ubifs_err("unrecognized mount option \"%s\" " 1016 ubifs_err("unrecognized mount option \"%s\" "
955 "or missing value", p); 1017 "or missing value", p);
@@ -1019,6 +1081,30 @@ again:
1019} 1081}
1020 1082
1021/** 1083/**
1084 * check_free_space - check if there is enough free space to mount.
1085 * @c: UBIFS file-system description object
1086 *
1087 * This function makes sure UBIFS has enough free space to be mounted in
1088 * read/write mode. UBIFS must always have some free space to allow deletions.
1089 */
1090static int check_free_space(struct ubifs_info *c)
1091{
1092 ubifs_assert(c->dark_wm > 0);
1093 if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) {
1094 ubifs_err("insufficient free space to mount in read/write mode");
1095 dbg_dump_budg(c);
1096 dbg_dump_lprops(c);
1097 /*
1098 * We return %-EINVAL instead of %-ENOSPC because it seems to
1099 * be the closest error code mentioned in the mount function
1100 * documentation.
1101 */
1102 return -EINVAL;
1103 }
1104 return 0;
1105}
1106
1107/**
1022 * mount_ubifs - mount UBIFS file-system. 1108 * mount_ubifs - mount UBIFS file-system.
1023 * @c: UBIFS file-system description object 1109 * @c: UBIFS file-system description object
1024 * 1110 *
@@ -1039,11 +1125,9 @@ static int mount_ubifs(struct ubifs_info *c)
1039 if (err) 1125 if (err)
1040 return err; 1126 return err;
1041 1127
1042#ifdef CONFIG_UBIFS_FS_DEBUG 1128 err = ubifs_debugging_init(c);
1043 c->dbg_buf = vmalloc(c->leb_size); 1129 if (err)
1044 if (!c->dbg_buf) 1130 return err;
1045 return -ENOMEM;
1046#endif
1047 1131
1048 err = check_volume_empty(c); 1132 err = check_volume_empty(c);
1049 if (err) 1133 if (err)
@@ -1100,27 +1184,25 @@ static int mount_ubifs(struct ubifs_info *c)
1100 goto out_free; 1184 goto out_free;
1101 1185
1102 /* 1186 /*
1103 * Make sure the compressor which is set as the default on in the 1187 * Make sure the compressor which is set as default in the superblock
1104 * superblock was actually compiled in. 1188 * or overridden by mount options is actually compiled in.
1105 */ 1189 */
1106 if (!ubifs_compr_present(c->default_compr)) { 1190 if (!ubifs_compr_present(c->default_compr)) {
 1107 ubifs_warn("'%s' compressor is set by superblock, but not " 1191 ubifs_err("compressor \"%s\" is not compiled in",
1108 "compiled in", ubifs_compr_name(c->default_compr)); 1192 ubifs_compr_name(c->default_compr));
1109 c->default_compr = UBIFS_COMPR_NONE; 1193 goto out_free;
1110 } 1194 }
1111 1195
1112 dbg_failure_mode_registration(c); 1196 err = init_constants_sb(c);
1113
1114 err = init_constants_late(c);
1115 if (err) 1197 if (err)
1116 goto out_dereg; 1198 goto out_free;
1117 1199
1118 sz = ALIGN(c->max_idx_node_sz, c->min_io_size); 1200 sz = ALIGN(c->max_idx_node_sz, c->min_io_size);
1119 sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size); 1201 sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size);
1120 c->cbuf = kmalloc(sz, GFP_NOFS); 1202 c->cbuf = kmalloc(sz, GFP_NOFS);
1121 if (!c->cbuf) { 1203 if (!c->cbuf) {
1122 err = -ENOMEM; 1204 err = -ENOMEM;
1123 goto out_dereg; 1205 goto out_free;
1124 } 1206 }
1125 1207
1126 sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id); 1208 sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);
@@ -1145,6 +1227,8 @@ static int mount_ubifs(struct ubifs_info *c)
1145 if (err) 1227 if (err)
1146 goto out_master; 1228 goto out_master;
1147 1229
1230 init_constants_master(c);
1231
1148 if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) { 1232 if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {
1149 ubifs_msg("recovery needed"); 1233 ubifs_msg("recovery needed");
1150 c->need_recovery = 1; 1234 c->need_recovery = 1;
@@ -1183,12 +1267,9 @@ static int mount_ubifs(struct ubifs_info *c)
1183 if (!mounted_read_only) { 1267 if (!mounted_read_only) {
1184 int lnum; 1268 int lnum;
1185 1269
1186 /* Check for enough free space */ 1270 err = check_free_space(c);
1187 if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) { 1271 if (err)
1188 ubifs_err("insufficient available space");
1189 err = -EINVAL;
1190 goto out_orphans; 1272 goto out_orphans;
1191 }
1192 1273
1193 /* Check for enough log space */ 1274 /* Check for enough log space */
1194 lnum = c->lhead_lnum + 1; 1275 lnum = c->lhead_lnum + 1;
@@ -1232,6 +1313,10 @@ static int mount_ubifs(struct ubifs_info *c)
1232 } 1313 }
1233 } 1314 }
1234 1315
1316 err = dbg_debugfs_init_fs(c);
1317 if (err)
1318 goto out_infos;
1319
1235 err = dbg_check_filesystem(c); 1320 err = dbg_check_filesystem(c);
1236 if (err) 1321 if (err)
1237 goto out_infos; 1322 goto out_infos;
@@ -1283,8 +1368,20 @@ static int mount_ubifs(struct ubifs_info *c)
1283 dbg_msg("tree fanout: %d", c->fanout); 1368 dbg_msg("tree fanout: %d", c->fanout);
1284 dbg_msg("reserved GC LEB: %d", c->gc_lnum); 1369 dbg_msg("reserved GC LEB: %d", c->gc_lnum);
1285 dbg_msg("first main LEB: %d", c->main_first); 1370 dbg_msg("first main LEB: %d", c->main_first);
1371 dbg_msg("max. znode size %d", c->max_znode_sz);
1372 dbg_msg("max. index node size %d", c->max_idx_node_sz);
1373 dbg_msg("node sizes: data %zu, inode %zu, dentry %zu",
1374 UBIFS_DATA_NODE_SZ, UBIFS_INO_NODE_SZ, UBIFS_DENT_NODE_SZ);
1375 dbg_msg("node sizes: trun %zu, sb %zu, master %zu",
1376 UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ);
1377 dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu",
1378 UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ);
1379 dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu",
1380 UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,
1381 UBIFS_MAX_DENT_NODE_SZ);
1286 dbg_msg("dead watermark: %d", c->dead_wm); 1382 dbg_msg("dead watermark: %d", c->dead_wm);
1287 dbg_msg("dark watermark: %d", c->dark_wm); 1383 dbg_msg("dark watermark: %d", c->dark_wm);
1384 dbg_msg("LEB overhead: %d", c->leb_overhead);
1288 x = (long long)c->main_lebs * c->dark_wm; 1385 x = (long long)c->main_lebs * c->dark_wm;
1289 dbg_msg("max. dark space: %lld (%lld KiB, %lld MiB)", 1386 dbg_msg("max. dark space: %lld (%lld KiB, %lld MiB)",
1290 x, x >> 10, x >> 20); 1387 x, x >> 10, x >> 20);
@@ -1320,14 +1417,12 @@ out_wbufs:
1320 free_wbufs(c); 1417 free_wbufs(c);
1321out_cbuf: 1418out_cbuf:
1322 kfree(c->cbuf); 1419 kfree(c->cbuf);
1323out_dereg:
1324 dbg_failure_mode_deregistration(c);
1325out_free: 1420out_free:
1326 kfree(c->bu.buf); 1421 kfree(c->bu.buf);
1327 vfree(c->ileb_buf); 1422 vfree(c->ileb_buf);
1328 vfree(c->sbuf); 1423 vfree(c->sbuf);
1329 kfree(c->bottom_up_buf); 1424 kfree(c->bottom_up_buf);
1330 UBIFS_DBG(vfree(c->dbg_buf)); 1425 ubifs_debugging_exit(c);
1331 return err; 1426 return err;
1332} 1427}
1333 1428
@@ -1345,6 +1440,7 @@ static void ubifs_umount(struct ubifs_info *c)
1345 dbg_gen("un-mounting UBI device %d, volume %d", c->vi.ubi_num, 1440 dbg_gen("un-mounting UBI device %d, volume %d", c->vi.ubi_num,
1346 c->vi.vol_id); 1441 c->vi.vol_id);
1347 1442
1443 dbg_debugfs_exit_fs(c);
1348 spin_lock(&ubifs_infos_lock); 1444 spin_lock(&ubifs_infos_lock);
1349 list_del(&c->infos_list); 1445 list_del(&c->infos_list);
1350 spin_unlock(&ubifs_infos_lock); 1446 spin_unlock(&ubifs_infos_lock);
@@ -1364,8 +1460,7 @@ static void ubifs_umount(struct ubifs_info *c)
1364 vfree(c->ileb_buf); 1460 vfree(c->ileb_buf);
1365 vfree(c->sbuf); 1461 vfree(c->sbuf);
1366 kfree(c->bottom_up_buf); 1462 kfree(c->bottom_up_buf);
1367 UBIFS_DBG(vfree(c->dbg_buf)); 1463 ubifs_debugging_exit(c);
1368 dbg_failure_mode_deregistration(c);
1369} 1464}
1370 1465
1371/** 1466/**
@@ -1387,12 +1482,9 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1387 c->remounting_rw = 1; 1482 c->remounting_rw = 1;
1388 c->always_chk_crc = 1; 1483 c->always_chk_crc = 1;
1389 1484
1390 /* Check for enough free space */ 1485 err = check_free_space(c);
1391 if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) { 1486 if (err)
1392 ubifs_err("insufficient available space");
1393 err = -EINVAL;
1394 goto out; 1487 goto out;
1395 }
1396 1488
1397 if (c->old_leb_cnt != c->leb_cnt) { 1489 if (c->old_leb_cnt != c->leb_cnt) {
1398 struct ubifs_sb_node *sup; 1490 struct ubifs_sb_node *sup;
@@ -1515,20 +1607,24 @@ out:
1515 * @c: UBIFS file-system description object 1607 * @c: UBIFS file-system description object
1516 * 1608 *
1517 * This function is called during un-mounting and re-mounting, and it commits 1609 * This function is called during un-mounting and re-mounting, and it commits
1518 * the journal unless the "fast unmount" mode is enabled. It also avoids 1610 * the journal unless the "fast unmount" mode is enabled.
1519 * committing the journal if it contains too few data.
1520 */ 1611 */
1521static void commit_on_unmount(struct ubifs_info *c) 1612static void commit_on_unmount(struct ubifs_info *c)
1522{ 1613{
1523 if (!c->fast_unmount) { 1614 struct super_block *sb = c->vfs_sb;
1524 long long bud_bytes; 1615 long long bud_bytes;
1525 1616
1526 spin_lock(&c->buds_lock); 1617 /*
1527 bud_bytes = c->bud_bytes; 1618 * This function is called before the background thread is stopped, so
1528 spin_unlock(&c->buds_lock); 1619 * we may race with ongoing commit, which means we have to take
1529 if (bud_bytes > c->leb_size) 1620 * @c->bud_lock to access @c->bud_bytes.
1530 ubifs_run_commit(c); 1621 */
1531 } 1622 spin_lock(&c->buds_lock);
1623 bud_bytes = c->bud_bytes;
1624 spin_unlock(&c->buds_lock);
1625
1626 if (!c->fast_unmount && !(sb->s_flags & MS_RDONLY) && bud_bytes)
1627 ubifs_run_commit(c);
1532} 1628}
1533 1629
1534/** 1630/**
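The comment added to commit_on_unmount() is the crux: @c->bud_bytes is a 64-bit counter updated under @c->buds_lock, so a reader racing with an ongoing commit must take the same lock to get an untorn snapshot. The same pattern in portable pthreads form (a sketch, not kernel code):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t buds_lock = PTHREAD_MUTEX_INITIALIZER;
static long long bud_bytes;             /* written by the commit path */

/* Snapshot a value another thread updates under the same lock; on 32-bit
 * targets a bare 64-bit read could otherwise be torn mid-update. */
static long long read_bud_bytes(void)
{
        long long v;

        pthread_mutex_lock(&buds_lock);
        v = bud_bytes;
        pthread_mutex_unlock(&buds_lock);
        return v;
}

int main(void)
{
        printf("bud_bytes snapshot: %lld\n", read_bud_bytes());
        return 0;
}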
@@ -1849,7 +1945,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1849 goto out_iput; 1945 goto out_iput;
1850 1946
1851 mutex_unlock(&c->umount_mutex); 1947 mutex_unlock(&c->umount_mutex);
1852
1853 return 0; 1948 return 0;
1854 1949
1855out_iput: 1950out_iput:
@@ -1955,7 +2050,7 @@ static void ubifs_kill_sb(struct super_block *sb)
1955 * We do 'commit_on_unmount()' here instead of 'ubifs_put_super()' 2050 * We do 'commit_on_unmount()' here instead of 'ubifs_put_super()'
1956 * in order to be outside BKL. 2051 * in order to be outside BKL.
1957 */ 2052 */
1958 if (sb->s_root && !(sb->s_flags & MS_RDONLY)) 2053 if (sb->s_root)
1959 commit_on_unmount(c); 2054 commit_on_unmount(c);
1960 /* The un-mount routine is actually done in put_super() */ 2055 /* The un-mount routine is actually done in put_super() */
1961 generic_shutdown_super(sb); 2056 generic_shutdown_super(sb);
@@ -2021,6 +2116,14 @@ static int __init ubifs_init(void)
2021 BUILD_BUG_ON(UBIFS_REF_NODE_SZ != 64); 2116 BUILD_BUG_ON(UBIFS_REF_NODE_SZ != 64);
2022 2117
2023 /* 2118 /*
2119 * We use 2 bit wide bit-fields to store compression type, which should
2120 * be amended if more compressors are added. The bit-fields are:
2121 * @compr_type in 'struct ubifs_inode', @default_compr in
2122 * 'struct ubifs_info' and @compr_type in 'struct ubifs_mount_opts'.
2123 */
2124 BUILD_BUG_ON(UBIFS_COMPR_TYPES_CNT > 4);
2125
2126 /*
2024 * We require that PAGE_CACHE_SIZE is greater-than-or-equal-to 2127 * We require that PAGE_CACHE_SIZE is greater-than-or-equal-to
2025 * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2. 2128 * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2.
2026 */ 2129 */
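The new BUILD_BUG_ON() guards the 2-bit compr_type bit-fields listed in the comment above it: two bits store at most four distinct values, so the compressor count must never exceed four. The equivalent compile-time check in plain C11:

#include <assert.h>

enum { COMPR_NONE, COMPR_LZO, COMPR_ZLIB, COMPR_TYPES_CNT };

struct inode_flags {
        unsigned int compr_type:2;      /* can store values 0..3 only */
};

/* C11 counterpart of BUILD_BUG_ON(UBIFS_COMPR_TYPES_CNT > 4) */
static_assert(COMPR_TYPES_CNT <= 4, "compr_type bit-field too narrow");

int main(void)
{
        struct inode_flags f = { .compr_type = COMPR_ZLIB };

        return f.compr_type == COMPR_ZLIB ? 0 : 1;
}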
@@ -2049,11 +2152,17 @@ static int __init ubifs_init(void)
2049 2152
2050 err = ubifs_compressors_init(); 2153 err = ubifs_compressors_init();
2051 if (err) 2154 if (err)
2155 goto out_shrinker;
2156
2157 err = dbg_debugfs_init();
2158 if (err)
2052 goto out_compr; 2159 goto out_compr;
2053 2160
2054 return 0; 2161 return 0;
2055 2162
2056out_compr: 2163out_compr:
2164 ubifs_compressors_exit();
2165out_shrinker:
2057 unregister_shrinker(&ubifs_shrinker_info); 2166 unregister_shrinker(&ubifs_shrinker_info);
2058 kmem_cache_destroy(ubifs_inode_slab); 2167 kmem_cache_destroy(ubifs_inode_slab);
2059out_reg: 2168out_reg:
@@ -2068,6 +2177,7 @@ static void __exit ubifs_exit(void)
2068 ubifs_assert(list_empty(&ubifs_infos)); 2177 ubifs_assert(list_empty(&ubifs_infos));
2069 ubifs_assert(atomic_long_read(&ubifs_clean_zn_cnt) == 0); 2178 ubifs_assert(atomic_long_read(&ubifs_clean_zn_cnt) == 0);
2070 2179
2180 dbg_debugfs_exit();
2071 ubifs_compressors_exit(); 2181 ubifs_compressors_exit();
2072 unregister_shrinker(&ubifs_shrinker_info); 2182 unregister_shrinker(&ubifs_shrinker_info);
2073 kmem_cache_destroy(ubifs_inode_slab); 2183 kmem_cache_destroy(ubifs_inode_slab);
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 6eef5344a145..f7e36f545527 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -2245,12 +2245,11 @@ int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key,
2245 if (found) { 2245 if (found) {
2246 /* Ensure the znode is dirtied */ 2246 /* Ensure the znode is dirtied */
2247 if (znode->cnext || !ubifs_zn_dirty(znode)) { 2247 if (znode->cnext || !ubifs_zn_dirty(znode)) {
2248 znode = dirty_cow_bottom_up(c, 2248 znode = dirty_cow_bottom_up(c, znode);
2249 znode); 2249 if (IS_ERR(znode)) {
2250 if (IS_ERR(znode)) { 2250 err = PTR_ERR(znode);
2251 err = PTR_ERR(znode); 2251 goto out_unlock;
2252 goto out_unlock; 2252 }
2253 }
2254 } 2253 }
2255 zbr = &znode->zbranch[n]; 2254 zbr = &znode->zbranch[n];
2256 lnc_free(zbr); 2255 lnc_free(zbr);
@@ -2317,11 +2316,11 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
2317 2316
2318 /* Ensure the znode is dirtied */ 2317 /* Ensure the znode is dirtied */
2319 if (znode->cnext || !ubifs_zn_dirty(znode)) { 2318 if (znode->cnext || !ubifs_zn_dirty(znode)) {
2320 znode = dirty_cow_bottom_up(c, znode); 2319 znode = dirty_cow_bottom_up(c, znode);
2321 if (IS_ERR(znode)) { 2320 if (IS_ERR(znode)) {
2322 err = PTR_ERR(znode); 2321 err = PTR_ERR(znode);
2323 goto out_unlock; 2322 goto out_unlock;
2324 } 2323 }
2325 } 2324 }
2326 2325
2327 if (found == 1) { 2326 if (found == 1) {
@@ -2627,11 +2626,11 @@ int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key,
2627 2626
2628 /* Ensure the znode is dirtied */ 2627 /* Ensure the znode is dirtied */
2629 if (znode->cnext || !ubifs_zn_dirty(znode)) { 2628 if (znode->cnext || !ubifs_zn_dirty(znode)) {
2630 znode = dirty_cow_bottom_up(c, znode); 2629 znode = dirty_cow_bottom_up(c, znode);
2631 if (IS_ERR(znode)) { 2630 if (IS_ERR(znode)) {
2632 err = PTR_ERR(znode); 2631 err = PTR_ERR(znode);
2633 goto out_unlock; 2632 goto out_unlock;
2634 } 2633 }
2635 } 2634 }
2636 2635
2637 /* Remove all keys in range except the first */ 2636 /* Remove all keys in range except the first */
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 8ac76b1c2d55..fde8d127c768 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -553,8 +553,8 @@ static int layout_in_empty_space(struct ubifs_info *c)
553 } 553 }
554 554
555#ifdef CONFIG_UBIFS_FS_DEBUG 555#ifdef CONFIG_UBIFS_FS_DEBUG
556 c->new_ihead_lnum = lnum; 556 c->dbg->new_ihead_lnum = lnum;
557 c->new_ihead_offs = buf_offs; 557 c->dbg->new_ihead_offs = buf_offs;
558#endif 558#endif
559 559
560 return 0; 560 return 0;
@@ -802,8 +802,10 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot)
802 * budgeting subsystem to assume the index is already committed, 802 * budgeting subsystem to assume the index is already committed,
803 * even though it is not. 803 * even though it is not.
804 */ 804 */
805 ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c));
805 c->old_idx_sz = c->calc_idx_sz; 806 c->old_idx_sz = c->calc_idx_sz;
806 c->budg_uncommitted_idx = 0; 807 c->budg_uncommitted_idx = 0;
808 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
807 spin_unlock(&c->space_lock); 809 spin_unlock(&c->space_lock);
808 mutex_unlock(&c->tnc_mutex); 810 mutex_unlock(&c->tnc_mutex);
809 811
@@ -1002,7 +1004,8 @@ static int write_index(struct ubifs_info *c)
1002 } 1004 }
1003 1005
1004#ifdef CONFIG_UBIFS_FS_DEBUG 1006#ifdef CONFIG_UBIFS_FS_DEBUG
1005 if (lnum != c->new_ihead_lnum || buf_offs != c->new_ihead_offs) { 1007 if (lnum != c->dbg->new_ihead_lnum ||
1008 buf_offs != c->dbg->new_ihead_offs) {
1006 ubifs_err("inconsistent ihead"); 1009 ubifs_err("inconsistent ihead");
1007 return -EINVAL; 1010 return -EINVAL;
1008 } 1011 }
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index 0b378042a3a2..b25fc36cf72f 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -51,6 +51,13 @@
51 */ 51 */
52#define UBIFS_MIN_COMPR_LEN 128 52#define UBIFS_MIN_COMPR_LEN 128
53 53
54/*
 55 * If the compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes
 56 * shorter than the uncompressed data length, UBIFS prefers to leave this data
 57 * node uncompressed, because it will be read faster.
58 */
59#define UBIFS_MIN_COMPRESS_DIFF 64
60
54/* Root inode number */ 61/* Root inode number */
55#define UBIFS_ROOT_INO 1 62#define UBIFS_ROOT_INO 1
56 63
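The new UBIFS_MIN_COMPRESS_DIFF constant encodes a read-speed trade-off: compressed output that saves fewer than 64 bytes is not worth the decompression cost on reads. The decision, restated as a tiny predicate:

#include <stdio.h>

#define MIN_COMPRESS_DIFF 64    /* mirrors UBIFS_MIN_COMPRESS_DIFF */

/* Returns 1 if the node is worth storing compressed. */
static int worth_compressing(int in_len, int out_len)
{
        return in_len - out_len >= MIN_COMPRESS_DIFF;
}

int main(void)
{
        printf("%d\n", worth_compressing(4096, 4050)); /* 0: only 46 saved */
        printf("%d\n", worth_compressing(4096, 2000)); /* 1 */
        return 0;
}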
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 46b172560a06..fc2a4cc66d03 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -63,6 +63,14 @@
63#define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL 63#define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL
64#define SQNUM_WATERMARK 0xFFFFFFFFFF000000ULL 64#define SQNUM_WATERMARK 0xFFFFFFFFFF000000ULL
65 65
66/*
67 * Minimum amount of LEBs reserved for the index. At present the index needs at
 68 * least 2 LEBs: one for the index head and one for the in-the-gaps method (which
69 * currently does not cater for the index head and so excludes it from
70 * consideration).
71 */
72#define MIN_INDEX_LEBS 2
73
66/* Minimum amount of data UBIFS writes to the flash */ 74/* Minimum amount of data UBIFS writes to the flash */
67#define MIN_WRITE_SZ (UBIFS_DATA_NODE_SZ + 8) 75#define MIN_WRITE_SZ (UBIFS_DATA_NODE_SZ + 8)
68 76
@@ -386,12 +394,12 @@ struct ubifs_inode {
386 unsigned int dirty:1; 394 unsigned int dirty:1;
387 unsigned int xattr:1; 395 unsigned int xattr:1;
388 unsigned int bulk_read:1; 396 unsigned int bulk_read:1;
397 unsigned int compr_type:2;
389 struct mutex ui_mutex; 398 struct mutex ui_mutex;
390 spinlock_t ui_lock; 399 spinlock_t ui_lock;
391 loff_t synced_i_size; 400 loff_t synced_i_size;
392 loff_t ui_size; 401 loff_t ui_size;
393 int flags; 402 int flags;
394 int compr_type;
395 pgoff_t last_page_read; 403 pgoff_t last_page_read;
396 pgoff_t read_in_a_row; 404 pgoff_t read_in_a_row;
397 int data_len; 405 int data_len;
@@ -419,7 +427,7 @@ struct ubifs_unclean_leb {
419 * 427 *
420 * LPROPS_UNCAT: not categorized 428 * LPROPS_UNCAT: not categorized
421 * LPROPS_DIRTY: dirty > 0, not index 429 * LPROPS_DIRTY: dirty > 0, not index
 422 * LPROPS_DIRTY_IDX: dirty + free > UBIFS_CH_SZ and index 430 * LPROPS_DIRTY_IDX: dirty + free > @c->min_idx_node_sz and index
423 * LPROPS_FREE: free > 0, not empty, not index 431 * LPROPS_FREE: free > 0, not empty, not index
424 * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs 432 * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs
425 * LPROPS_EMPTY: LEB is empty, not taken 433 * LPROPS_EMPTY: LEB is empty, not taken
@@ -473,8 +481,8 @@ struct ubifs_lprops {
473struct ubifs_lpt_lprops { 481struct ubifs_lpt_lprops {
474 int free; 482 int free;
475 int dirty; 483 int dirty;
476 unsigned tgc : 1; 484 unsigned tgc:1;
477 unsigned cmt : 1; 485 unsigned cmt:1;
478}; 486};
479 487
480/** 488/**
@@ -482,24 +490,26 @@ struct ubifs_lpt_lprops {
482 * @empty_lebs: number of empty LEBs 490 * @empty_lebs: number of empty LEBs
483 * @taken_empty_lebs: number of taken LEBs 491 * @taken_empty_lebs: number of taken LEBs
484 * @idx_lebs: number of indexing LEBs 492 * @idx_lebs: number of indexing LEBs
485 * @total_free: total free space in bytes 493 * @total_free: total free space in bytes (includes all LEBs)
486 * @total_dirty: total dirty space in bytes 494 * @total_dirty: total dirty space in bytes (includes all LEBs)
487 * @total_used: total used space in bytes (includes only data LEBs) 495 * @total_used: total used space in bytes (does not include index LEBs)
488 * @total_dead: total dead space in bytes (includes only data LEBs) 496 * @total_dead: total dead space in bytes (does not include index LEBs)
489 * @total_dark: total dark space in bytes (includes only data LEBs) 497 * @total_dark: total dark space in bytes (does not include index LEBs)
498 *
499 * The @taken_empty_lebs field counts the LEBs that are in the transient state
500 * of having been "taken" for use but not yet written to. @taken_empty_lebs is
501 * needed to account correctly for @gc_lnum, otherwise @empty_lebs could be
502 * used by itself (in which case 'unused_lebs' would be a better name). In the
503 * case of @gc_lnum, it is "taken" at mount time or whenever a LEB is retained
504 * by GC, but unlike other empty LEBs that are "taken", it may not be written
505 * straight away (i.e. before the next commit start or unmount), so either
506 * @gc_lnum must be specially accounted for, or the current approach followed
507 * i.e. count it under @taken_empty_lebs.
490 * 508 *
491 * N.B. total_dirty and total_used are different to other total_* fields, 509 * @empty_lebs includes @taken_empty_lebs.
492 * because they account _all_ LEBs, not just data LEBs.
493 * 510 *
494 * 'taken_empty_lebs' counts the LEBs that are in the transient state of having 511 * @total_used, @total_dead and @total_dark fields do not account indexing
495 * been 'taken' for use but not yet written to. 'taken_empty_lebs' is needed 512 * LEBs.
496 * to account correctly for gc_lnum, otherwise 'empty_lebs' could be used
497 * by itself (in which case 'unused_lebs' would be a better name). In the case
498 * of gc_lnum, it is 'taken' at mount time or whenever a LEB is retained by GC,
499 * but unlike other empty LEBs that are 'taken', it may not be written straight
500 * away (i.e. before the next commit start or unmount), so either gc_lnum must
501 * be specially accounted for, or the current approach followed i.e. count it
502 * under 'taken_empty_lebs'.
503 */ 513 */
504struct ubifs_lp_stats { 514struct ubifs_lp_stats {
505 int empty_lebs; 515 int empty_lebs;
@@ -893,15 +903,25 @@ struct ubifs_orphan {
893/** 903/**
894 * struct ubifs_mount_opts - UBIFS-specific mount options information. 904 * struct ubifs_mount_opts - UBIFS-specific mount options information.
895 * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast) 905 * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast)
 896 * @bulk_read: enable bulk-reads 906 * @bulk_read: enable/disable bulk-reads (%0 default, %1 disable, %2 enable)
897 * @chk_data_crc: check CRCs when reading data nodes 907 * @chk_data_crc: enable/disable CRC data checking when reading data nodes
 908 * (%0 default, %1 disable, %2 enable)
909 * @override_compr: override default compressor (%0 - do not override and use
910 * superblock compressor, %1 - override and use compressor
911 * specified in @compr_type)
912 * @compr_type: compressor type to override the superblock compressor with
913 * (%UBIFS_COMPR_NONE, etc)
898 */ 914 */
899struct ubifs_mount_opts { 915struct ubifs_mount_opts {
900 unsigned int unmount_mode:2; 916 unsigned int unmount_mode:2;
901 unsigned int bulk_read:2; 917 unsigned int bulk_read:2;
902 unsigned int chk_data_crc:2; 918 unsigned int chk_data_crc:2;
919 unsigned int override_compr:1;
920 unsigned int compr_type:2;
903}; 921};
904 922
923struct ubifs_debug_info;
924
905/** 925/**
906 * struct ubifs_info - UBIFS file-system description data structure 926 * struct ubifs_info - UBIFS file-system description data structure
907 * (per-superblock). 927 * (per-superblock).
@@ -946,6 +966,7 @@ struct ubifs_mount_opts {
946 * @no_chk_data_crc: do not check CRCs when reading data nodes (except during 966 * @no_chk_data_crc: do not check CRCs when reading data nodes (except during
947 * recovery) 967 * recovery)
948 * @bulk_read: enable bulk-reads 968 * @bulk_read: enable bulk-reads
969 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
949 * 970 *
950 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and 971 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and
951 * @calc_idx_sz 972 * @calc_idx_sz
@@ -963,8 +984,6 @@ struct ubifs_mount_opts {
 963 * @ileb_nxt: next pre-allocated index LEB 984 * @ileb_nxt: next pre-allocated index LEB
964 * @old_idx: tree of index nodes obsoleted since the last commit start 985 * @old_idx: tree of index nodes obsoleted since the last commit start
965 * @bottom_up_buf: a buffer which is used by 'dirty_cow_bottom_up()' in tnc.c 986 * @bottom_up_buf: a buffer which is used by 'dirty_cow_bottom_up()' in tnc.c
966 * @new_ihead_lnum: used by debugging to check ihead_lnum
967 * @new_ihead_offs: used by debugging to check ihead_offs
968 * 987 *
969 * @mst_node: master node 988 * @mst_node: master node
970 * @mst_offs: offset of valid master node 989 * @mst_offs: offset of valid master node
@@ -986,7 +1005,6 @@ struct ubifs_mount_opts {
986 * @main_lebs: count of LEBs in the main area 1005 * @main_lebs: count of LEBs in the main area
987 * @main_first: first LEB of the main area 1006 * @main_first: first LEB of the main area
988 * @main_bytes: main area size in bytes 1007 * @main_bytes: main area size in bytes
989 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
990 * 1008 *
991 * @key_hash_type: type of the key hash 1009 * @key_hash_type: type of the key hash
992 * @key_hash: direntry key hash function 1010 * @key_hash: direntry key hash function
@@ -1149,15 +1167,7 @@ struct ubifs_mount_opts {
1149 * @always_chk_crc: always check CRCs (while mounting and remounting rw) 1167 * @always_chk_crc: always check CRCs (while mounting and remounting rw)
1150 * @mount_opts: UBIFS-specific mount options 1168 * @mount_opts: UBIFS-specific mount options
1151 * 1169 *
1152 * @dbg_buf: a buffer of LEB size used for debugging purposes 1170 * @dbg: debugging-related information
1153 * @old_zroot: old index root - used by 'dbg_check_old_index()'
1154 * @old_zroot_level: old index root level - used by 'dbg_check_old_index()'
1155 * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()'
1156 * @failure_mode: failure mode for recovery testing
1157 * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls
1158 * @fail_timeout: time in jiffies when delay of failure mode expires
1159 * @fail_cnt: current number of calls to failure mode I/O functions
1160 * @fail_cnt_max: number of calls by which to delay failure mode
1161 */ 1171 */
1162struct ubifs_info { 1172struct ubifs_info {
1163 struct super_block *vfs_sb; 1173 struct super_block *vfs_sb;
@@ -1196,6 +1206,7 @@ struct ubifs_info {
1196 unsigned int big_lpt:1; 1206 unsigned int big_lpt:1;
1197 unsigned int no_chk_data_crc:1; 1207 unsigned int no_chk_data_crc:1;
1198 unsigned int bulk_read:1; 1208 unsigned int bulk_read:1;
1209 unsigned int default_compr:2;
1199 1210
1200 struct mutex tnc_mutex; 1211 struct mutex tnc_mutex;
1201 struct ubifs_zbranch zroot; 1212 struct ubifs_zbranch zroot;
@@ -1212,10 +1223,6 @@ struct ubifs_info {
1212 int ileb_nxt; 1223 int ileb_nxt;
1213 struct rb_root old_idx; 1224 struct rb_root old_idx;
1214 int *bottom_up_buf; 1225 int *bottom_up_buf;
1215#ifdef CONFIG_UBIFS_FS_DEBUG
1216 int new_ihead_lnum;
1217 int new_ihead_offs;
1218#endif
1219 1226
1220 struct ubifs_mst_node *mst_node; 1227 struct ubifs_mst_node *mst_node;
1221 int mst_offs; 1228 int mst_offs;
@@ -1237,7 +1244,6 @@ struct ubifs_info {
1237 int main_lebs; 1244 int main_lebs;
1238 int main_first; 1245 int main_first;
1239 long long main_bytes; 1246 long long main_bytes;
1240 int default_compr;
1241 1247
1242 uint8_t key_hash_type; 1248 uint8_t key_hash_type;
1243 uint32_t (*key_hash)(const char *str, int len); 1249 uint32_t (*key_hash)(const char *str, int len);
@@ -1315,8 +1321,8 @@ struct ubifs_info {
1315 void *sbuf; 1321 void *sbuf;
1316 struct list_head idx_gc; 1322 struct list_head idx_gc;
1317 int idx_gc_cnt; 1323 int idx_gc_cnt;
1318 volatile int gc_seq; 1324 int gc_seq;
1319 volatile int gced_lnum; 1325 int gced_lnum;
1320 1326
1321 struct list_head infos_list; 1327 struct list_head infos_list;
1322 struct mutex umount_mutex; 1328 struct mutex umount_mutex;
@@ -1391,21 +1397,7 @@ struct ubifs_info {
1391 struct ubifs_mount_opts mount_opts; 1397 struct ubifs_mount_opts mount_opts;
1392 1398
1393#ifdef CONFIG_UBIFS_FS_DEBUG 1399#ifdef CONFIG_UBIFS_FS_DEBUG
1394 void *dbg_buf; 1400 struct ubifs_debug_info *dbg;
1395 struct ubifs_zbranch old_zroot;
1396 int old_zroot_level;
1397 unsigned long long old_zroot_sqnum;
1398 int failure_mode;
1399 int fail_delay;
1400 unsigned long fail_timeout;
1401 unsigned int fail_cnt;
1402 unsigned int fail_cnt_max;
1403 long long chk_lpt_sz;
1404 long long chk_lpt_sz2;
1405 long long chk_lpt_wastage;
1406 int chk_lpt_lebs;
1407 int new_nhead_lnum;
1408 int new_nhead_offs;
1409#endif 1401#endif
1410}; 1402};
1411 1403
@@ -1505,7 +1497,7 @@ void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode,
1505long long ubifs_get_free_space(struct ubifs_info *c); 1497long long ubifs_get_free_space(struct ubifs_info *c);
1506int ubifs_calc_min_idx_lebs(struct ubifs_info *c); 1498int ubifs_calc_min_idx_lebs(struct ubifs_info *c);
1507void ubifs_convert_page_budget(struct ubifs_info *c); 1499void ubifs_convert_page_budget(struct ubifs_info *c);
1508long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free); 1500long long ubifs_reported_space(const struct ubifs_info *c, long long free);
1509long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); 1501long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
1510 1502
1511/* find.c */ 1503/* find.c */
@@ -1639,6 +1631,9 @@ void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty);
1639void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode); 1631void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode);
1640uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits); 1632uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits);
1641struct ubifs_nnode *ubifs_first_nnode(struct ubifs_info *c, int *hght); 1633struct ubifs_nnode *ubifs_first_nnode(struct ubifs_info *c, int *hght);
1634/* Needed only in debugging code in lpt_commit.c */
1635int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf,
1636 struct ubifs_nnode *nnode);
1642 1637
1643/* lpt_commit.c */ 1638/* lpt_commit.c */
1644int ubifs_lpt_start_commit(struct ubifs_info *c); 1639int ubifs_lpt_start_commit(struct ubifs_info *c);
@@ -1714,7 +1709,7 @@ long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
1714 1709
1715/* compressor.c */ 1710/* compressor.c */
1716int __init ubifs_compressors_init(void); 1711int __init ubifs_compressors_init(void);
1717void __exit ubifs_compressors_exit(void); 1712void ubifs_compressors_exit(void);
1718void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len, 1713void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
1719 int *compr_type); 1714 int *compr_type);
1720int ubifs_decompress(const void *buf, int len, void *out, int *out_len, 1715int ubifs_decompress(const void *buf, int len, void *out, int *out_len,
diff --git a/fs/xattr.c b/fs/xattr.c
index 468377e66531..237804cd6b56 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -175,7 +175,7 @@ vfs_listxattr(struct dentry *d, char *list, size_t size)
175 if (error) 175 if (error)
176 return error; 176 return error;
177 error = -EOPNOTSUPP; 177 error = -EOPNOTSUPP;
178 if (d->d_inode->i_op && d->d_inode->i_op->listxattr) { 178 if (d->d_inode->i_op->listxattr) {
179 error = d->d_inode->i_op->listxattr(d, list, size); 179 error = d->d_inode->i_op->listxattr(d, list, size);
180 } else { 180 } else {
181 error = security_inode_listsecurity(d->d_inode, list, size); 181 error = security_inode_listsecurity(d->d_inode, list, size);
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 737c9a425361..c3dc491fff89 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -85,13 +85,13 @@ xfs-y += xfs_alloc.o \
85 xfs_trans_inode.o \ 85 xfs_trans_inode.o \
86 xfs_trans_item.o \ 86 xfs_trans_item.o \
87 xfs_utils.o \ 87 xfs_utils.o \
88 xfs_vfsops.o \
89 xfs_vnodeops.o \ 88 xfs_vnodeops.o \
90 xfs_rw.o \ 89 xfs_rw.o \
91 xfs_dmops.o \ 90 xfs_dmops.o \
92 xfs_qmops.o 91 xfs_qmops.o
93 92
94xfs-$(CONFIG_XFS_TRACE) += xfs_dir2_trace.o 93xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o \
94 xfs_dir2_trace.o
95 95
96# Objects in linux/ 96# Objects in linux/
97xfs-y += $(addprefix $(XFS_LINUX)/, \ 97xfs-y += $(addprefix $(XFS_LINUX)/, \
@@ -106,7 +106,7 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
106 xfs_iops.o \ 106 xfs_iops.o \
107 xfs_lrw.o \ 107 xfs_lrw.o \
108 xfs_super.o \ 108 xfs_super.o \
109 xfs_vnode.o \ 109 xfs_sync.o \
110 xfs_xattr.o) 110 xfs_xattr.o)
111 111
112# Objects in support/ 112# Objects in support/
diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h
index 351a8f454bd1..4dfc7c370819 100644
--- a/fs/xfs/linux-2.6/sv.h
+++ b/fs/xfs/linux-2.6/sv.h
@@ -32,23 +32,15 @@ typedef struct sv_s {
32 wait_queue_head_t waiters; 32 wait_queue_head_t waiters;
33} sv_t; 33} sv_t;
34 34
35#define SV_FIFO 0x0 /* sv_t is FIFO type */ 35static inline void _sv_wait(sv_t *sv, spinlock_t *lock)
36#define SV_LIFO 0x2 /* sv_t is LIFO type */
37#define SV_PRIO 0x4 /* sv_t is PRIO type */
38#define SV_KEYED 0x6 /* sv_t is KEYED type */
39#define SV_DEFAULT SV_FIFO
40
41
42static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state,
43 unsigned long timeout)
44{ 36{
45 DECLARE_WAITQUEUE(wait, current); 37 DECLARE_WAITQUEUE(wait, current);
46 38
47 add_wait_queue_exclusive(&sv->waiters, &wait); 39 add_wait_queue_exclusive(&sv->waiters, &wait);
48 __set_current_state(state); 40 __set_current_state(TASK_UNINTERRUPTIBLE);
49 spin_unlock(lock); 41 spin_unlock(lock);
50 42
51 schedule_timeout(timeout); 43 schedule();
52 44
53 remove_wait_queue(&sv->waiters, &wait); 45 remove_wait_queue(&sv->waiters, &wait);
54} 46}
@@ -58,13 +50,7 @@ static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state,
58#define sv_destroy(sv) \ 50#define sv_destroy(sv) \
59 /*NOTHING*/ 51 /*NOTHING*/
60#define sv_wait(sv, pri, lock, s) \ 52#define sv_wait(sv, pri, lock, s) \
61 _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT) 53 _sv_wait(sv, lock)
62#define sv_wait_sig(sv, pri, lock, s) \
63 _sv_wait(sv, lock, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT)
64#define sv_timedwait(sv, pri, lock, s, svf, ts, rts) \
65 _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, timespec_to_jiffies(ts))
66#define sv_timedwait_sig(sv, pri, lock, s, svf, ts, rts) \
67 _sv_wait(sv, lock, TASK_INTERRUPTIBLE, timespec_to_jiffies(ts))
68#define sv_signal(sv) \ 54#define sv_signal(sv) \
69 wake_up(&(sv)->waiters) 55 wake_up(&(sv)->waiters)
70#define sv_broadcast(sv) \ 56#define sv_broadcast(sv) \
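Note that the simplified _sv_wait() releases the lock and returns without re-taking it, so callers follow the usual condition-variable pattern; a hedged usage sketch (the lock, sv_t and condition callback are stand-ins, not from this patch):

	static void example_wait_for_cond(sv_t *sv, spinlock_t *lock,
					  int (*cond)(void))
	{
		spin_lock(lock);
		while (!cond()) {
			sv_wait(sv, 0, lock, 0);  /* drops the lock and sleeps */
			spin_lock(lock);          /* re-take before re-testing */
		}
		spin_unlock(lock);
	}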
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index a44d68eb50b5..de3a198f771e 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -42,6 +42,40 @@
42#include <linux/pagevec.h> 42#include <linux/pagevec.h>
43#include <linux/writeback.h> 43#include <linux/writeback.h>
44 44
45
46/*
47 * Prime number of hash buckets since address is used as the key.
48 */
49#define NVSYNC 37
50#define to_ioend_wq(v) (&xfs_ioend_wq[((unsigned long)v) % NVSYNC])
51static wait_queue_head_t xfs_ioend_wq[NVSYNC];
52
53void __init
54xfs_ioend_init(void)
55{
56 int i;
57
58 for (i = 0; i < NVSYNC; i++)
59 init_waitqueue_head(&xfs_ioend_wq[i]);
60}
61
62void
63xfs_ioend_wait(
64 xfs_inode_t *ip)
65{
66 wait_queue_head_t *wq = to_ioend_wq(ip);
67
68 wait_event(*wq, (atomic_read(&ip->i_iocount) == 0));
69}
70
71STATIC void
72xfs_ioend_wake(
73 xfs_inode_t *ip)
74{
75 if (atomic_dec_and_test(&ip->i_iocount))
76 wake_up(to_ioend_wq(ip));
77}
78
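The design point, as inferred from the code above: hashing the inode address into a small table avoids embedding a wait queue in every inode, and a prime bucket count spreads slab-aligned pointers, whose low bits are largely zero, more evenly across buckets. The intended pairing, as a minimal sketch with a hypothetical caller:

	/* Hypothetical synchronization point: quiesce all in-flight ioends. */
	static void example_quiesce(xfs_inode_t *ip)
	{
		/* completing ioends call xfs_ioend_wake(), which drops
		 * i_iocount; this blocks until the count reaches zero */
		xfs_ioend_wait(ip);
	}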
45STATIC void 79STATIC void
46xfs_count_page_state( 80xfs_count_page_state(
47 struct page *page, 81 struct page *page,
@@ -146,16 +180,25 @@ xfs_destroy_ioend(
146 xfs_ioend_t *ioend) 180 xfs_ioend_t *ioend)
147{ 181{
148 struct buffer_head *bh, *next; 182 struct buffer_head *bh, *next;
183 struct xfs_inode *ip = XFS_I(ioend->io_inode);
149 184
150 for (bh = ioend->io_buffer_head; bh; bh = next) { 185 for (bh = ioend->io_buffer_head; bh; bh = next) {
151 next = bh->b_private; 186 next = bh->b_private;
152 bh->b_end_io(bh, !ioend->io_error); 187 bh->b_end_io(bh, !ioend->io_error);
153 } 188 }
154 if (unlikely(ioend->io_error)) { 189
155 vn_ioerror(XFS_I(ioend->io_inode), ioend->io_error, 190 /*
156 __FILE__,__LINE__); 191 * Volume managers supporting multiple paths can send back ENODEV
192 * when the final path disappears. In this case continuing to fill
193 * the page cache with dirty data which cannot be written out is
194 * evil, so prevent that.
195 */
196 if (unlikely(ioend->io_error == -ENODEV)) {
197 xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ,
198 __FILE__, __LINE__);
157 } 199 }
158 vn_iowake(XFS_I(ioend->io_inode)); 200
201 xfs_ioend_wake(ip);
159 mempool_free(ioend, xfs_ioend_pool); 202 mempool_free(ioend, xfs_ioend_pool);
160} 203}
161 204
@@ -191,7 +234,7 @@ xfs_setfilesize(
191 ip->i_d.di_size = isize; 234 ip->i_d.di_size = isize;
192 ip->i_update_core = 1; 235 ip->i_update_core = 1;
193 ip->i_update_size = 1; 236 ip->i_update_size = 1;
194 mark_inode_dirty_sync(ioend->io_inode); 237 xfs_mark_inode_dirty_sync(ip);
195 } 238 }
196 239
197 xfs_iunlock(ip, XFS_ILOCK_EXCL); 240 xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -317,14 +360,9 @@ xfs_map_blocks(
317 xfs_iomap_t *mapp, 360 xfs_iomap_t *mapp,
318 int flags) 361 int flags)
319{ 362{
320 xfs_inode_t *ip = XFS_I(inode); 363 int nmaps = 1;
321 int error, nmaps = 1; 364
322 365 return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps);
323 error = xfs_iomap(ip, offset, count,
324 flags, mapp, &nmaps);
325 if (!error && (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)))
326 xfs_iflags_set(ip, XFS_IMODIFIED);
327 return -error;
328} 366}
329 367
330STATIC_INLINE int 368STATIC_INLINE int
@@ -512,7 +550,7 @@ xfs_cancel_ioend(
512 unlock_buffer(bh); 550 unlock_buffer(bh);
513 } while ((bh = next_bh) != NULL); 551 } while ((bh = next_bh) != NULL);
514 552
515 vn_iowake(XFS_I(ioend->io_inode)); 553 xfs_ioend_wake(XFS_I(ioend->io_inode));
516 mempool_free(ioend, xfs_ioend_pool); 554 mempool_free(ioend, xfs_ioend_pool);
517 } while ((ioend = next) != NULL); 555 } while ((ioend = next) != NULL);
518} 556}
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 3ba0631a3818..7b26f5ff9692 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -43,4 +43,7 @@ typedef struct xfs_ioend {
43extern const struct address_space_operations xfs_address_space_operations; 43extern const struct address_space_operations xfs_address_space_operations;
44extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); 44extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
45 45
46extern void xfs_ioend_init(void);
47extern void xfs_ioend_wait(struct xfs_inode *);
48
46#endif /* __XFS_AOPS_H__ */ 49#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 36d5fcd3f593..cb329edc925b 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -630,6 +630,29 @@ xfs_buf_get_flags(
630 return NULL; 630 return NULL;
631} 631}
632 632
633STATIC int
634_xfs_buf_read(
635 xfs_buf_t *bp,
636 xfs_buf_flags_t flags)
637{
638 int status;
639
640 XB_TRACE(bp, "_xfs_buf_read", (unsigned long)flags);
641
642 ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE)));
643 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
644
645 bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \
646 XBF_READ_AHEAD | _XBF_RUN_QUEUES);
647 bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | \
648 XBF_READ_AHEAD | _XBF_RUN_QUEUES);
649
650 status = xfs_buf_iorequest(bp);
651 if (!status && !(flags & XBF_ASYNC))
652 status = xfs_buf_iowait(bp);
653 return status;
654}
655
633xfs_buf_t * 656xfs_buf_t *
634xfs_buf_read_flags( 657xfs_buf_read_flags(
635 xfs_buftarg_t *target, 658 xfs_buftarg_t *target,
@@ -646,7 +669,7 @@ xfs_buf_read_flags(
646 if (!XFS_BUF_ISDONE(bp)) { 669 if (!XFS_BUF_ISDONE(bp)) {
647 XB_TRACE(bp, "read", (unsigned long)flags); 670 XB_TRACE(bp, "read", (unsigned long)flags);
648 XFS_STATS_INC(xb_get_read); 671 XFS_STATS_INC(xb_get_read);
649 xfs_buf_iostart(bp, flags); 672 _xfs_buf_read(bp, flags);
650 } else if (flags & XBF_ASYNC) { 673 } else if (flags & XBF_ASYNC) {
651 XB_TRACE(bp, "read_async", (unsigned long)flags); 674 XB_TRACE(bp, "read_async", (unsigned long)flags);
652 /* 675 /*
@@ -1048,50 +1071,39 @@ xfs_buf_ioerror(
1048 XB_TRACE(bp, "ioerror", (unsigned long)error); 1071 XB_TRACE(bp, "ioerror", (unsigned long)error);
1049} 1072}
1050 1073
1051/*
1052 * Initiate I/O on a buffer, based on the flags supplied.
1053 * The b_iodone routine in the buffer supplied will only be called
1054 * when all of the subsidiary I/O requests, if any, have been completed.
1055 */
1056int 1074int
1057xfs_buf_iostart( 1075xfs_bawrite(
1058 xfs_buf_t *bp, 1076 void *mp,
1059 xfs_buf_flags_t flags) 1077 struct xfs_buf *bp)
1060{ 1078{
1061 int status = 0; 1079 XB_TRACE(bp, "bawrite", 0);
1062 1080
1063 XB_TRACE(bp, "iostart", (unsigned long)flags); 1081 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
1064 1082
1065 if (flags & XBF_DELWRI) { 1083 xfs_buf_delwri_dequeue(bp);
1066 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC);
1067 bp->b_flags |= flags & (XBF_DELWRI | XBF_ASYNC);
1068 xfs_buf_delwri_queue(bp, 1);
1069 return 0;
1070 }
1071 1084
1072 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \ 1085 bp->b_flags &= ~(XBF_READ | XBF_DELWRI | XBF_READ_AHEAD);
1073 XBF_READ_AHEAD | _XBF_RUN_QUEUES); 1086 bp->b_flags |= (XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES);
1074 bp->b_flags |= flags & (XBF_READ | XBF_WRITE | XBF_ASYNC | \ 1087
1075 XBF_READ_AHEAD | _XBF_RUN_QUEUES); 1088 bp->b_mount = mp;
1089 bp->b_strat = xfs_bdstrat_cb;
1090 return xfs_bdstrat_cb(bp);
1091}
1076 1092
1077 BUG_ON(bp->b_bn == XFS_BUF_DADDR_NULL); 1093void
1094xfs_bdwrite(
1095 void *mp,
1096 struct xfs_buf *bp)
1097{
1098 XB_TRACE(bp, "bdwrite", 0);
1078 1099
1079 /* For writes allow an alternate strategy routine to precede 1100 bp->b_strat = xfs_bdstrat_cb;
1080 * the actual I/O request (which may not be issued at all in 1101 bp->b_mount = mp;
1081 * a shutdown situation, for example).
1082 */
1083 status = (flags & XBF_WRITE) ?
1084 xfs_buf_iostrategy(bp) : xfs_buf_iorequest(bp);
1085 1102
1086 /* Wait for I/O if we are not an async request. 1103 bp->b_flags &= ~XBF_READ;
1087 * Note: async I/O request completion will release the buffer, 1104 bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
1088 * and that can already be done by this point. So using the
1089 * buffer pointer from here on, after async I/O, is invalid.
1090 */
1091 if (!status && !(flags & XBF_ASYNC))
1092 status = xfs_buf_iowait(bp);
1093 1105
1094 return status; 1106 xfs_buf_delwri_queue(bp, 1);
1095} 1107}
1096 1108
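With the flag-driven dispatch of xfs_buf_iostart() gone, each write path has a purpose-named helper that sets exactly the flags it needs. Hedged illustrations of the two new entry points (alternatives, not a sequence; the wrappers are hypothetical):

	static int example_write_now(struct xfs_mount *mp, xfs_buf_t *bp)
	{
		return xfs_bawrite(mp, bp);	/* async write, issued immediately */
	}

	static void example_write_later(struct xfs_mount *mp, xfs_buf_t *bp)
	{
		xfs_bdwrite(mp, bp);		/* queued as delwri, flushed later */
	}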
1097STATIC_INLINE void 1109STATIC_INLINE void
@@ -1114,8 +1126,7 @@ xfs_buf_bio_end_io(
1114 unsigned int blocksize = bp->b_target->bt_bsize; 1126 unsigned int blocksize = bp->b_target->bt_bsize;
1115 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1127 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1116 1128
1117 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 1129 xfs_buf_ioerror(bp, -error);
1118 bp->b_error = EIO;
1119 1130
1120 do { 1131 do {
1121 struct page *page = bvec->bv_page; 1132 struct page *page = bvec->bv_page;
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 456519a088c7..288ae7c4c800 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -168,7 +168,7 @@ typedef struct xfs_buf {
168 struct completion b_iowait; /* queue for I/O waiters */ 168 struct completion b_iowait; /* queue for I/O waiters */
169 void *b_fspriv; 169 void *b_fspriv;
170 void *b_fspriv2; 170 void *b_fspriv2;
171 void *b_fspriv3; 171 struct xfs_mount *b_mount;
172 unsigned short b_error; /* error code on I/O */ 172 unsigned short b_error; /* error code on I/O */
173 unsigned int b_page_count; /* size of page array */ 173 unsigned int b_page_count; /* size of page array */
174 unsigned int b_offset; /* page offset in first page */ 174 unsigned int b_offset; /* page offset in first page */
@@ -214,9 +214,10 @@ extern void xfs_buf_lock(xfs_buf_t *);
214extern void xfs_buf_unlock(xfs_buf_t *); 214extern void xfs_buf_unlock(xfs_buf_t *);
215 215
216/* Buffer Read and Write Routines */ 216/* Buffer Read and Write Routines */
217extern int xfs_bawrite(void *mp, xfs_buf_t *bp);
218extern void xfs_bdwrite(void *mp, xfs_buf_t *bp);
217extern void xfs_buf_ioend(xfs_buf_t *, int); 219extern void xfs_buf_ioend(xfs_buf_t *, int);
218extern void xfs_buf_ioerror(xfs_buf_t *, int); 220extern void xfs_buf_ioerror(xfs_buf_t *, int);
219extern int xfs_buf_iostart(xfs_buf_t *, xfs_buf_flags_t);
220extern int xfs_buf_iorequest(xfs_buf_t *); 221extern int xfs_buf_iorequest(xfs_buf_t *);
221extern int xfs_buf_iowait(xfs_buf_t *); 222extern int xfs_buf_iowait(xfs_buf_t *);
222extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t, 223extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t,
@@ -311,10 +312,6 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
311#define XFS_BUF_UNORDERED(bp) ((bp)->b_flags &= ~XBF_ORDERED) 312#define XFS_BUF_UNORDERED(bp) ((bp)->b_flags &= ~XBF_ORDERED)
312#define XFS_BUF_ISORDERED(bp) ((bp)->b_flags & XBF_ORDERED) 313#define XFS_BUF_ISORDERED(bp) ((bp)->b_flags & XBF_ORDERED)
313 314
314#define XFS_BUF_SHUT(bp) do { } while (0)
315#define XFS_BUF_UNSHUT(bp) do { } while (0)
316#define XFS_BUF_ISSHUT(bp) (0)
317
318#define XFS_BUF_HOLD(bp) xfs_buf_hold(bp) 315#define XFS_BUF_HOLD(bp) xfs_buf_hold(bp)
319#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ) 316#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ)
320#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ) 317#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ)
@@ -334,8 +331,6 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
334#define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val)) 331#define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val))
335#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) 332#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2)
336#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) 333#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val))
337#define XFS_BUF_FSPRIVATE3(bp, type) ((type)(bp)->b_fspriv3)
338#define XFS_BUF_SET_FSPRIVATE3(bp, val) ((bp)->b_fspriv3 = (void*)(val))
339#define XFS_BUF_SET_START(bp) do { } while (0) 334#define XFS_BUF_SET_START(bp) do { } while (0)
340#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func)) 335#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func))
341 336
@@ -366,14 +361,6 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
366#define XFS_BUF_TARGET(bp) ((bp)->b_target) 361#define XFS_BUF_TARGET(bp) ((bp)->b_target)
367#define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target) 362#define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target)
368 363
369static inline int xfs_bawrite(void *mp, xfs_buf_t *bp)
370{
371 bp->b_fspriv3 = mp;
372 bp->b_strat = xfs_bdstrat_cb;
373 xfs_buf_delwri_dequeue(bp);
374 return xfs_buf_iostart(bp, XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES);
375}
376
377static inline void xfs_buf_relse(xfs_buf_t *bp) 364static inline void xfs_buf_relse(xfs_buf_t *bp)
378{ 365{
379 if (!bp->b_relse) 366 if (!bp->b_relse)
@@ -414,17 +401,6 @@ static inline int XFS_bwrite(xfs_buf_t *bp)
414 return error; 401 return error;
415} 402}
416 403
417/*
418 * No error can be returned from xfs_buf_iostart for delwri
419 * buffers as they are queued and no I/O is issued.
420 */
421static inline void xfs_bdwrite(void *mp, xfs_buf_t *bp)
422{
423 bp->b_strat = xfs_bdstrat_cb;
424 bp->b_fspriv3 = mp;
425 (void)xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC);
426}
427
428#define XFS_bdstrat(bp) xfs_buf_iorequest(bp) 404#define XFS_bdstrat(bp) xfs_buf_iorequest(bp)
429 405
430#define xfs_iowait(bp) xfs_buf_iowait(bp) 406#define xfs_iowait(bp) xfs_buf_iowait(bp)
diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h
index 8c022cd0ad67..55bddf3b6091 100644
--- a/fs/xfs/linux-2.6/xfs_cred.h
+++ b/fs/xfs/linux-2.6/xfs_cred.h
@@ -25,12 +25,4 @@
25 */ 25 */
26typedef const struct cred cred_t; 26typedef const struct cred cred_t;
27 27
28extern cred_t *sys_cred;
29
30/* this is a hack.. (assumes sys_cred is the only cred_t in the system) */
31static inline int capable_cred(cred_t *cr, int cid)
32{
33 return (cr == sys_cred) ? 1 : capable(cid);
34}
35
36#endif /* __XFS_CRED_H__ */ 28#endif /* __XFS_CRED_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 7f7abec25e14..595751f78350 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -29,7 +29,6 @@
29#include "xfs_vnodeops.h" 29#include "xfs_vnodeops.h"
30#include "xfs_bmap_btree.h" 30#include "xfs_bmap_btree.h"
31#include "xfs_inode.h" 31#include "xfs_inode.h"
32#include "xfs_vfsops.h"
33 32
34/* 33/*
35 * Note that we only accept fileids which are long enough rather than allow 34 * Note that we only accept fileids which are long enough rather than allow
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 3fee790f138b..e14c4e3aea0c 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -36,89 +36,54 @@
36#include "xfs_inode.h" 36#include "xfs_inode.h"
37#include "xfs_error.h" 37#include "xfs_error.h"
38#include "xfs_rw.h" 38#include "xfs_rw.h"
39#include "xfs_ioctl32.h"
40#include "xfs_vnodeops.h" 39#include "xfs_vnodeops.h"
40#include "xfs_da_btree.h"
41#include "xfs_ioctl.h"
41 42
42#include <linux/dcache.h> 43#include <linux/dcache.h>
43#include <linux/smp_lock.h> 44#include <linux/smp_lock.h>
44 45
45static struct vm_operations_struct xfs_file_vm_ops; 46static struct vm_operations_struct xfs_file_vm_ops;
46 47
47STATIC_INLINE ssize_t 48STATIC ssize_t
48__xfs_file_read( 49xfs_file_aio_read(
49 struct kiocb *iocb, 50 struct kiocb *iocb,
50 const struct iovec *iov, 51 const struct iovec *iov,
51 unsigned long nr_segs, 52 unsigned long nr_segs,
52 int ioflags,
53 loff_t pos) 53 loff_t pos)
54{ 54{
55 struct file *file = iocb->ki_filp; 55 struct file *file = iocb->ki_filp;
56 int ioflags = IO_ISAIO;
56 57
57 BUG_ON(iocb->ki_pos != pos); 58 BUG_ON(iocb->ki_pos != pos);
58 if (unlikely(file->f_flags & O_DIRECT)) 59 if (unlikely(file->f_flags & O_DIRECT))
59 ioflags |= IO_ISDIRECT; 60 ioflags |= IO_ISDIRECT;
61 if (file->f_mode & FMODE_NOCMTIME)
62 ioflags |= IO_INVIS;
60 return xfs_read(XFS_I(file->f_path.dentry->d_inode), iocb, iov, 63 return xfs_read(XFS_I(file->f_path.dentry->d_inode), iocb, iov,
61 nr_segs, &iocb->ki_pos, ioflags); 64 nr_segs, &iocb->ki_pos, ioflags);
62} 65}
63 66
64STATIC ssize_t 67STATIC ssize_t
65xfs_file_aio_read( 68xfs_file_aio_write(
66 struct kiocb *iocb,
67 const struct iovec *iov,
68 unsigned long nr_segs,
69 loff_t pos)
70{
71 return __xfs_file_read(iocb, iov, nr_segs, IO_ISAIO, pos);
72}
73
74STATIC ssize_t
75xfs_file_aio_read_invis(
76 struct kiocb *iocb,
77 const struct iovec *iov,
78 unsigned long nr_segs,
79 loff_t pos)
80{
81 return __xfs_file_read(iocb, iov, nr_segs, IO_ISAIO|IO_INVIS, pos);
82}
83
84STATIC_INLINE ssize_t
85__xfs_file_write(
86 struct kiocb *iocb, 69 struct kiocb *iocb,
87 const struct iovec *iov, 70 const struct iovec *iov,
88 unsigned long nr_segs, 71 unsigned long nr_segs,
89 int ioflags,
90 loff_t pos) 72 loff_t pos)
91{ 73{
92 struct file *file = iocb->ki_filp; 74 struct file *file = iocb->ki_filp;
75 int ioflags = IO_ISAIO;
93 76
94 BUG_ON(iocb->ki_pos != pos); 77 BUG_ON(iocb->ki_pos != pos);
95 if (unlikely(file->f_flags & O_DIRECT)) 78 if (unlikely(file->f_flags & O_DIRECT))
96 ioflags |= IO_ISDIRECT; 79 ioflags |= IO_ISDIRECT;
80 if (file->f_mode & FMODE_NOCMTIME)
81 ioflags |= IO_INVIS;
97 return xfs_write(XFS_I(file->f_mapping->host), iocb, iov, nr_segs, 82 return xfs_write(XFS_I(file->f_mapping->host), iocb, iov, nr_segs,
98 &iocb->ki_pos, ioflags); 83 &iocb->ki_pos, ioflags);
99} 84}
100 85
101STATIC ssize_t 86STATIC ssize_t
102xfs_file_aio_write(
103 struct kiocb *iocb,
104 const struct iovec *iov,
105 unsigned long nr_segs,
106 loff_t pos)
107{
108 return __xfs_file_write(iocb, iov, nr_segs, IO_ISAIO, pos);
109}
110
111STATIC ssize_t
112xfs_file_aio_write_invis(
113 struct kiocb *iocb,
114 const struct iovec *iov,
115 unsigned long nr_segs,
116 loff_t pos)
117{
118 return __xfs_file_write(iocb, iov, nr_segs, IO_ISAIO|IO_INVIS, pos);
119}
120
121STATIC ssize_t
122xfs_file_splice_read( 87xfs_file_splice_read(
123 struct file *infilp, 88 struct file *infilp,
124 loff_t *ppos, 89 loff_t *ppos,
@@ -126,20 +91,13 @@ xfs_file_splice_read(
126 size_t len, 91 size_t len,
127 unsigned int flags) 92 unsigned int flags)
128{ 93{
129 return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode), 94 int ioflags = 0;
130 infilp, ppos, pipe, len, flags, 0); 95
131} 96 if (infilp->f_mode & FMODE_NOCMTIME)
97 ioflags |= IO_INVIS;
132 98
133STATIC ssize_t
134xfs_file_splice_read_invis(
135 struct file *infilp,
136 loff_t *ppos,
137 struct pipe_inode_info *pipe,
138 size_t len,
139 unsigned int flags)
140{
141 return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode), 99 return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode),
142 infilp, ppos, pipe, len, flags, IO_INVIS); 100 infilp, ppos, pipe, len, flags, ioflags);
143} 101}
144 102
145STATIC ssize_t 103STATIC ssize_t
@@ -150,30 +108,49 @@ xfs_file_splice_write(
150 size_t len, 108 size_t len,
151 unsigned int flags) 109 unsigned int flags)
152{ 110{
153 return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode), 111 int ioflags = 0;
154 pipe, outfilp, ppos, len, flags, 0); 112
155} 113 if (outfilp->f_mode & FMODE_NOCMTIME)
114 ioflags |= IO_INVIS;
156 115
157STATIC ssize_t
158xfs_file_splice_write_invis(
159 struct pipe_inode_info *pipe,
160 struct file *outfilp,
161 loff_t *ppos,
162 size_t len,
163 unsigned int flags)
164{
165 return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode), 116 return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode),
166 pipe, outfilp, ppos, len, flags, IO_INVIS); 117 pipe, outfilp, ppos, len, flags, ioflags);
167} 118}
168 119
169STATIC int 120STATIC int
170xfs_file_open( 121xfs_file_open(
171 struct inode *inode, 122 struct inode *inode,
172 struct file *filp) 123 struct file *file)
173{ 124{
174 if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) 125 if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
175 return -EFBIG; 126 return -EFBIG;
176 return -xfs_open(XFS_I(inode)); 127 if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
128 return -EIO;
129 return 0;
130}
131
132STATIC int
133xfs_dir_open(
134 struct inode *inode,
135 struct file *file)
136{
137 struct xfs_inode *ip = XFS_I(inode);
138 int mode;
139 int error;
140
141 error = xfs_file_open(inode, file);
142 if (error)
143 return error;
144
145 /*
146 * If there are any blocks, read-ahead block 0 as we're almost
147 * certain to have the next operation be a read there.
148 */
149 mode = xfs_ilock_map_shared(ip);
150 if (ip->i_d.di_nextents > 0)
151 xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
152 xfs_iunlock(ip, mode);
153 return 0;
177} 154}
178 155
179STATIC int 156STATIC int
@@ -227,7 +204,7 @@ xfs_file_readdir(
227 * point we can change the ->readdir prototype to include the 204 * point we can change the ->readdir prototype to include the
228 * buffer size. 205 * buffer size.
229 */ 206 */
230 bufsize = (size_t)min_t(loff_t, PAGE_SIZE, inode->i_size); 207 bufsize = (size_t)min_t(loff_t, PAGE_SIZE, ip->i_d.di_size);
231 208
232 error = xfs_readdir(ip, dirent, bufsize, 209 error = xfs_readdir(ip, dirent, bufsize,
233 (xfs_off_t *)&filp->f_pos, filldir); 210 (xfs_off_t *)&filp->f_pos, filldir);
@@ -248,48 +225,6 @@ xfs_file_mmap(
248 return 0; 225 return 0;
249} 226}
250 227
251STATIC long
252xfs_file_ioctl(
253 struct file *filp,
254 unsigned int cmd,
255 unsigned long p)
256{
257 int error;
258 struct inode *inode = filp->f_path.dentry->d_inode;
259
260 error = xfs_ioctl(XFS_I(inode), filp, 0, cmd, (void __user *)p);
261 xfs_iflags_set(XFS_I(inode), XFS_IMODIFIED);
262
263 /* NOTE: some of the ioctl's return positive #'s as a
264 * byte count indicating success, such as
265 * readlink_by_handle. So we don't "sign flip"
266 * like most other routines. This means true
267 * errors need to be returned as a negative value.
268 */
269 return error;
270}
271
272STATIC long
273xfs_file_ioctl_invis(
274 struct file *filp,
275 unsigned int cmd,
276 unsigned long p)
277{
278 int error;
279 struct inode *inode = filp->f_path.dentry->d_inode;
280
281 error = xfs_ioctl(XFS_I(inode), filp, IO_INVIS, cmd, (void __user *)p);
282 xfs_iflags_set(XFS_I(inode), XFS_IMODIFIED);
283
284 /* NOTE: some of the ioctl's return positive #'s as a
285 * byte count indicating success, such as
286 * readlink_by_handle. So we don't "sign flip"
287 * like most other routines. This means true
288 * errors need to be returned as a negative value.
289 */
290 return error;
291}
292
293/* 228/*
294 * mmap()d file has taken write protection fault and is being made 229 * mmap()d file has taken write protection fault and is being made
295 * writable. We can set the page state up correctly for a writable 230 * writable. We can set the page state up correctly for a writable
@@ -325,26 +260,8 @@ const struct file_operations xfs_file_operations = {
325#endif 260#endif
326}; 261};
327 262
328const struct file_operations xfs_invis_file_operations = {
329 .llseek = generic_file_llseek,
330 .read = do_sync_read,
331 .write = do_sync_write,
332 .aio_read = xfs_file_aio_read_invis,
333 .aio_write = xfs_file_aio_write_invis,
334 .splice_read = xfs_file_splice_read_invis,
335 .splice_write = xfs_file_splice_write_invis,
336 .unlocked_ioctl = xfs_file_ioctl_invis,
337#ifdef CONFIG_COMPAT
338 .compat_ioctl = xfs_file_compat_invis_ioctl,
339#endif
340 .mmap = xfs_file_mmap,
341 .open = xfs_file_open,
342 .release = xfs_file_release,
343 .fsync = xfs_file_fsync,
344};
345
346
347const struct file_operations xfs_dir_file_operations = { 263const struct file_operations xfs_dir_file_operations = {
264 .open = xfs_dir_open,
348 .read = generic_read_dir, 265 .read = generic_read_dir,
349 .readdir = xfs_file_readdir, 266 .readdir = xfs_file_readdir,
350 .llseek = generic_file_llseek, 267 .llseek = generic_file_llseek,
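The removed *_invis file_operations table is replaced by one per-file mode bit, set at open-by-handle time and tested at each entry point; the recurring check, as a sketch (the wrapper is hypothetical):

	static int example_ioflags(struct file *file)
	{
		int ioflags = 0;

		if (file->f_mode & FMODE_NOCMTIME)  /* set in xfs_open_by_handle() */
			ioflags |= IO_INVIS;        /* timestamp-"invisible" I/O */
		return ioflags;
	}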
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 36caa6d957df..5aeb77776961 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -24,6 +24,10 @@ int fs_noerr(void) { return 0; }
24int fs_nosys(void) { return ENOSYS; } 24int fs_nosys(void) { return ENOSYS; }
25void fs_noval(void) { return; } 25void fs_noval(void) { return; }
26 26
27/*
 28 * Note: all filemap functions return negative error codes. These
29 * need to be inverted before returning to the xfs core functions.
30 */
27void 31void
28xfs_tosspages( 32xfs_tosspages(
29 xfs_inode_t *ip, 33 xfs_inode_t *ip,
@@ -53,7 +57,7 @@ xfs_flushinval_pages(
53 if (!ret) 57 if (!ret)
54 truncate_inode_pages(mapping, first); 58 truncate_inode_pages(mapping, first);
55 } 59 }
56 return ret; 60 return -ret;
57} 61}
58 62
59int 63int
@@ -72,10 +76,23 @@ xfs_flush_pages(
72 xfs_iflags_clear(ip, XFS_ITRUNCATED); 76 xfs_iflags_clear(ip, XFS_ITRUNCATED);
73 ret = filemap_fdatawrite(mapping); 77 ret = filemap_fdatawrite(mapping);
74 if (flags & XFS_B_ASYNC) 78 if (flags & XFS_B_ASYNC)
75 return ret; 79 return -ret;
76 ret2 = filemap_fdatawait(mapping); 80 ret2 = filemap_fdatawait(mapping);
77 if (!ret) 81 if (!ret)
78 ret = ret2; 82 ret = ret2;
79 } 83 }
80 return ret; 84 return -ret;
85}
86
87int
88xfs_wait_on_pages(
89 xfs_inode_t *ip,
90 xfs_off_t first,
91 xfs_off_t last)
92{
93 struct address_space *mapping = VFS_I(ip)->i_mapping;
94
95 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
96 return -filemap_fdatawait(mapping);
97 return 0;
81} 98}
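Spelling out the sign convention noted at the top of this file: Linux filemap helpers return 0 or a negative errno, while the XFS core expects positive errno values, hence the negations above. A minimal illustration (the wrapper is hypothetical):

	static int example_boundary(struct address_space *mapping)
	{
		int ret = filemap_fdatawrite(mapping);	/* 0 or -errno */

		return -ret;				/* 0 or +errno for XFS core */
	}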
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c
index ef90e64641e6..2ae8b1ccb02e 100644
--- a/fs/xfs/linux-2.6/xfs_globals.c
+++ b/fs/xfs/linux-2.6/xfs_globals.c
@@ -26,7 +26,6 @@
26 */ 26 */
27xfs_param_t xfs_params = { 27xfs_param_t xfs_params = {
28 /* MIN DFLT MAX */ 28 /* MIN DFLT MAX */
29 .restrict_chown = { 0, 1, 1 },
30 .sgid_inherit = { 0, 0, 1 }, 29 .sgid_inherit = { 0, 0, 1 },
31 .symlink_mode = { 0, 0, 1 }, 30 .symlink_mode = { 0, 0, 1 },
32 .panic_mask = { 0, 0, 255 }, 31 .panic_mask = { 0, 0, 255 },
@@ -43,10 +42,3 @@ xfs_param_t xfs_params = {
43 .inherit_nodfrg = { 0, 1, 1 }, 42 .inherit_nodfrg = { 0, 1, 1 },
44 .fstrm_timer = { 1, 30*100, 3600*100}, 43 .fstrm_timer = { 1, 30*100, 3600*100},
45}; 44};
46
47/*
48 * Global system credential structure.
49 */
50static cred_t sys_cred_val;
51cred_t *sys_cred = &sys_cred_val;
52
diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h
index 6eda8a3eb6f1..69f71caf061c 100644
--- a/fs/xfs/linux-2.6/xfs_globals.h
+++ b/fs/xfs/linux-2.6/xfs_globals.h
@@ -19,6 +19,5 @@
19#define __XFS_GLOBALS_H__ 19#define __XFS_GLOBALS_H__
20 20
21extern uint64_t xfs_panic_mask; /* set to cause more panics */ 21extern uint64_t xfs_panic_mask; /* set to cause more panics */
22extern cred_t *sys_cred;
23 22
24#endif /* __XFS_GLOBALS_H__ */ 23#endif /* __XFS_GLOBALS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 281cbd5a25cf..e5be1e0be802 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -68,26 +68,22 @@
68 * XFS_IOC_PATH_TO_HANDLE 68 * XFS_IOC_PATH_TO_HANDLE
69 * returns full handle for a path 69 * returns full handle for a path
70 */ 70 */
71STATIC int 71int
72xfs_find_handle( 72xfs_find_handle(
73 unsigned int cmd, 73 unsigned int cmd,
74 void __user *arg) 74 xfs_fsop_handlereq_t *hreq)
75{ 75{
76 int hsize; 76 int hsize;
77 xfs_handle_t handle; 77 xfs_handle_t handle;
78 xfs_fsop_handlereq_t hreq;
79 struct inode *inode; 78 struct inode *inode;
80 79
81 if (copy_from_user(&hreq, arg, sizeof(hreq)))
82 return -XFS_ERROR(EFAULT);
83
84 memset((char *)&handle, 0, sizeof(handle)); 80 memset((char *)&handle, 0, sizeof(handle));
85 81
86 switch (cmd) { 82 switch (cmd) {
87 case XFS_IOC_PATH_TO_FSHANDLE: 83 case XFS_IOC_PATH_TO_FSHANDLE:
88 case XFS_IOC_PATH_TO_HANDLE: { 84 case XFS_IOC_PATH_TO_HANDLE: {
89 struct path path; 85 struct path path;
90 int error = user_lpath((const char __user *)hreq.path, &path); 86 int error = user_lpath((const char __user *)hreq->path, &path);
91 if (error) 87 if (error)
92 return error; 88 return error;
93 89
@@ -101,7 +97,7 @@ xfs_find_handle(
101 case XFS_IOC_FD_TO_HANDLE: { 97 case XFS_IOC_FD_TO_HANDLE: {
102 struct file *file; 98 struct file *file;
103 99
104 file = fget(hreq.fd); 100 file = fget(hreq->fd);
105 if (!file) 101 if (!file)
106 return -EBADF; 102 return -EBADF;
107 103
@@ -158,8 +154,8 @@ xfs_find_handle(
158 } 154 }
159 155
160 /* now copy our handle into the user buffer & write out the size */ 156 /* now copy our handle into the user buffer & write out the size */
161 if (copy_to_user(hreq.ohandle, &handle, hsize) || 157 if (copy_to_user(hreq->ohandle, &handle, hsize) ||
162 copy_to_user(hreq.ohandlen, &hsize, sizeof(__s32))) { 158 copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32))) {
163 iput(inode); 159 iput(inode);
164 return -XFS_ERROR(EFAULT); 160 return -XFS_ERROR(EFAULT);
165 } 161 }
@@ -249,10 +245,10 @@ xfs_vget_fsop_handlereq(
249 return 0; 245 return 0;
250} 246}
251 247
252STATIC int 248int
253xfs_open_by_handle( 249xfs_open_by_handle(
254 xfs_mount_t *mp, 250 xfs_mount_t *mp,
255 void __user *arg, 251 xfs_fsop_handlereq_t *hreq,
256 struct file *parfilp, 252 struct file *parfilp,
257 struct inode *parinode) 253 struct inode *parinode)
258{ 254{
@@ -263,14 +259,11 @@ xfs_open_by_handle(
263 struct file *filp; 259 struct file *filp;
264 struct inode *inode; 260 struct inode *inode;
265 struct dentry *dentry; 261 struct dentry *dentry;
266 xfs_fsop_handlereq_t hreq;
267 262
268 if (!capable(CAP_SYS_ADMIN)) 263 if (!capable(CAP_SYS_ADMIN))
269 return -XFS_ERROR(EPERM); 264 return -XFS_ERROR(EPERM);
270 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
271 return -XFS_ERROR(EFAULT);
272 265
273 error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &inode); 266 error = xfs_vget_fsop_handlereq(mp, parinode, hreq, &inode);
274 if (error) 267 if (error)
275 return -error; 268 return -error;
276 269
@@ -281,10 +274,10 @@ xfs_open_by_handle(
281 } 274 }
282 275
283#if BITS_PER_LONG != 32 276#if BITS_PER_LONG != 32
284 hreq.oflags |= O_LARGEFILE; 277 hreq->oflags |= O_LARGEFILE;
285#endif 278#endif
286 /* Put open permission in namei format. */ 279 /* Put open permission in namei format. */
287 permflag = hreq.oflags; 280 permflag = hreq->oflags;
288 if ((permflag+1) & O_ACCMODE) 281 if ((permflag+1) & O_ACCMODE)
289 permflag++; 282 permflag++;
290 if (permflag & O_TRUNC) 283 if (permflag & O_TRUNC)
@@ -322,15 +315,16 @@ xfs_open_by_handle(
322 mntget(parfilp->f_path.mnt); 315 mntget(parfilp->f_path.mnt);
323 316
324 /* Create file pointer. */ 317 /* Create file pointer. */
325 filp = dentry_open(dentry, parfilp->f_path.mnt, hreq.oflags, cred); 318 filp = dentry_open(dentry, parfilp->f_path.mnt, hreq->oflags, cred);
326 if (IS_ERR(filp)) { 319 if (IS_ERR(filp)) {
327 put_unused_fd(new_fd); 320 put_unused_fd(new_fd);
328 return -XFS_ERROR(-PTR_ERR(filp)); 321 return -XFS_ERROR(-PTR_ERR(filp));
329 } 322 }
323
330 if (inode->i_mode & S_IFREG) { 324 if (inode->i_mode & S_IFREG) {
331 /* invisible operation should not change atime */ 325 /* invisible operation should not change atime */
332 filp->f_flags |= O_NOATIME; 326 filp->f_flags |= O_NOATIME;
333 filp->f_op = &xfs_invis_file_operations; 327 filp->f_mode |= FMODE_NOCMTIME;
334 } 328 }
335 329
336 fd_install(new_fd, filp); 330 fd_install(new_fd, filp);
@@ -363,24 +357,21 @@ do_readlink(
363} 357}
364 358
365 359
366STATIC int 360int
367xfs_readlink_by_handle( 361xfs_readlink_by_handle(
368 xfs_mount_t *mp, 362 xfs_mount_t *mp,
369 void __user *arg, 363 xfs_fsop_handlereq_t *hreq,
370 struct inode *parinode) 364 struct inode *parinode)
371{ 365{
372 struct inode *inode; 366 struct inode *inode;
373 xfs_fsop_handlereq_t hreq;
374 __u32 olen; 367 __u32 olen;
375 void *link; 368 void *link;
376 int error; 369 int error;
377 370
378 if (!capable(CAP_SYS_ADMIN)) 371 if (!capable(CAP_SYS_ADMIN))
379 return -XFS_ERROR(EPERM); 372 return -XFS_ERROR(EPERM);
380 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
381 return -XFS_ERROR(EFAULT);
382 373
383 error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &inode); 374 error = xfs_vget_fsop_handlereq(mp, parinode, hreq, &inode);
384 if (error) 375 if (error)
385 return -error; 376 return -error;
386 377
@@ -390,7 +381,7 @@ xfs_readlink_by_handle(
390 goto out_iput; 381 goto out_iput;
391 } 382 }
392 383
393 if (copy_from_user(&olen, hreq.ohandlen, sizeof(__u32))) { 384 if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) {
394 error = -XFS_ERROR(EFAULT); 385 error = -XFS_ERROR(EFAULT);
395 goto out_iput; 386 goto out_iput;
396 } 387 }
@@ -402,7 +393,7 @@ xfs_readlink_by_handle(
402 error = -xfs_readlink(XFS_I(inode), link); 393 error = -xfs_readlink(XFS_I(inode), link);
403 if (error) 394 if (error)
404 goto out_kfree; 395 goto out_kfree;
405 error = do_readlink(hreq.ohandle, olen, link); 396 error = do_readlink(hreq->ohandle, olen, link);
406 if (error) 397 if (error)
407 goto out_kfree; 398 goto out_kfree;
408 399
@@ -501,7 +492,7 @@ xfs_attrlist_by_handle(
501 return -error; 492 return -error;
502} 493}
503 494
504STATIC int 495int
505xfs_attrmulti_attr_get( 496xfs_attrmulti_attr_get(
506 struct inode *inode, 497 struct inode *inode,
507 char *name, 498 char *name,
@@ -530,7 +521,7 @@ xfs_attrmulti_attr_get(
530 return error; 521 return error;
531} 522}
532 523
533STATIC int 524int
534xfs_attrmulti_attr_set( 525xfs_attrmulti_attr_set(
535 struct inode *inode, 526 struct inode *inode,
536 char *name, 527 char *name,
@@ -560,7 +551,7 @@ xfs_attrmulti_attr_set(
560 return error; 551 return error;
561} 552}
562 553
563STATIC int 554int
564xfs_attrmulti_attr_remove( 555xfs_attrmulti_attr_remove(
565 struct inode *inode, 556 struct inode *inode,
566 char *name, 557 char *name,
@@ -662,19 +653,26 @@ xfs_attrmulti_by_handle(
662 return -error; 653 return -error;
663} 654}
664 655
665STATIC int 656int
666xfs_ioc_space( 657xfs_ioc_space(
667 struct xfs_inode *ip, 658 struct xfs_inode *ip,
668 struct inode *inode, 659 struct inode *inode,
669 struct file *filp, 660 struct file *filp,
670 int ioflags, 661 int ioflags,
671 unsigned int cmd, 662 unsigned int cmd,
672 void __user *arg) 663 xfs_flock64_t *bf)
673{ 664{
674 xfs_flock64_t bf;
675 int attr_flags = 0; 665 int attr_flags = 0;
676 int error; 666 int error;
677 667
668 /*
669 * Only allow the sys admin to reserve space unless
670 * unwritten extents are enabled.
671 */
672 if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) &&
673 !capable(CAP_SYS_ADMIN))
674 return -XFS_ERROR(EPERM);
675
678 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) 676 if (inode->i_flags & (S_IMMUTABLE|S_APPEND))
679 return -XFS_ERROR(EPERM); 677 return -XFS_ERROR(EPERM);
680 678
@@ -684,16 +682,12 @@ xfs_ioc_space(
684 if (!S_ISREG(inode->i_mode)) 682 if (!S_ISREG(inode->i_mode))
685 return -XFS_ERROR(EINVAL); 683 return -XFS_ERROR(EINVAL);
686 684
687 if (copy_from_user(&bf, arg, sizeof(bf)))
688 return -XFS_ERROR(EFAULT);
689
690 if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) 685 if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
691 attr_flags |= XFS_ATTR_NONBLOCK; 686 attr_flags |= XFS_ATTR_NONBLOCK;
692 if (ioflags & IO_INVIS) 687 if (ioflags & IO_INVIS)
693 attr_flags |= XFS_ATTR_DMI; 688 attr_flags |= XFS_ATTR_DMI;
694 689
695 error = xfs_change_file_space(ip, cmd, &bf, filp->f_pos, 690 error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags);
696 NULL, attr_flags);
697 return -error; 691 return -error;
698} 692}
699 693
@@ -1105,10 +1099,6 @@ xfs_ioctl_setattr(
1105 1099
1106 /* 1100 /*
1107 * Change file ownership. Must be the owner or privileged. 1101 * Change file ownership. Must be the owner or privileged.
1108 * If the system was configured with the "restricted_chown"
1109 * option, the owner is not permitted to give away the file,
1110 * and can change the group id only to a group of which he
1111 * or she is a member.
1112 */ 1102 */
1113 if (mask & FSX_PROJID) { 1103 if (mask & FSX_PROJID) {
1114 /* 1104 /*
@@ -1137,7 +1127,7 @@ xfs_ioctl_setattr(
1137 * the superblock version number since projids didn't 1127 * the superblock version number since projids didn't
1138 * exist before DINODE_VERSION_2 and SB_VERSION_NLINK. 1128 * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
1139 */ 1129 */
1140 if (ip->i_d.di_version == XFS_DINODE_VERSION_1) 1130 if (ip->i_d.di_version == 1)
1141 xfs_bump_ino_vers2(tp, ip); 1131 xfs_bump_ino_vers2(tp, ip);
1142 } 1132 }
1143 1133
@@ -1256,43 +1246,67 @@ xfs_ioc_setxflags(
1256} 1246}
1257 1247
1258STATIC int 1248STATIC int
1249xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)
1250{
1251 struct getbmap __user *base = *ap;
1252
1253 /* copy only getbmap portion (not getbmapx) */
1254 if (copy_to_user(base, bmv, sizeof(struct getbmap)))
1255 return XFS_ERROR(EFAULT);
1256
1257 *ap += sizeof(struct getbmap);
1258 return 0;
1259}
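The formatter callback decouples record layout from the extent walk, so one core routine can serve both the getbmap and getbmapx layouts. The calling convention assumed here (xfs_getbmap() itself is outside this hunk) would look roughly like this sketch:

	/* Assumed core loop, illustrative only: one formatter call per extent;
	 * the callback copies a record out, advances the opaque cursor *ap,
	 * and may stop the walk via *full or a positive errno. */
	static int emit_records(struct getbmapx *recs, int nrecs,
				int (*fmt)(void **, struct getbmapx *, int *),
				void __user *out)
	{
		void *ap = (void *)out;
		int i, full = 0, error;

		for (i = 0; i < nrecs && !full; i++) {
			error = fmt(&ap, &recs[i], &full);
			if (error)
				return error;
		}
		return 0;
	}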
1260
1261STATIC int
1259xfs_ioc_getbmap( 1262xfs_ioc_getbmap(
1260 struct xfs_inode *ip, 1263 struct xfs_inode *ip,
1261 int ioflags, 1264 int ioflags,
1262 unsigned int cmd, 1265 unsigned int cmd,
1263 void __user *arg) 1266 void __user *arg)
1264{ 1267{
1265 struct getbmap bm; 1268 struct getbmapx bmx;
1266 int iflags;
1267 int error; 1269 int error;
1268 1270
1269 if (copy_from_user(&bm, arg, sizeof(bm))) 1271 if (copy_from_user(&bmx, arg, sizeof(struct getbmapx)))
1270 return -XFS_ERROR(EFAULT); 1272 return -XFS_ERROR(EFAULT);
1271 1273
1272 if (bm.bmv_count < 2) 1274 if (bmx.bmv_count < 2)
1273 return -XFS_ERROR(EINVAL); 1275 return -XFS_ERROR(EINVAL);
1274 1276
1275 iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0); 1277 bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
1276 if (ioflags & IO_INVIS) 1278 if (ioflags & IO_INVIS)
1277 iflags |= BMV_IF_NO_DMAPI_READ; 1279 bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
1278 1280
1279 error = xfs_getbmap(ip, &bm, (struct getbmap __user *)arg+1, iflags); 1281 error = xfs_getbmap(ip, &bmx, xfs_getbmap_format,
1282 (struct getbmap *)arg+1);
1280 if (error) 1283 if (error)
1281 return -error; 1284 return -error;
1282 1285
1283 if (copy_to_user(arg, &bm, sizeof(bm))) 1286 /* copy back header - only size of getbmap */
1287 if (copy_to_user(arg, &bmx, sizeof(struct getbmap)))
1284 return -XFS_ERROR(EFAULT); 1288 return -XFS_ERROR(EFAULT);
1285 return 0; 1289 return 0;
1286} 1290}
1287 1291
1288STATIC int 1292STATIC int
1293xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full)
1294{
1295 struct getbmapx __user *base = *ap;
1296
1297 if (copy_to_user(base, bmv, sizeof(struct getbmapx)))
1298 return XFS_ERROR(EFAULT);
1299
1300 *ap += sizeof(struct getbmapx);
1301 return 0;
1302}
1303
1304STATIC int
1289xfs_ioc_getbmapx( 1305xfs_ioc_getbmapx(
1290 struct xfs_inode *ip, 1306 struct xfs_inode *ip,
1291 void __user *arg) 1307 void __user *arg)
1292{ 1308{
1293 struct getbmapx bmx; 1309 struct getbmapx bmx;
1294 struct getbmap bm;
1295 int iflags;
1296 int error; 1310 int error;
1297 1311
1298 if (copy_from_user(&bmx, arg, sizeof(bmx))) 1312 if (copy_from_user(&bmx, arg, sizeof(bmx)))
@@ -1301,46 +1315,46 @@ xfs_ioc_getbmapx(
1301 if (bmx.bmv_count < 2) 1315 if (bmx.bmv_count < 2)
1302 return -XFS_ERROR(EINVAL); 1316 return -XFS_ERROR(EINVAL);
1303 1317
1304 /* 1318 if (bmx.bmv_iflags & (~BMV_IF_VALID))
1305 * Map input getbmapx structure to a getbmap
1306 * structure for xfs_getbmap.
1307 */
1308 GETBMAP_CONVERT(bmx, bm);
1309
1310 iflags = bmx.bmv_iflags;
1311
1312 if (iflags & (~BMV_IF_VALID))
1313 return -XFS_ERROR(EINVAL); 1319 return -XFS_ERROR(EINVAL);
1314 1320
1315 iflags |= BMV_IF_EXTENDED; 1321 error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format,
1316 1322 (struct getbmapx *)arg+1);
1317 error = xfs_getbmap(ip, &bm, (struct getbmapx __user *)arg+1, iflags);
1318 if (error) 1323 if (error)
1319 return -error; 1324 return -error;
1320 1325
1321 GETBMAP_CONVERT(bm, bmx); 1326 /* copy back header */
1322 1327 if (copy_to_user(arg, &bmx, sizeof(struct getbmapx)))
1323 if (copy_to_user(arg, &bmx, sizeof(bmx)))
1324 return -XFS_ERROR(EFAULT); 1328 return -XFS_ERROR(EFAULT);
1325 1329
1326 return 0; 1330 return 0;
1327} 1331}
1328 1332
1329int 1333/*
1330xfs_ioctl( 1334 * Note: some of the ioctls return positive numbers as a
1331 xfs_inode_t *ip, 1335 * byte count indicating success, such as readlink_by_handle.
1336 * So we don't "sign flip" like most other routines. This means
1337 * true errors need to be returned as a negative value.
1338 */
1339long
1340xfs_file_ioctl(
1332 struct file *filp, 1341 struct file *filp,
1333 int ioflags,
1334 unsigned int cmd, 1342 unsigned int cmd,
1335 void __user *arg) 1343 unsigned long p)
1336{ 1344{
1337 struct inode *inode = filp->f_path.dentry->d_inode; 1345 struct inode *inode = filp->f_path.dentry->d_inode;
1338 xfs_mount_t *mp = ip->i_mount; 1346 struct xfs_inode *ip = XFS_I(inode);
1347 struct xfs_mount *mp = ip->i_mount;
1348 void __user *arg = (void __user *)p;
1349 int ioflags = 0;
1339 int error; 1350 int error;
1340 1351
1341 xfs_itrace_entry(XFS_I(inode)); 1352 if (filp->f_mode & FMODE_NOCMTIME)
1342 switch (cmd) { 1353 ioflags |= IO_INVIS;
1343 1354
1355 xfs_itrace_entry(ip);
1356
1357 switch (cmd) {
1344 case XFS_IOC_ALLOCSP: 1358 case XFS_IOC_ALLOCSP:
1345 case XFS_IOC_FREESP: 1359 case XFS_IOC_FREESP:
1346 case XFS_IOC_RESVSP: 1360 case XFS_IOC_RESVSP:
@@ -1348,17 +1362,13 @@ xfs_ioctl(
1348 case XFS_IOC_ALLOCSP64: 1362 case XFS_IOC_ALLOCSP64:
1349 case XFS_IOC_FREESP64: 1363 case XFS_IOC_FREESP64:
1350 case XFS_IOC_RESVSP64: 1364 case XFS_IOC_RESVSP64:
1351 case XFS_IOC_UNRESVSP64: 1365 case XFS_IOC_UNRESVSP64: {
1352 /* 1366 xfs_flock64_t bf;
1353 * Only allow the sys admin to reserve space unless
1354 * unwritten extents are enabled.
1355 */
1356 if (!xfs_sb_version_hasextflgbit(&mp->m_sb) &&
1357 !capable(CAP_SYS_ADMIN))
1358 return -EPERM;
1359
1360 return xfs_ioc_space(ip, inode, filp, ioflags, cmd, arg);
1361 1367
1368 if (copy_from_user(&bf, arg, sizeof(bf)))
1369 return -XFS_ERROR(EFAULT);
1370 return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
1371 }
1362 case XFS_IOC_DIOINFO: { 1372 case XFS_IOC_DIOINFO: {
1363 struct dioattr da; 1373 struct dioattr da;
1364 xfs_buftarg_t *target = 1374 xfs_buftarg_t *target =
@@ -1418,18 +1428,30 @@ xfs_ioctl(
1418 1428
1419 case XFS_IOC_FD_TO_HANDLE: 1429 case XFS_IOC_FD_TO_HANDLE:
1420 case XFS_IOC_PATH_TO_HANDLE: 1430 case XFS_IOC_PATH_TO_HANDLE:
1421 case XFS_IOC_PATH_TO_FSHANDLE: 1431 case XFS_IOC_PATH_TO_FSHANDLE: {
1422 return xfs_find_handle(cmd, arg); 1432 xfs_fsop_handlereq_t hreq;
1423 1433
1424 case XFS_IOC_OPEN_BY_HANDLE: 1434 if (copy_from_user(&hreq, arg, sizeof(hreq)))
1425 return xfs_open_by_handle(mp, arg, filp, inode); 1435 return -XFS_ERROR(EFAULT);
1436 return xfs_find_handle(cmd, &hreq);
1437 }
1438 case XFS_IOC_OPEN_BY_HANDLE: {
1439 xfs_fsop_handlereq_t hreq;
1426 1440
1441 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
1442 return -XFS_ERROR(EFAULT);
1443 return xfs_open_by_handle(mp, &hreq, filp, inode);
1444 }
1427 case XFS_IOC_FSSETDM_BY_HANDLE: 1445 case XFS_IOC_FSSETDM_BY_HANDLE:
1428 return xfs_fssetdm_by_handle(mp, arg, inode); 1446 return xfs_fssetdm_by_handle(mp, arg, inode);
1429 1447
1430 case XFS_IOC_READLINK_BY_HANDLE: 1448 case XFS_IOC_READLINK_BY_HANDLE: {
1431 return xfs_readlink_by_handle(mp, arg, inode); 1449 xfs_fsop_handlereq_t hreq;
1432 1450
1451 if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
1452 return -XFS_ERROR(EFAULT);
1453 return xfs_readlink_by_handle(mp, &hreq, inode);
1454 }
1433 case XFS_IOC_ATTRLIST_BY_HANDLE: 1455 case XFS_IOC_ATTRLIST_BY_HANDLE:
1434 return xfs_attrlist_by_handle(mp, arg, inode); 1456 return xfs_attrlist_by_handle(mp, arg, inode);
1435 1457
@@ -1437,7 +1459,11 @@ xfs_ioctl(
1437 return xfs_attrmulti_by_handle(mp, arg, filp, inode); 1459 return xfs_attrmulti_by_handle(mp, arg, filp, inode);
1438 1460
1439 case XFS_IOC_SWAPEXT: { 1461 case XFS_IOC_SWAPEXT: {
1440 error = xfs_swapext((struct xfs_swapext __user *)arg); 1462 struct xfs_swapext sxp;
1463
1464 if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t)))
1465 return -XFS_ERROR(EFAULT);
1466 error = xfs_swapext(&sxp);
1441 return -error; 1467 return -error;
1442 } 1468 }
1443 1469
@@ -1493,9 +1519,6 @@ xfs_ioctl(
1493 case XFS_IOC_FSGROWFSDATA: { 1519 case XFS_IOC_FSGROWFSDATA: {
1494 xfs_growfs_data_t in; 1520 xfs_growfs_data_t in;
1495 1521
1496 if (!capable(CAP_SYS_ADMIN))
1497 return -EPERM;
1498
1499 if (copy_from_user(&in, arg, sizeof(in))) 1522 if (copy_from_user(&in, arg, sizeof(in)))
1500 return -XFS_ERROR(EFAULT); 1523 return -XFS_ERROR(EFAULT);
1501 1524
@@ -1506,9 +1529,6 @@ xfs_ioctl(
1506 case XFS_IOC_FSGROWFSLOG: { 1529 case XFS_IOC_FSGROWFSLOG: {
1507 xfs_growfs_log_t in; 1530 xfs_growfs_log_t in;
1508 1531
1509 if (!capable(CAP_SYS_ADMIN))
1510 return -EPERM;
1511
1512 if (copy_from_user(&in, arg, sizeof(in))) 1532 if (copy_from_user(&in, arg, sizeof(in)))
1513 return -XFS_ERROR(EFAULT); 1533 return -XFS_ERROR(EFAULT);
1514 1534
@@ -1519,9 +1539,6 @@ xfs_ioctl(
1519 case XFS_IOC_FSGROWFSRT: { 1539 case XFS_IOC_FSGROWFSRT: {
1520 xfs_growfs_rt_t in; 1540 xfs_growfs_rt_t in;
1521 1541
1522 if (!capable(CAP_SYS_ADMIN))
1523 return -EPERM;
1524
1525 if (copy_from_user(&in, arg, sizeof(in))) 1542 if (copy_from_user(&in, arg, sizeof(in)))
1526 return -XFS_ERROR(EFAULT); 1543 return -XFS_ERROR(EFAULT);
1527 1544
@@ -1529,21 +1546,6 @@ xfs_ioctl(
1529 return -error; 1546 return -error;
1530 } 1547 }
1531 1548
1532 case XFS_IOC_FREEZE:
1533 if (!capable(CAP_SYS_ADMIN))
1534 return -EPERM;
1535
1536 if (inode->i_sb->s_frozen == SB_UNFROZEN)
1537 freeze_bdev(inode->i_sb->s_bdev);
1538 return 0;
1539
1540 case XFS_IOC_THAW:
1541 if (!capable(CAP_SYS_ADMIN))
1542 return -EPERM;
1543 if (inode->i_sb->s_frozen != SB_UNFROZEN)
1544 thaw_bdev(inode->i_sb->s_bdev, inode->i_sb);
1545 return 0;
1546
1547 case XFS_IOC_GOINGDOWN: { 1549 case XFS_IOC_GOINGDOWN: {
1548 __uint32_t in; 1550 __uint32_t in;
1549 1551
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h
new file mode 100644
index 000000000000..8c16bf2d7e03
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_ioctl.h
@@ -0,0 +1,82 @@
1/*
2 * Copyright (c) 2008 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_IOCTL_H__
19#define __XFS_IOCTL_H__
20
21extern int
22xfs_ioc_space(
23 struct xfs_inode *ip,
24 struct inode *inode,
25 struct file *filp,
26 int ioflags,
27 unsigned int cmd,
28 xfs_flock64_t *bf);
29
30extern int
31xfs_find_handle(
32 unsigned int cmd,
33 xfs_fsop_handlereq_t *hreq);
34
35extern int
36xfs_open_by_handle(
37 xfs_mount_t *mp,
38 xfs_fsop_handlereq_t *hreq,
39 struct file *parfilp,
40 struct inode *parinode);
41
42extern int
43xfs_readlink_by_handle(
44 xfs_mount_t *mp,
45 xfs_fsop_handlereq_t *hreq,
46 struct inode *parinode);
47
48extern int
49xfs_attrmulti_attr_get(
50 struct inode *inode,
51 char *name,
52 char __user *ubuf,
53 __uint32_t *len,
54 __uint32_t flags);
55
56extern int
 57xfs_attrmulti_attr_set(
58 struct inode *inode,
59 char *name,
60 const char __user *ubuf,
61 __uint32_t len,
62 __uint32_t flags);
63
64extern int
65xfs_attrmulti_attr_remove(
66 struct inode *inode,
67 char *name,
68 __uint32_t flags);
69
70extern long
71xfs_file_ioctl(
72 struct file *filp,
73 unsigned int cmd,
74 unsigned long p);
75
76extern long
77xfs_file_compat_ioctl(
78 struct file *file,
79 unsigned int cmd,
80 unsigned long arg);
81
82#endif
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index a4b254eb43b2..50903ad3182e 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -16,11 +16,7 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include <linux/compat.h> 18#include <linux/compat.h>
19#include <linux/init.h>
20#include <linux/ioctl.h> 19#include <linux/ioctl.h>
21#include <linux/syscalls.h>
22#include <linux/types.h>
23#include <linux/fs.h>
24#include <asm/uaccess.h> 20#include <asm/uaccess.h>
25#include "xfs.h" 21#include "xfs.h"
26#include "xfs_fs.h" 22#include "xfs_fs.h"
@@ -36,7 +32,6 @@
36#include "xfs_bmap_btree.h" 32#include "xfs_bmap_btree.h"
37#include "xfs_attr_sf.h" 33#include "xfs_attr_sf.h"
38#include "xfs_dir2_sf.h" 34#include "xfs_dir2_sf.h"
39#include "xfs_vfs.h"
40#include "xfs_vnode.h" 35#include "xfs_vnode.h"
41#include "xfs_dinode.h" 36#include "xfs_dinode.h"
42#include "xfs_inode.h" 37#include "xfs_inode.h"
@@ -44,221 +39,219 @@
44#include "xfs_error.h" 39#include "xfs_error.h"
45#include "xfs_dfrag.h" 40#include "xfs_dfrag.h"
46#include "xfs_vnodeops.h" 41#include "xfs_vnodeops.h"
42#include "xfs_fsops.h"
43#include "xfs_alloc.h"
44#include "xfs_rtalloc.h"
45#include "xfs_attr.h"
46#include "xfs_ioctl.h"
47#include "xfs_ioctl32.h" 47#include "xfs_ioctl32.h"
48 48
49#define _NATIVE_IOC(cmd, type) \ 49#define _NATIVE_IOC(cmd, type) \
50 _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type)) 50 _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type))
51 51
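For orientation: _NATIVE_IOC rebuilds a command number with the native payload size while keeping its direction, type and sequence-number bits. A minimal sketch of the translation, not part of this commit:

	/* sketch: a 32-bit handle command becomes its native twin */
	unsigned int cmd32 = _IOWR('X', 108, struct compat_xfs_fsop_handlereq);
	unsigned int cmd = _NATIVE_IOC(cmd32, struct xfs_fsop_handlereq);
	/* cmd == XFS_IOC_READLINK_BY_HANDLE: same _IOC_DIR/_IOC_TYPE/_IOC_NR,
	 * only the _IOC_SIZE field now holds the native structure size */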
52#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) 52#ifdef BROKEN_X86_ALIGNMENT
53#define BROKEN_X86_ALIGNMENT 53STATIC int
54#define _PACKED __attribute__((packed)) 54xfs_compat_flock64_copyin(
55/* on ia32 l_start is on a 32-bit boundary */ 55 xfs_flock64_t *bf,
56typedef struct xfs_flock64_32 { 56 compat_xfs_flock64_t __user *arg32)
57 __s16 l_type;
58 __s16 l_whence;
59 __s64 l_start __attribute__((packed));
60 /* len == 0 means until end of file */
61 __s64 l_len __attribute__((packed));
62 __s32 l_sysid;
63 __u32 l_pid;
64 __s32 l_pad[4]; /* reserve area */
65} xfs_flock64_32_t;
66
67#define XFS_IOC_ALLOCSP_32 _IOW ('X', 10, struct xfs_flock64_32)
68#define XFS_IOC_FREESP_32 _IOW ('X', 11, struct xfs_flock64_32)
69#define XFS_IOC_ALLOCSP64_32 _IOW ('X', 36, struct xfs_flock64_32)
70#define XFS_IOC_FREESP64_32 _IOW ('X', 37, struct xfs_flock64_32)
71#define XFS_IOC_RESVSP_32 _IOW ('X', 40, struct xfs_flock64_32)
72#define XFS_IOC_UNRESVSP_32 _IOW ('X', 41, struct xfs_flock64_32)
73#define XFS_IOC_RESVSP64_32 _IOW ('X', 42, struct xfs_flock64_32)
74#define XFS_IOC_UNRESVSP64_32 _IOW ('X', 43, struct xfs_flock64_32)
75
76/* just account for different alignment */
77STATIC unsigned long
78xfs_ioctl32_flock(
79 unsigned long arg)
80{ 57{
81 xfs_flock64_32_t __user *p32 = (void __user *)arg; 58 if (get_user(bf->l_type, &arg32->l_type) ||
82 xfs_flock64_t __user *p = compat_alloc_user_space(sizeof(*p)); 59 get_user(bf->l_whence, &arg32->l_whence) ||
83 60 get_user(bf->l_start, &arg32->l_start) ||
84 if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) || 61 get_user(bf->l_len, &arg32->l_len) ||
85 copy_in_user(&p->l_whence, &p32->l_whence, sizeof(s16)) || 62 get_user(bf->l_sysid, &arg32->l_sysid) ||
86 copy_in_user(&p->l_start, &p32->l_start, sizeof(s64)) || 63 get_user(bf->l_pid, &arg32->l_pid) ||
87 copy_in_user(&p->l_len, &p32->l_len, sizeof(s64)) || 64 copy_from_user(bf->l_pad, &arg32->l_pad, 4*sizeof(u32)))
88 copy_in_user(&p->l_sysid, &p32->l_sysid, sizeof(s32)) || 65 return -XFS_ERROR(EFAULT);
89 copy_in_user(&p->l_pid, &p32->l_pid, sizeof(u32)) || 66 return 0;
90 copy_in_user(&p->l_pad, &p32->l_pad, 4*sizeof(u32)))
91 return -EFAULT;
92
93 return (unsigned long)p;
94} 67}
95 68
96typedef struct compat_xfs_fsop_geom_v1 { 69STATIC int
97 __u32 blocksize; /* filesystem (data) block size */ 70xfs_compat_ioc_fsgeometry_v1(
98 __u32 rtextsize; /* realtime extent size */ 71 struct xfs_mount *mp,
99 __u32 agblocks; /* fsblocks in an AG */ 72 compat_xfs_fsop_geom_v1_t __user *arg32)
100 __u32 agcount; /* number of allocation groups */
101 __u32 logblocks; /* fsblocks in the log */
102 __u32 sectsize; /* (data) sector size, bytes */
103 __u32 inodesize; /* inode size in bytes */
104 __u32 imaxpct; /* max allowed inode space(%) */
105 __u64 datablocks; /* fsblocks in data subvolume */
106 __u64 rtblocks; /* fsblocks in realtime subvol */
107 __u64 rtextents; /* rt extents in realtime subvol*/
108 __u64 logstart; /* starting fsblock of the log */
109 unsigned char uuid[16]; /* unique id of the filesystem */
110 __u32 sunit; /* stripe unit, fsblocks */
111 __u32 swidth; /* stripe width, fsblocks */
112 __s32 version; /* structure version */
113 __u32 flags; /* superblock version flags */
114 __u32 logsectsize; /* log sector size, bytes */
115 __u32 rtsectsize; /* realtime sector size, bytes */
116 __u32 dirblocksize; /* directory block size, bytes */
117} __attribute__((packed)) compat_xfs_fsop_geom_v1_t;
118
119#define XFS_IOC_FSGEOMETRY_V1_32 \
120 _IOR ('X', 100, struct compat_xfs_fsop_geom_v1)
121
122STATIC unsigned long xfs_ioctl32_geom_v1(unsigned long arg)
123{ 73{
124 compat_xfs_fsop_geom_v1_t __user *p32 = (void __user *)arg; 74 xfs_fsop_geom_t fsgeo;
125 xfs_fsop_geom_v1_t __user *p = compat_alloc_user_space(sizeof(*p)); 75 int error;
126 76
127 if (copy_in_user(p, p32, sizeof(*p32))) 77 error = xfs_fs_geometry(mp, &fsgeo, 3);
128 return -EFAULT; 78 if (error)
129 return (unsigned long)p; 79 return -error;
80 /* The 32-bit variant simply has some padding at the end */
81 if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1)))
82 return -XFS_ERROR(EFAULT);
83 return 0;
130} 84}
131 85
132typedef struct compat_xfs_inogrp { 86STATIC int
133 __u64 xi_startino; /* starting inode number */ 87xfs_compat_growfs_data_copyin(
134 __s32 xi_alloccount; /* # bits set in allocmask */ 88 struct xfs_growfs_data *in,
135 __u64 xi_allocmask; /* mask of allocated inodes */ 89 compat_xfs_growfs_data_t __user *arg32)
136} __attribute__((packed)) compat_xfs_inogrp_t;
137
138STATIC int xfs_inumbers_fmt_compat(
139 void __user *ubuffer,
140 const xfs_inogrp_t *buffer,
141 long count,
142 long *written)
143{ 90{
144 compat_xfs_inogrp_t __user *p32 = ubuffer; 91 if (get_user(in->newblocks, &arg32->newblocks) ||
145 long i; 92 get_user(in->imaxpct, &arg32->imaxpct))
93 return -XFS_ERROR(EFAULT);
94 return 0;
95}
96
97STATIC int
98xfs_compat_growfs_rt_copyin(
99 struct xfs_growfs_rt *in,
100 compat_xfs_growfs_rt_t __user *arg32)
101{
102 if (get_user(in->newblocks, &arg32->newblocks) ||
103 get_user(in->extsize, &arg32->extsize))
104 return -XFS_ERROR(EFAULT);
105 return 0;
106}
107
108STATIC int
109xfs_inumbers_fmt_compat(
110 void __user *ubuffer,
111 const xfs_inogrp_t *buffer,
112 long count,
113 long *written)
114{
115 compat_xfs_inogrp_t __user *p32 = ubuffer;
116 long i;
146 117
147 for (i = 0; i < count; i++) { 118 for (i = 0; i < count; i++) {
148 if (put_user(buffer[i].xi_startino, &p32[i].xi_startino) || 119 if (put_user(buffer[i].xi_startino, &p32[i].xi_startino) ||
149 put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) || 120 put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) ||
150 put_user(buffer[i].xi_allocmask, &p32[i].xi_allocmask)) 121 put_user(buffer[i].xi_allocmask, &p32[i].xi_allocmask))
151 return -EFAULT; 122 return -XFS_ERROR(EFAULT);
152 } 123 }
153 *written = count * sizeof(*p32); 124 *written = count * sizeof(*p32);
154 return 0; 125 return 0;
155} 126}
156 127
157#else 128#else
158
159#define xfs_inumbers_fmt_compat xfs_inumbers_fmt 129#define xfs_inumbers_fmt_compat xfs_inumbers_fmt
160#define _PACKED 130#endif /* BROKEN_X86_ALIGNMENT */
161 131
162#endif 132STATIC int
133xfs_ioctl32_bstime_copyin(
134 xfs_bstime_t *bstime,
135 compat_xfs_bstime_t __user *bstime32)
136{
137 compat_time_t sec32; /* tv_sec differs on 64 vs. 32 */
163 138
164/* XFS_IOC_FSBULKSTAT and friends */ 139 if (get_user(sec32, &bstime32->tv_sec) ||
140 get_user(bstime->tv_nsec, &bstime32->tv_nsec))
141 return -XFS_ERROR(EFAULT);
142 bstime->tv_sec = sec32;
143 return 0;
144}
145
 146/* xfs_bstat_t has differing alignment on intel, and bstime_t sizes differ everywhere */
147STATIC int
148xfs_ioctl32_bstat_copyin(
149 xfs_bstat_t *bstat,
150 compat_xfs_bstat_t __user *bstat32)
151{
152 if (get_user(bstat->bs_ino, &bstat32->bs_ino) ||
153 get_user(bstat->bs_mode, &bstat32->bs_mode) ||
154 get_user(bstat->bs_nlink, &bstat32->bs_nlink) ||
155 get_user(bstat->bs_uid, &bstat32->bs_uid) ||
156 get_user(bstat->bs_gid, &bstat32->bs_gid) ||
157 get_user(bstat->bs_rdev, &bstat32->bs_rdev) ||
158 get_user(bstat->bs_blksize, &bstat32->bs_blksize) ||
159 get_user(bstat->bs_size, &bstat32->bs_size) ||
160 xfs_ioctl32_bstime_copyin(&bstat->bs_atime, &bstat32->bs_atime) ||
161 xfs_ioctl32_bstime_copyin(&bstat->bs_mtime, &bstat32->bs_mtime) ||
162 xfs_ioctl32_bstime_copyin(&bstat->bs_ctime, &bstat32->bs_ctime) ||
 163 get_user(bstat->bs_blocks, &bstat32->bs_blocks) ||
 164 get_user(bstat->bs_xflags, &bstat32->bs_xflags) ||
165 get_user(bstat->bs_extsize, &bstat32->bs_extsize) ||
166 get_user(bstat->bs_extents, &bstat32->bs_extents) ||
167 get_user(bstat->bs_gen, &bstat32->bs_gen) ||
168 get_user(bstat->bs_projid, &bstat32->bs_projid) ||
169 get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) ||
170 get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) ||
171 get_user(bstat->bs_aextents, &bstat32->bs_aextents))
172 return -XFS_ERROR(EFAULT);
173 return 0;
174}
165 175
166typedef struct compat_xfs_bstime { 176/* XFS_IOC_FSBULKSTAT and friends */
167 __s32 tv_sec; /* seconds */
168 __s32 tv_nsec; /* and nanoseconds */
169} compat_xfs_bstime_t;
170 177
171STATIC int xfs_bstime_store_compat( 178STATIC int
172 compat_xfs_bstime_t __user *p32, 179xfs_bstime_store_compat(
173 const xfs_bstime_t *p) 180 compat_xfs_bstime_t __user *p32,
181 const xfs_bstime_t *p)
174{ 182{
175 __s32 sec32; 183 __s32 sec32;
176 184
177 sec32 = p->tv_sec; 185 sec32 = p->tv_sec;
178 if (put_user(sec32, &p32->tv_sec) || 186 if (put_user(sec32, &p32->tv_sec) ||
179 put_user(p->tv_nsec, &p32->tv_nsec)) 187 put_user(p->tv_nsec, &p32->tv_nsec))
180 return -EFAULT; 188 return -XFS_ERROR(EFAULT);
181 return 0; 189 return 0;
182} 190}
183 191
184typedef struct compat_xfs_bstat { 192/* Return 0 on success or positive error (to xfs_bulkstat()) */
185 __u64 bs_ino; /* inode number */ 193STATIC int
186 __u16 bs_mode; /* type and mode */ 194xfs_bulkstat_one_fmt_compat(
187 __u16 bs_nlink; /* number of links */
188 __u32 bs_uid; /* user id */
189 __u32 bs_gid; /* group id */
190 __u32 bs_rdev; /* device value */
191 __s32 bs_blksize; /* block size */
192 __s64 bs_size; /* file size */
193 compat_xfs_bstime_t bs_atime; /* access time */
194 compat_xfs_bstime_t bs_mtime; /* modify time */
195 compat_xfs_bstime_t bs_ctime; /* inode change time */
196 int64_t bs_blocks; /* number of blocks */
197 __u32 bs_xflags; /* extended flags */
198 __s32 bs_extsize; /* extent size */
199 __s32 bs_extents; /* number of extents */
200 __u32 bs_gen; /* generation count */
201 __u16 bs_projid; /* project id */
202 unsigned char bs_pad[14]; /* pad space, unused */
203 __u32 bs_dmevmask; /* DMIG event mask */
204 __u16 bs_dmstate; /* DMIG state info */
205 __u16 bs_aextents; /* attribute number of extents */
206} _PACKED compat_xfs_bstat_t;
207
208STATIC int xfs_bulkstat_one_fmt_compat(
209 void __user *ubuffer, 195 void __user *ubuffer,
196 int ubsize,
197 int *ubused,
210 const xfs_bstat_t *buffer) 198 const xfs_bstat_t *buffer)
211{ 199{
212 compat_xfs_bstat_t __user *p32 = ubuffer; 200 compat_xfs_bstat_t __user *p32 = ubuffer;
213 201
214 if (put_user(buffer->bs_ino, &p32->bs_ino) || 202 if (ubsize < sizeof(*p32))
215 put_user(buffer->bs_mode, &p32->bs_mode) || 203 return XFS_ERROR(ENOMEM);
216 put_user(buffer->bs_nlink, &p32->bs_nlink) || 204
217 put_user(buffer->bs_uid, &p32->bs_uid) || 205 if (put_user(buffer->bs_ino, &p32->bs_ino) ||
218 put_user(buffer->bs_gid, &p32->bs_gid) || 206 put_user(buffer->bs_mode, &p32->bs_mode) ||
219 put_user(buffer->bs_rdev, &p32->bs_rdev) || 207 put_user(buffer->bs_nlink, &p32->bs_nlink) ||
220 put_user(buffer->bs_blksize, &p32->bs_blksize) || 208 put_user(buffer->bs_uid, &p32->bs_uid) ||
221 put_user(buffer->bs_size, &p32->bs_size) || 209 put_user(buffer->bs_gid, &p32->bs_gid) ||
210 put_user(buffer->bs_rdev, &p32->bs_rdev) ||
211 put_user(buffer->bs_blksize, &p32->bs_blksize) ||
212 put_user(buffer->bs_size, &p32->bs_size) ||
222 xfs_bstime_store_compat(&p32->bs_atime, &buffer->bs_atime) || 213 xfs_bstime_store_compat(&p32->bs_atime, &buffer->bs_atime) ||
223 xfs_bstime_store_compat(&p32->bs_mtime, &buffer->bs_mtime) || 214 xfs_bstime_store_compat(&p32->bs_mtime, &buffer->bs_mtime) ||
224 xfs_bstime_store_compat(&p32->bs_ctime, &buffer->bs_ctime) || 215 xfs_bstime_store_compat(&p32->bs_ctime, &buffer->bs_ctime) ||
225 put_user(buffer->bs_blocks, &p32->bs_blocks) || 216 put_user(buffer->bs_blocks, &p32->bs_blocks) ||
226 put_user(buffer->bs_xflags, &p32->bs_xflags) || 217 put_user(buffer->bs_xflags, &p32->bs_xflags) ||
227 put_user(buffer->bs_extsize, &p32->bs_extsize) || 218 put_user(buffer->bs_extsize, &p32->bs_extsize) ||
228 put_user(buffer->bs_extents, &p32->bs_extents) || 219 put_user(buffer->bs_extents, &p32->bs_extents) ||
229 put_user(buffer->bs_gen, &p32->bs_gen) || 220 put_user(buffer->bs_gen, &p32->bs_gen) ||
230 put_user(buffer->bs_projid, &p32->bs_projid) || 221 put_user(buffer->bs_projid, &p32->bs_projid) ||
231 put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) || 222 put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) ||
232 put_user(buffer->bs_dmstate, &p32->bs_dmstate) || 223 put_user(buffer->bs_dmstate, &p32->bs_dmstate) ||
233 put_user(buffer->bs_aextents, &p32->bs_aextents)) 224 put_user(buffer->bs_aextents, &p32->bs_aextents))
234 return -EFAULT; 225 return XFS_ERROR(EFAULT);
235 return sizeof(*p32); 226 if (ubused)
227 *ubused = sizeof(*p32);
228 return 0;
236} 229}
237 230
238 231STATIC int
239 232xfs_bulkstat_one_compat(
240typedef struct compat_xfs_fsop_bulkreq { 233 xfs_mount_t *mp, /* mount point for filesystem */
241 compat_uptr_t lastip; /* last inode # pointer */ 234 xfs_ino_t ino, /* inode number to get data for */
242 __s32 icount; /* count of entries in buffer */ 235 void __user *buffer, /* buffer to place output in */
243 compat_uptr_t ubuffer; /* user buffer for inode desc. */ 236 int ubsize, /* size of buffer */
244 compat_uptr_t ocount; /* output count pointer */ 237 void *private_data, /* my private data */
245} compat_xfs_fsop_bulkreq_t; 238 xfs_daddr_t bno, /* starting bno of inode cluster */
246 239 int *ubused, /* bytes used by me */
247#define XFS_IOC_FSBULKSTAT_32 \ 240 void *dibuff, /* on-disk inode buffer */
248 _IOWR('X', 101, struct compat_xfs_fsop_bulkreq) 241 int *stat) /* BULKSTAT_RV_... */
249#define XFS_IOC_FSBULKSTAT_SINGLE_32 \ 242{
250 _IOWR('X', 102, struct compat_xfs_fsop_bulkreq) 243 return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
251#define XFS_IOC_FSINUMBERS_32 \ 244 xfs_bulkstat_one_fmt_compat, bno,
252 _IOWR('X', 103, struct compat_xfs_fsop_bulkreq) 245 ubused, dibuff, stat);
246}
253 247
254/* copied from xfs_ioctl.c */ 248/* copied from xfs_ioctl.c */
255STATIC int 249STATIC int
256xfs_ioc_bulkstat_compat( 250xfs_compat_ioc_bulkstat(
257 xfs_mount_t *mp, 251 xfs_mount_t *mp,
258 unsigned int cmd, 252 unsigned int cmd,
259 void __user *arg) 253 compat_xfs_fsop_bulkreq_t __user *p32)
260{ 254{
261 compat_xfs_fsop_bulkreq_t __user *p32 = (void __user *)arg;
262 u32 addr; 255 u32 addr;
263 xfs_fsop_bulkreq_t bulkreq; 256 xfs_fsop_bulkreq_t bulkreq;
264 int count; /* # of records returned */ 257 int count; /* # of records returned */
@@ -270,20 +263,20 @@ xfs_ioc_bulkstat_compat(
270 /* should be called again (unused here, but used in dmapi) */ 263 /* should be called again (unused here, but used in dmapi) */
271 264
272 if (!capable(CAP_SYS_ADMIN)) 265 if (!capable(CAP_SYS_ADMIN))
273 return -EPERM; 266 return -XFS_ERROR(EPERM);
274 267
275 if (XFS_FORCED_SHUTDOWN(mp)) 268 if (XFS_FORCED_SHUTDOWN(mp))
276 return -XFS_ERROR(EIO); 269 return -XFS_ERROR(EIO);
277 270
278 if (get_user(addr, &p32->lastip)) 271 if (get_user(addr, &p32->lastip))
279 return -EFAULT; 272 return -XFS_ERROR(EFAULT);
280 bulkreq.lastip = compat_ptr(addr); 273 bulkreq.lastip = compat_ptr(addr);
281 if (get_user(bulkreq.icount, &p32->icount) || 274 if (get_user(bulkreq.icount, &p32->icount) ||
282 get_user(addr, &p32->ubuffer)) 275 get_user(addr, &p32->ubuffer))
283 return -EFAULT; 276 return -XFS_ERROR(EFAULT);
284 bulkreq.ubuffer = compat_ptr(addr); 277 bulkreq.ubuffer = compat_ptr(addr);
285 if (get_user(addr, &p32->ocount)) 278 if (get_user(addr, &p32->ocount))
286 return -EFAULT; 279 return -XFS_ERROR(EFAULT);
287 bulkreq.ocount = compat_ptr(addr); 280 bulkreq.ocount = compat_ptr(addr);
288 281
289 if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64))) 282 if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
@@ -295,17 +288,22 @@ xfs_ioc_bulkstat_compat(
295 if (bulkreq.ubuffer == NULL) 288 if (bulkreq.ubuffer == NULL)
296 return -XFS_ERROR(EINVAL); 289 return -XFS_ERROR(EINVAL);
297 290
298 if (cmd == XFS_IOC_FSINUMBERS) 291 if (cmd == XFS_IOC_FSINUMBERS_32) {
299 error = xfs_inumbers(mp, &inlast, &count, 292 error = xfs_inumbers(mp, &inlast, &count,
300 bulkreq.ubuffer, xfs_inumbers_fmt_compat); 293 bulkreq.ubuffer, xfs_inumbers_fmt_compat);
301 else { 294 } else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE_32) {
302 /* declare a var to get a warning in case the type changes */ 295 int res;
303 bulkstat_one_fmt_pf formatter = xfs_bulkstat_one_fmt_compat; 296
297 error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer,
298 sizeof(compat_xfs_bstat_t),
299 NULL, 0, NULL, NULL, &res);
300 } else if (cmd == XFS_IOC_FSBULKSTAT_32) {
304 error = xfs_bulkstat(mp, &inlast, &count, 301 error = xfs_bulkstat(mp, &inlast, &count,
305 xfs_bulkstat_one, formatter, 302 xfs_bulkstat_one_compat, NULL,
306 sizeof(compat_xfs_bstat_t), bulkreq.ubuffer, 303 sizeof(compat_xfs_bstat_t), bulkreq.ubuffer,
307 BULKSTAT_FG_QUICK, &done); 304 BULKSTAT_FG_QUICK, &done);
308 } 305 } else
306 error = XFS_ERROR(EINVAL);
309 if (error) 307 if (error)
310 return -error; 308 return -error;
311 309
@@ -321,63 +319,306 @@ xfs_ioc_bulkstat_compat(
321 return 0; 319 return 0;
322} 320}
323 321
322STATIC int
323xfs_compat_handlereq_copyin(
324 xfs_fsop_handlereq_t *hreq,
325 compat_xfs_fsop_handlereq_t __user *arg32)
326{
327 compat_xfs_fsop_handlereq_t hreq32;
328
329 if (copy_from_user(&hreq32, arg32, sizeof(compat_xfs_fsop_handlereq_t)))
330 return -XFS_ERROR(EFAULT);
331
332 hreq->fd = hreq32.fd;
333 hreq->path = compat_ptr(hreq32.path);
334 hreq->oflags = hreq32.oflags;
335 hreq->ihandle = compat_ptr(hreq32.ihandle);
336 hreq->ihandlen = hreq32.ihandlen;
337 hreq->ohandle = compat_ptr(hreq32.ohandle);
338 hreq->ohandlen = compat_ptr(hreq32.ohandlen);
324 339
340 return 0;
341}
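The copyin above relies on compat_ptr(), which widens a 32-bit compat_uptr_t into a native void __user * (a plain cast on most arches; s390 additionally masks the high bit). A sketch of the recurring pattern, with a hypothetical field name:

	compat_uptr_t uptr32;
	void __user *uptr;

	if (get_user(uptr32, &arg32->some_field))	/* hypothetical field */
		return -XFS_ERROR(EFAULT);
	uptr = compat_ptr(uptr32);	/* widen to a native user pointer */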
325 342
326typedef struct compat_xfs_fsop_handlereq { 343/*
327 __u32 fd; /* fd for FD_TO_HANDLE */ 344 * Convert userspace handle data into inode.
328 compat_uptr_t path; /* user pathname */ 345 *
329 __u32 oflags; /* open flags */ 346 * We use the fact that all the fsop_handlereq ioctl calls have a data
 330 compat_uptr_t ihandle; /* user supplied handle */ 347 * structure argument whose first component is always an xfs_fsop_handlereq_t,
331 __u32 ihandlen; /* user supplied length */ 348 * so we can pass that sub structure into this handy, shared routine.
332 compat_uptr_t ohandle; /* user buffer for handle */ 349 *
333 compat_uptr_t ohandlen; /* user buffer length */ 350 * If no error, caller must always iput the returned inode.
334} compat_xfs_fsop_handlereq_t; 351 */
335 352STATIC int
336#define XFS_IOC_PATH_TO_FSHANDLE_32 \ 353xfs_vget_fsop_handlereq_compat(
337 _IOWR('X', 104, struct compat_xfs_fsop_handlereq) 354 xfs_mount_t *mp,
338#define XFS_IOC_PATH_TO_HANDLE_32 \ 355 struct inode *parinode, /* parent inode pointer */
339 _IOWR('X', 105, struct compat_xfs_fsop_handlereq) 356 compat_xfs_fsop_handlereq_t *hreq,
340#define XFS_IOC_FD_TO_HANDLE_32 \ 357 struct inode **inode)
341 _IOWR('X', 106, struct compat_xfs_fsop_handlereq)
342#define XFS_IOC_OPEN_BY_HANDLE_32 \
343 _IOWR('X', 107, struct compat_xfs_fsop_handlereq)
344#define XFS_IOC_READLINK_BY_HANDLE_32 \
345 _IOWR('X', 108, struct compat_xfs_fsop_handlereq)
346
347STATIC unsigned long xfs_ioctl32_fshandle(unsigned long arg)
348{ 358{
349 compat_xfs_fsop_handlereq_t __user *p32 = (void __user *)arg; 359 void __user *hanp;
350 xfs_fsop_handlereq_t __user *p = compat_alloc_user_space(sizeof(*p)); 360 size_t hlen;
351 u32 addr; 361 xfs_fid_t *xfid;
352 362 xfs_handle_t *handlep;
353 if (copy_in_user(&p->fd, &p32->fd, sizeof(__u32)) || 363 xfs_handle_t handle;
354 get_user(addr, &p32->path) || 364 xfs_inode_t *ip;
355 put_user(compat_ptr(addr), &p->path) || 365 xfs_ino_t ino;
356 copy_in_user(&p->oflags, &p32->oflags, sizeof(__u32)) || 366 __u32 igen;
357 get_user(addr, &p32->ihandle) || 367 int error;
358 put_user(compat_ptr(addr), &p->ihandle) || 368
359 copy_in_user(&p->ihandlen, &p32->ihandlen, sizeof(__u32)) || 369 /*
360 get_user(addr, &p32->ohandle) || 370 * Only allow handle opens under a directory.
361 put_user(compat_ptr(addr), &p->ohandle) || 371 */
362 get_user(addr, &p32->ohandlen) || 372 if (!S_ISDIR(parinode->i_mode))
363 put_user(compat_ptr(addr), &p->ohandlen)) 373 return XFS_ERROR(ENOTDIR);
364 return -EFAULT; 374
365 375 hanp = compat_ptr(hreq->ihandle);
366 return (unsigned long)p; 376 hlen = hreq->ihandlen;
377 handlep = &handle;
378
379 if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep))
380 return XFS_ERROR(EINVAL);
381 if (copy_from_user(handlep, hanp, hlen))
382 return XFS_ERROR(EFAULT);
383 if (hlen < sizeof(*handlep))
384 memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen);
385 if (hlen > sizeof(handlep->ha_fsid)) {
386 if (handlep->ha_fid.fid_len !=
387 (hlen - sizeof(handlep->ha_fsid) -
388 sizeof(handlep->ha_fid.fid_len)) ||
389 handlep->ha_fid.fid_pad)
390 return XFS_ERROR(EINVAL);
391 }
392
393 /*
394 * Crack the handle, obtain the inode # & generation #
395 */
396 xfid = (struct xfs_fid *)&handlep->ha_fid;
397 if (xfid->fid_len == sizeof(*xfid) - sizeof(xfid->fid_len)) {
398 ino = xfid->fid_ino;
399 igen = xfid->fid_gen;
400 } else {
401 return XFS_ERROR(EINVAL);
402 }
403
404 /*
405 * Get the XFS inode, building a Linux inode to go with it.
406 */
407 error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
408 if (error)
409 return error;
410 if (ip == NULL)
411 return XFS_ERROR(EIO);
412 if (ip->i_d.di_gen != igen) {
413 xfs_iput_new(ip, XFS_ILOCK_SHARED);
414 return XFS_ERROR(ENOENT);
415 }
416
417 xfs_iunlock(ip, XFS_ILOCK_SHARED);
418
419 *inode = VFS_I(ip);
420 return 0;
367} 421}
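The length validation above is easier to follow next to the handle layout being decoded; abridged from xfs_fs.h for reference (not part of this diff):

	typedef struct xfs_fid {
		__u16	fid_len;	/* length of remainder */
		__u16	fid_pad;	/* must be zero */
		__u32	fid_gen;	/* generation number */
		__u64	fid_ino;	/* 64-bit inode number */
	} xfs_fid_t;

	typedef struct xfs_handle {
		union {
			__s64		align;		/* forces 64-bit alignment */
			xfs_fsid_t	_ha_fsid;	/* unique fs identifier */
		} ha_u;
		xfs_fid_t	ha_fid;			/* fs specific file ID */
	} xfs_handle_t;
	#define ha_fsid ha_u._ha_fsid

	/* so a valid handle runs from sizeof(ha_fsid) bytes (fsid alone)
	 * up to sizeof(xfs_handle_t) (fsid plus a full fid) */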
368 422
423STATIC int
424xfs_compat_attrlist_by_handle(
425 xfs_mount_t *mp,
426 void __user *arg,
427 struct inode *parinode)
428{
429 int error;
430 attrlist_cursor_kern_t *cursor;
431 compat_xfs_fsop_attrlist_handlereq_t al_hreq;
432 struct inode *inode;
433 char *kbuf;
434
435 if (!capable(CAP_SYS_ADMIN))
436 return -XFS_ERROR(EPERM);
437 if (copy_from_user(&al_hreq, arg,
438 sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
439 return -XFS_ERROR(EFAULT);
440 if (al_hreq.buflen > XATTR_LIST_MAX)
441 return -XFS_ERROR(EINVAL);
442
443 /*
444 * Reject flags, only allow namespaces.
445 */
446 if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
447 return -XFS_ERROR(EINVAL);
448
449 error = xfs_vget_fsop_handlereq_compat(mp, parinode, &al_hreq.hreq,
450 &inode);
451 if (error)
452 goto out;
453
454 kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
455 if (!kbuf)
456 goto out_vn_rele;
457
458 cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
459 error = xfs_attr_list(XFS_I(inode), kbuf, al_hreq.buflen,
460 al_hreq.flags, cursor);
461 if (error)
462 goto out_kfree;
463
464 if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen))
465 error = -EFAULT;
466
467 out_kfree:
468 kfree(kbuf);
469 out_vn_rele:
470 iput(inode);
471 out:
472 return -error;
473}
369 474
370STATIC long 475STATIC int
371xfs_compat_ioctl( 476xfs_compat_attrmulti_by_handle(
372 int mode, 477 xfs_mount_t *mp,
373 struct file *file, 478 void __user *arg,
374 unsigned cmd, 479 struct inode *parinode)
375 unsigned long arg) 480{
481 int error;
482 compat_xfs_attr_multiop_t *ops;
483 compat_xfs_fsop_attrmulti_handlereq_t am_hreq;
484 struct inode *inode;
485 unsigned int i, size;
486 char *attr_name;
487
488 if (!capable(CAP_SYS_ADMIN))
489 return -XFS_ERROR(EPERM);
490 if (copy_from_user(&am_hreq, arg,
491 sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
492 return -XFS_ERROR(EFAULT);
493
494 error = xfs_vget_fsop_handlereq_compat(mp, parinode, &am_hreq.hreq,
495 &inode);
496 if (error)
497 goto out;
498
499 error = E2BIG;
500 size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t);
501 if (!size || size > 16 * PAGE_SIZE)
502 goto out_vn_rele;
503
504 error = ENOMEM;
505 ops = kmalloc(size, GFP_KERNEL);
506 if (!ops)
507 goto out_vn_rele;
508
509 error = EFAULT;
510 if (copy_from_user(ops, compat_ptr(am_hreq.ops), size))
511 goto out_kfree_ops;
512
513 attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
514 if (!attr_name)
515 goto out_kfree_ops;
516
517
518 error = 0;
519 for (i = 0; i < am_hreq.opcount; i++) {
520 ops[i].am_error = strncpy_from_user(attr_name,
521 compat_ptr(ops[i].am_attrname),
522 MAXNAMELEN);
523 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
524 error = -ERANGE;
525 if (ops[i].am_error < 0)
526 break;
527
528 switch (ops[i].am_opcode) {
529 case ATTR_OP_GET:
530 ops[i].am_error = xfs_attrmulti_attr_get(inode,
531 attr_name,
532 compat_ptr(ops[i].am_attrvalue),
533 &ops[i].am_length, ops[i].am_flags);
534 break;
535 case ATTR_OP_SET:
536 ops[i].am_error = xfs_attrmulti_attr_set(inode,
537 attr_name,
538 compat_ptr(ops[i].am_attrvalue),
539 ops[i].am_length, ops[i].am_flags);
540 break;
541 case ATTR_OP_REMOVE:
542 ops[i].am_error = xfs_attrmulti_attr_remove(inode,
543 attr_name, ops[i].am_flags);
544 break;
545 default:
546 ops[i].am_error = EINVAL;
547 }
548 }
549
550 if (copy_to_user(compat_ptr(am_hreq.ops), ops, size))
551 error = XFS_ERROR(EFAULT);
552
553 kfree(attr_name);
554 out_kfree_ops:
555 kfree(ops);
556 out_vn_rele:
557 iput(inode);
558 out:
559 return -error;
560}
561
562STATIC int
563xfs_compat_fssetdm_by_handle(
564 xfs_mount_t *mp,
565 void __user *arg,
566 struct inode *parinode)
567{
568 int error;
569 struct fsdmidata fsd;
570 compat_xfs_fsop_setdm_handlereq_t dmhreq;
571 struct inode *inode;
572
573 if (!capable(CAP_MKNOD))
574 return -XFS_ERROR(EPERM);
575 if (copy_from_user(&dmhreq, arg,
576 sizeof(compat_xfs_fsop_setdm_handlereq_t)))
577 return -XFS_ERROR(EFAULT);
578
579 error = xfs_vget_fsop_handlereq_compat(mp, parinode, &dmhreq.hreq,
580 &inode);
581 if (error)
582 return -error;
583
584 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) {
585 error = -XFS_ERROR(EPERM);
586 goto out;
587 }
588
589 if (copy_from_user(&fsd, compat_ptr(dmhreq.data), sizeof(fsd))) {
590 error = -XFS_ERROR(EFAULT);
591 goto out;
592 }
593
594 error = -xfs_set_dmattrs(XFS_I(inode), fsd.fsd_dmevmask,
595 fsd.fsd_dmstate);
596
597out:
598 iput(inode);
599 return error;
600}
601
602long
603xfs_file_compat_ioctl(
604 struct file *filp,
605 unsigned cmd,
606 unsigned long p)
376{ 607{
377 struct inode *inode = file->f_path.dentry->d_inode; 608 struct inode *inode = filp->f_path.dentry->d_inode;
378 int error; 609 struct xfs_inode *ip = XFS_I(inode);
610 struct xfs_mount *mp = ip->i_mount;
611 void __user *arg = (void __user *)p;
612 int ioflags = 0;
613 int error;
614
615 if (filp->f_mode & FMODE_NOCMTIME)
616 ioflags |= IO_INVIS;
617
618 xfs_itrace_entry(ip);
379 619
380 switch (cmd) { 620 switch (cmd) {
621 /* No size or alignment issues on any arch */
381 case XFS_IOC_DIOINFO: 622 case XFS_IOC_DIOINFO:
382 case XFS_IOC_FSGEOMETRY: 623 case XFS_IOC_FSGEOMETRY:
383 case XFS_IOC_FSGETXATTR: 624 case XFS_IOC_FSGETXATTR:
@@ -387,48 +628,16 @@ xfs_compat_ioctl(
387 case XFS_IOC_GETBMAP: 628 case XFS_IOC_GETBMAP:
388 case XFS_IOC_GETBMAPA: 629 case XFS_IOC_GETBMAPA:
389 case XFS_IOC_GETBMAPX: 630 case XFS_IOC_GETBMAPX:
390/* not handled
391 case XFS_IOC_FSSETDM_BY_HANDLE:
392 case XFS_IOC_ATTRLIST_BY_HANDLE:
393 case XFS_IOC_ATTRMULTI_BY_HANDLE:
394*/
395 case XFS_IOC_FSCOUNTS: 631 case XFS_IOC_FSCOUNTS:
396 case XFS_IOC_SET_RESBLKS: 632 case XFS_IOC_SET_RESBLKS:
397 case XFS_IOC_GET_RESBLKS: 633 case XFS_IOC_GET_RESBLKS:
398 case XFS_IOC_FSGROWFSDATA:
399 case XFS_IOC_FSGROWFSLOG: 634 case XFS_IOC_FSGROWFSLOG:
400 case XFS_IOC_FSGROWFSRT:
401 case XFS_IOC_FREEZE:
402 case XFS_IOC_THAW:
403 case XFS_IOC_GOINGDOWN: 635 case XFS_IOC_GOINGDOWN:
404 case XFS_IOC_ERROR_INJECTION: 636 case XFS_IOC_ERROR_INJECTION:
405 case XFS_IOC_ERROR_CLEARALL: 637 case XFS_IOC_ERROR_CLEARALL:
406 break; 638 return xfs_file_ioctl(filp, cmd, p);
407 639#ifndef BROKEN_X86_ALIGNMENT
408 case XFS_IOC32_GETXFLAGS: 640 /* These are handled fine if no alignment issues */
409 case XFS_IOC32_SETXFLAGS:
410 case XFS_IOC32_GETVERSION:
411 cmd = _NATIVE_IOC(cmd, long);
412 break;
413#ifdef BROKEN_X86_ALIGNMENT
414 /* xfs_flock_t has wrong u32 vs u64 alignment */
415 case XFS_IOC_ALLOCSP_32:
416 case XFS_IOC_FREESP_32:
417 case XFS_IOC_ALLOCSP64_32:
418 case XFS_IOC_FREESP64_32:
419 case XFS_IOC_RESVSP_32:
420 case XFS_IOC_UNRESVSP_32:
421 case XFS_IOC_RESVSP64_32:
422 case XFS_IOC_UNRESVSP64_32:
423 arg = xfs_ioctl32_flock(arg);
424 cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
425 break;
426 case XFS_IOC_FSGEOMETRY_V1_32:
427 arg = xfs_ioctl32_geom_v1(arg);
428 cmd = _NATIVE_IOC(cmd, struct xfs_fsop_geom_v1);
429 break;
430
431#else /* These are handled fine if no alignment issues */
432 case XFS_IOC_ALLOCSP: 641 case XFS_IOC_ALLOCSP:
433 case XFS_IOC_FREESP: 642 case XFS_IOC_FREESP:
434 case XFS_IOC_RESVSP: 643 case XFS_IOC_RESVSP:
@@ -438,51 +647,97 @@ xfs_compat_ioctl(
438 case XFS_IOC_RESVSP64: 647 case XFS_IOC_RESVSP64:
439 case XFS_IOC_UNRESVSP64: 648 case XFS_IOC_UNRESVSP64:
440 case XFS_IOC_FSGEOMETRY_V1: 649 case XFS_IOC_FSGEOMETRY_V1:
441 break; 650 case XFS_IOC_FSGROWFSDATA:
651 case XFS_IOC_FSGROWFSRT:
652 return xfs_file_ioctl(filp, cmd, p);
653#else
654 case XFS_IOC_ALLOCSP_32:
655 case XFS_IOC_FREESP_32:
656 case XFS_IOC_ALLOCSP64_32:
657 case XFS_IOC_FREESP64_32:
658 case XFS_IOC_RESVSP_32:
659 case XFS_IOC_UNRESVSP_32:
660 case XFS_IOC_RESVSP64_32:
661 case XFS_IOC_UNRESVSP64_32: {
662 struct xfs_flock64 bf;
442 663
443 /* xfs_bstat_t still has wrong u32 vs u64 alignment */ 664 if (xfs_compat_flock64_copyin(&bf, arg))
444 case XFS_IOC_SWAPEXT: 665 return -XFS_ERROR(EFAULT);
445 break; 666 cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
667 return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
668 }
669 case XFS_IOC_FSGEOMETRY_V1_32:
670 return xfs_compat_ioc_fsgeometry_v1(mp, arg);
671 case XFS_IOC_FSGROWFSDATA_32: {
672 struct xfs_growfs_data in;
673
674 if (xfs_compat_growfs_data_copyin(&in, arg))
675 return -XFS_ERROR(EFAULT);
676 error = xfs_growfs_data(mp, &in);
677 return -error;
678 }
679 case XFS_IOC_FSGROWFSRT_32: {
680 struct xfs_growfs_rt in;
446 681
682 if (xfs_compat_growfs_rt_copyin(&in, arg))
683 return -XFS_ERROR(EFAULT);
684 error = xfs_growfs_rt(mp, &in);
685 return -error;
686 }
447#endif 687#endif
 688 /* long changes size, but xfs only copies out 32 bits */
689 case XFS_IOC_GETXFLAGS_32:
690 case XFS_IOC_SETXFLAGS_32:
691 case XFS_IOC_GETVERSION_32:
692 cmd = _NATIVE_IOC(cmd, long);
693 return xfs_file_ioctl(filp, cmd, p);
694 case XFS_IOC_SWAPEXT: {
695 struct xfs_swapext sxp;
696 struct compat_xfs_swapext __user *sxu = arg;
697
698 /* Bulk copy in up to the sx_stat field, then copy bstat */
699 if (copy_from_user(&sxp, sxu,
700 offsetof(struct xfs_swapext, sx_stat)) ||
701 xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat))
702 return -XFS_ERROR(EFAULT);
703 error = xfs_swapext(&sxp);
704 return -error;
705 }
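The bulk copy works because every field ahead of sx_stat is a 64-bit quantity with identical placement in the native and compat layouts; only the trailing bstat needs the field-by-field translation. An illustrative compile-time check, assuming the two struct definitions (C11 _Static_assert used for brevity; the kernel proper would use BUILD_BUG_ON):

	#include <stddef.h>

	_Static_assert(offsetof(struct xfs_swapext, sx_stat) ==
		       offsetof(struct compat_xfs_swapext, sx_stat),
		       "prefixes must line up for the bulk copy_from_user");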
448 case XFS_IOC_FSBULKSTAT_32: 706 case XFS_IOC_FSBULKSTAT_32:
449 case XFS_IOC_FSBULKSTAT_SINGLE_32: 707 case XFS_IOC_FSBULKSTAT_SINGLE_32:
450 case XFS_IOC_FSINUMBERS_32: 708 case XFS_IOC_FSINUMBERS_32:
451 cmd = _NATIVE_IOC(cmd, struct xfs_fsop_bulkreq); 709 return xfs_compat_ioc_bulkstat(mp, cmd, arg);
452 return xfs_ioc_bulkstat_compat(XFS_I(inode)->i_mount,
453 cmd, (void __user*)arg);
454 case XFS_IOC_FD_TO_HANDLE_32: 710 case XFS_IOC_FD_TO_HANDLE_32:
455 case XFS_IOC_PATH_TO_HANDLE_32: 711 case XFS_IOC_PATH_TO_HANDLE_32:
456 case XFS_IOC_PATH_TO_FSHANDLE_32: 712 case XFS_IOC_PATH_TO_FSHANDLE_32: {
457 case XFS_IOC_OPEN_BY_HANDLE_32: 713 struct xfs_fsop_handlereq hreq;
458 case XFS_IOC_READLINK_BY_HANDLE_32: 714
459 arg = xfs_ioctl32_fshandle(arg); 715 if (xfs_compat_handlereq_copyin(&hreq, arg))
716 return -XFS_ERROR(EFAULT);
460 cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq); 717 cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq);
461 break; 718 return xfs_find_handle(cmd, &hreq);
462 default:
463 return -ENOIOCTLCMD;
464 } 719 }
720 case XFS_IOC_OPEN_BY_HANDLE_32: {
721 struct xfs_fsop_handlereq hreq;
465 722
466 error = xfs_ioctl(XFS_I(inode), file, mode, cmd, (void __user *)arg); 723 if (xfs_compat_handlereq_copyin(&hreq, arg))
467 xfs_iflags_set(XFS_I(inode), XFS_IMODIFIED); 724 return -XFS_ERROR(EFAULT);
468 725 return xfs_open_by_handle(mp, &hreq, filp, inode);
469 return error; 726 }
470} 727 case XFS_IOC_READLINK_BY_HANDLE_32: {
471 728 struct xfs_fsop_handlereq hreq;
472long
473xfs_file_compat_ioctl(
474 struct file *file,
475 unsigned cmd,
476 unsigned long arg)
477{
478 return xfs_compat_ioctl(0, file, cmd, arg);
479}
480 729
481long 730 if (xfs_compat_handlereq_copyin(&hreq, arg))
482xfs_file_compat_invis_ioctl( 731 return -XFS_ERROR(EFAULT);
483 struct file *file, 732 return xfs_readlink_by_handle(mp, &hreq, inode);
484 unsigned cmd, 733 }
485 unsigned long arg) 734 case XFS_IOC_ATTRLIST_BY_HANDLE_32:
486{ 735 return xfs_compat_attrlist_by_handle(mp, arg, inode);
487 return xfs_compat_ioctl(IO_INVIS, file, cmd, arg); 736 case XFS_IOC_ATTRMULTI_BY_HANDLE_32:
737 return xfs_compat_attrmulti_by_handle(mp, arg, inode);
738 case XFS_IOC_FSSETDM_BY_HANDLE_32:
739 return xfs_compat_fssetdm_by_handle(mp, arg, inode);
740 default:
741 return -XFS_ERROR(ENOIOCTLCMD);
742 }
488} 743}
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h
index 02de6e62ee37..1024c4f8ba0d 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.h
@@ -18,7 +18,217 @@
18#ifndef __XFS_IOCTL32_H__ 18#ifndef __XFS_IOCTL32_H__
19#define __XFS_IOCTL32_H__ 19#define __XFS_IOCTL32_H__
20 20
21extern long xfs_file_compat_ioctl(struct file *, unsigned, unsigned long); 21#include <linux/compat.h>
22extern long xfs_file_compat_invis_ioctl(struct file *, unsigned, unsigned long); 22
23/*
24 * on 32-bit arches, ioctl argument structures may have different sizes
25 * and/or alignment. We define compat structures which match the
26 * 32-bit sizes/alignments here, and their associated ioctl numbers.
27 *
28 * xfs_ioctl32.c contains routines to copy these structures in and out.
29 */
30
31/* stock kernel-level ioctls we support */
32#define XFS_IOC_GETXFLAGS_32 FS_IOC32_GETFLAGS
33#define XFS_IOC_SETXFLAGS_32 FS_IOC32_SETFLAGS
34#define XFS_IOC_GETVERSION_32 FS_IOC32_GETVERSION
35
36/*
37 * On intel, even if sizes match, alignment and/or padding may differ.
38 */
39#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
40#define BROKEN_X86_ALIGNMENT
41#define __compat_packed __attribute__((packed))
42#else
43#define __compat_packed
44#endif
45
46typedef struct compat_xfs_bstime {
47 compat_time_t tv_sec; /* seconds */
48 __s32 tv_nsec; /* and nanoseconds */
49} compat_xfs_bstime_t;
50
51typedef struct compat_xfs_bstat {
52 __u64 bs_ino; /* inode number */
53 __u16 bs_mode; /* type and mode */
54 __u16 bs_nlink; /* number of links */
55 __u32 bs_uid; /* user id */
56 __u32 bs_gid; /* group id */
57 __u32 bs_rdev; /* device value */
58 __s32 bs_blksize; /* block size */
59 __s64 bs_size; /* file size */
60 compat_xfs_bstime_t bs_atime; /* access time */
61 compat_xfs_bstime_t bs_mtime; /* modify time */
62 compat_xfs_bstime_t bs_ctime; /* inode change time */
63 int64_t bs_blocks; /* number of blocks */
64 __u32 bs_xflags; /* extended flags */
65 __s32 bs_extsize; /* extent size */
66 __s32 bs_extents; /* number of extents */
67 __u32 bs_gen; /* generation count */
68 __u16 bs_projid; /* project id */
69 unsigned char bs_pad[14]; /* pad space, unused */
70 __u32 bs_dmevmask; /* DMIG event mask */
71 __u16 bs_dmstate; /* DMIG state info */
72 __u16 bs_aextents; /* attribute number of extents */
73} __compat_packed compat_xfs_bstat_t;
74
75typedef struct compat_xfs_fsop_bulkreq {
76 compat_uptr_t lastip; /* last inode # pointer */
77 __s32 icount; /* count of entries in buffer */
78 compat_uptr_t ubuffer; /* user buffer for inode desc. */
79 compat_uptr_t ocount; /* output count pointer */
80} compat_xfs_fsop_bulkreq_t;
81
82#define XFS_IOC_FSBULKSTAT_32 \
83 _IOWR('X', 101, struct compat_xfs_fsop_bulkreq)
84#define XFS_IOC_FSBULKSTAT_SINGLE_32 \
85 _IOWR('X', 102, struct compat_xfs_fsop_bulkreq)
86#define XFS_IOC_FSINUMBERS_32 \
87 _IOWR('X', 103, struct compat_xfs_fsop_bulkreq)
88
89typedef struct compat_xfs_fsop_handlereq {
90 __u32 fd; /* fd for FD_TO_HANDLE */
91 compat_uptr_t path; /* user pathname */
92 __u32 oflags; /* open flags */
93 compat_uptr_t ihandle; /* user supplied handle */
94 __u32 ihandlen; /* user supplied length */
95 compat_uptr_t ohandle; /* user buffer for handle */
96 compat_uptr_t ohandlen; /* user buffer length */
97} compat_xfs_fsop_handlereq_t;
98
99#define XFS_IOC_PATH_TO_FSHANDLE_32 \
100 _IOWR('X', 104, struct compat_xfs_fsop_handlereq)
101#define XFS_IOC_PATH_TO_HANDLE_32 \
102 _IOWR('X', 105, struct compat_xfs_fsop_handlereq)
103#define XFS_IOC_FD_TO_HANDLE_32 \
104 _IOWR('X', 106, struct compat_xfs_fsop_handlereq)
105#define XFS_IOC_OPEN_BY_HANDLE_32 \
106 _IOWR('X', 107, struct compat_xfs_fsop_handlereq)
107#define XFS_IOC_READLINK_BY_HANDLE_32 \
108 _IOWR('X', 108, struct compat_xfs_fsop_handlereq)
109
110/* The bstat field in the swapext struct needs translation */
111typedef struct compat_xfs_swapext {
112 __int64_t sx_version; /* version */
113 __int64_t sx_fdtarget; /* fd of target file */
114 __int64_t sx_fdtmp; /* fd of tmp file */
115 xfs_off_t sx_offset; /* offset into file */
 116 xfs_off_t sx_length; /* length from offset */
117 char sx_pad[16]; /* pad space, unused */
 118 compat_xfs_bstat_t sx_stat; /* stat of target before copy */
119} __compat_packed compat_xfs_swapext_t;
120
121#define XFS_IOC_SWAPEXT_32 _IOWR('X', 109, struct compat_xfs_swapext)
122
123typedef struct compat_xfs_fsop_attrlist_handlereq {
124 struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */
125 struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */
126 __u32 flags; /* which namespace to use */
127 __u32 buflen; /* length of buffer supplied */
128 compat_uptr_t buffer; /* returned names */
129} __compat_packed compat_xfs_fsop_attrlist_handlereq_t;
130
131/* Note: actually this is read/write */
132#define XFS_IOC_ATTRLIST_BY_HANDLE_32 \
133 _IOW('X', 122, struct compat_xfs_fsop_attrlist_handlereq)
134
135/* am_opcodes defined in xfs_fs.h */
136typedef struct compat_xfs_attr_multiop {
137 __u32 am_opcode;
138 __s32 am_error;
139 compat_uptr_t am_attrname;
140 compat_uptr_t am_attrvalue;
141 __u32 am_length;
142 __u32 am_flags;
143} compat_xfs_attr_multiop_t;
144
145typedef struct compat_xfs_fsop_attrmulti_handlereq {
146 struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */
 147 __u32 opcount; /* count of following multiop */
148 /* ptr to compat_xfs_attr_multiop */
149 compat_uptr_t ops; /* attr_multi data */
150} compat_xfs_fsop_attrmulti_handlereq_t;
151
152#define XFS_IOC_ATTRMULTI_BY_HANDLE_32 \
153 _IOW('X', 123, struct compat_xfs_fsop_attrmulti_handlereq)
154
155typedef struct compat_xfs_fsop_setdm_handlereq {
156 struct compat_xfs_fsop_handlereq hreq; /* handle information */
157 /* ptr to struct fsdmidata */
158 compat_uptr_t data; /* DMAPI data */
159} compat_xfs_fsop_setdm_handlereq_t;
160
161#define XFS_IOC_FSSETDM_BY_HANDLE_32 \
162 _IOW('X', 121, struct compat_xfs_fsop_setdm_handlereq)
163
164#ifdef BROKEN_X86_ALIGNMENT
165/* on ia32 l_start is on a 32-bit boundary */
166typedef struct compat_xfs_flock64 {
167 __s16 l_type;
168 __s16 l_whence;
169 __s64 l_start __attribute__((packed));
170 /* len == 0 means until end of file */
171 __s64 l_len __attribute__((packed));
172 __s32 l_sysid;
173 __u32 l_pid;
174 __s32 l_pad[4]; /* reserve area */
175} compat_xfs_flock64_t;
176
177#define XFS_IOC_ALLOCSP_32 _IOW('X', 10, struct compat_xfs_flock64)
178#define XFS_IOC_FREESP_32 _IOW('X', 11, struct compat_xfs_flock64)
179#define XFS_IOC_ALLOCSP64_32 _IOW('X', 36, struct compat_xfs_flock64)
180#define XFS_IOC_FREESP64_32 _IOW('X', 37, struct compat_xfs_flock64)
181#define XFS_IOC_RESVSP_32 _IOW('X', 40, struct compat_xfs_flock64)
182#define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64)
183#define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64)
184#define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64)
185
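Concretely, ia32 gives __s64 only 4-byte alignment, so l_start sits at offset 4 in the layout above, while the native x86_64 struct pads it out to offset 8 and ends up 4 bytes larger overall. An illustrative check, assuming both definitions are in scope (C11 _Static_assert for brevity):

	#include <stddef.h>

	_Static_assert(offsetof(struct compat_xfs_flock64, l_start) == 4,
		       "ia32: l_start packed straight after l_whence");
	/* versus offsetof(struct xfs_flock64, l_start) == 8 natively, where
	 * 4 bytes of padding follow the two __s16 fields */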
186typedef struct compat_xfs_fsop_geom_v1 {
187 __u32 blocksize; /* filesystem (data) block size */
188 __u32 rtextsize; /* realtime extent size */
189 __u32 agblocks; /* fsblocks in an AG */
190 __u32 agcount; /* number of allocation groups */
191 __u32 logblocks; /* fsblocks in the log */
192 __u32 sectsize; /* (data) sector size, bytes */
193 __u32 inodesize; /* inode size in bytes */
194 __u32 imaxpct; /* max allowed inode space(%) */
195 __u64 datablocks; /* fsblocks in data subvolume */
196 __u64 rtblocks; /* fsblocks in realtime subvol */
197 __u64 rtextents; /* rt extents in realtime subvol*/
198 __u64 logstart; /* starting fsblock of the log */
199 unsigned char uuid[16]; /* unique id of the filesystem */
200 __u32 sunit; /* stripe unit, fsblocks */
201 __u32 swidth; /* stripe width, fsblocks */
202 __s32 version; /* structure version */
203 __u32 flags; /* superblock version flags */
204 __u32 logsectsize; /* log sector size, bytes */
205 __u32 rtsectsize; /* realtime sector size, bytes */
206 __u32 dirblocksize; /* directory block size, bytes */
207} __attribute__((packed)) compat_xfs_fsop_geom_v1_t;
208
209#define XFS_IOC_FSGEOMETRY_V1_32 \
210 _IOR('X', 100, struct compat_xfs_fsop_geom_v1)
211
212typedef struct compat_xfs_inogrp {
213 __u64 xi_startino; /* starting inode number */
214 __s32 xi_alloccount; /* # bits set in allocmask */
215 __u64 xi_allocmask; /* mask of allocated inodes */
216} __attribute__((packed)) compat_xfs_inogrp_t;
217
218/* These growfs input structures have padding on the end, so must translate */
219typedef struct compat_xfs_growfs_data {
220 __u64 newblocks; /* new data subvol size, fsblocks */
221 __u32 imaxpct; /* new inode space percentage limit */
222} __attribute__((packed)) compat_xfs_growfs_data_t;
223
224typedef struct compat_xfs_growfs_rt {
225 __u64 newblocks; /* new realtime size, fsblocks */
226 __u32 extsize; /* new realtime extent size, fsblocks */
227} __attribute__((packed)) compat_xfs_growfs_rt_t;
228
229#define XFS_IOC_FSGROWFSDATA_32 _IOW('X', 110, struct compat_xfs_growfs_data)
230#define XFS_IOC_FSGROWFSRT_32 _IOW('X', 112, struct compat_xfs_growfs_rt)
231
232#endif /* BROKEN_X86_ALIGNMENT */
23 233
24#endif /* __XFS_IOCTL32_H__ */ 234#endif /* __XFS_IOCTL32_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 095d271f3434..7aa53fefc67f 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -53,6 +53,7 @@
53#include <linux/namei.h> 53#include <linux/namei.h>
54#include <linux/security.h> 54#include <linux/security.h>
55#include <linux/falloc.h> 55#include <linux/falloc.h>
56#include <linux/fiemap.h>
56 57
57/* 58/*
58 * Bring the atime in the XFS inode uptodate. 59 * Bring the atime in the XFS inode uptodate.
@@ -64,14 +65,14 @@ xfs_synchronize_atime(
64{ 65{
65 struct inode *inode = VFS_I(ip); 66 struct inode *inode = VFS_I(ip);
66 67
67 if (inode) { 68 if (!(inode->i_state & I_CLEAR)) {
68 ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; 69 ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
69 ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; 70 ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
70 } 71 }
71} 72}
72 73
73/* 74/*
74 * If the linux inode exists, mark it dirty. 75 * If the linux inode is valid, mark it dirty.
 75 * Used when committing a dirty inode into a transaction so that 76 * Used when committing a dirty inode into a transaction so that
76 * the inode will get written back by the linux code 77 * the inode will get written back by the linux code
77 */ 78 */
@@ -81,7 +82,7 @@ xfs_mark_inode_dirty_sync(
81{ 82{
82 struct inode *inode = VFS_I(ip); 83 struct inode *inode = VFS_I(ip);
83 84
84 if (inode) 85 if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR)))
85 mark_inode_dirty_sync(inode); 86 mark_inode_dirty_sync(inode);
86} 87}
87 88
@@ -128,7 +129,7 @@ xfs_ichgtime(
128 if (sync_it) { 129 if (sync_it) {
129 SYNCHRONIZE(); 130 SYNCHRONIZE();
130 ip->i_update_core = 1; 131 ip->i_update_core = 1;
131 mark_inode_dirty_sync(inode); 132 xfs_mark_inode_dirty_sync(ip);
132 } 133 }
133} 134}
134 135
@@ -158,8 +159,6 @@ xfs_init_security(
158 } 159 }
159 160
160 error = xfs_attr_set(ip, name, value, length, ATTR_SECURE); 161 error = xfs_attr_set(ip, name, value, length, ATTR_SECURE);
161 if (!error)
162 xfs_iflags_set(ip, XFS_IMODIFIED);
163 162
164 kfree(name); 163 kfree(name);
165 kfree(value); 164 kfree(value);
@@ -260,7 +259,6 @@ xfs_vn_mknod(
260 error = _ACL_INHERIT(inode, mode, default_acl); 259 error = _ACL_INHERIT(inode, mode, default_acl);
261 if (unlikely(error)) 260 if (unlikely(error))
262 goto out_cleanup_inode; 261 goto out_cleanup_inode;
263 xfs_iflags_set(ip, XFS_IMODIFIED);
264 _ACL_FREE(default_acl); 262 _ACL_FREE(default_acl);
265 } 263 }
266 264
@@ -366,21 +364,17 @@ xfs_vn_link(
366 struct inode *dir, 364 struct inode *dir,
367 struct dentry *dentry) 365 struct dentry *dentry)
368{ 366{
369 struct inode *inode; /* inode of guy being linked to */ 367 struct inode *inode = old_dentry->d_inode;
370 struct xfs_name name; 368 struct xfs_name name;
371 int error; 369 int error;
372 370
373 inode = old_dentry->d_inode;
374 xfs_dentry_to_name(&name, dentry); 371 xfs_dentry_to_name(&name, dentry);
375 372
376 igrab(inode);
377 error = xfs_link(XFS_I(dir), XFS_I(inode), &name); 373 error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
378 if (unlikely(error)) { 374 if (unlikely(error))
379 iput(inode);
380 return -error; 375 return -error;
381 }
382 376
383 xfs_iflags_set(XFS_I(dir), XFS_IMODIFIED); 377 atomic_inc(&inode->i_count);
384 d_instantiate(dentry, inode); 378 d_instantiate(dentry, inode);
385 return 0; 379 return 0;
386} 380}
@@ -601,7 +595,7 @@ xfs_vn_setattr(
601 struct dentry *dentry, 595 struct dentry *dentry,
602 struct iattr *iattr) 596 struct iattr *iattr)
603{ 597{
604 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0, NULL); 598 return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
605} 599}
606 600
607/* 601/*
@@ -642,7 +636,7 @@ xfs_vn_fallocate(
642 636
643 xfs_ilock(ip, XFS_IOLOCK_EXCL); 637 xfs_ilock(ip, XFS_IOLOCK_EXCL);
644 error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, 638 error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
645 0, NULL, XFS_ATTR_NOLOCK); 639 0, XFS_ATTR_NOLOCK);
646 if (!error && !(mode & FALLOC_FL_KEEP_SIZE) && 640 if (!error && !(mode & FALLOC_FL_KEEP_SIZE) &&
647 offset + len > i_size_read(inode)) 641 offset + len > i_size_read(inode))
648 new_size = offset + len; 642 new_size = offset + len;
@@ -653,7 +647,7 @@ xfs_vn_fallocate(
653 647
654 iattr.ia_valid = ATTR_SIZE; 648 iattr.ia_valid = ATTR_SIZE;
655 iattr.ia_size = new_size; 649 iattr.ia_size = new_size;
656 error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK, NULL); 650 error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
657 } 651 }
658 652
659 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 653 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
@@ -661,6 +655,88 @@ out_error:
661 return error; 655 return error;
662} 656}
663 657
658#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
659
660/*
661 * Call fiemap helper to fill in user data.
662 * Returns positive errors to xfs_getbmap.
663 */
664STATIC int
665xfs_fiemap_format(
666 void **arg,
667 struct getbmapx *bmv,
668 int *full)
669{
670 int error;
671 struct fiemap_extent_info *fieinfo = *arg;
672 u32 fiemap_flags = 0;
673 u64 logical, physical, length;
674
675 /* Do nothing for a hole */
676 if (bmv->bmv_block == -1LL)
677 return 0;
678
679 logical = BBTOB(bmv->bmv_offset);
680 physical = BBTOB(bmv->bmv_block);
681 length = BBTOB(bmv->bmv_length);
682
683 if (bmv->bmv_oflags & BMV_OF_PREALLOC)
684 fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
685 else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
686 fiemap_flags |= FIEMAP_EXTENT_DELALLOC;
687 physical = 0; /* no block yet */
688 }
689 if (bmv->bmv_oflags & BMV_OF_LAST)
690 fiemap_flags |= FIEMAP_EXTENT_LAST;
691
692 error = fiemap_fill_next_extent(fieinfo, logical, physical,
693 length, fiemap_flags);
694 if (error > 0) {
695 error = 0;
696 *full = 1; /* user array now full */
697 }
698
699 return -error;
700}
701
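The formatter above converts units: getbmapx speaks 512-byte basic blocks, fiemap speaks bytes, and BBTOB()/BTOBB() are shift-by-nine conversions. A quick worked example with made-up numbers:

	/* hypothetical extent 16 basic blocks into the file, 8 blocks long */
	u64 logical = (u64)16 << 9;	/* BBTOB(16) == 8192 bytes */
	u64 length = (u64)8 << 9;	/* BBTOB(8) == 4096 bytes */
	/* and back again: BTOBB(8192) == (8192 + 511) >> 9 == 16 blocks */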
702STATIC int
703xfs_vn_fiemap(
704 struct inode *inode,
705 struct fiemap_extent_info *fieinfo,
706 u64 start,
707 u64 length)
708{
709 xfs_inode_t *ip = XFS_I(inode);
710 struct getbmapx bm;
711 int error;
712
713 error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS);
714 if (error)
715 return error;
716
717 /* Set up bmap header for xfs internal routine */
718 bm.bmv_offset = BTOBB(start);
719 /* Special case for whole file */
720 if (length == FIEMAP_MAX_OFFSET)
721 bm.bmv_length = -1LL;
722 else
723 bm.bmv_length = BTOBB(length);
724
 725 /* Our formatter will tell xfs_getbmap when to stop. */
726 bm.bmv_count = MAXEXTNUM;
727 bm.bmv_iflags = BMV_IF_PREALLOC;
728 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
729 bm.bmv_iflags |= BMV_IF_ATTRFORK;
730 if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
731 bm.bmv_iflags |= BMV_IF_DELALLOC;
732
733 error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo);
734 if (error)
735 return -error;
736
737 return 0;
738}
739
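With the .fiemap hook wired into xfs_inode_operations below, userspace can map XFS extents through the generic FS_IOC_FIEMAP ioctl. A self-contained sketch (hypothetical path, minimal error handling):

	#include <stdio.h>
	#include <stdlib.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>
	#include <linux/fiemap.h>

	int main(void)
	{
		struct fiemap *fm;
		int fd = open("/mnt/xfs/somefile", O_RDONLY);	/* hypothetical */

		if (fd < 0)
			return 1;
		/* room for the header plus one returned extent */
		fm = calloc(1, sizeof(*fm) + sizeof(struct fiemap_extent));
		fm->fm_length = FIEMAP_MAX_OFFSET;	/* map the whole file */
		fm->fm_extent_count = 1;
		if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0 && fm->fm_mapped_extents)
			printf("logical %llu physical %llu length %llu\n",
			       (unsigned long long)fm->fm_extents[0].fe_logical,
			       (unsigned long long)fm->fm_extents[0].fe_physical,
			       (unsigned long long)fm->fm_extents[0].fe_length);
		free(fm);
		close(fd);
		return 0;
	}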
664static const struct inode_operations xfs_inode_operations = { 740static const struct inode_operations xfs_inode_operations = {
665 .permission = xfs_vn_permission, 741 .permission = xfs_vn_permission,
666 .truncate = xfs_vn_truncate, 742 .truncate = xfs_vn_truncate,
@@ -671,6 +747,7 @@ static const struct inode_operations xfs_inode_operations = {
671 .removexattr = generic_removexattr, 747 .removexattr = generic_removexattr,
672 .listxattr = xfs_vn_listxattr, 748 .listxattr = xfs_vn_listxattr,
673 .fallocate = xfs_vn_fallocate, 749 .fallocate = xfs_vn_fallocate,
750 .fiemap = xfs_vn_fiemap,
674}; 751};
675 752
676static const struct inode_operations xfs_dir_inode_operations = { 753static const struct inode_operations xfs_dir_inode_operations = {
@@ -766,12 +843,20 @@ xfs_diflags_to_iflags(
766 * When reading existing inodes from disk this is called directly 843 * When reading existing inodes from disk this is called directly
767 * from xfs_iget, when creating a new inode it is called from 844 * from xfs_iget, when creating a new inode it is called from
768 * xfs_ialloc after setting up the inode. 845 * xfs_ialloc after setting up the inode.
846 *
847 * We are always called with an uninitialised linux inode here.
848 * We need to initialise the necessary fields and take a reference
849 * on it.
769 */ 850 */
770void 851void
771xfs_setup_inode( 852xfs_setup_inode(
772 struct xfs_inode *ip) 853 struct xfs_inode *ip)
773{ 854{
774 struct inode *inode = ip->i_vnode; 855 struct inode *inode = &ip->i_vnode;
856
857 inode->i_ino = ip->i_ino;
858 inode->i_state = I_NEW|I_LOCK;
859 inode_add_to_lists(ip->i_mount->m_super, inode);
775 860
776 inode->i_mode = ip->i_d.di_mode; 861 inode->i_mode = ip->i_d.di_mode;
777 inode->i_nlink = ip->i_d.di_nlink; 862 inode->i_nlink = ip->i_d.di_nlink;
@@ -799,7 +884,6 @@ xfs_setup_inode(
799 inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; 884 inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
800 inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; 885 inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
801 xfs_diflags_to_iflags(inode, ip); 886 xfs_diflags_to_iflags(inode, ip);
802 xfs_iflags_clear(ip, XFS_IMODIFIED);
803 887
804 switch (inode->i_mode & S_IFMT) { 888 switch (inode->i_mode & S_IFMT) {
805 case S_IFREG: 889 case S_IFREG:
diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/linux-2.6/xfs_iops.h
index 8b1a1e31dc21..ef41c92ce66e 100644
--- a/fs/xfs/linux-2.6/xfs_iops.h
+++ b/fs/xfs/linux-2.6/xfs_iops.h
@@ -22,7 +22,6 @@ struct xfs_inode;
22 22
23extern const struct file_operations xfs_file_operations; 23extern const struct file_operations xfs_file_operations;
24extern const struct file_operations xfs_dir_file_operations; 24extern const struct file_operations xfs_dir_file_operations;
25extern const struct file_operations xfs_invis_file_operations;
26 25
27extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); 26extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
28 27
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index cc0f7b3a9795..507492d6dccd 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -21,18 +21,12 @@
21#include <linux/types.h> 21#include <linux/types.h>
22 22
23/* 23/*
24 * Some types are conditional depending on the target system.
25 * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits. 24 * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits.
26 * XFS_BIG_INUMS needs the VFS inode number to be 64 bits, as well 25 * XFS_BIG_INUMS requires XFS_BIG_BLKNOS to be set.
27 * as requiring XFS_BIG_BLKNOS to be set.
28 */ 26 */
29#if defined(CONFIG_LBD) || (BITS_PER_LONG == 64) 27#if defined(CONFIG_LBD) || (BITS_PER_LONG == 64)
30# define XFS_BIG_BLKNOS 1 28# define XFS_BIG_BLKNOS 1
31# if BITS_PER_LONG == 64 29# define XFS_BIG_INUMS 1
32# define XFS_BIG_INUMS 1
33# else
34# define XFS_BIG_INUMS 0
35# endif
36#else 30#else
37# define XFS_BIG_BLKNOS 0 31# define XFS_BIG_BLKNOS 0
38# define XFS_BIG_INUMS 0 32# define XFS_BIG_INUMS 0
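
Collapsing the nested conditional changes exactly one configuration: a 32-bit
kernel with CONFIG_LBD previously got XFS_BIG_INUMS 0 and now gets 1. The
full truth table, derived from the old and new versions:

    BITS_PER_LONG   CONFIG_LBD   XFS_BIG_BLKNOS   XFS_BIG_INUMS (old -> new)
    64              any          1                1 -> 1
    32              y            1                0 -> 1
    32              n            0                0 -> 0
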
@@ -77,6 +71,7 @@
77#include <linux/spinlock.h> 71#include <linux/spinlock.h>
78#include <linux/random.h> 72#include <linux/random.h>
79#include <linux/ctype.h> 73#include <linux/ctype.h>
74#include <linux/writeback.h>
80 75
81#include <asm/page.h> 76#include <asm/page.h>
82#include <asm/div64.h> 77#include <asm/div64.h>
@@ -85,7 +80,6 @@
85#include <asm/byteorder.h> 80#include <asm/byteorder.h>
86#include <asm/unaligned.h> 81#include <asm/unaligned.h>
87 82
88#include <xfs_vfs.h>
89#include <xfs_cred.h> 83#include <xfs_cred.h>
90#include <xfs_vnode.h> 84#include <xfs_vnode.h>
91#include <xfs_stats.h> 85#include <xfs_stats.h>
@@ -107,7 +101,6 @@
107#undef HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ 101#undef HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
108#endif 102#endif
109 103
110#define restricted_chown xfs_params.restrict_chown.val
111#define irix_sgid_inherit xfs_params.sgid_inherit.val 104#define irix_sgid_inherit xfs_params.sgid_inherit.val
112#define irix_symlink_mode xfs_params.symlink_mode.val 105#define irix_symlink_mode xfs_params.symlink_mode.val
113#define xfs_panic_mask xfs_params.panic_mask.val 106#define xfs_panic_mask xfs_params.panic_mask.val
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 1957e5357d04..7e90daa0d1d1 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -51,7 +51,6 @@
51#include "xfs_vnodeops.h" 51#include "xfs_vnodeops.h"
52 52
53#include <linux/capability.h> 53#include <linux/capability.h>
54#include <linux/mount.h>
55#include <linux/writeback.h> 54#include <linux/writeback.h>
56 55
57 56
@@ -243,7 +242,7 @@ xfs_read(
243 242
244 if (unlikely(ioflags & IO_ISDIRECT)) { 243 if (unlikely(ioflags & IO_ISDIRECT)) {
245 if (inode->i_mapping->nrpages) 244 if (inode->i_mapping->nrpages)
246 ret = xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK), 245 ret = -xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK),
247 -1, FI_REMAPF_LOCKED); 246 -1, FI_REMAPF_LOCKED);
248 mutex_unlock(&inode->i_mutex); 247 mutex_unlock(&inode->i_mutex);
249 if (ret) { 248 if (ret) {
@@ -668,15 +667,8 @@ start:
668 if (new_size > xip->i_size) 667 if (new_size > xip->i_size)
669 xip->i_new_size = new_size; 668 xip->i_new_size = new_size;
670 669
671 /* 670 if (likely(!(ioflags & IO_INVIS)))
672 * We're not supposed to change timestamps in readonly-mounted
673 * filesystems. Throw it away if anyone asks us.
674 */
675 if (likely(!(ioflags & IO_INVIS) &&
676 !mnt_want_write(file->f_path.mnt))) {
677 xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 671 xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
678 mnt_drop_write(file->f_path.mnt);
679 }
680 672
681 /* 673 /*
682 * If the offset is beyond the size of the file, we have a couple 674 * If the offset is beyond the size of the file, we have a couple
@@ -715,7 +707,6 @@ start:
715 } 707 }
716 } 708 }
717 709
718retry:
719 /* We can write back this queue in page reclaim */ 710 /* We can write back this queue in page reclaim */
720 current->backing_dev_info = mapping->backing_dev_info; 711 current->backing_dev_info = mapping->backing_dev_info;
721 712
@@ -771,6 +762,17 @@ retry:
771 if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO)) 762 if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
772 ret = wait_on_sync_kiocb(iocb); 763 ret = wait_on_sync_kiocb(iocb);
773 764
765 isize = i_size_read(inode);
766 if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
767 *offset = isize;
768
769 if (*offset > xip->i_size) {
770 xfs_ilock(xip, XFS_ILOCK_EXCL);
771 if (*offset > xip->i_size)
772 xip->i_size = *offset;
773 xfs_iunlock(xip, XFS_ILOCK_EXCL);
774 }
775
774 if (ret == -ENOSPC && 776 if (ret == -ENOSPC &&
775 DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { 777 DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
776 xfs_iunlock(xip, iolock); 778 xfs_iunlock(xip, iolock);
@@ -784,20 +786,7 @@ retry:
784 xfs_ilock(xip, iolock); 786 xfs_ilock(xip, iolock);
785 if (error) 787 if (error)
786 goto out_unlock_internal; 788 goto out_unlock_internal;
787 pos = xip->i_size; 789 goto start;
788 ret = 0;
789 goto retry;
790 }
791
792 isize = i_size_read(inode);
793 if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
794 *offset = isize;
795
796 if (*offset > xip->i_size) {
797 xfs_ilock(xip, XFS_ILOCK_EXCL);
798 if (*offset > xip->i_size)
799 xip->i_size = *offset;
800 xfs_iunlock(xip, XFS_ILOCK_EXCL);
801 } 790 }
802 791
803 error = -ret; 792 error = -ret;
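
The i_size update hoisted out of the retry loop above is the classic
double-checked pattern: an unlocked test covers the common case where the
file was not extended, and the test is repeated under XFS_ILOCK_EXCL because
another writer may have raced in between. A self-contained sketch, with a
spinlock standing in for the XFS inode lock:

    #include <linux/spinlock.h>
    #include <linux/types.h>

    struct my_inode {
            spinlock_t      i_lock;
            loff_t          i_size;
    };

    static void update_isize(struct my_inode *ip, loff_t offset)
    {
            if (offset > ip->i_size) {              /* cheap, racy check */
                    spin_lock(&ip->i_lock);
                    if (offset > ip->i_size)        /* recheck under the lock */
                            ip->i_size = offset;
                    spin_unlock(&ip->i_lock);
            }
    }
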
@@ -855,13 +844,7 @@ retry:
855int 844int
856xfs_bdstrat_cb(struct xfs_buf *bp) 845xfs_bdstrat_cb(struct xfs_buf *bp)
857{ 846{
858 xfs_mount_t *mp; 847 if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
859
860 mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
861 if (!XFS_FORCED_SHUTDOWN(mp)) {
862 xfs_buf_iorequest(bp);
863 return 0;
864 } else {
865 xfs_buftrace("XFS__BDSTRAT IOERROR", bp); 848 xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
866 /* 849 /*
867 * Metadata write that didn't get logged but 850 * Metadata write that didn't get logged but
@@ -874,6 +857,9 @@ xfs_bdstrat_cb(struct xfs_buf *bp)
874 else 857 else
875 return (xfs_bioerror(bp)); 858 return (xfs_bioerror(bp));
876 } 859 }
860
861 xfs_buf_iorequest(bp);
862 return 0;
877} 863}
878 864
879/* 865/*
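
The xfs_bdstrat_cb() rewrite inverts the test so the shutdown error path is
handled and returned first, which drops the xfs_mount_t local and leaves the
common path unindented. The shape of the transformation, with placeholder
names:

    /* before: common path nested inside if/else
     * after:  unusual case first, early return */
    static int submit(struct buf *bp)
    {
            if (shut_down(bp))              /* error path handled up front */
                    return error_out(bp);

            io_request(bp);                 /* common path, no indentation */
            return 0;
    }
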
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c
index 3d5b67c075c7..c3526d445f6a 100644
--- a/fs/xfs/linux-2.6/xfs_stats.c
+++ b/fs/xfs/linux-2.6/xfs_stats.c
@@ -53,11 +53,15 @@ xfs_read_xfsstats(
53 { "icluster", XFSSTAT_END_INODE_CLUSTER }, 53 { "icluster", XFSSTAT_END_INODE_CLUSTER },
54 { "vnodes", XFSSTAT_END_VNODE_OPS }, 54 { "vnodes", XFSSTAT_END_VNODE_OPS },
55 { "buf", XFSSTAT_END_BUF }, 55 { "buf", XFSSTAT_END_BUF },
56 { "abtb2", XFSSTAT_END_ABTB_V2 },
57 { "abtc2", XFSSTAT_END_ABTC_V2 },
58 { "bmbt2", XFSSTAT_END_BMBT_V2 },
59 { "ibt2", XFSSTAT_END_IBT_V2 },
56 }; 60 };
57 61
58 /* Loop over all stats groups */ 62 /* Loop over all stats groups */
59 for (i=j=len = 0; i < ARRAY_SIZE(xstats); i++) { 63 for (i=j=len = 0; i < ARRAY_SIZE(xstats); i++) {
60 len += sprintf(buffer + len, xstats[i].desc); 64 len += sprintf(buffer + len, "%s", xstats[i].desc);
61 /* inner loop does each group */ 65 /* inner loop does each group */
62 while (j < xstats[i].endpoint) { 66 while (j < xstats[i].endpoint) {
63 val = 0; 67 val = 0;
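
The sprintf() change in this hunk is a format-string fix: passing
xstats[i].desc directly means any '%' in a description would be parsed as a
conversion specifier and read a nonexistent argument. The descriptions here
are static, but the "%s" form is the safe idiom (and silences
-Wformat-security). A userspace illustration:

    #include <stdio.h>

    int main(void)
    {
            char buf[64];
            const char *desc = "done 100%";

            /* sprintf(buf, desc);        wrong: desc parsed as a format */
            sprintf(buf, "%s", desc);  /* right: copied verbatim */
            printf("%s\n", buf);
            return 0;
    }
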
diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h
index e83820febc9f..736854b1ca1a 100644
--- a/fs/xfs/linux-2.6/xfs_stats.h
+++ b/fs/xfs/linux-2.6/xfs_stats.h
@@ -118,6 +118,71 @@ struct xfsstats {
118 __uint32_t xb_page_retries; 118 __uint32_t xb_page_retries;
119 __uint32_t xb_page_found; 119 __uint32_t xb_page_found;
120 __uint32_t xb_get_read; 120 __uint32_t xb_get_read;
121/* Version 2 btree counters */
122#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF+15)
123 __uint32_t xs_abtb_2_lookup;
124 __uint32_t xs_abtb_2_compare;
125 __uint32_t xs_abtb_2_insrec;
126 __uint32_t xs_abtb_2_delrec;
127 __uint32_t xs_abtb_2_newroot;
128 __uint32_t xs_abtb_2_killroot;
129 __uint32_t xs_abtb_2_increment;
130 __uint32_t xs_abtb_2_decrement;
131 __uint32_t xs_abtb_2_lshift;
132 __uint32_t xs_abtb_2_rshift;
133 __uint32_t xs_abtb_2_split;
134 __uint32_t xs_abtb_2_join;
135 __uint32_t xs_abtb_2_alloc;
136 __uint32_t xs_abtb_2_free;
137 __uint32_t xs_abtb_2_moves;
138#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2+15)
139 __uint32_t xs_abtc_2_lookup;
140 __uint32_t xs_abtc_2_compare;
141 __uint32_t xs_abtc_2_insrec;
142 __uint32_t xs_abtc_2_delrec;
143 __uint32_t xs_abtc_2_newroot;
144 __uint32_t xs_abtc_2_killroot;
145 __uint32_t xs_abtc_2_increment;
146 __uint32_t xs_abtc_2_decrement;
147 __uint32_t xs_abtc_2_lshift;
148 __uint32_t xs_abtc_2_rshift;
149 __uint32_t xs_abtc_2_split;
150 __uint32_t xs_abtc_2_join;
151 __uint32_t xs_abtc_2_alloc;
152 __uint32_t xs_abtc_2_free;
153 __uint32_t xs_abtc_2_moves;
154#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2+15)
155 __uint32_t xs_bmbt_2_lookup;
156 __uint32_t xs_bmbt_2_compare;
157 __uint32_t xs_bmbt_2_insrec;
158 __uint32_t xs_bmbt_2_delrec;
159 __uint32_t xs_bmbt_2_newroot;
160 __uint32_t xs_bmbt_2_killroot;
161 __uint32_t xs_bmbt_2_increment;
162 __uint32_t xs_bmbt_2_decrement;
163 __uint32_t xs_bmbt_2_lshift;
164 __uint32_t xs_bmbt_2_rshift;
165 __uint32_t xs_bmbt_2_split;
166 __uint32_t xs_bmbt_2_join;
167 __uint32_t xs_bmbt_2_alloc;
168 __uint32_t xs_bmbt_2_free;
169 __uint32_t xs_bmbt_2_moves;
170#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2+15)
171 __uint32_t xs_ibt_2_lookup;
172 __uint32_t xs_ibt_2_compare;
173 __uint32_t xs_ibt_2_insrec;
174 __uint32_t xs_ibt_2_delrec;
175 __uint32_t xs_ibt_2_newroot;
176 __uint32_t xs_ibt_2_killroot;
177 __uint32_t xs_ibt_2_increment;
178 __uint32_t xs_ibt_2_decrement;
179 __uint32_t xs_ibt_2_lshift;
180 __uint32_t xs_ibt_2_rshift;
181 __uint32_t xs_ibt_2_split;
182 __uint32_t xs_ibt_2_join;
183 __uint32_t xs_ibt_2_alloc;
184 __uint32_t xs_ibt_2_free;
185 __uint32_t xs_ibt_2_moves;
121/* Extra precision counters */ 186/* Extra precision counters */
122 __uint64_t xs_xstrat_bytes; 187 __uint64_t xs_xstrat_bytes;
123 __uint64_t xs_write_bytes; 188 __uint64_t xs_write_bytes;
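
Each new XFSSTAT_END_* macro is a cumulative offset: the index one past the
last counter of its group when the struct is viewed as a flat array of
32-bit counters, which is exactly how the reporting loop in xfs_stats.c
walks it. A sketch reusing this file's names but omitting the real code's
per-CPU summation:

    static int format_stats(char *buffer)
    {
            __uint32_t *counters = (__uint32_t *)&xfsstats;
            int i, j = 0, len = 0;

            for (i = 0; i < ARRAY_SIZE(xstats); i++) {
                    len += sprintf(buffer + len, "%s", xstats[i].desc);
                    /* group i owns indices [previous endpoint, endpoint) */
                    for (; j < xstats[i].endpoint; j++)
                            len += sprintf(buffer + len, " %u", counters[j]);
                    len += sprintf(buffer + len, "\n");
            }
            return len;
    }
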
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 37ebe36056eb..95a971080368 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -18,7 +18,6 @@
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_bit.h" 19#include "xfs_bit.h"
20#include "xfs_log.h" 20#include "xfs_log.h"
21#include "xfs_clnt.h"
22#include "xfs_inum.h" 21#include "xfs_inum.h"
23#include "xfs_trans.h" 22#include "xfs_trans.h"
24#include "xfs_sb.h" 23#include "xfs_sb.h"
@@ -36,6 +35,7 @@
36#include "xfs_dinode.h" 35#include "xfs_dinode.h"
37#include "xfs_inode.h" 36#include "xfs_inode.h"
38#include "xfs_btree.h" 37#include "xfs_btree.h"
38#include "xfs_btree_trace.h"
39#include "xfs_ialloc.h" 39#include "xfs_ialloc.h"
40#include "xfs_bmap.h" 40#include "xfs_bmap.h"
41#include "xfs_rtalloc.h" 41#include "xfs_rtalloc.h"
@@ -48,7 +48,6 @@
48#include "xfs_buf_item.h" 48#include "xfs_buf_item.h"
49#include "xfs_utils.h" 49#include "xfs_utils.h"
50#include "xfs_vnodeops.h" 50#include "xfs_vnodeops.h"
51#include "xfs_vfsops.h"
52#include "xfs_version.h" 51#include "xfs_version.h"
53#include "xfs_log_priv.h" 52#include "xfs_log_priv.h"
54#include "xfs_trans_priv.h" 53#include "xfs_trans_priv.h"
@@ -58,6 +57,7 @@
58#include "xfs_extfree_item.h" 57#include "xfs_extfree_item.h"
59#include "xfs_mru_cache.h" 58#include "xfs_mru_cache.h"
60#include "xfs_inode_item.h" 59#include "xfs_inode_item.h"
60#include "xfs_sync.h"
61 61
62#include <linux/namei.h> 62#include <linux/namei.h>
63#include <linux/init.h> 63#include <linux/init.h>
@@ -70,36 +70,9 @@
70 70
71static struct quotactl_ops xfs_quotactl_operations; 71static struct quotactl_ops xfs_quotactl_operations;
72static struct super_operations xfs_super_operations; 72static struct super_operations xfs_super_operations;
73static kmem_zone_t *xfs_vnode_zone;
74static kmem_zone_t *xfs_ioend_zone; 73static kmem_zone_t *xfs_ioend_zone;
75mempool_t *xfs_ioend_pool; 74mempool_t *xfs_ioend_pool;
76 75
77STATIC struct xfs_mount_args *
78xfs_args_allocate(
79 struct super_block *sb,
80 int silent)
81{
82 struct xfs_mount_args *args;
83
84 args = kzalloc(sizeof(struct xfs_mount_args), GFP_KERNEL);
85 if (!args)
86 return NULL;
87
88 args->logbufs = args->logbufsize = -1;
89 strncpy(args->fsname, sb->s_id, MAXNAMELEN);
90
91 /* Copy the already-parsed mount(2) flags we're interested in */
92 if (sb->s_flags & MS_DIRSYNC)
93 args->flags |= XFSMNT_DIRSYNC;
94 if (sb->s_flags & MS_SYNCHRONOUS)
95 args->flags |= XFSMNT_WSYNC;
96 if (silent)
97 args->flags |= XFSMNT_QUIET;
98 args->flags |= XFSMNT_32BITINODES;
99
100 return args;
101}
102
103#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */ 76#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */
104#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */ 77#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */
105#define MNTOPT_LOGDEV "logdev" /* log device */ 78#define MNTOPT_LOGDEV "logdev" /* log device */
@@ -188,26 +161,54 @@ suffix_strtoul(char *s, char **endp, unsigned int base)
188 return simple_strtoul((const char *)s, endp, base) << shift_left_factor; 161 return simple_strtoul((const char *)s, endp, base) << shift_left_factor;
189} 162}
190 163
164/*
165 * This function fills in xfs_mount_t fields based on mount args.
166 * Note: the superblock has _not_ yet been read in.
167 *
168 * Note that this function leaks the various device name allocations on
169 * failure. The caller takes care of them.
170 */
191STATIC int 171STATIC int
192xfs_parseargs( 172xfs_parseargs(
193 struct xfs_mount *mp, 173 struct xfs_mount *mp,
194 char *options, 174 char *options,
195 struct xfs_mount_args *args, 175 char **mtpt)
196 int update)
197{ 176{
177 struct super_block *sb = mp->m_super;
198 char *this_char, *value, *eov; 178 char *this_char, *value, *eov;
199 int dsunit, dswidth, vol_dsunit, vol_dswidth; 179 int dsunit = 0;
200 int iosize; 180 int dswidth = 0;
181 int iosize = 0;
201 int dmapi_implies_ikeep = 1; 182 int dmapi_implies_ikeep = 1;
183 uchar_t iosizelog = 0;
184
185 /*
186 * Copy binary VFS mount flags we are interested in.
187 */
188 if (sb->s_flags & MS_RDONLY)
189 mp->m_flags |= XFS_MOUNT_RDONLY;
190 if (sb->s_flags & MS_DIRSYNC)
191 mp->m_flags |= XFS_MOUNT_DIRSYNC;
192 if (sb->s_flags & MS_SYNCHRONOUS)
193 mp->m_flags |= XFS_MOUNT_WSYNC;
194
195 /*
196 * Set some default flags that could be cleared by the mount option
197 * parsing.
198 */
199 mp->m_flags |= XFS_MOUNT_BARRIER;
200 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
201 mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
202 202
203 args->flags |= XFSMNT_BARRIER; 203 /*
204 args->flags2 |= XFSMNT2_COMPAT_IOSIZE; 204 * These can be overridden by the mount option parsing.
205 */
206 mp->m_logbufs = -1;
207 mp->m_logbsize = -1;
205 208
206 if (!options) 209 if (!options)
207 goto done; 210 goto done;
208 211
209 iosize = dsunit = dswidth = vol_dsunit = vol_dswidth = 0;
210
211 while ((this_char = strsep(&options, ",")) != NULL) { 212 while ((this_char = strsep(&options, ",")) != NULL) {
212 if (!*this_char) 213 if (!*this_char)
213 continue; 214 continue;
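
xfs_parseargs() consumes the option string destructively: strsep() carves it
on commas in place, and each "opt=value" token is then split on the first
'='. A self-contained userspace model of that loop:

    #include <stdio.h>
    #include <string.h>

    static void parse(char *options)        /* must be writable */
    {
            char *this_char, *value;

            while ((this_char = strsep(&options, ",")) != NULL) {
                    if (!*this_char)
                            continue;       /* tolerate ",," */
                    value = strchr(this_char, '=');
                    if (value)
                            *value++ = '\0';
                    printf("option '%s' value '%s'\n",
                           this_char, value ? value : "");
            }
    }

    int main(void)
    {
            char opts[] = "logbufs=8,noalign,logbsize=32k";
            parse(opts);
            return 0;
    }
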
@@ -221,7 +222,7 @@ xfs_parseargs(
221 this_char); 222 this_char);
222 return EINVAL; 223 return EINVAL;
223 } 224 }
224 args->logbufs = simple_strtoul(value, &eov, 10); 225 mp->m_logbufs = simple_strtoul(value, &eov, 10);
225 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { 226 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
226 if (!value || !*value) { 227 if (!value || !*value) {
227 cmn_err(CE_WARN, 228 cmn_err(CE_WARN,
@@ -229,7 +230,7 @@ xfs_parseargs(
229 this_char); 230 this_char);
230 return EINVAL; 231 return EINVAL;
231 } 232 }
232 args->logbufsize = suffix_strtoul(value, &eov, 10); 233 mp->m_logbsize = suffix_strtoul(value, &eov, 10);
233 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { 234 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
234 if (!value || !*value) { 235 if (!value || !*value) {
235 cmn_err(CE_WARN, 236 cmn_err(CE_WARN,
@@ -237,7 +238,9 @@ xfs_parseargs(
237 this_char); 238 this_char);
238 return EINVAL; 239 return EINVAL;
239 } 240 }
240 strncpy(args->logname, value, MAXNAMELEN); 241 mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
242 if (!mp->m_logname)
243 return ENOMEM;
241 } else if (!strcmp(this_char, MNTOPT_MTPT)) { 244 } else if (!strcmp(this_char, MNTOPT_MTPT)) {
242 if (!value || !*value) { 245 if (!value || !*value) {
243 cmn_err(CE_WARN, 246 cmn_err(CE_WARN,
@@ -245,7 +248,9 @@ xfs_parseargs(
245 this_char); 248 this_char);
246 return EINVAL; 249 return EINVAL;
247 } 250 }
248 strncpy(args->mtpt, value, MAXNAMELEN); 251 *mtpt = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
252 if (!*mtpt)
253 return ENOMEM;
249 } else if (!strcmp(this_char, MNTOPT_RTDEV)) { 254 } else if (!strcmp(this_char, MNTOPT_RTDEV)) {
250 if (!value || !*value) { 255 if (!value || !*value) {
251 cmn_err(CE_WARN, 256 cmn_err(CE_WARN,
@@ -253,7 +258,9 @@ xfs_parseargs(
253 this_char); 258 this_char);
254 return EINVAL; 259 return EINVAL;
255 } 260 }
256 strncpy(args->rtname, value, MAXNAMELEN); 261 mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
262 if (!mp->m_rtname)
263 return ENOMEM;
257 } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { 264 } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
258 if (!value || !*value) { 265 if (!value || !*value) {
259 cmn_err(CE_WARN, 266 cmn_err(CE_WARN,
@@ -262,8 +269,7 @@ xfs_parseargs(
262 return EINVAL; 269 return EINVAL;
263 } 270 }
264 iosize = simple_strtoul(value, &eov, 10); 271 iosize = simple_strtoul(value, &eov, 10);
265 args->flags |= XFSMNT_IOSIZE; 272 iosizelog = ffs(iosize) - 1;
266 args->iosizelog = (uint8_t) iosize;
267 } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { 273 } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
268 if (!value || !*value) { 274 if (!value || !*value) {
269 cmn_err(CE_WARN, 275 cmn_err(CE_WARN,
@@ -272,8 +278,7 @@ xfs_parseargs(
272 return EINVAL; 278 return EINVAL;
273 } 279 }
274 iosize = suffix_strtoul(value, &eov, 10); 280 iosize = suffix_strtoul(value, &eov, 10);
275 args->flags |= XFSMNT_IOSIZE; 281 iosizelog = ffs(iosize) - 1;
276 args->iosizelog = ffs(iosize) - 1;
277 } else if (!strcmp(this_char, MNTOPT_GRPID) || 282 } else if (!strcmp(this_char, MNTOPT_GRPID) ||
278 !strcmp(this_char, MNTOPT_BSDGROUPS)) { 283 !strcmp(this_char, MNTOPT_BSDGROUPS)) {
279 mp->m_flags |= XFS_MOUNT_GRPID; 284 mp->m_flags |= XFS_MOUNT_GRPID;
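
Both iosize branches now reduce the byte count to a log2 with
ffs(iosize) - 1, relying on ffs() returning the 1-based index of the least
significant set bit; for a non-power-of-two input this silently keys off the
lowest set bit, and the result is range-checked against XFS_MIN_IO_LOG and
XFS_MAX_IO_LOG at the done: label further down. For example:

    #include <stdio.h>
    #include <strings.h>    /* ffs() */

    int main(void)
    {
            /* 4096 == 1 << 12, so ffs() == 13 and the log is 12 */
            printf("%d\n", ffs(4096) - 1);
            return 0;
    }
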
@@ -281,23 +286,25 @@ xfs_parseargs(
281 !strcmp(this_char, MNTOPT_SYSVGROUPS)) { 286 !strcmp(this_char, MNTOPT_SYSVGROUPS)) {
282 mp->m_flags &= ~XFS_MOUNT_GRPID; 287 mp->m_flags &= ~XFS_MOUNT_GRPID;
283 } else if (!strcmp(this_char, MNTOPT_WSYNC)) { 288 } else if (!strcmp(this_char, MNTOPT_WSYNC)) {
284 args->flags |= XFSMNT_WSYNC; 289 mp->m_flags |= XFS_MOUNT_WSYNC;
285 } else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) { 290 } else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) {
286 args->flags |= XFSMNT_OSYNCISOSYNC; 291 mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
287 } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) { 292 } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
288 args->flags |= XFSMNT_NORECOVERY; 293 mp->m_flags |= XFS_MOUNT_NORECOVERY;
289 } else if (!strcmp(this_char, MNTOPT_INO64)) { 294 } else if (!strcmp(this_char, MNTOPT_INO64)) {
290 args->flags |= XFSMNT_INO64; 295#if XFS_BIG_INUMS
291#if !XFS_BIG_INUMS 296 mp->m_flags |= XFS_MOUNT_INO64;
297 mp->m_inoadd = XFS_INO64_OFFSET;
298#else
292 cmn_err(CE_WARN, 299 cmn_err(CE_WARN,
293 "XFS: %s option not allowed on this system", 300 "XFS: %s option not allowed on this system",
294 this_char); 301 this_char);
295 return EINVAL; 302 return EINVAL;
296#endif 303#endif
297 } else if (!strcmp(this_char, MNTOPT_NOALIGN)) { 304 } else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
298 args->flags |= XFSMNT_NOALIGN; 305 mp->m_flags |= XFS_MOUNT_NOALIGN;
299 } else if (!strcmp(this_char, MNTOPT_SWALLOC)) { 306 } else if (!strcmp(this_char, MNTOPT_SWALLOC)) {
300 args->flags |= XFSMNT_SWALLOC; 307 mp->m_flags |= XFS_MOUNT_SWALLOC;
301 } else if (!strcmp(this_char, MNTOPT_SUNIT)) { 308 } else if (!strcmp(this_char, MNTOPT_SUNIT)) {
302 if (!value || !*value) { 309 if (!value || !*value) {
303 cmn_err(CE_WARN, 310 cmn_err(CE_WARN,
@@ -315,7 +322,7 @@ xfs_parseargs(
315 } 322 }
316 dswidth = simple_strtoul(value, &eov, 10); 323 dswidth = simple_strtoul(value, &eov, 10);
317 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { 324 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
318 args->flags &= ~XFSMNT_32BITINODES; 325 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
319#if !XFS_BIG_INUMS 326#if !XFS_BIG_INUMS
320 cmn_err(CE_WARN, 327 cmn_err(CE_WARN,
321 "XFS: %s option not allowed on this system", 328 "XFS: %s option not allowed on this system",
@@ -323,56 +330,61 @@ xfs_parseargs(
323 return EINVAL; 330 return EINVAL;
324#endif 331#endif
325 } else if (!strcmp(this_char, MNTOPT_NOUUID)) { 332 } else if (!strcmp(this_char, MNTOPT_NOUUID)) {
326 args->flags |= XFSMNT_NOUUID; 333 mp->m_flags |= XFS_MOUNT_NOUUID;
327 } else if (!strcmp(this_char, MNTOPT_BARRIER)) { 334 } else if (!strcmp(this_char, MNTOPT_BARRIER)) {
328 args->flags |= XFSMNT_BARRIER; 335 mp->m_flags |= XFS_MOUNT_BARRIER;
329 } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) { 336 } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) {
330 args->flags &= ~XFSMNT_BARRIER; 337 mp->m_flags &= ~XFS_MOUNT_BARRIER;
331 } else if (!strcmp(this_char, MNTOPT_IKEEP)) { 338 } else if (!strcmp(this_char, MNTOPT_IKEEP)) {
332 args->flags |= XFSMNT_IKEEP; 339 mp->m_flags |= XFS_MOUNT_IKEEP;
333 } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) { 340 } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) {
334 dmapi_implies_ikeep = 0; 341 dmapi_implies_ikeep = 0;
335 args->flags &= ~XFSMNT_IKEEP; 342 mp->m_flags &= ~XFS_MOUNT_IKEEP;
336 } else if (!strcmp(this_char, MNTOPT_LARGEIO)) { 343 } else if (!strcmp(this_char, MNTOPT_LARGEIO)) {
337 args->flags2 &= ~XFSMNT2_COMPAT_IOSIZE; 344 mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE;
338 } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) { 345 } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) {
339 args->flags2 |= XFSMNT2_COMPAT_IOSIZE; 346 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
340 } else if (!strcmp(this_char, MNTOPT_ATTR2)) { 347 } else if (!strcmp(this_char, MNTOPT_ATTR2)) {
341 args->flags |= XFSMNT_ATTR2; 348 mp->m_flags |= XFS_MOUNT_ATTR2;
342 } else if (!strcmp(this_char, MNTOPT_NOATTR2)) { 349 } else if (!strcmp(this_char, MNTOPT_NOATTR2)) {
343 args->flags &= ~XFSMNT_ATTR2; 350 mp->m_flags &= ~XFS_MOUNT_ATTR2;
344 args->flags |= XFSMNT_NOATTR2; 351 mp->m_flags |= XFS_MOUNT_NOATTR2;
345 } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) { 352 } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) {
346 args->flags2 |= XFSMNT2_FILESTREAMS; 353 mp->m_flags |= XFS_MOUNT_FILESTREAMS;
347 } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) { 354 } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) {
348 args->flags &= ~(XFSMNT_UQUOTAENF|XFSMNT_UQUOTA); 355 mp->m_qflags &= ~(XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
349 args->flags &= ~(XFSMNT_GQUOTAENF|XFSMNT_GQUOTA); 356 XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
357 XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
358 XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD);
350 } else if (!strcmp(this_char, MNTOPT_QUOTA) || 359 } else if (!strcmp(this_char, MNTOPT_QUOTA) ||
351 !strcmp(this_char, MNTOPT_UQUOTA) || 360 !strcmp(this_char, MNTOPT_UQUOTA) ||
352 !strcmp(this_char, MNTOPT_USRQUOTA)) { 361 !strcmp(this_char, MNTOPT_USRQUOTA)) {
353 args->flags |= XFSMNT_UQUOTA | XFSMNT_UQUOTAENF; 362 mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
363 XFS_UQUOTA_ENFD);
354 } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) || 364 } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) ||
355 !strcmp(this_char, MNTOPT_UQUOTANOENF)) { 365 !strcmp(this_char, MNTOPT_UQUOTANOENF)) {
356 args->flags |= XFSMNT_UQUOTA; 366 mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
357 args->flags &= ~XFSMNT_UQUOTAENF; 367 mp->m_qflags &= ~XFS_UQUOTA_ENFD;
358 } else if (!strcmp(this_char, MNTOPT_PQUOTA) || 368 } else if (!strcmp(this_char, MNTOPT_PQUOTA) ||
359 !strcmp(this_char, MNTOPT_PRJQUOTA)) { 369 !strcmp(this_char, MNTOPT_PRJQUOTA)) {
360 args->flags |= XFSMNT_PQUOTA | XFSMNT_PQUOTAENF; 370 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
371 XFS_OQUOTA_ENFD);
361 } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) { 372 } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) {
362 args->flags |= XFSMNT_PQUOTA; 373 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
363 args->flags &= ~XFSMNT_PQUOTAENF; 374 mp->m_qflags &= ~XFS_OQUOTA_ENFD;
364 } else if (!strcmp(this_char, MNTOPT_GQUOTA) || 375 } else if (!strcmp(this_char, MNTOPT_GQUOTA) ||
365 !strcmp(this_char, MNTOPT_GRPQUOTA)) { 376 !strcmp(this_char, MNTOPT_GRPQUOTA)) {
366 args->flags |= XFSMNT_GQUOTA | XFSMNT_GQUOTAENF; 377 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
378 XFS_OQUOTA_ENFD);
367 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { 379 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
368 args->flags |= XFSMNT_GQUOTA; 380 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
369 args->flags &= ~XFSMNT_GQUOTAENF; 381 mp->m_qflags &= ~XFS_OQUOTA_ENFD;
370 } else if (!strcmp(this_char, MNTOPT_DMAPI)) { 382 } else if (!strcmp(this_char, MNTOPT_DMAPI)) {
371 args->flags |= XFSMNT_DMAPI; 383 mp->m_flags |= XFS_MOUNT_DMAPI;
372 } else if (!strcmp(this_char, MNTOPT_XDSM)) { 384 } else if (!strcmp(this_char, MNTOPT_XDSM)) {
373 args->flags |= XFSMNT_DMAPI; 385 mp->m_flags |= XFS_MOUNT_DMAPI;
374 } else if (!strcmp(this_char, MNTOPT_DMI)) { 386 } else if (!strcmp(this_char, MNTOPT_DMI)) {
375 args->flags |= XFSMNT_DMAPI; 387 mp->m_flags |= XFS_MOUNT_DMAPI;
376 } else if (!strcmp(this_char, "ihashsize")) { 388 } else if (!strcmp(this_char, "ihashsize")) {
377 cmn_err(CE_WARN, 389 cmn_err(CE_WARN,
378 "XFS: ihashsize no longer used, option is deprecated."); 390 "XFS: ihashsize no longer used, option is deprecated.");
@@ -390,27 +402,29 @@ xfs_parseargs(
390 } 402 }
391 } 403 }
392 404
393 if (args->flags & XFSMNT_NORECOVERY) { 405 /*
394 if ((mp->m_flags & XFS_MOUNT_RDONLY) == 0) { 406 * no recovery flag requires a read-only mount
395 cmn_err(CE_WARN, 407 */
396 "XFS: no-recovery mounts must be read-only."); 408 if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
397 return EINVAL; 409 !(mp->m_flags & XFS_MOUNT_RDONLY)) {
398 } 410 cmn_err(CE_WARN, "XFS: no-recovery mounts must be read-only.");
411 return EINVAL;
399 } 412 }
400 413
401 if ((args->flags & XFSMNT_NOALIGN) && (dsunit || dswidth)) { 414 if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) {
402 cmn_err(CE_WARN, 415 cmn_err(CE_WARN,
403 "XFS: sunit and swidth options incompatible with the noalign option"); 416 "XFS: sunit and swidth options incompatible with the noalign option");
404 return EINVAL; 417 return EINVAL;
405 } 418 }
406 419
407 if ((args->flags & XFSMNT_GQUOTA) && (args->flags & XFSMNT_PQUOTA)) { 420 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
421 (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
408 cmn_err(CE_WARN, 422 cmn_err(CE_WARN,
409 "XFS: cannot mount with both project and group quota"); 423 "XFS: cannot mount with both project and group quota");
410 return EINVAL; 424 return EINVAL;
411 } 425 }
412 426
413 if ((args->flags & XFSMNT_DMAPI) && *args->mtpt == '\0') { 427 if ((mp->m_flags & XFS_MOUNT_DMAPI) && (!*mtpt || *mtpt[0] == '\0')) {
414 printk("XFS: %s option needs the mount point option as well\n", 428 printk("XFS: %s option needs the mount point option as well\n",
415 MNTOPT_DMAPI); 429 MNTOPT_DMAPI);
416 return EINVAL; 430 return EINVAL;
@@ -438,27 +452,66 @@ xfs_parseargs(
438 * Note that if "ikeep" or "noikeep" mount options are 452 * Note that if "ikeep" or "noikeep" mount options are
439 * supplied, then they are honored. 453 * supplied, then they are honored.
440 */ 454 */
441 if ((args->flags & XFSMNT_DMAPI) && dmapi_implies_ikeep) 455 if ((mp->m_flags & XFS_MOUNT_DMAPI) && dmapi_implies_ikeep)
442 args->flags |= XFSMNT_IKEEP; 456 mp->m_flags |= XFS_MOUNT_IKEEP;
443 457
444 if ((args->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) { 458done:
459 if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) {
460 /*
461 * At this point the superblock has not been read
462 * in, therefore we do not know the block size.
463 * Before the mount call ends we will convert
464 * these to FSBs.
465 */
445 if (dsunit) { 466 if (dsunit) {
446 args->sunit = dsunit; 467 mp->m_dalign = dsunit;
447 args->flags |= XFSMNT_RETERR; 468 mp->m_flags |= XFS_MOUNT_RETERR;
448 } else {
449 args->sunit = vol_dsunit;
450 } 469 }
451 dswidth ? (args->swidth = dswidth) : 470
452 (args->swidth = vol_dswidth); 471 if (dswidth)
453 } else { 472 mp->m_swidth = dswidth;
454 args->sunit = args->swidth = 0; 473 }
474
475 if (mp->m_logbufs != -1 &&
476 mp->m_logbufs != 0 &&
477 (mp->m_logbufs < XLOG_MIN_ICLOGS ||
478 mp->m_logbufs > XLOG_MAX_ICLOGS)) {
479 cmn_err(CE_WARN,
480 "XFS: invalid logbufs value: %d [not %d-%d]",
481 mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
482 return XFS_ERROR(EINVAL);
483 }
484 if (mp->m_logbsize != -1 &&
485 mp->m_logbsize != 0 &&
486 (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE ||
487 mp->m_logbsize > XLOG_MAX_RECORD_BSIZE ||
488 !is_power_of_2(mp->m_logbsize))) {
489 cmn_err(CE_WARN,
490 "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
491 mp->m_logbsize);
492 return XFS_ERROR(EINVAL);
493 }
494
495 mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
496 if (!mp->m_fsname)
497 return ENOMEM;
498 mp->m_fsname_len = strlen(mp->m_fsname) + 1;
499
500 if (iosizelog) {
501 if (iosizelog > XFS_MAX_IO_LOG ||
502 iosizelog < XFS_MIN_IO_LOG) {
503 cmn_err(CE_WARN,
504 "XFS: invalid log iosize: %d [not %d-%d]",
505 iosizelog, XFS_MIN_IO_LOG,
506 XFS_MAX_IO_LOG);
507 return XFS_ERROR(EINVAL);
508 }
509
510 mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
511 mp->m_readio_log = iosizelog;
512 mp->m_writeio_log = iosizelog;
455 } 513 }
456 514
457done:
458 if (args->flags & XFSMNT_32BITINODES)
459 mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
460 if (args->flags2)
461 args->flags |= XFSMNT_FLAGS2;
462 return 0; 515 return 0;
463} 516}
464 517
@@ -704,8 +757,7 @@ xfs_close_devices(
704 */ 757 */
705STATIC int 758STATIC int
706xfs_open_devices( 759xfs_open_devices(
707 struct xfs_mount *mp, 760 struct xfs_mount *mp)
708 struct xfs_mount_args *args)
709{ 761{
710 struct block_device *ddev = mp->m_super->s_bdev; 762 struct block_device *ddev = mp->m_super->s_bdev;
711 struct block_device *logdev = NULL, *rtdev = NULL; 763 struct block_device *logdev = NULL, *rtdev = NULL;
@@ -714,14 +766,14 @@ xfs_open_devices(
714 /* 766 /*
715 * Open real time and log devices - order is important. 767 * Open real time and log devices - order is important.
716 */ 768 */
717 if (args->logname[0]) { 769 if (mp->m_logname) {
718 error = xfs_blkdev_get(mp, args->logname, &logdev); 770 error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
719 if (error) 771 if (error)
720 goto out; 772 goto out;
721 } 773 }
722 774
723 if (args->rtname[0]) { 775 if (mp->m_rtname) {
724 error = xfs_blkdev_get(mp, args->rtname, &rtdev); 776 error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev);
725 if (error) 777 if (error)
726 goto out_close_logdev; 778 goto out_close_logdev;
727 779
@@ -813,18 +865,18 @@ xfs_setup_devices(
813 */ 865 */
814void 866void
815xfsaild_wakeup( 867xfsaild_wakeup(
816 xfs_mount_t *mp, 868 struct xfs_ail *ailp,
817 xfs_lsn_t threshold_lsn) 869 xfs_lsn_t threshold_lsn)
818{ 870{
819 mp->m_ail.xa_target = threshold_lsn; 871 ailp->xa_target = threshold_lsn;
820 wake_up_process(mp->m_ail.xa_task); 872 wake_up_process(ailp->xa_task);
821} 873}
822 874
823int 875int
824xfsaild( 876xfsaild(
825 void *data) 877 void *data)
826{ 878{
827 xfs_mount_t *mp = (xfs_mount_t *)data; 879 struct xfs_ail *ailp = data;
828 xfs_lsn_t last_pushed_lsn = 0; 880 xfs_lsn_t last_pushed_lsn = 0;
829 long tout = 0; 881 long tout = 0;
830 882
@@ -836,11 +888,11 @@ xfsaild(
836 /* swsusp */ 888 /* swsusp */
837 try_to_freeze(); 889 try_to_freeze();
838 890
839 ASSERT(mp->m_log); 891 ASSERT(ailp->xa_mount->m_log);
840 if (XFS_FORCED_SHUTDOWN(mp)) 892 if (XFS_FORCED_SHUTDOWN(ailp->xa_mount))
841 continue; 893 continue;
842 894
843 tout = xfsaild_push(mp, &last_pushed_lsn); 895 tout = xfsaild_push(ailp, &last_pushed_lsn);
844 } 896 }
845 897
846 return 0; 898 return 0;
@@ -848,43 +900,82 @@ xfsaild(
848 900
849int 901int
850xfsaild_start( 902xfsaild_start(
851 xfs_mount_t *mp) 903 struct xfs_ail *ailp)
852{ 904{
853 mp->m_ail.xa_target = 0; 905 ailp->xa_target = 0;
854 mp->m_ail.xa_task = kthread_run(xfsaild, mp, "xfsaild"); 906 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild");
855 if (IS_ERR(mp->m_ail.xa_task)) 907 if (IS_ERR(ailp->xa_task))
856 return -PTR_ERR(mp->m_ail.xa_task); 908 return -PTR_ERR(ailp->xa_task);
857 return 0; 909 return 0;
858} 910}
859 911
860void 912void
861xfsaild_stop( 913xfsaild_stop(
862 xfs_mount_t *mp) 914 struct xfs_ail *ailp)
863{ 915{
864 kthread_stop(mp->m_ail.xa_task); 916 kthread_stop(ailp->xa_task);
865} 917}
866 918
867 919
868 920/* Catch misguided souls that try to use this interface on XFS */
869STATIC struct inode * 921STATIC struct inode *
870xfs_fs_alloc_inode( 922xfs_fs_alloc_inode(
871 struct super_block *sb) 923 struct super_block *sb)
872{ 924{
873 return kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP); 925 BUG();
926 return NULL;
874} 927}
875 928
929/*
930 * Now that the generic code is guaranteed not to be accessing
931 * the linux inode, we can reclaim the inode.
932 */
876STATIC void 933STATIC void
877xfs_fs_destroy_inode( 934xfs_fs_destroy_inode(
878 struct inode *inode) 935 struct inode *inode)
879{ 936{
880 kmem_zone_free(xfs_vnode_zone, inode); 937 xfs_inode_t *ip = XFS_I(inode);
938
939 XFS_STATS_INC(vn_reclaim);
940 if (xfs_reclaim(ip))
941 panic("%s: cannot reclaim 0x%p\n", __func__, inode);
881} 942}
882 943
944/*
945 * Slab object creation initialisation for the XFS inode.
946 * This covers only the idempotent fields in the XFS inode;
947 * all other fields need to be initialised on allocation
 948 * from the slab. This avoids the need to repeatedly initialise
 949 * fields in the xfs inode that are left in the initialised state
950 * when freeing the inode.
951 */
883STATIC void 952STATIC void
884xfs_fs_inode_init_once( 953xfs_fs_inode_init_once(
885 void *vnode) 954 void *inode)
886{ 955{
887 inode_init_once((struct inode *)vnode); 956 struct xfs_inode *ip = inode;
957
958 memset(ip, 0, sizeof(struct xfs_inode));
959
960 /* vfs inode */
961 inode_init_once(VFS_I(ip));
962
963 /* xfs inode */
964 atomic_set(&ip->i_iocount, 0);
965 atomic_set(&ip->i_pincount, 0);
966 spin_lock_init(&ip->i_flags_lock);
967 init_waitqueue_head(&ip->i_ipin_wait);
968 /*
969 * Because we want to use a counting completion, complete
970 * the flush completion once to allow a single access to
971 * the flush completion without blocking.
972 */
973 init_completion(&ip->i_flush);
974 complete(&ip->i_flush);
975
976 mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
977 "xfsino", ip->i_ino);
978 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
888} 979}
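
The complete()-once-after-init trick above turns the completion into a
one-count semaphore: wait_for_completion() consumes the count (acquire,
blocking if a flush is in progress) and complete() returns it (release).
Written out as a lock API, with hypothetical flush_* wrappers:

    #include <linux/completion.h>

    static struct completion flush;

    static void flush_lock_init(void)
    {
            init_completion(&flush);
            complete(&flush);               /* count = 1: starts "free" */
    }

    static void flush_lock(void)
    {
            wait_for_completion(&flush);    /* take the single count */
    }

    static void flush_unlock(void)
    {
            complete(&flush);               /* give it back */
    }
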
889 980
890/* 981/*
@@ -898,21 +989,26 @@ xfs_fs_write_inode(
898 struct inode *inode, 989 struct inode *inode,
899 int sync) 990 int sync)
900{ 991{
992 struct xfs_inode *ip = XFS_I(inode);
901 int error = 0; 993 int error = 0;
902 int flags = 0; 994 int flags = 0;
903 995
904 xfs_itrace_entry(XFS_I(inode)); 996 xfs_itrace_entry(ip);
905 if (sync) { 997 if (sync) {
906 filemap_fdatawait(inode->i_mapping); 998 error = xfs_wait_on_pages(ip, 0, -1);
999 if (error)
1000 goto out_error;
907 flags |= FLUSH_SYNC; 1001 flags |= FLUSH_SYNC;
908 } 1002 }
909 error = xfs_inode_flush(XFS_I(inode), flags); 1003 error = xfs_inode_flush(ip, flags);
1004
1005out_error:
910 /* 1006 /*
911 * if we failed to write out the inode then mark 1007 * if we failed to write out the inode then mark
912 * it dirty again so we'll try again later. 1008 * it dirty again so we'll try again later.
913 */ 1009 */
914 if (error) 1010 if (error)
915 mark_inode_dirty_sync(inode); 1011 xfs_mark_inode_dirty_sync(ip);
916 1012
917 return -error; 1013 return -error;
918} 1014}
@@ -923,164 +1019,12 @@ xfs_fs_clear_inode(
923{ 1019{
924 xfs_inode_t *ip = XFS_I(inode); 1020 xfs_inode_t *ip = XFS_I(inode);
925 1021
926 /* 1022 xfs_itrace_entry(ip);
927 * ip can be null when xfs_iget_core calls xfs_idestroy if we 1023 XFS_STATS_INC(vn_rele);
928 * find an inode with di_mode == 0 but without IGET_CREATE set. 1024 XFS_STATS_INC(vn_remove);
929 */ 1025 XFS_STATS_DEC(vn_active);
930 if (ip) {
931 xfs_itrace_entry(ip);
932 XFS_STATS_INC(vn_rele);
933 XFS_STATS_INC(vn_remove);
934 XFS_STATS_INC(vn_reclaim);
935 XFS_STATS_DEC(vn_active);
936
937 xfs_inactive(ip);
938 xfs_iflags_clear(ip, XFS_IMODIFIED);
939 if (xfs_reclaim(ip))
940 panic("%s: cannot reclaim 0x%p\n", __func__, inode);
941 }
942
943 ASSERT(XFS_I(inode) == NULL);
944}
945
946/*
947 * Enqueue a work item to be picked up by the vfs xfssyncd thread.
948 * Doing this has two advantages:
949 * - It saves on stack space, which is tight in certain situations
950 * - It can be used (with care) as a mechanism to avoid deadlocks.
951 * Flushing while allocating in a full filesystem requires both.
952 */
953STATIC void
954xfs_syncd_queue_work(
955 struct xfs_mount *mp,
956 void *data,
957 void (*syncer)(struct xfs_mount *, void *))
958{
959 struct bhv_vfs_sync_work *work;
960
961 work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
962 INIT_LIST_HEAD(&work->w_list);
963 work->w_syncer = syncer;
964 work->w_data = data;
965 work->w_mount = mp;
966 spin_lock(&mp->m_sync_lock);
967 list_add_tail(&work->w_list, &mp->m_sync_list);
968 spin_unlock(&mp->m_sync_lock);
969 wake_up_process(mp->m_sync_task);
970}
971
972/*
973 * Flush delayed allocate data, attempting to free up reserved space
974 * from existing allocations. At this point a new allocation attempt
975 * has failed with ENOSPC and we are in the process of scratching our
976 * heads, looking about for more room...
977 */
978STATIC void
979xfs_flush_inode_work(
980 struct xfs_mount *mp,
981 void *arg)
982{
983 struct inode *inode = arg;
984 filemap_flush(inode->i_mapping);
985 iput(inode);
986}
987
988void
989xfs_flush_inode(
990 xfs_inode_t *ip)
991{
992 struct inode *inode = VFS_I(ip);
993
994 igrab(inode);
995 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
996 delay(msecs_to_jiffies(500));
997}
998
999/*
1000 * This is the "bigger hammer" version of xfs_flush_inode_work...
1001 * (IOW, "If at first you don't succeed, use a Bigger Hammer").
1002 */
1003STATIC void
1004xfs_flush_device_work(
1005 struct xfs_mount *mp,
1006 void *arg)
1007{
1008 struct inode *inode = arg;
1009 sync_blockdev(mp->m_super->s_bdev);
1010 iput(inode);
1011}
1012
1013void
1014xfs_flush_device(
1015 xfs_inode_t *ip)
1016{
1017 struct inode *inode = VFS_I(ip);
1018 1026
1019 igrab(inode); 1027 xfs_inactive(ip);
1020 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
1021 delay(msecs_to_jiffies(500));
1022 xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
1023}
1024
1025STATIC void
1026xfs_sync_worker(
1027 struct xfs_mount *mp,
1028 void *unused)
1029{
1030 int error;
1031
1032 if (!(mp->m_flags & XFS_MOUNT_RDONLY))
1033 error = xfs_sync(mp, SYNC_FSDATA | SYNC_BDFLUSH | SYNC_ATTR);
1034 mp->m_sync_seq++;
1035 wake_up(&mp->m_wait_single_sync_task);
1036}
1037
1038STATIC int
1039xfssyncd(
1040 void *arg)
1041{
1042 struct xfs_mount *mp = arg;
1043 long timeleft;
1044 bhv_vfs_sync_work_t *work, *n;
1045 LIST_HEAD (tmp);
1046
1047 set_freezable();
1048 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
1049 for (;;) {
1050 timeleft = schedule_timeout_interruptible(timeleft);
1051 /* swsusp */
1052 try_to_freeze();
1053 if (kthread_should_stop() && list_empty(&mp->m_sync_list))
1054 break;
1055
1056 spin_lock(&mp->m_sync_lock);
1057 /*
1058 * We can get woken by laptop mode, to do a sync -
1059 * that's the (only!) case where the list would be
1060 * empty with time remaining.
1061 */
1062 if (!timeleft || list_empty(&mp->m_sync_list)) {
1063 if (!timeleft)
1064 timeleft = xfs_syncd_centisecs *
1065 msecs_to_jiffies(10);
1066 INIT_LIST_HEAD(&mp->m_sync_work.w_list);
1067 list_add_tail(&mp->m_sync_work.w_list,
1068 &mp->m_sync_list);
1069 }
1070 list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list)
1071 list_move(&work->w_list, &tmp);
1072 spin_unlock(&mp->m_sync_lock);
1073
1074 list_for_each_entry_safe(work, n, &tmp, w_list) {
1075 (*work->w_syncer)(mp, work->w_data);
1076 list_del(&work->w_list);
1077 if (work == &mp->m_sync_work)
1078 continue;
1079 kmem_free(work);
1080 }
1081 }
1082
1083 return 0;
1084} 1028}
1085 1029
1086STATIC void 1030STATIC void
@@ -1099,11 +1043,9 @@ xfs_fs_put_super(
1099 struct xfs_mount *mp = XFS_M(sb); 1043 struct xfs_mount *mp = XFS_M(sb);
1100 struct xfs_inode *rip = mp->m_rootip; 1044 struct xfs_inode *rip = mp->m_rootip;
1101 int unmount_event_flags = 0; 1045 int unmount_event_flags = 0;
1102 int error;
1103
1104 kthread_stop(mp->m_sync_task);
1105 1046
1106 xfs_sync(mp, SYNC_ATTR | SYNC_DELWRI); 1047 xfs_syncd_stop(mp);
1048 xfs_sync_inodes(mp, SYNC_ATTR|SYNC_DELWRI);
1107 1049
1108#ifdef HAVE_DMAPI 1050#ifdef HAVE_DMAPI
1109 if (mp->m_flags & XFS_MOUNT_DMAPI) { 1051 if (mp->m_flags & XFS_MOUNT_DMAPI) {
@@ -1128,18 +1070,6 @@ xfs_fs_put_super(
1128 xfs_filestream_unmount(mp); 1070 xfs_filestream_unmount(mp);
1129 1071
1130 XFS_bflush(mp->m_ddev_targp); 1072 XFS_bflush(mp->m_ddev_targp);
1131 error = xfs_unmount_flush(mp, 0);
1132 WARN_ON(error);
1133
1134 /*
1135 * If we're forcing a shutdown, typically because of a media error,
1136 * we want to make sure we invalidate dirty pages that belong to
1137 * referenced vnodes as well.
1138 */
1139 if (XFS_FORCED_SHUTDOWN(mp)) {
1140 error = xfs_sync(mp, SYNC_WAIT | SYNC_CLOSE);
1141 ASSERT(error != EFSCORRUPTED);
1142 }
1143 1073
1144 if (mp->m_flags & XFS_MOUNT_DMAPI) { 1074 if (mp->m_flags & XFS_MOUNT_DMAPI) {
1145 XFS_SEND_UNMOUNT(mp, rip, DM_RIGHT_NULL, 0, 0, 1075 XFS_SEND_UNMOUNT(mp, rip, DM_RIGHT_NULL, 0, 0,
@@ -1161,7 +1091,7 @@ xfs_fs_write_super(
1161 struct super_block *sb) 1091 struct super_block *sb)
1162{ 1092{
1163 if (!(sb->s_flags & MS_RDONLY)) 1093 if (!(sb->s_flags & MS_RDONLY))
1164 xfs_sync(XFS_M(sb), SYNC_FSDATA); 1094 xfs_sync_fsdata(XFS_M(sb), 0);
1165 sb->s_dirt = 0; 1095 sb->s_dirt = 0;
1166} 1096}
1167 1097
@@ -1172,7 +1102,6 @@ xfs_fs_sync_super(
1172{ 1102{
1173 struct xfs_mount *mp = XFS_M(sb); 1103 struct xfs_mount *mp = XFS_M(sb);
1174 int error; 1104 int error;
1175 int flags;
1176 1105
1177 /* 1106 /*
1178 * Treat a sync operation like a freeze. This is to work 1107 * Treat a sync operation like a freeze. This is to work
@@ -1186,20 +1115,10 @@ xfs_fs_sync_super(
1186 * dirty the Linux inode until after the transaction I/O 1115 * dirty the Linux inode until after the transaction I/O
1187 * completes. 1116 * completes.
1188 */ 1117 */
1189 if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE)) { 1118 if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE))
1190 /* 1119 error = xfs_quiesce_data(mp);
1191 * First stage of freeze - no more writers will make progress 1120 else
1192 * now we are here, so we flush delwri and delalloc buffers 1121 error = xfs_sync_fsdata(mp, 0);
1193 * here, then wait for all I/O to complete. Data is frozen at
1194 * that point. Metadata is not frozen, transactions can still
 1195 * occur here so don't bother flushing the buftarg (i.e.
1196 * SYNC_QUIESCE) because it'll just get dirty again.
1197 */
1198 flags = SYNC_DATA_QUIESCE;
1199 } else
1200 flags = SYNC_FSDATA;
1201
1202 error = xfs_sync(mp, flags);
1203 sb->s_dirt = 0; 1122 sb->s_dirt = 0;
1204 1123
1205 if (unlikely(laptop_mode)) { 1124 if (unlikely(laptop_mode)) {
@@ -1337,9 +1256,8 @@ xfs_fs_remount(
1337 1256
1338 /* rw -> ro */ 1257 /* rw -> ro */
1339 if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { 1258 if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
1340 xfs_filestream_flush(mp); 1259 xfs_quiesce_data(mp);
1341 xfs_sync(mp, SYNC_DATA_QUIESCE); 1260 xfs_quiesce_attr(mp);
1342 xfs_attr_quiesce(mp);
1343 mp->m_flags |= XFS_MOUNT_RDONLY; 1261 mp->m_flags |= XFS_MOUNT_RDONLY;
1344 } 1262 }
1345 1263
@@ -1348,17 +1266,17 @@ xfs_fs_remount(
1348 1266
1349/* 1267/*
1350 * Second stage of a freeze. The data is already frozen so we only 1268 * Second stage of a freeze. The data is already frozen so we only
1351 * need to take care of themetadata. Once that's done write a dummy 1269 * need to take care of the metadata. Once that's done write a dummy
1352 * record to dirty the log in case of a crash while frozen. 1270 * record to dirty the log in case of a crash while frozen.
1353 */ 1271 */
1354STATIC void 1272STATIC int
1355xfs_fs_lockfs( 1273xfs_fs_freeze(
1356 struct super_block *sb) 1274 struct super_block *sb)
1357{ 1275{
1358 struct xfs_mount *mp = XFS_M(sb); 1276 struct xfs_mount *mp = XFS_M(sb);
1359 1277
1360 xfs_attr_quiesce(mp); 1278 xfs_quiesce_attr(mp);
1361 xfs_fs_log_dummy(mp); 1279 return -xfs_fs_log_dummy(mp);
1362} 1280}
1363 1281
1364STATIC int 1282STATIC int
@@ -1422,175 +1340,28 @@ xfs_fs_setxquota(
1422 1340
1423/* 1341/*
1424 * This function fills in xfs_mount_t fields based on mount args. 1342 * This function fills in xfs_mount_t fields based on mount args.
1425 * Note: the superblock has _not_ yet been read in.
1426 */
1427STATIC int
1428xfs_start_flags(
1429 struct xfs_mount_args *ap,
1430 struct xfs_mount *mp)
1431{
1432 int error;
1433
1434 /* Values are in BBs */
1435 if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) {
1436 /*
1437 * At this point the superblock has not been read
1438 * in, therefore we do not know the block size.
1439 * Before the mount call ends we will convert
1440 * these to FSBs.
1441 */
1442 mp->m_dalign = ap->sunit;
1443 mp->m_swidth = ap->swidth;
1444 }
1445
1446 if (ap->logbufs != -1 &&
1447 ap->logbufs != 0 &&
1448 (ap->logbufs < XLOG_MIN_ICLOGS ||
1449 ap->logbufs > XLOG_MAX_ICLOGS)) {
1450 cmn_err(CE_WARN,
1451 "XFS: invalid logbufs value: %d [not %d-%d]",
1452 ap->logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
1453 return XFS_ERROR(EINVAL);
1454 }
1455 mp->m_logbufs = ap->logbufs;
1456 if (ap->logbufsize != -1 &&
1457 ap->logbufsize != 0 &&
1458 (ap->logbufsize < XLOG_MIN_RECORD_BSIZE ||
1459 ap->logbufsize > XLOG_MAX_RECORD_BSIZE ||
1460 !is_power_of_2(ap->logbufsize))) {
1461 cmn_err(CE_WARN,
1462 "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
1463 ap->logbufsize);
1464 return XFS_ERROR(EINVAL);
1465 }
1466
1467 error = ENOMEM;
1468
1469 mp->m_logbsize = ap->logbufsize;
1470 mp->m_fsname_len = strlen(ap->fsname) + 1;
1471
1472 mp->m_fsname = kstrdup(ap->fsname, GFP_KERNEL);
1473 if (!mp->m_fsname)
1474 goto out;
1475
1476 if (ap->rtname[0]) {
1477 mp->m_rtname = kstrdup(ap->rtname, GFP_KERNEL);
1478 if (!mp->m_rtname)
1479 goto out_free_fsname;
1480
1481 }
1482
1483 if (ap->logname[0]) {
1484 mp->m_logname = kstrdup(ap->logname, GFP_KERNEL);
1485 if (!mp->m_logname)
1486 goto out_free_rtname;
1487 }
1488
1489 if (ap->flags & XFSMNT_WSYNC)
1490 mp->m_flags |= XFS_MOUNT_WSYNC;
1491#if XFS_BIG_INUMS
1492 if (ap->flags & XFSMNT_INO64) {
1493 mp->m_flags |= XFS_MOUNT_INO64;
1494 mp->m_inoadd = XFS_INO64_OFFSET;
1495 }
1496#endif
1497 if (ap->flags & XFSMNT_RETERR)
1498 mp->m_flags |= XFS_MOUNT_RETERR;
1499 if (ap->flags & XFSMNT_NOALIGN)
1500 mp->m_flags |= XFS_MOUNT_NOALIGN;
1501 if (ap->flags & XFSMNT_SWALLOC)
1502 mp->m_flags |= XFS_MOUNT_SWALLOC;
1503 if (ap->flags & XFSMNT_OSYNCISOSYNC)
1504 mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
1505 if (ap->flags & XFSMNT_32BITINODES)
1506 mp->m_flags |= XFS_MOUNT_32BITINODES;
1507
1508 if (ap->flags & XFSMNT_IOSIZE) {
1509 if (ap->iosizelog > XFS_MAX_IO_LOG ||
1510 ap->iosizelog < XFS_MIN_IO_LOG) {
1511 cmn_err(CE_WARN,
1512 "XFS: invalid log iosize: %d [not %d-%d]",
1513 ap->iosizelog, XFS_MIN_IO_LOG,
1514 XFS_MAX_IO_LOG);
1515 return XFS_ERROR(EINVAL);
1516 }
1517
1518 mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
1519 mp->m_readio_log = mp->m_writeio_log = ap->iosizelog;
1520 }
1521
1522 if (ap->flags & XFSMNT_IKEEP)
1523 mp->m_flags |= XFS_MOUNT_IKEEP;
1524 if (ap->flags & XFSMNT_DIRSYNC)
1525 mp->m_flags |= XFS_MOUNT_DIRSYNC;
1526 if (ap->flags & XFSMNT_ATTR2)
1527 mp->m_flags |= XFS_MOUNT_ATTR2;
1528 if (ap->flags & XFSMNT_NOATTR2)
1529 mp->m_flags |= XFS_MOUNT_NOATTR2;
1530
1531 if (ap->flags2 & XFSMNT2_COMPAT_IOSIZE)
1532 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
1533
1534 /*
1535 * no recovery flag requires a read-only mount
1536 */
1537 if (ap->flags & XFSMNT_NORECOVERY) {
1538 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
1539 cmn_err(CE_WARN,
1540 "XFS: tried to mount a FS read-write without recovery!");
1541 return XFS_ERROR(EINVAL);
1542 }
1543 mp->m_flags |= XFS_MOUNT_NORECOVERY;
1544 }
1545
1546 if (ap->flags & XFSMNT_NOUUID)
1547 mp->m_flags |= XFS_MOUNT_NOUUID;
1548 if (ap->flags & XFSMNT_BARRIER)
1549 mp->m_flags |= XFS_MOUNT_BARRIER;
1550 else
1551 mp->m_flags &= ~XFS_MOUNT_BARRIER;
1552
1553 if (ap->flags2 & XFSMNT2_FILESTREAMS)
1554 mp->m_flags |= XFS_MOUNT_FILESTREAMS;
1555
1556 if (ap->flags & XFSMNT_DMAPI)
1557 mp->m_flags |= XFS_MOUNT_DMAPI;
1558 return 0;
1559
1560
1561 out_free_rtname:
1562 kfree(mp->m_rtname);
1563 out_free_fsname:
1564 kfree(mp->m_fsname);
1565 out:
1566 return error;
1567}
1568
1569/*
1570 * This function fills in xfs_mount_t fields based on mount args.
1571 * Note: the superblock _has_ now been read in. 1343 * Note: the superblock _has_ now been read in.
1572 */ 1344 */
1573STATIC int 1345STATIC int
1574xfs_finish_flags( 1346xfs_finish_flags(
1575 struct xfs_mount_args *ap,
1576 struct xfs_mount *mp) 1347 struct xfs_mount *mp)
1577{ 1348{
1578 int ronly = (mp->m_flags & XFS_MOUNT_RDONLY); 1349 int ronly = (mp->m_flags & XFS_MOUNT_RDONLY);
1579 1350
1580 /* Fail a mount where the logbuf is smaller then the log stripe */ 1351 /* Fail a mount where the logbuf is smaller than the log stripe */
1581 if (xfs_sb_version_haslogv2(&mp->m_sb)) { 1352 if (xfs_sb_version_haslogv2(&mp->m_sb)) {
1582 if ((ap->logbufsize <= 0) && 1353 if (mp->m_logbsize <= 0 &&
1583 (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) { 1354 mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) {
1584 mp->m_logbsize = mp->m_sb.sb_logsunit; 1355 mp->m_logbsize = mp->m_sb.sb_logsunit;
1585 } else if (ap->logbufsize > 0 && 1356 } else if (mp->m_logbsize > 0 &&
1586 ap->logbufsize < mp->m_sb.sb_logsunit) { 1357 mp->m_logbsize < mp->m_sb.sb_logsunit) {
1587 cmn_err(CE_WARN, 1358 cmn_err(CE_WARN,
1588 "XFS: logbuf size must be greater than or equal to log stripe size"); 1359 "XFS: logbuf size must be greater than or equal to log stripe size");
1589 return XFS_ERROR(EINVAL); 1360 return XFS_ERROR(EINVAL);
1590 } 1361 }
1591 } else { 1362 } else {
1592 /* Fail a mount if the logbuf is larger than 32K */ 1363 /* Fail a mount if the logbuf is larger than 32K */
1593 if (ap->logbufsize > XLOG_BIG_RECORD_BSIZE) { 1364 if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) {
1594 cmn_err(CE_WARN, 1365 cmn_err(CE_WARN,
1595 "XFS: logbuf size for version 1 logs must be 16K or 32K"); 1366 "XFS: logbuf size for version 1 logs must be 16K or 32K");
1596 return XFS_ERROR(EINVAL); 1367 return XFS_ERROR(EINVAL);
@@ -1602,7 +1373,7 @@ xfs_finish_flags(
1602 * told by noattr2 to turn it off 1373 * told by noattr2 to turn it off
1603 */ 1374 */
1604 if (xfs_sb_version_hasattr2(&mp->m_sb) && 1375 if (xfs_sb_version_hasattr2(&mp->m_sb) &&
1605 !(ap->flags & XFSMNT_NOATTR2)) 1376 !(mp->m_flags & XFS_MOUNT_NOATTR2))
1606 mp->m_flags |= XFS_MOUNT_ATTR2; 1377 mp->m_flags |= XFS_MOUNT_ATTR2;
1607 1378
1608 /* 1379 /*
@@ -1614,48 +1385,6 @@ xfs_finish_flags(
1614 return XFS_ERROR(EROFS); 1385 return XFS_ERROR(EROFS);
1615 } 1386 }
1616 1387
1617 /*
1618 * check for shared mount.
1619 */
1620 if (ap->flags & XFSMNT_SHARED) {
1621 if (!xfs_sb_version_hasshared(&mp->m_sb))
1622 return XFS_ERROR(EINVAL);
1623
1624 /*
1625 * For IRIX 6.5, shared mounts must have the shared
1626 * version bit set, have the persistent readonly
1627 * field set, must be version 0 and can only be mounted
1628 * read-only.
1629 */
1630 if (!ronly || !(mp->m_sb.sb_flags & XFS_SBF_READONLY) ||
1631 (mp->m_sb.sb_shared_vn != 0))
1632 return XFS_ERROR(EINVAL);
1633
1634 mp->m_flags |= XFS_MOUNT_SHARED;
1635
1636 /*
1637 * Shared XFS V0 can't deal with DMI. Return EINVAL.
1638 */
1639 if (mp->m_sb.sb_shared_vn == 0 && (ap->flags & XFSMNT_DMAPI))
1640 return XFS_ERROR(EINVAL);
1641 }
1642
1643 if (ap->flags & XFSMNT_UQUOTA) {
1644 mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
1645 if (ap->flags & XFSMNT_UQUOTAENF)
1646 mp->m_qflags |= XFS_UQUOTA_ENFD;
1647 }
1648
1649 if (ap->flags & XFSMNT_GQUOTA) {
1650 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
1651 if (ap->flags & XFSMNT_GQUOTAENF)
1652 mp->m_qflags |= XFS_OQUOTA_ENFD;
1653 } else if (ap->flags & XFSMNT_PQUOTA) {
1654 mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
1655 if (ap->flags & XFSMNT_PQUOTAENF)
1656 mp->m_qflags |= XFS_OQUOTA_ENFD;
1657 }
1658
1659 return 0; 1388 return 0;
1660} 1389}
1661 1390
@@ -1667,19 +1396,14 @@ xfs_fs_fill_super(
1667{ 1396{
1668 struct inode *root; 1397 struct inode *root;
1669 struct xfs_mount *mp = NULL; 1398 struct xfs_mount *mp = NULL;
1670 struct xfs_mount_args *args;
1671 int flags = 0, error = ENOMEM; 1399 int flags = 0, error = ENOMEM;
1672 1400 char *mtpt = NULL;
1673 args = xfs_args_allocate(sb, silent);
1674 if (!args)
1675 return -ENOMEM;
1676 1401
1677 mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); 1402 mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
1678 if (!mp) 1403 if (!mp)
1679 goto out_free_args; 1404 goto out;
1680 1405
1681 spin_lock_init(&mp->m_sb_lock); 1406 spin_lock_init(&mp->m_sb_lock);
1682 mutex_init(&mp->m_ilock);
1683 mutex_init(&mp->m_growlock); 1407 mutex_init(&mp->m_growlock);
1684 atomic_set(&mp->m_active_trans, 0); 1408 atomic_set(&mp->m_active_trans, 0);
1685 INIT_LIST_HEAD(&mp->m_sync_list); 1409 INIT_LIST_HEAD(&mp->m_sync_list);
@@ -1689,12 +1413,9 @@ xfs_fs_fill_super(
1689 mp->m_super = sb; 1413 mp->m_super = sb;
1690 sb->s_fs_info = mp; 1414 sb->s_fs_info = mp;
1691 1415
1692 if (sb->s_flags & MS_RDONLY) 1416 error = xfs_parseargs(mp, (char *)data, &mtpt);
1693 mp->m_flags |= XFS_MOUNT_RDONLY;
1694
1695 error = xfs_parseargs(mp, (char *)data, args, 0);
1696 if (error) 1417 if (error)
1697 goto out_free_mp; 1418 goto out_free_fsname;
1698 1419
1699 sb_min_blocksize(sb, BBSIZE); 1420 sb_min_blocksize(sb, BBSIZE);
1700 sb->s_xattr = xfs_xattr_handlers; 1421 sb->s_xattr = xfs_xattr_handlers;
@@ -1702,33 +1423,28 @@ xfs_fs_fill_super(
1702 sb->s_qcop = &xfs_quotactl_operations; 1423 sb->s_qcop = &xfs_quotactl_operations;
1703 sb->s_op = &xfs_super_operations; 1424 sb->s_op = &xfs_super_operations;
1704 1425
1705 error = xfs_dmops_get(mp, args); 1426 error = xfs_dmops_get(mp);
1706 if (error) 1427 if (error)
1707 goto out_free_mp; 1428 goto out_free_fsname;
1708 error = xfs_qmops_get(mp, args); 1429 error = xfs_qmops_get(mp);
1709 if (error) 1430 if (error)
1710 goto out_put_dmops; 1431 goto out_put_dmops;
1711 1432
1712 if (args->flags & XFSMNT_QUIET) 1433 if (silent)
1713 flags |= XFS_MFSI_QUIET; 1434 flags |= XFS_MFSI_QUIET;
1714 1435
1715 error = xfs_open_devices(mp, args); 1436 error = xfs_open_devices(mp);
1716 if (error) 1437 if (error)
1717 goto out_put_qmops; 1438 goto out_put_qmops;
1718 1439
1719 if (xfs_icsb_init_counters(mp)) 1440 if (xfs_icsb_init_counters(mp))
1720 mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB; 1441 mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
1721 1442
1722 /*
1723 * Setup flags based on mount(2) options and then the superblock
1724 */
1725 error = xfs_start_flags(args, mp);
1726 if (error)
1727 goto out_free_fsname;
1728 error = xfs_readsb(mp, flags); 1443 error = xfs_readsb(mp, flags);
1729 if (error) 1444 if (error)
1730 goto out_free_fsname; 1445 goto out_destroy_counters;
1731 error = xfs_finish_flags(args, mp); 1446
1447 error = xfs_finish_flags(mp);
1732 if (error) 1448 if (error)
1733 goto out_free_sb; 1449 goto out_free_sb;
1734 1450
@@ -1747,7 +1463,7 @@ xfs_fs_fill_super(
1747 if (error) 1463 if (error)
1748 goto out_filestream_unmount; 1464 goto out_filestream_unmount;
1749 1465
1750 XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, args->mtpt, args->fsname); 1466 XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname);
1751 1467
1752 sb->s_dirt = 1; 1468 sb->s_dirt = 1;
1753 sb->s_magic = XFS_SB_MAGIC; 1469 sb->s_magic = XFS_SB_MAGIC;
@@ -1772,35 +1488,31 @@ xfs_fs_fill_super(
1772 goto fail_vnrele; 1488 goto fail_vnrele;
1773 } 1489 }
1774 1490
1775 mp->m_sync_work.w_syncer = xfs_sync_worker; 1491 error = xfs_syncd_init(mp);
1776 mp->m_sync_work.w_mount = mp; 1492 if (error)
1777 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
1778 if (IS_ERR(mp->m_sync_task)) {
1779 error = -PTR_ERR(mp->m_sync_task);
1780 goto fail_vnrele; 1493 goto fail_vnrele;
1781 }
1782 1494
1783 xfs_itrace_exit(XFS_I(sb->s_root->d_inode)); 1495 kfree(mtpt);
1784 1496
1785 kfree(args); 1497 xfs_itrace_exit(XFS_I(sb->s_root->d_inode));
1786 return 0; 1498 return 0;
1787 1499
1788 out_filestream_unmount: 1500 out_filestream_unmount:
1789 xfs_filestream_unmount(mp); 1501 xfs_filestream_unmount(mp);
1790 out_free_sb: 1502 out_free_sb:
1791 xfs_freesb(mp); 1503 xfs_freesb(mp);
1792 out_free_fsname: 1504 out_destroy_counters:
1793 xfs_free_fsname(mp);
1794 xfs_icsb_destroy_counters(mp); 1505 xfs_icsb_destroy_counters(mp);
1795 xfs_close_devices(mp); 1506 xfs_close_devices(mp);
1796 out_put_qmops: 1507 out_put_qmops:
1797 xfs_qmops_put(mp); 1508 xfs_qmops_put(mp);
1798 out_put_dmops: 1509 out_put_dmops:
1799 xfs_dmops_put(mp); 1510 xfs_dmops_put(mp);
1800 out_free_mp: 1511 out_free_fsname:
1512 xfs_free_fsname(mp);
1513 kfree(mtpt);
1801 kfree(mp); 1514 kfree(mp);
1802 out_free_args: 1515 out:
1803 kfree(args);
1804 return -error; 1516 return -error;
1805 1517
1806 fail_vnrele: 1518 fail_vnrele:
@@ -1820,8 +1532,6 @@ xfs_fs_fill_super(
1820 xfs_filestream_unmount(mp); 1532 xfs_filestream_unmount(mp);
1821 1533
1822 XFS_bflush(mp->m_ddev_targp); 1534 XFS_bflush(mp->m_ddev_targp);
1823 error = xfs_unmount_flush(mp, 0);
1824 WARN_ON(error);
1825 1535
1826 xfs_unmountfs(mp); 1536 xfs_unmountfs(mp);
1827 goto out_free_sb; 1537 goto out_free_sb;
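
The reshuffled cleanup labels in this function keep the error unwind strictly last-in, first-out: each goto target releases only what had been set up before the failing step, and the option strings (m_fsname, mtpt) are now freed on every failure path. Condensed from the hunks above, nothing new added:

	error = xfs_readsb(mp, flags);
	if (error)
		goto out_destroy_counters;
	error = xfs_finish_flags(mp);
	if (error)
		goto out_free_sb;
	/* ... */
	return 0;

 out_free_sb:
	xfs_freesb(mp);
 out_destroy_counters:
	xfs_icsb_destroy_counters(mp);
	xfs_close_devices(mp);
 out_put_qmops:
	xfs_qmops_put(mp);
 out_put_dmops:
	xfs_dmops_put(mp);
 out_free_fsname:
	xfs_free_fsname(mp);
	kfree(mtpt);
	kfree(mp);
 out:
	return -error;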
@@ -1847,7 +1557,7 @@ static struct super_operations xfs_super_operations = {
1847 .put_super = xfs_fs_put_super, 1557 .put_super = xfs_fs_put_super,
1848 .write_super = xfs_fs_write_super, 1558 .write_super = xfs_fs_write_super,
1849 .sync_fs = xfs_fs_sync_super, 1559 .sync_fs = xfs_fs_sync_super,
1850 .write_super_lockfs = xfs_fs_lockfs, 1560 .freeze_fs = xfs_fs_freeze,
1851 .statfs = xfs_fs_statfs, 1561 .statfs = xfs_fs_statfs,
1852 .remount_fs = xfs_fs_remount, 1562 .remount_fs = xfs_fs_remount,
1853 .show_options = xfs_fs_show_options, 1563 .show_options = xfs_fs_show_options,
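
The .write_super_lockfs to .freeze_fs change tracks the generic VFS rename of the freeze hook that came in with the v2.6.29-rc1 merge above; the hook also gains an int return so the freeze path can report errors. A minimal sketch of the new wiring (function and table names here are hypothetical, not from this patch):

	static int example_freeze(struct super_block *sb)
	{
		/* quiesce data and metadata before the blockdev is frozen */
		return 0;
	}

	static struct super_operations example_sops = {
		.freeze_fs	= example_freeze,	/* was .write_super_lockfs */
	};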
@@ -1882,10 +1592,19 @@ xfs_alloc_trace_bufs(void)
1882 if (!xfs_bmap_trace_buf) 1592 if (!xfs_bmap_trace_buf)
1883 goto out_free_alloc_trace; 1593 goto out_free_alloc_trace;
1884#endif 1594#endif
1885#ifdef XFS_BMBT_TRACE 1595#ifdef XFS_BTREE_TRACE
1596 xfs_allocbt_trace_buf = ktrace_alloc(XFS_ALLOCBT_TRACE_SIZE,
1597 KM_MAYFAIL);
1598 if (!xfs_allocbt_trace_buf)
1599 goto out_free_bmap_trace;
1600
1601 xfs_inobt_trace_buf = ktrace_alloc(XFS_INOBT_TRACE_SIZE, KM_MAYFAIL);
1602 if (!xfs_inobt_trace_buf)
1603 goto out_free_allocbt_trace;
1604
1886 xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_MAYFAIL); 1605 xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_MAYFAIL);
1887 if (!xfs_bmbt_trace_buf) 1606 if (!xfs_bmbt_trace_buf)
1888 goto out_free_bmap_trace; 1607 goto out_free_inobt_trace;
1889#endif 1608#endif
1890#ifdef XFS_ATTR_TRACE 1609#ifdef XFS_ATTR_TRACE
1891 xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_MAYFAIL); 1610 xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_MAYFAIL);
@@ -1907,8 +1626,12 @@ xfs_alloc_trace_bufs(void)
1907 ktrace_free(xfs_attr_trace_buf); 1626 ktrace_free(xfs_attr_trace_buf);
1908 out_free_bmbt_trace: 1627 out_free_bmbt_trace:
1909#endif 1628#endif
1910#ifdef XFS_BMBT_TRACE 1629#ifdef XFS_BTREE_TRACE
1911 ktrace_free(xfs_bmbt_trace_buf); 1630 ktrace_free(xfs_bmbt_trace_buf);
1631 out_free_inobt_trace:
1632 ktrace_free(xfs_inobt_trace_buf);
1633 out_free_allocbt_trace:
1634 ktrace_free(xfs_allocbt_trace_buf);
1912 out_free_bmap_trace: 1635 out_free_bmap_trace:
1913#endif 1636#endif
1914#ifdef XFS_BMAP_TRACE 1637#ifdef XFS_BMAP_TRACE
@@ -1931,8 +1654,10 @@ xfs_free_trace_bufs(void)
1931#ifdef XFS_ATTR_TRACE 1654#ifdef XFS_ATTR_TRACE
1932 ktrace_free(xfs_attr_trace_buf); 1655 ktrace_free(xfs_attr_trace_buf);
1933#endif 1656#endif
1934#ifdef XFS_BMBT_TRACE 1657#ifdef XFS_BTREE_TRACE
1935 ktrace_free(xfs_bmbt_trace_buf); 1658 ktrace_free(xfs_bmbt_trace_buf);
1659 ktrace_free(xfs_inobt_trace_buf);
1660 ktrace_free(xfs_allocbt_trace_buf);
1936#endif 1661#endif
1937#ifdef XFS_BMAP_TRACE 1662#ifdef XFS_BMAP_TRACE
1938 ktrace_free(xfs_bmap_trace_buf); 1663 ktrace_free(xfs_bmap_trace_buf);
@@ -1945,16 +1670,10 @@ xfs_free_trace_bufs(void)
1945STATIC int __init 1670STATIC int __init
1946xfs_init_zones(void) 1671xfs_init_zones(void)
1947{ 1672{
1948 xfs_vnode_zone = kmem_zone_init_flags(sizeof(struct inode), "xfs_vnode",
1949 KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
1950 KM_ZONE_SPREAD,
1951 xfs_fs_inode_init_once);
1952 if (!xfs_vnode_zone)
1953 goto out;
1954 1673
1955 xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend"); 1674 xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
1956 if (!xfs_ioend_zone) 1675 if (!xfs_ioend_zone)
1957 goto out_destroy_vnode_zone; 1676 goto out;
1958 1677
1959 xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE, 1678 xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE,
1960 xfs_ioend_zone); 1679 xfs_ioend_zone);
@@ -1970,6 +1689,7 @@ xfs_init_zones(void)
1970 "xfs_bmap_free_item"); 1689 "xfs_bmap_free_item");
1971 if (!xfs_bmap_free_item_zone) 1690 if (!xfs_bmap_free_item_zone)
1972 goto out_destroy_log_ticket_zone; 1691 goto out_destroy_log_ticket_zone;
1692
1973 xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t), 1693 xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
1974 "xfs_btree_cur"); 1694 "xfs_btree_cur");
1975 if (!xfs_btree_cur_zone) 1695 if (!xfs_btree_cur_zone)
@@ -2017,8 +1737,8 @@ xfs_init_zones(void)
2017 1737
2018 xfs_inode_zone = 1738 xfs_inode_zone =
2019 kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode", 1739 kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
2020 KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | 1740 KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD,
2021 KM_ZONE_SPREAD, NULL); 1741 xfs_fs_inode_init_once);
2022 if (!xfs_inode_zone) 1742 if (!xfs_inode_zone)
2023 goto out_destroy_efi_zone; 1743 goto out_destroy_efi_zone;
2024 1744
@@ -2066,8 +1786,6 @@ xfs_init_zones(void)
2066 mempool_destroy(xfs_ioend_pool); 1786 mempool_destroy(xfs_ioend_pool);
2067 out_destroy_ioend_zone: 1787 out_destroy_ioend_zone:
2068 kmem_zone_destroy(xfs_ioend_zone); 1788 kmem_zone_destroy(xfs_ioend_zone);
2069 out_destroy_vnode_zone:
2070 kmem_zone_destroy(xfs_vnode_zone);
2071 out: 1789 out:
2072 return -ENOMEM; 1790 return -ENOMEM;
2073} 1791}
@@ -2092,7 +1810,6 @@ xfs_destroy_zones(void)
2092 kmem_zone_destroy(xfs_log_ticket_zone); 1810 kmem_zone_destroy(xfs_log_ticket_zone);
2093 mempool_destroy(xfs_ioend_pool); 1811 mempool_destroy(xfs_ioend_pool);
2094 kmem_zone_destroy(xfs_ioend_zone); 1812 kmem_zone_destroy(xfs_ioend_zone);
2095 kmem_zone_destroy(xfs_vnode_zone);
2096 1813
2097} 1814}
2098 1815
@@ -2100,13 +1817,12 @@ STATIC int __init
2100init_xfs_fs(void) 1817init_xfs_fs(void)
2101{ 1818{
2102 int error; 1819 int error;
2103 static char message[] __initdata = KERN_INFO \
2104 XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n";
2105 1820
2106 printk(message); 1821 printk(KERN_INFO XFS_VERSION_STRING " with "
1822 XFS_BUILD_OPTIONS " enabled\n");
2107 1823
2108 ktrace_init(64); 1824 ktrace_init(64);
2109 vn_init(); 1825 xfs_ioend_init();
2110 xfs_dir_startup(); 1826 xfs_dir_startup();
2111 1827
2112 error = xfs_init_zones(); 1828 error = xfs_init_zones();
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index fe2ef4e6a0f9..d5d776d4cd67 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -20,24 +20,12 @@
20 20
21#include <linux/exportfs.h> 21#include <linux/exportfs.h>
22 22
23#ifdef CONFIG_XFS_DMAPI
24# define vfs_insertdmapi(vfs) vfs_insertops(vfsp, &xfs_dmops)
25# define vfs_initdmapi() dmapi_init()
26# define vfs_exitdmapi() dmapi_uninit()
27#else
28# define vfs_insertdmapi(vfs) do { } while (0)
29# define vfs_initdmapi() do { } while (0)
30# define vfs_exitdmapi() do { } while (0)
31#endif
32
33#ifdef CONFIG_XFS_QUOTA 23#ifdef CONFIG_XFS_QUOTA
34# define vfs_insertquota(vfs) vfs_insertops(vfsp, &xfs_qmops)
35extern void xfs_qm_init(void); 24extern void xfs_qm_init(void);
36extern void xfs_qm_exit(void); 25extern void xfs_qm_exit(void);
37# define vfs_initquota() xfs_qm_init() 26# define vfs_initquota() xfs_qm_init()
38# define vfs_exitquota() xfs_qm_exit() 27# define vfs_exitquota() xfs_qm_exit()
39#else 28#else
40# define vfs_insertquota(vfs) do { } while (0)
41# define vfs_initquota() do { } while (0) 29# define vfs_initquota() do { } while (0)
42# define vfs_exitquota() do { } while (0) 30# define vfs_exitquota() do { } while (0)
43#endif 31#endif
@@ -101,9 +89,6 @@ struct block_device;
101 89
102extern __uint64_t xfs_max_file_offset(unsigned int); 90extern __uint64_t xfs_max_file_offset(unsigned int);
103 91
104extern void xfs_flush_inode(struct xfs_inode *);
105extern void xfs_flush_device(struct xfs_inode *);
106
107extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); 92extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
108 93
109extern const struct export_operations xfs_export_operations; 94extern const struct export_operations xfs_export_operations;
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
new file mode 100644
index 000000000000..2ed035354c26
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -0,0 +1,762 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h"
30#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_inode.h"
37#include "xfs_dinode.h"
38#include "xfs_error.h"
39#include "xfs_mru_cache.h"
40#include "xfs_filestream.h"
41#include "xfs_vnodeops.h"
42#include "xfs_utils.h"
43#include "xfs_buf_item.h"
44#include "xfs_inode_item.h"
45#include "xfs_rw.h"
46
47#include <linux/kthread.h>
48#include <linux/freezer.h>
49
50/*
51 * Sync all the inodes in the given AG according to the
52 * direction given by the flags.
53 */
54STATIC int
55xfs_sync_inodes_ag(
56 xfs_mount_t *mp,
57 int ag,
58 int flags)
59{
60 xfs_perag_t *pag = &mp->m_perag[ag];
61 int nr_found;
62 uint32_t first_index = 0;
63 int error = 0;
64 int last_error = 0;
65 int fflag = XFS_B_ASYNC;
66
67 if (flags & SYNC_DELWRI)
68 fflag = XFS_B_DELWRI;
69 if (flags & SYNC_WAIT)
70 fflag = 0; /* synchronous overrides all */
71
72 do {
73 struct inode *inode;
74 xfs_inode_t *ip = NULL;
75 int lock_flags = XFS_ILOCK_SHARED;
76
77 /*
78 * Use a gang lookup to find the next inode in the tree:
79 * the tree is sparse, and a gang lookup walks it until it
80 * has found the requested number of objects.
81 */
82 read_lock(&pag->pag_ici_lock);
83 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
84 (void**)&ip, first_index, 1);
85
86 if (!nr_found) {
87 read_unlock(&pag->pag_ici_lock);
88 break;
89 }
90
91 /*
92 * Update the index for the next lookup. Catch overflows
93 * into the next AG range which can occur if we have inodes
94 * in the last block of the AG and we are currently
95 * pointing to the last inode.
96 */
97 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
98 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
99 read_unlock(&pag->pag_ici_lock);
100 break;
101 }
102
103 /* nothing to sync during shutdown */
104 if (XFS_FORCED_SHUTDOWN(mp)) {
105 read_unlock(&pag->pag_ici_lock);
106 return 0;
107 }
108
109 /*
110 * If we can't get a reference on the inode, it must be
111 * in reclaim. Leave it for the reclaim code to flush.
112 */
113 inode = VFS_I(ip);
114 if (!igrab(inode)) {
115 read_unlock(&pag->pag_ici_lock);
116 continue;
117 }
118 read_unlock(&pag->pag_ici_lock);
119
120 /* avoid new or bad inodes */
121 if (is_bad_inode(inode) ||
122 xfs_iflags_test(ip, XFS_INEW)) {
123 IRELE(ip);
124 continue;
125 }
126
127 /*
128 * If we have to flush data or wait for I/O completion
129 * we need to hold the iolock.
130 */
131 if ((flags & SYNC_DELWRI) && VN_DIRTY(inode)) {
132 xfs_ilock(ip, XFS_IOLOCK_SHARED);
133 lock_flags |= XFS_IOLOCK_SHARED;
134 error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE);
135 if (flags & SYNC_IOWAIT)
136 xfs_ioend_wait(ip);
137 }
138 xfs_ilock(ip, XFS_ILOCK_SHARED);
139
140 if ((flags & SYNC_ATTR) && !xfs_inode_clean(ip)) {
141 if (flags & SYNC_WAIT) {
142 xfs_iflock(ip);
143 if (!xfs_inode_clean(ip))
144 error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
145 else
146 xfs_ifunlock(ip);
147 } else if (xfs_iflock_nowait(ip)) {
148 if (!xfs_inode_clean(ip))
149 error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
150 else
151 xfs_ifunlock(ip);
152 }
153 }
154 xfs_iput(ip, lock_flags);
155
156 if (error)
157 last_error = error;
158 /*
159 * bail out if the filesystem is corrupted.
160 */
161 if (error == EFSCORRUPTED)
162 return XFS_ERROR(error);
163
164 } while (nr_found);
165
166 return last_error;
167}
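
This walk is the stock pattern for iterating a sparse radix tree one object at a time. Stripped of locking and the per-inode work, the loop above reduces to the following (same names as the code, nothing added):

	uint32_t	first_index = 0;
	int		nr_found;

	do {
		xfs_inode_t	*ip = NULL;

		/* fetch at most one inode at or after first_index */
		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
				(void **)&ip, first_index, 1);
		if (!nr_found)
			break;

		/* step past it; a wrap below ip means we left this AG */
		first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
		if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
			break;

		/* ... grab, sync and release ip ... */
	} while (nr_found);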
168
169int
170xfs_sync_inodes(
171 xfs_mount_t *mp,
172 int flags)
173{
174 int error;
175 int last_error;
176 int i;
177 int lflags = XFS_LOG_FORCE;
178
179 if (mp->m_flags & XFS_MOUNT_RDONLY)
180 return 0;
181 error = 0;
182 last_error = 0;
183
184 if (flags & SYNC_WAIT)
185 lflags |= XFS_LOG_SYNC;
186
187 for (i = 0; i < mp->m_sb.sb_agcount; i++) {
188 if (!mp->m_perag[i].pag_ici_init)
189 continue;
190 error = xfs_sync_inodes_ag(mp, i, flags);
191 if (error)
192 last_error = error;
193 if (error == EFSCORRUPTED)
194 break;
195 }
196 if (flags & SYNC_DELWRI)
197 xfs_log_force(mp, 0, lflags);
198
199 return XFS_ERROR(last_error);
200}
201
202STATIC int
203xfs_commit_dummy_trans(
204 struct xfs_mount *mp,
205 uint log_flags)
206{
207 struct xfs_inode *ip = mp->m_rootip;
208 struct xfs_trans *tp;
209 int error;
210
211 /*
212 * Put a dummy transaction in the log to tell recovery
213 * that all others are OK.
214 */
215 tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
216 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
217 if (error) {
218 xfs_trans_cancel(tp, 0);
219 return error;
220 }
221
222 xfs_ilock(ip, XFS_ILOCK_EXCL);
223
224 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
225 xfs_trans_ihold(tp, ip);
226 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
227 /* XXX(hch): ignoring the error here.. */
228 error = xfs_trans_commit(tp, 0);
229
230 xfs_iunlock(ip, XFS_ILOCK_EXCL);
231
232 xfs_log_force(mp, 0, log_flags);
233 return 0;
234}
235
236int
237xfs_sync_fsdata(
238 struct xfs_mount *mp,
239 int flags)
240{
241 struct xfs_buf *bp;
242 struct xfs_buf_log_item *bip;
243 int error = 0;
244
245 /*
246 * If this is xfssyncd() then only sync the superblock if we can
247 * lock it without sleeping and it is not pinned.
248 */
249 if (flags & SYNC_BDFLUSH) {
250 ASSERT(!(flags & SYNC_WAIT));
251
252 bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
253 if (!bp)
254 goto out;
255
256 bip = XFS_BUF_FSPRIVATE(bp, struct xfs_buf_log_item *);
257 if (!bip || !xfs_buf_item_dirty(bip) || XFS_BUF_ISPINNED(bp))
258 goto out_brelse;
259 } else {
260 bp = xfs_getsb(mp, 0);
261
262 /*
263 * If the buffer is pinned then push on the log so we won't
264 * get stuck waiting in the write for someone, maybe
265 * ourselves, to flush the log.
266 *
267 * Even though we just pushed the log above, we did not have
268 * the superblock buffer locked at that point so it can
269 * become pinned in between there and here.
270 */
271 if (XFS_BUF_ISPINNED(bp))
272 xfs_log_force(mp, 0, XFS_LOG_FORCE);
273 }
274
275
276 if (flags & SYNC_WAIT)
277 XFS_BUF_UNASYNC(bp);
278 else
279 XFS_BUF_ASYNC(bp);
280
281 return xfs_bwrite(mp, bp);
282
283 out_brelse:
284 xfs_buf_relse(bp);
285 out:
286 return error;
287}
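
The SYNC_BDFLUSH branch is a try-lock fast path: the periodic daemon must never sleep on the superblock buffer, while explicit sync callers may block and push the log if the buffer is pinned. Distilled from the function above:

	if (flags & SYNC_BDFLUSH) {
		bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);	/* never sleep */
		if (!bp)
			goto out;	/* busy now, retry next sync period */
	} else {
		bp = xfs_getsb(mp, 0);			/* may block */
		if (XFS_BUF_ISPINNED(bp))
			xfs_log_force(mp, 0, XFS_LOG_FORCE);
	}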
288
289/*
290 * When remounting a filesystem read-only or freezing the filesystem, we have
291 * two phases to execute. This first phase is syncing the data before we
292 * quiesce the filesystem, and the second is flushing all the inodes out after
293 * we've waited for all the transactions created by the first phase to
294 * complete. The second phase ensures that the inodes are written to their
295 * location on disk rather than just existing in transactions in the log. This
296 * means after a quiesce there is no log replay required to write the inodes to
297 * disk (this is the main difference between a sync and a quiesce).
298 */
299/*
300 * First stage of freeze - no writers will make progress now we are here,
301 * so we flush delwri and delalloc buffers here, then wait for all I/O to
302 * complete. Data is frozen at that point. Metadata is not frozen;
303 * transactions can still occur here, so don't bother flushing the buftarg
304 * because it'll just get dirty again.
305 */
306int
307xfs_quiesce_data(
308 struct xfs_mount *mp)
309{
310 int error;
311
312 /* push non-blocking */
313 xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_BDFLUSH);
314 XFS_QM_DQSYNC(mp, SYNC_BDFLUSH);
315 xfs_filestream_flush(mp);
316
317 /* push and block */
318 xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_WAIT|SYNC_IOWAIT);
319 XFS_QM_DQSYNC(mp, SYNC_WAIT);
320
321 /* write superblock and hoover up shutdown errors */
322 error = xfs_sync_fsdata(mp, 0);
323
324 /* flush data-only devices */
325 if (mp->m_rtdev_targp)
326 XFS_bflush(mp->m_rtdev_targp);
327
328 return error;
329}
330
331STATIC void
332xfs_quiesce_fs(
333 struct xfs_mount *mp)
334{
335 int count = 0, pincount;
336
337 xfs_flush_buftarg(mp->m_ddev_targp, 0);
338 xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
339
340 /*
341 * This loop must run at least twice. The first pass flushes most
342 * metadata, but doing so generates more metadata (typically directory
343 * updates), which must in turn be flushed and logged before we can
344 * write the unmount record.
345 */
346 do {
347 xfs_sync_inodes(mp, SYNC_ATTR|SYNC_WAIT);
348 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
349 if (!pincount) {
350 delay(50);
351 count++;
352 }
353 } while (count < 2);
354}
355
356/*
357 * Second stage of a quiesce. The data is already synced, now we have to take
358 * care of the metadata. New transactions are already blocked, so we need to
359 * wait for any remaining transactions to drain out before proceeding.
360 */
361void
362xfs_quiesce_attr(
363 struct xfs_mount *mp)
364{
365 int error = 0;
366
367 /* wait for all modifications to complete */
368 while (atomic_read(&mp->m_active_trans) > 0)
369 delay(100);
370
371 /* flush inodes and push all remaining buffers out to disk */
372 xfs_quiesce_fs(mp);
373
374 ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0);
375
376 /* Push the superblock and write an unmount record */
377 error = xfs_log_sbcount(mp, 1);
378 if (error)
379 xfs_fs_cmn_err(CE_WARN, mp,
380 "xfs_attr_quiesce: failed to log sb changes. "
381 "Frozen image may not be consistent.");
382 xfs_log_unmount_write(mp);
383 xfs_unmountfs_writesb(mp);
384}
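
Together these two routines implement the two phases described in the comment block before xfs_quiesce_data(). A sketch of how a freeze path is expected to chain them (the wrapper name is hypothetical; the two helpers are the ones defined in this file):

	int example_freeze(struct xfs_mount *mp)
	{
		int	error;

		error = xfs_quiesce_data(mp);	/* phase 1: flush data, wait for I/O */
		if (!error)
			xfs_quiesce_attr(mp);	/* phase 2: drain transactions,
						 * write inodes to their disk
						 * locations */
		return error;
	}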
385
386/*
387 * Enqueue a work item to be picked up by the vfs xfssyncd thread.
388 * Doing this has two advantages:
389 * - It saves on stack space, which is tight in certain situations
390 * - It can be used (with care) as a mechanism to avoid deadlocks.
391 * Flushing while allocating in a full filesystem requires both.
392 */
393STATIC void
394xfs_syncd_queue_work(
395 struct xfs_mount *mp,
396 void *data,
397 void (*syncer)(struct xfs_mount *, void *))
398{
399 struct bhv_vfs_sync_work *work;
400
401 work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
402 INIT_LIST_HEAD(&work->w_list);
403 work->w_syncer = syncer;
404 work->w_data = data;
405 work->w_mount = mp;
406 spin_lock(&mp->m_sync_lock);
407 list_add_tail(&work->w_list, &mp->m_sync_list);
408 spin_unlock(&mp->m_sync_lock);
409 wake_up_process(mp->m_sync_task);
410}
411
412/*
413 * Flush delayed allocate data, attempting to free up reserved space
414 * from existing allocations. At this point a new allocation attempt
415 * has failed with ENOSPC and we are in the process of scratching our
416 * heads, looking about for more room...
417 */
418STATIC void
419xfs_flush_inode_work(
420 struct xfs_mount *mp,
421 void *arg)
422{
423 struct inode *inode = arg;
424 filemap_flush(inode->i_mapping);
425 iput(inode);
426}
427
428void
429xfs_flush_inode(
430 xfs_inode_t *ip)
431{
432 struct inode *inode = VFS_I(ip);
433
434 igrab(inode);
435 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
436 delay(msecs_to_jiffies(500));
437}
438
439/*
440 * This is the "bigger hammer" version of xfs_flush_inode_work...
441 * (IOW, "If at first you don't succeed, use a Bigger Hammer").
442 */
443STATIC void
444xfs_flush_device_work(
445 struct xfs_mount *mp,
446 void *arg)
447{
448 struct inode *inode = arg;
449 sync_blockdev(mp->m_super->s_bdev);
450 iput(inode);
451}
452
453void
454xfs_flush_device(
455 xfs_inode_t *ip)
456{
457 struct inode *inode = VFS_I(ip);
458
459 igrab(inode);
460 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
461 delay(msecs_to_jiffies(500));
462 xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
463}
464
465/*
466 * Every sync period we need to unpin all items, reclaim inodes, sync
467 * quota and write out the superblock. We might need to cover the log
468 * to indicate it is idle.
469 */
470STATIC void
471xfs_sync_worker(
472 struct xfs_mount *mp,
473 void *unused)
474{
475 int error;
476
477 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
478 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
479 xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
480 /* dgc: errors ignored here */
481 error = XFS_QM_DQSYNC(mp, SYNC_BDFLUSH);
482 error = xfs_sync_fsdata(mp, SYNC_BDFLUSH);
483 if (xfs_log_need_covered(mp))
484 error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE);
485 }
486 mp->m_sync_seq++;
487 wake_up(&mp->m_wait_single_sync_task);
488}
489
490STATIC int
491xfssyncd(
492 void *arg)
493{
494 struct xfs_mount *mp = arg;
495 long timeleft;
496 bhv_vfs_sync_work_t *work, *n;
497 LIST_HEAD (tmp);
498
499 set_freezable();
500 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
501 for (;;) {
502 timeleft = schedule_timeout_interruptible(timeleft);
503 /* swsusp */
504 try_to_freeze();
505 if (kthread_should_stop() && list_empty(&mp->m_sync_list))
506 break;
507
508 spin_lock(&mp->m_sync_lock);
509 /*
510 * We can get woken by laptop mode, to do a sync -
511 * that's the (only!) case where the list would be
512 * empty with time remaining.
513 */
514 if (!timeleft || list_empty(&mp->m_sync_list)) {
515 if (!timeleft)
516 timeleft = xfs_syncd_centisecs *
517 msecs_to_jiffies(10);
518 INIT_LIST_HEAD(&mp->m_sync_work.w_list);
519 list_add_tail(&mp->m_sync_work.w_list,
520 &mp->m_sync_list);
521 }
522 list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list)
523 list_move(&work->w_list, &tmp);
524 spin_unlock(&mp->m_sync_lock);
525
526 list_for_each_entry_safe(work, n, &tmp, w_list) {
527 (*work->w_syncer)(mp, work->w_data);
528 list_del(&work->w_list);
529 if (work == &mp->m_sync_work)
530 continue;
531 kmem_free(work);
532 }
533 }
534
535 return 0;
536}
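
xfssyncd() is a conventional freezable kernel thread. Its skeleton, with the work-list plumbing elided and an illustrative 30-second interval substituted for the xfs_syncd_centisecs tunable:

	static int example_syncd(void *arg)
	{
		struct xfs_mount	*mp = arg;

		set_freezable();
		for (;;) {
			schedule_timeout_interruptible(HZ * 30);
			try_to_freeze();	/* cooperate with suspend */
			if (kthread_should_stop() &&
			    list_empty(&mp->m_sync_list))
				break;
			/* ... splice m_sync_list and run each w_syncer ... */
		}
		return 0;
	}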
537
538int
539xfs_syncd_init(
540 struct xfs_mount *mp)
541{
542 mp->m_sync_work.w_syncer = xfs_sync_worker;
543 mp->m_sync_work.w_mount = mp;
544 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
545 if (IS_ERR(mp->m_sync_task))
546 return -PTR_ERR(mp->m_sync_task);
547 return 0;
548}
549
550void
551xfs_syncd_stop(
552 struct xfs_mount *mp)
553{
554 kthread_stop(mp->m_sync_task);
555}
556
557int
558xfs_reclaim_inode(
559 xfs_inode_t *ip,
560 int locked,
561 int sync_mode)
562{
563 xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
564
565 /* The hash lock here protects a thread in xfs_iget_core from
566 * racing with us on linking the inode back with a vnode.
567 * Once we have the XFS_IRECLAIM flag set it will not touch
568 * us.
569 */
570 write_lock(&pag->pag_ici_lock);
571 spin_lock(&ip->i_flags_lock);
572 if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
573 !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
574 spin_unlock(&ip->i_flags_lock);
575 write_unlock(&pag->pag_ici_lock);
576 if (locked) {
577 xfs_ifunlock(ip);
578 xfs_iunlock(ip, XFS_ILOCK_EXCL);
579 }
580 return 1;
581 }
582 __xfs_iflags_set(ip, XFS_IRECLAIM);
583 spin_unlock(&ip->i_flags_lock);
584 write_unlock(&pag->pag_ici_lock);
585 xfs_put_perag(ip->i_mount, pag);
586
587 /*
588 * If the inode is still dirty, then flush it out. If the inode
589 * is not in the AIL, then it will be OK to flush it delwri as
590 * long as xfs_iflush() does not keep any references to the inode.
591 * We leave that decision up to xfs_iflush() since it has the
592 * knowledge of whether it's OK to simply do a delwri flush of
593 * the inode or whether we need to wait until the inode is
594 * pulled from the AIL.
595 * We get the flush lock regardless, though, just to make sure
596 * we don't free it while it is being flushed.
597 */
598 if (!locked) {
599 xfs_ilock(ip, XFS_ILOCK_EXCL);
600 xfs_iflock(ip);
601 }
602
603 /*
604 * In the case of a forced shutdown we rely on xfs_iflush() to
605 * wait for the inode to be unpinned before returning an error.
606 */
607 if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
608 /* synchronize with xfs_iflush_done */
609 xfs_iflock(ip);
610 xfs_ifunlock(ip);
611 }
612
613 xfs_iunlock(ip, XFS_ILOCK_EXCL);
614 xfs_ireclaim(ip);
615 return 0;
616}
617
618/*
619 * We set the inode flag atomically with the radix tree tag.
620 * Once we get tag lookups on the radix tree, this inode flag
621 * can go away.
622 */
623void
624xfs_inode_set_reclaim_tag(
625 xfs_inode_t *ip)
626{
627 xfs_mount_t *mp = ip->i_mount;
628 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
629
630 read_lock(&pag->pag_ici_lock);
631 spin_lock(&ip->i_flags_lock);
632 radix_tree_tag_set(&pag->pag_ici_root,
633 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
634 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
635 spin_unlock(&ip->i_flags_lock);
636 read_unlock(&pag->pag_ici_lock);
637 xfs_put_perag(mp, pag);
638}
639
640void
641__xfs_inode_clear_reclaim_tag(
642 xfs_mount_t *mp,
643 xfs_perag_t *pag,
644 xfs_inode_t *ip)
645{
646 radix_tree_tag_clear(&pag->pag_ici_root,
647 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
648}
649
650void
651xfs_inode_clear_reclaim_tag(
652 xfs_inode_t *ip)
653{
654 xfs_mount_t *mp = ip->i_mount;
655 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
656
657 read_lock(&pag->pag_ici_lock);
658 spin_lock(&ip->i_flags_lock);
659 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
660 spin_unlock(&ip->i_flags_lock);
661 read_unlock(&pag->pag_ici_lock);
662 xfs_put_perag(mp, pag);
663}
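
The inode flag and the radix tree tag are kept in lockstep so reclaim can later walk only the tagged entries rather than the whole tree (the comment above notes the flag becomes redundant once tag lookups exist). Both halves of the idiom, reduced to their core:

	/* publish: tag mirrors the XFS_IRECLAIMABLE inode flag */
	radix_tree_tag_set(&pag->pag_ici_root,
			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);

	/* consume: fetch at most one tagged inode at or after first_index */
	nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
			(void **)&ip, first_index, 1, XFS_ICI_RECLAIM_TAG);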
664
665
666STATIC void
667xfs_reclaim_inodes_ag(
668 xfs_mount_t *mp,
669 int ag,
670 int noblock,
671 int mode)
672{
673 xfs_inode_t *ip = NULL;
674 xfs_perag_t *pag = &mp->m_perag[ag];
675 int nr_found;
676 uint32_t first_index;
677 int skipped;
678
679restart:
680 first_index = 0;
681 skipped = 0;
682 do {
683 /*
684 * Use a gang lookup to find the next inode in the tree:
685 * the tree is sparse, and a gang lookup walks it until it
686 * has found the requested number of objects.
687 */
688 read_lock(&pag->pag_ici_lock);
689 nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
690 (void**)&ip, first_index, 1,
691 XFS_ICI_RECLAIM_TAG);
692
693 if (!nr_found) {
694 read_unlock(&pag->pag_ici_lock);
695 break;
696 }
697
698 /*
699 * Update the index for the next lookup. Catch overflows
700 * into the next AG range which can occur if we have inodes
701 * in the last block of the AG and we are currently
702 * pointing to the last inode.
703 */
704 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
705 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
706 read_unlock(&pag->pag_ici_lock);
707 break;
708 }
709
710 /* ignore if already under reclaim */
711 if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
712 read_unlock(&pag->pag_ici_lock);
713 continue;
714 }
715
716 if (noblock) {
717 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
718 read_unlock(&pag->pag_ici_lock);
719 continue;
720 }
721 if (xfs_ipincount(ip) ||
722 !xfs_iflock_nowait(ip)) {
723 xfs_iunlock(ip, XFS_ILOCK_EXCL);
724 read_unlock(&pag->pag_ici_lock);
725 continue;
726 }
727 }
728 read_unlock(&pag->pag_ici_lock);
729
730 /*
731 * hmmm - this is an inode already in reclaim. Do
732 * we even bother catching it here?
733 */
734 if (xfs_reclaim_inode(ip, noblock, mode))
735 skipped++;
736 } while (nr_found);
737
738 if (skipped) {
739 delay(1);
740 goto restart;
741 }
742 return;
743
744}
745
746int
747xfs_reclaim_inodes(
748 xfs_mount_t *mp,
749 int noblock,
750 int mode)
751{
752 int i;
753
754 for (i = 0; i < mp->m_sb.sb_agcount; i++) {
755 if (!mp->m_perag[i].pag_ici_init)
756 continue;
757 xfs_reclaim_inodes_ag(mp, i, noblock, mode);
758 }
759 return 0;
760}
761
762
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
new file mode 100644
index 000000000000..5f6de1efe1f6
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -0,0 +1,55 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef XFS_SYNC_H
19#define XFS_SYNC_H 1
20
21struct xfs_mount;
22
23typedef struct bhv_vfs_sync_work {
24 struct list_head w_list;
25 struct xfs_mount *w_mount;
26 void *w_data; /* syncer routine argument */
27 void (*w_syncer)(struct xfs_mount *, void *);
28} bhv_vfs_sync_work_t;
29
30#define SYNC_ATTR 0x0001 /* sync attributes */
31#define SYNC_DELWRI 0x0002 /* look at delayed writes */
32#define SYNC_WAIT 0x0004 /* wait for i/o to complete */
33#define SYNC_BDFLUSH 0x0008 /* BDFLUSH is calling -- don't block */
34#define SYNC_IOWAIT 0x0010 /* wait for all I/O to complete */
35
36int xfs_syncd_init(struct xfs_mount *mp);
37void xfs_syncd_stop(struct xfs_mount *mp);
38
39int xfs_sync_inodes(struct xfs_mount *mp, int flags);
40int xfs_sync_fsdata(struct xfs_mount *mp, int flags);
41
42int xfs_quiesce_data(struct xfs_mount *mp);
43void xfs_quiesce_attr(struct xfs_mount *mp);
44
45void xfs_flush_inode(struct xfs_inode *ip);
46void xfs_flush_device(struct xfs_inode *ip);
47
48int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
49int xfs_reclaim_inodes(struct xfs_mount *mp, int noblock, int mode);
50
51void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
52void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
53void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
54 struct xfs_inode *ip);
55#endif
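
For reference, the flag combinations xfs_sync.c itself passes to these entry points (copied from xfs_quiesce_data() and xfs_quiesce_fs() above):

	xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_BDFLUSH);		/* non-blocking push */
	xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_WAIT | SYNC_IOWAIT);	/* push and wait for I/O */
	xfs_sync_inodes(mp, SYNC_ATTR | SYNC_WAIT);			/* flush inode metadata */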
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 7dacb5bbde3f..916c0ffb6083 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -56,17 +56,6 @@ xfs_stats_clear_proc_handler(
56 56
57static ctl_table xfs_table[] = { 57static ctl_table xfs_table[] = {
58 { 58 {
59 .ctl_name = XFS_RESTRICT_CHOWN,
60 .procname = "restrict_chown",
61 .data = &xfs_params.restrict_chown.val,
62 .maxlen = sizeof(int),
63 .mode = 0644,
64 .proc_handler = &proc_dointvec_minmax,
65 .strategy = &sysctl_intvec,
66 .extra1 = &xfs_params.restrict_chown.min,
67 .extra2 = &xfs_params.restrict_chown.max
68 },
69 {
70 .ctl_name = XFS_SGID_INHERIT, 59 .ctl_name = XFS_SGID_INHERIT,
71 .procname = "irix_sgid_inherit", 60 .procname = "irix_sgid_inherit",
72 .data = &xfs_params.sgid_inherit.val, 61 .data = &xfs_params.sgid_inherit.val,
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/linux-2.6/xfs_sysctl.h
index 4aadb8056c37..b9937d450f8e 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.h
+++ b/fs/xfs/linux-2.6/xfs_sysctl.h
@@ -31,7 +31,6 @@ typedef struct xfs_sysctl_val {
31} xfs_sysctl_val_t; 31} xfs_sysctl_val_t;
32 32
33typedef struct xfs_param { 33typedef struct xfs_param {
34 xfs_sysctl_val_t restrict_chown;/* Root/non-root can give away files.*/
35 xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is 34 xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is
36 * not a member of parent dir GID. */ 35 * not a member of parent dir GID. */
37 xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */ 36 xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */
@@ -68,7 +67,7 @@ typedef struct xfs_param {
68enum { 67enum {
69 /* XFS_REFCACHE_SIZE = 1 */ 68 /* XFS_REFCACHE_SIZE = 1 */
70 /* XFS_REFCACHE_PURGE = 2 */ 69 /* XFS_REFCACHE_PURGE = 2 */
71 XFS_RESTRICT_CHOWN = 3, 70 /* XFS_RESTRICT_CHOWN = 3 */
72 XFS_SGID_INHERIT = 4, 71 XFS_SGID_INHERIT = 4,
73 XFS_SYMLINK_MODE = 5, 72 XFS_SYMLINK_MODE = 5,
74 XFS_PANIC_MASK = 6, 73 XFS_PANIC_MASK = 6,
diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h
deleted file mode 100644
index 7e60c7776b1c..000000000000
--- a/fs/xfs/linux-2.6/xfs_vfs.h
+++ /dev/null
@@ -1,77 +0,0 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_VFS_H__
19#define __XFS_VFS_H__
20
21#include <linux/vfs.h>
22#include "xfs_fs.h"
23
24struct inode;
25
26struct fid;
27struct cred;
28struct seq_file;
29struct super_block;
30struct xfs_inode;
31struct xfs_mount;
32struct xfs_mount_args;
33
34typedef struct kstatfs bhv_statvfs_t;
35
36typedef struct bhv_vfs_sync_work {
37 struct list_head w_list;
38 struct xfs_mount *w_mount;
39 void *w_data; /* syncer routine argument */
40 void (*w_syncer)(struct xfs_mount *, void *);
41} bhv_vfs_sync_work_t;
42
43#define SYNC_ATTR 0x0001 /* sync attributes */
44#define SYNC_CLOSE 0x0002 /* close file system down */
45#define SYNC_DELWRI 0x0004 /* look at delayed writes */
46#define SYNC_WAIT 0x0008 /* wait for i/o to complete */
47#define SYNC_BDFLUSH 0x0010 /* BDFLUSH is calling -- don't block */
48#define SYNC_FSDATA 0x0020 /* flush fs data (e.g. superblocks) */
49#define SYNC_REFCACHE 0x0040 /* prune some of the nfs ref cache */
50#define SYNC_REMOUNT 0x0080 /* remount readonly, no dummy LRs */
51#define SYNC_IOWAIT 0x0100 /* wait for all I/O to complete */
52
53/*
54 * When remounting a filesystem read-only or freezing the filesystem,
55 * we have two phases to execute. This first phase is syncing the data
56 * before we quiesce the filesystem, and the second is flushing all the
57 * inodes out after we've waited for all the transactions created by
58 * the first phase to complete. The second phase uses SYNC_INODE_QUIESCE
59 * to ensure that the inodes are written to their location on disk
60 * rather than just existing in transactions in the log. This means
61 * after a quiesce there is no log replay required to write the inodes
62 * to disk (this is the main difference between a sync and a quiesce).
63 */
64#define SYNC_DATA_QUIESCE (SYNC_DELWRI|SYNC_FSDATA|SYNC_WAIT|SYNC_IOWAIT)
65#define SYNC_INODE_QUIESCE (SYNC_REMOUNT|SYNC_ATTR|SYNC_WAIT)
66
67#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */
68#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */
69#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */
70#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */
71#define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */
72#define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */
73
74#define xfs_test_for_freeze(mp) ((mp)->m_super->s_frozen)
75#define xfs_wait_for_freeze(mp,l) vfs_check_frozen((mp)->m_super, (l))
76
77#endif /* __XFS_VFS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c
deleted file mode 100644
index b52528bbbfff..000000000000
--- a/fs/xfs/linux-2.6/xfs_vnode.c
+++ /dev/null
@@ -1,145 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_vnodeops.h"
20#include "xfs_bmap_btree.h"
21#include "xfs_inode.h"
22
23/*
24 * And this gunk is needed for xfs_mount.h
25 */
26#include "xfs_log.h"
27#include "xfs_trans.h"
28#include "xfs_sb.h"
29#include "xfs_dmapi.h"
30#include "xfs_inum.h"
31#include "xfs_ag.h"
32#include "xfs_mount.h"
33
34
35/*
36 * Dedicated vnode inactive/reclaim sync wait queues.
37 * Prime number of hash buckets since address is used as the key.
38 */
39#define NVSYNC 37
40#define vptosync(v) (&vsync[((unsigned long)v) % NVSYNC])
41static wait_queue_head_t vsync[NVSYNC];
42
43void __init
44vn_init(void)
45{
46 int i;
47
48 for (i = 0; i < NVSYNC; i++)
49 init_waitqueue_head(&vsync[i]);
50}
51
52void
53vn_iowait(
54 xfs_inode_t *ip)
55{
56 wait_queue_head_t *wq = vptosync(ip);
57
58 wait_event(*wq, (atomic_read(&ip->i_iocount) == 0));
59}
60
61void
62vn_iowake(
63 xfs_inode_t *ip)
64{
65 if (atomic_dec_and_test(&ip->i_iocount))
66 wake_up(vptosync(ip));
67}
68
69/*
70 * Volume managers supporting multiple paths can send back ENODEV when the
71 * final path disappears. In this case continuing to fill the page cache
72 * with dirty data which cannot be written out is evil, so prevent that.
73 */
74void
75vn_ioerror(
76 xfs_inode_t *ip,
77 int error,
78 char *f,
79 int l)
80{
81 if (unlikely(error == -ENODEV))
82 xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ, f, l);
83}
84
85#ifdef XFS_INODE_TRACE
86
87/*
88 * Reference count of Linux inode if present, -1 if the xfs_inode
89 * has no associated Linux inode.
90 */
91static inline int xfs_icount(struct xfs_inode *ip)
92{
93 struct inode *vp = VFS_I(ip);
94
95 if (vp)
96 return vn_count(vp);
97 return -1;
98}
99
100#define KTRACE_ENTER(ip, vk, s, line, ra) \
101 ktrace_enter( (ip)->i_trace, \
102/* 0 */ (void *)(__psint_t)(vk), \
103/* 1 */ (void *)(s), \
104/* 2 */ (void *)(__psint_t) line, \
105/* 3 */ (void *)(__psint_t)xfs_icount(ip), \
106/* 4 */ (void *)(ra), \
107/* 5 */ NULL, \
108/* 6 */ (void *)(__psint_t)current_cpu(), \
109/* 7 */ (void *)(__psint_t)current_pid(), \
110/* 8 */ (void *)__return_address, \
111/* 9 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL)
112
113/*
114 * Vnode tracing code.
115 */
116void
117_xfs_itrace_entry(xfs_inode_t *ip, const char *func, inst_t *ra)
118{
119 KTRACE_ENTER(ip, INODE_KTRACE_ENTRY, func, 0, ra);
120}
121
122void
123_xfs_itrace_exit(xfs_inode_t *ip, const char *func, inst_t *ra)
124{
125 KTRACE_ENTER(ip, INODE_KTRACE_EXIT, func, 0, ra);
126}
127
128void
129xfs_itrace_hold(xfs_inode_t *ip, char *file, int line, inst_t *ra)
130{
131 KTRACE_ENTER(ip, INODE_KTRACE_HOLD, file, line, ra);
132}
133
134void
135_xfs_itrace_ref(xfs_inode_t *ip, char *file, int line, inst_t *ra)
136{
137 KTRACE_ENTER(ip, INODE_KTRACE_REF, file, line, ra);
138}
139
140void
141xfs_itrace_rele(xfs_inode_t *ip, char *file, int line, inst_t *ra)
142{
143 KTRACE_ENTER(ip, INODE_KTRACE_RELE, file, line, ra);
144}
145#endif /* XFS_INODE_TRACE */
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 683ce16210ff..f65983a230d3 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -18,7 +18,10 @@
18#ifndef __XFS_VNODE_H__ 18#ifndef __XFS_VNODE_H__
19#define __XFS_VNODE_H__ 19#define __XFS_VNODE_H__
20 20
21#include "xfs_fs.h"
22
21struct file; 23struct file;
24struct xfs_inode;
22struct xfs_iomap; 25struct xfs_iomap;
23struct attrlist_cursor_kern; 26struct attrlist_cursor_kern;
24 27
@@ -51,40 +54,6 @@ struct attrlist_cursor_kern;
51 Prevent VM access to the pages until 54 Prevent VM access to the pages until
52 the operation completes. */ 55 the operation completes. */
53 56
54
55extern void vn_init(void);
56
57/*
58 * Yeah, these don't take a vnode at all anymore; all this should be
59 * cleaned up at some point.
60 */
61extern void vn_iowait(struct xfs_inode *ip);
62extern void vn_iowake(struct xfs_inode *ip);
63extern void vn_ioerror(struct xfs_inode *ip, int error, char *f, int l);
64
65static inline int vn_count(struct inode *vp)
66{
67 return atomic_read(&vp->i_count);
68}
69
70#define IHOLD(ip) \
71do { \
72 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
73 atomic_inc(&(VFS_I(ip)->i_count)); \
74 xfs_itrace_hold((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
75} while (0)
76
77#define IRELE(ip) \
78do { \
79 xfs_itrace_rele((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
80 iput(VFS_I(ip)); \
81} while (0)
82
83static inline struct inode *vn_grab(struct inode *vp)
84{
85 return igrab(vp);
86}
87
88/* 57/*
89 * Dealing with bad inodes 58 * Dealing with bad inodes
90 */ 59 */
@@ -121,39 +90,4 @@ static inline void vn_atime_to_time_t(struct inode *vp, time_t *tt)
121 PAGECACHE_TAG_DIRTY) 90 PAGECACHE_TAG_DIRTY)
122 91
123 92
124/*
125 * Tracking vnode activity.
126 */
127#if defined(XFS_INODE_TRACE)
128
129#define INODE_TRACE_SIZE 16 /* number of trace entries */
130#define INODE_KTRACE_ENTRY 1
131#define INODE_KTRACE_EXIT 2
132#define INODE_KTRACE_HOLD 3
133#define INODE_KTRACE_REF 4
134#define INODE_KTRACE_RELE 5
135
136extern void _xfs_itrace_entry(struct xfs_inode *, const char *, inst_t *);
137extern void _xfs_itrace_exit(struct xfs_inode *, const char *, inst_t *);
138extern void xfs_itrace_hold(struct xfs_inode *, char *, int, inst_t *);
139extern void _xfs_itrace_ref(struct xfs_inode *, char *, int, inst_t *);
140extern void xfs_itrace_rele(struct xfs_inode *, char *, int, inst_t *);
141#define xfs_itrace_entry(ip) \
142 _xfs_itrace_entry(ip, __func__, (inst_t *)__return_address)
143#define xfs_itrace_exit(ip) \
144 _xfs_itrace_exit(ip, __func__, (inst_t *)__return_address)
145#define xfs_itrace_exit_tag(ip, tag) \
146 _xfs_itrace_exit(ip, tag, (inst_t *)__return_address)
147#define xfs_itrace_ref(ip) \
148 _xfs_itrace_ref(ip, __FILE__, __LINE__, (inst_t *)__return_address)
149
150#else
151#define xfs_itrace_entry(a)
152#define xfs_itrace_exit(a)
153#define xfs_itrace_exit_tag(a, b)
154#define xfs_itrace_hold(a, b, c, d)
155#define xfs_itrace_ref(a)
156#define xfs_itrace_rele(a, b, c, d)
157#endif
158
159#endif /* __XFS_VNODE_H__ */ 93#endif /* __XFS_VNODE_H__ */
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index f2705f2fd43c..591ca6602bfb 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -101,7 +101,7 @@ xfs_qm_dqinit(
101 if (brandnewdquot) { 101 if (brandnewdquot) {
102 dqp->dq_flnext = dqp->dq_flprev = dqp; 102 dqp->dq_flnext = dqp->dq_flprev = dqp;
103 mutex_init(&dqp->q_qlock); 103 mutex_init(&dqp->q_qlock);
104 sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq"); 104 init_waitqueue_head(&dqp->q_pinwait);
105 105
106 /* 106 /*
107 * Because we want to use a counting completion, complete 107 * Because we want to use a counting completion, complete
@@ -131,7 +131,7 @@ xfs_qm_dqinit(
131 dqp->q_res_bcount = 0; 131 dqp->q_res_bcount = 0;
132 dqp->q_res_icount = 0; 132 dqp->q_res_icount = 0;
133 dqp->q_res_rtbcount = 0; 133 dqp->q_res_rtbcount = 0;
134 dqp->q_pincount = 0; 134 atomic_set(&dqp->q_pincount, 0);
135 dqp->q_hash = NULL; 135 dqp->q_hash = NULL;
136 ASSERT(dqp->dq_flnext == dqp->dq_flprev); 136 ASSERT(dqp->dq_flnext == dqp->dq_flprev);
137 137
@@ -1221,16 +1221,14 @@ xfs_qm_dqflush(
1221 xfs_dqtrace_entry(dqp, "DQFLUSH"); 1221 xfs_dqtrace_entry(dqp, "DQFLUSH");
1222 1222
1223 /* 1223 /*
1224 * If not dirty, nada. 1224 * If not dirty, or it's pinned and we are not supposed to
1225 * block, nada.
1225 */ 1226 */
1226 if (!XFS_DQ_IS_DIRTY(dqp)) { 1227 if (!XFS_DQ_IS_DIRTY(dqp) ||
1228 (!(flags & XFS_QMOPT_SYNC) && atomic_read(&dqp->q_pincount) > 0)) {
1227 xfs_dqfunlock(dqp); 1229 xfs_dqfunlock(dqp);
1228 return (0); 1230 return 0;
1229 } 1231 }
1230
1231 /*
1232 * Cant flush a pinned dquot. Wait for it.
1233 */
1234 xfs_qm_dqunpin_wait(dqp); 1232 xfs_qm_dqunpin_wait(dqp);
1235 1233
1236 /* 1234 /*
@@ -1274,10 +1272,8 @@ xfs_qm_dqflush(
1274 dqp->dq_flags &= ~(XFS_DQ_DIRTY); 1272 dqp->dq_flags &= ~(XFS_DQ_DIRTY);
1275 mp = dqp->q_mount; 1273 mp = dqp->q_mount;
1276 1274
1277 /* lsn is 64 bits */ 1275 xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
1278 spin_lock(&mp->m_ail_lock); 1276 &dqp->q_logitem.qli_item.li_lsn);
1279 dqp->q_logitem.qli_flush_lsn = dqp->q_logitem.qli_item.li_lsn;
1280 spin_unlock(&mp->m_ail_lock);
1281 1277
1282 /* 1278 /*
1283 * Attach an iodone routine so that we can remove this dquot from the 1279 * Attach an iodone routine so that we can remove this dquot from the
@@ -1323,8 +1319,10 @@ xfs_qm_dqflush_done(
1323 xfs_dq_logitem_t *qip) 1319 xfs_dq_logitem_t *qip)
1324{ 1320{
1325 xfs_dquot_t *dqp; 1321 xfs_dquot_t *dqp;
1322 struct xfs_ail *ailp;
1326 1323
1327 dqp = qip->qli_dquot; 1324 dqp = qip->qli_dquot;
1325 ailp = qip->qli_item.li_ailp;
1328 1326
1329 /* 1327 /*
1330 * We only want to pull the item from the AIL if its 1328 * We only want to pull the item from the AIL if its
@@ -1337,15 +1335,12 @@ xfs_qm_dqflush_done(
1337 if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) && 1335 if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
1338 qip->qli_item.li_lsn == qip->qli_flush_lsn) { 1336 qip->qli_item.li_lsn == qip->qli_flush_lsn) {
1339 1337
1340 spin_lock(&dqp->q_mount->m_ail_lock); 1338 /* xfs_trans_ail_delete() drops the AIL lock. */
1341 /* 1339 spin_lock(&ailp->xa_lock);
1342 * xfs_trans_delete_ail() drops the AIL lock.
1343 */
1344 if (qip->qli_item.li_lsn == qip->qli_flush_lsn) 1340 if (qip->qli_item.li_lsn == qip->qli_flush_lsn)
1345 xfs_trans_delete_ail(dqp->q_mount, 1341 xfs_trans_ail_delete(ailp, (xfs_log_item_t*)qip);
1346 (xfs_log_item_t*)qip);
1347 else 1342 else
1348 spin_unlock(&dqp->q_mount->m_ail_lock); 1343 spin_unlock(&ailp->xa_lock);
1349 } 1344 }
1350 1345
1351 /* 1346 /*
@@ -1375,7 +1370,7 @@ xfs_dqunlock(
1375 mutex_unlock(&(dqp->q_qlock)); 1370 mutex_unlock(&(dqp->q_qlock));
1376 if (dqp->q_logitem.qli_dquot == dqp) { 1371 if (dqp->q_logitem.qli_dquot == dqp) {
1377 /* Once was dqp->q_mount, but might just have been cleared */ 1372 /* Once was dqp->q_mount, but might just have been cleared */
1378 xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_mountp, 1373 xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_ailp,
1379 (xfs_log_item_t*)&(dqp->q_logitem)); 1374 (xfs_log_item_t*)&(dqp->q_logitem));
1380 } 1375 }
1381} 1376}
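
The same conversion repeats throughout these quota hunks: the global mp->m_ail_lock is replaced by a per-AIL xa_lock reached through the log item's li_ailp back-pointer, and xfs_trans_delete_ail() becomes xfs_trans_ail_delete(). The resulting idiom, condensed (lip and flush_lsn are placeholder names):

	struct xfs_ail	*ailp = lip->li_ailp;

	spin_lock(&ailp->xa_lock);
	if (lip->li_lsn == flush_lsn)
		xfs_trans_ail_delete(ailp, lip);	/* drops xa_lock */
	else
		spin_unlock(&ailp->xa_lock);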
@@ -1489,7 +1484,7 @@ xfs_qm_dqpurge(
1489 "xfs_qm_dqpurge: dquot %p flush failed", dqp); 1484 "xfs_qm_dqpurge: dquot %p flush failed", dqp);
1490 xfs_dqflock(dqp); 1485 xfs_dqflock(dqp);
1491 } 1486 }
1492 ASSERT(dqp->q_pincount == 0); 1487 ASSERT(atomic_read(&dqp->q_pincount) == 0);
1493 ASSERT(XFS_FORCED_SHUTDOWN(mp) || 1488 ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
1494 !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); 1489 !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
1495 1490
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index 8958d0faf8d3..7e455337e2ba 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -83,8 +83,8 @@ typedef struct xfs_dquot {
83 xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */ 83 xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */
84 mutex_t q_qlock; /* quota lock */ 84 mutex_t q_qlock; /* quota lock */
85 struct completion q_flush; /* flush completion queue */ 85 struct completion q_flush; /* flush completion queue */
86 uint q_pincount; /* pin count for this dquot */ 86 atomic_t q_pincount; /* dquot pin count */
87 sv_t q_pinwait; /* sync var for pinning */ 87 wait_queue_head_t q_pinwait; /* dquot pinning wait queue */
88#ifdef XFS_DQUOT_TRACE 88#ifdef XFS_DQUOT_TRACE
89 struct ktrace *q_trace; /* trace header structure */ 89 struct ktrace *q_trace; /* trace header structure */
90#endif 90#endif
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index f028644caa5e..1728f6a7c4f5 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -88,25 +88,22 @@ xfs_qm_dquot_logitem_format(
88 88
89/* 89/*
90 * Increment the pin count of the given dquot. 90 * Increment the pin count of the given dquot.
91 * This value is protected by pinlock spinlock in the xQM structure.
92 */ 91 */
93STATIC void 92STATIC void
94xfs_qm_dquot_logitem_pin( 93xfs_qm_dquot_logitem_pin(
95 xfs_dq_logitem_t *logitem) 94 xfs_dq_logitem_t *logitem)
96{ 95{
97 xfs_dquot_t *dqp; 96 xfs_dquot_t *dqp = logitem->qli_dquot;
98 97
99 dqp = logitem->qli_dquot;
100 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 98 ASSERT(XFS_DQ_IS_LOCKED(dqp));
101 spin_lock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock)); 99 atomic_inc(&dqp->q_pincount);
102 dqp->q_pincount++;
103 spin_unlock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
104} 100}
105 101
106/* 102/*
107 * Decrement the pin count of the given dquot, and wake up 103 * Decrement the pin count of the given dquot, and wake up
108 * anyone in xfs_dqwait_unpin() if the count goes to 0. The 104 * anyone in xfs_dqwait_unpin() if the count goes to 0. The
109 * dquot must have been previously pinned with a call to xfs_dqpin(). 105 * dquot must have been previously pinned with a call to
106 * xfs_qm_dquot_logitem_pin().
110 */ 107 */
111/* ARGSUSED */ 108/* ARGSUSED */
112STATIC void 109STATIC void
@@ -114,16 +111,11 @@ xfs_qm_dquot_logitem_unpin(
114 xfs_dq_logitem_t *logitem, 111 xfs_dq_logitem_t *logitem,
115 int stale) 112 int stale)
116{ 113{
117 xfs_dquot_t *dqp; 114 xfs_dquot_t *dqp = logitem->qli_dquot;
118 115
119 dqp = logitem->qli_dquot; 116 ASSERT(atomic_read(&dqp->q_pincount) > 0);
120 ASSERT(dqp->q_pincount > 0); 117 if (atomic_dec_and_test(&dqp->q_pincount))
121 spin_lock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock)); 118 wake_up(&dqp->q_pinwait);
122 dqp->q_pincount--;
123 if (dqp->q_pincount == 0) {
124 sv_broadcast(&dqp->q_pinwait);
125 }
126 spin_unlock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
127} 119}
128 120
129/* ARGSUSED */ 121/* ARGSUSED */
@@ -193,21 +185,14 @@ xfs_qm_dqunpin_wait(
193 xfs_dquot_t *dqp) 185 xfs_dquot_t *dqp)
194{ 186{
195 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 187 ASSERT(XFS_DQ_IS_LOCKED(dqp));
196 if (dqp->q_pincount == 0) { 188 if (atomic_read(&dqp->q_pincount) == 0)
197 return; 189 return;
198 }
199 190
200 /* 191 /*
201 * Give the log a push so we don't wait here too long. 192 * Give the log a push so we don't wait here too long.
202 */ 193 */
203 xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE); 194 xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE);
204 spin_lock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock)); 195 wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
205 if (dqp->q_pincount == 0) {
206 spin_unlock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
207 return;
208 }
209 sv_wait(&(dqp->q_pinwait), PINOD,
210 &(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s);
211} 196}
212 197
213/* 198/*
@@ -310,7 +295,7 @@ xfs_qm_dquot_logitem_trylock(
310 uint retval; 295 uint retval;
311 296
312 dqp = qip->qli_dquot; 297 dqp = qip->qli_dquot;
313 if (dqp->q_pincount > 0) 298 if (atomic_read(&dqp->q_pincount) > 0)
314 return (XFS_ITEM_PINNED); 299 return (XFS_ITEM_PINNED);
315 300
316 if (! xfs_qm_dqlock_nowait(dqp)) 301 if (! xfs_qm_dqlock_nowait(dqp))
@@ -568,14 +553,16 @@ xfs_qm_qoffend_logitem_committed(
568 xfs_lsn_t lsn) 553 xfs_lsn_t lsn)
569{ 554{
570 xfs_qoff_logitem_t *qfs; 555 xfs_qoff_logitem_t *qfs;
556 struct xfs_ail *ailp;
571 557
572 qfs = qfe->qql_start_lip; 558 qfs = qfe->qql_start_lip;
573 spin_lock(&qfs->qql_item.li_mountp->m_ail_lock); 559 ailp = qfs->qql_item.li_ailp;
560 spin_lock(&ailp->xa_lock);
574 /* 561 /*
575 * Delete the qoff-start logitem from the AIL. 562 * Delete the qoff-start logitem from the AIL.
576 * xfs_trans_delete_ail() drops the AIL lock. 563 * xfs_trans_ail_delete() drops the AIL lock.
577 */ 564 */
578 xfs_trans_delete_ail(qfs->qql_item.li_mountp, (xfs_log_item_t *)qfs); 565 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs);
579 kmem_free(qfs); 566 kmem_free(qfs);
580 kmem_free(qfe); 567 kmem_free(qfe);
581 return (xfs_lsn_t)-1; 568 return (xfs_lsn_t)-1;
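
The other thread running through the dquot hunks above is the pin-count rework: the uint counter guarded by qi_pinlock plus an sv_t becomes a lock-free atomic_t paired with a plain wait queue. The whole protocol, using the fields declared in xfs_dquot.h above:

	atomic_inc(&dqp->q_pincount);			/* pin */

	if (atomic_dec_and_test(&dqp->q_pincount))	/* unpin */
		wake_up(&dqp->q_pinwait);

	wait_event(dqp->q_pinwait,			/* wait for unpin */
			atomic_read(&dqp->q_pincount) == 0);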
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index df0ffef9775a..6b13960cf318 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -20,7 +20,6 @@
20#include "xfs_bit.h" 20#include "xfs_bit.h"
21#include "xfs_log.h" 21#include "xfs_log.h"
22#include "xfs_inum.h" 22#include "xfs_inum.h"
23#include "xfs_clnt.h"
24#include "xfs_trans.h" 23#include "xfs_trans.h"
25#include "xfs_sb.h" 24#include "xfs_sb.h"
26#include "xfs_ag.h" 25#include "xfs_ag.h"
@@ -396,13 +395,10 @@ xfs_qm_mount_quotas(
396/* 395/*
397 * Called from the vfsops layer. 396 * Called from the vfsops layer.
398 */ 397 */
399int 398void
400xfs_qm_unmount_quotas( 399xfs_qm_unmount_quotas(
401 xfs_mount_t *mp) 400 xfs_mount_t *mp)
402{ 401{
403 xfs_inode_t *uqp, *gqp;
404 int error = 0;
405
406 /* 402 /*
407 * Release the dquots that root inode, et al might be holding, 403 * Release the dquots that root inode, et al might be holding,
408 * before we flush quotas and blow away the quotainfo structure. 404 * before we flush quotas and blow away the quotainfo structure.
@@ -415,43 +411,18 @@ xfs_qm_unmount_quotas(
415 xfs_qm_dqdetach(mp->m_rsumip); 411 xfs_qm_dqdetach(mp->m_rsumip);
416 412
417 /* 413 /*
418 * Flush out the quota inodes. 414 * Release the quota inodes.
419 */ 415 */
420 uqp = gqp = NULL;
421 if (mp->m_quotainfo) { 416 if (mp->m_quotainfo) {
422 if ((uqp = mp->m_quotainfo->qi_uquotaip) != NULL) { 417 if (mp->m_quotainfo->qi_uquotaip) {
423 xfs_ilock(uqp, XFS_ILOCK_EXCL); 418 IRELE(mp->m_quotainfo->qi_uquotaip);
424 xfs_iflock(uqp); 419 mp->m_quotainfo->qi_uquotaip = NULL;
425 error = xfs_iflush(uqp, XFS_IFLUSH_SYNC);
426 xfs_iunlock(uqp, XFS_ILOCK_EXCL);
427 if (unlikely(error == EFSCORRUPTED)) {
428 XFS_ERROR_REPORT("xfs_qm_unmount_quotas(1)",
429 XFS_ERRLEVEL_LOW, mp);
430 goto out;
431 }
432 } 420 }
433 if ((gqp = mp->m_quotainfo->qi_gquotaip) != NULL) { 421 if (mp->m_quotainfo->qi_gquotaip) {
434 xfs_ilock(gqp, XFS_ILOCK_EXCL); 422 IRELE(mp->m_quotainfo->qi_gquotaip);
435 xfs_iflock(gqp); 423 mp->m_quotainfo->qi_gquotaip = NULL;
436 error = xfs_iflush(gqp, XFS_IFLUSH_SYNC);
437 xfs_iunlock(gqp, XFS_ILOCK_EXCL);
438 if (unlikely(error == EFSCORRUPTED)) {
439 XFS_ERROR_REPORT("xfs_qm_unmount_quotas(2)",
440 XFS_ERRLEVEL_LOW, mp);
441 goto out;
442 }
443 } 424 }
444 } 425 }
445 if (uqp) {
446 IRELE(uqp);
447 mp->m_quotainfo->qi_uquotaip = NULL;
448 }
449 if (gqp) {
450 IRELE(gqp);
451 mp->m_quotainfo->qi_gquotaip = NULL;
452 }
453out:
454 return XFS_ERROR(error);
455} 426}
456 427
457/* 428/*
@@ -987,14 +958,10 @@ xfs_qm_dqdetach(
987} 958}
988 959
989/* 960/*
990 * This is called by VFS_SYNC and flags arg determines the caller, 961 * This is called to sync quotas. We can be told to use non-blocking
991 * and its motives, as done in xfs_sync. 962 * semantics by either the SYNC_BDFLUSH flag or the absence of the
992 * 963 * SYNC_WAIT flag.
993 * vfs_sync: SYNC_FSDATA|SYNC_ATTR|SYNC_BDFLUSH 0x31
994 * syscall sync: SYNC_FSDATA|SYNC_ATTR|SYNC_DELWRI 0x25
995 * umountroot : SYNC_WAIT | SYNC_CLOSE | SYNC_ATTR | SYNC_FSDATA
996 */ 964 */
997
998int 965int
999xfs_qm_sync( 966xfs_qm_sync(
1000 xfs_mount_t *mp, 967 xfs_mount_t *mp,
@@ -1137,7 +1104,6 @@ xfs_qm_init_quotainfo(
1137 return error; 1104 return error;
1138 } 1105 }
1139 1106
1140 spin_lock_init(&qinf->qi_pinlock);
1141 xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0); 1107 xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0);
1142 qinf->qi_dqreclaims = 0; 1108 qinf->qi_dqreclaims = 0;
1143 1109
@@ -1234,7 +1200,6 @@ xfs_qm_destroy_quotainfo(
1234 */ 1200 */
1235 xfs_qm_rele_quotafs_ref(mp); 1201 xfs_qm_rele_quotafs_ref(mp);
1236 1202
1237 spinlock_destroy(&qi->qi_pinlock);
1238 xfs_qm_list_destroy(&qi->qi_dqlist); 1203 xfs_qm_list_destroy(&qi->qi_dqlist);
1239 1204
1240 if (qi->qi_uquotaip) { 1205 if (qi->qi_uquotaip) {
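qi_pinlock can go away because, in the same series, dquot pinning becomes a per-dquot atomic counter rather than a counter guarded by a filesystem-wide spinlock. A hedged sketch of the lock-free pinning this enables; the q_pincount and q_pinwait field names are assumptions, not shown in this hunk:

	/* sketch: pin and unpin a dquot without any global lock */
	atomic_inc(&dqp->q_pincount);			/* pin */
	if (atomic_dec_and_test(&dqp->q_pincount))	/* unpin */
		wake_up(&dqp->q_pinwait);		/* assumed waitq */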
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index 44f25349e478..ddf09166387c 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -106,7 +106,6 @@ typedef struct xfs_qm {
106typedef struct xfs_quotainfo { 106typedef struct xfs_quotainfo {
107 xfs_inode_t *qi_uquotaip; /* user quota inode */ 107 xfs_inode_t *qi_uquotaip; /* user quota inode */
108 xfs_inode_t *qi_gquotaip; /* group quota inode */ 108 xfs_inode_t *qi_gquotaip; /* group quota inode */
109 spinlock_t qi_pinlock; /* dquot pinning lock */
110 xfs_dqlist_t qi_dqlist; /* all dquots in filesys */ 109 xfs_dqlist_t qi_dqlist; /* all dquots in filesys */
111 int qi_dqreclaims; /* a change here indicates 110 int qi_dqreclaims; /* a change here indicates
112 a removal in the dqlist */ 111 a removal in the dqlist */
@@ -168,7 +167,7 @@ extern void xfs_qm_destroy_quotainfo(xfs_mount_t *);
168extern void xfs_qm_mount_quotas(xfs_mount_t *); 167extern void xfs_qm_mount_quotas(xfs_mount_t *);
169extern int xfs_qm_quotacheck(xfs_mount_t *); 168extern int xfs_qm_quotacheck(xfs_mount_t *);
170extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *); 169extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *);
171extern int xfs_qm_unmount_quotas(xfs_mount_t *); 170extern void xfs_qm_unmount_quotas(xfs_mount_t *);
172extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t); 171extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
173extern int xfs_qm_sync(xfs_mount_t *, int); 172extern int xfs_qm_sync(xfs_mount_t *, int);
174 173
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index eea2e60b456b..bc6c5cca3e12 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -20,7 +20,6 @@
20#include "xfs_bit.h" 20#include "xfs_bit.h"
21#include "xfs_log.h" 21#include "xfs_log.h"
22#include "xfs_inum.h" 22#include "xfs_inum.h"
23#include "xfs_clnt.h"
24#include "xfs_trans.h" 23#include "xfs_trans.h"
25#include "xfs_sb.h" 24#include "xfs_sb.h"
26#include "xfs_ag.h" 25#include "xfs_ag.h"
@@ -51,7 +50,7 @@
51 50
52STATIC void 51STATIC void
53xfs_fill_statvfs_from_dquot( 52xfs_fill_statvfs_from_dquot(
54 bhv_statvfs_t *statp, 53 struct kstatfs *statp,
55 xfs_disk_dquot_t *dp) 54 xfs_disk_dquot_t *dp)
56{ 55{
57 __uint64_t limit; 56 __uint64_t limit;
@@ -88,7 +87,7 @@ xfs_fill_statvfs_from_dquot(
88STATIC void 87STATIC void
89xfs_qm_statvfs( 88xfs_qm_statvfs(
90 xfs_inode_t *ip, 89 xfs_inode_t *ip,
91 bhv_statvfs_t *statp) 90 struct kstatfs *statp)
92{ 91{
93 xfs_mount_t *mp = ip->i_mount; 92 xfs_mount_t *mp = ip->i_mount;
94 xfs_dquot_t *dqp; 93 xfs_dquot_t *dqp;
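With bhv_statvfs_t gone, the quota statfs path fills the generic struct kstatfs directly. A hedged sketch of the clamping xfs_fill_statvfs_from_dquot() performs, simplified to ignore the unit conversion the real function also does:

	/* sketch: report the quota limit as the filesystem size when set */
	limit = dp->d_blk_softlimit ?
		be64_to_cpu(dp->d_blk_softlimit) :
		be64_to_cpu(dp->d_blk_hardlimit);
	if (limit && statp->f_blocks > limit) {
		statp->f_blocks = limit;
		statp->f_bfree =
			(statp->f_blocks > be64_to_cpu(dp->d_bcount)) ?
			(statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0;
	}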
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 1a3b803dfa55..68139b38aede 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -127,7 +127,7 @@ xfs_qm_quotactl(
127 break; 127 break;
128 128
129 case Q_XQUOTASYNC: 129 case Q_XQUOTASYNC:
130 return (xfs_sync_inodes(mp, SYNC_DELWRI, NULL)); 130 return xfs_sync_inodes(mp, SYNC_DELWRI);
131 131
132 default: 132 default:
133 break; 133 break;
@@ -1022,101 +1022,104 @@ xfs_qm_export_flags(
1022 1022
1023 1023
1024/* 1024/*
1025 * Go thru all the inodes in the file system, releasing their dquots. 1025 * Release all the dquots on the inodes in an AG.
1026 * Note that the mount structure gets modified to indicate that quotas are off
1027 * AFTER this, in the case of quotaoff. This also gets called from
1028 * xfs_rootumount.
1029 */ 1026 */
1030void 1027STATIC void
1031xfs_qm_dqrele_all_inodes( 1028xfs_qm_dqrele_inodes_ag(
1032 struct xfs_mount *mp, 1029 xfs_mount_t *mp,
1033 uint flags) 1030 int ag,
1031 uint flags)
1034{ 1032{
1035 xfs_inode_t *ip, *topino; 1033 xfs_inode_t *ip = NULL;
1036 uint ireclaims; 1034 xfs_perag_t *pag = &mp->m_perag[ag];
1037 struct inode *vp; 1035 int first_index = 0;
1038 boolean_t vnode_refd; 1036 int nr_found;
1039 1037
1040 ASSERT(mp->m_quotainfo);
1041
1042 XFS_MOUNT_ILOCK(mp);
1043again:
1044 ip = mp->m_inodes;
1045 if (ip == NULL) {
1046 XFS_MOUNT_IUNLOCK(mp);
1047 return;
1048 }
1049 do { 1038 do {
1050 /* Skip markers inserted by xfs_sync */ 1039 /*
1051 if (ip->i_mount == NULL) { 1040 * use a gang lookup to find the next inode in the tree
1052 ip = ip->i_mnext; 1041 * as the tree is sparse and a gang lookup walks to find
1053 continue; 1042 * the number of objects requested.
1043 */
1044 read_lock(&pag->pag_ici_lock);
1045 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
1046 (void**)&ip, first_index, 1);
1047
1048 if (!nr_found) {
1049 read_unlock(&pag->pag_ici_lock);
1050 break;
1054 } 1051 }
1055 /* Root inode, rbmip and rsumip have associated blocks */ 1052
1053 /*
1054 * Update the index for the next lookup. Catch overflows
1055 * into the next AG range which can occur if we have inodes
1056 * in the last block of the AG and we are currently
1057 * pointing to the last inode.
1058 */
1059 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
1060 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
1061 read_unlock(&pag->pag_ici_lock);
1062 break;
1063 }
1064
1065 /* skip quota inodes */
1056 if (ip == XFS_QI_UQIP(mp) || ip == XFS_QI_GQIP(mp)) { 1066 if (ip == XFS_QI_UQIP(mp) || ip == XFS_QI_GQIP(mp)) {
1057 ASSERT(ip->i_udquot == NULL); 1067 ASSERT(ip->i_udquot == NULL);
1058 ASSERT(ip->i_gdquot == NULL); 1068 ASSERT(ip->i_gdquot == NULL);
1059 ip = ip->i_mnext; 1069 read_unlock(&pag->pag_ici_lock);
1060 continue; 1070 continue;
1061 } 1071 }
1062 vp = VFS_I(ip); 1072
1063 if (!vp) { 1073 /*
1064 ASSERT(ip->i_udquot == NULL); 1074 * If we can't get a reference on the inode, it must be
1065 ASSERT(ip->i_gdquot == NULL); 1075 * in reclaim. Leave it for the reclaim code to flush.
1066 ip = ip->i_mnext; 1076 */
1077 if (!igrab(VFS_I(ip))) {
1078 read_unlock(&pag->pag_ici_lock);
1067 continue; 1079 continue;
1068 } 1080 }
1069 vnode_refd = B_FALSE; 1081 read_unlock(&pag->pag_ici_lock);
1070 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) { 1082
1071 ireclaims = mp->m_ireclaims; 1083 /* avoid new inodes though we shouldn't find any here */
1072 topino = mp->m_inodes; 1084 if (xfs_iflags_test(ip, XFS_INEW)) {
1073 vp = vn_grab(vp); 1085 IRELE(ip);
1074 if (!vp) 1086 continue;
1075 goto again;
1076
1077 XFS_MOUNT_IUNLOCK(mp);
1078 /* XXX restart limit ? */
1079 xfs_ilock(ip, XFS_ILOCK_EXCL);
1080 vnode_refd = B_TRUE;
1081 } else {
1082 ireclaims = mp->m_ireclaims;
1083 topino = mp->m_inodes;
1084 XFS_MOUNT_IUNLOCK(mp);
1085 } 1087 }
1086 1088
1087 /* 1089 xfs_ilock(ip, XFS_ILOCK_EXCL);
1088 * We don't keep the mountlock across the dqrele() call,
1089 * since it can take a while..
1090 */
1091 if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { 1090 if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
1092 xfs_qm_dqrele(ip->i_udquot); 1091 xfs_qm_dqrele(ip->i_udquot);
1093 ip->i_udquot = NULL; 1092 ip->i_udquot = NULL;
1094 } 1093 }
1095 if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && ip->i_gdquot) { 1094 if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) &&
1095 ip->i_gdquot) {
1096 xfs_qm_dqrele(ip->i_gdquot); 1096 xfs_qm_dqrele(ip->i_gdquot);
1097 ip->i_gdquot = NULL; 1097 ip->i_gdquot = NULL;
1098 } 1098 }
1099 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1099 xfs_iput(ip, XFS_ILOCK_EXCL);
1100 /* 1100
1101 * Wait until we've dropped the ilock and mountlock to 1101 } while (nr_found);
1102 * do the vn_rele. Or be condemned to an eternity in the 1102}
1103 * inactive code in hell. 1103
1104 */ 1104/*
1105 if (vnode_refd) 1105 * Go thru all the inodes in the file system, releasing their dquots.
1106 IRELE(ip); 1106 * Note that the mount structure gets modified to indicate that quotas are off
1107 XFS_MOUNT_ILOCK(mp); 1107 * AFTER this, in the case of quotaoff. This also gets called from
1108 /* 1108 * xfs_rootumount.
1109 * If an inode was inserted or removed, we gotta 1109 */
1110 * start over again. 1110void
1111 */ 1111xfs_qm_dqrele_all_inodes(
1112 if (topino != mp->m_inodes || mp->m_ireclaims != ireclaims) { 1112 struct xfs_mount *mp,
1113 /* XXX use a sentinel */ 1113 uint flags)
1114 goto again; 1114{
1115 } 1115 int i;
1116 ip = ip->i_mnext;
1117 } while (ip != mp->m_inodes);
1118 1116
1119 XFS_MOUNT_IUNLOCK(mp); 1117 ASSERT(mp->m_quotainfo);
1118 for (i = 0; i < mp->m_sb.sb_agcount; i++) {
1119 if (!mp->m_perag[i].pag_ici_init)
1120 continue;
1121 xfs_qm_dqrele_inodes_ag(mp, i, flags);
1122 }
1120} 1123}
1121 1124
1122/*------------------------------------------------------------------------*/ 1125/*------------------------------------------------------------------------*/
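The rewrite above replaces the global m_inodes list walk with a per-AG radix-tree walk. Distilled as a hedged sketch (quota-specific handling trimmed), the iteration pattern is:

	/* sketch: visit every cached inode in one AG via gang lookup */
	xfs_inode_t	*ip;
	int		first_index = 0;
	int		nr_found;

	do {
		read_lock(&pag->pag_ici_lock);
		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
				(void **)&ip, first_index, 1);
		if (!nr_found) {
			read_unlock(&pag->pag_ici_lock);
			break;			/* AG exhausted */
		}
		/* advance past this inode; stop if agino wraps to next AG */
		first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
		if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
			read_unlock(&pag->pag_ici_lock);
			break;
		}
		if (!igrab(VFS_I(ip))) {	/* inode is being reclaimed */
			read_unlock(&pag->pag_ici_lock);
			continue;
		}
		read_unlock(&pag->pag_ici_lock);
		/* ... work on ip, then IRELE(ip) ... */
	} while (nr_found);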
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index c27abef7b84f..ae5482965424 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -18,6 +18,13 @@
18#include <xfs.h> 18#include <xfs.h>
19#include "debug.h" 19#include "debug.h"
20 20
21/* xfs_mount.h drags a lot of crap in, sorry.. */
22#include "xfs_sb.h"
23#include "xfs_inum.h"
24#include "xfs_ag.h"
25#include "xfs_dmapi.h"
26#include "xfs_mount.h"
27
21static char message[1024]; /* keep it off the stack */ 28static char message[1024]; /* keep it off the stack */
22static DEFINE_SPINLOCK(xfs_err_lock); 29static DEFINE_SPINLOCK(xfs_err_lock);
23 30
@@ -55,22 +62,42 @@ cmn_err(register int level, char *fmt, ...)
55} 62}
56 63
57void 64void
58icmn_err(register int level, char *fmt, va_list ap) 65xfs_fs_vcmn_err(
66 int level,
67 struct xfs_mount *mp,
68 char *fmt,
69 va_list ap)
59{ 70{
60 ulong flags; 71 unsigned long flags;
61 int len; 72 int len = 0;
62 73
63 level &= XFS_ERR_MASK; 74 level &= XFS_ERR_MASK;
64 if(level > XFS_MAX_ERR_LEVEL) 75 if (level > XFS_MAX_ERR_LEVEL)
65 level = XFS_MAX_ERR_LEVEL; 76 level = XFS_MAX_ERR_LEVEL;
77
66 spin_lock_irqsave(&xfs_err_lock,flags); 78 spin_lock_irqsave(&xfs_err_lock,flags);
67 len = vsnprintf(message, sizeof(message), fmt, ap); 79
80 if (mp) {
81 len = sprintf(message, "Filesystem \"%s\": ", mp->m_fsname);
82
83 /*
84 * Skip the printk if we can't print anything useful
85 * due to an over-long device name.
86 */
87 if (len >= sizeof(message))
88 goto out;
89 }
90
91 len = vsnprintf(message + len, sizeof(message) - len, fmt, ap);
68 if (len >= sizeof(message)) 92 if (len >= sizeof(message))
69 len = sizeof(message) - 1; 93 len = sizeof(message) - 1;
70 if (message[len-1] == '\n') 94 if (message[len-1] == '\n')
71 message[len-1] = 0; 95 message[len-1] = 0;
96
72 printk("%s%s\n", err_level[level], message); 97 printk("%s%s\n", err_level[level], message);
98 out:
73 spin_unlock_irqrestore(&xfs_err_lock,flags); 99 spin_unlock_irqrestore(&xfs_err_lock,flags);
100
74 BUG_ON(level == CE_PANIC); 101 BUG_ON(level == CE_PANIC);
75} 102}
76 103
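The prefix-then-format pattern introduced above is worth isolating. A hedged, userspace-compilable sketch of the same logic; note it uses snprintf() where the kernel hunk uses sprintf() plus an after-the-fact length check:

	#include <stdarg.h>
	#include <stdio.h>

	static char message[1024];

	static void vlog_prefixed(const char *fsname, const char *fmt,
				  va_list ap)
	{
		int len = 0;

		if (fsname) {
			len = snprintf(message, sizeof(message),
				       "Filesystem \"%s\": ", fsname);
			/* skip entirely if the prefix alone overflows */
			if (len < 0 || len >= (int)sizeof(message))
				return;
		}
		len += vsnprintf(message + len, sizeof(message) - len,
				 fmt, ap);
		if (len >= (int)sizeof(message))
			len = sizeof(message) - 1;
		if (len > 0 && message[len - 1] == '\n')
			message[len - 1] = '\0';
		puts(message);		/* printk() in the kernel */
	}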
@@ -84,5 +111,5 @@ assfail(char *expr, char *file, int line)
84void 111void
85xfs_hex_dump(void *p, int length) 112xfs_hex_dump(void *p, int length)
86{ 113{
87 print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_OFFSET, 16, 1, p, length, 1); 114 print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
88} 115}
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
index 75845f950814..6f4fd37c67af 100644
--- a/fs/xfs/support/debug.h
+++ b/fs/xfs/support/debug.h
@@ -27,8 +27,6 @@
27#define CE_ALERT 1 /* alert */ 27#define CE_ALERT 1 /* alert */
28#define CE_PANIC 0 /* panic */ 28#define CE_PANIC 0 /* panic */
29 29
30extern void icmn_err(int, char *, va_list)
31 __attribute__ ((format (printf, 2, 0)));
32extern void cmn_err(int, char *, ...) 30extern void cmn_err(int, char *, ...)
33 __attribute__ ((format (printf, 2, 3))); 31 __attribute__ ((format (printf, 2, 3)));
34extern void assfail(char *expr, char *f, int l); 32extern void assfail(char *expr, char *f, int l);
diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c
index a34ef05489b1..2d494c26717f 100644
--- a/fs/xfs/support/ktrace.c
+++ b/fs/xfs/support/ktrace.c
@@ -113,21 +113,16 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep)
113void 113void
114ktrace_free(ktrace_t *ktp) 114ktrace_free(ktrace_t *ktp)
115{ 115{
116 int entries_size;
117
118 if (ktp == (ktrace_t *)NULL) 116 if (ktp == (ktrace_t *)NULL)
119 return; 117 return;
120 118
121 /* 119 /*
122 * Special treatment for the Vnode trace buffer. 120 * Special treatment for the Vnode trace buffer.
123 */ 121 */
124 if (ktp->kt_nentries == ktrace_zentries) { 122 if (ktp->kt_nentries == ktrace_zentries)
125 kmem_zone_free(ktrace_ent_zone, ktp->kt_entries); 123 kmem_zone_free(ktrace_ent_zone, ktp->kt_entries);
126 } else { 124 else
127 entries_size = (int)(ktp->kt_nentries * sizeof(ktrace_entry_t));
128
129 kmem_free(ktp->kt_entries); 125 kmem_free(ktp->kt_entries);
130 }
131 126
132 kmem_zone_free(ktrace_hdr_zone, ktp); 127 kmem_zone_free(ktrace_hdr_zone, ktp);
133} 128}
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index 540e4c989825..17254b529c54 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -30,7 +30,7 @@
30#define XFS_ATTR_TRACE 1 30#define XFS_ATTR_TRACE 1
31#define XFS_BLI_TRACE 1 31#define XFS_BLI_TRACE 1
32#define XFS_BMAP_TRACE 1 32#define XFS_BMAP_TRACE 1
33#define XFS_BMBT_TRACE 1 33#define XFS_BTREE_TRACE 1
34#define XFS_DIR2_TRACE 1 34#define XFS_DIR2_TRACE 1
35#define XFS_DQUOT_TRACE 1 35#define XFS_DQUOT_TRACE 1
36#define XFS_ILOCK_TRACE 1 36#define XFS_ILOCK_TRACE 1
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 91d69338d3b2..a8cdd73999a4 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -758,7 +758,7 @@ xfs_acl_setmode(
758 if (gap && nomask) 758 if (gap && nomask)
759 iattr.ia_mode |= gap->ae_perm << 3; 759 iattr.ia_mode |= gap->ae_perm << 3;
760 760
761 return xfs_setattr(XFS_I(vp), &iattr, 0, sys_cred); 761 return xfs_setattr(XFS_I(vp), &iattr, 0);
762} 762}
763 763
764/* 764/*
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 61b292a9fb41..f2e21817a226 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -91,6 +91,8 @@ typedef struct xfs_agf {
91#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) 91#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
92#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp)) 92#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp))
93 93
94extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
95 xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
94 96
95/* 97/*
96 * Size of the unlinked inode hash table in the agi. 98 * Size of the unlinked inode hash table in the agi.
@@ -142,6 +144,9 @@ typedef struct xfs_agi {
142#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) 144#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
143#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)XFS_BUF_PTR(bp)) 145#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)XFS_BUF_PTR(bp))
144 146
147extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
148 xfs_agnumber_t agno, struct xfs_buf **bpp);
149
145/* 150/*
146 * The third a.g. block contains the a.g. freelist, an array 151 * The third a.g. block contains the a.g. freelist, an array
147 * of block pointers to blocks owned by the allocation btree code. 152 * of block pointers to blocks owned by the allocation btree code.
@@ -192,17 +197,23 @@ typedef struct xfs_perag
192 xfs_agino_t pagi_freecount; /* number of free inodes */ 197 xfs_agino_t pagi_freecount; /* number of free inodes */
193 xfs_agino_t pagi_count; /* number of allocated inodes */ 198 xfs_agino_t pagi_count; /* number of allocated inodes */
194 int pagb_count; /* pagb slots in use */ 199 int pagb_count; /* pagb slots in use */
200 xfs_perag_busy_t *pagb_list; /* unstable blocks */
195#ifdef __KERNEL__ 201#ifdef __KERNEL__
196 spinlock_t pagb_lock; /* lock for pagb_list */ 202 spinlock_t pagb_lock; /* lock for pagb_list */
197#endif 203
198 xfs_perag_busy_t *pagb_list; /* unstable blocks */
199 atomic_t pagf_fstrms; /* # of filestreams active in this AG */ 204 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
200 205
201 int pag_ici_init; /* incore inode cache initialised */ 206 int pag_ici_init; /* incore inode cache initialised */
202 rwlock_t pag_ici_lock; /* incore inode lock */ 207 rwlock_t pag_ici_lock; /* incore inode lock */
203 struct radix_tree_root pag_ici_root; /* incore inode cache root */ 208 struct radix_tree_root pag_ici_root; /* incore inode cache root */
209#endif
204} xfs_perag_t; 210} xfs_perag_t;
205 211
212/*
213 * tags for inode radix tree
214 */
215#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */
216
206#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) 217#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
207#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ 218#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \
208 (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp))) 219 (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp)))
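XFS_ICI_RECLAIM_TAG is the first radix-tree tag defined for the per-AG inode cache. A hedged sketch of how reclaim code would use it, based only on the pag fields declared above (the exact call site is an assumption):

	/* sketch: mark an inode reclaimable in its AG's inode cache */
	write_lock(&pag->pag_ici_lock);
	radix_tree_tag_set(&pag->pag_ici_root,
			   XFS_INO_TO_AGINO(mp, ip->i_ino),
			   XFS_ICI_RECLAIM_TAG);
	write_unlock(&pag->pag_ici_lock);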
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 1956f83489f1..028e44e58ea9 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -90,6 +90,92 @@ STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
90 */ 90 */
91 91
92/* 92/*
93 * Lookup the record equal to [bno, len] in the btree given by cur.
94 */
95STATIC int /* error */
96xfs_alloc_lookup_eq(
97 struct xfs_btree_cur *cur, /* btree cursor */
98 xfs_agblock_t bno, /* starting block of extent */
99 xfs_extlen_t len, /* length of extent */
100 int *stat) /* success/failure */
101{
102 cur->bc_rec.a.ar_startblock = bno;
103 cur->bc_rec.a.ar_blockcount = len;
104 return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
105}
106
107/*
108 * Lookup the first record greater than or equal to [bno, len]
109 * in the btree given by cur.
110 */
111STATIC int /* error */
112xfs_alloc_lookup_ge(
113 struct xfs_btree_cur *cur, /* btree cursor */
114 xfs_agblock_t bno, /* starting block of extent */
115 xfs_extlen_t len, /* length of extent */
116 int *stat) /* success/failure */
117{
118 cur->bc_rec.a.ar_startblock = bno;
119 cur->bc_rec.a.ar_blockcount = len;
120 return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
121}
122
123/*
124 * Lookup the first record less than or equal to [bno, len]
125 * in the btree given by cur.
126 */
127STATIC int /* error */
128xfs_alloc_lookup_le(
129 struct xfs_btree_cur *cur, /* btree cursor */
130 xfs_agblock_t bno, /* starting block of extent */
131 xfs_extlen_t len, /* length of extent */
132 int *stat) /* success/failure */
133{
134 cur->bc_rec.a.ar_startblock = bno;
135 cur->bc_rec.a.ar_blockcount = len;
136 return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
137}
138
139/*
140 * Update the record referred to by cur to the value given
141 * by [bno, len].
142 * This either works (return 0) or gets an EFSCORRUPTED error.
143 */
144STATIC int /* error */
145xfs_alloc_update(
146 struct xfs_btree_cur *cur, /* btree cursor */
147 xfs_agblock_t bno, /* starting block of extent */
148 xfs_extlen_t len) /* length of extent */
149{
150 union xfs_btree_rec rec;
151
152 rec.alloc.ar_startblock = cpu_to_be32(bno);
153 rec.alloc.ar_blockcount = cpu_to_be32(len);
154 return xfs_btree_update(cur, &rec);
155}
156
157/*
158 * Get the data from the pointed-to record.
159 */
160STATIC int /* error */
161xfs_alloc_get_rec(
162 struct xfs_btree_cur *cur, /* btree cursor */
163 xfs_agblock_t *bno, /* output: starting block of extent */
164 xfs_extlen_t *len, /* output: length of extent */
165 int *stat) /* output: success/failure */
166{
167 union xfs_btree_rec *rec;
168 int error;
169
170 error = xfs_btree_get_rec(cur, &rec, stat);
171 if (!error && *stat == 1) {
172 *bno = be32_to_cpu(rec->alloc.ar_startblock);
173 *len = be32_to_cpu(rec->alloc.ar_blockcount);
174 }
175 return error;
176}
177
178/*
93 * Compute aligned version of the found extent. 179 * Compute aligned version of the found extent.
94 * Takes alignment and min length into account. 180 * Takes alignment and min length into account.
95 */ 181 */
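The new wrappers keep the union xfs_btree_rec plumbing out of the allocator proper. A hedged usage sketch, modelled on the by-size search they support:

	/* sketch: find the first free extent of at least maxlen blocks */
	if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen, &i)))
		return error;
	if (i) {
		if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i)))
			return error;
		/* fbno/flen now describe the candidate extent */
	}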
@@ -294,21 +380,20 @@ xfs_alloc_fixup_trees(
294 return error; 380 return error;
295 XFS_WANT_CORRUPTED_RETURN(i == 1); 381 XFS_WANT_CORRUPTED_RETURN(i == 1);
296 } 382 }
383
297#ifdef DEBUG 384#ifdef DEBUG
298 { 385 if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
299 xfs_alloc_block_t *bnoblock; 386 struct xfs_btree_block *bnoblock;
300 xfs_alloc_block_t *cntblock; 387 struct xfs_btree_block *cntblock;
301 388
302 if (bno_cur->bc_nlevels == 1 && 389 bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
303 cnt_cur->bc_nlevels == 1) { 390 cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
304 bnoblock = XFS_BUF_TO_ALLOC_BLOCK(bno_cur->bc_bufs[0]); 391
305 cntblock = XFS_BUF_TO_ALLOC_BLOCK(cnt_cur->bc_bufs[0]); 392 XFS_WANT_CORRUPTED_RETURN(
306 XFS_WANT_CORRUPTED_RETURN( 393 bnoblock->bb_numrecs == cntblock->bb_numrecs);
307 be16_to_cpu(bnoblock->bb_numrecs) ==
308 be16_to_cpu(cntblock->bb_numrecs));
309 }
310 } 394 }
311#endif 395#endif
396
312 /* 397 /*
313 * Deal with all four cases: the allocated record is contained 398 * Deal with all four cases: the allocated record is contained
314 * within the freespace record, so we can have new freespace 399 * within the freespace record, so we can have new freespace
@@ -333,7 +418,7 @@ xfs_alloc_fixup_trees(
333 /* 418 /*
334 * Delete the entry from the by-size btree. 419 * Delete the entry from the by-size btree.
335 */ 420 */
336 if ((error = xfs_alloc_delete(cnt_cur, &i))) 421 if ((error = xfs_btree_delete(cnt_cur, &i)))
337 return error; 422 return error;
338 XFS_WANT_CORRUPTED_RETURN(i == 1); 423 XFS_WANT_CORRUPTED_RETURN(i == 1);
339 /* 424 /*
@@ -343,7 +428,7 @@ xfs_alloc_fixup_trees(
343 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i))) 428 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
344 return error; 429 return error;
345 XFS_WANT_CORRUPTED_RETURN(i == 0); 430 XFS_WANT_CORRUPTED_RETURN(i == 0);
346 if ((error = xfs_alloc_insert(cnt_cur, &i))) 431 if ((error = xfs_btree_insert(cnt_cur, &i)))
347 return error; 432 return error;
348 XFS_WANT_CORRUPTED_RETURN(i == 1); 433 XFS_WANT_CORRUPTED_RETURN(i == 1);
349 } 434 }
@@ -351,7 +436,7 @@ xfs_alloc_fixup_trees(
351 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i))) 436 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
352 return error; 437 return error;
353 XFS_WANT_CORRUPTED_RETURN(i == 0); 438 XFS_WANT_CORRUPTED_RETURN(i == 0);
354 if ((error = xfs_alloc_insert(cnt_cur, &i))) 439 if ((error = xfs_btree_insert(cnt_cur, &i)))
355 return error; 440 return error;
356 XFS_WANT_CORRUPTED_RETURN(i == 1); 441 XFS_WANT_CORRUPTED_RETURN(i == 1);
357 } 442 }
@@ -362,7 +447,7 @@ xfs_alloc_fixup_trees(
362 /* 447 /*
363 * No remaining freespace, just delete the by-block tree entry. 448 * No remaining freespace, just delete the by-block tree entry.
364 */ 449 */
365 if ((error = xfs_alloc_delete(bno_cur, &i))) 450 if ((error = xfs_btree_delete(bno_cur, &i)))
366 return error; 451 return error;
367 XFS_WANT_CORRUPTED_RETURN(i == 1); 452 XFS_WANT_CORRUPTED_RETURN(i == 1);
368 } else { 453 } else {
@@ -379,7 +464,7 @@ xfs_alloc_fixup_trees(
379 if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i))) 464 if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
380 return error; 465 return error;
381 XFS_WANT_CORRUPTED_RETURN(i == 0); 466 XFS_WANT_CORRUPTED_RETURN(i == 0);
382 if ((error = xfs_alloc_insert(bno_cur, &i))) 467 if ((error = xfs_btree_insert(bno_cur, &i)))
383 return error; 468 return error;
384 XFS_WANT_CORRUPTED_RETURN(i == 1); 469 XFS_WANT_CORRUPTED_RETURN(i == 1);
385 } 470 }
@@ -640,8 +725,8 @@ xfs_alloc_ag_vextent_exact(
640 /* 725 /*
641 * Allocate/initialize a cursor for the by-number freespace btree. 726 * Allocate/initialize a cursor for the by-number freespace btree.
642 */ 727 */
643 bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, 728 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
644 args->agno, XFS_BTNUM_BNO, NULL, 0); 729 args->agno, XFS_BTNUM_BNO);
645 /* 730 /*
646 * Lookup bno and minlen in the btree (minlen is irrelevant, really). 731 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
647 * Look for the closest free block <= bno, it must contain bno 732 * Look for the closest free block <= bno, it must contain bno
@@ -696,8 +781,8 @@ xfs_alloc_ag_vextent_exact(
696 * We are allocating agbno for rlen [agbno .. end] 781 * We are allocating agbno for rlen [agbno .. end]
697 * Allocate/initialize a cursor for the by-size btree. 782 * Allocate/initialize a cursor for the by-size btree.
698 */ 783 */
699 cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, 784 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
700 args->agno, XFS_BTNUM_CNT, NULL, 0); 785 args->agno, XFS_BTNUM_CNT);
701 ASSERT(args->agbno + args->len <= 786 ASSERT(args->agbno + args->len <=
702 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 787 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
703 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, 788 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
@@ -759,8 +844,8 @@ xfs_alloc_ag_vextent_near(
759 /* 844 /*
760 * Get a cursor for the by-size btree. 845 * Get a cursor for the by-size btree.
761 */ 846 */
762 cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, 847 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
763 args->agno, XFS_BTNUM_CNT, NULL, 0); 848 args->agno, XFS_BTNUM_CNT);
764 ltlen = 0; 849 ltlen = 0;
765 bno_cur_lt = bno_cur_gt = NULL; 850 bno_cur_lt = bno_cur_gt = NULL;
766 /* 851 /*
@@ -818,7 +903,7 @@ xfs_alloc_ag_vextent_near(
818 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 903 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
819 if (ltlen >= args->minlen) 904 if (ltlen >= args->minlen)
820 break; 905 break;
821 if ((error = xfs_alloc_increment(cnt_cur, 0, &i))) 906 if ((error = xfs_btree_increment(cnt_cur, 0, &i)))
822 goto error0; 907 goto error0;
823 } while (i); 908 } while (i);
824 ASSERT(ltlen >= args->minlen); 909 ASSERT(ltlen >= args->minlen);
@@ -828,7 +913,7 @@ xfs_alloc_ag_vextent_near(
828 i = cnt_cur->bc_ptrs[0]; 913 i = cnt_cur->bc_ptrs[0];
829 for (j = 1, blen = 0, bdiff = 0; 914 for (j = 1, blen = 0, bdiff = 0;
830 !error && j && (blen < args->maxlen || bdiff > 0); 915 !error && j && (blen < args->maxlen || bdiff > 0);
831 error = xfs_alloc_increment(cnt_cur, 0, &j)) { 916 error = xfs_btree_increment(cnt_cur, 0, &j)) {
832 /* 917 /*
833 * For each entry, decide if it's better than 918 * For each entry, decide if it's better than
834 * the previous best entry. 919 * the previous best entry.
@@ -886,8 +971,8 @@ xfs_alloc_ag_vextent_near(
886 /* 971 /*
887 * Set up a cursor for the by-bno tree. 972 * Set up a cursor for the by-bno tree.
888 */ 973 */
889 bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp, 974 bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp,
890 args->agbp, args->agno, XFS_BTNUM_BNO, NULL, 0); 975 args->agbp, args->agno, XFS_BTNUM_BNO);
891 /* 976 /*
892 * Fix up the btree entries. 977 * Fix up the btree entries.
893 */ 978 */
@@ -914,8 +999,8 @@ xfs_alloc_ag_vextent_near(
914 /* 999 /*
915 * Allocate and initialize the cursor for the leftward search. 1000 * Allocate and initialize the cursor for the leftward search.
916 */ 1001 */
917 bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, 1002 bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
918 args->agno, XFS_BTNUM_BNO, NULL, 0); 1003 args->agno, XFS_BTNUM_BNO);
919 /* 1004 /*
920 * Lookup <= bno to find the leftward search's starting point. 1005 * Lookup <= bno to find the leftward search's starting point.
921 */ 1006 */
@@ -938,7 +1023,7 @@ xfs_alloc_ag_vextent_near(
938 * Increment the cursor, so we will point at the entry just right 1023 * Increment the cursor, so we will point at the entry just right
939 * of the leftward entry if any, or to the leftmost entry. 1024 * of the leftward entry if any, or to the leftmost entry.
940 */ 1025 */
941 if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i))) 1026 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
942 goto error0; 1027 goto error0;
943 if (!i) { 1028 if (!i) {
944 /* 1029 /*
@@ -961,7 +1046,7 @@ xfs_alloc_ag_vextent_near(
961 args->minlen, &ltbnoa, &ltlena); 1046 args->minlen, &ltbnoa, &ltlena);
962 if (ltlena >= args->minlen) 1047 if (ltlena >= args->minlen)
963 break; 1048 break;
964 if ((error = xfs_alloc_decrement(bno_cur_lt, 0, &i))) 1049 if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
965 goto error0; 1050 goto error0;
966 if (!i) { 1051 if (!i) {
967 xfs_btree_del_cursor(bno_cur_lt, 1052 xfs_btree_del_cursor(bno_cur_lt,
@@ -977,7 +1062,7 @@ xfs_alloc_ag_vextent_near(
977 args->minlen, &gtbnoa, &gtlena); 1062 args->minlen, &gtbnoa, &gtlena);
978 if (gtlena >= args->minlen) 1063 if (gtlena >= args->minlen)
979 break; 1064 break;
980 if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i))) 1065 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
981 goto error0; 1066 goto error0;
982 if (!i) { 1067 if (!i) {
983 xfs_btree_del_cursor(bno_cur_gt, 1068 xfs_btree_del_cursor(bno_cur_gt,
@@ -1066,7 +1151,7 @@ xfs_alloc_ag_vextent_near(
1066 /* 1151 /*
1067 * Fell off the right end. 1152 * Fell off the right end.
1068 */ 1153 */
1069 if ((error = xfs_alloc_increment( 1154 if ((error = xfs_btree_increment(
1070 bno_cur_gt, 0, &i))) 1155 bno_cur_gt, 0, &i)))
1071 goto error0; 1156 goto error0;
1072 if (!i) { 1157 if (!i) {
@@ -1162,7 +1247,7 @@ xfs_alloc_ag_vextent_near(
1162 /* 1247 /*
1163 * Fell off the left end. 1248 * Fell off the left end.
1164 */ 1249 */
1165 if ((error = xfs_alloc_decrement( 1250 if ((error = xfs_btree_decrement(
1166 bno_cur_lt, 0, &i))) 1251 bno_cur_lt, 0, &i)))
1167 goto error0; 1252 goto error0;
1168 if (!i) { 1253 if (!i) {
@@ -1267,8 +1352,8 @@ xfs_alloc_ag_vextent_size(
1267 /* 1352 /*
1268 * Allocate and initialize a cursor for the by-size btree. 1353 * Allocate and initialize a cursor for the by-size btree.
1269 */ 1354 */
1270 cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, 1355 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1271 args->agno, XFS_BTNUM_CNT, NULL, 0); 1356 args->agno, XFS_BTNUM_CNT);
1272 bno_cur = NULL; 1357 bno_cur = NULL;
1273 /* 1358 /*
1274 * Look for an entry >= maxlen+alignment-1 blocks. 1359 * Look for an entry >= maxlen+alignment-1 blocks.
@@ -1321,7 +1406,7 @@ xfs_alloc_ag_vextent_size(
1321 bestflen = flen; 1406 bestflen = flen;
1322 bestfbno = fbno; 1407 bestfbno = fbno;
1323 for (;;) { 1408 for (;;) {
1324 if ((error = xfs_alloc_decrement(cnt_cur, 0, &i))) 1409 if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
1325 goto error0; 1410 goto error0;
1326 if (i == 0) 1411 if (i == 0)
1327 break; 1412 break;
@@ -1372,8 +1457,8 @@ xfs_alloc_ag_vextent_size(
1372 /* 1457 /*
1373 * Allocate and initialize a cursor for the by-block tree. 1458 * Allocate and initialize a cursor for the by-block tree.
1374 */ 1459 */
1375 bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, 1460 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1376 args->agno, XFS_BTNUM_BNO, NULL, 0); 1461 args->agno, XFS_BTNUM_BNO);
1377 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, 1462 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
1378 rbno, rlen, XFSA_FIXUP_CNT_OK))) 1463 rbno, rlen, XFSA_FIXUP_CNT_OK)))
1379 goto error0; 1464 goto error0;
@@ -1416,7 +1501,7 @@ xfs_alloc_ag_vextent_small(
1416 xfs_extlen_t flen; 1501 xfs_extlen_t flen;
1417 int i; 1502 int i;
1418 1503
1419 if ((error = xfs_alloc_decrement(ccur, 0, &i))) 1504 if ((error = xfs_btree_decrement(ccur, 0, &i)))
1420 goto error0; 1505 goto error0;
1421 if (i) { 1506 if (i) {
1422 if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i))) 1507 if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
@@ -1515,8 +1600,7 @@ xfs_free_ag_extent(
1515 /* 1600 /*
1516 * Allocate and initialize a cursor for the by-block btree. 1601 * Allocate and initialize a cursor for the by-block btree.
1517 */ 1602 */
1518 bno_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO, NULL, 1603 bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
1519 0);
1520 cnt_cur = NULL; 1604 cnt_cur = NULL;
1521 /* 1605 /*
1522 * Look for a neighboring block on the left (lower block numbers) 1606 * Look for a neighboring block on the left (lower block numbers)
@@ -1549,7 +1633,7 @@ xfs_free_ag_extent(
1549 * Look for a neighboring block on the right (higher block numbers) 1633 * Look for a neighboring block on the right (higher block numbers)
1550 * that is contiguous with this space. 1634 * that is contiguous with this space.
1551 */ 1635 */
1552 if ((error = xfs_alloc_increment(bno_cur, 0, &haveright))) 1636 if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
1553 goto error0; 1637 goto error0;
1554 if (haveright) { 1638 if (haveright) {
1555 /* 1639 /*
@@ -1575,8 +1659,7 @@ xfs_free_ag_extent(
1575 /* 1659 /*
1576 * Now allocate and initialize a cursor for the by-size tree. 1660 * Now allocate and initialize a cursor for the by-size tree.
1577 */ 1661 */
1578 cnt_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT, NULL, 1662 cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT);
1579 0);
1580 /* 1663 /*
1581 * Have both left and right contiguous neighbors. 1664 * Have both left and right contiguous neighbors.
1582 * Merge all three into a single free block. 1665 * Merge all three into a single free block.
@@ -1588,7 +1671,7 @@ xfs_free_ag_extent(
1588 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) 1671 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
1589 goto error0; 1672 goto error0;
1590 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1673 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1591 if ((error = xfs_alloc_delete(cnt_cur, &i))) 1674 if ((error = xfs_btree_delete(cnt_cur, &i)))
1592 goto error0; 1675 goto error0;
1593 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1676 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1594 /* 1677 /*
@@ -1597,19 +1680,19 @@ xfs_free_ag_extent(
1597 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) 1680 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
1598 goto error0; 1681 goto error0;
1599 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1682 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1600 if ((error = xfs_alloc_delete(cnt_cur, &i))) 1683 if ((error = xfs_btree_delete(cnt_cur, &i)))
1601 goto error0; 1684 goto error0;
1602 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1685 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1603 /* 1686 /*
1604 * Delete the old by-block entry for the right block. 1687 * Delete the old by-block entry for the right block.
1605 */ 1688 */
1606 if ((error = xfs_alloc_delete(bno_cur, &i))) 1689 if ((error = xfs_btree_delete(bno_cur, &i)))
1607 goto error0; 1690 goto error0;
1608 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1691 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1609 /* 1692 /*
1610 * Move the by-block cursor back to the left neighbor. 1693 * Move the by-block cursor back to the left neighbor.
1611 */ 1694 */
1612 if ((error = xfs_alloc_decrement(bno_cur, 0, &i))) 1695 if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
1613 goto error0; 1696 goto error0;
1614 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1697 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1615#ifdef DEBUG 1698#ifdef DEBUG
@@ -1648,14 +1731,14 @@ xfs_free_ag_extent(
1648 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) 1731 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
1649 goto error0; 1732 goto error0;
1650 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1733 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1651 if ((error = xfs_alloc_delete(cnt_cur, &i))) 1734 if ((error = xfs_btree_delete(cnt_cur, &i)))
1652 goto error0; 1735 goto error0;
1653 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1736 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1654 /* 1737 /*
1655 * Back up the by-block cursor to the left neighbor, and 1738 * Back up the by-block cursor to the left neighbor, and
1656 * update its length. 1739 * update its length.
1657 */ 1740 */
1658 if ((error = xfs_alloc_decrement(bno_cur, 0, &i))) 1741 if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
1659 goto error0; 1742 goto error0;
1660 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1743 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1661 nbno = ltbno; 1744 nbno = ltbno;
@@ -1674,7 +1757,7 @@ xfs_free_ag_extent(
1674 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) 1757 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
1675 goto error0; 1758 goto error0;
1676 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1759 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1677 if ((error = xfs_alloc_delete(cnt_cur, &i))) 1760 if ((error = xfs_btree_delete(cnt_cur, &i)))
1678 goto error0; 1761 goto error0;
1679 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1762 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1680 /* 1763 /*
@@ -1693,7 +1776,7 @@ xfs_free_ag_extent(
1693 else { 1776 else {
1694 nbno = bno; 1777 nbno = bno;
1695 nlen = len; 1778 nlen = len;
1696 if ((error = xfs_alloc_insert(bno_cur, &i))) 1779 if ((error = xfs_btree_insert(bno_cur, &i)))
1697 goto error0; 1780 goto error0;
1698 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1781 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1699 } 1782 }
@@ -1705,7 +1788,7 @@ xfs_free_ag_extent(
1705 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) 1788 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
1706 goto error0; 1789 goto error0;
1707 XFS_WANT_CORRUPTED_GOTO(i == 0, error0); 1790 XFS_WANT_CORRUPTED_GOTO(i == 0, error0);
1708 if ((error = xfs_alloc_insert(cnt_cur, &i))) 1791 if ((error = xfs_btree_insert(cnt_cur, &i)))
1709 goto error0; 1792 goto error0;
1710 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1793 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1711 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1794 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
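Every xfs_alloc_delete/insert/increment/decrement call in the hunks above becomes a call into the generic btree layer. The dominant idiom in xfs_free_ag_extent() is delete-then-reinsert, since a record's length is part of its by-size key; as a hedged sketch with hypothetical bno/oldlen/newlen:

	/* sketch: resize a by-size record by delete + reinsert */
	if ((error = xfs_alloc_lookup_eq(cnt_cur, bno, oldlen, &i)))
		goto error0;
	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);	/* must exist */
	if ((error = xfs_btree_delete(cnt_cur, &i)))
		goto error0;
	if ((error = xfs_alloc_lookup_eq(cnt_cur, bno, newlen, &i)))
		goto error0;
	XFS_WANT_CORRUPTED_GOTO(i == 0, error0);	/* must be absent */
	if ((error = xfs_btree_insert(cnt_cur, &i)))
		goto error0;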
@@ -2150,51 +2233,83 @@ xfs_alloc_put_freelist(
2150 * Read in the allocation group header (free/alloc section). 2233 * Read in the allocation group header (free/alloc section).
2151 */ 2234 */
2152int /* error */ 2235int /* error */
2153xfs_alloc_read_agf( 2236xfs_read_agf(
2154 xfs_mount_t *mp, /* mount point structure */ 2237 struct xfs_mount *mp, /* mount point structure */
2155 xfs_trans_t *tp, /* transaction pointer */ 2238 struct xfs_trans *tp, /* transaction pointer */
2156 xfs_agnumber_t agno, /* allocation group number */ 2239 xfs_agnumber_t agno, /* allocation group number */
2157 int flags, /* XFS_ALLOC_FLAG_... */ 2240 int flags, /* XFS_BUF_ */
2158 xfs_buf_t **bpp) /* buffer for the ag freelist header */ 2241 struct xfs_buf **bpp) /* buffer for the ag freelist header */
2159{ 2242{
2160 xfs_agf_t *agf; /* ag freelist header */ 2243 struct xfs_agf *agf; /* ag freelist header */
2161 int agf_ok; /* set if agf is consistent */ 2244 int agf_ok; /* set if agf is consistent */
2162 xfs_buf_t *bp; /* return value */
2163 xfs_perag_t *pag; /* per allocation group data */
2164 int error; 2245 int error;
2165 2246
2166 ASSERT(agno != NULLAGNUMBER); 2247 ASSERT(agno != NULLAGNUMBER);
2167 error = xfs_trans_read_buf( 2248 error = xfs_trans_read_buf(
2168 mp, tp, mp->m_ddev_targp, 2249 mp, tp, mp->m_ddev_targp,
2169 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), 2250 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
2170 XFS_FSS_TO_BB(mp, 1), 2251 XFS_FSS_TO_BB(mp, 1), flags, bpp);
2171 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0U,
2172 &bp);
2173 if (error) 2252 if (error)
2174 return error; 2253 return error;
2175 ASSERT(!bp || !XFS_BUF_GETERROR(bp)); 2254 if (!*bpp)
2176 if (!bp) {
2177 *bpp = NULL;
2178 return 0; 2255 return 0;
2179 } 2256
2257 ASSERT(!XFS_BUF_GETERROR(*bpp));
2258 agf = XFS_BUF_TO_AGF(*bpp);
2259
2180 /* 2260 /*
2181 * Validate the magic number of the agf block. 2261 * Validate the magic number of the agf block.
2182 */ 2262 */
2183 agf = XFS_BUF_TO_AGF(bp);
2184 agf_ok = 2263 agf_ok =
2185 be32_to_cpu(agf->agf_magicnum) == XFS_AGF_MAGIC && 2264 be32_to_cpu(agf->agf_magicnum) == XFS_AGF_MAGIC &&
2186 XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && 2265 XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
2187 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && 2266 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
2188 be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && 2267 be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
2189 be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && 2268 be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
2190 be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp); 2269 be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp) &&
2270 be32_to_cpu(agf->agf_seqno) == agno;
2271 if (xfs_sb_version_haslazysbcount(&mp->m_sb))
2272 agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
2273 be32_to_cpu(agf->agf_length);
2191 if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, 2274 if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
2192 XFS_RANDOM_ALLOC_READ_AGF))) { 2275 XFS_RANDOM_ALLOC_READ_AGF))) {
2193 XFS_CORRUPTION_ERROR("xfs_alloc_read_agf", 2276 XFS_CORRUPTION_ERROR("xfs_alloc_read_agf",
2194 XFS_ERRLEVEL_LOW, mp, agf); 2277 XFS_ERRLEVEL_LOW, mp, agf);
2195 xfs_trans_brelse(tp, bp); 2278 xfs_trans_brelse(tp, *bpp);
2196 return XFS_ERROR(EFSCORRUPTED); 2279 return XFS_ERROR(EFSCORRUPTED);
2197 } 2280 }
2281
2282 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF);
2283 return 0;
2284}
2285
2286/*
2287 * Read in the allocation group header (free/alloc section).
2288 */
2289int /* error */
2290xfs_alloc_read_agf(
2291 struct xfs_mount *mp, /* mount point structure */
2292 struct xfs_trans *tp, /* transaction pointer */
2293 xfs_agnumber_t agno, /* allocation group number */
2294 int flags, /* XFS_ALLOC_FLAG_... */
2295 struct xfs_buf **bpp) /* buffer for the ag freelist header */
2296{
2297 struct xfs_agf *agf; /* ag freelist header */
2298 struct xfs_perag *pag; /* per allocation group data */
2299 int error;
2300
2301 ASSERT(agno != NULLAGNUMBER);
2302
2303 error = xfs_read_agf(mp, tp, agno,
2304 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0,
2305 bpp);
2306 if (error)
2307 return error;
2308 if (!*bpp)
2309 return 0;
2310 ASSERT(!XFS_BUF_GETERROR(*bpp));
2311
2312 agf = XFS_BUF_TO_AGF(*bpp);
2198 pag = &mp->m_perag[agno]; 2313 pag = &mp->m_perag[agno];
2199 if (!pag->pagf_init) { 2314 if (!pag->pagf_init) {
2200 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); 2315 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
@@ -2213,6 +2328,7 @@ xfs_alloc_read_agf(
2213#ifdef DEBUG 2328#ifdef DEBUG
2214 else if (!XFS_FORCED_SHUTDOWN(mp)) { 2329 else if (!XFS_FORCED_SHUTDOWN(mp)) {
2215 ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); 2330 ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
2331 ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
2216 ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); 2332 ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
2217 ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest)); 2333 ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
2218 ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] == 2334 ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
@@ -2221,8 +2337,6 @@ xfs_alloc_read_agf(
2221 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); 2337 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
2222 } 2338 }
2223#endif 2339#endif
2224 XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGF, XFS_AGF_REF);
2225 *bpp = bp;
2226 return 0; 2340 return 0;
2227} 2341}
2228 2342
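Splitting xfs_read_agf() out of xfs_alloc_read_agf() lets callers read and verify an AGF without touching the perag structures. A hedged caller sketch:

	/* sketch: raw AGF read that skips perag initialisation */
	struct xfs_buf	*agfbp;
	int		error;

	error = xfs_read_agf(mp, tp, agno, XFS_BUF_TRYLOCK, &agfbp);
	if (error)
		return error;
	if (!agfbp)
		return 0;	/* trylock failed; caller retries later */
	/* ... inspect XFS_BUF_TO_AGF(agfbp) ... */
	xfs_trans_brelse(tp, agfbp);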
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 5aec15d0651e..588172796f7b 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -121,6 +121,19 @@ extern ktrace_t *xfs_alloc_trace_buf;
121#define XFS_ALLOC_KTRACE_BUSYSEARCH 6 121#define XFS_ALLOC_KTRACE_BUSYSEARCH 6
122#endif 122#endif
123 123
124void
125xfs_alloc_mark_busy(xfs_trans_t *tp,
126 xfs_agnumber_t agno,
127 xfs_agblock_t bno,
128 xfs_extlen_t len);
129
130void
131xfs_alloc_clear_busy(xfs_trans_t *tp,
132 xfs_agnumber_t ag,
133 int idx);
134
135#endif /* __KERNEL__ */
136
124/* 137/*
125 * Compute and fill in value of m_ag_maxlevels. 138 * Compute and fill in value of m_ag_maxlevels.
126 */ 139 */
@@ -196,18 +209,4 @@ xfs_free_extent(
196 xfs_fsblock_t bno, /* starting block number of extent */ 209 xfs_fsblock_t bno, /* starting block number of extent */
197 xfs_extlen_t len); /* length of extent */ 210 xfs_extlen_t len); /* length of extent */
198 211
199void
200xfs_alloc_mark_busy(xfs_trans_t *tp,
201 xfs_agnumber_t agno,
202 xfs_agblock_t bno,
203 xfs_extlen_t len);
204
205void
206xfs_alloc_clear_busy(xfs_trans_t *tp,
207 xfs_agnumber_t ag,
208 int idx);
209
210
211#endif /* __KERNEL__ */
212
213#endif /* __XFS_ALLOC_H__ */ 212#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 3ce2645508ae..733cb75a8c5d 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -35,2177 +35,464 @@
35#include "xfs_dinode.h" 35#include "xfs_dinode.h"
36#include "xfs_inode.h" 36#include "xfs_inode.h"
37#include "xfs_btree.h" 37#include "xfs_btree.h"
38#include "xfs_btree_trace.h"
38#include "xfs_ialloc.h" 39#include "xfs_ialloc.h"
39#include "xfs_alloc.h" 40#include "xfs_alloc.h"
40#include "xfs_error.h" 41#include "xfs_error.h"
41 42
42/*
43 * Prototypes for internal functions.
44 */
45 43
46STATIC void xfs_alloc_log_block(xfs_trans_t *, xfs_buf_t *, int); 44STATIC struct xfs_btree_cur *
47STATIC void xfs_alloc_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int); 45xfs_allocbt_dup_cursor(
48STATIC void xfs_alloc_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int); 46 struct xfs_btree_cur *cur)
49STATIC void xfs_alloc_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int); 47{
50STATIC int xfs_alloc_lshift(xfs_btree_cur_t *, int, int *); 48 return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp,
51STATIC int xfs_alloc_newroot(xfs_btree_cur_t *, int *); 49 cur->bc_private.a.agbp, cur->bc_private.a.agno,
52STATIC int xfs_alloc_rshift(xfs_btree_cur_t *, int, int *); 50 cur->bc_btnum);
53STATIC int xfs_alloc_split(xfs_btree_cur_t *, int, xfs_agblock_t *, 51}
54 xfs_alloc_key_t *, xfs_btree_cur_t **, int *);
55STATIC int xfs_alloc_updkey(xfs_btree_cur_t *, xfs_alloc_key_t *, int);
56 52
57/* 53STATIC void
58 * Internal functions. 54xfs_allocbt_set_root(
59 */ 55 struct xfs_btree_cur *cur,
56 union xfs_btree_ptr *ptr,
57 int inc)
58{
59 struct xfs_buf *agbp = cur->bc_private.a.agbp;
60 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
61 xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
62 int btnum = cur->bc_btnum;
60 63
61/* 64 ASSERT(ptr->s != 0);
62 * Single level of the xfs_alloc_delete record deletion routine. 65
63 * Delete record pointed to by cur/level. 66 agf->agf_roots[btnum] = ptr->s;
64 * Remove the record from its block then rebalance the tree. 67 be32_add_cpu(&agf->agf_levels[btnum], inc);
65 * Return 0 for error, 1 for done, 2 to go on to the next level. 68 cur->bc_mp->m_perag[seqno].pagf_levels[btnum] += inc;
66 */ 69
67STATIC int /* error */ 70 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
68xfs_alloc_delrec( 71}
69 xfs_btree_cur_t *cur, /* btree cursor */ 72
70 int level, /* level removing record from */ 73STATIC int
71 int *stat) /* fail/done/go-on */ 74xfs_allocbt_alloc_block(
75 struct xfs_btree_cur *cur,
76 union xfs_btree_ptr *start,
77 union xfs_btree_ptr *new,
78 int length,
79 int *stat)
72{ 80{
73 xfs_agf_t *agf; /* allocation group freelist header */ 81 int error;
74 xfs_alloc_block_t *block; /* btree block record/key lives in */ 82 xfs_agblock_t bno;
75 xfs_agblock_t bno; /* btree block number */
76 xfs_buf_t *bp; /* buffer for block */
77 int error; /* error return value */
78 int i; /* loop index */
79 xfs_alloc_key_t key; /* kp points here if block is level 0 */
80 xfs_agblock_t lbno; /* left block's block number */
81 xfs_buf_t *lbp; /* left block's buffer pointer */
82 xfs_alloc_block_t *left; /* left btree block */
83 xfs_alloc_key_t *lkp=NULL; /* left block key pointer */
84 xfs_alloc_ptr_t *lpp=NULL; /* left block address pointer */
85 int lrecs=0; /* number of records in left block */
86 xfs_alloc_rec_t *lrp; /* left block record pointer */
87 xfs_mount_t *mp; /* mount structure */
88 int ptr; /* index in btree block for this rec */
89 xfs_agblock_t rbno; /* right block's block number */
90 xfs_buf_t *rbp; /* right block's buffer pointer */
91 xfs_alloc_block_t *right; /* right btree block */
92 xfs_alloc_key_t *rkp; /* right block key pointer */
93 xfs_alloc_ptr_t *rpp; /* right block address pointer */
94 int rrecs=0; /* number of records in right block */
95 int numrecs;
96 xfs_alloc_rec_t *rrp; /* right block record pointer */
97 xfs_btree_cur_t *tcur; /* temporary btree cursor */
98 83
99 /* 84 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
100 * Get the index of the entry being deleted, check for nothing there. 85
101 */ 86 /* Allocate the new block from the freelist. If we can't, give up. */
102 ptr = cur->bc_ptrs[level]; 87 error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
103 if (ptr == 0) { 88 &bno, 1);
104 *stat = 0; 89 if (error) {
105 return 0; 90 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
106 }
107 /*
108 * Get the buffer & block containing the record or key/ptr.
109 */
110 bp = cur->bc_bufs[level];
111 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
112#ifdef DEBUG
113 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
114 return error; 91 return error;
115#endif 92 }
116 /* 93
117 * Fail if we're off the end of the block. 94 if (bno == NULLAGBLOCK) {
118 */ 95 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
119 numrecs = be16_to_cpu(block->bb_numrecs);
120 if (ptr > numrecs) {
121 *stat = 0; 96 *stat = 0;
122 return 0; 97 return 0;
123 } 98 }
124 XFS_STATS_INC(xs_abt_delrec);
125 /*
126 * It's a nonleaf. Excise the key and ptr being deleted, by
127 * sliding the entries past them down one.
128 * Log the changed areas of the block.
129 */
130 if (level > 0) {
131 lkp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
132 lpp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
133#ifdef DEBUG
134 for (i = ptr; i < numrecs; i++) {
135 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level)))
136 return error;
137 }
138#endif
139 if (ptr < numrecs) {
140 memmove(&lkp[ptr - 1], &lkp[ptr],
141 (numrecs - ptr) * sizeof(*lkp));
142 memmove(&lpp[ptr - 1], &lpp[ptr],
143 (numrecs - ptr) * sizeof(*lpp));
144 xfs_alloc_log_ptrs(cur, bp, ptr, numrecs - 1);
145 xfs_alloc_log_keys(cur, bp, ptr, numrecs - 1);
146 }
147 }
148 /*
149 * It's a leaf. Excise the record being deleted, by sliding the
150 * entries past it down one. Log the changed areas of the block.
151 */
152 else {
153 lrp = XFS_ALLOC_REC_ADDR(block, 1, cur);
154 if (ptr < numrecs) {
155 memmove(&lrp[ptr - 1], &lrp[ptr],
156 (numrecs - ptr) * sizeof(*lrp));
157 xfs_alloc_log_recs(cur, bp, ptr, numrecs - 1);
158 }
159 /*
160 * If it's the first record in the block, we'll need a key
161 * structure to pass up to the next level (updkey).
162 */
163 if (ptr == 1) {
164 key.ar_startblock = lrp->ar_startblock;
165 key.ar_blockcount = lrp->ar_blockcount;
166 lkp = &key;
167 }
168 }
169 /*
170 * Decrement and log the number of entries in the block.
171 */
172 numrecs--;
173 block->bb_numrecs = cpu_to_be16(numrecs);
174 xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
175 /*
176 * See if the longest free extent in the allocation group was
177 * changed by this operation. True if it's the by-size btree, and
178 * this is the leaf level, and there is no right sibling block,
179 * and this was the last record.
180 */
181 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
182 mp = cur->bc_mp;
183 99
184 if (level == 0 && 100 xfs_trans_agbtree_delta(cur->bc_tp, 1);
185 cur->bc_btnum == XFS_BTNUM_CNT && 101 new->s = cpu_to_be32(bno);
186 be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK &&
187 ptr > numrecs) {
188 ASSERT(ptr == numrecs + 1);
189 /*
190 * There are still records in the block. Grab the size
191 * from the last one.
192 */
193 if (numrecs) {
194 rrp = XFS_ALLOC_REC_ADDR(block, numrecs, cur);
195 agf->agf_longest = rrp->ar_blockcount;
196 }
197 /*
198 * No free extents left.
199 */
200 else
201 agf->agf_longest = 0;
202 mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_longest =
203 be32_to_cpu(agf->agf_longest);
204 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
205 XFS_AGF_LONGEST);
206 }
207 /*
208 * Is this the root level? If so, we're almost done.
209 */
210 if (level == cur->bc_nlevels - 1) {
211 /*
212 * If this is the root level,
213 * and there's only one entry left,
214 * and it's NOT the leaf level,
215 * then we can get rid of this level.
216 */
217 if (numrecs == 1 && level > 0) {
218 /*
219 * lpp is still set to the first pointer in the block.
220 * Make it the new root of the btree.
221 */
222 bno = be32_to_cpu(agf->agf_roots[cur->bc_btnum]);
223 agf->agf_roots[cur->bc_btnum] = *lpp;
224 be32_add_cpu(&agf->agf_levels[cur->bc_btnum], -1);
225 mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_levels[cur->bc_btnum]--;
226 /*
227 * Put this buffer/block on the ag's freelist.
228 */
229 error = xfs_alloc_put_freelist(cur->bc_tp,
230 cur->bc_private.a.agbp, NULL, bno, 1);
231 if (error)
232 return error;
233 /*
234 * Since blocks move to the free list without the
235 * coordination used in xfs_bmap_finish, we can't allow
236 * block to be available for reallocation and
237 * non-transaction writing (user data) until we know
238 * that the transaction that moved it to the free list
239 * is permanently on disk. We track the blocks by
240 * declaring these blocks as "busy"; the busy list is
241 * maintained on a per-ag basis and each transaction
242 * records which entries should be removed when the
243 * iclog commits to disk. If a busy block is
244 * allocated, the iclog is pushed up to the LSN
245 * that freed the block.
246 */
247 xfs_alloc_mark_busy(cur->bc_tp,
248 be32_to_cpu(agf->agf_seqno), bno, 1);
249 102
250 xfs_trans_agbtree_delta(cur->bc_tp, -1); 103 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
251 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, 104 *stat = 1;
252 XFS_AGF_ROOTS | XFS_AGF_LEVELS); 105 return 0;
253 /* 106}
254 * Update the cursor so there's one fewer level.
255 */
256 xfs_btree_setbuf(cur, level, NULL);
257 cur->bc_nlevels--;
258 } else if (level > 0 &&
259 (error = xfs_alloc_decrement(cur, level, &i)))
260 return error;
261 *stat = 1;
262 return 0;
263 }
264 /*
265 * If we deleted the leftmost entry in the block, update the
266 * key values above us in the tree.
267 */
268 if (ptr == 1 && (error = xfs_alloc_updkey(cur, lkp, level + 1)))
269 return error;
270 /*
271 * If the number of records remaining in the block is at least
272 * the minimum, we're done.
273 */
274 if (numrecs >= XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
275 if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i)))
276 return error;
277 *stat = 1;
278 return 0;
279 }
280 /*
281 * Otherwise, we have to move some records around to keep the
282 * tree balanced. Look at the left and right sibling blocks to
283 * see if we can re-balance by moving only one record.
284 */
285 rbno = be32_to_cpu(block->bb_rightsib);
286 lbno = be32_to_cpu(block->bb_leftsib);
287 bno = NULLAGBLOCK;
288 ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK);
289 /*
290 * Duplicate the cursor so our btree manipulations here won't
291 * disrupt the next level up.
292 */
293 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
294 return error;
295 /*
296 * If there's a right sibling, see if it's ok to shift an entry
297 * out of it.
298 */
299 if (rbno != NULLAGBLOCK) {
300 /*
301 * Move the temp cursor to the last entry in the next block.
302 * Actually any entry but the first would suffice.
303 */
304 i = xfs_btree_lastrec(tcur, level);
305 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
306 if ((error = xfs_alloc_increment(tcur, level, &i)))
307 goto error0;
308 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
309 i = xfs_btree_lastrec(tcur, level);
310 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
311 /*
312 * Grab a pointer to the block.
313 */
314 rbp = tcur->bc_bufs[level];
315 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
316#ifdef DEBUG
317 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
318 goto error0;
319#endif
320 /*
321 * Grab the current block number, for future use.
322 */
323 bno = be32_to_cpu(right->bb_leftsib);
324 /*
325 * If right block is full enough so that removing one entry
326 * won't make it too empty, and left-shifting an entry out
327 * of right to us works, we're done.
328 */
329 if (be16_to_cpu(right->bb_numrecs) - 1 >=
330 XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
331 if ((error = xfs_alloc_lshift(tcur, level, &i)))
332 goto error0;
333 if (i) {
334 ASSERT(be16_to_cpu(block->bb_numrecs) >=
335 XFS_ALLOC_BLOCK_MINRECS(level, cur));
336 xfs_btree_del_cursor(tcur,
337 XFS_BTREE_NOERROR);
338 if (level > 0 &&
339 (error = xfs_alloc_decrement(cur, level,
340 &i)))
341 return error;
342 *stat = 1;
343 return 0;
344 }
345 }
346 /*
347 * Otherwise, grab the number of records in right for
348 * future reference, and fix up the temp cursor to point
349 * to our block again (last record).
350 */
351 rrecs = be16_to_cpu(right->bb_numrecs);
352 if (lbno != NULLAGBLOCK) {
353 i = xfs_btree_firstrec(tcur, level);
354 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
355 if ((error = xfs_alloc_decrement(tcur, level, &i)))
356 goto error0;
357 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
358 }
359 }
360 /*
361 * If there's a left sibling, see if it's ok to shift an entry
362 * out of it.
363 */
364 if (lbno != NULLAGBLOCK) {
365 /*
366 * Move the temp cursor to the first entry in the
367 * previous block.
368 */
369 i = xfs_btree_firstrec(tcur, level);
370 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
371 if ((error = xfs_alloc_decrement(tcur, level, &i)))
372 goto error0;
373 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
374 xfs_btree_firstrec(tcur, level);
375 /*
376 * Grab a pointer to the block.
377 */
378 lbp = tcur->bc_bufs[level];
379 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
380#ifdef DEBUG
381 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
382 goto error0;
383#endif
384 /*
385 * Grab the current block number, for future use.
386 */
387 bno = be32_to_cpu(left->bb_rightsib);
388 /*
389 * If left block is full enough so that removing one entry
390 * won't make it too empty, and right-shifting an entry out
391 * of left to us works, we're done.
392 */
393 if (be16_to_cpu(left->bb_numrecs) - 1 >=
394 XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
395 if ((error = xfs_alloc_rshift(tcur, level, &i)))
396 goto error0;
397 if (i) {
398 ASSERT(be16_to_cpu(block->bb_numrecs) >=
399 XFS_ALLOC_BLOCK_MINRECS(level, cur));
400 xfs_btree_del_cursor(tcur,
401 XFS_BTREE_NOERROR);
402 if (level == 0)
403 cur->bc_ptrs[0]++;
404 *stat = 1;
405 return 0;
406 }
407 }
408 /*
409 * Otherwise, grab the number of records in right for
410 * future reference.
411 */
412 lrecs = be16_to_cpu(left->bb_numrecs);
413 }
414 /*
415 * Delete the temp cursor, we're done with it.
416 */
417 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
418 /*
419 * If here, we need to do a join to keep the tree balanced.
420 */
421 ASSERT(bno != NULLAGBLOCK);
422 /*
423 * See if we can join with the left neighbor block.
424 */
425 if (lbno != NULLAGBLOCK &&
426 lrecs + numrecs <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
427 /*
428 * Set "right" to be the starting block,
429 * "left" to be the left neighbor.
430 */
431 rbno = bno;
432 right = block;
433 rrecs = be16_to_cpu(right->bb_numrecs);
434 rbp = bp;
435 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
436 cur->bc_private.a.agno, lbno, 0, &lbp,
437 XFS_ALLOC_BTREE_REF)))
438 return error;
439 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
440 lrecs = be16_to_cpu(left->bb_numrecs);
441 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
442 return error;
443 }
444 /*
445 * If that won't work, see if we can join with the right neighbor block.
446 */
447 else if (rbno != NULLAGBLOCK &&
448 rrecs + numrecs <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
449 /*
450 * Set "left" to be the starting block,
451 * "right" to be the right neighbor.
452 */
453 lbno = bno;
454 left = block;
455 lrecs = be16_to_cpu(left->bb_numrecs);
456 lbp = bp;
457 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
458 cur->bc_private.a.agno, rbno, 0, &rbp,
459 XFS_ALLOC_BTREE_REF)))
460 return error;
461 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
462 rrecs = be16_to_cpu(right->bb_numrecs);
463 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
464 return error;
465 }
466 /*
467 * Otherwise, we can't fix the imbalance.
468 * Just return. This is probably a logic error, but it's not fatal.
469 */
470 else {
471 if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i)))
472 return error;
473 *stat = 1;
474 return 0;
475 }
476 /*
477 * We're now going to join "left" and "right" by moving all the stuff
478 * in "right" to "left" and deleting "right".
479 */
480 if (level > 0) {
481 /*
482 * It's a non-leaf. Move keys and pointers.
483 */
484 lkp = XFS_ALLOC_KEY_ADDR(left, lrecs + 1, cur);
485 lpp = XFS_ALLOC_PTR_ADDR(left, lrecs + 1, cur);
486 rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
487 rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
488#ifdef DEBUG
489 for (i = 0; i < rrecs; i++) {
490 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
491 return error;
492 }
493#endif
494 memcpy(lkp, rkp, rrecs * sizeof(*lkp));
495 memcpy(lpp, rpp, rrecs * sizeof(*lpp));
496 xfs_alloc_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
497 xfs_alloc_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
498 } else {
499 /*
500 * It's a leaf. Move records.
501 */
502 lrp = XFS_ALLOC_REC_ADDR(left, lrecs + 1, cur);
503 rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
504 memcpy(lrp, rrp, rrecs * sizeof(*lrp));
505 xfs_alloc_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
506 }
507 /*
508 * If we joined with the left neighbor, set the buffer in the
509 * cursor to the left block, and fix up the index.
510 */
511 if (bp != lbp) {
512 xfs_btree_setbuf(cur, level, lbp);
513 cur->bc_ptrs[level] += lrecs;
514 }
515 /*
516 * If we joined with the right neighbor and there's a level above
517 * us, increment the cursor at that level.
518 */
519 else if (level + 1 < cur->bc_nlevels &&
520 (error = xfs_alloc_increment(cur, level + 1, &i)))
521 return error;
522 /*
523 * Fix up the number of records in the surviving block.
524 */
525 lrecs += rrecs;
526 left->bb_numrecs = cpu_to_be16(lrecs);
527 /*
528 * Fix up the right block pointer in the surviving block, and log it.
529 */
530 left->bb_rightsib = right->bb_rightsib;
531 xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
532 /*
533 * If there is a right sibling now, make it point to the
534 * remaining block.
535 */
536 if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) {
537 xfs_alloc_block_t *rrblock;
538 xfs_buf_t *rrbp;
539
540 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
541 cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0,
542 &rrbp, XFS_ALLOC_BTREE_REF)))
543 return error;
544 rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp);
545 if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
546 return error;
547 rrblock->bb_leftsib = cpu_to_be32(lbno);
548 xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
549 }
550 /*
551 * Free the deleted block by putting it on the freelist.
552 */
553 error = xfs_alloc_put_freelist(cur->bc_tp,
554 cur->bc_private.a.agbp, NULL, rbno, 1);
555 if (error)
556 return error;
557 /*
558 * Since blocks move to the free list without the coordination
559 * used in xfs_bmap_finish, we can't allow a block to be available
560 * for reallocation and non-transaction writing (user data)
561 * until we know that the transaction that moved it to the free
562 * list is permanently on disk. We track the blocks by declaring
563 * these blocks as "busy"; the busy list is maintained on a
564 * per-ag basis and each transaction records which entries
565 * should be removed when the iclog commits to disk. If a
566 * busy block is allocated, the iclog is pushed up to the
567 * LSN that freed the block.
568 */
569 xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
570 xfs_trans_agbtree_delta(cur->bc_tp, -1);
571
572 /*
573 * Adjust the current level's cursor so that we're left referring
574 * to the right node, after we're done.
575 * If this leaves the ptr value 0 our caller will fix it up.
576 */
577 if (level > 0)
578 cur->bc_ptrs[level]--;
579 /*
580 * Return value means the next level up has something to do.
581 */
582 *stat = 2;
583 return 0;
584
585error0:
586 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
587 return error;
588}
589
107
108STATIC int
109xfs_allocbt_free_block(
110 struct xfs_btree_cur *cur,
111 struct xfs_buf *bp)
112{
113 struct xfs_buf *agbp = cur->bc_private.a.agbp;
114 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
115 xfs_agblock_t bno;
116 int error;
117
118 bno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(bp));
119 error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
120 if (error)
121 return error;
122
123 /*
124 * Since blocks move to the free list without the coordination used in
125 * xfs_bmap_finish, we can't allow a block to be available for
126 * reallocation and non-transaction writing (user data) until we know
127 * that the transaction that moved it to the free list is permanently
128 * on disk. We track the blocks by declaring these blocks as "busy";
129 * the busy list is maintained on a per-ag basis and each transaction
130 * records which entries should be removed when the iclog commits to
131 * disk. If a busy block is allocated, the iclog is pushed up to the
132 * LSN that freed the block.
133 */
134 xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
135 xfs_trans_agbtree_delta(cur->bc_tp, -1);
136 return 0;
137}
138
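/*
 * Toy model of the rebalancing policy delrec applies once a block drops
 * below the minimum-records threshold: first try to borrow one entry from a
 * sibling that can spare it, otherwise merge with a sibling. A sketch under
 * simplified assumptions (in-memory counts only); none of these names are
 * kernel interfaces.
 */
enum rebalance { BORROW_LEFT, BORROW_RIGHT, MERGE_LEFT, MERGE_RIGHT, NONE };

static enum rebalance pick_rebalance(int nrecs, int lrecs, int rrecs,
				     int minrecs, int maxrecs)
{
	if (nrecs >= minrecs)
		return NONE;			/* block is still legal */
	if (rrecs - 1 >= minrecs)
		return BORROW_RIGHT;		/* left-shift one entry in */
	if (lrecs - 1 >= minrecs)
		return BORROW_LEFT;		/* right-shift one entry in */
	if (lrecs + nrecs <= maxrecs)
		return MERGE_LEFT;		/* join with left sibling */
	if (rrecs + nrecs <= maxrecs)
		return MERGE_RIGHT;		/* join with right sibling */
	return NONE;				/* shouldn't happen */
}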
590/*
591 * Insert one record/level. Return information to the caller
592 * allowing the next level up to proceed if necessary.
593 */
594STATIC int /* error */
595xfs_alloc_insrec(
596 xfs_btree_cur_t *cur, /* btree cursor */
597 int level, /* level to insert record at */
598 xfs_agblock_t *bnop, /* i/o: block number inserted */
599 xfs_alloc_rec_t *recp, /* i/o: record data inserted */
600 xfs_btree_cur_t **curp, /* output: new cursor replacing cur */
601 int *stat) /* output: success/failure */
602{
603 xfs_agf_t *agf; /* allocation group freelist header */
604 xfs_alloc_block_t *block; /* btree block record/key lives in */
605 xfs_buf_t *bp; /* buffer for block */
606 int error; /* error return value */
607 int i; /* loop index */
608 xfs_alloc_key_t key; /* key value being inserted */
609 xfs_alloc_key_t *kp; /* pointer to btree keys */
610 xfs_agblock_t nbno; /* block number of allocated block */
611 xfs_btree_cur_t *ncur; /* new cursor to be used at next lvl */
612 xfs_alloc_key_t nkey; /* new key value, from split */
613 xfs_alloc_rec_t nrec; /* new record value, for caller */
614 int numrecs;
615 int optr; /* old ptr value */
616 xfs_alloc_ptr_t *pp; /* pointer to btree addresses */
617 int ptr; /* index in btree block for this rec */
618 xfs_alloc_rec_t *rp; /* pointer to btree records */
619
620 ASSERT(be32_to_cpu(recp->ar_blockcount) > 0);
139/*
140 * Update the longest extent in the AGF
141 */
142STATIC void
143xfs_allocbt_update_lastrec(
144 struct xfs_btree_cur *cur,
145 struct xfs_btree_block *block,
146 union xfs_btree_rec *rec,
147 int ptr,
148 int reason)
149{
150 struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
151 xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
152 __be32 len;
153 int numrecs;
154
155 ASSERT(cur->bc_btnum == XFS_BTNUM_CNT);
156
157 switch (reason) {
158 case LASTREC_UPDATE:
159 /*
160 * If this is the last leaf block and it's the last record,
161 * then update the size of the longest extent in the AG.
162 */
163 if (ptr != xfs_btree_get_numrecs(block))
164 return;
165 len = rec->alloc.ar_blockcount;
166 break;
167 case LASTREC_INSREC:
168 if (be32_to_cpu(rec->alloc.ar_blockcount) <=
169 be32_to_cpu(agf->agf_longest))
170 return;
171 len = rec->alloc.ar_blockcount;
172 break;
173 case LASTREC_DELREC:
174 numrecs = xfs_btree_get_numrecs(block);
175 if (ptr <= numrecs)
176 return;
177 ASSERT(ptr == numrecs + 1);
178
179 if (numrecs) {
180 xfs_alloc_rec_t *rrp;
181
182 rrp = XFS_ALLOC_REC_ADDR(cur->bc_mp, block, numrecs);
183 len = rrp->ar_blockcount;
621
622 /*
623 * GCC doesn't understand the (arguably complex) control flow in
624 * this function and complains about uninitialized structure fields
625 * without this.
626 */
627 memset(&nrec, 0, sizeof(nrec));
628
629 /*
630 * If we made it to the root level, allocate a new root block 183 len = rrp->ar_blockcount;
631 * and we're done.
632 */
633 if (level >= cur->bc_nlevels) {
634 XFS_STATS_INC(xs_abt_insrec);
635 if ((error = xfs_alloc_newroot(cur, &i)))
636 return error;
637 *bnop = NULLAGBLOCK;
638 *stat = i;
639 return 0;
640 }
641 /*
642 * Make a key out of the record data to be inserted, and save it.
643 */
644 key.ar_startblock = recp->ar_startblock;
645 key.ar_blockcount = recp->ar_blockcount;
646 optr = ptr = cur->bc_ptrs[level];
647 /*
648 * If we're off the left edge, return failure.
649 */
650 if (ptr == 0) {
651 *stat = 0;
652 return 0;
653 }
654 XFS_STATS_INC(xs_abt_insrec);
655 /*
656 * Get pointers to the btree buffer and block.
657 */
658 bp = cur->bc_bufs[level];
659 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
660 numrecs = be16_to_cpu(block->bb_numrecs);
661#ifdef DEBUG
662 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
663 return error;
664 /*
665 * Check that the new entry is being inserted in the right place.
666 */
667 if (ptr <= numrecs) {
668 if (level == 0) {
669 rp = XFS_ALLOC_REC_ADDR(block, ptr, cur);
670 xfs_btree_check_rec(cur->bc_btnum, recp, rp);
184 } else {
185 len = 0;
671 } else {
672 kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur);
673 xfs_btree_check_key(cur->bc_btnum, &key, kp);
674 }
675 }
676#endif
677 nbno = NULLAGBLOCK;
678 ncur = NULL;
679 /*
680 * If the block is full, we can't insert the new entry until we
681 * make the block un-full.
682 */
683 if (numrecs == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
684 /*
685 * First, try shifting an entry to the right neighbor.
686 */
687 if ((error = xfs_alloc_rshift(cur, level, &i)))
688 return error;
689 if (i) {
690 /* nothing */
691 }
692 /*
693 * Next, try shifting an entry to the left neighbor.
694 */
695 else {
696 if ((error = xfs_alloc_lshift(cur, level, &i)))
697 return error;
698 if (i)
699 optr = ptr = cur->bc_ptrs[level];
700 else {
701 /*
702 * Next, try splitting the current block in
703 * half. If this works we have to re-set our
704 * variables because we could be in a
705 * different block now.
706 */
707 if ((error = xfs_alloc_split(cur, level, &nbno,
708 &nkey, &ncur, &i)))
709 return error;
710 if (i) {
711 bp = cur->bc_bufs[level];
712 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
713#ifdef DEBUG
714 if ((error =
715 xfs_btree_check_sblock(cur,
716 block, level, bp)))
717 return error;
718#endif
719 ptr = cur->bc_ptrs[level];
720 nrec.ar_startblock = nkey.ar_startblock;
721 nrec.ar_blockcount = nkey.ar_blockcount;
722 }
723 /*
724 * Otherwise the insert fails.
725 */
726 else {
727 *stat = 0;
728 return 0;
729 }
730 }
731 }
732 }
733 /*
734 * At this point we know there's room for our new entry in the block
735 * we're pointing at.
736 */
737 numrecs = be16_to_cpu(block->bb_numrecs);
738 if (level > 0) {
739 /*
740 * It's a non-leaf entry. Make a hole for the new data
741 * in the key and ptr regions of the block.
742 */
743 kp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
744 pp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
745#ifdef DEBUG
746 for (i = numrecs; i >= ptr; i--) {
747 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i - 1]), level)))
748 return error;
749 }
186 }
750#endif
751 memmove(&kp[ptr], &kp[ptr - 1],
752 (numrecs - ptr + 1) * sizeof(*kp));
753 memmove(&pp[ptr], &pp[ptr - 1],
754 (numrecs - ptr + 1) * sizeof(*pp));
755#ifdef DEBUG
756 if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
757 return error;
758#endif
759 /*
760 * Now stuff the new data in, bump numrecs and log the new data.
761 */
762 kp[ptr - 1] = key;
763 pp[ptr - 1] = cpu_to_be32(*bnop);
764 numrecs++;
765 block->bb_numrecs = cpu_to_be16(numrecs);
766 xfs_alloc_log_keys(cur, bp, ptr, numrecs);
767 xfs_alloc_log_ptrs(cur, bp, ptr, numrecs);
768#ifdef DEBUG
769 if (ptr < numrecs)
770 xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1,
771 kp + ptr);
772#endif
773 } else {
774 /*
775 * It's a leaf entry. Make a hole for the new record.
776 */
777 rp = XFS_ALLOC_REC_ADDR(block, 1, cur);
778 memmove(&rp[ptr], &rp[ptr - 1],
779 (numrecs - ptr + 1) * sizeof(*rp));
780 /*
781 * Now stuff the new record in, bump numrecs
782 * and log the new data.
783 */
784 rp[ptr - 1] = *recp;
785 numrecs++;
786 block->bb_numrecs = cpu_to_be16(numrecs);
787 xfs_alloc_log_recs(cur, bp, ptr, numrecs);
788#ifdef DEBUG
789 if (ptr < numrecs)
790 xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1,
791 rp + ptr);
792#endif
793 }
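/*
 * Minimal userspace sketch of the insertion step above: open a hole at index
 * ptr-1 with memmove, store the new record, and bump the count. The record
 * type is a stand-in, not the on-disk xfs_alloc_rec_t layout, and the array
 * is assumed to have room for one more entry.
 */
#include <string.h>

struct rec { unsigned start, count; };

static void insert_rec(struct rec *recs, int *numrecs, int ptr,
		       struct rec newrec)
{
	/* slide records ptr..numrecs one slot to the right (ptr is 1-based) */
	memmove(&recs[ptr], &recs[ptr - 1],
		(*numrecs - ptr + 1) * sizeof(recs[0]));
	recs[ptr - 1] = newrec;
	(*numrecs)++;
}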
794 /*
795 * Log the new number of records in the btree header.
796 */
797 xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
798 /*
799 * If we inserted at the start of a block, update the parents' keys.
800 */
801 if (optr == 1 && (error = xfs_alloc_updkey(cur, &key, level + 1)))
802 return error;
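/*
 * Sketch of the "longest free extent" bookkeeping done just below: the
 * by-size (cnt) btree keeps its largest record last, so the AGF copy only
 * changes when the last record of the last leaf is touched. Hypothetical
 * helper; the real update also logs the AGF buffer and the per-AG cache.
 */
static void update_longest(unsigned *agf_longest, unsigned last_rec_len,
			   int is_last_leaf, int is_last_rec)
{
	if (is_last_leaf && is_last_rec && last_rec_len != *agf_longest)
		*agf_longest = last_rec_len;	/* mirror the largest extent */
}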
803 /*
804 * Look to see if the longest extent in the allocation group
805 * needs to be updated.
806 */
807
808 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
809 if (level == 0 &&
810 cur->bc_btnum == XFS_BTNUM_CNT &&
811 be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK &&
812 be32_to_cpu(recp->ar_blockcount) > be32_to_cpu(agf->agf_longest)) {
813 /*
814 * If this is a leaf in the by-size btree and there
815 * is no right sibling block and this block is bigger
816 * than the previous longest block, update it.
817 */
818 agf->agf_longest = recp->ar_blockcount;
819 cur->bc_mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_longest
820 = be32_to_cpu(recp->ar_blockcount);
821 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
822 XFS_AGF_LONGEST);
823 }
187
188 break;
189 default:
190 ASSERT(0);
191 return;
192 }
193
194 agf->agf_longest = len;
195 cur->bc_mp->m_perag[seqno].pagf_longest = be32_to_cpu(len);
196 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST);
824 /*
825 * Return the new block number, if any.
826 * If there is one, give back a record value and a cursor too.
827 */
828 *bnop = nbno;
829 if (nbno != NULLAGBLOCK) {
830 *recp = nrec;
831 *curp = ncur;
832 }
833 *stat = 1;
834 return 0;
835}
836
837/*
838 * Log header fields from a btree block.
839 */
840STATIC void
841xfs_alloc_log_block(
842 xfs_trans_t *tp, /* transaction pointer */
843 xfs_buf_t *bp, /* buffer containing btree block */
844 int fields) /* mask of fields: XFS_BB_... */
845{
846 int first; /* first byte offset logged */
847 int last; /* last byte offset logged */
848 static const short offsets[] = { /* table of offsets */
849 offsetof(xfs_alloc_block_t, bb_magic),
850 offsetof(xfs_alloc_block_t, bb_level),
851 offsetof(xfs_alloc_block_t, bb_numrecs),
852 offsetof(xfs_alloc_block_t, bb_leftsib),
853 offsetof(xfs_alloc_block_t, bb_rightsib),
854 sizeof(xfs_alloc_block_t)
855 };
856
857 xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last);
858 xfs_trans_log_buf(tp, bp, first, last);
859}
860
197}
198
199STATIC int
200xfs_allocbt_get_minrecs(
201 struct xfs_btree_cur *cur,
202 int level)
203{
204 return cur->bc_mp->m_alloc_mnr[level != 0];
205}
206
207STATIC int
208xfs_allocbt_get_maxrecs(
209 struct xfs_btree_cur *cur,
210 int level)
211{
212 return cur->bc_mp->m_alloc_mxr[level != 0];
213}
214
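/*
 * Sketch of how per-level record limits can be derived from the block size,
 * in the spirit of the m_alloc_mnr[]/m_alloc_mxr[] lookups above: leaf
 * blocks hold whole records, interior blocks hold key/pointer pairs, and
 * the minimum is half the maximum. All sizes here are illustrative inputs,
 * not the real mount-time computation.
 */
static int alloc_maxrecs(int blocklen, int hdrlen, int leaf,
			 int recsize, int keysize, int ptrsize)
{
	int avail = blocklen - hdrlen;	/* payload bytes in one block */

	return leaf ? avail / recsize : avail / (keysize + ptrsize);
}

static int alloc_minrecs(int maxrecs)
{
	return maxrecs / 2;		/* classic half-full B-tree rule */
}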
861/*
862 * Log keys from a btree block (nonleaf).
863 */
864STATIC void
865xfs_alloc_log_keys(
866 xfs_btree_cur_t *cur, /* btree cursor */
867 xfs_buf_t *bp, /* buffer containing btree block */
868 int kfirst, /* index of first key to log */
869 int klast) /* index of last key to log */
870{
871 xfs_alloc_block_t *block; /* btree block to log from */
872 int first; /* first byte offset logged */
873 xfs_alloc_key_t *kp; /* key pointer in btree block */
874 int last; /* last byte offset logged */
875
876 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
877 kp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
878 first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
879 last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
880 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
881}
882
215STATIC void
216xfs_allocbt_init_key_from_rec(
217 union xfs_btree_key *key,
218 union xfs_btree_rec *rec)
219{
220 ASSERT(rec->alloc.ar_startblock != 0);
221
222 key->alloc.ar_startblock = rec->alloc.ar_startblock;
223 key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
224}
225
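/*
 * Sketch of the byte-range computation the log_keys/log_ptrs helpers above
 * perform: for entries kfirst..klast (1-based) of a fixed-size array inside
 * a block, log the bytes from the start of the first entry through the last
 * byte of the last one. Standalone illustration, not a kernel interface.
 */
#include <stddef.h>

static void log_range(size_t array_off, size_t entsize,
		      int kfirst, int klast, size_t *first, size_t *last)
{
	*first = array_off + (kfirst - 1) * entsize;
	*last  = array_off + klast * entsize - 1;
}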
883/*
884 * Log block pointer fields from a btree block (nonleaf).
885 */
886STATIC void
887xfs_alloc_log_ptrs(
888 xfs_btree_cur_t *cur, /* btree cursor */
889 xfs_buf_t *bp, /* buffer containing btree block */
890 int pfirst, /* index of first pointer to log */
891 int plast) /* index of last pointer to log */
892{
893 xfs_alloc_block_t *block; /* btree block to log from */
894 int first; /* first byte offset logged */
895 int last; /* last byte offset logged */
896 xfs_alloc_ptr_t *pp; /* block-pointer pointer in btree blk */
897
898 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
899 pp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
900 first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
901 last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
902 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
903}
904
226STATIC void
227xfs_allocbt_init_rec_from_key(
228 union xfs_btree_key *key,
229 union xfs_btree_rec *rec)
230{
231 ASSERT(key->alloc.ar_startblock != 0);
232
233 rec->alloc.ar_startblock = key->alloc.ar_startblock;
234 rec->alloc.ar_blockcount = key->alloc.ar_blockcount;
235}
236
905/*
906 * Log records from a btree block (leaf).
907 */
908STATIC void
909xfs_alloc_log_recs(
910 xfs_btree_cur_t *cur, /* btree cursor */
911 xfs_buf_t *bp, /* buffer containing btree block */
912 int rfirst, /* index of first record to log */
913 int rlast) /* index of last record to log */
914{
915 xfs_alloc_block_t *block; /* btree block to log from */
916 int first; /* first byte offset logged */
917 int last; /* last byte offset logged */
918 xfs_alloc_rec_t *rp; /* record pointer for btree block */
919
920
921 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
922 rp = XFS_ALLOC_REC_ADDR(block, 1, cur);
923#ifdef DEBUG
924 {
925 xfs_agf_t *agf;
926 xfs_alloc_rec_t *p;
927
928 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
929 for (p = &rp[rfirst - 1]; p <= &rp[rlast - 1]; p++)
930 ASSERT(be32_to_cpu(p->ar_startblock) +
931 be32_to_cpu(p->ar_blockcount) <=
932 be32_to_cpu(agf->agf_length));
933 }
934#endif
935 first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
936 last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
937 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
938}
939
237STATIC void
238xfs_allocbt_init_rec_from_cur(
239 struct xfs_btree_cur *cur,
240 union xfs_btree_rec *rec)
241{
242 ASSERT(cur->bc_rec.a.ar_startblock != 0);
243
244 rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
245 rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
246}
247
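/*
 * Sketch of the cpu/disk endianness discipline visible above: in-core cursor
 * records are host-endian, on-disk records are big-endian, so every store
 * converts with cpu_to_be32() and every load with be32_to_cpu(). This is a
 * portable stand-in for those helpers, assuming a little-endian host where
 * the conversion is a byte swap (on a big-endian host it would be a no-op).
 */
#include <stdint.h>

static uint32_t cpu_to_be32_sketch(uint32_t x)
{
	return ((x & 0xffU) << 24) | ((x & 0xff00U) << 8) |
	       ((x >> 8) & 0xff00U) | (x >> 24);
}

/* the inverse transform is the same byte swap */
#define be32_to_cpu_sketch(x)	cpu_to_be32_sketch(x)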
248STATIC void
249xfs_allocbt_init_ptr_from_cur(
250 struct xfs_btree_cur *cur,
251 union xfs_btree_ptr *ptr)
252{
253 struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
940/*
941 * Lookup the record. The cursor is made to point to it, based on dir.
942 * Return 0 if we can't find any such record, 1 for success.
943 */
944STATIC int /* error */
945xfs_alloc_lookup(
946 xfs_btree_cur_t *cur, /* btree cursor */
947 xfs_lookup_t dir, /* <=, ==, or >= */
948 int *stat) /* success/failure */
949{
950 xfs_agblock_t agbno; /* a.g. relative btree block number */
951 xfs_agnumber_t agno; /* allocation group number */
952 xfs_alloc_block_t *block=NULL; /* current btree block */
953 int diff; /* difference for the current key */
954 int error; /* error return value */
955 int keyno=0; /* current key number */
956 int level; /* level in the btree */
957 xfs_mount_t *mp; /* file system mount point */
958
959 XFS_STATS_INC(xs_abt_lookup);
960 /*
961 * Get the allocation group header, and the root block number.
962 */
963 mp = cur->bc_mp;
964
965 {
966 xfs_agf_t *agf; /* a.g. freespace header */
967
968 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
969 agno = be32_to_cpu(agf->agf_seqno);
970 agbno = be32_to_cpu(agf->agf_roots[cur->bc_btnum]);
971 }
972 /*
973 * Iterate over each level in the btree, starting at the root.
974 * For each level above the leaves, find the key we need, based
975 * on the lookup record, then follow the corresponding block
976 * pointer down to the next level.
977 */
978 for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
979 xfs_buf_t *bp; /* buffer pointer for btree block */
980 xfs_daddr_t d; /* disk address of btree block */
981
982 /*
983 * Get the disk address we're looking for.
984 */
985 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
986 /*
987 * If the old buffer at this level is for a different block,
988 * throw it away, otherwise just use it.
989 */
990 bp = cur->bc_bufs[level];
991 if (bp && XFS_BUF_ADDR(bp) != d)
992 bp = NULL;
993 if (!bp) {
994 /*
995 * Need to get a new buffer. Read it, then
996 * set it in the cursor, releasing the old one.
997 */
998 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, agno,
999 agbno, 0, &bp, XFS_ALLOC_BTREE_REF)))
1000 return error;
1001 xfs_btree_setbuf(cur, level, bp);
1002 /*
1003 * Point to the btree block, now that we have the buffer
1004 */
1005 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1006 if ((error = xfs_btree_check_sblock(cur, block, level,
1007 bp)))
1008 return error;
1009 } else
1010 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1011 /*
1012 * If we already had a key match at a higher level, we know
1013 * we need to use the first entry in this block.
1014 */
1015 if (diff == 0)
1016 keyno = 1;
1017 /*
1018 * Otherwise we need to search this block. Do a binary search.
1019 */
1020 else {
1021 int high; /* high entry number */
1022 xfs_alloc_key_t *kkbase=NULL;/* base of keys in block */
1023 xfs_alloc_rec_t *krbase=NULL;/* base of records in block */
1024 int low; /* low entry number */
1025
1026 /*
1027 * Get a pointer to keys or records.
1028 */
1029 if (level > 0)
1030 kkbase = XFS_ALLOC_KEY_ADDR(block, 1, cur);
1031 else
1032 krbase = XFS_ALLOC_REC_ADDR(block, 1, cur);
1033 /*
1034 * Set low and high entry numbers, 1-based.
1035 */
1036 low = 1;
1037 if (!(high = be16_to_cpu(block->bb_numrecs))) {
1038 /*
1039 * If the block is empty, the tree must
1040 * be an empty leaf.
1041 */
1042 ASSERT(level == 0 && cur->bc_nlevels == 1);
1043 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1044 *stat = 0;
1045 return 0;
1046 }
1047 /*
1048 * Binary search the block.
1049 */
1050 while (low <= high) {
1051 xfs_extlen_t blockcount; /* key value */
1052 xfs_agblock_t startblock; /* key value */
1053
1054 XFS_STATS_INC(xs_abt_compare);
1055 /*
1056 * keyno is average of low and high.
1057 */
1058 keyno = (low + high) >> 1;
1059 /*
1060 * Get startblock & blockcount.
1061 */
1062 if (level > 0) {
1063 xfs_alloc_key_t *kkp;
1064
1065 kkp = kkbase + keyno - 1;
1066 startblock = be32_to_cpu(kkp->ar_startblock);
1067 blockcount = be32_to_cpu(kkp->ar_blockcount);
1068 } else {
1069 xfs_alloc_rec_t *krp;
1070
1071 krp = krbase + keyno - 1;
1072 startblock = be32_to_cpu(krp->ar_startblock);
1073 blockcount = be32_to_cpu(krp->ar_blockcount);
1074 }
254
255 ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
256 ASSERT(agf->agf_roots[cur->bc_btnum] != 0);
1075 /*
1076 * Compute difference to get next direction.
1077 */
1078 if (cur->bc_btnum == XFS_BTNUM_BNO)
1079 diff = (int)startblock -
1080 (int)cur->bc_rec.a.ar_startblock;
1081 else if (!(diff = (int)blockcount -
1082 (int)cur->bc_rec.a.ar_blockcount))
1083 diff = (int)startblock -
1084 (int)cur->bc_rec.a.ar_startblock;
1085 /*
1086 * Less than, move right.
1087 */
1088 if (diff < 0)
1089 low = keyno + 1;
1090 /*
1091 * Greater than, move left.
1092 */
1093 else if (diff > 0)
1094 high = keyno - 1;
1095 /*
1096 * Equal, we're done.
1097 */
1098 else
1099 break;
1100 }
1101 }
1102 /*
1103 * If there are more levels, set up for the next level
1104 * by getting the block number and filling in the cursor.
1105 */
1106 if (level > 0) {
1107 /*
1108 * If we moved left, need the previous key number,
1109 * unless there isn't one.
1110 */
1111 if (diff > 0 && --keyno < 1)
1112 keyno = 1;
1113 agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, keyno, cur));
1114#ifdef DEBUG
1115 if ((error = xfs_btree_check_sptr(cur, agbno, level)))
1116 return error;
1117#endif
1118 cur->bc_ptrs[level] = keyno;
1119 }
1120 }
1121 /*
1122 * Done with the search.
1123 * See if we need to adjust the results.
1124 */
1125 if (dir != XFS_LOOKUP_LE && diff < 0) {
1126 keyno++;
1127 /*
1128 * If ge search and we went off the end of the block, but it's
1129 * not the last block, we're in the wrong block.
1130 */
1131 if (dir == XFS_LOOKUP_GE &&
1132 keyno > be16_to_cpu(block->bb_numrecs) &&
1133 be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) {
1134 int i;
257
258 ptr->s = agf->agf_roots[cur->bc_btnum];
1135
1136 cur->bc_ptrs[0] = keyno;
1137 if ((error = xfs_alloc_increment(cur, 0, &i)))
1138 return error;
1139 XFS_WANT_CORRUPTED_RETURN(i == 1);
1140 *stat = 1;
1141 return 0;
1142 }
1143 }
1144 else if (dir == XFS_LOOKUP_LE && diff > 0)
1145 keyno--;
1146 cur->bc_ptrs[0] = keyno;
1147 /*
1148 * Return if we succeeded or not.
1149 */
1150 if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs))
1151 *stat = 0;
1152 else
1153 *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
1154 return 0;
1155}
259}
1156
260
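/*
 * Sketch of the two-level key comparison both trees rely on: the bno tree
 * orders by startblock alone, the cnt tree orders by (blockcount,
 * startblock). Returning a signed difference, as the key_diff callback
 * below does, lets one routine drive both the binary search and the lookup
 * direction. Hypothetical standalone helper.
 */
#include <stdint.h>

static int64_t key_diff_sketch(int by_size,
			       uint32_t k_start, uint32_t k_count,
			       uint32_t r_start, uint32_t r_count)
{
	int64_t diff;

	if (!by_size)
		return (int64_t)k_start - r_start;	/* bno tree */
	diff = (int64_t)k_count - r_count;		/* cnt tree: size first */
	return diff ? diff : (int64_t)k_start - r_start;
}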
261STATIC __int64_t
262xfs_allocbt_key_diff(
263 struct xfs_btree_cur *cur,
264 union xfs_btree_key *key)
265{
266 xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a;
267 xfs_alloc_key_t *kp = &key->alloc;
268 __int64_t diff;
1157/*
1158 * Move 1 record left from cur/level if possible.
1159 * Update cur to reflect the new path.
1160 */
1161STATIC int /* error */
1162xfs_alloc_lshift(
1163 xfs_btree_cur_t *cur, /* btree cursor */
1164 int level, /* level to shift record on */
1165 int *stat) /* success/failure */
1166{
1167 int error; /* error return value */
1168#ifdef DEBUG
1169 int i; /* loop index */
1170#endif
1171 xfs_alloc_key_t key; /* key value for leaf level upward */
1172 xfs_buf_t *lbp; /* buffer for left neighbor block */
1173 xfs_alloc_block_t *left; /* left neighbor btree block */
1174 int nrec; /* new number of left block entries */
1175 xfs_buf_t *rbp; /* buffer for right (current) block */
1176 xfs_alloc_block_t *right; /* right (current) btree block */
1177 xfs_alloc_key_t *rkp=NULL; /* key pointer for right block */
1178 xfs_alloc_ptr_t *rpp=NULL; /* address pointer for right block */
1179 xfs_alloc_rec_t *rrp=NULL; /* record pointer for right block */
1180 xfs_alloc_rec_t *rrp=NULL; /* record pointer for right block */
269
270 if (cur->bc_btnum == XFS_BTNUM_BNO) {
271 return (__int64_t)be32_to_cpu(kp->ar_startblock) -
272 rec->ar_startblock;
1181 /*
1182 * Set up variables for this block as "right".
1183 */
1184 rbp = cur->bc_bufs[level];
1185 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
1186#ifdef DEBUG
1187 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
1188 return error;
1189#endif
1190 /*
1191 * If we've got no left sibling then we can't shift an entry left.
1192 */
1193 if (be32_to_cpu(right->bb_leftsib) == NULLAGBLOCK) {
1194 *stat = 0;
1195 return 0;
1196 }
1197 /*
1198 * If the cursor entry is the one that would be moved, don't
1199 * do it... it's too complicated.
1200 */
1201 if (cur->bc_ptrs[level] <= 1) {
1202 *stat = 0;
1203 return 0;
1204 }
1205 /*
1206 * Set up the left neighbor as "left".
1207 */
1208 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1209 cur->bc_private.a.agno, be32_to_cpu(right->bb_leftsib),
1210 0, &lbp, XFS_ALLOC_BTREE_REF)))
1211 return error;
1212 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
1213 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1214 return error;
1215 /*
1216 * If it's full, it can't take another entry.
1217 */
1218 if (be16_to_cpu(left->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
1219 *stat = 0;
1220 return 0;
1221 }
273 }
1222 nrec = be16_to_cpu(left->bb_numrecs) + 1;
1223 /*
1224 * If non-leaf, copy a key and a ptr to the left block.
1225 */
1226 if (level > 0) {
1227 xfs_alloc_key_t *lkp; /* key pointer for left block */
1228 xfs_alloc_ptr_t *lpp; /* address pointer for left block */
274
275 diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
276 if (diff)
277 return diff;
1229
1230 lkp = XFS_ALLOC_KEY_ADDR(left, nrec, cur);
1231 rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
1232 *lkp = *rkp;
1233 xfs_alloc_log_keys(cur, lbp, nrec, nrec);
1234 lpp = XFS_ALLOC_PTR_ADDR(left, nrec, cur);
1235 rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
1236#ifdef DEBUG
1237 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level)))
1238 return error;
1239#endif
1240 *lpp = *rpp;
1241 xfs_alloc_log_ptrs(cur, lbp, nrec, nrec);
1242 xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp);
1243 }
1244 /*
1245 * If leaf, copy a record to the left block.
1246 */
1247 else {
1248 xfs_alloc_rec_t *lrp; /* record pointer for left block */
278
279 return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
1249
1250 lrp = XFS_ALLOC_REC_ADDR(left, nrec, cur);
1251 rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
1252 *lrp = *rrp;
1253 xfs_alloc_log_recs(cur, lbp, nrec, nrec);
1254 xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp);
1255 }
1256 /*
1257 * Bump and log left's numrecs, decrement and log right's numrecs.
1258 */
1259 be16_add_cpu(&left->bb_numrecs, 1);
1260 xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
1261 be16_add_cpu(&right->bb_numrecs, -1);
1262 xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
1263 /*
1264 * Slide the contents of right down one entry.
1265 */
1266 if (level > 0) {
1267#ifdef DEBUG
1268 for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
1269 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i + 1]),
1270 level)))
1271 return error;
1272 }
1273#endif
1274 memmove(rkp, rkp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
1275 memmove(rpp, rpp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1276 xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1277 xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1278 } else {
1279 memmove(rrp, rrp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1280 xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1281 key.ar_startblock = rrp->ar_startblock;
1282 key.ar_blockcount = rrp->ar_blockcount;
1283 rkp = &key;
1284 }
1285 /*
1286 * Update the parent key values of right.
1287 */
1288 if ((error = xfs_alloc_updkey(cur, rkp, level + 1)))
1289 return error;
1290 /*
1291 * Slide the cursor value left one.
1292 */
1293 cur->bc_ptrs[level]--;
1294 *stat = 1;
1295 return 0;
1296}
280}
1297
281
282STATIC int
283xfs_allocbt_kill_root(
284 struct xfs_btree_cur *cur,
285 struct xfs_buf *bp,
286 int level,
287 union xfs_btree_ptr *newroot)
288{
289 int error;
1298/*
1299 * Allocate a new root block, fill it in.
1300 */
1301STATIC int /* error */
1302xfs_alloc_newroot(
1303 xfs_btree_cur_t *cur, /* btree cursor */
1304 int *stat) /* success/failure */
1305{
1306 int error; /* error return value */
1307 xfs_agblock_t lbno; /* left block number */
1308 xfs_buf_t *lbp; /* left btree buffer */
1309 xfs_alloc_block_t *left; /* left btree block */
1310 xfs_mount_t *mp; /* mount structure */
1311 xfs_agblock_t nbno; /* new block number */
1312 xfs_buf_t *nbp; /* new (root) buffer */
1313 xfs_alloc_block_t *new; /* new (root) btree block */
1314 int nptr; /* new value for key index, 1 or 2 */
1315 xfs_agblock_t rbno; /* right block number */
1316 xfs_buf_t *rbp; /* right btree buffer */
1317 xfs_alloc_block_t *right; /* right btree block */
1318
1319 mp = cur->bc_mp;
1320
1321 ASSERT(cur->bc_nlevels < XFS_AG_MAXLEVELS(mp));
290
291 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
292 XFS_BTREE_STATS_INC(cur, killroot);
1322 /*
1323 * Get a buffer from the freelist blocks, for the new root.
1324 */
1325 error = xfs_alloc_get_freelist(cur->bc_tp,
1326 cur->bc_private.a.agbp, &nbno, 1);
1327 if (error)
1328 return error;
1329 /*
1330 * None available, we fail.
1331 */
1332 if (nbno == NULLAGBLOCK) {
1333 *stat = 0;
1334 return 0;
1335 }
1336 xfs_trans_agbtree_delta(cur->bc_tp, 1);
1337 nbp = xfs_btree_get_bufs(mp, cur->bc_tp, cur->bc_private.a.agno, nbno,
1338 0);
1339 new = XFS_BUF_TO_ALLOC_BLOCK(nbp);
1340 /*
1341 * Set the root data in the a.g. freespace structure.
1342 */
1343 {
1344 xfs_agf_t *agf; /* a.g. freespace header */
1345 xfs_agnumber_t seqno;
1346
293
1347 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
1348 agf->agf_roots[cur->bc_btnum] = cpu_to_be32(nbno);
1349 be32_add_cpu(&agf->agf_levels[cur->bc_btnum], 1);
1350 seqno = be32_to_cpu(agf->agf_seqno);
1351 mp->m_perag[seqno].pagf_levels[cur->bc_btnum]++;
1352 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
1353 XFS_AGF_ROOTS | XFS_AGF_LEVELS);
1354 }
1355 /*
1356 * At the previous root level there are now two blocks: the old
1357 * root, and the new block generated when it was split.
1358 * We don't know which one the cursor is pointing at, so we
1359 * set up variables "left" and "right" for each case.
1360 */
1361 lbp = cur->bc_bufs[cur->bc_nlevels - 1];
1362 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
1363#ifdef DEBUG
1364 if ((error = xfs_btree_check_sblock(cur, left, cur->bc_nlevels - 1, lbp)))
1365 return error;
1366#endif
294 /*
295 * Update the root pointer, decreasing the level by 1, and then
296 * free the old root.
297 */
298 xfs_allocbt_set_root(cur, newroot, -1);
299 error = xfs_allocbt_free_block(cur, bp);
300 if (error) {
301 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
302 return error;
1367 if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) {
1368 /*
1369 * Our block is left, pick up the right block.
1370 */
1371 lbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(lbp));
1372 rbno = be32_to_cpu(left->bb_rightsib);
1373 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
1374 cur->bc_private.a.agno, rbno, 0, &rbp,
1375 XFS_ALLOC_BTREE_REF)))
1376 return error;
1377 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
1378 if ((error = xfs_btree_check_sblock(cur, right,
1379 cur->bc_nlevels - 1, rbp)))
1380 return error;
1381 nptr = 1;
1382 } else {
1383 /*
1384 * Our block is right, pick up the left block.
1385 */
1386 rbp = lbp;
1387 right = left;
1388 rbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(rbp));
1389 lbno = be32_to_cpu(right->bb_leftsib);
1390 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
1391 cur->bc_private.a.agno, lbno, 0, &lbp,
1392 XFS_ALLOC_BTREE_REF)))
1393 return error;
1394 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
1395 if ((error = xfs_btree_check_sblock(cur, left,
1396 cur->bc_nlevels - 1, lbp)))
1397 return error;
1398 nptr = 2;
1399 }
303 }
1400 /*
1401 * Fill in the new block's btree header and log it.
1402 */
1403 new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
1404 new->bb_level = cpu_to_be16(cur->bc_nlevels);
1405 new->bb_numrecs = cpu_to_be16(2);
1406 new->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1407 new->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1408 xfs_alloc_log_block(cur->bc_tp, nbp, XFS_BB_ALL_BITS);
1409 ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK);
1410 /*
1411 * Fill in the key data in the new root.
1412 */
1413 {
1414 xfs_alloc_key_t *kp; /* btree key pointer */
1415 304
305 XFS_BTREE_STATS_INC(cur, free);
306
307 xfs_btree_setbuf(cur, level, NULL);
308 cur->bc_nlevels--;
309
310 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1416 kp = XFS_ALLOC_KEY_ADDR(new, 1, cur);
1417 if (be16_to_cpu(left->bb_level) > 0) {
1418 kp[0] = *XFS_ALLOC_KEY_ADDR(left, 1, cur);
1419 kp[1] = *XFS_ALLOC_KEY_ADDR(right, 1, cur);
1420 } else {
1421 xfs_alloc_rec_t *rp; /* btree record pointer */
1422
1423 rp = XFS_ALLOC_REC_ADDR(left, 1, cur);
1424 kp[0].ar_startblock = rp->ar_startblock; 308 cur->bc_nlevels--;
1425 kp[0].ar_blockcount = rp->ar_blockcount;
1426 rp = XFS_ALLOC_REC_ADDR(right, 1, cur);
1427 kp[1].ar_startblock = rp->ar_startblock;
1428 kp[1].ar_blockcount = rp->ar_blockcount;
1429 }
1430 }
1431 xfs_alloc_log_keys(cur, nbp, 1, 2);
1432 /*
1433 * Fill in the pointer data in the new root.
1434 */
1435 {
1436 xfs_alloc_ptr_t *pp; /* btree address pointer */
1437 309
1438 pp = XFS_ALLOC_PTR_ADDR(new, 1, cur); 310 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1439 pp[0] = cpu_to_be32(lbno);
1440 pp[1] = cpu_to_be32(rbno);
1441 }
1442 xfs_alloc_log_ptrs(cur, nbp, 1, 2);
1443 /*
1444 * Fix up the cursor.
1445 */
1446 xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
1447 cur->bc_ptrs[cur->bc_nlevels] = nptr;
1448 cur->bc_nlevels++;
1449 *stat = 1;
1450 return 0;
1451}
1452
311 return 0;
312}
313
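/*
 * Sketch of what newroot does above: after the old root splits into "left"
 * and "right", a fresh one-level-higher root is written with exactly two
 * entries, the first key/pointer of each child. Plain C stand-ins; the real
 * code also fixes sibling links, logs the buffers, and updates the AGF.
 */
struct skey { unsigned start, count; };
struct snode { int nrecs; struct skey keys[2]; unsigned ptrs[2]; };

static void fill_new_root(struct snode *root, struct skey lkey, unsigned lbno,
			  struct skey rkey, unsigned rbno)
{
	root->nrecs = 2;
	root->keys[0] = lkey;	root->ptrs[0] = lbno;	/* left child */
	root->keys[1] = rkey;	root->ptrs[1] = rbno;	/* right child */
}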
1453/*
1454 * Move 1 record right from cur/level if possible.
1455 * Update cur to reflect the new path.
1456 */
1457STATIC int /* error */
1458xfs_alloc_rshift(
1459 xfs_btree_cur_t *cur, /* btree cursor */
1460 int level, /* level to shift record on */
1461 int *stat) /* success/failure */
1462{
1463 int error; /* error return value */
1464 int i; /* loop index */
1465 xfs_alloc_key_t key; /* key value for leaf level upward */
1466 xfs_buf_t *lbp; /* buffer for left (current) block */
1467 xfs_alloc_block_t *left; /* left (current) btree block */
1468 xfs_buf_t *rbp; /* buffer for right neighbor block */
1469 xfs_alloc_block_t *right; /* right neighbor btree block */
1470 xfs_alloc_key_t *rkp; /* key pointer for right block */
1471 xfs_btree_cur_t *tcur; /* temporary cursor */
1472
1473 /*
1474 * Set up variables for this block as "left".
1475 */
1476 lbp = cur->bc_bufs[level];
1477 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
1478#ifdef DEBUG
1479 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1480 return error;
1481#endif
1482 /*
1483 * If we've got no right sibling then we can't shift an entry right.
1484 */
1485 if (be32_to_cpu(left->bb_rightsib) == NULLAGBLOCK) {
1486 *stat = 0;
1487 return 0;
1488 }
1489 /*
1490 * If the cursor entry is the one that would be moved, don't
1491 * do it... it's too complicated.
1492 */
1493 if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) {
1494 *stat = 0;
1495 return 0;
1496 }
1497 /*
1498 * Set up the right neighbor as "right".
1499 */
1500 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1501 cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib),
1502 0, &rbp, XFS_ALLOC_BTREE_REF)))
1503 return error;
1504 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
1505 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
1506 return error;
1507 /*
1508 * If it's full, it can't take another entry.
1509 */
1510 if (be16_to_cpu(right->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
1511 *stat = 0;
1512 return 0;
1513 }
1514 /*
1515 * Make a hole at the start of the right neighbor block, then
1516 * copy the last left block entry to the hole.
1517 */
1518 if (level > 0) {
1519 xfs_alloc_key_t *lkp; /* key pointer for left block */
1520 xfs_alloc_ptr_t *lpp; /* address pointer for left block */
1521 xfs_alloc_ptr_t *rpp; /* address pointer for right block */
1522
1523 lkp = XFS_ALLOC_KEY_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
1524 lpp = XFS_ALLOC_PTR_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
1525 rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
1526 rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
1527#ifdef DEBUG
1528 for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) {
1529 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
1530 return error;
1531 }
1532#endif
1533 memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
1534 memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1535#ifdef DEBUG
1536 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level)))
1537 return error;
1538#endif
1539 *rkp = *lkp;
1540 *rpp = *lpp;
1541 xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1542 xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1543 xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1);
1544 } else {
1545 xfs_alloc_rec_t *lrp; /* record pointer for left block */
1546 xfs_alloc_rec_t *rrp; /* record pointer for right block */
1547
1548 lrp = XFS_ALLOC_REC_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
1549 rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
1550 memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1551 *rrp = *lrp;
1552 xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1553 key.ar_startblock = rrp->ar_startblock;
1554 key.ar_blockcount = rrp->ar_blockcount;
1555 rkp = &key;
1556 xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1);
1557 }
314#ifdef DEBUG
315STATIC int
316xfs_allocbt_keys_inorder(
317 struct xfs_btree_cur *cur,
318 union xfs_btree_key *k1,
319 union xfs_btree_key *k2)
320{
321 if (cur->bc_btnum == XFS_BTNUM_BNO) {
322 return be32_to_cpu(k1->alloc.ar_startblock) <
323 be32_to_cpu(k2->alloc.ar_startblock);
324 } else {
325 return be32_to_cpu(k1->alloc.ar_blockcount) <
326 be32_to_cpu(k2->alloc.ar_blockcount) ||
327 (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount &&
328 be32_to_cpu(k1->alloc.ar_startblock) <
329 be32_to_cpu(k2->alloc.ar_startblock));
330 }
1558 /*
1559 * Decrement and log left's numrecs, bump and log right's numrecs.
1560 */
1561 be16_add_cpu(&left->bb_numrecs, -1);
1562 xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
1563 be16_add_cpu(&right->bb_numrecs, 1);
1564 xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
1565 /*
1566 * Using a temporary cursor, update the parent key values of the
1567 * block on the right.
1568 */
1569 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
1570 return error;
1571 i = xfs_btree_lastrec(tcur, level);
1572 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1573 if ((error = xfs_alloc_increment(tcur, level, &i)) ||
1574 (error = xfs_alloc_updkey(tcur, rkp, level + 1)))
1575 goto error0;
1576 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
1577 *stat = 1;
1578 return 0;
1579error0:
1580 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
1581 return error;
1582}
331}
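/*
 * Sketch of a one-record right shift between siblings as done above: make
 * room at the front of the right block, copy the donor's last entry across,
 * and adjust both counts. Illustrative arrays only, with the right array
 * assumed to have spare capacity; the real code also logs both buffers and
 * updates the parent key.
 */
#include <string.h>

struct rec2 { unsigned start, count; };

static void rshift_one(struct rec2 *left, int *lrecs,
		       struct rec2 *right, int *rrecs)
{
	memmove(&right[1], &right[0], *rrecs * sizeof(right[0]));
	right[0] = left[*lrecs - 1];	/* donor's last record */
	(*lrecs)--;
	(*rrecs)++;
}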
1583
332
333STATIC int
334xfs_allocbt_recs_inorder(
335 struct xfs_btree_cur *cur,
336 union xfs_btree_rec *r1,
337 union xfs_btree_rec *r2)
338{
339 if (cur->bc_btnum == XFS_BTNUM_BNO) {
340 return be32_to_cpu(r1->alloc.ar_startblock) +
341 be32_to_cpu(r1->alloc.ar_blockcount) <=
342 be32_to_cpu(r2->alloc.ar_startblock);
343 } else {
344 return be32_to_cpu(r1->alloc.ar_blockcount) <
345 be32_to_cpu(r2->alloc.ar_blockcount) ||
346 (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount &&
347 be32_to_cpu(r1->alloc.ar_startblock) <
348 be32_to_cpu(r2->alloc.ar_startblock));
1584/*
1585 * Split cur/level block in half.
1586 * Return new block number and its first record (to be inserted into parent).
1587 */
1588STATIC int /* error */
1589xfs_alloc_split(
1590 xfs_btree_cur_t *cur, /* btree cursor */
1591 int level, /* level to split */
1592 xfs_agblock_t *bnop, /* output: block number allocated */
1593 xfs_alloc_key_t *keyp, /* output: first key of new block */
1594 xfs_btree_cur_t **curp, /* output: new cursor */
1595 int *stat) /* success/failure */
1596{
1597 int error; /* error return value */
1598 int i; /* loop index/record number */
1599 xfs_agblock_t lbno; /* left (current) block number */
1600 xfs_buf_t *lbp; /* buffer for left block */
1601 xfs_alloc_block_t *left; /* left (current) btree block */
1602 xfs_agblock_t rbno; /* right (new) block number */
1603 xfs_buf_t *rbp; /* buffer for right block */
1604 xfs_alloc_block_t *right; /* right (new) btree block */
1605
1606 /*
1607 * Allocate the new block from the freelist.
1608 * If we can't do it, we're toast. Give up.
1609 */
1610 error = xfs_alloc_get_freelist(cur->bc_tp,
1611 cur->bc_private.a.agbp, &rbno, 1);
1612 if (error)
1613 return error;
1614 if (rbno == NULLAGBLOCK) {
1615 *stat = 0;
1616 return 0;
1617 }
1618 xfs_trans_agbtree_delta(cur->bc_tp, 1);
1619 rbp = xfs_btree_get_bufs(cur->bc_mp, cur->bc_tp, cur->bc_private.a.agno,
1620 rbno, 0);
1621 /*
1622 * Set up the new block as "right".
1623 */
1624 right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
1625 /*
1626 * "Left" is the current (according to the cursor) block.
1627 */
1628 lbp = cur->bc_bufs[level];
1629 left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
1630#ifdef DEBUG
1631 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1632 return error;
1633#endif
1634 /*
1635 * Fill in the btree header for the new block.
1636 */
1637 right->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
1638 right->bb_level = left->bb_level;
1639 right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2);
1640 /*
1641 * Make sure that if there's an odd number of entries now, that
1642 * each new block will have the same number of entries.
1643 */
1644 if ((be16_to_cpu(left->bb_numrecs) & 1) &&
1645 cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1)
1646 be16_add_cpu(&right->bb_numrecs, 1);
1647 i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1;
1648 /*
1649 * For non-leaf blocks, copy keys and addresses over to the new block.
1650 */
1651 if (level > 0) {
1652 xfs_alloc_key_t *lkp; /* left btree key pointer */
1653 xfs_alloc_ptr_t *lpp; /* left btree address pointer */
1654 xfs_alloc_key_t *rkp; /* right btree key pointer */
1655 xfs_alloc_ptr_t *rpp; /* right btree address pointer */
1656
1657 lkp = XFS_ALLOC_KEY_ADDR(left, i, cur);
1658 lpp = XFS_ALLOC_PTR_ADDR(left, i, cur);
1659 rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
1660 rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
1661#ifdef DEBUG
1662 for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
1663 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level)))
1664 return error;
1665 }
1666#endif
1667 memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
1668 memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1669 xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1670 xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1671 *keyp = *rkp;
1672 }
349 }
350}
351#endif /* DEBUG */
1673 /*
1674 * For leaf blocks, copy records over to the new block.
1675 */
1676 else {
1677 xfs_alloc_rec_t *lrp; /* left btree record pointer */
1678 xfs_alloc_rec_t *rrp; /* right btree record pointer */
1679
352
353#ifdef XFS_BTREE_TRACE
354ktrace_t *xfs_allocbt_trace_buf;
1680 lrp = XFS_ALLOC_REC_ADDR(left, i, cur);
1681 rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
1682 memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1683 xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1684 keyp->ar_startblock = rrp->ar_startblock;
1685 keyp->ar_blockcount = rrp->ar_blockcount;
1686 }
1687 /*
1688 * Find the left block number by looking in the buffer.
1689 * Adjust numrecs, sibling pointers.
1690 */
1691 lbno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(lbp));
1692 be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
1693 right->bb_rightsib = left->bb_rightsib;
1694 left->bb_rightsib = cpu_to_be32(rbno);
1695 right->bb_leftsib = cpu_to_be32(lbno);
1696 xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_ALL_BITS);
1697 xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
1698 /*
1699 * If there's a block to the new block's right, make that block
1700 * point back to right instead of to left.
1701 */
1702 if (be32_to_cpu(right->bb_rightsib) != NULLAGBLOCK) {
1703 xfs_alloc_block_t *rrblock; /* rr btree block */
1704 xfs_buf_t *rrbp; /* buffer for rrblock */
1705
355
1706 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1707 cur->bc_private.a.agno, be32_to_cpu(right->bb_rightsib), 0,
1708 &rrbp, XFS_ALLOC_BTREE_REF)))
1709 return error;
1710 rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp);
1711 if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
1712 return error;
1713 rrblock->bb_leftsib = cpu_to_be32(rbno);
1714 xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
1715 }
1716 /*
1717 * If the cursor is really in the right block, move it there.
1718 * If it's just pointing past the last entry in left, then we'll
1719 * insert there, so don't change anything in that case.
1720 */
1721 if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) {
1722 xfs_btree_setbuf(cur, level, rbp);
1723 cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs);
1724 }
1725 /*
1726 * If there are more levels, we'll need another cursor which refers to
1727 * the right block, no matter where this cursor was.
1728 */
1729 if (level + 1 < cur->bc_nlevels) {
1730 if ((error = xfs_btree_dup_cursor(cur, curp)))
1731 return error;
1732 (*curp)->bc_ptrs[level + 1]++;
1733 }
1734 *bnop = rbno;
1735 *stat = 1;
1736 return 0;
1737}
356STATIC void
357xfs_allocbt_trace_enter(
358 struct xfs_btree_cur *cur,
359 const char *func,
360 char *s,
361 int type,
362 int line,
363 __psunsigned_t a0,
364 __psunsigned_t a1,
365 __psunsigned_t a2,
366 __psunsigned_t a3,
367 __psunsigned_t a4,
368 __psunsigned_t a5,
369 __psunsigned_t a6,
370 __psunsigned_t a7,
371 __psunsigned_t a8,
372 __psunsigned_t a9,
373 __psunsigned_t a10)
374{
375 ktrace_enter(xfs_allocbt_trace_buf, (void *)(__psint_t)type,
376 (void *)func, (void *)s, NULL, (void *)cur,
377 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
378 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
379 (void *)a8, (void *)a9, (void *)a10);
380}
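/*
 * Sketch of the split arithmetic used above: the new right block takes half
 * the records, rounded so that an odd count still leaves both halves equal
 * after the pending insert lands. Hypothetical helper, not the kernel
 * function; insert_ptr is the 1-based position the caller intends to use.
 */
static int split_count(int numrecs, int insert_ptr)
{
	int rrecs = numrecs / 2;

	/* with an odd count, give the extra record to the insert side */
	if ((numrecs & 1) && insert_ptr <= rrecs + 1)
		rrecs++;
	return rrecs;
}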
1738
1739/*
1740 * Update keys at all levels from here to the root along the cursor's path.
1741 */
1742STATIC int /* error */
1743xfs_alloc_updkey(
1744 xfs_btree_cur_t *cur, /* btree cursor */
1745 xfs_alloc_key_t *keyp, /* new key value to update to */
1746 int level) /* starting level for update */
1747{
1748 int ptr; /* index of key in block */
1749
381
382STATIC void
383xfs_allocbt_trace_cursor(
384 struct xfs_btree_cur *cur,
385 __uint32_t *s0,
386 __uint64_t *l0,
387 __uint64_t *l1)
388{
389 *s0 = cur->bc_private.a.agno;
390 *l0 = cur->bc_rec.a.ar_startblock;
391 *l1 = cur->bc_rec.a.ar_blockcount;
1750 /*
1751 * Go up the tree from this level toward the root.
1752 * At each level, update the key value to the value input.
1753 * Stop when we reach a level where the cursor isn't pointing
1754 * at the first entry in the block.
1755 */
1756 for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1757 xfs_alloc_block_t *block; /* btree block */
1758 xfs_buf_t *bp; /* buffer for block */
1759#ifdef DEBUG
1760 int error; /* error return value */
1761#endif
1762 xfs_alloc_key_t *kp; /* ptr to btree block keys */
1763
1764 bp = cur->bc_bufs[level];
1765 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1766#ifdef DEBUG
1767 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
1768 return error;
1769#endif
1770 ptr = cur->bc_ptrs[level];
1771 kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur);
1772 *kp = *keyp;
1773 xfs_alloc_log_keys(cur, bp, ptr, ptr);
1774 }
1775 return 0;
1776}
392}
1777
393
394STATIC void
395xfs_allocbt_trace_key(
396 struct xfs_btree_cur *cur,
397 union xfs_btree_key *key,
398 __uint64_t *l0,
399 __uint64_t *l1)
400{
401 *l0 = be32_to_cpu(key->alloc.ar_startblock);
402 *l1 = be32_to_cpu(key->alloc.ar_blockcount);
1778/*
1779 * Externally visible routines.
1780 */
1781
1782/*
1783 * Decrement cursor by one record at the level.
1784 * For nonzero levels the leaf-ward information is untouched.
1785 */
1786int /* error */
1787xfs_alloc_decrement(
1788 xfs_btree_cur_t *cur, /* btree cursor */
1789 int level, /* level in btree, 0 is leaf */
1790 int *stat) /* success/failure */
1791{
1792 xfs_alloc_block_t *block; /* btree block */
1793 int error; /* error return value */
1794 int lev; /* btree level */
1795
1796 ASSERT(level < cur->bc_nlevels);
1797 /*
1798 * Read-ahead to the left at this level.
1799 */
1800 xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1801 /*
1802 * Decrement the ptr at this level. If we're still in the block
1803 * then we're done.
1804 */
1805 if (--cur->bc_ptrs[level] > 0) {
1806 *stat = 1;
1807 return 0;
1808 }
1809 /*
1810 * Get a pointer to the btree block.
1811 */
1812 block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[level]);
1813#ifdef DEBUG
1814 if ((error = xfs_btree_check_sblock(cur, block, level,
1815 cur->bc_bufs[level])))
1816 return error;
1817#endif
1818 /*
1819 * If we just went off the left edge of the tree, return failure.
1820 */
1821 if (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK) {
1822 *stat = 0;
1823 return 0;
1824 }
1825 /*
1826 * March up the tree decrementing pointers.
1827 * Stop when we don't go off the left edge of a block.
1828 */
1829 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1830 if (--cur->bc_ptrs[lev] > 0)
1831 break;
1832 /*
1833 * Read-ahead the left block, we're going to read it
1834 * in the next loop.
1835 */
1836 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1837 }
1838 /*
1839 * If we went off the root then we are seriously confused.
1840 */
1841 ASSERT(lev < cur->bc_nlevels);
1842 /*
1843 * Now walk back down the tree, fixing up the cursor's buffer
1844 * pointers and key numbers.
1845 */
1846 for (block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]); lev > level; ) {
1847 xfs_agblock_t agbno; /* block number of btree block */
1848 xfs_buf_t *bp; /* buffer pointer for block */
1849
1850 agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
1851 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1852 cur->bc_private.a.agno, agbno, 0, &bp,
1853 XFS_ALLOC_BTREE_REF)))
1854 return error;
1855 lev--;
1856 xfs_btree_setbuf(cur, lev, bp);
1857 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1858 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
1859 return error;
1860 cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs);
1861 }
1862 *stat = 1;
1863 return 0;
1864} 403}
1865 404
1866/* 405STATIC void
1867 * Delete the record pointed to by cur. 406xfs_allocbt_trace_record(
1868 * The cursor refers to the place where the record was (could be inserted) 407 struct xfs_btree_cur *cur,
1869 * when the operation returns. 408 union xfs_btree_rec *rec,
1870 */ 409 __uint64_t *l0,
1871int /* error */ 410 __uint64_t *l1,
1872xfs_alloc_delete( 411 __uint64_t *l2)
1873 xfs_btree_cur_t *cur, /* btree cursor */
1874 int *stat) /* success/failure */
1875{ 412{
1876 int error; /* error return value */ 413 *l0 = be32_to_cpu(rec->alloc.ar_startblock);
1877 int i; /* result code */ 414 *l1 = be32_to_cpu(rec->alloc.ar_blockcount);
1878 int level; /* btree level */ 415 *l2 = 0;
1879
1880 /*
1881 * Go up the tree, starting at leaf level.
1882 * If 2 is returned then a join was done; go to the next level.
1883 * Otherwise we are done.
1884 */
1885 for (level = 0, i = 2; i == 2; level++) {
1886 if ((error = xfs_alloc_delrec(cur, level, &i)))
1887 return error;
1888 }
1889 if (i == 0) {
1890 for (level = 1; level < cur->bc_nlevels; level++) {
1891 if (cur->bc_ptrs[level] == 0) {
1892 if ((error = xfs_alloc_decrement(cur, level, &i)))
1893 return error;
1894 break;
1895 }
1896 }
1897 }
1898 *stat = i;
1899 return 0;
1900} 416}
417#endif /* XFS_BTREE_TRACE */
418
419static const struct xfs_btree_ops xfs_allocbt_ops = {
420 .rec_len = sizeof(xfs_alloc_rec_t),
421 .key_len = sizeof(xfs_alloc_key_t),
422
423 .dup_cursor = xfs_allocbt_dup_cursor,
424 .set_root = xfs_allocbt_set_root,
425 .kill_root = xfs_allocbt_kill_root,
426 .alloc_block = xfs_allocbt_alloc_block,
427 .free_block = xfs_allocbt_free_block,
428 .update_lastrec = xfs_allocbt_update_lastrec,
429 .get_minrecs = xfs_allocbt_get_minrecs,
430 .get_maxrecs = xfs_allocbt_get_maxrecs,
431 .init_key_from_rec = xfs_allocbt_init_key_from_rec,
432 .init_rec_from_key = xfs_allocbt_init_rec_from_key,
433 .init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
434 .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
435 .key_diff = xfs_allocbt_key_diff,
1901 436
1902/*
1903 * Get the data from the pointed-to record.
1904 */
1905int /* error */
1906xfs_alloc_get_rec(
1907 xfs_btree_cur_t *cur, /* btree cursor */
1908 xfs_agblock_t *bno, /* output: starting block of extent */
1909 xfs_extlen_t *len, /* output: length of extent */
1910 int *stat) /* output: success/failure */
1911{
1912 xfs_alloc_block_t *block; /* btree block */
1913#ifdef DEBUG 437#ifdef DEBUG
1914 int error; /* error return value */ 438 .keys_inorder = xfs_allocbt_keys_inorder,
439 .recs_inorder = xfs_allocbt_recs_inorder,
1915#endif 440#endif
1916 int ptr; /* record number */
1917 441
1918 ptr = cur->bc_ptrs[0]; 442#ifdef XFS_BTREE_TRACE
1919 block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]); 443 .trace_enter = xfs_allocbt_trace_enter,
1920#ifdef DEBUG 444 .trace_cursor = xfs_allocbt_trace_cursor,
1921 if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0]))) 445 .trace_key = xfs_allocbt_trace_key,
1922 return error; 446 .trace_record = xfs_allocbt_trace_record,
1923#endif 447#endif
1924 /* 448};
1925 * Off the right end or left end, return failure.
1926 */
1927 if (ptr > be16_to_cpu(block->bb_numrecs) || ptr <= 0) {
1928 *stat = 0;
1929 return 0;
1930 }
1931 /*
1932 * Point to the record and extract its data.
1933 */
1934 {
1935 xfs_alloc_rec_t *rec; /* record data */
1936
1937 rec = XFS_ALLOC_REC_ADDR(block, ptr, cur);
1938 *bno = be32_to_cpu(rec->ar_startblock);
1939 *len = be32_to_cpu(rec->ar_blockcount);
1940 }
1941 *stat = 1;
1942 return 0;
1943}
1944 449
1945/* 450/*
1946 * Increment cursor by one record at the level. 451 * Allocate a new allocation btree cursor.
1947 * For nonzero levels the leaf-ward information is untouched.
1948 */ 452 */
1949int /* error */ 453struct xfs_btree_cur * /* new alloc btree cursor */
1950xfs_alloc_increment( 454xfs_allocbt_init_cursor(
1951 xfs_btree_cur_t *cur, /* btree cursor */ 455 struct xfs_mount *mp, /* file system mount point */
1952 int level, /* level in btree, 0 is leaf */ 456 struct xfs_trans *tp, /* transaction pointer */
1953 int *stat) /* success/failure */ 457 struct xfs_buf *agbp, /* buffer for agf structure */
458 xfs_agnumber_t agno, /* allocation group number */
459 xfs_btnum_t btnum) /* btree identifier */
1954{ 460{
1955 xfs_alloc_block_t *block; /* btree block */ 461 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
1956 xfs_buf_t *bp; /* tree block buffer */ 462 struct xfs_btree_cur *cur;
1957 int error; /* error return value */
1958 int lev; /* btree level */
1959
1960 ASSERT(level < cur->bc_nlevels);
1961 /*
1962 * Read-ahead to the right at this level.
1963 */
1964 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1965 /*
1966 * Get a pointer to the btree block.
1967 */
1968 bp = cur->bc_bufs[level];
1969 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1970#ifdef DEBUG
1971 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
1972 return error;
1973#endif
1974 /*
1975 * Increment the ptr at this level. If we're still in the block
1976 * then we're done.
1977 */
1978 if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) {
1979 *stat = 1;
1980 return 0;
1981 }
1982 /*
1983 * If we just went off the right edge of the tree, return failure.
1984 */
1985 if (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK) {
1986 *stat = 0;
1987 return 0;
1988 }
1989 /*
1990 * March up the tree incrementing pointers.
1991 * Stop when we don't go off the right edge of a block.
1992 */
1993 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1994 bp = cur->bc_bufs[lev];
1995 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
1996#ifdef DEBUG
1997 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
1998 return error;
1999#endif
2000 if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs))
2001 break;
2002 /*
2003 * Read-ahead the right block, we're going to read it
2004 * in the next loop.
2005 */
2006 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
2007 }
2008 /*
2009 * If we went off the root then we are seriously confused.
2010 */
2011 ASSERT(lev < cur->bc_nlevels);
2012 /*
2013 * Now walk back down the tree, fixing up the cursor's buffer
2014 * pointers and key numbers.
2015 */
2016 for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_ALLOC_BLOCK(bp);
2017 lev > level; ) {
2018 xfs_agblock_t agbno; /* block number of btree block */
2019 463
2020 agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur)); 464 ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
2021 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
2022 cur->bc_private.a.agno, agbno, 0, &bp,
2023 XFS_ALLOC_BTREE_REF)))
2024 return error;
2025 lev--;
2026 xfs_btree_setbuf(cur, lev, bp);
2027 block = XFS_BUF_TO_ALLOC_BLOCK(bp);
2028 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
2029 return error;
2030 cur->bc_ptrs[lev] = 1;
2031 }
2032 *stat = 1;
2033 return 0;
2034}
2035 465
2036/* 466 cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
2037 * Insert the current record at the point referenced by cur.
2038 * The cursor may be inconsistent on return if splits have been done.
2039 */
2040int /* error */
2041xfs_alloc_insert(
2042 xfs_btree_cur_t *cur, /* btree cursor */
2043 int *stat) /* success/failure */
2044{
2045 int error; /* error return value */
2046 int i; /* result value, 0 for failure */
2047 int level; /* current level number in btree */
2048 xfs_agblock_t nbno; /* new block number (split result) */
2049 xfs_btree_cur_t *ncur; /* new cursor (split result) */
2050 xfs_alloc_rec_t nrec; /* record being inserted this level */
2051 xfs_btree_cur_t *pcur; /* previous level's cursor */
2052 467
2053 level = 0; 468 cur->bc_tp = tp;
2054 nbno = NULLAGBLOCK; 469 cur->bc_mp = mp;
2055 nrec.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock); 470 cur->bc_nlevels = be32_to_cpu(agf->agf_levels[btnum]);
2056 nrec.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount); 471 cur->bc_btnum = btnum;
2057 ncur = NULL; 472 cur->bc_blocklog = mp->m_sb.sb_blocklog;
2058 pcur = cur;
2059 /*
2060 * Loop going up the tree, starting at the leaf level.
2061 * Stop when we don't get a split block, that must mean that
2062 * the insert is finished with this level.
2063 */
2064 do {
2065 /*
2066 * Insert nrec/nbno into this level of the tree.
2067 * Note if we fail, nbno will be null.
2068 */
2069 if ((error = xfs_alloc_insrec(pcur, level++, &nbno, &nrec, &ncur,
2070 &i))) {
2071 if (pcur != cur)
2072 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
2073 return error;
2074 }
2075 /*
2076 * See if the cursor we just used is trash.
2077 * Can't trash the caller's cursor, but otherwise we should
2078 * if ncur is a new cursor or we're about to be done.
2079 */
2080 if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) {
2081 cur->bc_nlevels = pcur->bc_nlevels;
2082 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
2083 }
2084 /*
2085 * If we got a new cursor, switch to it.
2086 */
2087 if (ncur) {
2088 pcur = ncur;
2089 ncur = NULL;
2090 }
2091 } while (nbno != NULLAGBLOCK);
2092 *stat = i;
2093 return 0;
2094}
2095 473
2096/* 474 cur->bc_ops = &xfs_allocbt_ops;
2097 * Lookup the record equal to [bno, len] in the btree given by cur. 475 if (btnum == XFS_BTNUM_CNT)
2098 */ 476 cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
2099int /* error */
2100xfs_alloc_lookup_eq(
2101 xfs_btree_cur_t *cur, /* btree cursor */
2102 xfs_agblock_t bno, /* starting block of extent */
2103 xfs_extlen_t len, /* length of extent */
2104 int *stat) /* success/failure */
2105{
2106 cur->bc_rec.a.ar_startblock = bno;
2107 cur->bc_rec.a.ar_blockcount = len;
2108 return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, stat);
2109}
2110 477
2111/* 478 cur->bc_private.a.agbp = agbp;
2112 * Lookup the first record greater than or equal to [bno, len] 479 cur->bc_private.a.agno = agno;
2113 * in the btree given by cur.
2114 */
2115int /* error */
2116xfs_alloc_lookup_ge(
2117 xfs_btree_cur_t *cur, /* btree cursor */
2118 xfs_agblock_t bno, /* starting block of extent */
2119 xfs_extlen_t len, /* length of extent */
2120 int *stat) /* success/failure */
2121{
2122 cur->bc_rec.a.ar_startblock = bno;
2123 cur->bc_rec.a.ar_blockcount = len;
2124 return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, stat);
2125}
2126 480
2127/* 481 return cur;
2128 * Lookup the first record less than or equal to [bno, len]
2129 * in the btree given by cur.
2130 */
2131int /* error */
2132xfs_alloc_lookup_le(
2133 xfs_btree_cur_t *cur, /* btree cursor */
2134 xfs_agblock_t bno, /* starting block of extent */
2135 xfs_extlen_t len, /* length of extent */
2136 int *stat) /* success/failure */
2137{
2138 cur->bc_rec.a.ar_startblock = bno;
2139 cur->bc_rec.a.ar_blockcount = len;
2140 return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, stat);
2141} 482}
2142 483
2143/* 484/*
2144 * Update the record referred to by cur, to the value given by [bno, len]. 485 * Calculate number of records in an alloc btree block.
2145 * This either works (return 0) or gets an EFSCORRUPTED error.
2146 */ 486 */
2147int /* error */ 487int
2148xfs_alloc_update( 488xfs_allocbt_maxrecs(
2149 xfs_btree_cur_t *cur, /* btree cursor */ 489 struct xfs_mount *mp,
2150 xfs_agblock_t bno, /* starting block of extent */ 490 int blocklen,
2151 xfs_extlen_t len) /* length of extent */ 491 int leaf)
2152{ 492{
2153 xfs_alloc_block_t *block; /* btree block to update */ 493 blocklen -= XFS_ALLOC_BLOCK_LEN(mp);
2154 int error; /* error return value */
2155 int ptr; /* current record number (updating) */
2156 494
2157 ASSERT(len > 0); 495 if (leaf)
2158 /* 496 return blocklen / sizeof(xfs_alloc_rec_t);
2159 * Pick up the a.g. freelist struct and the current block. 497 return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
2160 */
2161 block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]);
2162#ifdef DEBUG
2163 if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0])))
2164 return error;
2165#endif
2166 /*
2167 * Get the address of the rec to be updated.
2168 */
2169 ptr = cur->bc_ptrs[0];
2170 {
2171 xfs_alloc_rec_t *rp; /* pointer to updated record */
2172
2173 rp = XFS_ALLOC_REC_ADDR(block, ptr, cur);
2174 /*
2175 * Fill in the new contents and log them.
2176 */
2177 rp->ar_startblock = cpu_to_be32(bno);
2178 rp->ar_blockcount = cpu_to_be32(len);
2179 xfs_alloc_log_recs(cur, cur->bc_bufs[0], ptr, ptr);
2180 }
2181 /*
2182 * If it's the by-size btree and it's the last leaf block and
2183 * it's the last record... then update the size of the longest
2184 * extent in the a.g., which we cache in the a.g. freelist header.
2185 */
2186 if (cur->bc_btnum == XFS_BTNUM_CNT &&
2187 be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK &&
2188 ptr == be16_to_cpu(block->bb_numrecs)) {
2189 xfs_agf_t *agf; /* a.g. freespace header */
2190 xfs_agnumber_t seqno;
2191
2192 agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
2193 seqno = be32_to_cpu(agf->agf_seqno);
2194 cur->bc_mp->m_perag[seqno].pagf_longest = len;
2195 agf->agf_longest = cpu_to_be32(len);
2196 xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
2197 XFS_AGF_LONGEST);
2198 }
2199 /*
2200 * Updating first record in leaf. Pass new key value up to our parent.
2201 */
2202 if (ptr == 1) {
2203 xfs_alloc_key_t key; /* key containing [bno, len] */
2204
2205 key.ar_startblock = cpu_to_be32(bno);
2206 key.ar_blockcount = cpu_to_be32(len);
2207 if ((error = xfs_alloc_updkey(cur, &key, 1)))
2208 return error;
2209 }
2210 return 0;
2211} 498}
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h
index 5bd1a2c8bd07..a6caa0022c9b 100644
--- a/fs/xfs/xfs_alloc_btree.h
+++ b/fs/xfs/xfs_alloc_btree.h
@@ -24,7 +24,6 @@
 
 struct xfs_buf;
 struct xfs_btree_cur;
-struct xfs_btree_sblock;
 struct xfs_mount;
 
 /*
@@ -50,16 +49,6 @@ typedef struct xfs_alloc_rec_incore {
 
 /* btree pointer type */
 typedef __be32 xfs_alloc_ptr_t;
-/* btree block header type */
-typedef struct xfs_btree_sblock xfs_alloc_block_t;
-
-#define XFS_BUF_TO_ALLOC_BLOCK(bp)	((xfs_alloc_block_t *)XFS_BUF_PTR(bp))
-
-/*
- * Real block structures have a size equal to the disk block size.
- */
-#define XFS_ALLOC_BLOCK_MAXRECS(lev,cur) ((cur)->bc_mp->m_alloc_mxr[lev != 0])
-#define XFS_ALLOC_BLOCK_MINRECS(lev,cur) ((cur)->bc_mp->m_alloc_mnr[lev != 0])
 
 /*
  * Minimum and maximum blocksize and sectorsize.
@@ -83,73 +72,39 @@ typedef struct xfs_btree_sblock xfs_alloc_block_t;
 #define XFS_CNT_BLOCK(mp)	((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
 
 /*
- * Record, key, and pointer address macros for btree blocks.
- */
-#define XFS_ALLOC_REC_ADDR(bb,i,cur)	\
-	XFS_BTREE_REC_ADDR(xfs_alloc, bb, i)
-
-#define XFS_ALLOC_KEY_ADDR(bb,i,cur)	\
-	XFS_BTREE_KEY_ADDR(xfs_alloc, bb, i)
-
-#define XFS_ALLOC_PTR_ADDR(bb,i,cur)	\
-	XFS_BTREE_PTR_ADDR(xfs_alloc, bb, i, XFS_ALLOC_BLOCK_MAXRECS(1, cur))
-
-/*
- * Decrement cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-extern int xfs_alloc_decrement(struct xfs_btree_cur *cur, int level, int *stat);
-
-/*
- * Delete the record pointed to by cur.
- * The cursor refers to the place where the record was (could be inserted)
- * when the operation returns.
- */
-extern int xfs_alloc_delete(struct xfs_btree_cur *cur, int *stat);
-
-/*
- * Get the data from the pointed-to record.
- */
-extern int xfs_alloc_get_rec(struct xfs_btree_cur *cur, xfs_agblock_t *bno,
-		xfs_extlen_t *len, int *stat);
-
-/*
- * Increment cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-extern int xfs_alloc_increment(struct xfs_btree_cur *cur, int level, int *stat);
-
-/*
- * Insert the current record at the point referenced by cur.
- * The cursor may be inconsistent on return if splits have been done.
- */
-extern int xfs_alloc_insert(struct xfs_btree_cur *cur, int *stat);
-
-/*
- * Lookup the record equal to [bno, len] in the btree given by cur.
- */
-extern int xfs_alloc_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno,
-		xfs_extlen_t len, int *stat);
-
-/*
- * Lookup the first record greater than or equal to [bno, len]
- * in the btree given by cur.
- */
-extern int xfs_alloc_lookup_ge(struct xfs_btree_cur *cur, xfs_agblock_t bno,
-		xfs_extlen_t len, int *stat);
-
-/*
- * Lookup the first record less than or equal to [bno, len]
- * in the btree given by cur.
+ * Btree block header size depends on a superblock flag.
+ *
+ * (not quite yet, but soon)
  */
-extern int xfs_alloc_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno,
-		xfs_extlen_t len, int *stat);
+#define XFS_ALLOC_BLOCK_LEN(mp)	XFS_BTREE_SBLOCK_LEN
 
 /*
- * Update the record referred to by cur, to the value given by [bno, len].
- * This either works (return 0) or gets an EFSCORRUPTED error.
- */
-extern int xfs_alloc_update(struct xfs_btree_cur *cur, xfs_agblock_t bno,
-		xfs_extlen_t len);
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_ALLOC_REC_ADDR(mp, block, index) \
+	((xfs_alloc_rec_t *) \
+		((char *)(block) + \
+		 XFS_ALLOC_BLOCK_LEN(mp) + \
+		 (((index) - 1) * sizeof(xfs_alloc_rec_t))))
+
+#define XFS_ALLOC_KEY_ADDR(mp, block, index) \
+	((xfs_alloc_key_t *) \
+		((char *)(block) + \
+		 XFS_ALLOC_BLOCK_LEN(mp) + \
+		 ((index) - 1) * sizeof(xfs_alloc_key_t)))
+
+#define XFS_ALLOC_PTR_ADDR(mp, block, index, maxrecs) \
+	((xfs_alloc_ptr_t *) \
+		((char *)(block) + \
+		 XFS_ALLOC_BLOCK_LEN(mp) + \
+		 (maxrecs) * sizeof(xfs_alloc_key_t) + \
+		 ((index) - 1) * sizeof(xfs_alloc_ptr_t)))
+
+extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
+		struct xfs_trans *, struct xfs_buf *,
+		xfs_agnumber_t, xfs_btnum_t);
+extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
 
 #endif	/* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h
index 0b3b5efe848c..53d5e70d1360 100644
--- a/fs/xfs/xfs_arch.h
+++ b/fs/xfs/xfs_arch.h
@@ -41,21 +41,36 @@
 #endif
 
 #ifdef XFS_NATIVE_HOST
-#define cpu_to_be16(val)	((__be16)(val))
-#define cpu_to_be32(val)	((__be32)(val))
-#define cpu_to_be64(val)	((__be64)(val))
-#define be16_to_cpu(val)	((__uint16_t)(val))
-#define be32_to_cpu(val)	((__uint32_t)(val))
-#define be64_to_cpu(val)	((__uint64_t)(val))
+#define cpu_to_be16(val)	((__force __be16)(__u16)(val))
+#define cpu_to_be32(val)	((__force __be32)(__u32)(val))
+#define cpu_to_be64(val)	((__force __be64)(__u64)(val))
+#define be16_to_cpu(val)	((__force __u16)(__be16)(val))
+#define be32_to_cpu(val)	((__force __u32)(__be32)(val))
+#define be64_to_cpu(val)	((__force __u64)(__be64)(val))
 #else
-#define cpu_to_be16(val)	(__swab16((__uint16_t)(val)))
-#define cpu_to_be32(val)	(__swab32((__uint32_t)(val)))
-#define cpu_to_be64(val)	(__swab64((__uint64_t)(val)))
-#define be16_to_cpu(val)	(__swab16((__be16)(val)))
-#define be32_to_cpu(val)	(__swab32((__be32)(val)))
-#define be64_to_cpu(val)	(__swab64((__be64)(val)))
+#define cpu_to_be16(val)	((__force __be16)__swab16((__u16)(val)))
+#define cpu_to_be32(val)	((__force __be32)__swab32((__u32)(val)))
+#define cpu_to_be64(val)	((__force __be64)__swab64((__u64)(val)))
+#define be16_to_cpu(val)	(__swab16((__force __u16)(__be16)(val)))
+#define be32_to_cpu(val)	(__swab32((__force __u32)(__be32)(val)))
+#define be64_to_cpu(val)	(__swab64((__force __u64)(__be64)(val)))
 #endif
 
+static inline void be16_add_cpu(__be16 *a, __s16 b)
+{
+	*a = cpu_to_be16(be16_to_cpu(*a) + b);
+}
+
+static inline void be32_add_cpu(__be32 *a, __s32 b)
+{
+	*a = cpu_to_be32(be32_to_cpu(*a) + b);
+}
+
+static inline void be64_add_cpu(__be64 *a, __s64 b)
+{
+	*a = cpu_to_be64(be64_to_cpu(*a) + b);
+}
+
 #endif	/* __KERNEL__ */
 
 /* do we need conversion? */
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h
index 8e0e463dae2d..bca7b243c319 100644
--- a/fs/xfs/xfs_bit.h
+++ b/fs/xfs/xfs_bit.h
@@ -61,8 +61,7 @@ static inline int xfs_highbit64(__uint64_t v)
 /* Get low bit set out of 32-bit argument, -1 if none set */
 static inline int xfs_lowbit32(__uint32_t v)
 {
-	unsigned long	t = v;
-	return (v) ? find_first_bit(&t, 32) : -1;
+	return ffs(v) - 1;
 }
 
 /* Get low bit set out of 64-bit argument, -1 if none set */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index a1aab9275d5a..138308e70d14 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -393,8 +393,8 @@ xfs_bmap_count_leaves(
 
 STATIC void
 xfs_bmap_disk_count_leaves(
-	xfs_extnum_t		idx,
-	xfs_bmbt_block_t	*block,
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*block,
 	int			numrecs,
 	int			*count);
 
@@ -402,6 +402,53 @@ xfs_bmap_disk_count_leaves(
  * Bmap internal routines.
  */
 
+STATIC int				/* error */
+xfs_bmbt_lookup_eq(
+	struct xfs_btree_cur	*cur,
+	xfs_fileoff_t		off,
+	xfs_fsblock_t		bno,
+	xfs_filblks_t		len,
+	int			*stat)	/* success/failure */
+{
+	cur->bc_rec.b.br_startoff = off;
+	cur->bc_rec.b.br_startblock = bno;
+	cur->bc_rec.b.br_blockcount = len;
+	return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+STATIC int				/* error */
+xfs_bmbt_lookup_ge(
+	struct xfs_btree_cur	*cur,
+	xfs_fileoff_t		off,
+	xfs_fsblock_t		bno,
+	xfs_filblks_t		len,
+	int			*stat)	/* success/failure */
+{
+	cur->bc_rec.b.br_startoff = off;
+	cur->bc_rec.b.br_startblock = bno;
+	cur->bc_rec.b.br_blockcount = len;
+	return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [off, bno, len, state].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_bmbt_update(
+	struct xfs_btree_cur	*cur,
+	xfs_fileoff_t		off,
+	xfs_fsblock_t		bno,
+	xfs_filblks_t		len,
+	xfs_exntst_t		state)
+{
+	union xfs_btree_rec	rec;
+
+	xfs_bmbt_disk_set_allf(&rec.bmbt, off, bno, len, state);
+	return xfs_btree_update(cur, &rec);
+}
+
 /*
  * Called from xfs_bmap_add_attrfork to handle btree format files.
  */
@@ -422,15 +469,14 @@ xfs_bmap_add_attrfork_btree(
 	if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip))
 		*flags |= XFS_ILOG_DBROOT;
 	else {
-		cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
-			XFS_DATA_FORK);
+		cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
 		cur->bc_private.b.flist = flist;
 		cur->bc_private.b.firstblock = *firstblock;
 		if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
 			goto error0;
 		/* must be at least one entry */
 		XFS_WANT_CORRUPTED_GOTO(stat == 1, error0);
-		if ((error = xfs_bmbt_newroot(cur, flags, &stat)))
+		if ((error = xfs_btree_new_iroot(cur, flags, &stat)))
 			goto error0;
 		if (stat == 0) {
 			xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
@@ -818,10 +864,10 @@ xfs_bmap_add_extent_delay_real(
 					RIGHT.br_blockcount, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_delete(cur, &i)))
+			if ((error = xfs_btree_delete(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+			if ((error = xfs_btree_decrement(cur, 0, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
@@ -931,7 +977,7 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
 			cur->bc_rec.b.br_state = XFS_EXT_NORM;
-			if ((error = xfs_bmbt_insert(cur, &i)))
+			if ((error = xfs_btree_insert(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
@@ -1007,7 +1053,7 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
 			cur->bc_rec.b.br_state = XFS_EXT_NORM;
-			if ((error = xfs_bmbt_insert(cur, &i)))
+			if ((error = xfs_btree_insert(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
@@ -1097,7 +1143,7 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
 			cur->bc_rec.b.br_state = XFS_EXT_NORM;
-			if ((error = xfs_bmbt_insert(cur, &i)))
+			if ((error = xfs_btree_insert(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
@@ -1152,7 +1198,7 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
 			cur->bc_rec.b.br_state = XFS_EXT_NORM;
-			if ((error = xfs_bmbt_insert(cur, &i)))
+			if ((error = xfs_btree_insert(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
@@ -1379,16 +1425,16 @@ xfs_bmap_add_extent_unwritten_real(
 					RIGHT.br_blockcount, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_delete(cur, &i)))
+			if ((error = xfs_btree_delete(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+			if ((error = xfs_btree_decrement(cur, 0, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_delete(cur, &i)))
+			if ((error = xfs_btree_delete(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+			if ((error = xfs_btree_decrement(cur, 0, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
@@ -1428,10 +1474,10 @@ xfs_bmap_add_extent_unwritten_real(
 					&i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_delete(cur, &i)))
+			if ((error = xfs_btree_delete(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+			if ((error = xfs_btree_decrement(cur, 0, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
@@ -1471,10 +1517,10 @@ xfs_bmap_add_extent_unwritten_real(
 					RIGHT.br_blockcount, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_delete(cur, &i)))
+			if ((error = xfs_btree_delete(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+			if ((error = xfs_btree_decrement(cur, 0, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_update(cur, new->br_startoff,
@@ -1557,7 +1603,7 @@ xfs_bmap_add_extent_unwritten_real(
 				PREV.br_blockcount - new->br_blockcount,
 				oldext)))
 				goto done;
-			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+			if ((error = xfs_btree_decrement(cur, 0, &i)))
 				goto done;
 			if (xfs_bmbt_update(cur, LEFT.br_startoff,
 				LEFT.br_startblock,
@@ -1605,7 +1651,7 @@ xfs_bmap_add_extent_unwritten_real(
 				oldext)))
 				goto done;
 			cur->bc_rec.b = *new;
-			if ((error = xfs_bmbt_insert(cur, &i)))
+			if ((error = xfs_btree_insert(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
@@ -1647,7 +1693,7 @@ xfs_bmap_add_extent_unwritten_real(
 				PREV.br_blockcount - new->br_blockcount,
 				oldext)))
 				goto done;
-			if ((error = xfs_bmbt_increment(cur, 0, &i)))
+			if ((error = xfs_btree_increment(cur, 0, &i)))
 				goto done;
 			if ((error = xfs_bmbt_update(cur, new->br_startoff,
 				new->br_startblock,
@@ -1695,7 +1741,7 @@ xfs_bmap_add_extent_unwritten_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
 			cur->bc_rec.b.br_state = XFS_EXT_NORM;
-			if ((error = xfs_bmbt_insert(cur, &i)))
+			if ((error = xfs_btree_insert(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
@@ -1743,7 +1789,7 @@ xfs_bmap_add_extent_unwritten_real(
 			cur->bc_rec.b = PREV;
 			cur->bc_rec.b.br_blockcount =
 				new->br_startoff - PREV.br_startoff;
-			if ((error = xfs_bmbt_insert(cur, &i)))
+			if ((error = xfs_btree_insert(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			/*
@@ -1758,7 +1804,7 @@ xfs_bmap_add_extent_unwritten_real(
 			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
 			/* new middle extent - newext */
 			cur->bc_rec.b.br_state = new->br_state;
-			if ((error = xfs_bmbt_insert(cur, &i)))
+			if ((error = xfs_btree_insert(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
@@ -2106,10 +2152,10 @@ xfs_bmap_add_extent_hole_real(
 					right.br_blockcount, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_delete(cur, &i)))
+			if ((error = xfs_btree_delete(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-			if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+			if ((error = xfs_btree_decrement(cur, 0, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			if ((error = xfs_bmbt_update(cur, left.br_startoff,
@@ -2218,7 +2264,7 @@ xfs_bmap_add_extent_hole_real(
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 0, done);
 			cur->bc_rec.b.br_state = new->br_state;
-			if ((error = xfs_bmbt_insert(cur, &i)))
+			if ((error = xfs_btree_insert(cur, &i)))
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		}
@@ -2996,24 +3042,24 @@ xfs_bmap_btree_to_extents(
 	int			whichfork)  /* data or attr fork */
 {
 	/* REFERENCED */
-	xfs_bmbt_block_t	*cblock;/* child btree block */
+	struct xfs_btree_block	*cblock;/* child btree block */
 	xfs_fsblock_t		cbno;	/* child block number */
 	xfs_buf_t		*cbp;	/* child block's buffer */
 	int			error;	/* error return value */
 	xfs_ifork_t		*ifp;	/* inode fork data */
 	xfs_mount_t		*mp;	/* mount point structure */
 	__be64			*pp;	/* ptr to block address */
-	xfs_bmbt_block_t	*rblock;/* root btree block */
+	struct xfs_btree_block	*rblock;/* root btree block */
 
+	mp = ip->i_mount;
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
 	rblock = ifp->if_broot;
 	ASSERT(be16_to_cpu(rblock->bb_level) == 1);
 	ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
-	ASSERT(XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes) == 1);
-	mp = ip->i_mount;
-	pp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, ifp->if_broot_bytes);
+	ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
 	cbno = be64_to_cpu(*pp);
 	*logflagsp = 0;
 #ifdef DEBUG
@@ -3023,8 +3069,8 @@ xfs_bmap_btree_to_extents(
 	if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp,
 			XFS_BMAP_BTREE_REF)))
 		return error;
-	cblock = XFS_BUF_TO_BMBT_BLOCK(cbp);
-	if ((error = xfs_btree_check_lblock(cur, cblock, 0, cbp)))
+	cblock = XFS_BUF_TO_BLOCK(cbp);
+	if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
 		return error;
 	xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
 	ip->i_d.di_nblocks--;
@@ -3170,7 +3216,7 @@ xfs_bmap_del_extent(
 			flags |= XFS_ILOG_FEXT(whichfork);
 			break;
 		}
-		if ((error = xfs_bmbt_delete(cur, &i)))
+		if ((error = xfs_btree_delete(cur, &i)))
 			goto done;
 		XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 		break;
@@ -3254,10 +3300,10 @@ xfs_bmap_del_extent(
 				got.br_startblock, temp,
 				got.br_state)))
 				goto done;
-			if ((error = xfs_bmbt_increment(cur, 0, &i)))
+			if ((error = xfs_btree_increment(cur, 0, &i)))
 				goto done;
 			cur->bc_rec.b = new;
-			error = xfs_bmbt_insert(cur, &i);
+			error = xfs_btree_insert(cur, &i);
 			if (error && error != ENOSPC)
 				goto done;
 			/*
@@ -3404,11 +3450,11 @@ xfs_bmap_extents_to_btree(
 	int			*logflagsp, /* inode logging flags */
 	int			whichfork)  /* data or attr fork */
 {
-	xfs_bmbt_block_t	*ablock;   /* allocated (child) bt block */
+	struct xfs_btree_block	*ablock;   /* allocated (child) bt block */
 	xfs_buf_t		*abp;	   /* buffer for ablock */
 	xfs_alloc_arg_t		args;	   /* allocation arguments */
 	xfs_bmbt_rec_t		*arp;	   /* child record pointer */
-	xfs_bmbt_block_t	*block;	   /* btree root block */
+	struct xfs_btree_block	*block;	   /* btree root block */
 	xfs_btree_cur_t		*cur;	   /* bmap btree cursor */
 	xfs_bmbt_rec_host_t	*ep;	   /* extent record pointer */
 	int			error;	   /* error return value */
@@ -3428,6 +3474,7 @@ xfs_bmap_extents_to_btree(
 	 */
 	xfs_iroot_realloc(ip, 1, whichfork);
 	ifp->if_flags |= XFS_IFBROOT;
+
 	/*
 	 * Fill in the root.
 	 */
@@ -3435,14 +3482,14 @@ xfs_bmap_extents_to_btree(
 	block->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
 	block->bb_level = cpu_to_be16(1);
 	block->bb_numrecs = cpu_to_be16(1);
-	block->bb_leftsib = cpu_to_be64(NULLDFSBNO);
-	block->bb_rightsib = cpu_to_be64(NULLDFSBNO);
+	block->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
+	block->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
+
 	/*
 	 * Need a cursor.  Can't allocate until bb_level is filled in.
 	 */
 	mp = ip->i_mount;
-	cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
-		whichfork);
+	cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
 	cur->bc_private.b.firstblock = *firstblock;
 	cur->bc_private.b.flist = flist;
 	cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
@@ -3489,12 +3536,12 @@ xfs_bmap_extents_to_btree(
 	/*
 	 * Fill in the child block.
 	 */
-	ablock = XFS_BUF_TO_BMBT_BLOCK(abp);
+	ablock = XFS_BUF_TO_BLOCK(abp);
 	ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
 	ablock->bb_level = 0;
-	ablock->bb_leftsib = cpu_to_be64(NULLDFSBNO);
-	ablock->bb_rightsib = cpu_to_be64(NULLDFSBNO);
-	arp = XFS_BMAP_REC_IADDR(ablock, 1, cur);
+	ablock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
+	ablock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
+	arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
 	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
 	for (cnt = i = 0; i < nextents; i++) {
 		ep = xfs_iext_get_ext(ifp, i);
@@ -3505,21 +3552,24 @@ xfs_bmap_extents_to_btree(
 		}
 	}
 	ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
-	ablock->bb_numrecs = cpu_to_be16(cnt);
+	xfs_btree_set_numrecs(ablock, cnt);
+
 	/*
 	 * Fill in the root key and pointer.
 	 */
-	kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
-	arp = XFS_BMAP_REC_IADDR(ablock, 1, cur);
+	kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
+	arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
 	kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
-	pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
+	pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
+						be16_to_cpu(block->bb_level)));
 	*pp = cpu_to_be64(args.fsbno);
+
 	/*
 	 * Do all this logging at the end so that
 	 * the root is at the right level.
 	 */
-	xfs_bmbt_log_block(cur, abp, XFS_BB_ALL_BITS);
-	xfs_bmbt_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
+	xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS);
+	xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
 	ASSERT(*curp == NULL);
 	*curp = cur;
 	*logflagsp = XFS_ILOG_CORE | XFS_ILOG_FBROOT(whichfork);
@@ -4176,7 +4226,7 @@ xfs_bmap_compute_maxlevels(
 		maxleafents = MAXAEXTNUM;
 		sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
 	}
-	maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0);
+	maxrootrecs = xfs_bmdr_maxrecs(mp, sz, 0);
 	minleafrecs = mp->m_bmap_dmnr[0];
 	minnoderecs = mp->m_bmap_dmnr[1];
 	maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
@@ -4242,9 +4292,15 @@ xfs_bmap_finish(
 	 * We have a new transaction, so we should return committed=1,
 	 * even though we're returning an error.
 	 */
-	if (error) {
+	if (error)
 		return error;
-	}
+
+	/*
+	 * transaction commit worked ok so we can drop the extra ticket
+	 * reference that we gained in xfs_trans_dup()
+	 */
+	xfs_log_ticket_put(ntp->t_ticket);
+
 	if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES,
 			logcount)))
 		return error;
@@ -4474,6 +4530,22 @@ xfs_bmap_one_block(
 	return rval;
 }
 
+STATIC int
+xfs_bmap_sanity_check(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp,
+	int			level)
+{
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+
+	if (be32_to_cpu(block->bb_magic) != XFS_BMAP_MAGIC ||
+	    be16_to_cpu(block->bb_level) != level ||
+	    be16_to_cpu(block->bb_numrecs) == 0 ||
+	    be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
+		return 0;
+	return 1;
+}
+
 /*
  * Read in the extents to if_extents.
  * All inode fields are set up by caller, we just traverse the btree
@@ -4486,7 +4558,7 @@ xfs_bmap_read_extents(
 	xfs_inode_t		*ip,	/* incore inode */
 	int			whichfork) /* data or attr fork */
 {
-	xfs_bmbt_block_t	*block;	/* current btree block */
+	struct xfs_btree_block	*block;	/* current btree block */
 	xfs_fsblock_t		bno;	/* block # of "block" */
 	xfs_buf_t		*bp;	/* buffer for "block" */
 	int			error;	/* error return value */
@@ -4510,7 +4582,7 @@ xfs_bmap_read_extents(
 	 */
 	level = be16_to_cpu(block->bb_level);
 	ASSERT(level > 0);
-	pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
+	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
 	bno = be64_to_cpu(*pp);
 	ASSERT(bno != NULLDFSBNO);
 	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
@@ -4523,13 +4595,13 @@ xfs_bmap_read_extents(
 		if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
 				XFS_BMAP_BTREE_REF)))
 			return error;
-		block = XFS_BUF_TO_BMBT_BLOCK(bp);
+		block = XFS_BUF_TO_BLOCK(bp);
 		XFS_WANT_CORRUPTED_GOTO(
-			XFS_BMAP_SANITY_CHECK(mp, block, level),
+			xfs_bmap_sanity_check(mp, bp, level),
 			error0);
 		if (level == 0)
 			break;
-		pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]);
+		pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
 		bno = be64_to_cpu(*pp);
 		XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
 		xfs_trans_brelse(tp, bp);
@@ -4549,7 +4621,7 @@ xfs_bmap_read_extents(
 		xfs_extnum_t	start;
 
 
-		num_recs = be16_to_cpu(block->bb_numrecs);
+		num_recs = xfs_btree_get_numrecs(block);
 		if (unlikely(i + num_recs > room)) {
 			ASSERT(i + num_recs <= room);
 			xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
@@ -4561,18 +4633,18 @@ xfs_bmap_read_extents(
 			goto error0;
 		}
 		XFS_WANT_CORRUPTED_GOTO(
-			XFS_BMAP_SANITY_CHECK(mp, block, 0),
+			xfs_bmap_sanity_check(mp, bp, 0),
 			error0);
 		/*
 		 * Read-ahead the next leaf block, if any.
 		 */
-		nextbno = be64_to_cpu(block->bb_rightsib);
+		nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
 		if (nextbno != NULLFSBLOCK)
 			xfs_btree_reada_bufl(mp, nextbno, 1);
 		/*
 		 * Copy records into the extent records.
 		 */
-		frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1);
+		frp = XFS_BMBT_REC_ADDR(mp, block, 1);
 		start = i;
 		for (j = 0; j < num_recs; j++, i++, frp++) {
 			xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
@@ -4603,7 +4675,7 @@ xfs_bmap_read_extents(
 		if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
 				XFS_BMAP_BTREE_REF)))
 			return error;
-		block = XFS_BUF_TO_BMBT_BLOCK(bp);
+		block = XFS_BUF_TO_BLOCK(bp);
 	}
 	ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
 	ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
@@ -5029,8 +5101,7 @@ xfs_bmapi(
 			if (abno == NULLFSBLOCK)
 				break;
 			if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
-				cur = xfs_btree_init_cursor(mp,
-					tp, NULL, 0, XFS_BTNUM_BMAP,
+				cur = xfs_bmbt_init_cursor(mp, tp,
 					ip, whichfork);
 				cur->bc_private.b.firstblock =
 					*firstblock;
@@ -5147,9 +5218,8 @@ xfs_bmapi(
 			 */
 			ASSERT(mval->br_blockcount <= len);
 			if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
-				cur = xfs_btree_init_cursor(mp,
-					tp, NULL, 0, XFS_BTNUM_BMAP,
-					ip, whichfork);
+				cur = xfs_bmbt_init_cursor(mp,
+					tp, ip, whichfork);
 				cur->bc_private.b.firstblock =
 					*firstblock;
 				cur->bc_private.b.flist = flist;
@@ -5440,8 +5510,7 @@ xfs_bunmapi(
 	logflags = 0;
 	if (ifp->if_flags & XFS_IFBROOT) {
 		ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
-		cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
-			whichfork);
+		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
 		cur->bc_private.b.firstblock = *firstblock;
 		cur->bc_private.b.flist = flist;
 		cur->bc_private.b.flags = 0;
@@ -5742,14 +5811,17 @@ error0:
 STATIC int
 xfs_getbmapx_fix_eof_hole(
 	xfs_inode_t		*ip,		/* xfs incore inode pointer */
-	struct getbmap		*out,		/* output structure */
+	struct getbmapx		*out,		/* output structure */
 	int			prealloced,	/* this is a file with
 						 * preallocated data space */
 	__int64_t		end,		/* last block requested */
 	xfs_fsblock_t		startblock)
 {
 	__int64_t		fixlen;
 	xfs_mount_t		*mp;		/* file system mount point */
+	xfs_ifork_t		*ifp;		/* inode fork pointer */
+	xfs_extnum_t		lastx;		/* last extent pointer */
+	xfs_fileoff_t		fileblock;
 
 	if (startblock == HOLESTARTBLOCK) {
 		mp = ip->i_mount;
@@ -5763,21 +5835,33 @@ xfs_getbmapx_fix_eof_hole(
 			out->bmv_length = fixlen;
 		}
 	} else {
-		out->bmv_block = XFS_FSB_TO_DB(ip, startblock);
+		if (startblock == DELAYSTARTBLOCK)
+			out->bmv_block = -2;
+		else
+			out->bmv_block = XFS_FSB_TO_DB(ip, startblock);
+		fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
+		ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+		if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
+		   (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
+			out->bmv_oflags |= BMV_OF_LAST;
 	}
 
 	return 1;
 }
 
 /*
- * Fcntl interface to xfs_bmapi.
+ * Get inode's extents as described in bmv, and format for output.
+ * Calls formatter to fill the user's buffer until all extents
+ * are mapped, until the passed-in bmv->bmv_count slots have
+ * been filled, or until the formatter short-circuits the loop,
+ * if it is tracking filled-in extents on its own.
  */
 int						/* error code */
 xfs_getbmap(
 	xfs_inode_t		*ip,
-	struct getbmap		*bmv,		/* user bmap structure */
-	void			__user *ap,	/* pointer to user's array */
-	int			interface)	/* interface flags */
+	struct getbmapx		*bmv,		/* user bmap structure */
+	xfs_bmap_format_t	formatter,	/* format to user */
+	void			*arg)		/* formatter arg */
 {
 	__int64_t		bmvend;		/* last block requested */
 	int			error;		/* return value */
@@ -5790,19 +5874,17 @@ xfs_getbmap(
 	int			nexleft;	/* # of user extents left */
 	int			subnex;		/* # of bmapi's can do */
 	int			nmap;		/* number of map entries */
-	struct getbmap		out;		/* output structure */
+	struct getbmapx		out;		/* output structure */
 	int			whichfork;	/* data or attr fork */
 	int			prealloced;	/* this is a file with
 						 * preallocated data space */
-	int			sh_unwritten;	/* true, if unwritten */
-						/* extents listed separately */
+	int			iflags;		/* interface flags */
 	int			bmapi_flags;	/* flags for xfs_bmapi */
-	__int32_t		oflags;		/* getbmapx bmv_oflags field */
 
 	mp = ip->i_mount;
+	iflags = bmv->bmv_iflags;
 
-	whichfork = interface & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
-	sh_unwritten = (interface & BMV_IF_PREALLOC) != 0;
+	whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
 
 	/* If the BMV_IF_NO_DMAPI_READ interface bit specified, do not
 	 * generate a DMAPI read event.  Otherwise, if the DM_EVENT_READ
@@ -5817,7 +5899,7 @@ xfs_getbmap(
 	 * could misinterpret holes in a DMAPI file as true holes,
 	 * when in fact they may represent offline user data.
 	 */
-	if ((interface & BMV_IF_NO_DMAPI_READ) == 0 &&
+	if ((iflags & BMV_IF_NO_DMAPI_READ) == 0 &&
 	    DM_EVENT_ENABLED(ip, DM_EVENT_READ) &&
 	    whichfork == XFS_DATA_FORK) {
 		error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL);
@@ -5873,8 +5955,9 @@ xfs_getbmap(
 
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 
-	if (whichfork == XFS_DATA_FORK &&
-	    (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size)) {
+	if (((iflags & BMV_IF_DELALLOC) == 0) &&
+	    (whichfork == XFS_DATA_FORK) &&
+	    (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size)) {
 		/* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */
 		error = xfs_flush_pages(ip, (xfs_off_t)0,
 					-1, 0, FI_REMAPF);
@@ -5884,7 +5967,8 @@ xfs_getbmap(
 		}
 	}
 
-	ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0);
+	ASSERT(whichfork == XFS_ATTR_FORK || (iflags & BMV_IF_DELALLOC) ||
+	       ip->i_delayed_blks == 0);
 
 	lock = xfs_ilock_map_shared(ip);
 
@@ -5896,7 +5980,7 @@ xfs_getbmap(
5896 nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1; 5980 nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
5897 5981
5898 bmapi_flags = XFS_BMAPI_AFLAG(whichfork) | 5982 bmapi_flags = XFS_BMAPI_AFLAG(whichfork) |
5899 ((sh_unwritten) ? 0 : XFS_BMAPI_IGSTATE); 5983 ((iflags & BMV_IF_PREALLOC) ? 0 : XFS_BMAPI_IGSTATE);
5900 5984
5901 /* 5985 /*
5902 * Allocate enough space to handle "subnex" maps at a time. 5986 * Allocate enough space to handle "subnex" maps at a time.
@@ -5906,9 +5990,12 @@ xfs_getbmap(
5906 5990
5907 bmv->bmv_entries = 0; 5991 bmv->bmv_entries = 0;
5908 5992
5909 if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0) { 5993 if ((XFS_IFORK_NEXTENTS(ip, whichfork) == 0)) {
5910 error = 0; 5994 if (((iflags & BMV_IF_DELALLOC) == 0) ||
5911 goto unlock_and_return; 5995 whichfork == XFS_ATTR_FORK) {
5996 error = 0;
5997 goto unlock_and_return;
5998 }
5912 } 5999 }
5913 6000
5914 nexleft = nex; 6001 nexleft = nex;
@@ -5924,52 +6011,40 @@ xfs_getbmap(
5924 ASSERT(nmap <= subnex); 6011 ASSERT(nmap <= subnex);
5925 6012
5926 for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) { 6013 for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
5927 nexleft--; 6014 out.bmv_oflags = 0;
5928 oflags = (map[i].br_state == XFS_EXT_UNWRITTEN) ? 6015 if (map[i].br_state == XFS_EXT_UNWRITTEN)
5929 BMV_OF_PREALLOC : 0; 6016 out.bmv_oflags |= BMV_OF_PREALLOC;
6017 else if (map[i].br_startblock == DELAYSTARTBLOCK)
6018 out.bmv_oflags |= BMV_OF_DELALLOC;
5930 out.bmv_offset = XFS_FSB_TO_BB(mp, map[i].br_startoff); 6019 out.bmv_offset = XFS_FSB_TO_BB(mp, map[i].br_startoff);
5931 out.bmv_length = XFS_FSB_TO_BB(mp, map[i].br_blockcount); 6020 out.bmv_length = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
5932 ASSERT(map[i].br_startblock != DELAYSTARTBLOCK); 6021 out.bmv_unused1 = out.bmv_unused2 = 0;
6022 ASSERT(((iflags & BMV_IF_DELALLOC) != 0) ||
6023 (map[i].br_startblock != DELAYSTARTBLOCK));
5933 if (map[i].br_startblock == HOLESTARTBLOCK && 6024 if (map[i].br_startblock == HOLESTARTBLOCK &&
5934 whichfork == XFS_ATTR_FORK) { 6025 whichfork == XFS_ATTR_FORK) {
5935 /* came to the end of attribute fork */ 6026 /* came to the end of attribute fork */
6027 out.bmv_oflags |= BMV_OF_LAST;
5936 goto unlock_and_return; 6028 goto unlock_and_return;
5937 } else { 6029 } else {
6030 int full = 0; /* user array is full */
6031
5938 if (!xfs_getbmapx_fix_eof_hole(ip, &out, 6032 if (!xfs_getbmapx_fix_eof_hole(ip, &out,
5939 prealloced, bmvend, 6033 prealloced, bmvend,
5940 map[i].br_startblock)) { 6034 map[i].br_startblock)) {
5941 goto unlock_and_return; 6035 goto unlock_and_return;
5942 } 6036 }
5943 6037
5944 /* return either getbmap/getbmapx structure. */ 6038 /* format results & advance arg */
5945 if (interface & BMV_IF_EXTENDED) { 6039 error = formatter(&arg, &out, &full);
5946 struct getbmapx outx; 6040 if (error || full)
5947 6041 goto unlock_and_return;
5948 GETBMAP_CONVERT(out,outx); 6042 nexleft--;
5949 outx.bmv_oflags = oflags;
5950 outx.bmv_unused1 = outx.bmv_unused2 = 0;
5951 if (copy_to_user(ap, &outx,
5952 sizeof(outx))) {
5953 error = XFS_ERROR(EFAULT);
5954 goto unlock_and_return;
5955 }
5956 } else {
5957 if (copy_to_user(ap, &out,
5958 sizeof(out))) {
5959 error = XFS_ERROR(EFAULT);
5960 goto unlock_and_return;
5961 }
5962 }
5963 bmv->bmv_offset = 6043 bmv->bmv_offset =
5964 out.bmv_offset + out.bmv_length; 6044 out.bmv_offset + out.bmv_length;
5965 bmv->bmv_length = MAX((__int64_t)0, 6045 bmv->bmv_length = MAX((__int64_t)0,
5966 (__int64_t)(bmvend - bmv->bmv_offset)); 6046 (__int64_t)(bmvend - bmv->bmv_offset));
5967 bmv->bmv_entries++; 6047 bmv->bmv_entries++;
5968 ap = (interface & BMV_IF_EXTENDED) ?
5969 (void __user *)
5970 ((struct getbmapx __user *)ap + 1) :
5971 (void __user *)
5972 ((struct getbmap __user *)ap + 1);
5973 } 6048 }
5974 } 6049 }
5975 } while (nmap && nexleft && bmv->bmv_length); 6050 } while (nmap && nexleft && bmv->bmv_length);
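/*
 * The rewritten loop above no longer branches on BMV_IF_EXTENDED and
 * calls copy_to_user() itself: each mapping is tagged with output
 * flags (BMV_OF_PREALLOC for unwritten extents, BMV_OF_DELALLOC for
 * delayed allocations) and handed to the caller-supplied formatter,
 * which owns the copy-out and reports through "full" when the
 * destination array is exhausted.  What follows is a minimal sketch
 * of such a formatter for old-style getbmap callers; the helper name
 * is invented for illustration and the usual kernel/XFS context
 * (uaccess.h, xfs_fs.h, XFS_ERROR()) is assumed, so treat it as a
 * sketch rather than a quote of the real ioctl code.
 */
STATIC int
xfs_getbmap_format_sketch(
	void			**ap,	/* cursor into the user array */
	struct getbmapx		*bmv,	/* one formatted mapping */
	int			*full)	/* a formatter with a bounded
					 * buffer would set this; unused */
{
	struct getbmap __user	*base = *ap;

	/*
	 * Old-style callers only want the getbmap prefix; getbmapx
	 * begins with the same fields, so copy just that much.
	 */
	if (copy_to_user(base, bmv, sizeof(struct getbmap)))
		return XFS_ERROR(EFAULT);

	/* advance the opaque cursor one entry for the next mapping */
	*ap = base + 1;
	return 0;
}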
@@ -6131,7 +6206,7 @@ xfs_bmap_get_bp(
6131 6206
6132void 6207void
6133xfs_check_block( 6208xfs_check_block(
6134 xfs_bmbt_block_t *block, 6209 struct xfs_btree_block *block,
6135 xfs_mount_t *mp, 6210 xfs_mount_t *mp,
6136 int root, 6211 int root,
6137 short sz) 6212 short sz)
@@ -6143,36 +6218,29 @@ xfs_check_block(
6143 ASSERT(be16_to_cpu(block->bb_level) > 0); 6218 ASSERT(be16_to_cpu(block->bb_level) > 0);
6144 6219
6145 prevp = NULL; 6220 prevp = NULL;
6146 for( i = 1; i <= be16_to_cpu(block->bb_numrecs); i++) { 6221 for( i = 1; i <= xfs_btree_get_numrecs(block); i++) {
6147 dmxr = mp->m_bmap_dmxr[0]; 6222 dmxr = mp->m_bmap_dmxr[0];
6148 6223 keyp = XFS_BMBT_KEY_ADDR(mp, block, i);
6149 if (root) {
6150 keyp = XFS_BMAP_BROOT_KEY_ADDR(block, i, sz);
6151 } else {
6152 keyp = XFS_BTREE_KEY_ADDR(xfs_bmbt, block, i);
6153 }
6154 6224
6155 if (prevp) { 6225 if (prevp) {
6156 xfs_btree_check_key(XFS_BTNUM_BMAP, prevp, keyp); 6226 ASSERT(be64_to_cpu(prevp->br_startoff) <
6227 be64_to_cpu(keyp->br_startoff));
6157 } 6228 }
6158 prevp = keyp; 6229 prevp = keyp;
6159 6230
6160 /* 6231 /*
6161 * Compare the block numbers to see if there are dups. 6232 * Compare the block numbers to see if there are dups.
6162 */ 6233 */
6234 if (root)
6235 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz);
6236 else
6237 pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr);
6163 6238
6164 if (root) {
6165 pp = XFS_BMAP_BROOT_PTR_ADDR(block, i, sz);
6166 } else {
6167 pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, i, dmxr);
6168 }
6169 for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) { 6239 for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) {
6170 if (root) { 6240 if (root)
6171 thispa = XFS_BMAP_BROOT_PTR_ADDR(block, j, sz); 6241 thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz);
6172 } else { 6242 else
6173 thispa = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, j, 6243 thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
6174 dmxr);
6175 }
6176 if (*thispa == *pp) { 6244 if (*thispa == *pp) {
6177 cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld", 6245 cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld",
6178 __func__, j, i, 6246 __func__, j, i,
@@ -6195,7 +6263,7 @@ xfs_bmap_check_leaf_extents(
6195 xfs_inode_t *ip, /* incore inode pointer */ 6263 xfs_inode_t *ip, /* incore inode pointer */
6196 int whichfork) /* data or attr fork */ 6264 int whichfork) /* data or attr fork */
6197{ 6265{
6198 xfs_bmbt_block_t *block; /* current btree block */ 6266 struct xfs_btree_block *block; /* current btree block */
6199 xfs_fsblock_t bno; /* block # of "block" */ 6267 xfs_fsblock_t bno; /* block # of "block" */
6200 xfs_buf_t *bp; /* buffer for "block" */ 6268 xfs_buf_t *bp; /* buffer for "block" */
6201 int error; /* error return value */ 6269 int error; /* error return value */
@@ -6223,7 +6291,7 @@ xfs_bmap_check_leaf_extents(
6223 level = be16_to_cpu(block->bb_level); 6291 level = be16_to_cpu(block->bb_level);
6224 ASSERT(level > 0); 6292 ASSERT(level > 0);
6225 xfs_check_block(block, mp, 1, ifp->if_broot_bytes); 6293 xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
6226 pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); 6294 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
6227 bno = be64_to_cpu(*pp); 6295 bno = be64_to_cpu(*pp);
6228 6296
6229 ASSERT(bno != NULLDFSBNO); 6297 ASSERT(bno != NULLDFSBNO);
@@ -6245,9 +6313,9 @@ xfs_bmap_check_leaf_extents(
6245 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, 6313 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
6246 XFS_BMAP_BTREE_REF))) 6314 XFS_BMAP_BTREE_REF)))
6247 goto error_norelse; 6315 goto error_norelse;
6248 block = XFS_BUF_TO_BMBT_BLOCK(bp); 6316 block = XFS_BUF_TO_BLOCK(bp);
6249 XFS_WANT_CORRUPTED_GOTO( 6317 XFS_WANT_CORRUPTED_GOTO(
6250 XFS_BMAP_SANITY_CHECK(mp, block, level), 6318 xfs_bmap_sanity_check(mp, bp, level),
6251 error0); 6319 error0);
6252 if (level == 0) 6320 if (level == 0)
6253 break; 6321 break;
@@ -6258,7 +6326,7 @@ xfs_bmap_check_leaf_extents(
6258 */ 6326 */
6259 6327
6260 xfs_check_block(block, mp, 0, 0); 6328 xfs_check_block(block, mp, 0, 0);
6261 pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]); 6329 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
6262 bno = be64_to_cpu(*pp); 6330 bno = be64_to_cpu(*pp);
6263 XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); 6331 XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
6264 if (bp_release) { 6332 if (bp_release) {
@@ -6280,13 +6348,13 @@ xfs_bmap_check_leaf_extents(
6280 xfs_extnum_t num_recs; 6348 xfs_extnum_t num_recs;
6281 6349
6282 6350
6283 num_recs = be16_to_cpu(block->bb_numrecs); 6351 num_recs = xfs_btree_get_numrecs(block);
6284 6352
6285 /* 6353 /*
6286 * Read-ahead the next leaf block, if any. 6354 * Read-ahead the next leaf block, if any.
6287 */ 6355 */
6288 6356
6289 nextbno = be64_to_cpu(block->bb_rightsib); 6357 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
6290 6358
6291 /* 6359 /*
6292 * Check all the extents to make sure they are OK. 6360 * Check all the extents to make sure they are OK.
@@ -6294,13 +6362,17 @@ xfs_bmap_check_leaf_extents(
6294 * conform with the first entry in this one. 6362 * conform with the first entry in this one.
6295 */ 6363 */
6296 6364
6297 ep = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1); 6365 ep = XFS_BMBT_REC_ADDR(mp, block, 1);
6298 if (i) { 6366 if (i) {
6299 xfs_btree_check_rec(XFS_BTNUM_BMAP, &last, ep); 6367 ASSERT(xfs_bmbt_disk_get_startoff(&last) +
6368 xfs_bmbt_disk_get_blockcount(&last) <=
6369 xfs_bmbt_disk_get_startoff(ep));
6300 } 6370 }
6301 for (j = 1; j < num_recs; j++) { 6371 for (j = 1; j < num_recs; j++) {
6302 nextp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, j + 1); 6372 nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1);
6303 xfs_btree_check_rec(XFS_BTNUM_BMAP, ep, nextp); 6373 ASSERT(xfs_bmbt_disk_get_startoff(ep) +
6374 xfs_bmbt_disk_get_blockcount(ep) <=
6375 xfs_bmbt_disk_get_startoff(nextp));
6304 ep = nextp; 6376 ep = nextp;
6305 } 6377 }
6306 6378
@@ -6326,7 +6398,7 @@ xfs_bmap_check_leaf_extents(
6326 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, 6398 if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
6327 XFS_BMAP_BTREE_REF))) 6399 XFS_BMAP_BTREE_REF)))
6328 goto error_norelse; 6400 goto error_norelse;
6329 block = XFS_BUF_TO_BMBT_BLOCK(bp); 6401 block = XFS_BUF_TO_BLOCK(bp);
6330 } 6402 }
6331 if (bp_release) { 6403 if (bp_release) {
6332 bp_release = 0; 6404 bp_release = 0;
@@ -6356,7 +6428,7 @@ xfs_bmap_count_blocks(
6356 int whichfork, /* data or attr fork */ 6428 int whichfork, /* data or attr fork */
6357 int *count) /* out: count of blocks */ 6429 int *count) /* out: count of blocks */
6358{ 6430{
6359 xfs_bmbt_block_t *block; /* current btree block */ 6431 struct xfs_btree_block *block; /* current btree block */
6360 xfs_fsblock_t bno; /* block # of "block" */ 6432 xfs_fsblock_t bno; /* block # of "block" */
6361 xfs_ifork_t *ifp; /* fork structure */ 6433 xfs_ifork_t *ifp; /* fork structure */
6362 int level; /* btree level, for checking */ 6434 int level; /* btree level, for checking */
@@ -6379,7 +6451,7 @@ xfs_bmap_count_blocks(
6379 block = ifp->if_broot; 6451 block = ifp->if_broot;
6380 level = be16_to_cpu(block->bb_level); 6452 level = be16_to_cpu(block->bb_level);
6381 ASSERT(level > 0); 6453 ASSERT(level > 0);
6382 pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); 6454 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
6383 bno = be64_to_cpu(*pp); 6455 bno = be64_to_cpu(*pp);
6384 ASSERT(bno != NULLDFSBNO); 6456 ASSERT(bno != NULLDFSBNO);
6385 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); 6457 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
@@ -6413,29 +6485,29 @@ xfs_bmap_count_tree(
6413 __be64 *pp; 6485 __be64 *pp;
6414 xfs_fsblock_t bno = blockno; 6486 xfs_fsblock_t bno = blockno;
6415 xfs_fsblock_t nextbno; 6487 xfs_fsblock_t nextbno;
6416 xfs_bmbt_block_t *block, *nextblock; 6488 struct xfs_btree_block *block, *nextblock;
6417 int numrecs; 6489 int numrecs;
6418 6490
6419 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF))) 6491 if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF)))
6420 return error; 6492 return error;
6421 *count += 1; 6493 *count += 1;
6422 block = XFS_BUF_TO_BMBT_BLOCK(bp); 6494 block = XFS_BUF_TO_BLOCK(bp);
6423 6495
6424 if (--level) { 6496 if (--level) {
6425 /* Not at node above leaves, count this level of nodes */ 6497 /* Not at node above leaves, count this level of nodes */
6426 nextbno = be64_to_cpu(block->bb_rightsib); 6498 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
6427 while (nextbno != NULLFSBLOCK) { 6499 while (nextbno != NULLFSBLOCK) {
6428 if ((error = xfs_btree_read_bufl(mp, tp, nextbno, 6500 if ((error = xfs_btree_read_bufl(mp, tp, nextbno,
6429 0, &nbp, XFS_BMAP_BTREE_REF))) 6501 0, &nbp, XFS_BMAP_BTREE_REF)))
6430 return error; 6502 return error;
6431 *count += 1; 6503 *count += 1;
6432 nextblock = XFS_BUF_TO_BMBT_BLOCK(nbp); 6504 nextblock = XFS_BUF_TO_BLOCK(nbp);
6433 nextbno = be64_to_cpu(nextblock->bb_rightsib); 6505 nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
6434 xfs_trans_brelse(tp, nbp); 6506 xfs_trans_brelse(tp, nbp);
6435 } 6507 }
6436 6508
6437 /* Dive to the next level */ 6509 /* Dive to the next level */
6438 pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]); 6510 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
6439 bno = be64_to_cpu(*pp); 6511 bno = be64_to_cpu(*pp);
6440 if (unlikely((error = 6512 if (unlikely((error =
6441 xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) { 6513 xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
@@ -6448,9 +6520,9 @@ xfs_bmap_count_tree(
6448 } else { 6520 } else {
6449 /* count all level 1 nodes and their leaves */ 6521 /* count all level 1 nodes and their leaves */
6450 for (;;) { 6522 for (;;) {
6451 nextbno = be64_to_cpu(block->bb_rightsib); 6523 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
6452 numrecs = be16_to_cpu(block->bb_numrecs); 6524 numrecs = be16_to_cpu(block->bb_numrecs);
6453 xfs_bmap_disk_count_leaves(0, block, numrecs, count); 6525 xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
6454 xfs_trans_brelse(tp, bp); 6526 xfs_trans_brelse(tp, bp);
6455 if (nextbno == NULLFSBLOCK) 6527 if (nextbno == NULLFSBLOCK)
6456 break; 6528 break;
@@ -6459,7 +6531,7 @@ xfs_bmap_count_tree(
6459 XFS_BMAP_BTREE_REF))) 6531 XFS_BMAP_BTREE_REF)))
6460 return error; 6532 return error;
6461 *count += 1; 6533 *count += 1;
6462 block = XFS_BUF_TO_BMBT_BLOCK(bp); 6534 block = XFS_BUF_TO_BLOCK(bp);
6463 } 6535 }
6464 } 6536 }
6465 return 0; 6537 return 0;
@@ -6489,8 +6561,8 @@ xfs_bmap_count_leaves(
6489 */ 6561 */
6490STATIC void 6562STATIC void
6491xfs_bmap_disk_count_leaves( 6563xfs_bmap_disk_count_leaves(
6492 xfs_extnum_t idx, 6564 struct xfs_mount *mp,
6493 xfs_bmbt_block_t *block, 6565 struct xfs_btree_block *block,
6494 int numrecs, 6566 int numrecs,
6495 int *count) 6567 int *count)
6496{ 6568{
@@ -6498,7 +6570,7 @@ xfs_bmap_disk_count_leaves(
6498 xfs_bmbt_rec_t *frp; 6570 xfs_bmbt_rec_t *frp;
6499 6571
6500 for (b = 1; b <= numrecs; b++) { 6572 for (b = 1; b <= numrecs; b++) {
6501 frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, idx + b); 6573 frp = XFS_BMBT_REC_ADDR(mp, block, b);
6502 *count += xfs_bmbt_disk_get_blockcount(frp); 6574 *count += xfs_bmbt_disk_get_blockcount(frp);
6503 } 6575 }
6504} 6576}
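/*
 * The mechanical renames running through the hunks above, from
 * xfs_bmbt_block_t to struct xfs_btree_block, XFS_BUF_TO_BMBT_BLOCK()
 * to XFS_BUF_TO_BLOCK(), and bb_rightsib to bb_u.l.bb_rightsib, all
 * fall out of the generic btree rework: every XFS btree now shares a
 * single block header whose sibling pointers live in a union, short
 * (32-bit, per-AG btrees) or long (64-bit, bmap btree) form.  The
 * simplified sketch below conveys the shape; the authoritative
 * definition lives in fs/xfs/xfs_btree.h.
 */
struct xfs_btree_block {
	__be32		bb_magic;	/* magic number for block type */
	__be16		bb_level;	/* 0 is a leaf */
	__be16		bb_numrecs;	/* current # of data records */
	union {
		struct {		/* short form, per-AG btrees */
			__be32	bb_leftsib;
			__be32	bb_rightsib;
		} s;
		struct {		/* long form, bmap btree */
			__be64	bb_leftsib;
			__be64	bb_rightsib;
		} l;
	} bb_u;
};
/*
 * That union is why the walkers above now read
 * be64_to_cpu(block->bb_u.l.bb_rightsib) instead of a bmap-specific
 * sibling field.
 */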
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 9f3e3a836d15..284571c05ed0 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -137,9 +137,7 @@ typedef struct xfs_bmalloca {
137 char conv; /* overwriting unwritten extents */ 137 char conv; /* overwriting unwritten extents */
138} xfs_bmalloca_t; 138} xfs_bmalloca_t;
139 139
140#ifdef __KERNEL__ 140#if defined(__KERNEL__) && defined(XFS_BMAP_TRACE)
141
142#if defined(XFS_BMAP_TRACE)
143/* 141/*
144 * Trace operations for bmap extent tracing 142 * Trace operations for bmap extent tracing
145 */ 143 */
@@ -163,9 +161,12 @@ xfs_bmap_trace_exlist(
163 int whichfork); /* data or attr fork */ 161 int whichfork); /* data or attr fork */
164#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ 162#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \
165 xfs_bmap_trace_exlist(__func__,ip,c,w) 163 xfs_bmap_trace_exlist(__func__,ip,c,w)
166#else 164
165#else /* __KERNEL__ && XFS_BMAP_TRACE */
166
167#define XFS_BMAP_TRACE_EXLIST(ip,c,w) 167#define XFS_BMAP_TRACE_EXLIST(ip,c,w)
168#endif 168
169#endif /* __KERNEL__ && XFS_BMAP_TRACE */
169 170
170/* 171/*
171 * Convert inode from non-attributed to attributed. 172 * Convert inode from non-attributed to attributed.
@@ -206,20 +207,6 @@ xfs_bmap_compute_maxlevels(
206 int whichfork); /* data or attr fork */ 207 int whichfork); /* data or attr fork */
207 208
208/* 209/*
209 * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
210 * caller. Frees all the extents that need freeing, which must be done
211 * last due to locking considerations.
212 *
213 * Return 1 if the given transaction was committed and a new one allocated,
214 * and 0 otherwise.
215 */
216int /* error */
217xfs_bmap_finish(
218 struct xfs_trans **tp, /* transaction pointer addr */
219 xfs_bmap_free_t *flist, /* i/o: list extents to free */
220 int *committed); /* xact committed or not */
221
222/*
223 * Returns the file-relative block number of the first unused block in the file. 210 * Returns the file-relative block number of the first unused block in the file.
224 * This is the lowest-address hole if the file has holes, else the first block 211 * This is the lowest-address hole if the file has holes, else the first block
225 * past the end of file. 212 * past the end of file.
@@ -344,14 +331,43 @@ xfs_bunmapi(
344 int *done); /* set if not done yet */ 331 int *done); /* set if not done yet */
345 332
346/* 333/*
347 * Fcntl interface to xfs_bmapi. 334 * Check an extent list, which has just been read, for
335 * any bit in the extent flag field.
336 */
337int
338xfs_check_nostate_extents(
339 struct xfs_ifork *ifp,
340 xfs_extnum_t idx,
341 xfs_extnum_t num);
342
343#ifdef __KERNEL__
344
345/*
346 * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
347 * caller. Frees all the extents that need freeing, which must be done
348 * last due to locking considerations.
349 *
350 * Return 1 if the given transaction was committed and a new one allocated,
351 * and 0 otherwise.
352 */
353int /* error */
354xfs_bmap_finish(
355 struct xfs_trans **tp, /* transaction pointer addr */
356 xfs_bmap_free_t *flist, /* i/o: list extents to free */
357 int *committed); /* xact committed or not */
358
359/* bmap to userspace formatter - copy to user & advance pointer */
360typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *);
361
362/*
363 * Get inode's extents as described in bmv, and format for output.
348 */ 364 */
349int /* error code */ 365int /* error code */
350xfs_getbmap( 366xfs_getbmap(
351 xfs_inode_t *ip, 367 xfs_inode_t *ip,
352 struct getbmap *bmv, /* user bmap structure */ 368 struct getbmapx *bmv, /* user bmap structure */
353 void __user *ap, /* pointer to user's array */ 369 xfs_bmap_format_t formatter, /* format to user */
354 int iflags); /* interface flags */ 370 void *arg); /* formatter arg */
355 371
356/* 372/*
357 * Check if the endoff is outside the last extent. If so the caller will grow 373 * Check if the endoff is outside the last extent. If so the caller will grow
@@ -375,16 +391,6 @@ xfs_bmap_count_blocks(
375 int *count); 391 int *count);
376 392
377/* 393/*
378 * Check an extent list, which has just been read, for
379 * any bit in the extent flag field.
380 */
381int
382xfs_check_nostate_extents(
383 struct xfs_ifork *ifp,
384 xfs_extnum_t idx,
385 xfs_extnum_t num);
386
387/*
388 * Search the extent records for the entry containing block bno. 394 * Search the extent records for the entry containing block bno.
389 * If bno lies in a hole, point to the next entry. If bno lies 395 * If bno lies in a hole, point to the next entry. If bno lies
390 * past eof, *eofp will be set, and *prevp will contain the last 396 * past eof, *eofp will be set, and *prevp will contain the last
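/*
 * With xfs_bmap_finish(), the formatter typedef and xfs_getbmap()
 * now fenced behind #ifdef __KERNEL__, userspace consumers of this
 * header (xfsprogs shares it) see only declarations that make sense
 * outside the kernel, such as xfs_check_nostate_extents().  Below is
 * a sketch of an ioctl-path caller under the new xfs_getbmap()
 * signature; the demo name is invented, locking, copy-in of the user
 * header and error handling are elided, and the formatter is the
 * illustrative one sketched after the xfs_bmap.c hunks above.
 */
STATIC int
xfs_getbmap_demo(
	xfs_inode_t		*ip,
	struct getbmapx		*bmv,	/* header already copied in */
	struct getbmap __user	*ua)	/* start of the user's array */
{
	/*
	 * xfs_getbmap() keeps the cursor in its "arg" parameter and
	 * passes &arg to the formatter for every mapping it emits.
	 */
	return xfs_getbmap(ip, bmv, xfs_getbmap_format_sketch,
			   (__force void *)ua);
}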
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 23efad29a5cd..8f1ec73725d3 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -37,1406 +37,13 @@
37#include "xfs_inode_item.h" 37#include "xfs_inode_item.h"
38#include "xfs_alloc.h" 38#include "xfs_alloc.h"
39#include "xfs_btree.h" 39#include "xfs_btree.h"
40#include "xfs_btree_trace.h"
40#include "xfs_ialloc.h" 41#include "xfs_ialloc.h"
41#include "xfs_itable.h" 42#include "xfs_itable.h"
42#include "xfs_bmap.h" 43#include "xfs_bmap.h"
43#include "xfs_error.h" 44#include "xfs_error.h"
44#include "xfs_quota.h" 45#include "xfs_quota.h"
45 46
46#if defined(XFS_BMBT_TRACE)
47ktrace_t *xfs_bmbt_trace_buf;
48#endif
49
50/*
51 * Prototypes for internal btree functions.
52 */
53
54
55STATIC int xfs_bmbt_killroot(xfs_btree_cur_t *);
56STATIC void xfs_bmbt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
57STATIC void xfs_bmbt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
58STATIC int xfs_bmbt_lshift(xfs_btree_cur_t *, int, int *);
59STATIC int xfs_bmbt_rshift(xfs_btree_cur_t *, int, int *);
60STATIC int xfs_bmbt_split(xfs_btree_cur_t *, int, xfs_fsblock_t *,
61 __uint64_t *, xfs_btree_cur_t **, int *);
62STATIC int xfs_bmbt_updkey(xfs_btree_cur_t *, xfs_bmbt_key_t *, int);
63
64
65#if defined(XFS_BMBT_TRACE)
66
67static char ARGS[] = "args";
68static char ENTRY[] = "entry";
69static char ERROR[] = "error";
70#undef EXIT
71static char EXIT[] = "exit";
72
73/*
74 * Add a trace buffer entry for the arguments given to the routine,
75 * generic form.
76 */
77STATIC void
78xfs_bmbt_trace_enter(
79 const char *func,
80 xfs_btree_cur_t *cur,
81 char *s,
82 int type,
83 int line,
84 __psunsigned_t a0,
85 __psunsigned_t a1,
86 __psunsigned_t a2,
87 __psunsigned_t a3,
88 __psunsigned_t a4,
89 __psunsigned_t a5,
90 __psunsigned_t a6,
91 __psunsigned_t a7,
92 __psunsigned_t a8,
93 __psunsigned_t a9,
94 __psunsigned_t a10)
95{
96 xfs_inode_t *ip;
97 int whichfork;
98
99 ip = cur->bc_private.b.ip;
100 whichfork = cur->bc_private.b.whichfork;
101 ktrace_enter(xfs_bmbt_trace_buf,
102 (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
103 (void *)func, (void *)s, (void *)ip, (void *)cur,
104 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
105 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
106 (void *)a8, (void *)a9, (void *)a10);
107 ASSERT(ip->i_btrace);
108 ktrace_enter(ip->i_btrace,
109 (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
110 (void *)func, (void *)s, (void *)ip, (void *)cur,
111 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
112 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
113 (void *)a8, (void *)a9, (void *)a10);
114}
115/*
116 * Add a trace buffer entry for arguments, for a buffer & 1 integer arg.
117 */
118STATIC void
119xfs_bmbt_trace_argbi(
120 const char *func,
121 xfs_btree_cur_t *cur,
122 xfs_buf_t *b,
123 int i,
124 int line)
125{
126 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBI, line,
127 (__psunsigned_t)b, i, 0, 0,
128 0, 0, 0, 0,
129 0, 0, 0);
130}
131
132/*
133 * Add a trace buffer entry for arguments, for a buffer & 2 integer args.
134 */
135STATIC void
136xfs_bmbt_trace_argbii(
137 const char *func,
138 xfs_btree_cur_t *cur,
139 xfs_buf_t *b,
140 int i0,
141 int i1,
142 int line)
143{
144 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBII, line,
145 (__psunsigned_t)b, i0, i1, 0,
146 0, 0, 0, 0,
147 0, 0, 0);
148}
149
150/*
151 * Add a trace buffer entry for arguments, for 3 block-length args
152 * and an integer arg.
153 */
154STATIC void
155xfs_bmbt_trace_argfffi(
156 const char *func,
157 xfs_btree_cur_t *cur,
158 xfs_dfiloff_t o,
159 xfs_dfsbno_t b,
160 xfs_dfilblks_t i,
161 int j,
162 int line)
163{
164 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGFFFI, line,
165 o >> 32, (int)o, b >> 32, (int)b,
166 i >> 32, (int)i, (int)j, 0,
167 0, 0, 0);
168}
169
170/*
171 * Add a trace buffer entry for arguments, for one integer arg.
172 */
173STATIC void
174xfs_bmbt_trace_argi(
175 const char *func,
176 xfs_btree_cur_t *cur,
177 int i,
178 int line)
179{
180 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGI, line,
181 i, 0, 0, 0,
182 0, 0, 0, 0,
183 0, 0, 0);
184}
185
186/*
187 * Add a trace buffer entry for arguments, for int, fsblock, key.
188 */
189STATIC void
190xfs_bmbt_trace_argifk(
191 const char *func,
192 xfs_btree_cur_t *cur,
193 int i,
194 xfs_fsblock_t f,
195 xfs_dfiloff_t o,
196 int line)
197{
198 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line,
199 i, (xfs_dfsbno_t)f >> 32, (int)f, o >> 32,
200 (int)o, 0, 0, 0,
201 0, 0, 0);
202}
203
204/*
205 * Add a trace buffer entry for arguments, for int, fsblock, rec.
206 */
207STATIC void
208xfs_bmbt_trace_argifr(
209 const char *func,
210 xfs_btree_cur_t *cur,
211 int i,
212 xfs_fsblock_t f,
213 xfs_bmbt_rec_t *r,
214 int line)
215{
216 xfs_dfsbno_t b;
217 xfs_dfilblks_t c;
218 xfs_dfsbno_t d;
219 xfs_dfiloff_t o;
220 xfs_bmbt_irec_t s;
221
222 d = (xfs_dfsbno_t)f;
223 xfs_bmbt_disk_get_all(r, &s);
224 o = (xfs_dfiloff_t)s.br_startoff;
225 b = (xfs_dfsbno_t)s.br_startblock;
226 c = s.br_blockcount;
227 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFR, line,
228 i, d >> 32, (int)d, o >> 32,
229 (int)o, b >> 32, (int)b, c >> 32,
230 (int)c, 0, 0);
231}
232
233/*
234 * Add a trace buffer entry for arguments, for int, key.
235 */
236STATIC void
237xfs_bmbt_trace_argik(
238 const char *func,
239 xfs_btree_cur_t *cur,
240 int i,
241 xfs_bmbt_key_t *k,
242 int line)
243{
244 xfs_dfiloff_t o;
245
246 o = be64_to_cpu(k->br_startoff);
247 xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line,
248 i, o >> 32, (int)o, 0,
249 0, 0, 0, 0,
250 0, 0, 0);
251}
252
253/*
254 * Add a trace buffer entry for the cursor/operation.
255 */
256STATIC void
257xfs_bmbt_trace_cursor(
258 const char *func,
259 xfs_btree_cur_t *cur,
260 char *s,
261 int line)
262{
263 xfs_bmbt_rec_host_t r;
264
265 xfs_bmbt_set_all(&r, &cur->bc_rec.b);
266 xfs_bmbt_trace_enter(func, cur, s, XFS_BMBT_KTRACE_CUR, line,
267 (cur->bc_nlevels << 24) | (cur->bc_private.b.flags << 16) |
268 cur->bc_private.b.allocated,
269 r.l0 >> 32, (int)r.l0,
270 r.l1 >> 32, (int)r.l1,
271 (unsigned long)cur->bc_bufs[0], (unsigned long)cur->bc_bufs[1],
272 (unsigned long)cur->bc_bufs[2], (unsigned long)cur->bc_bufs[3],
273 (cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1],
274 (cur->bc_ptrs[2] << 16) | cur->bc_ptrs[3]);
275}
276
277#define XFS_BMBT_TRACE_ARGBI(c,b,i) \
278 xfs_bmbt_trace_argbi(__func__, c, b, i, __LINE__)
279#define XFS_BMBT_TRACE_ARGBII(c,b,i,j) \
280 xfs_bmbt_trace_argbii(__func__, c, b, i, j, __LINE__)
281#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j) \
282 xfs_bmbt_trace_argfffi(__func__, c, o, b, i, j, __LINE__)
283#define XFS_BMBT_TRACE_ARGI(c,i) \
284 xfs_bmbt_trace_argi(__func__, c, i, __LINE__)
285#define XFS_BMBT_TRACE_ARGIFK(c,i,f,s) \
286 xfs_bmbt_trace_argifk(__func__, c, i, f, s, __LINE__)
287#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) \
288 xfs_bmbt_trace_argifr(__func__, c, i, f, r, __LINE__)
289#define XFS_BMBT_TRACE_ARGIK(c,i,k) \
290 xfs_bmbt_trace_argik(__func__, c, i, k, __LINE__)
291#define XFS_BMBT_TRACE_CURSOR(c,s) \
292 xfs_bmbt_trace_cursor(__func__, c, s, __LINE__)
293#else
294#define XFS_BMBT_TRACE_ARGBI(c,b,i)
295#define XFS_BMBT_TRACE_ARGBII(c,b,i,j)
296#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j)
297#define XFS_BMBT_TRACE_ARGI(c,i)
298#define XFS_BMBT_TRACE_ARGIFK(c,i,f,s)
299#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r)
300#define XFS_BMBT_TRACE_ARGIK(c,i,k)
301#define XFS_BMBT_TRACE_CURSOR(c,s)
302#endif /* XFS_BMBT_TRACE */
303
304
305/*
306 * Internal functions.
307 */
308
309/*
310 * Delete record pointed to by cur/level.
311 */
312STATIC int /* error */
313xfs_bmbt_delrec(
314 xfs_btree_cur_t *cur,
315 int level,
316 int *stat) /* success/failure */
317{
318 xfs_bmbt_block_t *block; /* bmap btree block */
319 xfs_fsblock_t bno; /* fs-relative block number */
320 xfs_buf_t *bp; /* buffer for block */
321 int error; /* error return value */
322 int i; /* loop counter */
323 int j; /* temp state */
324 xfs_bmbt_key_t key; /* bmap btree key */
325 xfs_bmbt_key_t *kp=NULL; /* pointer to bmap btree key */
326 xfs_fsblock_t lbno; /* left sibling block number */
327 xfs_buf_t *lbp; /* left buffer pointer */
328 xfs_bmbt_block_t *left; /* left btree block */
329 xfs_bmbt_key_t *lkp; /* left btree key */
330 xfs_bmbt_ptr_t *lpp; /* left address pointer */
331 int lrecs=0; /* left record count */
332 xfs_bmbt_rec_t *lrp; /* left record pointer */
333 xfs_mount_t *mp; /* file system mount point */
334 xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */
335 int ptr; /* key/record index */
336 xfs_fsblock_t rbno; /* right sibling block number */
337 xfs_buf_t *rbp; /* right buffer pointer */
338 xfs_bmbt_block_t *right; /* right btree block */
339 xfs_bmbt_key_t *rkp; /* right btree key */
340 xfs_bmbt_rec_t *rp; /* pointer to bmap btree rec */
341 xfs_bmbt_ptr_t *rpp; /* right address pointer */
342 xfs_bmbt_block_t *rrblock; /* right-right btree block */
343 xfs_buf_t *rrbp; /* right-right buffer pointer */
344 int rrecs=0; /* right record count */
345 xfs_bmbt_rec_t *rrp; /* right record pointer */
346 xfs_btree_cur_t *tcur; /* temporary btree cursor */
347 int numrecs; /* temporary numrec count */
348 int numlrecs, numrrecs;
349
350 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
351 XFS_BMBT_TRACE_ARGI(cur, level);
352 ptr = cur->bc_ptrs[level];
353 tcur = NULL;
354 if (ptr == 0) {
355 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
356 *stat = 0;
357 return 0;
358 }
359 block = xfs_bmbt_get_block(cur, level, &bp);
360 numrecs = be16_to_cpu(block->bb_numrecs);
361#ifdef DEBUG
362 if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
363 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
364 goto error0;
365 }
366#endif
367 if (ptr > numrecs) {
368 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
369 *stat = 0;
370 return 0;
371 }
372 XFS_STATS_INC(xs_bmbt_delrec);
373 if (level > 0) {
374 kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
375 pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
376#ifdef DEBUG
377 for (i = ptr; i < numrecs; i++) {
378 if ((error = xfs_btree_check_lptr_disk(cur, pp[i], level))) {
379 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
380 goto error0;
381 }
382 }
383#endif
384 if (ptr < numrecs) {
385 memmove(&kp[ptr - 1], &kp[ptr],
386 (numrecs - ptr) * sizeof(*kp));
387 memmove(&pp[ptr - 1], &pp[ptr],
388 (numrecs - ptr) * sizeof(*pp));
389 xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs - 1);
390 xfs_bmbt_log_keys(cur, bp, ptr, numrecs - 1);
391 }
392 } else {
393 rp = XFS_BMAP_REC_IADDR(block, 1, cur);
394 if (ptr < numrecs) {
395 memmove(&rp[ptr - 1], &rp[ptr],
396 (numrecs - ptr) * sizeof(*rp));
397 xfs_bmbt_log_recs(cur, bp, ptr, numrecs - 1);
398 }
399 if (ptr == 1) {
400 key.br_startoff =
401 cpu_to_be64(xfs_bmbt_disk_get_startoff(rp));
402 kp = &key;
403 }
404 }
405 numrecs--;
406 block->bb_numrecs = cpu_to_be16(numrecs);
407 xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS);
408 /*
409 * We're at the root level.
410 * First, shrink the root block in-memory.
411 * Try to get rid of the next level down.
412 * If we can't then there's nothing left to do.
413 */
414 if (level == cur->bc_nlevels - 1) {
415 xfs_iroot_realloc(cur->bc_private.b.ip, -1,
416 cur->bc_private.b.whichfork);
417 if ((error = xfs_bmbt_killroot(cur))) {
418 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
419 goto error0;
420 }
421 if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) {
422 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
423 goto error0;
424 }
425 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
426 *stat = 1;
427 return 0;
428 }
429 if (ptr == 1 && (error = xfs_bmbt_updkey(cur, kp, level + 1))) {
430 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
431 goto error0;
432 }
433 if (numrecs >= XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
434 if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) {
435 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
436 goto error0;
437 }
438 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
439 *stat = 1;
440 return 0;
441 }
442 rbno = be64_to_cpu(block->bb_rightsib);
443 lbno = be64_to_cpu(block->bb_leftsib);
444 /*
445 * One child of root, need to get a chance to copy its contents
446 * into the root and delete it. Can't go up to next level,
447 * there's nothing to delete there.
448 */
449 if (lbno == NULLFSBLOCK && rbno == NULLFSBLOCK &&
450 level == cur->bc_nlevels - 2) {
451 if ((error = xfs_bmbt_killroot(cur))) {
452 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
453 goto error0;
454 }
455 if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) {
456 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
457 goto error0;
458 }
459 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
460 *stat = 1;
461 return 0;
462 }
463 ASSERT(rbno != NULLFSBLOCK || lbno != NULLFSBLOCK);
464 if ((error = xfs_btree_dup_cursor(cur, &tcur))) {
465 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
466 goto error0;
467 }
468 bno = NULLFSBLOCK;
469 if (rbno != NULLFSBLOCK) {
470 i = xfs_btree_lastrec(tcur, level);
471 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
472 if ((error = xfs_bmbt_increment(tcur, level, &i))) {
473 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
474 goto error0;
475 }
476 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
477 i = xfs_btree_lastrec(tcur, level);
478 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
479 rbp = tcur->bc_bufs[level];
480 right = XFS_BUF_TO_BMBT_BLOCK(rbp);
481#ifdef DEBUG
482 if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
483 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
484 goto error0;
485 }
486#endif
487 bno = be64_to_cpu(right->bb_leftsib);
488 if (be16_to_cpu(right->bb_numrecs) - 1 >=
489 XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
490 if ((error = xfs_bmbt_lshift(tcur, level, &i))) {
491 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
492 goto error0;
493 }
494 if (i) {
495 ASSERT(be16_to_cpu(block->bb_numrecs) >=
496 XFS_BMAP_BLOCK_IMINRECS(level, tcur));
497 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
498 tcur = NULL;
499 if (level > 0) {
500 if ((error = xfs_bmbt_decrement(cur,
501 level, &i))) {
502 XFS_BMBT_TRACE_CURSOR(cur,
503 ERROR);
504 goto error0;
505 }
506 }
507 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
508 *stat = 1;
509 return 0;
510 }
511 }
512 rrecs = be16_to_cpu(right->bb_numrecs);
513 if (lbno != NULLFSBLOCK) {
514 i = xfs_btree_firstrec(tcur, level);
515 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
516 if ((error = xfs_bmbt_decrement(tcur, level, &i))) {
517 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
518 goto error0;
519 }
520 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
521 }
522 }
523 if (lbno != NULLFSBLOCK) {
524 i = xfs_btree_firstrec(tcur, level);
525 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
526 /*
527 * decrement to last in block
528 */
529 if ((error = xfs_bmbt_decrement(tcur, level, &i))) {
530 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
531 goto error0;
532 }
533 i = xfs_btree_firstrec(tcur, level);
534 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
535 lbp = tcur->bc_bufs[level];
536 left = XFS_BUF_TO_BMBT_BLOCK(lbp);
537#ifdef DEBUG
538 if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
539 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
540 goto error0;
541 }
542#endif
543 bno = be64_to_cpu(left->bb_rightsib);
544 if (be16_to_cpu(left->bb_numrecs) - 1 >=
545 XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
546 if ((error = xfs_bmbt_rshift(tcur, level, &i))) {
547 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
548 goto error0;
549 }
550 if (i) {
551 ASSERT(be16_to_cpu(block->bb_numrecs) >=
552 XFS_BMAP_BLOCK_IMINRECS(level, tcur));
553 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
554 tcur = NULL;
555 if (level == 0)
556 cur->bc_ptrs[0]++;
557 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
558 *stat = 1;
559 return 0;
560 }
561 }
562 lrecs = be16_to_cpu(left->bb_numrecs);
563 }
564 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
565 tcur = NULL;
566 mp = cur->bc_mp;
567 ASSERT(bno != NULLFSBLOCK);
568 if (lbno != NULLFSBLOCK &&
569 lrecs + be16_to_cpu(block->bb_numrecs) <= XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
570 rbno = bno;
571 right = block;
572 rbp = bp;
573 if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, lbno, 0, &lbp,
574 XFS_BMAP_BTREE_REF))) {
575 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
576 goto error0;
577 }
578 left = XFS_BUF_TO_BMBT_BLOCK(lbp);
579 if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
580 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
581 goto error0;
582 }
583 } else if (rbno != NULLFSBLOCK &&
584 rrecs + be16_to_cpu(block->bb_numrecs) <=
585 XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
586 lbno = bno;
587 left = block;
588 lbp = bp;
589 if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, rbno, 0, &rbp,
590 XFS_BMAP_BTREE_REF))) {
591 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
592 goto error0;
593 }
594 right = XFS_BUF_TO_BMBT_BLOCK(rbp);
595 if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
596 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
597 goto error0;
598 }
599 lrecs = be16_to_cpu(left->bb_numrecs);
600 } else {
601 if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) {
602 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
603 goto error0;
604 }
605 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
606 *stat = 1;
607 return 0;
608 }
609 numlrecs = be16_to_cpu(left->bb_numrecs);
610 numrrecs = be16_to_cpu(right->bb_numrecs);
611 if (level > 0) {
612 lkp = XFS_BMAP_KEY_IADDR(left, numlrecs + 1, cur);
613 lpp = XFS_BMAP_PTR_IADDR(left, numlrecs + 1, cur);
614 rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
615 rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
616#ifdef DEBUG
617 for (i = 0; i < numrrecs; i++) {
618 if ((error = xfs_btree_check_lptr_disk(cur, rpp[i], level))) {
619 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
620 goto error0;
621 }
622 }
623#endif
624 memcpy(lkp, rkp, numrrecs * sizeof(*lkp));
625 memcpy(lpp, rpp, numrrecs * sizeof(*lpp));
626 xfs_bmbt_log_keys(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
627 xfs_bmbt_log_ptrs(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
628 } else {
629 lrp = XFS_BMAP_REC_IADDR(left, numlrecs + 1, cur);
630 rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
631 memcpy(lrp, rrp, numrrecs * sizeof(*lrp));
632 xfs_bmbt_log_recs(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
633 }
634 be16_add_cpu(&left->bb_numrecs, numrrecs);
635 left->bb_rightsib = right->bb_rightsib;
636 xfs_bmbt_log_block(cur, lbp, XFS_BB_RIGHTSIB | XFS_BB_NUMRECS);
637 if (be64_to_cpu(left->bb_rightsib) != NULLDFSBNO) {
638 if ((error = xfs_btree_read_bufl(mp, cur->bc_tp,
639 be64_to_cpu(left->bb_rightsib),
640 0, &rrbp, XFS_BMAP_BTREE_REF))) {
641 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
642 goto error0;
643 }
644 rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp);
645 if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) {
646 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
647 goto error0;
648 }
649 rrblock->bb_leftsib = cpu_to_be64(lbno);
650 xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB);
651 }
652 xfs_bmap_add_free(XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(rbp)), 1,
653 cur->bc_private.b.flist, mp);
654 cur->bc_private.b.ip->i_d.di_nblocks--;
655 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
656 XFS_TRANS_MOD_DQUOT_BYINO(mp, cur->bc_tp, cur->bc_private.b.ip,
657 XFS_TRANS_DQ_BCOUNT, -1L);
658 xfs_trans_binval(cur->bc_tp, rbp);
659 if (bp != lbp) {
660 cur->bc_bufs[level] = lbp;
661 cur->bc_ptrs[level] += lrecs;
662 cur->bc_ra[level] = 0;
663 } else if ((error = xfs_bmbt_increment(cur, level + 1, &i))) {
664 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
665 goto error0;
666 }
667 if (level > 0)
668 cur->bc_ptrs[level]--;
669 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
670 *stat = 2;
671 return 0;
672
673error0:
674 if (tcur)
675 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
676 return error;
677}
678
679/*
680 * Insert one record/level. Return information to the caller
681 * allowing the next level up to proceed if necessary.
682 */
683STATIC int /* error */
684xfs_bmbt_insrec(
685 xfs_btree_cur_t *cur,
686 int level,
687 xfs_fsblock_t *bnop,
688 xfs_bmbt_rec_t *recp,
689 xfs_btree_cur_t **curp,
690 int *stat) /* no-go/done/continue */
691{
692 xfs_bmbt_block_t *block; /* bmap btree block */
693 xfs_buf_t *bp; /* buffer for block */
694 int error; /* error return value */
695 int i; /* loop index */
696 xfs_bmbt_key_t key; /* bmap btree key */
697 xfs_bmbt_key_t *kp=NULL; /* pointer to bmap btree key */
698 int logflags; /* inode logging flags */
699 xfs_fsblock_t nbno; /* new block number */
700 struct xfs_btree_cur *ncur; /* new btree cursor */
701 __uint64_t startoff; /* new btree key value */
702 xfs_bmbt_rec_t nrec; /* new record count */
703 int optr; /* old key/record index */
704 xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */
705 int ptr; /* key/record index */
706 xfs_bmbt_rec_t *rp=NULL; /* pointer to bmap btree rec */
707 int numrecs;
708
709 ASSERT(level < cur->bc_nlevels);
710 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
711 XFS_BMBT_TRACE_ARGIFR(cur, level, *bnop, recp);
712 ncur = NULL;
713 key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(recp));
714 optr = ptr = cur->bc_ptrs[level];
715 if (ptr == 0) {
716 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
717 *stat = 0;
718 return 0;
719 }
720 XFS_STATS_INC(xs_bmbt_insrec);
721 block = xfs_bmbt_get_block(cur, level, &bp);
722 numrecs = be16_to_cpu(block->bb_numrecs);
723#ifdef DEBUG
724 if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
725 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
726 return error;
727 }
728 if (ptr <= numrecs) {
729 if (level == 0) {
730 rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
731 xfs_btree_check_rec(XFS_BTNUM_BMAP, recp, rp);
732 } else {
733 kp = XFS_BMAP_KEY_IADDR(block, ptr, cur);
734 xfs_btree_check_key(XFS_BTNUM_BMAP, &key, kp);
735 }
736 }
737#endif
738 nbno = NULLFSBLOCK;
739 if (numrecs == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
740 if (numrecs < XFS_BMAP_BLOCK_DMAXRECS(level, cur)) {
741 /*
742 * A root block, that can be made bigger.
743 */
744 xfs_iroot_realloc(cur->bc_private.b.ip, 1,
745 cur->bc_private.b.whichfork);
746 block = xfs_bmbt_get_block(cur, level, &bp);
747 } else if (level == cur->bc_nlevels - 1) {
748 if ((error = xfs_bmbt_newroot(cur, &logflags, stat)) ||
749 *stat == 0) {
750 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
751 return error;
752 }
753 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
754 logflags);
755 block = xfs_bmbt_get_block(cur, level, &bp);
756 } else {
757 if ((error = xfs_bmbt_rshift(cur, level, &i))) {
758 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
759 return error;
760 }
761 if (i) {
762 /* nothing */
763 } else {
764 if ((error = xfs_bmbt_lshift(cur, level, &i))) {
765 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
766 return error;
767 }
768 if (i) {
769 optr = ptr = cur->bc_ptrs[level];
770 } else {
771 if ((error = xfs_bmbt_split(cur, level,
772 &nbno, &startoff, &ncur,
773 &i))) {
774 XFS_BMBT_TRACE_CURSOR(cur,
775 ERROR);
776 return error;
777 }
778 if (i) {
779 block = xfs_bmbt_get_block(
780 cur, level, &bp);
781#ifdef DEBUG
782 if ((error =
783 xfs_btree_check_lblock(cur,
784 block, level, bp))) {
785 XFS_BMBT_TRACE_CURSOR(
786 cur, ERROR);
787 return error;
788 }
789#endif
790 ptr = cur->bc_ptrs[level];
791 xfs_bmbt_disk_set_allf(&nrec,
792 startoff, 0, 0,
793 XFS_EXT_NORM);
794 } else {
795 XFS_BMBT_TRACE_CURSOR(cur,
796 EXIT);
797 *stat = 0;
798 return 0;
799 }
800 }
801 }
802 }
803 }
804 numrecs = be16_to_cpu(block->bb_numrecs);
805 if (level > 0) {
806 kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
807 pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
808#ifdef DEBUG
809 for (i = numrecs; i >= ptr; i--) {
810 if ((error = xfs_btree_check_lptr_disk(cur, pp[i - 1],
811 level))) {
812 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
813 return error;
814 }
815 }
816#endif
817 memmove(&kp[ptr], &kp[ptr - 1],
818 (numrecs - ptr + 1) * sizeof(*kp));
819 memmove(&pp[ptr], &pp[ptr - 1],
820 (numrecs - ptr + 1) * sizeof(*pp));
821#ifdef DEBUG
822 if ((error = xfs_btree_check_lptr(cur, *bnop, level))) {
823 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
824 return error;
825 }
826#endif
827 kp[ptr - 1] = key;
828 pp[ptr - 1] = cpu_to_be64(*bnop);
829 numrecs++;
830 block->bb_numrecs = cpu_to_be16(numrecs);
831 xfs_bmbt_log_keys(cur, bp, ptr, numrecs);
832 xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs);
833 } else {
834 rp = XFS_BMAP_REC_IADDR(block, 1, cur);
835 memmove(&rp[ptr], &rp[ptr - 1],
836 (numrecs - ptr + 1) * sizeof(*rp));
837 rp[ptr - 1] = *recp;
838 numrecs++;
839 block->bb_numrecs = cpu_to_be16(numrecs);
840 xfs_bmbt_log_recs(cur, bp, ptr, numrecs);
841 }
842 xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS);
843#ifdef DEBUG
844 if (ptr < numrecs) {
845 if (level == 0)
846 xfs_btree_check_rec(XFS_BTNUM_BMAP, rp + ptr - 1,
847 rp + ptr);
848 else
849 xfs_btree_check_key(XFS_BTNUM_BMAP, kp + ptr - 1,
850 kp + ptr);
851 }
852#endif
853 if (optr == 1 && (error = xfs_bmbt_updkey(cur, &key, level + 1))) {
854 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
855 return error;
856 }
857 *bnop = nbno;
858 if (nbno != NULLFSBLOCK) {
859 *recp = nrec;
860 *curp = ncur;
861 }
862 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
863 *stat = 1;
864 return 0;
865}
866
867STATIC int
868xfs_bmbt_killroot(
869 xfs_btree_cur_t *cur)
870{
871 xfs_bmbt_block_t *block;
872 xfs_bmbt_block_t *cblock;
873 xfs_buf_t *cbp;
874 xfs_bmbt_key_t *ckp;
875 xfs_bmbt_ptr_t *cpp;
876#ifdef DEBUG
877 int error;
878#endif
879 int i;
880 xfs_bmbt_key_t *kp;
881 xfs_inode_t *ip;
882 xfs_ifork_t *ifp;
883 int level;
884 xfs_bmbt_ptr_t *pp;
885
886 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
887 level = cur->bc_nlevels - 1;
888 ASSERT(level >= 1);
889 /*
889 * Don't deal with the case where the root block needs to be a leaf.
891 * We're just going to turn the thing back into extents anyway.
892 */
893 if (level == 1) {
894 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
895 return 0;
896 }
897 block = xfs_bmbt_get_block(cur, level, &cbp);
898 /*
899 * Give up if the root has multiple children.
900 */
901 if (be16_to_cpu(block->bb_numrecs) != 1) {
902 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
903 return 0;
904 }
905 /*
906 * Only do this if the next level will fit.
907 * Then the data must be copied up to the inode;
908 * instead of freeing the root, you free the next level.
909 */
910 cbp = cur->bc_bufs[level - 1];
911 cblock = XFS_BUF_TO_BMBT_BLOCK(cbp);
912 if (be16_to_cpu(cblock->bb_numrecs) > XFS_BMAP_BLOCK_DMAXRECS(level, cur)) {
913 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
914 return 0;
915 }
916 ASSERT(be64_to_cpu(cblock->bb_leftsib) == NULLDFSBNO);
917 ASSERT(be64_to_cpu(cblock->bb_rightsib) == NULLDFSBNO);
918 ip = cur->bc_private.b.ip;
919 ifp = XFS_IFORK_PTR(ip, cur->bc_private.b.whichfork);
920 ASSERT(XFS_BMAP_BLOCK_IMAXRECS(level, cur) ==
921 XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes));
922 i = (int)(be16_to_cpu(cblock->bb_numrecs) - XFS_BMAP_BLOCK_IMAXRECS(level, cur));
923 if (i) {
924 xfs_iroot_realloc(ip, i, cur->bc_private.b.whichfork);
925 block = ifp->if_broot;
926 }
927 be16_add_cpu(&block->bb_numrecs, i);
928 ASSERT(block->bb_numrecs == cblock->bb_numrecs);
929 kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
930 ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur);
931 memcpy(kp, ckp, be16_to_cpu(block->bb_numrecs) * sizeof(*kp));
932 pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
933 cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur);
934#ifdef DEBUG
935 for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
936 if ((error = xfs_btree_check_lptr_disk(cur, cpp[i], level - 1))) {
937 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
938 return error;
939 }
940 }
941#endif
942 memcpy(pp, cpp, be16_to_cpu(block->bb_numrecs) * sizeof(*pp));
943 xfs_bmap_add_free(XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(cbp)), 1,
944 cur->bc_private.b.flist, cur->bc_mp);
945 ip->i_d.di_nblocks--;
946 XFS_TRANS_MOD_DQUOT_BYINO(cur->bc_mp, cur->bc_tp, ip,
947 XFS_TRANS_DQ_BCOUNT, -1L);
948 xfs_trans_binval(cur->bc_tp, cbp);
949 cur->bc_bufs[level - 1] = NULL;
950 be16_add_cpu(&block->bb_level, -1);
951 xfs_trans_log_inode(cur->bc_tp, ip,
952 XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
953 cur->bc_nlevels--;
954 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
955 return 0;
956}
957
958/*
959 * Log key values from the btree block.
960 */
961STATIC void
962xfs_bmbt_log_keys(
963 xfs_btree_cur_t *cur,
964 xfs_buf_t *bp,
965 int kfirst,
966 int klast)
967{
968 xfs_trans_t *tp;
969
970 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
971 XFS_BMBT_TRACE_ARGBII(cur, bp, kfirst, klast);
972 tp = cur->bc_tp;
973 if (bp) {
974 xfs_bmbt_block_t *block;
975 int first;
976 xfs_bmbt_key_t *kp;
977 int last;
978
979 block = XFS_BUF_TO_BMBT_BLOCK(bp);
980 kp = XFS_BMAP_KEY_DADDR(block, 1, cur);
981 first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
982 last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
983 xfs_trans_log_buf(tp, bp, first, last);
984 } else {
985 xfs_inode_t *ip;
986
987 ip = cur->bc_private.b.ip;
988 xfs_trans_log_inode(tp, ip,
989 XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
990 }
991 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
992}
993
994/*
995 * Log pointer values from the btree block.
996 */
997STATIC void
998xfs_bmbt_log_ptrs(
999 xfs_btree_cur_t *cur,
1000 xfs_buf_t *bp,
1001 int pfirst,
1002 int plast)
1003{
1004 xfs_trans_t *tp;
1005
1006 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1007 XFS_BMBT_TRACE_ARGBII(cur, bp, pfirst, plast);
1008 tp = cur->bc_tp;
1009 if (bp) {
1010 xfs_bmbt_block_t *block;
1011 int first;
1012 int last;
1013 xfs_bmbt_ptr_t *pp;
1014
1015 block = XFS_BUF_TO_BMBT_BLOCK(bp);
1016 pp = XFS_BMAP_PTR_DADDR(block, 1, cur);
1017 first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
1018 last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
1019 xfs_trans_log_buf(tp, bp, first, last);
1020 } else {
1021 xfs_inode_t *ip;
1022
1023 ip = cur->bc_private.b.ip;
1024 xfs_trans_log_inode(tp, ip,
1025 XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
1026 }
1027 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1028}
1029
1030/*
1031 * Lookup the record. The cursor is made to point to it, based on dir.
1032 */
1033STATIC int /* error */
1034xfs_bmbt_lookup(
1035 xfs_btree_cur_t *cur,
1036 xfs_lookup_t dir,
1037 int *stat) /* success/failure */
1038{
1039 xfs_bmbt_block_t *block=NULL;
1040 xfs_buf_t *bp;
1041 xfs_daddr_t d;
1042 xfs_sfiloff_t diff;
1043 int error; /* error return value */
1044 xfs_fsblock_t fsbno=0;
1045 int high;
1046 int i;
1047 int keyno=0;
1048 xfs_bmbt_key_t *kkbase=NULL;
1049 xfs_bmbt_key_t *kkp;
1050 xfs_bmbt_rec_t *krbase=NULL;
1051 xfs_bmbt_rec_t *krp;
1052 int level;
1053 int low;
1054 xfs_mount_t *mp;
1055 xfs_bmbt_ptr_t *pp;
1056 xfs_bmbt_irec_t *rp;
1057 xfs_fileoff_t startoff;
1058 xfs_trans_t *tp;
1059
1060 XFS_STATS_INC(xs_bmbt_lookup);
1061 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1062 XFS_BMBT_TRACE_ARGI(cur, (int)dir);
1063 tp = cur->bc_tp;
1064 mp = cur->bc_mp;
1065 rp = &cur->bc_rec.b;
1066 for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1067 if (level < cur->bc_nlevels - 1) {
1068 d = XFS_FSB_TO_DADDR(mp, fsbno);
1069 bp = cur->bc_bufs[level];
1070 if (bp && XFS_BUF_ADDR(bp) != d)
1071 bp = NULL;
1072 if (!bp) {
1073 if ((error = xfs_btree_read_bufl(mp, tp, fsbno,
1074 0, &bp, XFS_BMAP_BTREE_REF))) {
1075 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1076 return error;
1077 }
1078 xfs_btree_setbuf(cur, level, bp);
1079 block = XFS_BUF_TO_BMBT_BLOCK(bp);
1080 if ((error = xfs_btree_check_lblock(cur, block,
1081 level, bp))) {
1082 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1083 return error;
1084 }
1085 } else
1086 block = XFS_BUF_TO_BMBT_BLOCK(bp);
1087 } else
1088 block = xfs_bmbt_get_block(cur, level, &bp);
1089 if (diff == 0)
1090 keyno = 1;
1091 else {
1092 if (level > 0)
1093 kkbase = XFS_BMAP_KEY_IADDR(block, 1, cur);
1094 else
1095 krbase = XFS_BMAP_REC_IADDR(block, 1, cur);
1096 low = 1;
1097 if (!(high = be16_to_cpu(block->bb_numrecs))) {
1098 ASSERT(level == 0);
1099 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1100 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1101 *stat = 0;
1102 return 0;
1103 }
1104 while (low <= high) {
1105 XFS_STATS_INC(xs_bmbt_compare);
1106 keyno = (low + high) >> 1;
1107 if (level > 0) {
1108 kkp = kkbase + keyno - 1;
1109 startoff = be64_to_cpu(kkp->br_startoff);
1110 } else {
1111 krp = krbase + keyno - 1;
1112 startoff = xfs_bmbt_disk_get_startoff(krp);
1113 }
1114 diff = (xfs_sfiloff_t)
1115 (startoff - rp->br_startoff);
1116 if (diff < 0)
1117 low = keyno + 1;
1118 else if (diff > 0)
1119 high = keyno - 1;
1120 else
1121 break;
1122 }
1123 }
1124 if (level > 0) {
1125 if (diff > 0 && --keyno < 1)
1126 keyno = 1;
1127 pp = XFS_BMAP_PTR_IADDR(block, keyno, cur);
1128 fsbno = be64_to_cpu(*pp);
1129#ifdef DEBUG
1130 if ((error = xfs_btree_check_lptr(cur, fsbno, level))) {
1131 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1132 return error;
1133 }
1134#endif
1135 cur->bc_ptrs[level] = keyno;
1136 }
1137 }
1138 if (dir != XFS_LOOKUP_LE && diff < 0) {
1139 keyno++;
1140 /*
1141 * If ge search and we went off the end of the block, but it's
1142 * not the last block, we're in the wrong block.
1143 */
1144 if (dir == XFS_LOOKUP_GE && keyno > be16_to_cpu(block->bb_numrecs) &&
1145 be64_to_cpu(block->bb_rightsib) != NULLDFSBNO) {
1146 cur->bc_ptrs[0] = keyno;
1147 if ((error = xfs_bmbt_increment(cur, 0, &i))) {
1148 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1149 return error;
1150 }
1151 XFS_WANT_CORRUPTED_RETURN(i == 1);
1152 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1153 *stat = 1;
1154 return 0;
1155 }
1156 }
1157 else if (dir == XFS_LOOKUP_LE && diff > 0)
1158 keyno--;
1159 cur->bc_ptrs[0] = keyno;
1160 if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs)) {
1161 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1162 *stat = 0;
1163 } else {
1164 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1165 *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
1166 }
1167 return 0;
1168}
1169
1170/*
1171 * Move 1 record left from cur/level if possible.
1172 * Update cur to reflect the new path.
1173 */
1174STATIC int /* error */
1175xfs_bmbt_lshift(
1176 xfs_btree_cur_t *cur,
1177 int level,
1178 int *stat) /* success/failure */
1179{
1180 int error; /* error return value */
1181#ifdef DEBUG
1182 int i; /* loop counter */
1183#endif
1184 xfs_bmbt_key_t key; /* bmap btree key */
1185 xfs_buf_t *lbp; /* left buffer pointer */
1186 xfs_bmbt_block_t *left; /* left btree block */
1187 xfs_bmbt_key_t *lkp=NULL; /* left btree key */
1188 xfs_bmbt_ptr_t *lpp; /* left address pointer */
1189 int lrecs; /* left record count */
1190 xfs_bmbt_rec_t *lrp=NULL; /* left record pointer */
1191 xfs_mount_t *mp; /* file system mount point */
1192 xfs_buf_t *rbp; /* right buffer pointer */
1193 xfs_bmbt_block_t *right; /* right btree block */
1194 xfs_bmbt_key_t *rkp=NULL; /* right btree key */
1195 xfs_bmbt_ptr_t *rpp=NULL; /* right address pointer */
1196 xfs_bmbt_rec_t *rrp=NULL; /* right record pointer */
1197 int rrecs; /* right record count */
1198
1199 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1200 XFS_BMBT_TRACE_ARGI(cur, level);
1201 if (level == cur->bc_nlevels - 1) {
1202 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1203 *stat = 0;
1204 return 0;
1205 }
1206 rbp = cur->bc_bufs[level];
1207 right = XFS_BUF_TO_BMBT_BLOCK(rbp);
1208#ifdef DEBUG
1209 if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
1210 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1211 return error;
1212 }
1213#endif
1214 if (be64_to_cpu(right->bb_leftsib) == NULLDFSBNO) {
1215 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1216 *stat = 0;
1217 return 0;
1218 }
1219 if (cur->bc_ptrs[level] <= 1) {
1220 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1221 *stat = 0;
1222 return 0;
1223 }
1224 mp = cur->bc_mp;
1225 if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, be64_to_cpu(right->bb_leftsib), 0,
1226 &lbp, XFS_BMAP_BTREE_REF))) {
1227 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1228 return error;
1229 }
1230 left = XFS_BUF_TO_BMBT_BLOCK(lbp);
1231 if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
1232 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1233 return error;
1234 }
1235 if (be16_to_cpu(left->bb_numrecs) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
1236 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1237 *stat = 0;
1238 return 0;
1239 }
1240 lrecs = be16_to_cpu(left->bb_numrecs) + 1;
1241 if (level > 0) {
1242 lkp = XFS_BMAP_KEY_IADDR(left, lrecs, cur);
1243 rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
1244 *lkp = *rkp;
1245 xfs_bmbt_log_keys(cur, lbp, lrecs, lrecs);
1246 lpp = XFS_BMAP_PTR_IADDR(left, lrecs, cur);
1247 rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
1248#ifdef DEBUG
1249 if ((error = xfs_btree_check_lptr_disk(cur, *rpp, level))) {
1250 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1251 return error;
1252 }
1253#endif
1254 *lpp = *rpp;
1255 xfs_bmbt_log_ptrs(cur, lbp, lrecs, lrecs);
1256 } else {
1257 lrp = XFS_BMAP_REC_IADDR(left, lrecs, cur);
1258 rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
1259 *lrp = *rrp;
1260 xfs_bmbt_log_recs(cur, lbp, lrecs, lrecs);
1261 }
1262 left->bb_numrecs = cpu_to_be16(lrecs);
1263 xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS);
1264#ifdef DEBUG
1265 if (level > 0)
1266 xfs_btree_check_key(XFS_BTNUM_BMAP, lkp - 1, lkp);
1267 else
1268 xfs_btree_check_rec(XFS_BTNUM_BMAP, lrp - 1, lrp);
1269#endif
1270 rrecs = be16_to_cpu(right->bb_numrecs) - 1;
1271 right->bb_numrecs = cpu_to_be16(rrecs);
1272 xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS);
1273 if (level > 0) {
1274#ifdef DEBUG
1275 for (i = 0; i < rrecs; i++) {
1276 if ((error = xfs_btree_check_lptr_disk(cur, rpp[i + 1],
1277 level))) {
1278 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1279 return error;
1280 }
1281 }
1282#endif
1283 memmove(rkp, rkp + 1, rrecs * sizeof(*rkp));
1284 memmove(rpp, rpp + 1, rrecs * sizeof(*rpp));
1285 xfs_bmbt_log_keys(cur, rbp, 1, rrecs);
1286 xfs_bmbt_log_ptrs(cur, rbp, 1, rrecs);
1287 } else {
1288 memmove(rrp, rrp + 1, rrecs * sizeof(*rrp));
1289 xfs_bmbt_log_recs(cur, rbp, 1, rrecs);
1290 key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(rrp));
1291 rkp = &key;
1292 }
1293 if ((error = xfs_bmbt_updkey(cur, rkp, level + 1))) {
1294 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1295 return error;
1296 }
1297 cur->bc_ptrs[level]--;
1298 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1299 *stat = 1;
1300 return 0;
1301}
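
The shift is easier to see stripped of logging, tracing, and on-disk byte order. Below is a minimal sketch of the same move using a hypothetical flat-array node, not the kernel's xfs_bmbt_block_t; the real routine additionally logs the touched ranges and refreshes the parent key via xfs_bmbt_updkey().

	/* Hypothetical toy node; not a kernel structure. */
	struct toy_node {
		int	nrecs;		/* records currently in this node */
		long	recs[64];	/* records in left-to-right key order */
	};

	/* Move one record from the front of 'right' to the end of 'left',
	 * mirroring the *lkp = *rkp / memmove() sequence above. */
	static int toy_lshift(struct toy_node *left, struct toy_node *right,
			      int maxrecs)
	{
		int i;

		if (left->nrecs == maxrecs || right->nrecs == 0)
			return 0;				/* nothing to do */
		left->recs[left->nrecs++] = right->recs[0];	/* *lkp = *rkp */
		for (i = 0; i < right->nrecs - 1; i++)		/* memmove(rkp, rkp + 1, ...) */
			right->recs[i] = right->recs[i + 1];
		right->nrecs--;
		return 1;					/* one record moved */
	}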
1302
1303/*
1304 * Move 1 record right from cur/level if possible.
1305 * Update cur to reflect the new path.
1306 */
1307STATIC int /* error */
1308xfs_bmbt_rshift(
1309 xfs_btree_cur_t *cur,
1310 int level,
1311 int *stat) /* success/failure */
1312{
1313 int error; /* error return value */
1314 int i; /* loop counter */
1315 xfs_bmbt_key_t key; /* bmap btree key */
1316 xfs_buf_t *lbp; /* left buffer pointer */
1317 xfs_bmbt_block_t *left; /* left btree block */
1318 xfs_bmbt_key_t *lkp; /* left btree key */
1319 xfs_bmbt_ptr_t *lpp; /* left address pointer */
1320 xfs_bmbt_rec_t *lrp; /* left record pointer */
1321 xfs_mount_t *mp; /* file system mount point */
1322 xfs_buf_t *rbp; /* right buffer pointer */
1323 xfs_bmbt_block_t *right; /* right btree block */
1324 xfs_bmbt_key_t *rkp; /* right btree key */
1325 xfs_bmbt_ptr_t *rpp; /* right address pointer */
1326 xfs_bmbt_rec_t *rrp=NULL; /* right record pointer */
1327 struct xfs_btree_cur *tcur; /* temporary btree cursor */
1328
1329 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1330 XFS_BMBT_TRACE_ARGI(cur, level);
1331 if (level == cur->bc_nlevels - 1) {
1332 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1333 *stat = 0;
1334 return 0;
1335 }
1336 lbp = cur->bc_bufs[level];
1337 left = XFS_BUF_TO_BMBT_BLOCK(lbp);
1338#ifdef DEBUG
1339 if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
1340 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1341 return error;
1342 }
1343#endif
1344 if (be64_to_cpu(left->bb_rightsib) == NULLDFSBNO) {
1345 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1346 *stat = 0;
1347 return 0;
1348 }
1349 if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) {
1350 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1351 *stat = 0;
1352 return 0;
1353 }
1354 mp = cur->bc_mp;
1355 if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, be64_to_cpu(left->bb_rightsib), 0,
1356 &rbp, XFS_BMAP_BTREE_REF))) {
1357 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1358 return error;
1359 }
1360 right = XFS_BUF_TO_BMBT_BLOCK(rbp);
1361 if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
1362 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1363 return error;
1364 }
1365 if (be16_to_cpu(right->bb_numrecs) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
1366 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1367 *stat = 0;
1368 return 0;
1369 }
1370 if (level > 0) {
1371 lkp = XFS_BMAP_KEY_IADDR(left, be16_to_cpu(left->bb_numrecs), cur);
1372 lpp = XFS_BMAP_PTR_IADDR(left, be16_to_cpu(left->bb_numrecs), cur);
1373 rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
1374 rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
1375#ifdef DEBUG
1376 for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) {
1377 if ((error = xfs_btree_check_lptr_disk(cur, rpp[i], level))) {
1378 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1379 return error;
1380 }
1381 }
1382#endif
1383 memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
1384 memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1385#ifdef DEBUG
1386 if ((error = xfs_btree_check_lptr_disk(cur, *lpp, level))) {
1387 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1388 return error;
1389 }
1390#endif
1391 *rkp = *lkp;
1392 *rpp = *lpp;
1393 xfs_bmbt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1394 xfs_bmbt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1395 } else {
1396 lrp = XFS_BMAP_REC_IADDR(left, be16_to_cpu(left->bb_numrecs), cur);
1397 rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
1398 memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1399 *rrp = *lrp;
1400 xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1401 key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(rrp));
1402 rkp = &key;
1403 }
1404 be16_add_cpu(&left->bb_numrecs, -1);
1405 xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS);
1406 be16_add_cpu(&right->bb_numrecs, 1);
1407#ifdef DEBUG
1408 if (level > 0)
1409 xfs_btree_check_key(XFS_BTNUM_BMAP, rkp, rkp + 1);
1410 else
1411 xfs_btree_check_rec(XFS_BTNUM_BMAP, rrp, rrp + 1);
1412#endif
1413 xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS);
1414 if ((error = xfs_btree_dup_cursor(cur, &tcur))) {
1415 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1416 return error;
1417 }
1418 i = xfs_btree_lastrec(tcur, level);
1419 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1420 if ((error = xfs_bmbt_increment(tcur, level, &i))) {
1421 XFS_BMBT_TRACE_CURSOR(tcur, ERROR);
1422 goto error1;
1423 }
1424 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1425 if ((error = xfs_bmbt_updkey(tcur, rkp, level + 1))) {
1426 XFS_BMBT_TRACE_CURSOR(tcur, ERROR);
1427 goto error1;
1428 }
1429 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
1430 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1431 *stat = 1;
1432 return 0;
1433error0:
1434 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1435error1:
1436 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
1437 return error;
1438}
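
Why both shift routines end by rewriting a parent key is easiest to see with concrete numbers; the values below are purely illustrative, not kernel output.

	/*
	 * before rshift:  left = [10 15 20]  right = [40 50]     parent keys [10 40]
	 * after  rshift:  left = [10 15]     right = [20 40 50]  parent keys [10 20]
	 *
	 * Moving a record across the boundary changes the first record of the
	 * right block, so the separator key stored in the parent must change
	 * too -- hence the xfs_bmbt_updkey() call at level + 1 in both routines.
	 */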
1439
1440/47 /*
1441/48  * Determine the extent state.
1442/49  */
@@ -1453,229 +60,15 @@ xfs_extent_state(
1453/60  return XFS_EXT_NORM;
1454/61 }
1455/62 

1456
1457/*
1458 * Split cur/level block in half.
1459 * Return new block number and its first record (to be inserted into parent).
1460 */
1461STATIC int /* error */
1462xfs_bmbt_split(
1463 xfs_btree_cur_t *cur,
1464 int level,
1465 xfs_fsblock_t *bnop,
1466 __uint64_t *startoff,
1467 xfs_btree_cur_t **curp,
1468 int *stat) /* success/failure */
1469{
1470 xfs_alloc_arg_t args; /* block allocation args */
1471 int error; /* error return value */
1472 int i; /* loop counter */
1473 xfs_fsblock_t lbno; /* left sibling block number */
1474 xfs_buf_t *lbp; /* left buffer pointer */
1475 xfs_bmbt_block_t *left; /* left btree block */
1476 xfs_bmbt_key_t *lkp; /* left btree key */
1477 xfs_bmbt_ptr_t *lpp; /* left address pointer */
1478 xfs_bmbt_rec_t *lrp; /* left record pointer */
1479 xfs_buf_t *rbp; /* right buffer pointer */
1480 xfs_bmbt_block_t *right; /* right btree block */
1481 xfs_bmbt_key_t *rkp; /* right btree key */
1482 xfs_bmbt_ptr_t *rpp; /* right address pointer */
1483 xfs_bmbt_block_t *rrblock; /* right-right btree block */
1484 xfs_buf_t *rrbp; /* right-right buffer pointer */
1485 xfs_bmbt_rec_t *rrp; /* right record pointer */
1486
1487 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1488 XFS_BMBT_TRACE_ARGIFK(cur, level, *bnop, *startoff);
1489 args.tp = cur->bc_tp;
1490 args.mp = cur->bc_mp;
1491 lbp = cur->bc_bufs[level];
1492 lbno = XFS_DADDR_TO_FSB(args.mp, XFS_BUF_ADDR(lbp));
1493 left = XFS_BUF_TO_BMBT_BLOCK(lbp);
1494 args.fsbno = cur->bc_private.b.firstblock;
1495 args.firstblock = args.fsbno;
1496 args.minleft = 0;
1497 if (args.fsbno == NULLFSBLOCK) {
1498 args.fsbno = lbno;
1499 args.type = XFS_ALLOCTYPE_START_BNO;
1500 /*
1501 * Make sure there is sufficient room left in the AG to
1502 * complete a full tree split for an extent insert. If
1503 * we are converting the middle part of an extent then
1504 * we may need space for two tree splits.
1505 *
1506 * We are relying on the caller to make the correct block
1507 * reservation for this operation to succeed. If the
1508 * reservation amount is insufficient then we may fail a
1509 * block allocation here and corrupt the filesystem.
1510 */
1511 args.minleft = xfs_trans_get_block_res(args.tp);
1512 } else if (cur->bc_private.b.flist->xbf_low)
1513 args.type = XFS_ALLOCTYPE_START_BNO;
1514 else
1515 args.type = XFS_ALLOCTYPE_NEAR_BNO;
1516 args.mod = args.alignment = args.total = args.isfl =
1517 args.userdata = args.minalignslop = 0;
1518 args.minlen = args.maxlen = args.prod = 1;
1519 args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
1520 if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
1521 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1522 return XFS_ERROR(ENOSPC);
1523 }
1524 if ((error = xfs_alloc_vextent(&args))) {
1525 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1526 return error;
1527 }
1528 if (args.fsbno == NULLFSBLOCK && args.minleft) {
1529 /*
1530 * Could not find an AG with enough free space to satisfy
1531 * a full btree split. Try again without minleft and if
1532 * successful activate the lowspace algorithm.
1533 */
1534 args.fsbno = 0;
1535 args.type = XFS_ALLOCTYPE_FIRST_AG;
1536 args.minleft = 0;
1537 if ((error = xfs_alloc_vextent(&args))) {
1538 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1539 return error;
1540 }
1541 cur->bc_private.b.flist->xbf_low = 1;
1542 }
1543 if (args.fsbno == NULLFSBLOCK) {
1544 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1545 *stat = 0;
1546 return 0;
1547 }
1548 ASSERT(args.len == 1);
1549 cur->bc_private.b.firstblock = args.fsbno;
1550 cur->bc_private.b.allocated++;
1551 cur->bc_private.b.ip->i_d.di_nblocks++;
1552 xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
1553 XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
1554 XFS_TRANS_DQ_BCOUNT, 1L);
1555 rbp = xfs_btree_get_bufl(args.mp, args.tp, args.fsbno, 0);
1556 right = XFS_BUF_TO_BMBT_BLOCK(rbp);
1557#ifdef DEBUG
1558 if ((error = xfs_btree_check_lblock(cur, left, level, rbp))) {
1559 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1560 return error;
1561 }
1562#endif
1563 right->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
1564 right->bb_level = left->bb_level;
1565 right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2);
1566 if ((be16_to_cpu(left->bb_numrecs) & 1) &&
1567 cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1)
1568 be16_add_cpu(&right->bb_numrecs, 1);
1569 i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1;
1570 if (level > 0) {
1571 lkp = XFS_BMAP_KEY_IADDR(left, i, cur);
1572 lpp = XFS_BMAP_PTR_IADDR(left, i, cur);
1573 rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
1574 rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
1575#ifdef DEBUG
1576 for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
1577 if ((error = xfs_btree_check_lptr_disk(cur, lpp[i], level))) {
1578 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1579 return error;
1580 }
1581 }
1582#endif
1583 memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
1584 memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1585 xfs_bmbt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1586 xfs_bmbt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1587 *startoff = be64_to_cpu(rkp->br_startoff);
1588 } else {
1589 lrp = XFS_BMAP_REC_IADDR(left, i, cur);
1590 rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
1591 memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1592 xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1593 *startoff = xfs_bmbt_disk_get_startoff(rrp);
1594 }
1595 be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
1596 right->bb_rightsib = left->bb_rightsib;
1597 left->bb_rightsib = cpu_to_be64(args.fsbno);
1598 right->bb_leftsib = cpu_to_be64(lbno);
1599 xfs_bmbt_log_block(cur, rbp, XFS_BB_ALL_BITS);
1600 xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
1601 if (be64_to_cpu(right->bb_rightsib) != NULLDFSBNO) {
1602 if ((error = xfs_btree_read_bufl(args.mp, args.tp,
1603 be64_to_cpu(right->bb_rightsib), 0, &rrbp,
1604 XFS_BMAP_BTREE_REF))) {
1605 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1606 return error;
1607 }
1608 rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp);
1609 if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) {
1610 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1611 return error;
1612 }
1613 rrblock->bb_leftsib = cpu_to_be64(args.fsbno);
1614 xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB);
1615 }
1616 if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) {
1617 xfs_btree_setbuf(cur, level, rbp);
1618 cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs);
1619 }
1620 if (level + 1 < cur->bc_nlevels) {
1621 if ((error = xfs_btree_dup_cursor(cur, curp))) {
1622 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1623 return error;
1624 }
1625 (*curp)->bc_ptrs[level + 1]++;
1626 }
1627 *bnop = args.fsbno;
1628 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1629 *stat = 1;
1630 return 0;
1631}
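
The split point chosen above is simply "half the records, rounded toward the side that will receive the insert". A standalone sketch of that computation (hypothetical helper, not a kernel function):

	/* How many records move to the new right block.  'numrecs' is the count
	 * in the old block, 'ptr' the 1-based slot where the insert will land. */
	static int toy_split_count(int numrecs, int ptr)
	{
		int rcount = numrecs / 2;

		/* Odd count: when the insertion point lands in the low (left)
		 * half, push the extra record to the new right block so the
		 * coming insert has room -- the be16_add_cpu(+1) above. */
		if ((numrecs & 1) && ptr <= rcount + 1)
			rcount++;
		return rcount;
	}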
1632
1633
1634/*
1635 * Update keys for the record.
1636 */
1637STATIC int
1638xfs_bmbt_updkey(
1639 xfs_btree_cur_t *cur,
1640 xfs_bmbt_key_t *keyp, /* on-disk format */
1641 int level)
1642{
1643 xfs_bmbt_block_t *block;
1644 xfs_buf_t *bp;
1645#ifdef DEBUG
1646 int error;
1647#endif
1648 xfs_bmbt_key_t *kp;
1649 int ptr;
1650
1651 ASSERT(level >= 1);
1652 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1653 XFS_BMBT_TRACE_ARGIK(cur, level, keyp);
1654 for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1655 block = xfs_bmbt_get_block(cur, level, &bp);
1656#ifdef DEBUG
1657 if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
1658 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1659 return error;
1660 }
1661#endif
1662 ptr = cur->bc_ptrs[level];
1663 kp = XFS_BMAP_KEY_IADDR(block, ptr, cur);
1664 *kp = *keyp;
1665 xfs_bmbt_log_keys(cur, bp, ptr, ptr);
1666 }
1667 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1668 return 0;
1669}
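
A compact model of the ascent rule used above, with hypothetical arrays standing in for the cursor and blocks: the key is rewritten at each ancestor, but the climb continues only while the updated entry is the first one in its block.

	/* Toy sketch, not the kernel API. */
	static void toy_propagate_key(long *key_at_level, const int *slot_at_level,
				      int nlevels, long newkey)
	{
		int level;

		for (level = 1; level < nlevels; level++) {
			key_at_level[level] = newkey;	/* *kp = *keyp */
			if (slot_at_level[level] != 1)
				break;			/* not slot 1: ancestors unaffected */
		}
	}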
1670
1671/63 /*
1672/64  * Convert on-disk form of btree root to in-memory form.
1673/65  */
1674/66 void
1675/67 xfs_bmdr_to_bmbt(
+68 	struct xfs_mount *mp,
1676/69 	xfs_bmdr_block_t *dblock,
1677/70 	int dblocklen,
-1678 	xfs_bmbt_block_t *rblock,
+71 	struct xfs_btree_block *rblock,
1679/72 	int rblocklen)
1680/73 {
1681/74 	int dmxr;
@@ -1688,129 +81,19 @@ xfs_bmdr_to_bmbt(
1688/81 	rblock->bb_level = dblock->bb_level;
1689/82 	ASSERT(be16_to_cpu(rblock->bb_level) > 0);
1690/83 	rblock->bb_numrecs = dblock->bb_numrecs;
-1691 	rblock->bb_leftsib = cpu_to_be64(NULLDFSBNO);
+84 	rblock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
-1692 	rblock->bb_rightsib = cpu_to_be64(NULLDFSBNO);
+85 	rblock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
-1693 	dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0);
+86 	dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0);
-1694 	fkp = XFS_BTREE_KEY_ADDR(xfs_bmdr, dblock, 1);
+87 	fkp = XFS_BMDR_KEY_ADDR(dblock, 1);
-1695 	tkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen);
+88 	tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
-1696 	fpp = XFS_BTREE_PTR_ADDR(xfs_bmdr, dblock, 1, dmxr);
+89 	fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
-1697 	tpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen);
+90 	tpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
1698/91 	dmxr = be16_to_cpu(dblock->bb_numrecs);
1699/92 	memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
1700/93 	memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
1701/94 }
1702/95 
1703/96 /*
1704 * Decrement cursor by one record at the level.
1705 * For nonzero levels the leaf-ward information is untouched.
1706 */
1707int /* error */
1708xfs_bmbt_decrement(
1709 xfs_btree_cur_t *cur,
1710 int level,
1711 int *stat) /* success/failure */
1712{
1713 xfs_bmbt_block_t *block;
1714 xfs_buf_t *bp;
1715 int error; /* error return value */
1716 xfs_fsblock_t fsbno;
1717 int lev;
1718 xfs_mount_t *mp;
1719 xfs_trans_t *tp;
1720
1721 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1722 XFS_BMBT_TRACE_ARGI(cur, level);
1723 ASSERT(level < cur->bc_nlevels);
1724 if (level < cur->bc_nlevels - 1)
1725 xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1726 if (--cur->bc_ptrs[level] > 0) {
1727 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1728 *stat = 1;
1729 return 0;
1730 }
1731 block = xfs_bmbt_get_block(cur, level, &bp);
1732#ifdef DEBUG
1733 if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
1734 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1735 return error;
1736 }
1737#endif
1738 if (be64_to_cpu(block->bb_leftsib) == NULLDFSBNO) {
1739 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1740 *stat = 0;
1741 return 0;
1742 }
1743 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1744 if (--cur->bc_ptrs[lev] > 0)
1745 break;
1746 if (lev < cur->bc_nlevels - 1)
1747 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1748 }
1749 if (lev == cur->bc_nlevels) {
1750 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1751 *stat = 0;
1752 return 0;
1753 }
1754 tp = cur->bc_tp;
1755 mp = cur->bc_mp;
1756 for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) {
1757 fsbno = be64_to_cpu(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur));
1758 if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp,
1759 XFS_BMAP_BTREE_REF))) {
1760 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1761 return error;
1762 }
1763 lev--;
1764 xfs_btree_setbuf(cur, lev, bp);
1765 block = XFS_BUF_TO_BMBT_BLOCK(bp);
1766 if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
1767 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1768 return error;
1769 }
1770 cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs);
1771 }
1772 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1773 *stat = 1;
1774 return 0;
1775}
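
Stripped of buffer handling and verification, the walk above reduces to the classic B-tree predecessor step. A toy model with hypothetical arrays (ptrs[] as 1-based slots per level, nrecs_at[] as each visited block's record count):

	/* Toy sketch, not the kernel API: returns 1 on success, 0 at the far left. */
	static int toy_cursor_decrement(int *ptrs, const int *nrecs_at,
					int nlevels, int level)
	{
		int lev;

		if (--ptrs[level] > 0)
			return 1;			/* stayed within the block */
		for (lev = level + 1; lev < nlevels; lev++)
			if (--ptrs[lev] > 0)
				break;			/* an ancestor can step left */
		if (lev == nlevels)
			return 0;			/* already at the first record */
		while (--lev >= level)
			ptrs[lev] = nrecs_at[lev];	/* descend along rightmost children */
		return 1;
	}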
1776
1777/*
1778 * Delete the record pointed to by cur.
1779 */
1780int /* error */
1781xfs_bmbt_delete(
1782 xfs_btree_cur_t *cur,
1783 int *stat) /* success/failure */
1784{
1785 int error; /* error return value */
1786 int i;
1787 int level;
1788
1789 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1790 for (level = 0, i = 2; i == 2; level++) {
1791 if ((error = xfs_bmbt_delrec(cur, level, &i))) {
1792 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1793 return error;
1794 }
1795 }
1796 if (i == 0) {
1797 for (level = 1; level < cur->bc_nlevels; level++) {
1798 if (cur->bc_ptrs[level] == 0) {
1799 if ((error = xfs_bmbt_decrement(cur, level,
1800 &i))) {
1801 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
1802 return error;
1803 }
1804 break;
1805 }
1806 }
1807 }
1808 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
1809 *stat = i;
1810 return 0;
1811}
1812
1813/*
1814/97  * Convert a compressed bmap extent record to an uncompressed form.
1815/98  * This code must be in sync with the routines xfs_bmbt_get_startoff,
1816/99  * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state.
@@ -1864,31 +147,6 @@ xfs_bmbt_get_all(
1864/147 }
1865/148 
1866/149 /*
1867 * Get the block pointer for the given level of the cursor.
1868 * Fill in the buffer pointer, if applicable.
1869 */
1870xfs_bmbt_block_t *
1871xfs_bmbt_get_block(
1872 xfs_btree_cur_t *cur,
1873 int level,
1874 xfs_buf_t **bpp)
1875{
1876 xfs_ifork_t *ifp;
1877 xfs_bmbt_block_t *rval;
1878
1879 if (level < cur->bc_nlevels - 1) {
1880 *bpp = cur->bc_bufs[level];
1881 rval = XFS_BUF_TO_BMBT_BLOCK(*bpp);
1882 } else {
1883 *bpp = NULL;
1884 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
1885 cur->bc_private.b.whichfork);
1886 rval = ifp->if_broot;
1887 }
1888 return rval;
1889}
1890
1891/*
1892/150  * Extract the blockcount field from an in memory bmap extent record.
1893/151  */
1894/152 xfs_filblks_t
@@ -1950,7 +208,8 @@ xfs_bmbt_disk_get_all(
1950/208 	xfs_bmbt_rec_t *r,
1951/209 	xfs_bmbt_irec_t *s)
1952/210 {
-1953 	__xfs_bmbt_get_all(be64_to_cpu(r->l0), be64_to_cpu(r->l1), s);
+211 	__xfs_bmbt_get_all(get_unaligned_be64(&r->l0),
+212 			get_unaligned_be64(&r->l1), s);
1954/213 }
1955/214 
1956/215 /*
@@ -1974,348 +233,6 @@ xfs_bmbt_disk_get_startoff(
1974/233 	XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
1975/234 }
1976/235 
1977/*
1978 * Increment cursor by one record at the level.
1979 * For nonzero levels the leaf-ward information is untouched.
1980 */
1981int /* error */
1982xfs_bmbt_increment(
1983 xfs_btree_cur_t *cur,
1984 int level,
1985 int *stat) /* success/failure */
1986{
1987 xfs_bmbt_block_t *block;
1988 xfs_buf_t *bp;
1989 int error; /* error return value */
1990 xfs_fsblock_t fsbno;
1991 int lev;
1992 xfs_mount_t *mp;
1993 xfs_trans_t *tp;
1994
1995 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
1996 XFS_BMBT_TRACE_ARGI(cur, level);
1997 ASSERT(level < cur->bc_nlevels);
1998 if (level < cur->bc_nlevels - 1)
1999 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
2000 block = xfs_bmbt_get_block(cur, level, &bp);
2001#ifdef DEBUG
2002 if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
2003 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2004 return error;
2005 }
2006#endif
2007 if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) {
2008 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2009 *stat = 1;
2010 return 0;
2011 }
2012 if (be64_to_cpu(block->bb_rightsib) == NULLDFSBNO) {
2013 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2014 *stat = 0;
2015 return 0;
2016 }
2017 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
2018 block = xfs_bmbt_get_block(cur, lev, &bp);
2019#ifdef DEBUG
2020 if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
2021 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2022 return error;
2023 }
2024#endif
2025 if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs))
2026 break;
2027 if (lev < cur->bc_nlevels - 1)
2028 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
2029 }
2030 if (lev == cur->bc_nlevels) {
2031 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2032 *stat = 0;
2033 return 0;
2034 }
2035 tp = cur->bc_tp;
2036 mp = cur->bc_mp;
2037 for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) {
2038 fsbno = be64_to_cpu(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur));
2039 if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp,
2040 XFS_BMAP_BTREE_REF))) {
2041 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2042 return error;
2043 }
2044 lev--;
2045 xfs_btree_setbuf(cur, lev, bp);
2046 block = XFS_BUF_TO_BMBT_BLOCK(bp);
2047 if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
2048 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2049 return error;
2050 }
2051 cur->bc_ptrs[lev] = 1;
2052 }
2053 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2054 *stat = 1;
2055 return 0;
2056}
2057
2058/*
2059 * Insert the current record at the point referenced by cur.
2060 *
2061 * A multi-level split of the tree on insert will invalidate the original
2062 * cursor. All callers of this function should assume that the cursor is
2063 * no longer valid and revalidate it.
2064 */
2065int /* error */
2066xfs_bmbt_insert(
2067 xfs_btree_cur_t *cur,
2068 int *stat) /* success/failure */
2069{
2070 int error; /* error return value */
2071 int i;
2072 int level;
2073 xfs_fsblock_t nbno;
2074 xfs_btree_cur_t *ncur;
2075 xfs_bmbt_rec_t nrec;
2076 xfs_btree_cur_t *pcur;
2077
2078 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2079 level = 0;
2080 nbno = NULLFSBLOCK;
2081 xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b);
2082 ncur = NULL;
2083 pcur = cur;
2084 do {
2085 if ((error = xfs_bmbt_insrec(pcur, level++, &nbno, &nrec, &ncur,
2086 &i))) {
2087 if (pcur != cur)
2088 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
2089 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2090 return error;
2091 }
2092 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
2093 if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) {
2094 cur->bc_nlevels = pcur->bc_nlevels;
2095 cur->bc_private.b.allocated +=
2096 pcur->bc_private.b.allocated;
2097 pcur->bc_private.b.allocated = 0;
2098 ASSERT((cur->bc_private.b.firstblock != NULLFSBLOCK) ||
2099 XFS_IS_REALTIME_INODE(cur->bc_private.b.ip));
2100 cur->bc_private.b.firstblock =
2101 pcur->bc_private.b.firstblock;
2102 ASSERT(cur->bc_private.b.flist ==
2103 pcur->bc_private.b.flist);
2104 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
2105 }
2106 if (ncur) {
2107 pcur = ncur;
2108 ncur = NULL;
2109 }
2110 } while (nbno != NULLFSBLOCK);
2111 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2112 *stat = i;
2113 return 0;
2114error0:
2115 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2116 return error;
2117}
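
A worked trace (hypothetical, not kernel output) of how the do/while loop above carries splits upward:

	/*
	 * level 0: xfs_bmbt_insrec() splits the leaf -> nbno = new block,
	 *          nrec = its first record, ncur = cursor for the new block
	 * level 1: inserting (nrec, nbno) splits again -> fresh nbno/nrec/ncur
	 * level 2: the insert fits -> nbno = NULLFSBLOCK and the loop ends
	 *
	 * Whenever a new cursor was handed back, it becomes pcur for the next
	 * pass and the old one is folded back into 'cur' -- which is why
	 * callers must revalidate the original cursor after a multi-level split.
	 */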
2118
2119/*
2120 * Log fields from the btree block header.
2121 */
2122void
2123xfs_bmbt_log_block(
2124 xfs_btree_cur_t *cur,
2125 xfs_buf_t *bp,
2126 int fields)
2127{
2128 int first;
2129 int last;
2130 xfs_trans_t *tp;
2131 static const short offsets[] = {
2132 offsetof(xfs_bmbt_block_t, bb_magic),
2133 offsetof(xfs_bmbt_block_t, bb_level),
2134 offsetof(xfs_bmbt_block_t, bb_numrecs),
2135 offsetof(xfs_bmbt_block_t, bb_leftsib),
2136 offsetof(xfs_bmbt_block_t, bb_rightsib),
2137 sizeof(xfs_bmbt_block_t)
2138 };
2139
2140 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2141 XFS_BMBT_TRACE_ARGBI(cur, bp, fields);
2142 tp = cur->bc_tp;
2143 if (bp) {
2144 xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first,
2145 &last);
2146 xfs_trans_log_buf(tp, bp, first, last);
2147 } else
2148 xfs_trans_log_inode(tp, cur->bc_private.b.ip,
2149 XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
2150 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2151}
2152
2153/*
2154 * Log record values from the btree block.
2155 */
2156void
2157xfs_bmbt_log_recs(
2158 xfs_btree_cur_t *cur,
2159 xfs_buf_t *bp,
2160 int rfirst,
2161 int rlast)
2162{
2163 xfs_bmbt_block_t *block;
2164 int first;
2165 int last;
2166 xfs_bmbt_rec_t *rp;
2167 xfs_trans_t *tp;
2168
2169 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2170 XFS_BMBT_TRACE_ARGBII(cur, bp, rfirst, rlast);
2171 ASSERT(bp);
2172 tp = cur->bc_tp;
2173 block = XFS_BUF_TO_BMBT_BLOCK(bp);
2174 rp = XFS_BMAP_REC_DADDR(block, 1, cur);
2175 first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
2176 last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
2177 xfs_trans_log_buf(tp, bp, first, last);
2178 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2179}
2180
2181int /* error */
2182xfs_bmbt_lookup_eq(
2183 xfs_btree_cur_t *cur,
2184 xfs_fileoff_t off,
2185 xfs_fsblock_t bno,
2186 xfs_filblks_t len,
2187 int *stat) /* success/failure */
2188{
2189 cur->bc_rec.b.br_startoff = off;
2190 cur->bc_rec.b.br_startblock = bno;
2191 cur->bc_rec.b.br_blockcount = len;
2192 return xfs_bmbt_lookup(cur, XFS_LOOKUP_EQ, stat);
2193}
2194
2195int /* error */
2196xfs_bmbt_lookup_ge(
2197 xfs_btree_cur_t *cur,
2198 xfs_fileoff_t off,
2199 xfs_fsblock_t bno,
2200 xfs_filblks_t len,
2201 int *stat) /* success/failure */
2202{
2203 cur->bc_rec.b.br_startoff = off;
2204 cur->bc_rec.b.br_startblock = bno;
2205 cur->bc_rec.b.br_blockcount = len;
2206 return xfs_bmbt_lookup(cur, XFS_LOOKUP_GE, stat);
2207}
2208
2209/*
2210 * Give the bmap btree a new root block. Copy the old broot contents
2211 * down into a real block and make the broot point to it.
2212 */
2213int /* error */
2214xfs_bmbt_newroot(
2215 xfs_btree_cur_t *cur, /* btree cursor */
2216 int *logflags, /* logging flags for inode */
2217 int *stat) /* return status - 0 fail */
2218{
2219 xfs_alloc_arg_t args; /* allocation arguments */
2220 xfs_bmbt_block_t *block; /* bmap btree block */
2221 xfs_buf_t *bp; /* buffer for block */
2222 xfs_bmbt_block_t *cblock; /* child btree block */
2223 xfs_bmbt_key_t *ckp; /* child key pointer */
2224 xfs_bmbt_ptr_t *cpp; /* child ptr pointer */
2225 int error; /* error return code */
2226#ifdef DEBUG
2227 int i; /* loop counter */
2228#endif
2229 xfs_bmbt_key_t *kp; /* pointer to bmap btree key */
2230 int level; /* btree level */
2231 xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */
2232
2233 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2234 level = cur->bc_nlevels - 1;
2235 block = xfs_bmbt_get_block(cur, level, &bp);
2236 /*
2237 * Copy the root into a real block.
2238 */
2239 args.mp = cur->bc_mp;
2240 pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
2241 args.tp = cur->bc_tp;
2242 args.fsbno = cur->bc_private.b.firstblock;
2243 args.mod = args.minleft = args.alignment = args.total = args.isfl =
2244 args.userdata = args.minalignslop = 0;
2245 args.minlen = args.maxlen = args.prod = 1;
2246 args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
2247 args.firstblock = args.fsbno;
2248 if (args.fsbno == NULLFSBLOCK) {
2249#ifdef DEBUG
2250 if ((error = xfs_btree_check_lptr_disk(cur, *pp, level))) {
2251 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2252 return error;
2253 }
2254#endif
2255 args.fsbno = be64_to_cpu(*pp);
2256 args.type = XFS_ALLOCTYPE_START_BNO;
2257 } else if (cur->bc_private.b.flist->xbf_low)
2258 args.type = XFS_ALLOCTYPE_START_BNO;
2259 else
2260 args.type = XFS_ALLOCTYPE_NEAR_BNO;
2261 if ((error = xfs_alloc_vextent(&args))) {
2262 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2263 return error;
2264 }
2265 if (args.fsbno == NULLFSBLOCK) {
2266 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2267 *stat = 0;
2268 return 0;
2269 }
2270 ASSERT(args.len == 1);
2271 cur->bc_private.b.firstblock = args.fsbno;
2272 cur->bc_private.b.allocated++;
2273 cur->bc_private.b.ip->i_d.di_nblocks++;
2274 XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
2275 XFS_TRANS_DQ_BCOUNT, 1L);
2276 bp = xfs_btree_get_bufl(args.mp, cur->bc_tp, args.fsbno, 0);
2277 cblock = XFS_BUF_TO_BMBT_BLOCK(bp);
2278 *cblock = *block;
2279 be16_add_cpu(&block->bb_level, 1);
2280 block->bb_numrecs = cpu_to_be16(1);
2281 cur->bc_nlevels++;
2282 cur->bc_ptrs[level + 1] = 1;
2283 kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
2284 ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur);
2285 memcpy(ckp, kp, be16_to_cpu(cblock->bb_numrecs) * sizeof(*kp));
2286 cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur);
2287#ifdef DEBUG
2288 for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
2289 if ((error = xfs_btree_check_lptr_disk(cur, pp[i], level))) {
2290 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2291 return error;
2292 }
2293 }
2294#endif
2295 memcpy(cpp, pp, be16_to_cpu(cblock->bb_numrecs) * sizeof(*pp));
2296#ifdef DEBUG
2297 if ((error = xfs_btree_check_lptr(cur, args.fsbno, level))) {
2298 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2299 return error;
2300 }
2301#endif
2302 *pp = cpu_to_be64(args.fsbno);
2303 xfs_iroot_realloc(cur->bc_private.b.ip, 1 - be16_to_cpu(cblock->bb_numrecs),
2304 cur->bc_private.b.whichfork);
2305 xfs_btree_setbuf(cur, level, bp);
2306 /*
2307 * Do all this logging at the end so that
2308 * the root is at the right level.
2309 */
2310 xfs_bmbt_log_block(cur, bp, XFS_BB_ALL_BITS);
2311 xfs_bmbt_log_keys(cur, bp, 1, be16_to_cpu(cblock->bb_numrecs));
2312 xfs_bmbt_log_ptrs(cur, bp, 1, be16_to_cpu(cblock->bb_numrecs));
2313 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2314 *logflags |=
2315 XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork);
2316 *stat = 1;
2317 return 0;
2318}
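
The transformation performed by xfs_bmbt_newroot() can be pictured as follows (schematic only):

	/*
	 * before:   inode-fork root [ k1 k2 ... kN ]            level L
	 *
	 * after:    inode-fork root [ k1 ]                      level L+1
	 *                             |
	 *           new disk block  [ k1 k2 ... kN ]            level L
	 *
	 * The old in-inode root moves wholesale into a freshly allocated
	 * block, the root shrinks to a single key/pointer pair (the
	 * xfs_iroot_realloc() call), and the tree gains one level at the top.
	 */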
2319/236 
2320/237 /*
2321/238  * Set all the fields in a bmap extent record from the arguments.
@@ -2512,7 +429,8 @@ xfs_bmbt_set_state(
2512/429  */
2513/430 void
2514/431 xfs_bmbt_to_bmdr(
-2515 	xfs_bmbt_block_t *rblock,
+432 	struct xfs_mount *mp,
+433 	struct xfs_btree_block *rblock,
2516/434 	int rblocklen,
2517/435 	xfs_bmdr_block_t *dblock,
2518/436 	int dblocklen)
@@ -2524,67 +442,22 @@ xfs_bmbt_to_bmdr(
2524/442 	__be64 *tpp;
2525/443 
2526/444 	ASSERT(be32_to_cpu(rblock->bb_magic) == XFS_BMAP_MAGIC);
-2527 	ASSERT(be64_to_cpu(rblock->bb_leftsib) == NULLDFSBNO);
+445 	ASSERT(be64_to_cpu(rblock->bb_u.l.bb_leftsib) == NULLDFSBNO);
-2528 	ASSERT(be64_to_cpu(rblock->bb_rightsib) == NULLDFSBNO);
+446 	ASSERT(be64_to_cpu(rblock->bb_u.l.bb_rightsib) == NULLDFSBNO);
2529/447 	ASSERT(be16_to_cpu(rblock->bb_level) > 0);
2530/448 	dblock->bb_level = rblock->bb_level;
2531/449 	dblock->bb_numrecs = rblock->bb_numrecs;
-2532 	dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0);
+450 	dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0);
-2533 	fkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen);
+451 	fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
-2534 	tkp = XFS_BTREE_KEY_ADDR(xfs_bmdr, dblock, 1);
+452 	tkp = XFS_BMDR_KEY_ADDR(dblock, 1);
-2535 	fpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen);
+453 	fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
-2536 	tpp = XFS_BTREE_PTR_ADDR(xfs_bmdr, dblock, 1, dmxr);
+454 	tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
2537/455 	dmxr = be16_to_cpu(dblock->bb_numrecs);
2538/456 	memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
2539/457 	memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
2540/458 }
2541/459 
2542/460 /*
2543 * Update the record to the passed values.
2544 */
2545int
2546xfs_bmbt_update(
2547 xfs_btree_cur_t *cur,
2548 xfs_fileoff_t off,
2549 xfs_fsblock_t bno,
2550 xfs_filblks_t len,
2551 xfs_exntst_t state)
2552{
2553 xfs_bmbt_block_t *block;
2554 xfs_buf_t *bp;
2555 int error;
2556 xfs_bmbt_key_t key;
2557 int ptr;
2558 xfs_bmbt_rec_t *rp;
2559
2560 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2561 XFS_BMBT_TRACE_ARGFFFI(cur, (xfs_dfiloff_t)off, (xfs_dfsbno_t)bno,
2562 (xfs_dfilblks_t)len, (int)state);
2563 block = xfs_bmbt_get_block(cur, 0, &bp);
2564#ifdef DEBUG
2565 if ((error = xfs_btree_check_lblock(cur, block, 0, bp))) {
2566 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2567 return error;
2568 }
2569#endif
2570 ptr = cur->bc_ptrs[0];
2571 rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
2572 xfs_bmbt_disk_set_allf(rp, off, bno, len, state);
2573 xfs_bmbt_log_recs(cur, bp, ptr, ptr);
2574 if (ptr > 1) {
2575 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2576 return 0;
2577 }
2578 key.br_startoff = cpu_to_be64(off);
2579 if ((error = xfs_bmbt_updkey(cur, &key, 1))) {
2580 XFS_BMBT_TRACE_CURSOR(cur, ERROR);
2581 return error;
2582 }
2583 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2584 return 0;
2585}
2586
2587/*
2588/461  * Check extent records, which have just been read, for
2589/462  * any bit in the extent flag field. ASSERT on debug
2590/463  * kernels, as this condition should not occur.
@@ -2608,3 +481,451 @@ xfs_check_nostate_extents(
2608/481 	}
2609/482 	return 0;
2610/483 }
484
485
486STATIC struct xfs_btree_cur *
487xfs_bmbt_dup_cursor(
488 struct xfs_btree_cur *cur)
489{
490 struct xfs_btree_cur *new;
491
492 new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp,
493 cur->bc_private.b.ip, cur->bc_private.b.whichfork);
494
495 /*
496 * Copy the firstblock, flist, and flags values,
497 * since init cursor doesn't get them.
498 */
499 new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
500 new->bc_private.b.flist = cur->bc_private.b.flist;
501 new->bc_private.b.flags = cur->bc_private.b.flags;
502
503 return new;
504}
505
506STATIC void
507xfs_bmbt_update_cursor(
508 struct xfs_btree_cur *src,
509 struct xfs_btree_cur *dst)
510{
511 ASSERT((dst->bc_private.b.firstblock != NULLFSBLOCK) ||
512 (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME));
513 ASSERT(dst->bc_private.b.flist == src->bc_private.b.flist);
514
515 dst->bc_private.b.allocated += src->bc_private.b.allocated;
516 dst->bc_private.b.firstblock = src->bc_private.b.firstblock;
517
518 src->bc_private.b.allocated = 0;
519}
520
521STATIC int
522xfs_bmbt_alloc_block(
523 struct xfs_btree_cur *cur,
524 union xfs_btree_ptr *start,
525 union xfs_btree_ptr *new,
526 int length,
527 int *stat)
528{
529 xfs_alloc_arg_t args; /* block allocation args */
530 int error; /* error return value */
531
532 memset(&args, 0, sizeof(args));
533 args.tp = cur->bc_tp;
534 args.mp = cur->bc_mp;
535 args.fsbno = cur->bc_private.b.firstblock;
536 args.firstblock = args.fsbno;
537
538 if (args.fsbno == NULLFSBLOCK) {
539 args.fsbno = be64_to_cpu(start->l);
540 args.type = XFS_ALLOCTYPE_START_BNO;
541 /*
542 * Make sure there is sufficient room left in the AG to
543 * complete a full tree split for an extent insert. If
544 * we are converting the middle part of an extent then
545 * we may need space for two tree splits.
546 *
547 * We are relying on the caller to make the correct block
548 * reservation for this operation to succeed. If the
549 * reservation amount is insufficient then we may fail a
550 * block allocation here and corrupt the filesystem.
551 */
552 args.minleft = xfs_trans_get_block_res(args.tp);
553 } else if (cur->bc_private.b.flist->xbf_low) {
554 args.type = XFS_ALLOCTYPE_START_BNO;
555 } else {
556 args.type = XFS_ALLOCTYPE_NEAR_BNO;
557 }
558
559 args.minlen = args.maxlen = args.prod = 1;
560 args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
561 if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
562 error = XFS_ERROR(ENOSPC);
563 goto error0;
564 }
565 error = xfs_alloc_vextent(&args);
566 if (error)
567 goto error0;
568
569 if (args.fsbno == NULLFSBLOCK && args.minleft) {
570 /*
571 * Could not find an AG with enough free space to satisfy
572 * a full btree split. Try again without minleft and if
573 * successful activate the lowspace algorithm.
574 */
575 args.fsbno = 0;
576 args.type = XFS_ALLOCTYPE_FIRST_AG;
577 args.minleft = 0;
578 error = xfs_alloc_vextent(&args);
579 if (error)
580 goto error0;
581 cur->bc_private.b.flist->xbf_low = 1;
582 }
583 if (args.fsbno == NULLFSBLOCK) {
584 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
585 *stat = 0;
586 return 0;
587 }
588 ASSERT(args.len == 1);
589 cur->bc_private.b.firstblock = args.fsbno;
590 cur->bc_private.b.allocated++;
591 cur->bc_private.b.ip->i_d.di_nblocks++;
592 xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
593 XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
594 XFS_TRANS_DQ_BCOUNT, 1L);
595
596 new->l = cpu_to_be64(args.fsbno);
597
598 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
599 *stat = 1;
600 return 0;
601
602 error0:
603 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
604 return error;
605}
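
A brief walk-through of the two-stage allocation policy above (the reservation number is made up for illustration):

	/*
	 * Suppose the transaction reserved 8 blocks.  The first
	 * xfs_alloc_vextent() call then carries minleft = 8, i.e. it only
	 * accepts an AG that still has 8 free blocks beyond the one being
	 * allocated -- enough for the worst-case tree splits the comment
	 * above describes.  Only if no AG qualifies is the call repeated
	 * with minleft = 0 and XFS_ALLOCTYPE_FIRST_AG, and xbf_low is set
	 * so the rest of the operation stays in low-space mode.
	 */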
606
607STATIC int
608xfs_bmbt_free_block(
609 struct xfs_btree_cur *cur,
610 struct xfs_buf *bp)
611{
612 struct xfs_mount *mp = cur->bc_mp;
613 struct xfs_inode *ip = cur->bc_private.b.ip;
614 struct xfs_trans *tp = cur->bc_tp;
615 xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
616
617 xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp);
618 ip->i_d.di_nblocks--;
619
620 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
621 XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
622 xfs_trans_binval(tp, bp);
623 return 0;
624}
625
626STATIC int
627xfs_bmbt_get_minrecs(
628 struct xfs_btree_cur *cur,
629 int level)
630{
631 if (level == cur->bc_nlevels - 1) {
632 struct xfs_ifork *ifp;
633
634 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
635 cur->bc_private.b.whichfork);
636
637 return xfs_bmbt_maxrecs(cur->bc_mp,
638 ifp->if_broot_bytes, level == 0) / 2;
639 }
640
641 return cur->bc_mp->m_bmap_dmnr[level != 0];
642}
643
644int
645xfs_bmbt_get_maxrecs(
646 struct xfs_btree_cur *cur,
647 int level)
648{
649 if (level == cur->bc_nlevels - 1) {
650 struct xfs_ifork *ifp;
651
652 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
653 cur->bc_private.b.whichfork);
654
655 return xfs_bmbt_maxrecs(cur->bc_mp,
656 ifp->if_broot_bytes, level == 0);
657 }
658
659 return cur->bc_mp->m_bmap_dmxr[level != 0];
660
661}
662
663/*
664 * Get the maximum records we could store in the on-disk format.
665 *
666 * For non-root nodes this is equivalent to xfs_bmbt_get_maxrecs, but
667 * for the root node this checks the available space in the dinode fork
668 * so that we can resize the in-memory buffer to match it. After a
669 * resize to the maximum size this function returns the same value
670 * as xfs_bmbt_get_maxrecs for the root node, too.
671 */
672STATIC int
673xfs_bmbt_get_dmaxrecs(
674 struct xfs_btree_cur *cur,
675 int level)
676{
677 if (level != cur->bc_nlevels - 1)
678 return cur->bc_mp->m_bmap_dmxr[level != 0];
679 return xfs_bmdr_maxrecs(cur->bc_mp, cur->bc_private.b.forksize,
680 level == 0);
681}
682
683STATIC void
684xfs_bmbt_init_key_from_rec(
685 union xfs_btree_key *key,
686 union xfs_btree_rec *rec)
687{
688 key->bmbt.br_startoff =
689 cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt));
690}
691
692STATIC void
693xfs_bmbt_init_rec_from_key(
694 union xfs_btree_key *key,
695 union xfs_btree_rec *rec)
696{
697 ASSERT(key->bmbt.br_startoff != 0);
698
699 xfs_bmbt_disk_set_allf(&rec->bmbt, be64_to_cpu(key->bmbt.br_startoff),
700 0, 0, XFS_EXT_NORM);
701}
702
703STATIC void
704xfs_bmbt_init_rec_from_cur(
705 struct xfs_btree_cur *cur,
706 union xfs_btree_rec *rec)
707{
708 xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b);
709}
710
711STATIC void
712xfs_bmbt_init_ptr_from_cur(
713 struct xfs_btree_cur *cur,
714 union xfs_btree_ptr *ptr)
715{
716 ptr->l = 0;
717}
718
719STATIC __int64_t
720xfs_bmbt_key_diff(
721 struct xfs_btree_cur *cur,
722 union xfs_btree_key *key)
723{
724 return (__int64_t)be64_to_cpu(key->bmbt.br_startoff) -
725 cur->bc_rec.b.br_startoff;
726}
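
The sign convention here is all the generic lookup code relies on; for example (hypothetical values):

	/*
	 * If the cursor is seeking br_startoff == 100:
	 *   a key of 120 yields +20  (key is beyond the search value)
	 *   a key of 100 yields   0  (exact match)
	 *   a key of  80 yields -20  (key is before the search value)
	 */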
727
728#ifdef DEBUG
729STATIC int
730xfs_bmbt_keys_inorder(
731 struct xfs_btree_cur *cur,
732 union xfs_btree_key *k1,
733 union xfs_btree_key *k2)
734{
735 return be64_to_cpu(k1->bmbt.br_startoff) <
736 be64_to_cpu(k2->bmbt.br_startoff);
737}
738
739STATIC int
740xfs_bmbt_recs_inorder(
741 struct xfs_btree_cur *cur,
742 union xfs_btree_rec *r1,
743 union xfs_btree_rec *r2)
744{
745 return xfs_bmbt_disk_get_startoff(&r1->bmbt) +
746 xfs_bmbt_disk_get_blockcount(&r1->bmbt) <=
747 xfs_bmbt_disk_get_startoff(&r2->bmbt);
748}
749#endif /* DEBUG */
750
751#ifdef XFS_BTREE_TRACE
752ktrace_t *xfs_bmbt_trace_buf;
753
754STATIC void
755xfs_bmbt_trace_enter(
756 struct xfs_btree_cur *cur,
757 const char *func,
758 char *s,
759 int type,
760 int line,
761 __psunsigned_t a0,
762 __psunsigned_t a1,
763 __psunsigned_t a2,
764 __psunsigned_t a3,
765 __psunsigned_t a4,
766 __psunsigned_t a5,
767 __psunsigned_t a6,
768 __psunsigned_t a7,
769 __psunsigned_t a8,
770 __psunsigned_t a9,
771 __psunsigned_t a10)
772{
773 struct xfs_inode *ip = cur->bc_private.b.ip;
774 int whichfork = cur->bc_private.b.whichfork;
775
776 ktrace_enter(xfs_bmbt_trace_buf,
777 (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
778 (void *)func, (void *)s, (void *)ip, (void *)cur,
779 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
780 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
781 (void *)a8, (void *)a9, (void *)a10);
782 ktrace_enter(ip->i_btrace,
783 (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
784 (void *)func, (void *)s, (void *)ip, (void *)cur,
785 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
786 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
787 (void *)a8, (void *)a9, (void *)a10);
788}
789
790STATIC void
791xfs_bmbt_trace_cursor(
792 struct xfs_btree_cur *cur,
793 __uint32_t *s0,
794 __uint64_t *l0,
795 __uint64_t *l1)
796{
797 struct xfs_bmbt_rec_host r;
798
799 xfs_bmbt_set_all(&r, &cur->bc_rec.b);
800
801 *s0 = (cur->bc_nlevels << 24) |
802 (cur->bc_private.b.flags << 16) |
803 cur->bc_private.b.allocated;
804 *l0 = r.l0;
805 *l1 = r.l1;
806}
807
808STATIC void
809xfs_bmbt_trace_key(
810 struct xfs_btree_cur *cur,
811 union xfs_btree_key *key,
812 __uint64_t *l0,
813 __uint64_t *l1)
814{
815 *l0 = be64_to_cpu(key->bmbt.br_startoff);
816 *l1 = 0;
817}
818
819STATIC void
820xfs_bmbt_trace_record(
821 struct xfs_btree_cur *cur,
822 union xfs_btree_rec *rec,
823 __uint64_t *l0,
824 __uint64_t *l1,
825 __uint64_t *l2)
826{
827 struct xfs_bmbt_irec irec;
828
829 xfs_bmbt_disk_get_all(&rec->bmbt, &irec);
830 *l0 = irec.br_startoff;
831 *l1 = irec.br_startblock;
832 *l2 = irec.br_blockcount;
833}
834#endif /* XFS_BTREE_TRACE */
835
836static const struct xfs_btree_ops xfs_bmbt_ops = {
837 .rec_len = sizeof(xfs_bmbt_rec_t),
838 .key_len = sizeof(xfs_bmbt_key_t),
839
840 .dup_cursor = xfs_bmbt_dup_cursor,
841 .update_cursor = xfs_bmbt_update_cursor,
842 .alloc_block = xfs_bmbt_alloc_block,
843 .free_block = xfs_bmbt_free_block,
844 .get_maxrecs = xfs_bmbt_get_maxrecs,
845 .get_minrecs = xfs_bmbt_get_minrecs,
846 .get_dmaxrecs = xfs_bmbt_get_dmaxrecs,
847 .init_key_from_rec = xfs_bmbt_init_key_from_rec,
848 .init_rec_from_key = xfs_bmbt_init_rec_from_key,
849 .init_rec_from_cur = xfs_bmbt_init_rec_from_cur,
850 .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur,
851 .key_diff = xfs_bmbt_key_diff,
852
853#ifdef DEBUG
854 .keys_inorder = xfs_bmbt_keys_inorder,
855 .recs_inorder = xfs_bmbt_recs_inorder,
856#endif
857
858#ifdef XFS_BTREE_TRACE
859 .trace_enter = xfs_bmbt_trace_enter,
860 .trace_cursor = xfs_bmbt_trace_cursor,
861 .trace_key = xfs_bmbt_trace_key,
862 .trace_record = xfs_bmbt_trace_record,
863#endif
864};
865
866/*
867 * Allocate a new bmap btree cursor.
868 */
869struct xfs_btree_cur * /* new bmap btree cursor */
870xfs_bmbt_init_cursor(
871 struct xfs_mount *mp, /* file system mount point */
872 struct xfs_trans *tp, /* transaction pointer */
873 struct xfs_inode *ip, /* inode owning the btree */
874 int whichfork) /* data or attr fork */
875{
876 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
877 struct xfs_btree_cur *cur;
878
879 cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
880
881 cur->bc_tp = tp;
882 cur->bc_mp = mp;
883 cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
884 cur->bc_btnum = XFS_BTNUM_BMAP;
885 cur->bc_blocklog = mp->m_sb.sb_blocklog;
886
887 cur->bc_ops = &xfs_bmbt_ops;
888 cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
889
890 cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
891 cur->bc_private.b.ip = ip;
892 cur->bc_private.b.firstblock = NULLFSBLOCK;
893 cur->bc_private.b.flist = NULL;
894 cur->bc_private.b.allocated = 0;
895 cur->bc_private.b.flags = 0;
896 cur->bc_private.b.whichfork = whichfork;
897
898 return cur;
899}
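
A hypothetical call sequence (error handling abbreviated) showing how the cursor is meant to be used: the generic btree entry points such as xfs_btree_lookup() do the tree walking through the xfs_bmbt_ops table wired up above.

	int			error, stat;
	struct xfs_btree_cur	*cur;

	cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
	cur->bc_rec.b.br_startoff = off;	/* record to look for */
	error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &stat);
	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);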
900
901/*
902 * Calculate number of records in a bmap btree block.
903 */
904int
905xfs_bmbt_maxrecs(
906 struct xfs_mount *mp,
907 int blocklen,
908 int leaf)
909{
910 blocklen -= XFS_BMBT_BLOCK_LEN(mp);
911
912 if (leaf)
913 return blocklen / sizeof(xfs_bmbt_rec_t);
914 return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t));
915}
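
Worked numbers (assuming a 4096-byte block and a 24-byte long-form header: magic, level, numrecs and two 64-bit sibling pointers):

	/*
	 * leaf:  (4096 - 24) / sizeof(xfs_bmbt_rec_t)     = 4072 / 16      = 254
	 * node:  (4096 - 24) / (key size + pointer size)  = 4072 / (8 + 8) = 254
	 *
	 * A leaf record is two __be64 words, and an interior entry is one
	 * __be64 key plus one __be64 pointer, so both cases cost 16 bytes.
	 */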
916
917/*
918 * Calculate number of records in a bmap btree inode root.
919 */
920int
921xfs_bmdr_maxrecs(
922 struct xfs_mount *mp,
923 int blocklen,
924 int leaf)
925{
926 blocklen -= sizeof(xfs_bmdr_block_t);
927
928 if (leaf)
929 return blocklen / sizeof(xfs_bmdr_rec_t);
930 return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t));
931}
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index cd0d4b4bb816..a4555abb6622 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -21,9 +21,10 @@
21/21 #define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */
22/22 
23/23 struct xfs_btree_cur;
-24 struct xfs_btree_lblock;
+24 struct xfs_btree_block;
25/25 struct xfs_mount;
26/26 struct xfs_inode;
+27 struct xfs_trans;
27/28 
28/29 /*
29/30  * Bmap root header, on-disk form only.
@@ -145,71 +146,60 @@ typedef struct xfs_bmbt_key {
145/146 /* btree pointer type */
146/147 typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
147/148 
-148 /* btree block header type */
-149 typedef struct xfs_btree_lblock xfs_bmbt_block_t;
-150 
-151 #define XFS_BUF_TO_BMBT_BLOCK(bp) ((xfs_bmbt_block_t *)XFS_BUF_PTR(bp))
-152 
-153 #define XFS_BMAP_RBLOCK_DSIZE(lev,cur) ((cur)->bc_private.b.forksize)
-154 #define XFS_BMAP_RBLOCK_ISIZE(lev,cur) \
-155 	((int)XFS_IFORK_PTR((cur)->bc_private.b.ip, \
-156 	(cur)->bc_private.b.whichfork)->if_broot_bytes)
-157 
-158 #define XFS_BMAP_BLOCK_DMAXRECS(lev,cur) \
-159 	(((lev) == (cur)->bc_nlevels - 1 ? \
-160 	XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur), \
-161 	xfs_bmdr, (lev) == 0) : \
-162 	((cur)->bc_mp->m_bmap_dmxr[(lev) != 0])))
-163 #define XFS_BMAP_BLOCK_IMAXRECS(lev,cur) \
-164 	(((lev) == (cur)->bc_nlevels - 1 ? \
-165 	XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur),\
-166 	xfs_bmbt, (lev) == 0) : \
-167 	((cur)->bc_mp->m_bmap_dmxr[(lev) != 0])))
-168 
-169 #define XFS_BMAP_BLOCK_DMINRECS(lev,cur) \
-170 	(((lev) == (cur)->bc_nlevels - 1 ? \
-171 	XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur),\
-172 	xfs_bmdr, (lev) == 0) : \
-173 	((cur)->bc_mp->m_bmap_dmnr[(lev) != 0])))
-174 #define XFS_BMAP_BLOCK_IMINRECS(lev,cur) \
-175 	(((lev) == (cur)->bc_nlevels - 1 ? \
-176 	XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur),\
-177 	xfs_bmbt, (lev) == 0) : \
-178 	((cur)->bc_mp->m_bmap_dmnr[(lev) != 0])))
-179 
-180 #define XFS_BMAP_REC_DADDR(bb,i,cur) (XFS_BTREE_REC_ADDR(xfs_bmbt, bb, i))
-181 
-182 #define XFS_BMAP_REC_IADDR(bb,i,cur) (XFS_BTREE_REC_ADDR(xfs_bmbt, bb, i))
-183 
-184 #define XFS_BMAP_KEY_DADDR(bb,i,cur) \
-185 	(XFS_BTREE_KEY_ADDR(xfs_bmbt, bb, i))
-186 
-187 #define XFS_BMAP_KEY_IADDR(bb,i,cur) \
-188 	(XFS_BTREE_KEY_ADDR(xfs_bmbt, bb, i))
-189 
-190 #define XFS_BMAP_PTR_DADDR(bb,i,cur) \
-191 	(XFS_BTREE_PTR_ADDR(xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \
-192 	be16_to_cpu((bb)->bb_level), cur)))
-193 #define XFS_BMAP_PTR_IADDR(bb,i,cur) \
-194 	(XFS_BTREE_PTR_ADDR(xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \
-195 	be16_to_cpu((bb)->bb_level), cur)))
+149 /*
+150  * Btree block header size depends on a superblock flag.
+151  *
+152  * (not quite yet, but soon)
+153  */
+154 #define XFS_BMBT_BLOCK_LEN(mp) XFS_BTREE_LBLOCK_LEN
+155 
+156 #define XFS_BMBT_REC_ADDR(mp, block, index) \
+157 	((xfs_bmbt_rec_t *) \
+158 	((char *)(block) + \
+159 	XFS_BMBT_BLOCK_LEN(mp) + \
+160 	((index) - 1) * sizeof(xfs_bmbt_rec_t)))
+161 
+162 #define XFS_BMBT_KEY_ADDR(mp, block, index) \
+163 	((xfs_bmbt_key_t *) \
+164 	((char *)(block) + \
+165 	XFS_BMBT_BLOCK_LEN(mp) + \
+166 	((index) - 1) * sizeof(xfs_bmbt_key_t)))
+167 
+168 #define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \
+169 	((xfs_bmbt_ptr_t *) \
+170 	((char *)(block) + \
+171 	XFS_BMBT_BLOCK_LEN(mp) + \
+172 	(maxrecs) * sizeof(xfs_bmbt_key_t) + \
+173 	((index) - 1) * sizeof(xfs_bmbt_ptr_t)))
+174 
+175 #define XFS_BMDR_REC_ADDR(block, index) \
+176 	((xfs_bmdr_rec_t *) \
+177 	((char *)(block) + \
+178 	sizeof(struct xfs_bmdr_block) + \
+179 	((index) - 1) * sizeof(xfs_bmdr_rec_t)))
+180 
+181 #define XFS_BMDR_KEY_ADDR(block, index) \
+182 	((xfs_bmdr_key_t *) \
+183 	((char *)(block) + \
+184 	sizeof(struct xfs_bmdr_block) + \
+185 	((index) - 1) * sizeof(xfs_bmdr_key_t)))
+186 
+187 #define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \
+188 	((xfs_bmdr_ptr_t *) \
+189 	((char *)(block) + \
+190 	sizeof(struct xfs_bmdr_block) + \
+191 	(maxrecs) * sizeof(xfs_bmdr_key_t) + \
+192 	((index) - 1) * sizeof(xfs_bmdr_ptr_t)))
196/193 
197/194 /*
198/195  * These are to be used when we know the size of the block and
199/196  * we don't have a cursor.
200/197  */
-201 #define XFS_BMAP_BROOT_REC_ADDR(bb,i,sz) \
-202 	(XFS_BTREE_REC_ADDR(xfs_bmbt,bb,i))
-203 #define XFS_BMAP_BROOT_KEY_ADDR(bb,i,sz) \
-204 	(XFS_BTREE_KEY_ADDR(xfs_bmbt,bb,i))
-205 #define XFS_BMAP_BROOT_PTR_ADDR(bb,i,sz) \
-206 	(XFS_BTREE_PTR_ADDR(xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz)))
-207 
-208 #define XFS_BMAP_BROOT_NUMRECS(bb) be16_to_cpu((bb)->bb_numrecs)
-209 #define XFS_BMAP_BROOT_MAXRECS(sz) XFS_BTREE_BLOCK_MAXRECS(sz,xfs_bmbt,0)
+198 #define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \
+199 	XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0))
210/200 
211/201 #define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \
-212 	(int)(sizeof(xfs_bmbt_block_t) + \
+202 	(int)(XFS_BTREE_LBLOCK_LEN + \
213/203 	((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
214/204 
215/205 #define XFS_BMAP_BROOT_SPACE(bb) \
@@ -223,42 +213,12 @@ typedef struct xfs_btree_lblock xfs_bmbt_block_t;
223/213  */
224/214 #define XFS_BM_MAXLEVELS(mp,w) ((mp)->m_bm_maxlevels[(w)])
225/215 
226#define XFS_BMAP_SANITY_CHECK(mp,bb,level) \
227 (be32_to_cpu((bb)->bb_magic) == XFS_BMAP_MAGIC && \
228 be16_to_cpu((bb)->bb_level) == level && \
229 be16_to_cpu((bb)->bb_numrecs) > 0 && \
230 be16_to_cpu((bb)->bb_numrecs) <= (mp)->m_bmap_dmxr[(level) != 0])
231
232
233#ifdef __KERNEL__
234
235#if defined(XFS_BMBT_TRACE)
236/*
237 * Trace buffer entry types.
238 */
239#define XFS_BMBT_KTRACE_ARGBI 1
240#define XFS_BMBT_KTRACE_ARGBII 2
241#define XFS_BMBT_KTRACE_ARGFFFI 3
242#define XFS_BMBT_KTRACE_ARGI 4
243#define XFS_BMBT_KTRACE_ARGIFK 5
244#define XFS_BMBT_KTRACE_ARGIFR 6
245#define XFS_BMBT_KTRACE_ARGIK 7
246#define XFS_BMBT_KTRACE_CUR 8
247
248#define XFS_BMBT_TRACE_SIZE 4096 /* size of global trace buffer */
249#define XFS_BMBT_KTRACE_SIZE 32 /* size of per-inode trace buffer */
250extern ktrace_t *xfs_bmbt_trace_buf;
251#endif
252
253/216 /*
254/217  * Prototypes for xfs_bmap.c to call.
255/218  */
-256 extern void xfs_bmdr_to_bmbt(xfs_bmdr_block_t *, int, xfs_bmbt_block_t *, int);
-257 extern int xfs_bmbt_decrement(struct xfs_btree_cur *, int, int *);
-258 extern int xfs_bmbt_delete(struct xfs_btree_cur *, int *);
+219 extern void xfs_bmdr_to_bmbt(struct xfs_mount *, xfs_bmdr_block_t *, int,
+220 		struct xfs_btree_block *, int);
259/221 extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
-260 extern xfs_bmbt_block_t *xfs_bmbt_get_block(struct xfs_btree_cur *cur,
-261 		int, struct xfs_buf **bpp);
262/222 extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r);
263/223 extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r);
264/224 extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r);
@@ -268,22 +228,6 @@ extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
268/228 extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
269/229 extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
270/230 
271extern int xfs_bmbt_increment(struct xfs_btree_cur *, int, int *);
272extern int xfs_bmbt_insert(struct xfs_btree_cur *, int *);
273extern void xfs_bmbt_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
274extern void xfs_bmbt_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int,
275 int);
276extern int xfs_bmbt_lookup_eq(struct xfs_btree_cur *, xfs_fileoff_t,
277 xfs_fsblock_t, xfs_filblks_t, int *);
278extern int xfs_bmbt_lookup_ge(struct xfs_btree_cur *, xfs_fileoff_t,
279 xfs_fsblock_t, xfs_filblks_t, int *);
280
281/*
282 * Give the bmap btree a new root block. Copy the old broot contents
283 * down into a real block and make the broot point to it.
284 */
285extern int xfs_bmbt_newroot(struct xfs_btree_cur *cur, int *lflags, int *stat);
286
287/231 extern void xfs_bmbt_set_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
288/232 extern void xfs_bmbt_set_allf(xfs_bmbt_rec_host_t *r, xfs_fileoff_t o,
289/233 	xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
@@ -296,10 +240,15 @@ extern void xfs_bmbt_disk_set_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
296/240 extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o,
297/241 	xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
298/242 
-299 extern void xfs_bmbt_to_bmdr(xfs_bmbt_block_t *, int, xfs_bmdr_block_t *, int);
-300 extern int xfs_bmbt_update(struct xfs_btree_cur *, xfs_fileoff_t,
-301 	xfs_fsblock_t, xfs_filblks_t, xfs_exntst_t);
+243 extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
+244 	xfs_bmdr_block_t *, int);
+245 
+246 extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
+247 extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf);
+248 extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
+249 
+250 extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
+251 	struct xfs_trans *, struct xfs_inode *, int);
302/252 
303#endif /* __KERNEL__ */
304 253
305/254 #endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index cc593a84c345..7ed59267420d 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -34,7 +34,9 @@
34#include "xfs_attr_sf.h" 34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h" 35#include "xfs_dinode.h"
36#include "xfs_inode.h" 36#include "xfs_inode.h"
37#include "xfs_inode_item.h"
37#include "xfs_btree.h" 38#include "xfs_btree.h"
39#include "xfs_btree_trace.h"
38#include "xfs_ialloc.h" 40#include "xfs_ialloc.h"
39#include "xfs_error.h" 41#include "xfs_error.h"
40 42
@@ -50,135 +52,33 @@ const __uint32_t xfs_magics[XFS_BTNUM_MAX] = {
 	XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC
 };
 
-/*
- * Checking routine: return maxrecs for the block.
- */
-STATIC int				/* number of records fitting in block */
-xfs_btree_maxrecs(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	xfs_btree_block_t	*block)	/* generic btree block pointer */
-{
-	switch (cur->bc_btnum) {
-	case XFS_BTNUM_BNO:
-	case XFS_BTNUM_CNT:
-		return (int)XFS_ALLOC_BLOCK_MAXRECS(
-				be16_to_cpu(block->bb_h.bb_level), cur);
-	case XFS_BTNUM_BMAP:
-		return (int)XFS_BMAP_BLOCK_IMAXRECS(
-				be16_to_cpu(block->bb_h.bb_level), cur);
-	case XFS_BTNUM_INO:
-		return (int)XFS_INOBT_BLOCK_MAXRECS(
-				be16_to_cpu(block->bb_h.bb_level), cur);
-	default:
-		ASSERT(0);
-		return 0;
-	}
-}
-
-/*
- * External routines.
- */
-
-#ifdef DEBUG
-/*
- * Debug routine: check that block header is ok.
- */
-void
-xfs_btree_check_block(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	xfs_btree_block_t	*block,	/* generic btree block pointer */
-	int			level,	/* level of the btree block */
-	xfs_buf_t		*bp)	/* buffer containing block, if any */
-{
-	if (XFS_BTREE_LONG_PTRS(cur->bc_btnum))
-		xfs_btree_check_lblock(cur, (xfs_btree_lblock_t *)block, level,
-			bp);
-	else
-		xfs_btree_check_sblock(cur, (xfs_btree_sblock_t *)block, level,
-			bp);
-}
-
-/*
- * Debug routine: check that keys are in the right order.
- */
-void
-xfs_btree_check_key(
-	xfs_btnum_t	btnum,		/* btree identifier */
-	void		*ak1,		/* pointer to left (lower) key */
-	void		*ak2)		/* pointer to right (higher) key */
-{
-	switch (btnum) {
-	case XFS_BTNUM_BNO: {
-		xfs_alloc_key_t	*k1;
-		xfs_alloc_key_t	*k2;
-
-		k1 = ak1;
-		k2 = ak2;
-		ASSERT(be32_to_cpu(k1->ar_startblock) < be32_to_cpu(k2->ar_startblock));
-		break;
-	}
-	case XFS_BTNUM_CNT: {
-		xfs_alloc_key_t	*k1;
-		xfs_alloc_key_t	*k2;
-
-		k1 = ak1;
-		k2 = ak2;
-		ASSERT(be32_to_cpu(k1->ar_blockcount) < be32_to_cpu(k2->ar_blockcount) ||
-		       (k1->ar_blockcount == k2->ar_blockcount &&
-			be32_to_cpu(k1->ar_startblock) < be32_to_cpu(k2->ar_startblock)));
-		break;
-	}
-	case XFS_BTNUM_BMAP: {
-		xfs_bmbt_key_t	*k1;
-		xfs_bmbt_key_t	*k2;
-
-		k1 = ak1;
-		k2 = ak2;
-		ASSERT(be64_to_cpu(k1->br_startoff) < be64_to_cpu(k2->br_startoff));
-		break;
-	}
-	case XFS_BTNUM_INO: {
-		xfs_inobt_key_t	*k1;
-		xfs_inobt_key_t	*k2;
-
-		k1 = ak1;
-		k2 = ak2;
-		ASSERT(be32_to_cpu(k1->ir_startino) < be32_to_cpu(k2->ir_startino));
-		break;
-	}
-	default:
-		ASSERT(0);
-	}
-}
-#endif	/* DEBUG */
 
-/*
- * Checking routine: check that long form block header is ok.
- */
-/* ARGSUSED */
-int					/* error (0 or EFSCORRUPTED) */
+STATIC int				/* error (0 or EFSCORRUPTED) */
 xfs_btree_check_lblock(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	xfs_btree_lblock_t	*block,	/* btree long form block pointer */
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	struct xfs_btree_block	*block,	/* btree long form block pointer */
 	int			level,	/* level of the btree block */
-	xfs_buf_t		*bp)	/* buffer for block, if any */
+	struct xfs_buf		*bp)	/* buffer for block, if any */
 {
 	int			lblock_ok; /* block passes checks */
-	xfs_mount_t		*mp;	/* file system mount point */
+	struct xfs_mount	*mp;	/* file system mount point */
 
 	mp = cur->bc_mp;
 	lblock_ok =
 		be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] &&
 		be16_to_cpu(block->bb_level) == level &&
 		be16_to_cpu(block->bb_numrecs) <=
-			xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) &&
-		block->bb_leftsib &&
-		(be64_to_cpu(block->bb_leftsib) == NULLDFSBNO ||
-		 XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_leftsib))) &&
-		block->bb_rightsib &&
-		(be64_to_cpu(block->bb_rightsib) == NULLDFSBNO ||
-		 XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_rightsib)));
-	if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK,
+			cur->bc_ops->get_maxrecs(cur, level) &&
+		block->bb_u.l.bb_leftsib &&
+		(be64_to_cpu(block->bb_u.l.bb_leftsib) == NULLDFSBNO ||
+		 XFS_FSB_SANITY_CHECK(mp,
+			be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
+		block->bb_u.l.bb_rightsib &&
+		(be64_to_cpu(block->bb_u.l.bb_rightsib) == NULLDFSBNO ||
+		 XFS_FSB_SANITY_CHECK(mp,
+			be64_to_cpu(block->bb_u.l.bb_rightsib)));
+	if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
+			XFS_ERRTAG_BTREE_CHECK_LBLOCK,
 			XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
 		if (bp)
 			xfs_buftrace("LBTREE ERROR", bp);
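
The rewritten check above validates a long-form header through the cursor's get_maxrecs callback instead of the removed per-type switch: magic, level, record count and both sibling pointers must all be plausible. A self-contained model of the same sanity test, under made-up names and a made-up sentinel (illustrative only, not kernel code):

/*
 * Illustrative sketch only. A block is accepted when its magic, level,
 * record count and sibling pointers all pass; NULLSIB and all struct
 * names here are invented for the example.
 */
#include <stdint.h>
#include <stdio.h>

#define NULLSIB		((uint64_t)-1)	/* "no sibling" sentinel */

struct toy_block {
	uint32_t	magic;
	uint16_t	level;
	uint16_t	numrecs;
	uint64_t	leftsib;
	uint64_t	rightsib;
};

static int sib_ok(uint64_t sib, uint64_t nblocks)
{
	/* a sibling is either the sentinel or a valid block address */
	return sib == NULLSIB || sib < nblocks;
}

static int toy_check_block(const struct toy_block *b, uint32_t magic,
			   int level, int maxrecs, uint64_t nblocks)
{
	return b->magic == magic &&
	       b->level == level &&
	       b->numrecs <= maxrecs &&
	       sib_ok(b->leftsib, nblocks) &&
	       sib_ok(b->rightsib, nblocks);
}

int main(void)
{
	struct toy_block b = { 0x424d4150, 1, 100, NULLSIB, 42 };

	printf("ok=%d\n", toy_check_block(&b, 0x424d4150, 1, 256, 1024));
	return 0;
}
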
@@ -189,98 +89,15 @@ xfs_btree_check_lblock(
 	return 0;
 }
 
-/*
- * Checking routine: check that (long) pointer is ok.
- */
-int					/* error (0 or EFSCORRUPTED) */
-xfs_btree_check_lptr(
-	xfs_btree_cur_t	*cur,		/* btree cursor */
-	xfs_dfsbno_t	ptr,		/* btree block disk address */
-	int		level)		/* btree block level */
-{
-	xfs_mount_t	*mp;		/* file system mount point */
-
-	mp = cur->bc_mp;
-	XFS_WANT_CORRUPTED_RETURN(
-		level > 0 &&
-		ptr != NULLDFSBNO &&
-		XFS_FSB_SANITY_CHECK(mp, ptr));
-	return 0;
-}
-
-#ifdef DEBUG
-/*
- * Debug routine: check that records are in the right order.
- */
-void
-xfs_btree_check_rec(
-	xfs_btnum_t	btnum,		/* btree identifier */
-	void		*ar1,		/* pointer to left (lower) record */
-	void		*ar2)		/* pointer to right (higher) record */
-{
-	switch (btnum) {
-	case XFS_BTNUM_BNO: {
-		xfs_alloc_rec_t	*r1;
-		xfs_alloc_rec_t	*r2;
-
-		r1 = ar1;
-		r2 = ar2;
-		ASSERT(be32_to_cpu(r1->ar_startblock) +
-		       be32_to_cpu(r1->ar_blockcount) <=
-		       be32_to_cpu(r2->ar_startblock));
-		break;
-	}
-	case XFS_BTNUM_CNT: {
-		xfs_alloc_rec_t	*r1;
-		xfs_alloc_rec_t	*r2;
-
-		r1 = ar1;
-		r2 = ar2;
-		ASSERT(be32_to_cpu(r1->ar_blockcount) < be32_to_cpu(r2->ar_blockcount) ||
-		       (r1->ar_blockcount == r2->ar_blockcount &&
-			be32_to_cpu(r1->ar_startblock) < be32_to_cpu(r2->ar_startblock)));
-		break;
-	}
-	case XFS_BTNUM_BMAP: {
-		xfs_bmbt_rec_t	*r1;
-		xfs_bmbt_rec_t	*r2;
-
-		r1 = ar1;
-		r2 = ar2;
-		ASSERT(xfs_bmbt_disk_get_startoff(r1) +
-		       xfs_bmbt_disk_get_blockcount(r1) <=
-		       xfs_bmbt_disk_get_startoff(r2));
-		break;
-	}
-	case XFS_BTNUM_INO: {
-		xfs_inobt_rec_t	*r1;
-		xfs_inobt_rec_t	*r2;
-
-		r1 = ar1;
-		r2 = ar2;
-		ASSERT(be32_to_cpu(r1->ir_startino) + XFS_INODES_PER_CHUNK <=
-		       be32_to_cpu(r2->ir_startino));
-		break;
-	}
-	default:
-		ASSERT(0);
-	}
-}
-#endif	/* DEBUG */
-
-/*
- * Checking routine: check that block header is ok.
- */
-/* ARGSUSED */
-int					/* error (0 or EFSCORRUPTED) */
+STATIC int				/* error (0 or EFSCORRUPTED) */
 xfs_btree_check_sblock(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
-	xfs_btree_sblock_t	*block,	/* btree short form block pointer */
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	struct xfs_btree_block	*block,	/* btree short form block pointer */
 	int			level,	/* level of the btree block */
-	xfs_buf_t		*bp)	/* buffer containing block */
+	struct xfs_buf		*bp)	/* buffer containing block */
 {
-	xfs_buf_t		*agbp;	/* buffer for ag. freespace struct */
-	xfs_agf_t		*agf;	/* ag. freespace structure */
+	struct xfs_buf		*agbp;	/* buffer for ag. freespace struct */
+	struct xfs_agf		*agf;	/* ag. freespace structure */
 	xfs_agblock_t		agflen; /* native ag. freespace length */
 	int			sblock_ok; /* block passes checks */
 
@@ -291,13 +108,13 @@ xfs_btree_check_sblock(
 		be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] &&
 		be16_to_cpu(block->bb_level) == level &&
 		be16_to_cpu(block->bb_numrecs) <=
-			xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) &&
-		(be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK ||
-		 be32_to_cpu(block->bb_leftsib) < agflen) &&
-		block->bb_leftsib &&
-		(be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK ||
-		 be32_to_cpu(block->bb_rightsib) < agflen) &&
-		block->bb_rightsib;
+			cur->bc_ops->get_maxrecs(cur, level) &&
+		(be32_to_cpu(block->bb_u.s.bb_leftsib) == NULLAGBLOCK ||
+		 be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
+		block->bb_u.s.bb_leftsib &&
+		(be32_to_cpu(block->bb_u.s.bb_rightsib) == NULLAGBLOCK ||
+		 be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
+		block->bb_u.s.bb_rightsib;
 	if (unlikely(XFS_TEST_ERROR(!sblock_ok, cur->bc_mp,
 			XFS_ERRTAG_BTREE_CHECK_SBLOCK,
 			XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
@@ -311,27 +128,78 @@ xfs_btree_check_sblock(
 }
 
 /*
- * Checking routine: check that (short) pointer is ok.
+ * Debug routine: check that block header is ok.
+ */
+int
+xfs_btree_check_block(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	struct xfs_btree_block	*block,	/* generic btree block pointer */
+	int			level,	/* level of the btree block */
+	struct xfs_buf		*bp)	/* buffer containing block, if any */
+{
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		return xfs_btree_check_lblock(cur, block, level, bp);
+	else
+		return xfs_btree_check_sblock(cur, block, level, bp);
+}
+
+/*
+ * Check that (long) pointer is ok.
  */
 int					/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lptr(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_dfsbno_t		bno,	/* btree block disk address */
+	int			level)	/* btree block level */
+{
+	XFS_WANT_CORRUPTED_RETURN(
+		level > 0 &&
+		bno != NULLDFSBNO &&
+		XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
+	return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Check that (short) pointer is ok.
+ */
+STATIC int				/* error (0 or EFSCORRUPTED) */
 xfs_btree_check_sptr(
-	xfs_btree_cur_t	*cur,		/* btree cursor */
-	xfs_agblock_t	ptr,		/* btree block disk address */
-	int		level)		/* btree block level */
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	xfs_agblock_t		bno,	/* btree block disk address */
+	int			level)	/* btree block level */
 {
-	xfs_buf_t	*agbp;		/* buffer for ag. freespace struct */
-	xfs_agf_t	*agf;		/* ag. freespace structure */
+	xfs_agblock_t		agblocks = cur->bc_mp->m_sb.sb_agblocks;
 
-	agbp = cur->bc_private.a.agbp;
-	agf = XFS_BUF_TO_AGF(agbp);
 	XFS_WANT_CORRUPTED_RETURN(
 		level > 0 &&
-		ptr != NULLAGBLOCK && ptr != 0 &&
-		ptr < be32_to_cpu(agf->agf_length));
+		bno != NULLAGBLOCK &&
+		bno != 0 &&
+		bno < agblocks);
 	return 0;
 }
 
 /*
+ * Check that block ptr is ok.
+ */
+STATIC int				/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_ptr(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	union xfs_btree_ptr	*ptr,	/* btree block disk address */
+	int			index,	/* offset from ptr to check */
+	int			level)	/* btree block level */
+{
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+		return xfs_btree_check_lptr(cur,
+				be64_to_cpu((&ptr->l)[index]), level);
+	} else {
+		return xfs_btree_check_sptr(cur,
+				be32_to_cpu((&ptr->s)[index]), level);
+	}
+}
+#endif
+
+/*
  * Delete the btree cursor.
  */
 void
@@ -387,16 +255,17 @@ xfs_btree_dup_cursor(
 
 	tp = cur->bc_tp;
 	mp = cur->bc_mp;
+
 	/*
 	 * Allocate a new cursor like the old one.
 	 */
-	new = xfs_btree_init_cursor(mp, tp, cur->bc_private.a.agbp,
-		cur->bc_private.a.agno, cur->bc_btnum, cur->bc_private.b.ip,
-		cur->bc_private.b.whichfork);
+	new = cur->bc_ops->dup_cursor(cur);
+
 	/*
 	 * Copy the record currently in the cursor.
 	 */
 	new->bc_rec = cur->bc_rec;
+
 	/*
 	 * For each level current, re-get the buffer and copy the ptr value.
 	 */
@@ -416,46 +285,174 @@ xfs_btree_dup_cursor(
 	} else
 		new->bc_bufs[i] = NULL;
 	}
-	/*
-	 * For bmap btrees, copy the firstblock, flist, and flags values,
-	 * since init cursor doesn't get them.
-	 */
-	if (new->bc_btnum == XFS_BTNUM_BMAP) {
-		new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
-		new->bc_private.b.flist = cur->bc_private.b.flist;
-		new->bc_private.b.flags = cur->bc_private.b.flags;
-	}
 	*ncur = new;
 	return 0;
 }
 
 /*
293 * XFS btree block layout and addressing:
294 *
295 * There are two types of blocks in the btree: leaf and non-leaf blocks.
296 *
297 * A leaf block starts with a header followed by records containing
298 * the values. A non-leaf block also starts with the same header, and
299 * then contains lookup keys followed by an equal number of pointers
300 * to the btree blocks at the previous level.
301 *
302 * +--------+-------+-------+-------+-------+-------+-------+
303 * Leaf: | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
304 * +--------+-------+-------+-------+-------+-------+-------+
305 *
306 * +--------+-------+-------+-------+-------+-------+-------+
307 * Non-Leaf: | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
308 * +--------+-------+-------+-------+-------+-------+-------+
309 *
310 * The header is called struct xfs_btree_block for reasons better left unknown
311 * and comes in different versions for short (32bit) and long (64bit) block
312 * pointers. The record and key structures are defined by the btree instances
313 * and opaque to the btree core. The block pointers are simple disk endian
314 * integers, available in a short (32bit) and long (64bit) variant.
315 *
316 * The helpers below calculate the offset of a given record, key or pointer
317 * into a btree block (xfs_btree_*_offset) or return a pointer to the given
318 * record, key or pointer (xfs_btree_*_addr). Note that all addressing
319 * inside the btree block is done using indices starting at one, not zero!
320 */
321
322/*
323 * Return size of the btree block header for this btree instance.
324 */
325static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
326{
327 return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
328 XFS_BTREE_LBLOCK_LEN :
329 XFS_BTREE_SBLOCK_LEN;
330}
331
332/*
333 * Return size of btree block pointers for this btree instance.
334 */
335static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
336{
337 return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
338 sizeof(__be64) : sizeof(__be32);
339}
340
341/*
342 * Calculate offset of the n-th record in a btree block.
343 */
344STATIC size_t
345xfs_btree_rec_offset(
346 struct xfs_btree_cur *cur,
347 int n)
348{
349 return xfs_btree_block_len(cur) +
350 (n - 1) * cur->bc_ops->rec_len;
351}
352
353/*
354 * Calculate offset of the n-th key in a btree block.
355 */
356STATIC size_t
357xfs_btree_key_offset(
358 struct xfs_btree_cur *cur,
359 int n)
360{
361 return xfs_btree_block_len(cur) +
362 (n - 1) * cur->bc_ops->key_len;
363}
364
365/*
366 * Calculate offset of the n-th block pointer in a btree block.
367 */
368STATIC size_t
369xfs_btree_ptr_offset(
370 struct xfs_btree_cur *cur,
371 int n,
372 int level)
373{
374 return xfs_btree_block_len(cur) +
375 cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
376 (n - 1) * xfs_btree_ptr_len(cur);
377}
378
379/*
380 * Return a pointer to the n-th record in the btree block.
381 */
382STATIC union xfs_btree_rec *
383xfs_btree_rec_addr(
384 struct xfs_btree_cur *cur,
385 int n,
386 struct xfs_btree_block *block)
387{
388 return (union xfs_btree_rec *)
389 ((char *)block + xfs_btree_rec_offset(cur, n));
390}
391
392/*
393 * Return a pointer to the n-th key in the btree block.
394 */
395STATIC union xfs_btree_key *
396xfs_btree_key_addr(
397 struct xfs_btree_cur *cur,
398 int n,
399 struct xfs_btree_block *block)
400{
401 return (union xfs_btree_key *)
402 ((char *)block + xfs_btree_key_offset(cur, n));
403}
404
405/*
406 * Return a pointer to the n-th block pointer in the btree block.
407 */
408STATIC union xfs_btree_ptr *
409xfs_btree_ptr_addr(
410 struct xfs_btree_cur *cur,
411 int n,
412 struct xfs_btree_block *block)
413{
414 int level = xfs_btree_get_level(block);
415
416 ASSERT(block->bb_level != 0);
417
418 return (union xfs_btree_ptr *)
419 ((char *)block + xfs_btree_ptr_offset(cur, n, level));
420}
421
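
To make the 1-based offset arithmetic of the helpers above concrete, here is a toy geometry worked through in plain C. It is a sketch only: the header, key, record and pointer sizes are invented for the example, not taken from the on-disk format.

/*
 * Illustrative sketch only: the rec/key/ptr offset formulas applied to a
 * made-up geometry. Indices are 1-based, and in a non-leaf block all
 * MAXRECS keys precede the block pointers.
 */
#include <stddef.h>
#include <stdio.h>

#define HDR_LEN		24	/* toy header size */
#define KEY_LEN		8
#define REC_LEN		16
#define PTR_LEN		8
#define MAXRECS		4	/* keys (and ptrs) per non-leaf block */

static size_t rec_offset(int n) { return HDR_LEN + (n - 1) * REC_LEN; }
static size_t key_offset(int n) { return HDR_LEN + (n - 1) * KEY_LEN; }
static size_t ptr_offset(int n)
{
	return HDR_LEN + MAXRECS * KEY_LEN + (n - 1) * PTR_LEN;
}

int main(void)
{
	/* rec 1 starts right after the header: offset 24 */
	printf("rec 1 at %zu, rec 2 at %zu\n", rec_offset(1), rec_offset(2));
	/* ptr 1 starts after all 4 keys: 24 + 4*8 = 56 */
	printf("key 3 at %zu, ptr 1 at %zu\n", key_offset(3), ptr_offset(1));
	return 0;
}
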
422/*
423 * Get the root block, which is stored in the inode.
424 *
425 * For now this btree implementation assumes the btree root is always
426 * stored in the if_broot field of an inode fork.
427 */
428STATIC struct xfs_btree_block *
429xfs_btree_get_iroot(
430 struct xfs_btree_cur *cur)
431{
432 struct xfs_ifork *ifp;
433
434 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
435 return (struct xfs_btree_block *)ifp->if_broot;
436}
437
438/*
  * Retrieve the block pointer from the cursor at the given level.
- * This may be a bmap btree root or from a buffer.
+ * This may be an inode btree root or from a buffer.
  */
-STATIC xfs_btree_block_t *		/* generic btree block pointer */
+STATIC struct xfs_btree_block *		/* generic btree block pointer */
 xfs_btree_get_block(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
+	struct xfs_btree_cur	*cur,	/* btree cursor */
 	int			level,	/* level in btree */
-	xfs_buf_t		**bpp)	/* buffer containing the block */
+	struct xfs_buf		**bpp)	/* buffer containing the block */
 {
-	xfs_btree_block_t	*block;	/* return value */
-	xfs_buf_t		*bp;	/* return buffer */
-	xfs_ifork_t		*ifp;	/* inode fork pointer */
-	int			whichfork; /* data or attr fork */
-
-	if (cur->bc_btnum == XFS_BTNUM_BMAP && level == cur->bc_nlevels - 1) {
-		whichfork = cur->bc_private.b.whichfork;
-		ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, whichfork);
-		block = (xfs_btree_block_t *)ifp->if_broot;
-		bp = NULL;
-	} else {
-		bp = cur->bc_bufs[level];
-		block = XFS_BUF_TO_BLOCK(bp);
+	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+	    (level == cur->bc_nlevels - 1)) {
+		*bpp = NULL;
+		return xfs_btree_get_iroot(cur);
 	}
-	ASSERT(block != NULL);
-	*bpp = bp;
-	return block;
+
+	*bpp = cur->bc_bufs[level];
+	return XFS_BUF_TO_BLOCK(*bpp);
 }
 
 /*
@@ -505,97 +502,6 @@ xfs_btree_get_bufs(
 }
 
 /*
- * Allocate a new btree cursor.
- * The cursor is either for allocation (A) or bmap (B) or inodes (I).
- */
-xfs_btree_cur_t *			/* new btree cursor */
-xfs_btree_init_cursor(
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_buf_t	*agbp,		/* (A only) buffer for agf structure */
-					/* (I only) buffer for agi structure */
-	xfs_agnumber_t	agno,		/* (AI only) allocation group number */
-	xfs_btnum_t	btnum,		/* btree identifier */
-	xfs_inode_t	*ip,		/* (B only) inode owning the btree */
-	int		whichfork)	/* (B only) data or attr fork */
-{
-	xfs_agf_t	*agf;		/* (A) allocation group freespace */
-	xfs_agi_t	*agi;		/* (I) allocation group inodespace */
-	xfs_btree_cur_t	*cur;		/* return value */
-	xfs_ifork_t	*ifp;		/* (I) inode fork pointer */
-	int		nlevels=0;	/* number of levels in the btree */
-
-	ASSERT(xfs_btree_cur_zone != NULL);
-	/*
-	 * Allocate a new cursor.
-	 */
-	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
-	/*
-	 * Deduce the number of btree levels from the arguments.
-	 */
-	switch (btnum) {
-	case XFS_BTNUM_BNO:
-	case XFS_BTNUM_CNT:
-		agf = XFS_BUF_TO_AGF(agbp);
-		nlevels = be32_to_cpu(agf->agf_levels[btnum]);
-		break;
-	case XFS_BTNUM_BMAP:
-		ifp = XFS_IFORK_PTR(ip, whichfork);
-		nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
-		break;
-	case XFS_BTNUM_INO:
-		agi = XFS_BUF_TO_AGI(agbp);
-		nlevels = be32_to_cpu(agi->agi_level);
-		break;
-	default:
-		ASSERT(0);
-	}
-	/*
-	 * Fill in the common fields.
-	 */
-	cur->bc_tp = tp;
-	cur->bc_mp = mp;
-	cur->bc_nlevels = nlevels;
-	cur->bc_btnum = btnum;
-	cur->bc_blocklog = mp->m_sb.sb_blocklog;
-	/*
-	 * Fill in private fields.
-	 */
-	switch (btnum) {
-	case XFS_BTNUM_BNO:
-	case XFS_BTNUM_CNT:
-		/*
-		 * Allocation btree fields.
-		 */
-		cur->bc_private.a.agbp = agbp;
-		cur->bc_private.a.agno = agno;
-		break;
-	case XFS_BTNUM_INO:
-		/*
-		 * Inode allocation btree fields.
-		 */
-		cur->bc_private.a.agbp = agbp;
-		cur->bc_private.a.agno = agno;
-		break;
-	case XFS_BTNUM_BMAP:
-		/*
-		 * Bmap btree fields.
-		 */
-		cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
-		cur->bc_private.b.ip = ip;
-		cur->bc_private.b.firstblock = NULLFSBLOCK;
-		cur->bc_private.b.flist = NULL;
-		cur->bc_private.b.allocated = 0;
-		cur->bc_private.b.flags = 0;
-		cur->bc_private.b.whichfork = whichfork;
-		break;
-	default:
-		ASSERT(0);
-	}
-	return cur;
-}
-
-/*
  * Check for the cursor referring to the last block at the given level.
  */
 int					/* 1=is last block, 0=not last block */
@@ -603,12 +509,12 @@ xfs_btree_islastblock(
 	xfs_btree_cur_t		*cur,	/* btree cursor */
 	int			level)	/* level to check */
 {
-	xfs_btree_block_t	*block;	/* generic btree block pointer */
+	struct xfs_btree_block	*block;	/* generic btree block pointer */
 	xfs_buf_t		*bp;	/* buffer containing block */
 
 	block = xfs_btree_get_block(cur, level, &bp);
 	xfs_btree_check_block(cur, block, level, bp);
-	if (XFS_BTREE_LONG_PTRS(cur->bc_btnum))
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
 		return be64_to_cpu(block->bb_u.l.bb_rightsib) == NULLDFSBNO;
 	else
 		return be32_to_cpu(block->bb_u.s.bb_rightsib) == NULLAGBLOCK;
@@ -618,12 +524,12 @@ xfs_btree_islastblock(
  * Change the cursor to point to the first record at the given level.
  * Other levels are unaffected.
  */
-int					/* success=1, failure=0 */
+STATIC int				/* success=1, failure=0 */
 xfs_btree_firstrec(
 	xfs_btree_cur_t		*cur,	/* btree cursor */
 	int			level)	/* level to change */
 {
-	xfs_btree_block_t	*block;	/* generic btree block pointer */
+	struct xfs_btree_block	*block;	/* generic btree block pointer */
 	xfs_buf_t		*bp;	/* buffer containing block */
 
 	/*
@@ -634,7 +540,7 @@ xfs_btree_firstrec(
 	/*
 	 * It's empty, there is no such record.
 	 */
-	if (!block->bb_h.bb_numrecs)
+	if (!block->bb_numrecs)
 		return 0;
 	/*
 	 * Set the ptr value to 1, that's the first record/key.
@@ -647,12 +553,12 @@ xfs_btree_firstrec(
  * Change the cursor to point to the last record in the current block
  * at the given level.  Other levels are unaffected.
  */
-int					/* success=1, failure=0 */
+STATIC int				/* success=1, failure=0 */
 xfs_btree_lastrec(
 	xfs_btree_cur_t		*cur,	/* btree cursor */
 	int			level)	/* level to change */
 {
-	xfs_btree_block_t	*block;	/* generic btree block pointer */
+	struct xfs_btree_block	*block;	/* generic btree block pointer */
 	xfs_buf_t		*bp;	/* buffer containing block */
 
 	/*
@@ -663,12 +569,12 @@ xfs_btree_lastrec(
 	/*
 	 * It's empty, there is no such record.
 	 */
-	if (!block->bb_h.bb_numrecs)
+	if (!block->bb_numrecs)
 		return 0;
 	/*
 	 * Set the ptr value to numrecs, that's the last record/key.
 	 */
-	cur->bc_ptrs[level] = be16_to_cpu(block->bb_h.bb_numrecs);
+	cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
 	return 1;
 }
 
@@ -817,66 +723,84 @@ xfs_btree_reada_bufs(
 	xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count);
 }
 
726STATIC int
727xfs_btree_readahead_lblock(
728 struct xfs_btree_cur *cur,
729 int lr,
730 struct xfs_btree_block *block)
731{
732 int rval = 0;
733 xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib);
734 xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
735
736 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
737 xfs_btree_reada_bufl(cur->bc_mp, left, 1);
738 rval++;
739 }
740
741 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) {
742 xfs_btree_reada_bufl(cur->bc_mp, right, 1);
743 rval++;
744 }
745
746 return rval;
747}
748
749STATIC int
750xfs_btree_readahead_sblock(
751 struct xfs_btree_cur *cur,
752 int lr,
753 struct xfs_btree_block *block)
754{
755 int rval = 0;
756 xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib);
757 xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib);
758
759
760 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
761 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
762 left, 1);
763 rval++;
764 }
765
766 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
767 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
768 right, 1);
769 rval++;
770 }
771
772 return rval;
773}
774
 /*
  * Read-ahead btree blocks, at the given level.
  * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
  */
-int
-xfs_btree_readahead_core(
-	xfs_btree_cur_t		*cur,	/* btree cursor */
+STATIC int
+xfs_btree_readahead(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
 	int			lev,	/* level in btree */
 	int			lr)	/* left/right bits */
 {
-	xfs_alloc_block_t	*a;
-	xfs_bmbt_block_t	*b;
-	xfs_inobt_block_t	*i;
-	int			rval = 0;
+	struct xfs_btree_block	*block;
+
+	/*
+	 * No readahead needed if we are at the root level and the
+	 * btree root is stored in the inode.
+	 */
+	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+	    (lev == cur->bc_nlevels - 1))
+		return 0;
+
+	if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
+		return 0;
 
-	ASSERT(cur->bc_bufs[lev] != NULL);
 	cur->bc_ra[lev] |= lr;
-	switch (cur->bc_btnum) {
-	case XFS_BTNUM_BNO:
-	case XFS_BTNUM_CNT:
-		a = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]);
-		if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(a->bb_leftsib) != NULLAGBLOCK) {
-			xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
-				be32_to_cpu(a->bb_leftsib), 1);
-			rval++;
-		}
-		if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(a->bb_rightsib) != NULLAGBLOCK) {
-			xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
-				be32_to_cpu(a->bb_rightsib), 1);
-			rval++;
-		}
-		break;
-	case XFS_BTNUM_BMAP:
-		b = XFS_BUF_TO_BMBT_BLOCK(cur->bc_bufs[lev]);
-		if ((lr & XFS_BTCUR_LEFTRA) && be64_to_cpu(b->bb_leftsib) != NULLDFSBNO) {
-			xfs_btree_reada_bufl(cur->bc_mp, be64_to_cpu(b->bb_leftsib), 1);
-			rval++;
-		}
-		if ((lr & XFS_BTCUR_RIGHTRA) && be64_to_cpu(b->bb_rightsib) != NULLDFSBNO) {
-			xfs_btree_reada_bufl(cur->bc_mp, be64_to_cpu(b->bb_rightsib), 1);
-			rval++;
-		}
-		break;
-	case XFS_BTNUM_INO:
-		i = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]);
-		if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(i->bb_leftsib) != NULLAGBLOCK) {
-			xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
-				be32_to_cpu(i->bb_leftsib), 1);
-			rval++;
-		}
-		if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(i->bb_rightsib) != NULLAGBLOCK) {
-			xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
-				be32_to_cpu(i->bb_rightsib), 1);
-			rval++;
-		}
-		break;
-	default:
-		ASSERT(0);
-	}
-	return rval;
+	block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
+
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		return xfs_btree_readahead_lblock(cur, lr, block);
+	return xfs_btree_readahead_sblock(cur, lr, block);
 }
 
 /*
@@ -889,7 +813,7 @@ xfs_btree_setbuf(
 	int			lev,	/* level in btree */
 	xfs_buf_t		*bp)	/* new buffer to set */
 {
-	xfs_btree_block_t	*b;	/* btree block */
+	struct xfs_btree_block	*b;	/* btree block */
 	xfs_buf_t		*obp;	/* old buffer pointer */
 
 	obp = cur->bc_bufs[lev];
@@ -900,7 +824,7 @@ xfs_btree_setbuf(
 	if (!bp)
 		return;
 	b = XFS_BUF_TO_BLOCK(bp);
-	if (XFS_BTREE_LONG_PTRS(cur->bc_btnum)) {
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
 		if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO)
 			cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
 		if (be64_to_cpu(b->bb_u.l.bb_rightsib) == NULLDFSBNO)
@@ -912,3 +836,2855 @@ xfs_btree_setbuf(
 			cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
 	}
 }
839
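
Both xfs_btree_readahead() and xfs_btree_setbuf() above keep a small per-level bitmask of readahead already issued, and the test (ra | lr) == ra is true exactly when every requested bit is already set, so duplicate requests are filtered out cheaply. A tiny stand-alone model of that idiom (sketch only; the names and bit values are invented):

/*
 * Illustrative sketch only: per-level readahead bookkeeping.
 * "(*ra | lr) == *ra" holds when nothing new is being requested.
 */
#include <stdio.h>

#define LEFTRA	0x1
#define RIGHTRA	0x2

static int issue_readahead(unsigned char *ra, int lr)
{
	if ((*ra | lr) == *ra)
		return 0;	/* nothing new requested at this level */
	*ra |= lr;
	return 1;		/* caller would start the actual I/O here */
}

int main(void)
{
	unsigned char ra = 0;

	printf("%d\n", issue_readahead(&ra, RIGHTRA));		/* 1: new */
	printf("%d\n", issue_readahead(&ra, RIGHTRA));		/* 0: already done */
	printf("%d\n", issue_readahead(&ra, LEFTRA | RIGHTRA));	/* 1: left is new */
	return 0;
}
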
840STATIC int
841xfs_btree_ptr_is_null(
842 struct xfs_btree_cur *cur,
843 union xfs_btree_ptr *ptr)
844{
845 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
846 return be64_to_cpu(ptr->l) == NULLFSBLOCK;
847 else
848 return be32_to_cpu(ptr->s) == NULLAGBLOCK;
849}
850
851STATIC void
852xfs_btree_set_ptr_null(
853 struct xfs_btree_cur *cur,
854 union xfs_btree_ptr *ptr)
855{
856 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
857 ptr->l = cpu_to_be64(NULLFSBLOCK);
858 else
859 ptr->s = cpu_to_be32(NULLAGBLOCK);
860}
861
862/*
863 * Get/set/init sibling pointers
864 */
865STATIC void
866xfs_btree_get_sibling(
867 struct xfs_btree_cur *cur,
868 struct xfs_btree_block *block,
869 union xfs_btree_ptr *ptr,
870 int lr)
871{
872 ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
873
874 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
875 if (lr == XFS_BB_RIGHTSIB)
876 ptr->l = block->bb_u.l.bb_rightsib;
877 else
878 ptr->l = block->bb_u.l.bb_leftsib;
879 } else {
880 if (lr == XFS_BB_RIGHTSIB)
881 ptr->s = block->bb_u.s.bb_rightsib;
882 else
883 ptr->s = block->bb_u.s.bb_leftsib;
884 }
885}
886
887STATIC void
888xfs_btree_set_sibling(
889 struct xfs_btree_cur *cur,
890 struct xfs_btree_block *block,
891 union xfs_btree_ptr *ptr,
892 int lr)
893{
894 ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
895
896 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
897 if (lr == XFS_BB_RIGHTSIB)
898 block->bb_u.l.bb_rightsib = ptr->l;
899 else
900 block->bb_u.l.bb_leftsib = ptr->l;
901 } else {
902 if (lr == XFS_BB_RIGHTSIB)
903 block->bb_u.s.bb_rightsib = ptr->s;
904 else
905 block->bb_u.s.bb_leftsib = ptr->s;
906 }
907}
908
909STATIC void
910xfs_btree_init_block(
911 struct xfs_btree_cur *cur,
912 int level,
913 int numrecs,
914 struct xfs_btree_block *new) /* new block */
915{
916 new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
917 new->bb_level = cpu_to_be16(level);
918 new->bb_numrecs = cpu_to_be16(numrecs);
919
920 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
921 new->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
922 new->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
923 } else {
924 new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
925 new->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
926 }
927}
928
929/*
930 * Return true if ptr is the last record in the btree and
931 * we need to track updates to this record. The decision
932 * will be further refined in the update_lastrec method.
933 */
934STATIC int
935xfs_btree_is_lastrec(
936 struct xfs_btree_cur *cur,
937 struct xfs_btree_block *block,
938 int level)
939{
940 union xfs_btree_ptr ptr;
941
942 if (level > 0)
943 return 0;
944 if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
945 return 0;
946
947 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
948 if (!xfs_btree_ptr_is_null(cur, &ptr))
949 return 0;
950 return 1;
951}
952
953STATIC void
954xfs_btree_buf_to_ptr(
955 struct xfs_btree_cur *cur,
956 struct xfs_buf *bp,
957 union xfs_btree_ptr *ptr)
958{
959 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
960 ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
961 XFS_BUF_ADDR(bp)));
962 else {
963 ptr->s = cpu_to_be32(XFS_DADDR_TO_AGBNO(cur->bc_mp,
964 XFS_BUF_ADDR(bp)));
965 }
966}
967
968STATIC xfs_daddr_t
969xfs_btree_ptr_to_daddr(
970 struct xfs_btree_cur *cur,
971 union xfs_btree_ptr *ptr)
972{
973 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
974 ASSERT(be64_to_cpu(ptr->l) != NULLFSBLOCK);
975
976 return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
977 } else {
978 ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
979 ASSERT(be32_to_cpu(ptr->s) != NULLAGBLOCK);
980
981 return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
982 be32_to_cpu(ptr->s));
983 }
984}
985
986STATIC void
987xfs_btree_set_refs(
988 struct xfs_btree_cur *cur,
989 struct xfs_buf *bp)
990{
991 switch (cur->bc_btnum) {
992 case XFS_BTNUM_BNO:
993 case XFS_BTNUM_CNT:
994 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
995 break;
996 case XFS_BTNUM_INO:
997 XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF);
998 break;
999 case XFS_BTNUM_BMAP:
1000 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF);
1001 break;
1002 default:
1003 ASSERT(0);
1004 }
1005}
1006
1007STATIC int
1008xfs_btree_get_buf_block(
1009 struct xfs_btree_cur *cur,
1010 union xfs_btree_ptr *ptr,
1011 int flags,
1012 struct xfs_btree_block **block,
1013 struct xfs_buf **bpp)
1014{
1015 struct xfs_mount *mp = cur->bc_mp;
1016 xfs_daddr_t d;
1017
1018 /* need to sort out how callers deal with failures first */
1019 ASSERT(!(flags & XFS_BUF_TRYLOCK));
1020
1021 d = xfs_btree_ptr_to_daddr(cur, ptr);
1022 *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
1023 mp->m_bsize, flags);
1024
1025 ASSERT(*bpp);
1026 ASSERT(!XFS_BUF_GETERROR(*bpp));
1027
1028 *block = XFS_BUF_TO_BLOCK(*bpp);
1029 return 0;
1030}
1031
1032/*
1033 * Read in the buffer at the given ptr and return the buffer and
1034 * the block pointer within the buffer.
1035 */
1036STATIC int
1037xfs_btree_read_buf_block(
1038 struct xfs_btree_cur *cur,
1039 union xfs_btree_ptr *ptr,
1040 int level,
1041 int flags,
1042 struct xfs_btree_block **block,
1043 struct xfs_buf **bpp)
1044{
1045 struct xfs_mount *mp = cur->bc_mp;
1046 xfs_daddr_t d;
1047 int error;
1048
1049 /* need to sort out how callers deal with failures first */
1050 ASSERT(!(flags & XFS_BUF_TRYLOCK));
1051
1052 d = xfs_btree_ptr_to_daddr(cur, ptr);
1053 error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1054 mp->m_bsize, flags, bpp);
1055 if (error)
1056 return error;
1057
1058 ASSERT(*bpp != NULL);
1059 ASSERT(!XFS_BUF_GETERROR(*bpp));
1060
1061 xfs_btree_set_refs(cur, *bpp);
1062 *block = XFS_BUF_TO_BLOCK(*bpp);
1063
1064 error = xfs_btree_check_block(cur, *block, level, *bpp);
1065 if (error)
1066 xfs_trans_brelse(cur->bc_tp, *bpp);
1067 return error;
1068}
1069
1070/*
1071 * Copy keys from one btree block to another.
1072 */
1073STATIC void
1074xfs_btree_copy_keys(
1075 struct xfs_btree_cur *cur,
1076 union xfs_btree_key *dst_key,
1077 union xfs_btree_key *src_key,
1078 int numkeys)
1079{
1080 ASSERT(numkeys >= 0);
1081 memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1082}
1083
1084/*
1085 * Copy records from one btree block to another.
1086 */
1087STATIC void
1088xfs_btree_copy_recs(
1089 struct xfs_btree_cur *cur,
1090 union xfs_btree_rec *dst_rec,
1091 union xfs_btree_rec *src_rec,
1092 int numrecs)
1093{
1094 ASSERT(numrecs >= 0);
1095 memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1096}
1097
1098/*
1099 * Copy block pointers from one btree block to another.
1100 */
1101STATIC void
1102xfs_btree_copy_ptrs(
1103 struct xfs_btree_cur *cur,
1104 union xfs_btree_ptr *dst_ptr,
1105 union xfs_btree_ptr *src_ptr,
1106 int numptrs)
1107{
1108 ASSERT(numptrs >= 0);
1109 memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
1110}
1111
1112/*
1113 * Shift keys one index left/right inside a single btree block.
1114 */
1115STATIC void
1116xfs_btree_shift_keys(
1117 struct xfs_btree_cur *cur,
1118 union xfs_btree_key *key,
1119 int dir,
1120 int numkeys)
1121{
1122 char *dst_key;
1123
1124 ASSERT(numkeys >= 0);
1125 ASSERT(dir == 1 || dir == -1);
1126
1127 dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1128 memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1129}
1130
1131/*
1132 * Shift records one index left/right inside a single btree block.
1133 */
1134STATIC void
1135xfs_btree_shift_recs(
1136 struct xfs_btree_cur *cur,
1137 union xfs_btree_rec *rec,
1138 int dir,
1139 int numrecs)
1140{
1141 char *dst_rec;
1142
1143 ASSERT(numrecs >= 0);
1144 ASSERT(dir == 1 || dir == -1);
1145
1146 dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1147 memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1148}
1149
1150/*
1151 * Shift block pointers one index left/right inside a single btree block.
1152 */
1153STATIC void
1154xfs_btree_shift_ptrs(
1155 struct xfs_btree_cur *cur,
1156 union xfs_btree_ptr *ptr,
1157 int dir,
1158 int numptrs)
1159{
1160 char *dst_ptr;
1161
1162 ASSERT(numptrs >= 0);
1163 ASSERT(dir == 1 || dir == -1);
1164
1165 dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
1166 memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
1167}
1168
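
The three shift helpers above all reduce to one overlapping memmove by plus or minus one slot width. A user-space sketch of the same operation on a flat record array (sizes and contents are invented; memmove, not memcpy, is required because source and destination overlap):

/*
 * Illustrative sketch only: shift fixed-size records one slot left or
 * right inside a block. dir is +1 to open a hole, -1 to close one.
 */
#include <stdio.h>
#include <string.h>

#define REC_LEN	4

static void shift_recs(char *rec, int dir, int numrecs)
{
	memmove(rec + dir * REC_LEN, rec, (size_t)numrecs * REC_LEN);
}

int main(void)
{
	/* three 4-byte records: "aaa", "bbb", "ccc" (NUL-padded) */
	char block[4 * REC_LEN] = "aaa\0bbb\0ccc\0";

	/* shift the first two records right by one slot to make room */
	shift_recs(block, 1, 2);
	memcpy(block, "xxx", 4);	/* insert the new first record */

	printf("%s %s %s\n", block, block + 4, block + 8);	/* xxx aaa bbb */
	return 0;
}
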
1169/*
1170 * Log key values from the btree block.
1171 */
1172STATIC void
1173xfs_btree_log_keys(
1174 struct xfs_btree_cur *cur,
1175 struct xfs_buf *bp,
1176 int first,
1177 int last)
1178{
1179 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1180 XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1181
1182 if (bp) {
1183 xfs_trans_log_buf(cur->bc_tp, bp,
1184 xfs_btree_key_offset(cur, first),
1185 xfs_btree_key_offset(cur, last + 1) - 1);
1186 } else {
1187 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1188 xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1189 }
1190
1191 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1192}
1193
1194/*
1195 * Log record values from the btree block.
1196 */
1197void
1198xfs_btree_log_recs(
1199 struct xfs_btree_cur *cur,
1200 struct xfs_buf *bp,
1201 int first,
1202 int last)
1203{
1204 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1205 XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1206
1207 xfs_trans_log_buf(cur->bc_tp, bp,
1208 xfs_btree_rec_offset(cur, first),
1209 xfs_btree_rec_offset(cur, last + 1) - 1);
1210
1211 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1212}
1213
1214/*
1215 * Log block pointer fields from a btree block (nonleaf).
1216 */
1217STATIC void
1218xfs_btree_log_ptrs(
1219 struct xfs_btree_cur *cur, /* btree cursor */
1220 struct xfs_buf *bp, /* buffer containing btree block */
1221 int first, /* index of first pointer to log */
1222 int last) /* index of last pointer to log */
1223{
1224 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1225 XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1226
1227 if (bp) {
1228 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
1229 int level = xfs_btree_get_level(block);
1230
1231 xfs_trans_log_buf(cur->bc_tp, bp,
1232 xfs_btree_ptr_offset(cur, first, level),
1233 xfs_btree_ptr_offset(cur, last + 1, level) - 1);
1234 } else {
1235 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1236 xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1237 }
1238
1239 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1240}
1241
1242/*
1243 * Log fields from a btree block header.
1244 */
1245void
1246xfs_btree_log_block(
1247 struct xfs_btree_cur *cur, /* btree cursor */
1248 struct xfs_buf *bp, /* buffer containing btree block */
1249 int fields) /* mask of fields: XFS_BB_... */
1250{
1251 int first; /* first byte offset logged */
1252 int last; /* last byte offset logged */
1253 static const short soffsets[] = { /* table of offsets (short) */
1254 offsetof(struct xfs_btree_block, bb_magic),
1255 offsetof(struct xfs_btree_block, bb_level),
1256 offsetof(struct xfs_btree_block, bb_numrecs),
1257 offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
1258 offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
1259 XFS_BTREE_SBLOCK_LEN
1260 };
1261 static const short loffsets[] = { /* table of offsets (long) */
1262 offsetof(struct xfs_btree_block, bb_magic),
1263 offsetof(struct xfs_btree_block, bb_level),
1264 offsetof(struct xfs_btree_block, bb_numrecs),
1265 offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
1266 offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
1267 XFS_BTREE_LBLOCK_LEN
1268 };
1269
1270 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1271 XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
1272
1273 if (bp) {
1274 xfs_btree_offsets(fields,
1275 (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
1276 loffsets : soffsets,
1277 XFS_BB_NUM_BITS, &first, &last);
1278 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
1279 } else {
1280 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1281 xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1282 }
1283
1284 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1285}
1286
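
xfs_btree_log_block() above maps a bitmask of dirtied header fields to one contiguous byte range using an offsets table with a final end-of-struct entry, which is what the soffsets/loffsets arrays feed into xfs_btree_offsets() for. A stand-alone model of that table technique (sketch only; toy struct, and a nonempty mask is assumed):

/*
 * Illustrative sketch only: with one offsets[] entry per field plus an
 * end marker, the byte range for any mask of fields (bits numbered in
 * struct layout order) is [offsets[first_bit], offsets[last_bit+1] - 1].
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct toy_hdr {
	uint32_t	magic;		/* bit 0 */
	uint16_t	level;		/* bit 1 */
	uint16_t	numrecs;	/* bit 2 */
	uint32_t	leftsib;	/* bit 3 */
	uint32_t	rightsib;	/* bit 4 */
};

static const size_t offsets[] = {
	offsetof(struct toy_hdr, magic),
	offsetof(struct toy_hdr, level),
	offsetof(struct toy_hdr, numrecs),
	offsetof(struct toy_hdr, leftsib),
	offsetof(struct toy_hdr, rightsib),
	sizeof(struct toy_hdr),		/* end marker */
};

static void mask_to_range(unsigned mask, size_t *first, size_t *last)
{
	int i, lo = -1, hi = -1;

	for (i = 0; i < 5; i++) {	/* mask must be nonempty */
		if (mask & (1u << i)) {
			if (lo < 0)
				lo = i;
			hi = i;
		}
	}
	*first = offsets[lo];
	*last = offsets[hi + 1] - 1;
}

int main(void)
{
	size_t first, last;

	mask_to_range(0x18, &first, &last);	/* leftsib | rightsib */
	printf("log bytes %zu..%zu\n", first, last);	/* 8..15 */
	return 0;
}
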
1287/*
1288 * Increment cursor by one record at the level.
1289 * For nonzero levels the leaf-ward information is untouched.
1290 */
1291int /* error */
1292xfs_btree_increment(
1293 struct xfs_btree_cur *cur,
1294 int level,
1295 int *stat) /* success/failure */
1296{
1297 struct xfs_btree_block *block;
1298 union xfs_btree_ptr ptr;
1299 struct xfs_buf *bp;
1300 int error; /* error return value */
1301 int lev;
1302
1303 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1304 XFS_BTREE_TRACE_ARGI(cur, level);
1305
1306 ASSERT(level < cur->bc_nlevels);
1307
1308 /* Read-ahead to the right at this level. */
1309 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1310
1311 /* Get a pointer to the btree block. */
1312 block = xfs_btree_get_block(cur, level, &bp);
1313
1314#ifdef DEBUG
1315 error = xfs_btree_check_block(cur, block, level, bp);
1316 if (error)
1317 goto error0;
1318#endif
1319
1320 /* We're done if we remain in the block after the increment. */
1321 if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
1322 goto out1;
1323
1324 /* Fail if we just went off the right edge of the tree. */
1325 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1326 if (xfs_btree_ptr_is_null(cur, &ptr))
1327 goto out0;
1328
1329 XFS_BTREE_STATS_INC(cur, increment);
1330
1331 /*
1332 * March up the tree incrementing pointers.
1333 * Stop when we don't go off the right edge of a block.
1334 */
1335 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1336 block = xfs_btree_get_block(cur, lev, &bp);
1337
1338#ifdef DEBUG
1339 error = xfs_btree_check_block(cur, block, lev, bp);
1340 if (error)
1341 goto error0;
1342#endif
1343
1344 if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
1345 break;
1346
1347 /* Read-ahead the right block for the next loop. */
1348 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1349 }
1350
1351 /*
1352 * If we went off the root then we are either seriously
1353 * confused or have the tree root in an inode.
1354 */
1355 if (lev == cur->bc_nlevels) {
1356 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1357 goto out0;
1358 ASSERT(0);
1359 error = EFSCORRUPTED;
1360 goto error0;
1361 }
1362 ASSERT(lev < cur->bc_nlevels);
1363
1364 /*
1365 * Now walk back down the tree, fixing up the cursor's buffer
1366 * pointers and key numbers.
1367 */
1368 for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1369 union xfs_btree_ptr *ptrp;
1370
1371 ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1372 error = xfs_btree_read_buf_block(cur, ptrp, --lev,
1373 0, &block, &bp);
1374 if (error)
1375 goto error0;
1376
1377 xfs_btree_setbuf(cur, lev, bp);
1378 cur->bc_ptrs[lev] = 1;
1379 }
1380out1:
1381 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1382 *stat = 1;
1383 return 0;
1384
1385out0:
1386 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1387 *stat = 0;
1388 return 0;
1389
1390error0:
1391 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1392 return error;
1393}
1394
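
Stripped of buffers and error handling, the increment above behaves like a mixed-radix counter: bump the leaf index; if that falls off the right edge, march up until a level with room remains; then walk back down pointing each lower level at its first entry. A sketch under that simplification (uniform entry counts, no real child-block reads; all names invented):

/*
 * Illustrative sketch only: the up-then-down walk of a btree cursor
 * increment. ptrs[lev] is the 1-based entry index at each level and
 * nrecs[lev] the entry count of the current block at that level.
 */
#include <stdio.h>

#define NLEVELS	3

static int increment(int *ptrs, const int *nrecs)
{
	int lev;

	/* done if we remain inside the leaf block */
	if (++ptrs[0] <= nrecs[0])
		return 1;

	/* march up until we no longer fall off the right edge */
	for (lev = 1; lev < NLEVELS; lev++)
		if (++ptrs[lev] <= nrecs[lev])
			break;
	if (lev == NLEVELS)
		return 0;	/* off the right edge of the whole tree */

	/* walk back down, pointing at the first entry of each child */
	for (lev--; lev >= 0; lev--)
		ptrs[lev] = 1;
	return 1;
}

int main(void)
{
	int nrecs[NLEVELS] = { 2, 2, 2 };	/* 2 entries per block */
	int ptrs[NLEVELS] = { 2, 1, 1 };	/* leaf at its last entry */
	int ok = increment(ptrs, nrecs);

	/* prints: ok=1 ptrs={1,2,1} -- stepped into the next leaf */
	printf("ok=%d ptrs={%d,%d,%d}\n", ok, ptrs[0], ptrs[1], ptrs[2]);
	return 0;
}
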
1395/*
1396 * Decrement cursor by one record at the level.
1397 * For nonzero levels the leaf-ward information is untouched.
1398 */
1399int /* error */
1400xfs_btree_decrement(
1401 struct xfs_btree_cur *cur,
1402 int level,
1403 int *stat) /* success/failure */
1404{
1405 struct xfs_btree_block *block;
1406 xfs_buf_t *bp;
1407 int error; /* error return value */
1408 int lev;
1409 union xfs_btree_ptr ptr;
1410
1411 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1412 XFS_BTREE_TRACE_ARGI(cur, level);
1413
1414 ASSERT(level < cur->bc_nlevels);
1415
1416 /* Read-ahead to the left at this level. */
1417 xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1418
1419 /* We're done if we remain in the block after the decrement. */
1420 if (--cur->bc_ptrs[level] > 0)
1421 goto out1;
1422
1423 /* Get a pointer to the btree block. */
1424 block = xfs_btree_get_block(cur, level, &bp);
1425
1426#ifdef DEBUG
1427 error = xfs_btree_check_block(cur, block, level, bp);
1428 if (error)
1429 goto error0;
1430#endif
1431
1432 /* Fail if we just went off the left edge of the tree. */
1433 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
1434 if (xfs_btree_ptr_is_null(cur, &ptr))
1435 goto out0;
1436
1437 XFS_BTREE_STATS_INC(cur, decrement);
1438
1439 /*
1440 * March up the tree decrementing pointers.
1441 * Stop when we don't go off the left edge of a block.
1442 */
1443 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1444 if (--cur->bc_ptrs[lev] > 0)
1445 break;
1446 /* Read-ahead the left block for the next loop. */
1447 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1448 }
1449
1450 /*
1451 * If we went off the root then we are either seriously
1452 * confused or have the tree root in an inode.
1453 */
1454 if (lev == cur->bc_nlevels) {
1455 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1456 goto out0;
1457 ASSERT(0);
1458 error = EFSCORRUPTED;
1459 goto error0;
1460 }
1461 ASSERT(lev < cur->bc_nlevels);
1462
1463 /*
1464 * Now walk back down the tree, fixing up the cursor's buffer
1465 * pointers and key numbers.
1466 */
1467 for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1468 union xfs_btree_ptr *ptrp;
1469
1470 ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1471 error = xfs_btree_read_buf_block(cur, ptrp, --lev,
1472 0, &block, &bp);
1473 if (error)
1474 goto error0;
1475 xfs_btree_setbuf(cur, lev, bp);
1476 cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
1477 }
1478out1:
1479 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1480 *stat = 1;
1481 return 0;
1482
1483out0:
1484 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1485 *stat = 0;
1486 return 0;
1487
1488error0:
1489 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1490 return error;
1491}
1492
1493STATIC int
1494xfs_btree_lookup_get_block(
1495 struct xfs_btree_cur *cur, /* btree cursor */
1496 int level, /* level in the btree */
1497 union xfs_btree_ptr *pp, /* ptr to btree block */
1498 struct xfs_btree_block **blkp) /* return btree block */
1499{
1500 struct xfs_buf *bp; /* buffer pointer for btree block */
1501 int error = 0;
1502
1503 /* special case the root block if in an inode */
1504 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1505 (level == cur->bc_nlevels - 1)) {
1506 *blkp = xfs_btree_get_iroot(cur);
1507 return 0;
1508 }
1509
1510 /*
1511 * If the old buffer at this level is for the disk address we are
1512 * looking for, re-use it.
1513 *
1514 * Otherwise throw it away and get a new one.
1515 */
1516 bp = cur->bc_bufs[level];
1517 if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
1518 *blkp = XFS_BUF_TO_BLOCK(bp);
1519 return 0;
1520 }
1521
1522 error = xfs_btree_read_buf_block(cur, pp, level, 0, blkp, &bp);
1523 if (error)
1524 return error;
1525
1526 xfs_btree_setbuf(cur, level, bp);
1527 return 0;
1528}
1529
1530/*
1531 * Get current search key. For level 0 we don't actually have a key
1532 * structure so we make one up from the record. For all other levels
1533 * we just return the right key.
1534 */
1535STATIC union xfs_btree_key *
1536xfs_lookup_get_search_key(
1537 struct xfs_btree_cur *cur,
1538 int level,
1539 int keyno,
1540 struct xfs_btree_block *block,
1541 union xfs_btree_key *kp)
1542{
1543 if (level == 0) {
1544 cur->bc_ops->init_key_from_rec(kp,
1545 xfs_btree_rec_addr(cur, keyno, block));
1546 return kp;
1547 }
1548
1549 return xfs_btree_key_addr(cur, keyno, block);
1550}
1551
1552/*
1553 * Lookup the record. The cursor is made to point to it, based on dir.
1554 * Return 0 if can't find any such record, 1 for success.
1555 */
1556int /* error */
1557xfs_btree_lookup(
1558 struct xfs_btree_cur *cur, /* btree cursor */
1559 xfs_lookup_t dir, /* <=, ==, or >= */
1560 int *stat) /* success/failure */
1561{
1562 struct xfs_btree_block *block; /* current btree block */
1563 __int64_t diff; /* difference for the current key */
1564 int error; /* error return value */
1565 int keyno; /* current key number */
1566 int level; /* level in the btree */
1567 union xfs_btree_ptr *pp; /* ptr to btree block */
1568 union xfs_btree_ptr ptr; /* ptr to btree block */
1569
1570 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1571 XFS_BTREE_TRACE_ARGI(cur, dir);
1572
1573 XFS_BTREE_STATS_INC(cur, lookup);
1574
1575 block = NULL;
1576 keyno = 0;
1577
1578 /* initialise start pointer from cursor */
1579 cur->bc_ops->init_ptr_from_cur(cur, &ptr);
1580 pp = &ptr;
1581
1582 /*
1583 * Iterate over each level in the btree, starting at the root.
1584 * For each level above the leaves, find the key we need, based
1585 * on the lookup record, then follow the corresponding block
1586 * pointer down to the next level.
1587 */
1588 for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1589 /* Get the block we need to do the lookup on. */
1590 error = xfs_btree_lookup_get_block(cur, level, pp, &block);
1591 if (error)
1592 goto error0;
1593
1594 if (diff == 0) {
1595 /*
1596 * If we already had a key match at a higher level, we
1597 * know we need to use the first entry in this block.
1598 */
1599 keyno = 1;
1600 } else {
1601 /* Otherwise search this block. Do a binary search. */
1602
1603 int high; /* high entry number */
1604 int low; /* low entry number */
1605
1606 /* Set low and high entry numbers, 1-based. */
1607 low = 1;
1608 high = xfs_btree_get_numrecs(block);
1609 if (!high) {
1610 /* Block is empty, must be an empty leaf. */
1611 ASSERT(level == 0 && cur->bc_nlevels == 1);
1612
1613 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1614 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1615 *stat = 0;
1616 return 0;
1617 }
1618
1619 /* Binary search the block. */
1620 while (low <= high) {
1621 union xfs_btree_key key;
1622 union xfs_btree_key *kp;
1623
1624 XFS_BTREE_STATS_INC(cur, compare);
1625
1626 /* keyno is average of low and high. */
1627 keyno = (low + high) >> 1;
1628
1629 /* Get current search key */
1630 kp = xfs_lookup_get_search_key(cur, level,
1631 keyno, block, &key);
1632
1633 /*
1634 * Compute difference to get next direction:
1635 * - less than, move right
1636 * - greater than, move left
1637 * - equal, we're done
1638 */
1639 diff = cur->bc_ops->key_diff(cur, kp);
1640 if (diff < 0)
1641 low = keyno + 1;
1642 else if (diff > 0)
1643 high = keyno - 1;
1644 else
1645 break;
1646 }
1647 }
1648
1649 /*
1650 * If there are more levels, set up for the next level
1651 * by getting the block number and filling in the cursor.
1652 */
1653 if (level > 0) {
1654 /*
1655 * If we moved left, need the previous key number,
1656 * unless there isn't one.
1657 */
1658 if (diff > 0 && --keyno < 1)
1659 keyno = 1;
1660 pp = xfs_btree_ptr_addr(cur, keyno, block);
1661
1662#ifdef DEBUG
1663 error = xfs_btree_check_ptr(cur, pp, 0, level);
1664 if (error)
1665 goto error0;
1666#endif
1667 cur->bc_ptrs[level] = keyno;
1668 }
1669 }
1670
1671 /* Done with the search. See if we need to adjust the results. */
1672 if (dir != XFS_LOOKUP_LE && diff < 0) {
1673 keyno++;
1674 /*
1675 * If ge search and we went off the end of the block, but it's
1676 * not the last block, we're in the wrong block.
1677 */
1678 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1679 if (dir == XFS_LOOKUP_GE &&
1680 keyno > xfs_btree_get_numrecs(block) &&
1681 !xfs_btree_ptr_is_null(cur, &ptr)) {
1682 int i;
1683
1684 cur->bc_ptrs[0] = keyno;
1685 error = xfs_btree_increment(cur, 0, &i);
1686 if (error)
1687 goto error0;
1688 XFS_WANT_CORRUPTED_RETURN(i == 1);
1689 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1690 *stat = 1;
1691 return 0;
1692 }
1693 } else if (dir == XFS_LOOKUP_LE && diff > 0)
1694 keyno--;
1695 cur->bc_ptrs[0] = keyno;
1696
1697 /* Return if we succeeded or not. */
1698 if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
1699 *stat = 0;
1700 else if (dir != XFS_LOOKUP_EQ || diff == 0)
1701 *stat = 1;
1702 else
1703 *stat = 0;
1704 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1705 return 0;
1706
1707error0:
1708 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1709 return error;
1710}
1711
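
The core of xfs_btree_lookup() above is a 1-based binary search driven by the key_diff callback, followed by the LE/GE adjustment of keyno. A stand-alone model on a sorted int array follows; it is a sketch only and omits the cross-block fix-up where a GE lookup steps into the right sibling:

/*
 * Illustrative sketch only: 1-based binary search with diff semantics.
 * diff < 0 means the probed key is below the search key (move right),
 * diff > 0 above it (move left); keyno is then adjusted for LE/GE.
 */
#include <stdio.h>

enum lookup_dir { LOOKUP_LE, LOOKUP_EQ, LOOKUP_GE };

static int lookup(const int *keys, int nrecs, int want,
		  enum lookup_dir dir, int *keyno)
{
	int low = 1, high = nrecs, kn = 0;
	long diff = 1;

	while (low <= high) {
		kn = (low + high) >> 1;
		diff = (long)keys[kn - 1] - want;	/* key_diff() stand-in */
		if (diff < 0)
			low = kn + 1;
		else if (diff > 0)
			high = kn - 1;
		else
			break;
	}
	if (dir != LOOKUP_LE && diff < 0)
		kn++;		/* GE/EQ: step past the smaller key */
	else if (dir == LOOKUP_LE && diff > 0)
		kn--;		/* LE: step back before the larger key */
	*keyno = kn;

	if (kn == 0 || kn > nrecs)
		return 0;			/* ran off an end */
	return dir != LOOKUP_EQ || diff == 0;
}

int main(void)
{
	int keys[] = { 10, 20, 30, 40 };
	int kn;

	printf("GE 25 -> found=%d keyno=%d\n",
	       lookup(keys, 4, 25, LOOKUP_GE, &kn), kn);	/* keyno=3 (30) */
	printf("LE 25 -> found=%d keyno=%d\n",
	       lookup(keys, 4, 25, LOOKUP_LE, &kn), kn);	/* keyno=2 (20) */
	return 0;
}
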
1712/*
1713 * Update keys at all levels from here to the root along the cursor's path.
1714 */
1715STATIC int
1716xfs_btree_updkey(
1717 struct xfs_btree_cur *cur,
1718 union xfs_btree_key *keyp,
1719 int level)
1720{
1721 struct xfs_btree_block *block;
1722 struct xfs_buf *bp;
1723 union xfs_btree_key *kp;
1724 int ptr;
1725
1726 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1727 XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
1728
1729 ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1);
1730
1731 /*
1732 * Go up the tree from this level toward the root.
1733 * At each level, update the key value to the value input.
1734 * Stop when we reach a level where the cursor isn't pointing
1735 * at the first entry in the block.
1736 */
1737 for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1738#ifdef DEBUG
1739 int error;
1740#endif
1741 block = xfs_btree_get_block(cur, level, &bp);
1742#ifdef DEBUG
1743 error = xfs_btree_check_block(cur, block, level, bp);
1744 if (error) {
1745 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1746 return error;
1747 }
1748#endif
1749 ptr = cur->bc_ptrs[level];
1750 kp = xfs_btree_key_addr(cur, ptr, block);
1751 xfs_btree_copy_keys(cur, kp, keyp, 1);
1752 xfs_btree_log_keys(cur, bp, ptr, ptr);
1753 }
1754
1755 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1756 return 0;
1757}
1758
1759/*
1760 * Update the record referred to by cur to the value in the
1761 * given record. This either works (return 0) or gets an
1762 * EFSCORRUPTED error.
1763 */
1764int
1765xfs_btree_update(
1766 struct xfs_btree_cur *cur,
1767 union xfs_btree_rec *rec)
1768{
1769 struct xfs_btree_block *block;
1770 struct xfs_buf *bp;
1771 int error;
1772 int ptr;
1773 union xfs_btree_rec *rp;
1774
1775 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1776 XFS_BTREE_TRACE_ARGR(cur, rec);
1777
1778 /* Pick up the current block. */
1779 block = xfs_btree_get_block(cur, 0, &bp);
1780
1781#ifdef DEBUG
1782 error = xfs_btree_check_block(cur, block, 0, bp);
1783 if (error)
1784 goto error0;
1785#endif
1786 /* Get the address of the rec to be updated. */
1787 ptr = cur->bc_ptrs[0];
1788 rp = xfs_btree_rec_addr(cur, ptr, block);
1789
1790 /* Fill in the new contents and log them. */
1791 xfs_btree_copy_recs(cur, rp, rec, 1);
1792 xfs_btree_log_recs(cur, bp, ptr, ptr);
1793
1794 /*
1795 * If we are tracking the last record in the tree and
1796 * we are at the far right edge of the tree, update it.
1797 */
1798 if (xfs_btree_is_lastrec(cur, block, 0)) {
1799 cur->bc_ops->update_lastrec(cur, block, rec,
1800 ptr, LASTREC_UPDATE);
1801 }
1802
1803 /* Updating first rec in leaf. Pass new key value up to our parent. */
1804 if (ptr == 1) {
1805 union xfs_btree_key key;
1806
1807 cur->bc_ops->init_key_from_rec(&key, rec);
1808 error = xfs_btree_updkey(cur, &key, 1);
1809 if (error)
1810 goto error0;
1811 }
1812
1813 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1814 return 0;
1815
1816error0:
1817 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1818 return error;
1819}
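/*
 * Editor's note -- usage sketch, not part of the original source:
 *
 *	error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &stat);
 *	if (!error && stat)
 *		error = xfs_btree_update(cur, &rec);
 *
 * The ptr == 1 case above matters because rewriting the first record of a
 * leaf changes the key the parent uses to describe this block.
 */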
1820
1821/*
1822 * Move 1 record left from cur/level if possible.
1823 * Update cur to reflect the new path.
1824 */
1825STATIC int /* error */
1826xfs_btree_lshift(
1827 struct xfs_btree_cur *cur,
1828 int level,
1829 int *stat) /* success/failure */
1830{
1831 union xfs_btree_key key; /* btree key */
1832 struct xfs_buf *lbp; /* left buffer pointer */
1833 struct xfs_btree_block *left; /* left btree block */
1834 int lrecs; /* left record count */
1835 struct xfs_buf *rbp; /* right buffer pointer */
1836 struct xfs_btree_block *right; /* right btree block */
1837 int rrecs; /* right record count */
1838 union xfs_btree_ptr lptr; /* left btree pointer */
1839 union xfs_btree_key *rkp = NULL; /* right btree key */
1840 union xfs_btree_ptr *rpp = NULL; /* right address pointer */
1841 union xfs_btree_rec *rrp = NULL; /* right record pointer */
1842 int error; /* error return value */
1843
1844 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1845 XFS_BTREE_TRACE_ARGI(cur, level);
1846
1847 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1848 level == cur->bc_nlevels - 1)
1849 goto out0;
1850
1851 /* Set up variables for this block as "right". */
1852 right = xfs_btree_get_block(cur, level, &rbp);
1853
1854#ifdef DEBUG
1855 error = xfs_btree_check_block(cur, right, level, rbp);
1856 if (error)
1857 goto error0;
1858#endif
1859
1860 /* If we've got no left sibling then we can't shift an entry left. */
1861 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
1862 if (xfs_btree_ptr_is_null(cur, &lptr))
1863 goto out0;
1864
1865 /*
1866 * If the cursor entry is the one that would be moved, don't
1867 * do it... it's too complicated.
1868 */
1869 if (cur->bc_ptrs[level] <= 1)
1870 goto out0;
1871
1872 /* Set up the left neighbor as "left". */
1873 error = xfs_btree_read_buf_block(cur, &lptr, level, 0, &left, &lbp);
1874 if (error)
1875 goto error0;
1876
1877 /* If it's full, it can't take another entry. */
1878 lrecs = xfs_btree_get_numrecs(left);
1879 if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
1880 goto out0;
1881
1882 rrecs = xfs_btree_get_numrecs(right);
1883
1884 /*
1885	 * We add one entry to the left side and remove one from the right side.
1886	 * Account for it here; the changes will be written to disk and logged
1887 * later.
1888 */
1889 lrecs++;
1890 rrecs--;
1891
1892 XFS_BTREE_STATS_INC(cur, lshift);
1893 XFS_BTREE_STATS_ADD(cur, moves, 1);
1894
1895 /*
1896 * If non-leaf, copy a key and a ptr to the left block.
1897 * Log the changes to the left block.
1898 */
1899 if (level > 0) {
1900 /* It's a non-leaf. Move keys and pointers. */
1901 union xfs_btree_key *lkp; /* left btree key */
1902 union xfs_btree_ptr *lpp; /* left address pointer */
1903
1904 lkp = xfs_btree_key_addr(cur, lrecs, left);
1905 rkp = xfs_btree_key_addr(cur, 1, right);
1906
1907 lpp = xfs_btree_ptr_addr(cur, lrecs, left);
1908 rpp = xfs_btree_ptr_addr(cur, 1, right);
1909#ifdef DEBUG
1910 error = xfs_btree_check_ptr(cur, rpp, 0, level);
1911 if (error)
1912 goto error0;
1913#endif
1914 xfs_btree_copy_keys(cur, lkp, rkp, 1);
1915 xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
1916
1917 xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
1918 xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
1919
1920 ASSERT(cur->bc_ops->keys_inorder(cur,
1921 xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
1922 } else {
1923 /* It's a leaf. Move records. */
1924 union xfs_btree_rec *lrp; /* left record pointer */
1925
1926 lrp = xfs_btree_rec_addr(cur, lrecs, left);
1927 rrp = xfs_btree_rec_addr(cur, 1, right);
1928
1929 xfs_btree_copy_recs(cur, lrp, rrp, 1);
1930 xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
1931
1932 ASSERT(cur->bc_ops->recs_inorder(cur,
1933 xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
1934 }
1935
1936 xfs_btree_set_numrecs(left, lrecs);
1937 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
1938
1939 xfs_btree_set_numrecs(right, rrecs);
1940 xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
1941
1942 /*
1943 * Slide the contents of right down one entry.
1944 */
1945 XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
1946 if (level > 0) {
1947		/* It's a nonleaf; operate on keys and ptrs */
1948#ifdef DEBUG
1949 int i; /* loop index */
1950
1951 for (i = 0; i < rrecs; i++) {
1952 error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
1953 if (error)
1954 goto error0;
1955 }
1956#endif
1957 xfs_btree_shift_keys(cur,
1958 xfs_btree_key_addr(cur, 2, right),
1959 -1, rrecs);
1960 xfs_btree_shift_ptrs(cur,
1961 xfs_btree_ptr_addr(cur, 2, right),
1962 -1, rrecs);
1963
1964 xfs_btree_log_keys(cur, rbp, 1, rrecs);
1965 xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
1966 } else {
1967		/* It's a leaf; operate on records */
1968 xfs_btree_shift_recs(cur,
1969 xfs_btree_rec_addr(cur, 2, right),
1970 -1, rrecs);
1971 xfs_btree_log_recs(cur, rbp, 1, rrecs);
1972
1973 /*
1974 * If it's the first record in the block, we'll need a key
1975 * structure to pass up to the next level (updkey).
1976 */
1977 cur->bc_ops->init_key_from_rec(&key,
1978 xfs_btree_rec_addr(cur, 1, right));
1979 rkp = &key;
1980 }
1981
1982 /* Update the parent key values of right. */
1983 error = xfs_btree_updkey(cur, rkp, level + 1);
1984 if (error)
1985 goto error0;
1986
1987 /* Slide the cursor value left one. */
1988 cur->bc_ptrs[level]--;
1989
1990 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1991 *stat = 1;
1992 return 0;
1993
1994out0:
1995 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1996 *stat = 0;
1997 return 0;
1998
1999error0:
2000 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2001 return error;
2002}
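/*
 * Editor's note, not part of the original source: whichever branch ran
 * above, rkp ends up naming the new first entry of "right" -- either the
 * key now at index 1, or a key rebuilt from the first remaining record --
 * and xfs_btree_updkey() pushes it to the parent, since shifting one entry
 * out of "right" changed what that block starts with.
 */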
2003
2004/*
2005 * Move 1 record right from cur/level if possible.
2006 * Update cur to reflect the new path.
2007 */
2008STATIC int /* error */
2009xfs_btree_rshift(
2010 struct xfs_btree_cur *cur,
2011 int level,
2012 int *stat) /* success/failure */
2013{
2014 union xfs_btree_key key; /* btree key */
2015 struct xfs_buf *lbp; /* left buffer pointer */
2016 struct xfs_btree_block *left; /* left btree block */
2017 struct xfs_buf *rbp; /* right buffer pointer */
2018 struct xfs_btree_block *right; /* right btree block */
2019 struct xfs_btree_cur *tcur; /* temporary btree cursor */
2020 union xfs_btree_ptr rptr; /* right block pointer */
2021 union xfs_btree_key *rkp; /* right btree key */
2022 int rrecs; /* right record count */
2023 int lrecs; /* left record count */
2024 int error; /* error return value */
2025 int i; /* loop counter */
2026
2027 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2028 XFS_BTREE_TRACE_ARGI(cur, level);
2029
2030 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2031 (level == cur->bc_nlevels - 1))
2032 goto out0;
2033
2034 /* Set up variables for this block as "left". */
2035 left = xfs_btree_get_block(cur, level, &lbp);
2036
2037#ifdef DEBUG
2038 error = xfs_btree_check_block(cur, left, level, lbp);
2039 if (error)
2040 goto error0;
2041#endif
2042
2043 /* If we've got no right sibling then we can't shift an entry right. */
2044 xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2045 if (xfs_btree_ptr_is_null(cur, &rptr))
2046 goto out0;
2047
2048 /*
2049 * If the cursor entry is the one that would be moved, don't
2050 * do it... it's too complicated.
2051 */
2052 lrecs = xfs_btree_get_numrecs(left);
2053 if (cur->bc_ptrs[level] >= lrecs)
2054 goto out0;
2055
2056 /* Set up the right neighbor as "right". */
2057 error = xfs_btree_read_buf_block(cur, &rptr, level, 0, &right, &rbp);
2058 if (error)
2059 goto error0;
2060
2061 /* If it's full, it can't take another entry. */
2062 rrecs = xfs_btree_get_numrecs(right);
2063 if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
2064 goto out0;
2065
2066 XFS_BTREE_STATS_INC(cur, rshift);
2067 XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2068
2069 /*
2070 * Make a hole at the start of the right neighbor block, then
2071 * copy the last left block entry to the hole.
2072 */
2073 if (level > 0) {
2074		/* It's a nonleaf; make a hole in the keys and ptrs */
2075 union xfs_btree_key *lkp;
2076 union xfs_btree_ptr *lpp;
2077 union xfs_btree_ptr *rpp;
2078
2079 lkp = xfs_btree_key_addr(cur, lrecs, left);
2080 lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2081 rkp = xfs_btree_key_addr(cur, 1, right);
2082 rpp = xfs_btree_ptr_addr(cur, 1, right);
2083
2084#ifdef DEBUG
2085 for (i = rrecs - 1; i >= 0; i--) {
2086 error = xfs_btree_check_ptr(cur, rpp, i, level);
2087 if (error)
2088 goto error0;
2089 }
2090#endif
2091
2092 xfs_btree_shift_keys(cur, rkp, 1, rrecs);
2093 xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
2094
2095#ifdef DEBUG
2096 error = xfs_btree_check_ptr(cur, lpp, 0, level);
2097 if (error)
2098 goto error0;
2099#endif
2100
2101 /* Now put the new data in, and log it. */
2102 xfs_btree_copy_keys(cur, rkp, lkp, 1);
2103 xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
2104
2105 xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
2106 xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
2107
2108 ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
2109 xfs_btree_key_addr(cur, 2, right)));
2110 } else {
2111		/* It's a leaf; make a hole in the records */
2112 union xfs_btree_rec *lrp;
2113 union xfs_btree_rec *rrp;
2114
2115 lrp = xfs_btree_rec_addr(cur, lrecs, left);
2116 rrp = xfs_btree_rec_addr(cur, 1, right);
2117
2118 xfs_btree_shift_recs(cur, rrp, 1, rrecs);
2119
2120 /* Now put the new data in, and log it. */
2121 xfs_btree_copy_recs(cur, rrp, lrp, 1);
2122 xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
2123
2124 cur->bc_ops->init_key_from_rec(&key, rrp);
2125 rkp = &key;
2126
2127 ASSERT(cur->bc_ops->recs_inorder(cur, rrp,
2128 xfs_btree_rec_addr(cur, 2, right)));
2129 }
2130
2131 /*
2132 * Decrement and log left's numrecs, bump and log right's numrecs.
2133 */
2134 xfs_btree_set_numrecs(left, --lrecs);
2135 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2136
2137 xfs_btree_set_numrecs(right, ++rrecs);
2138 xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2139
2140 /*
2141 * Using a temporary cursor, update the parent key values of the
2142 * block on the right.
2143 */
2144 error = xfs_btree_dup_cursor(cur, &tcur);
2145 if (error)
2146 goto error0;
2147 i = xfs_btree_lastrec(tcur, level);
2148 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
2149
2150 error = xfs_btree_increment(tcur, level, &i);
2151 if (error)
2152 goto error1;
2153
2154 error = xfs_btree_updkey(tcur, rkp, level + 1);
2155 if (error)
2156 goto error1;
2157
2158 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2159
2160 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2161 *stat = 1;
2162 return 0;
2163
2164out0:
2165 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2166 *stat = 0;
2167 return 0;
2168
2169error0:
2170 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2171 return error;
2172
2173error1:
2174 XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
2175 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2176 return error;
2177}
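/*
 * Editor's note, not part of the original source: the temporary cursor
 * dance above exists because the block whose parent key changed is the
 * right sibling, which is not on this cursor's path. Duplicating the
 * cursor, stepping the copy one entry right, and running xfs_btree_updkey()
 * on it updates the sibling's parent key without disturbing cur.
 */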
2178
2179/*
2180 * Split cur/level block in half.
2181 * Return new block number and the key to its first
2182 * record (to be inserted into parent).
2183 */
2184STATIC int /* error */
2185xfs_btree_split(
2186 struct xfs_btree_cur *cur,
2187 int level,
2188 union xfs_btree_ptr *ptrp,
2189 union xfs_btree_key *key,
2190 struct xfs_btree_cur **curp,
2191 int *stat) /* success/failure */
2192{
2193 union xfs_btree_ptr lptr; /* left sibling block ptr */
2194 struct xfs_buf *lbp; /* left buffer pointer */
2195 struct xfs_btree_block *left; /* left btree block */
2196 union xfs_btree_ptr rptr; /* right sibling block ptr */
2197 struct xfs_buf *rbp; /* right buffer pointer */
2198 struct xfs_btree_block *right; /* right btree block */
2199 union xfs_btree_ptr rrptr; /* right-right sibling ptr */
2200 struct xfs_buf *rrbp; /* right-right buffer pointer */
2201 struct xfs_btree_block *rrblock; /* right-right btree block */
2202 int lrecs;
2203 int rrecs;
2204 int src_index;
2205 int error; /* error return value */
2206#ifdef DEBUG
2207 int i;
2208#endif
2209
2210 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2211 XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key);
2212
2213 XFS_BTREE_STATS_INC(cur, split);
2214
2215 /* Set up left block (current one). */
2216 left = xfs_btree_get_block(cur, level, &lbp);
2217
2218#ifdef DEBUG
2219 error = xfs_btree_check_block(cur, left, level, lbp);
2220 if (error)
2221 goto error0;
2222#endif
2223
2224 xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2225
2226 /* Allocate the new block. If we can't do it, we're toast. Give up. */
2227 error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, 1, stat);
2228 if (error)
2229 goto error0;
2230 if (*stat == 0)
2231 goto out0;
2232 XFS_BTREE_STATS_INC(cur, alloc);
2233
2234 /* Set up the new block as "right". */
2235 error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
2236 if (error)
2237 goto error0;
2238
2239 /* Fill in the btree header for the new right block. */
2240 xfs_btree_init_block(cur, xfs_btree_get_level(left), 0, right);
2241
2242 /*
2243 * Split the entries between the old and the new block evenly.
2244	 * If there's an odd number of entries now, the extra one goes to
2245	 * the block the cursor is not in, so the pending insert evens them out.
2246 */
2247 lrecs = xfs_btree_get_numrecs(left);
2248 rrecs = lrecs / 2;
2249 if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
2250 rrecs++;
2251 src_index = (lrecs - rrecs + 1);
2252
2253 XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2254
2255 /*
2256 * Copy btree block entries from the left block over to the
2257 * new block, the right. Update the right block and log the
2258 * changes.
2259 */
2260 if (level > 0) {
2261 /* It's a non-leaf. Move keys and pointers. */
2262 union xfs_btree_key *lkp; /* left btree key */
2263 union xfs_btree_ptr *lpp; /* left address pointer */
2264 union xfs_btree_key *rkp; /* right btree key */
2265 union xfs_btree_ptr *rpp; /* right address pointer */
2266
2267 lkp = xfs_btree_key_addr(cur, src_index, left);
2268 lpp = xfs_btree_ptr_addr(cur, src_index, left);
2269 rkp = xfs_btree_key_addr(cur, 1, right);
2270 rpp = xfs_btree_ptr_addr(cur, 1, right);
2271
2272#ifdef DEBUG
2273		for (i = 0; i < rrecs; i++) {
2274 error = xfs_btree_check_ptr(cur, lpp, i, level);
2275 if (error)
2276 goto error0;
2277 }
2278#endif
2279
2280 xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
2281 xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
2282
2283 xfs_btree_log_keys(cur, rbp, 1, rrecs);
2284 xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2285
2286 /* Grab the keys to the entries moved to the right block */
2287 xfs_btree_copy_keys(cur, key, rkp, 1);
2288 } else {
2289 /* It's a leaf. Move records. */
2290 union xfs_btree_rec *lrp; /* left record pointer */
2291 union xfs_btree_rec *rrp; /* right record pointer */
2292
2293 lrp = xfs_btree_rec_addr(cur, src_index, left);
2294 rrp = xfs_btree_rec_addr(cur, 1, right);
2295
2296 xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
2297 xfs_btree_log_recs(cur, rbp, 1, rrecs);
2298
2299 cur->bc_ops->init_key_from_rec(key,
2300 xfs_btree_rec_addr(cur, 1, right));
2301 }
2302
2303
2304 /*
2305 * Find the left block number by looking in the buffer.
2306 * Adjust numrecs, sibling pointers.
2307 */
2308 xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
2309 xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
2310 xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2311 xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2312
2313 lrecs -= rrecs;
2314 xfs_btree_set_numrecs(left, lrecs);
2315 xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
2316
2317 xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
2318 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
2319
2320 /*
2321 * If there's a block to the new block's right, make that block
2322 * point back to right instead of to left.
2323 */
2324 if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
2325 error = xfs_btree_read_buf_block(cur, &rrptr, level,
2326 0, &rrblock, &rrbp);
2327 if (error)
2328 goto error0;
2329 xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
2330 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
2331 }
2332 /*
2333 * If the cursor is really in the right block, move it there.
2334 * If it's just pointing past the last entry in left, then we'll
2335 * insert there, so don't change anything in that case.
2336 */
2337 if (cur->bc_ptrs[level] > lrecs + 1) {
2338 xfs_btree_setbuf(cur, level, rbp);
2339 cur->bc_ptrs[level] -= lrecs;
2340 }
2341 /*
2342 * If there are more levels, we'll need another cursor which refers
2343	 * to the right block, no matter where this cursor was.
2344 */
2345 if (level + 1 < cur->bc_nlevels) {
2346 error = xfs_btree_dup_cursor(cur, curp);
2347 if (error)
2348 goto error0;
2349 (*curp)->bc_ptrs[level + 1]++;
2350 }
2351 *ptrp = rptr;
2352 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2353 *stat = 1;
2354 return 0;
2355out0:
2356 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2357 *stat = 0;
2358 return 0;
2359
2360error0:
2361 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2362 return error;
2363}
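/*
 * Editor's note -- worked example, not part of the original source:
 * with lrecs = 7 and the cursor at entry 3, rrecs starts as 7 / 2 = 3;
 * lrecs is odd and bc_ptrs[level] <= rrecs + 1, so rrecs becomes 4 and
 * src_index = 7 - 4 + 1 = 4. Entries 4..7 move to the new right block,
 * the left block keeps entries 1..3, and the cursor stays put because
 * 3 is not greater than the new lrecs + 1 = 4. The pending insert then
 * brings both blocks to four entries each.
 */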
2364
2365/*
2366 * Copy the old inode root contents into a real block and make the
2367 * broot point to it.
2368 */
2369int /* error */
2370xfs_btree_new_iroot(
2371 struct xfs_btree_cur *cur, /* btree cursor */
2372 int *logflags, /* logging flags for inode */
2373 int *stat) /* return status - 0 fail */
2374{
2375 struct xfs_buf *cbp; /* buffer for cblock */
2376 struct xfs_btree_block *block; /* btree block */
2377 struct xfs_btree_block *cblock; /* child btree block */
2378 union xfs_btree_key *ckp; /* child key pointer */
2379 union xfs_btree_ptr *cpp; /* child ptr pointer */
2380 union xfs_btree_key *kp; /* pointer to btree key */
2381 union xfs_btree_ptr *pp; /* pointer to block addr */
2382 union xfs_btree_ptr nptr; /* new block addr */
2383 int level; /* btree level */
2384 int error; /* error return code */
2385#ifdef DEBUG
2386 int i; /* loop counter */
2387#endif
2388
2389 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2390 XFS_BTREE_STATS_INC(cur, newroot);
2391
2392 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
2393
2394 level = cur->bc_nlevels - 1;
2395
2396 block = xfs_btree_get_iroot(cur);
2397 pp = xfs_btree_ptr_addr(cur, 1, block);
2398
2399 /* Allocate the new block. If we can't do it, we're toast. Give up. */
2400 error = cur->bc_ops->alloc_block(cur, pp, &nptr, 1, stat);
2401 if (error)
2402 goto error0;
2403 if (*stat == 0) {
2404 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2405 return 0;
2406 }
2407 XFS_BTREE_STATS_INC(cur, alloc);
2408
2409 /* Copy the root into a real block. */
2410 error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
2411 if (error)
2412 goto error0;
2413
2414 memcpy(cblock, block, xfs_btree_block_len(cur));
2415
2416 be16_add_cpu(&block->bb_level, 1);
2417 xfs_btree_set_numrecs(block, 1);
2418 cur->bc_nlevels++;
2419 cur->bc_ptrs[level + 1] = 1;
2420
2421 kp = xfs_btree_key_addr(cur, 1, block);
2422 ckp = xfs_btree_key_addr(cur, 1, cblock);
2423 xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
2424
2425 cpp = xfs_btree_ptr_addr(cur, 1, cblock);
2426#ifdef DEBUG
2427 for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
2428 error = xfs_btree_check_ptr(cur, pp, i, level);
2429 if (error)
2430 goto error0;
2431 }
2432#endif
2433 xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
2434
2435#ifdef DEBUG
2436 error = xfs_btree_check_ptr(cur, &nptr, 0, level);
2437 if (error)
2438 goto error0;
2439#endif
2440 xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
2441
2442 xfs_iroot_realloc(cur->bc_private.b.ip,
2443 1 - xfs_btree_get_numrecs(cblock),
2444 cur->bc_private.b.whichfork);
2445
2446 xfs_btree_setbuf(cur, level, cbp);
2447
2448 /*
2449 * Do all this logging at the end so that
2450 * the root is at the right level.
2451 */
2452 xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
2453 xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2454 xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2455
2456 *logflags |=
2457 XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork);
2458 *stat = 1;
2459 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2460 return 0;
2461error0:
2462 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2463 return error;
2464}
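/*
 * Editor's note, not part of the original source: this is the inode-rooted
 * counterpart of xfs_btree_new_root() below. The old root kept inside the
 * inode fork is copied into a newly allocated block (cblock), and the
 * in-inode root shrinks to a single key/ptr pair pointing at it -- hence
 * bb_level is bumped and numrecs is forced to 1.
 */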
2465
2466/*
2467 * Allocate a new root block, fill it in.
2468 */
2469STATIC int /* error */
2470xfs_btree_new_root(
2471 struct xfs_btree_cur *cur, /* btree cursor */
2472 int *stat) /* success/failure */
2473{
2474 struct xfs_btree_block *block; /* one half of the old root block */
2475 struct xfs_buf *bp; /* buffer containing block */
2476 int error; /* error return value */
2477 struct xfs_buf *lbp; /* left buffer pointer */
2478 struct xfs_btree_block *left; /* left btree block */
2479 struct xfs_buf *nbp; /* new (root) buffer */
2480 struct xfs_btree_block *new; /* new (root) btree block */
2481 int nptr; /* new value for key index, 1 or 2 */
2482 struct xfs_buf *rbp; /* right buffer pointer */
2483 struct xfs_btree_block *right; /* right btree block */
2484 union xfs_btree_ptr rptr;
2485 union xfs_btree_ptr lptr;
2486
2487 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2488 XFS_BTREE_STATS_INC(cur, newroot);
2489
2490 /* initialise our start point from the cursor */
2491 cur->bc_ops->init_ptr_from_cur(cur, &rptr);
2492
2493 /* Allocate the new block. If we can't do it, we're toast. Give up. */
2494 error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, 1, stat);
2495 if (error)
2496 goto error0;
2497 if (*stat == 0)
2498 goto out0;
2499 XFS_BTREE_STATS_INC(cur, alloc);
2500
2501 /* Set up the new block. */
2502 error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
2503 if (error)
2504 goto error0;
2505
2506 /* Set the root in the holding structure increasing the level by 1. */
2507 cur->bc_ops->set_root(cur, &lptr, 1);
2508
2509 /*
2510 * At the previous root level there are now two blocks: the old root,
2511 * and the new block generated when it was split. We don't know which
2512 * one the cursor is pointing at, so we set up variables "left" and
2513 * "right" for each case.
2514 */
2515 block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
2516
2517#ifdef DEBUG
2518 error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
2519 if (error)
2520 goto error0;
2521#endif
2522
2523 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
2524 if (!xfs_btree_ptr_is_null(cur, &rptr)) {
2525 /* Our block is left, pick up the right block. */
2526 lbp = bp;
2527 xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2528 left = block;
2529 error = xfs_btree_read_buf_block(cur, &rptr,
2530 cur->bc_nlevels - 1, 0, &right, &rbp);
2531 if (error)
2532 goto error0;
2533 bp = rbp;
2534 nptr = 1;
2535 } else {
2536 /* Our block is right, pick up the left block. */
2537 rbp = bp;
2538 xfs_btree_buf_to_ptr(cur, rbp, &rptr);
2539 right = block;
2540 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2541 error = xfs_btree_read_buf_block(cur, &lptr,
2542 cur->bc_nlevels - 1, 0, &left, &lbp);
2543 if (error)
2544 goto error0;
2545 bp = lbp;
2546 nptr = 2;
2547 }
2548 /* Fill in the new block's btree header and log it. */
2549 xfs_btree_init_block(cur, cur->bc_nlevels, 2, new);
2550 xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
2551 ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
2552 !xfs_btree_ptr_is_null(cur, &rptr));
2553
2554 /* Fill in the key data in the new root. */
2555 if (xfs_btree_get_level(left) > 0) {
2556 xfs_btree_copy_keys(cur,
2557 xfs_btree_key_addr(cur, 1, new),
2558 xfs_btree_key_addr(cur, 1, left), 1);
2559 xfs_btree_copy_keys(cur,
2560 xfs_btree_key_addr(cur, 2, new),
2561 xfs_btree_key_addr(cur, 1, right), 1);
2562 } else {
2563 cur->bc_ops->init_key_from_rec(
2564 xfs_btree_key_addr(cur, 1, new),
2565 xfs_btree_rec_addr(cur, 1, left));
2566 cur->bc_ops->init_key_from_rec(
2567 xfs_btree_key_addr(cur, 2, new),
2568 xfs_btree_rec_addr(cur, 1, right));
2569 }
2570 xfs_btree_log_keys(cur, nbp, 1, 2);
2571
2572 /* Fill in the pointer data in the new root. */
2573 xfs_btree_copy_ptrs(cur,
2574 xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
2575 xfs_btree_copy_ptrs(cur,
2576 xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
2577 xfs_btree_log_ptrs(cur, nbp, 1, 2);
2578
2579 /* Fix up the cursor. */
2580 xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
2581 cur->bc_ptrs[cur->bc_nlevels] = nptr;
2582 cur->bc_nlevels++;
2583 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2584 *stat = 1;
2585 return 0;
2586error0:
2587 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2588 return error;
2589out0:
2590 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2591 *stat = 0;
2592 return 0;
2593}
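/*
 * Editor's note, not part of the original source: nptr records which of
 * the two children the cursor had been in (1 = left, 2 = right), so the
 * final fixup leaves cur->bc_ptrs at the new top level pointing back at
 * the child block the caller was working in.
 */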
2594
2595STATIC int
2596xfs_btree_make_block_unfull(
2597 struct xfs_btree_cur *cur, /* btree cursor */
2598 int level, /* btree level */
2599 int numrecs,/* # of recs in block */
2600 int *oindex,/* old tree index */
2601 int *index, /* new tree index */
2602 union xfs_btree_ptr *nptr, /* new btree ptr */
2603 struct xfs_btree_cur **ncur, /* new btree cursor */
2604 union xfs_btree_rec *nrec, /* new record */
2605 int *stat)
2606{
2607 union xfs_btree_key key; /* new btree key value */
2608 int error = 0;
2609
2610 if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2611 level == cur->bc_nlevels - 1) {
2612 struct xfs_inode *ip = cur->bc_private.b.ip;
2613
2614 if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
2615 /* A root block that can be made bigger. */
2616
2617 xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
2618 } else {
2619 /* A root block that needs replacing */
2620 int logflags = 0;
2621
2622 error = xfs_btree_new_iroot(cur, &logflags, stat);
2623 if (error || *stat == 0)
2624 return error;
2625
2626 xfs_trans_log_inode(cur->bc_tp, ip, logflags);
2627 }
2628
2629 return 0;
2630 }
2631
2632 /* First, try shifting an entry to the right neighbor. */
2633 error = xfs_btree_rshift(cur, level, stat);
2634 if (error || *stat)
2635 return error;
2636
2637 /* Next, try shifting an entry to the left neighbor. */
2638 error = xfs_btree_lshift(cur, level, stat);
2639 if (error)
2640 return error;
2641
2642 if (*stat) {
2643 *oindex = *index = cur->bc_ptrs[level];
2644 return 0;
2645 }
2646
2647 /*
2648 * Next, try splitting the current block in half.
2649 *
2650 * If this works we have to re-set our variables because we
2651 * could be in a different block now.
2652 */
2653 error = xfs_btree_split(cur, level, nptr, &key, ncur, stat);
2654 if (error || *stat == 0)
2655 return error;
2656
2657
2658 *index = cur->bc_ptrs[level];
2659 cur->bc_ops->init_rec_from_key(&key, nrec);
2660 return 0;
2661}
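/*
 * Editor's note, not part of the original source: the order above is
 * deliberate -- grow or replace an inode root, then shift right, then
 * shift left, and only then split -- because shifts rebalance without
 * allocating, while a split allocates a new block and hands back
 * nptr/ncur/nrec so the caller can insert at the next level up.
 */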
2662
2663/*
2664 * Insert one record/level. Return information to the caller
2665 * allowing the next level up to proceed if necessary.
2666 */
2667STATIC int
2668xfs_btree_insrec(
2669 struct xfs_btree_cur *cur, /* btree cursor */
2670 int level, /* level to insert record at */
2671 union xfs_btree_ptr *ptrp, /* i/o: block number inserted */
2672 union xfs_btree_rec *recp, /* i/o: record data inserted */
2673 struct xfs_btree_cur **curp, /* output: new cursor replacing cur */
2674 int *stat) /* success/failure */
2675{
2676 struct xfs_btree_block *block; /* btree block */
2677 struct xfs_buf *bp; /* buffer for block */
2678 union xfs_btree_key key; /* btree key */
2679 union xfs_btree_ptr nptr; /* new block ptr */
2680 struct xfs_btree_cur *ncur; /* new btree cursor */
2681	union xfs_btree_rec	nrec;	/* new record to insert */
2682 int optr; /* old key/record index */
2683 int ptr; /* key/record index */
2684 int numrecs;/* number of records */
2685 int error; /* error return value */
2686#ifdef DEBUG
2687 int i;
2688#endif
2689
2690 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2691 XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp);
2692
2693 ncur = NULL;
2694
2695 /*
2696 * If we have an external root pointer, and we've made it to the
2697 * root level, allocate a new root block and we're done.
2698 */
2699 if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2700 (level >= cur->bc_nlevels)) {
2701 error = xfs_btree_new_root(cur, stat);
2702 xfs_btree_set_ptr_null(cur, ptrp);
2703
2704 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2705 return error;
2706 }
2707
2708 /* If we're off the left edge, return failure. */
2709 ptr = cur->bc_ptrs[level];
2710 if (ptr == 0) {
2711 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2712 *stat = 0;
2713 return 0;
2714 }
2715
2716 /* Make a key out of the record data to be inserted, and save it. */
2717 cur->bc_ops->init_key_from_rec(&key, recp);
2718
2719 optr = ptr;
2720
2721 XFS_BTREE_STATS_INC(cur, insrec);
2722
2723 /* Get pointers to the btree buffer and block. */
2724 block = xfs_btree_get_block(cur, level, &bp);
2725 numrecs = xfs_btree_get_numrecs(block);
2726
2727#ifdef DEBUG
2728 error = xfs_btree_check_block(cur, block, level, bp);
2729 if (error)
2730 goto error0;
2731
2732 /* Check that the new entry is being inserted in the right place. */
2733 if (ptr <= numrecs) {
2734 if (level == 0) {
2735 ASSERT(cur->bc_ops->recs_inorder(cur, recp,
2736 xfs_btree_rec_addr(cur, ptr, block)));
2737 } else {
2738 ASSERT(cur->bc_ops->keys_inorder(cur, &key,
2739 xfs_btree_key_addr(cur, ptr, block)));
2740 }
2741 }
2742#endif
2743
2744 /*
2745 * If the block is full, we can't insert the new entry until we
2746 * make the block un-full.
2747 */
2748 xfs_btree_set_ptr_null(cur, &nptr);
2749 if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
2750 error = xfs_btree_make_block_unfull(cur, level, numrecs,
2751 &optr, &ptr, &nptr, &ncur, &nrec, stat);
2752 if (error || *stat == 0)
2753 goto error0;
2754 }
2755
2756 /*
2757 * The current block may have changed if the block was
2758 * previously full and we have just made space in it.
2759 */
2760 block = xfs_btree_get_block(cur, level, &bp);
2761 numrecs = xfs_btree_get_numrecs(block);
2762
2763#ifdef DEBUG
2764 error = xfs_btree_check_block(cur, block, level, bp);
2765 if (error)
2766 return error;
2767#endif
2768
2769 /*
2770 * At this point we know there's room for our new entry in the block
2771 * we're pointing at.
2772 */
2773 XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
2774
2775 if (level > 0) {
2776		/* It's a nonleaf; make a hole in the keys and ptrs */
2777 union xfs_btree_key *kp;
2778 union xfs_btree_ptr *pp;
2779
2780 kp = xfs_btree_key_addr(cur, ptr, block);
2781 pp = xfs_btree_ptr_addr(cur, ptr, block);
2782
2783#ifdef DEBUG
2784 for (i = numrecs - ptr; i >= 0; i--) {
2785 error = xfs_btree_check_ptr(cur, pp, i, level);
2786 if (error)
2787 return error;
2788 }
2789#endif
2790
2791 xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
2792 xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
2793
2794#ifdef DEBUG
2795 error = xfs_btree_check_ptr(cur, ptrp, 0, level);
2796 if (error)
2797 goto error0;
2798#endif
2799
2800 /* Now put the new data in, bump numrecs and log it. */
2801 xfs_btree_copy_keys(cur, kp, &key, 1);
2802 xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
2803 numrecs++;
2804 xfs_btree_set_numrecs(block, numrecs);
2805 xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
2806 xfs_btree_log_keys(cur, bp, ptr, numrecs);
2807#ifdef DEBUG
2808 if (ptr < numrecs) {
2809 ASSERT(cur->bc_ops->keys_inorder(cur, kp,
2810 xfs_btree_key_addr(cur, ptr + 1, block)));
2811 }
2812#endif
2813 } else {
2814		/* It's a leaf; make a hole in the records */
2815 union xfs_btree_rec *rp;
2816
2817 rp = xfs_btree_rec_addr(cur, ptr, block);
2818
2819 xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
2820
2821 /* Now put the new data in, bump numrecs and log it. */
2822 xfs_btree_copy_recs(cur, rp, recp, 1);
2823 xfs_btree_set_numrecs(block, ++numrecs);
2824 xfs_btree_log_recs(cur, bp, ptr, numrecs);
2825#ifdef DEBUG
2826 if (ptr < numrecs) {
2827 ASSERT(cur->bc_ops->recs_inorder(cur, rp,
2828 xfs_btree_rec_addr(cur, ptr + 1, block)));
2829 }
2830#endif
2831 }
2832
2833 /* Log the new number of records in the btree header. */
2834 xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
2835
2836 /* If we inserted at the start of a block, update the parents' keys. */
2837 if (optr == 1) {
2838 error = xfs_btree_updkey(cur, &key, level + 1);
2839 if (error)
2840 goto error0;
2841 }
2842
2843 /*
2844 * If we are tracking the last record in the tree and
2845 * we are at the far right edge of the tree, update it.
2846 */
2847 if (xfs_btree_is_lastrec(cur, block, level)) {
2848 cur->bc_ops->update_lastrec(cur, block, recp,
2849 ptr, LASTREC_INSREC);
2850 }
2851
2852 /*
2853 * Return the new block number, if any.
2854 * If there is one, give back a record value and a cursor too.
2855 */
2856 *ptrp = nptr;
2857 if (!xfs_btree_ptr_is_null(cur, &nptr)) {
2858 *recp = nrec;
2859 *curp = ncur;
2860 }
2861
2862 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2863 *stat = 1;
2864 return 0;
2865
2866error0:
2867 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2868 return error;
2869}
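/*
 * Editor's note, not part of the original source: on return *ptrp is null
 * unless this level split, in which case ptrp/recp/curp describe the new
 * block's address, its first key expressed as a record, and a cursor the
 * caller can use to continue the insert one level up.
 */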
2870
2871/*
2872 * Insert the record at the point referenced by cur.
2873 *
2874 * A multi-level split of the tree on insert will invalidate the original
2875 * cursor. All callers of this function should assume that the cursor is
2876 * no longer valid and revalidate it.
2877 */
2878int
2879xfs_btree_insert(
2880 struct xfs_btree_cur *cur,
2881 int *stat)
2882{
2883 int error; /* error return value */
2884 int i; /* result value, 0 for failure */
2885 int level; /* current level number in btree */
2886 union xfs_btree_ptr nptr; /* new block number (split result) */
2887 struct xfs_btree_cur *ncur; /* new cursor (split result) */
2888 struct xfs_btree_cur *pcur; /* previous level's cursor */
2889 union xfs_btree_rec rec; /* record to insert */
2890
2891 level = 0;
2892 ncur = NULL;
2893 pcur = cur;
2894
2895 xfs_btree_set_ptr_null(cur, &nptr);
2896 cur->bc_ops->init_rec_from_cur(cur, &rec);
2897
2898 /*
2899 * Loop going up the tree, starting at the leaf level.
2900 * Stop when we don't get a split block, that must mean that
2901 * the insert is finished with this level.
2902 */
2903 do {
2904 /*
2905 * Insert nrec/nptr into this level of the tree.
2906 * Note if we fail, nptr will be null.
2907 */
2908 error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i);
2909 if (error) {
2910 if (pcur != cur)
2911 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
2912 goto error0;
2913 }
2914
2915 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
2916 level++;
2917
2918 /*
2919		 * See if the cursor we just used should be discarded.
2920		 * We can't discard the caller's cursor, but otherwise we should
2921 * if ncur is a new cursor or we're about to be done.
2922 */
2923 if (pcur != cur &&
2924 (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
2925 /* Save the state from the cursor before we trash it */
2926 if (cur->bc_ops->update_cursor)
2927 cur->bc_ops->update_cursor(pcur, cur);
2928 cur->bc_nlevels = pcur->bc_nlevels;
2929 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
2930 }
2931 /* If we got a new cursor, switch to it. */
2932 if (ncur) {
2933 pcur = ncur;
2934 ncur = NULL;
2935 }
2936 } while (!xfs_btree_ptr_is_null(cur, &nptr));
2937
2938 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2939 *stat = i;
2940 return 0;
2941error0:
2942 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2943 return error;
2944}
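/*
 * Editor's note -- usage sketch, not part of the original source:
 *
 *	error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &stat);
 *	if (!error && stat == 0)
 *		error = xfs_btree_insert(cur, &stat);
 *
 * and, per the comment above, the cursor must be revalidated afterwards
 * because a multi-level split invalidates it.
 */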
2945
2946/*
2947 * Try to merge a non-leaf block back into the inode root.
2948 *
2949 * Note: the killroot name comes from the fact that we're effectively
2950 * killing the old root block. But because we can't just delete the
2951 * inode we have to copy the single block it was pointing to into the
2952 * inode.
2953 */
2954int
2955xfs_btree_kill_iroot(
2956 struct xfs_btree_cur *cur)
2957{
2958 int whichfork = cur->bc_private.b.whichfork;
2959 struct xfs_inode *ip = cur->bc_private.b.ip;
2960 struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
2961 struct xfs_btree_block *block;
2962 struct xfs_btree_block *cblock;
2963 union xfs_btree_key *kp;
2964 union xfs_btree_key *ckp;
2965 union xfs_btree_ptr *pp;
2966 union xfs_btree_ptr *cpp;
2967 struct xfs_buf *cbp;
2968 int level;
2969 int index;
2970 int numrecs;
2971#ifdef DEBUG
2972 union xfs_btree_ptr ptr;
2973 int i;
2974#endif
2975
2976 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2977
2978 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
2979 ASSERT(cur->bc_nlevels > 1);
2980
2981 /*
2982	 * Don't deal with the case where the root block needs to be a leaf.
2983 * We're just going to turn the thing back into extents anyway.
2984 */
2985 level = cur->bc_nlevels - 1;
2986 if (level == 1)
2987 goto out0;
2988
2989 /*
2990 * Give up if the root has multiple children.
2991 */
2992 block = xfs_btree_get_iroot(cur);
2993 if (xfs_btree_get_numrecs(block) != 1)
2994 goto out0;
2995
2996 cblock = xfs_btree_get_block(cur, level - 1, &cbp);
2997 numrecs = xfs_btree_get_numrecs(cblock);
2998
2999 /*
3000	 * Only do this if the next level's entries will fit in the inode
3001	 * root. The child's data is then copied up into the inode, and it
3002	 * is the child block, not the root, that gets freed.
3003 */
3004 if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
3005 goto out0;
3006
3007 XFS_BTREE_STATS_INC(cur, killroot);
3008
3009#ifdef DEBUG
3010 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
3011 ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3012 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
3013 ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3014#endif
3015
3016 index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
3017 if (index) {
3018 xfs_iroot_realloc(cur->bc_private.b.ip, index,
3019 cur->bc_private.b.whichfork);
3020 block = ifp->if_broot;
3021 }
3022
3023 be16_add_cpu(&block->bb_numrecs, index);
3024 ASSERT(block->bb_numrecs == cblock->bb_numrecs);
3025
3026 kp = xfs_btree_key_addr(cur, 1, block);
3027 ckp = xfs_btree_key_addr(cur, 1, cblock);
3028 xfs_btree_copy_keys(cur, kp, ckp, numrecs);
3029
3030 pp = xfs_btree_ptr_addr(cur, 1, block);
3031 cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3032#ifdef DEBUG
3033 for (i = 0; i < numrecs; i++) {
3034 int error;
3035
3036 error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
3037 if (error) {
3038 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3039 return error;
3040 }
3041 }
3042#endif
3043 xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3044
3045 cur->bc_ops->free_block(cur, cbp);
3046 XFS_BTREE_STATS_INC(cur, free);
3047
3048 cur->bc_bufs[level - 1] = NULL;
3049 be16_add_cpu(&block->bb_level, -1);
3050 xfs_trans_log_inode(cur->bc_tp, ip,
3051 XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
3052 cur->bc_nlevels--;
3053out0:
3054 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3055 return 0;
3056}
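/*
 * Editor's note, not part of the original source: the inverse of
 * xfs_btree_new_iroot() -- when the root in the inode has a single child
 * whose entries fit in the fork, the child's keys and pointers are copied
 * up, the child block is freed, and the tree loses a level.
 */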
3057
3058STATIC int
3059xfs_btree_dec_cursor(
3060 struct xfs_btree_cur *cur,
3061 int level,
3062 int *stat)
3063{
3064 int error;
3065 int i;
3066
3067 if (level > 0) {
3068 error = xfs_btree_decrement(cur, level, &i);
3069 if (error)
3070 return error;
3071 }
3072
3073 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3074 *stat = 1;
3075 return 0;
3076}
3077
3078/*
3079 * Single level of the btree record deletion routine.
3080 * Delete record pointed to by cur/level.
3081 * Remove the record from its block then rebalance the tree.
3082 * Set *stat to 0 for failure, 1 for done, 2 to go on to the next level.
3083 */
3084STATIC int /* error */
3085xfs_btree_delrec(
3086 struct xfs_btree_cur *cur, /* btree cursor */
3087 int level, /* level removing record from */
3088 int *stat) /* fail/done/go-on */
3089{
3090 struct xfs_btree_block *block; /* btree block */
3091 union xfs_btree_ptr cptr; /* current block ptr */
3092 struct xfs_buf *bp; /* buffer for block */
3093 int error; /* error return value */
3094 int i; /* loop counter */
3095 union xfs_btree_key key; /* storage for keyp */
3096 union xfs_btree_key *keyp = &key; /* passed to the next level */
3097 union xfs_btree_ptr lptr; /* left sibling block ptr */
3098 struct xfs_buf *lbp; /* left buffer pointer */
3099 struct xfs_btree_block *left; /* left btree block */
3100 int lrecs = 0; /* left record count */
3101 int ptr; /* key/record index */
3102 union xfs_btree_ptr rptr; /* right sibling block ptr */
3103 struct xfs_buf *rbp; /* right buffer pointer */
3104 struct xfs_btree_block *right; /* right btree block */
3105 struct xfs_btree_block *rrblock; /* right-right btree block */
3106 struct xfs_buf *rrbp; /* right-right buffer pointer */
3107 int rrecs = 0; /* right record count */
3108 struct xfs_btree_cur *tcur; /* temporary btree cursor */
3109 int numrecs; /* temporary numrec count */
3110
3111 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3112 XFS_BTREE_TRACE_ARGI(cur, level);
3113
3114 tcur = NULL;
3115
3116 /* Get the index of the entry being deleted, check for nothing there. */
3117 ptr = cur->bc_ptrs[level];
3118 if (ptr == 0) {
3119 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3120 *stat = 0;
3121 return 0;
3122 }
3123
3124 /* Get the buffer & block containing the record or key/ptr. */
3125 block = xfs_btree_get_block(cur, level, &bp);
3126 numrecs = xfs_btree_get_numrecs(block);
3127
3128#ifdef DEBUG
3129 error = xfs_btree_check_block(cur, block, level, bp);
3130 if (error)
3131 goto error0;
3132#endif
3133
3134 /* Fail if we're off the end of the block. */
3135 if (ptr > numrecs) {
3136 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3137 *stat = 0;
3138 return 0;
3139 }
3140
3141 XFS_BTREE_STATS_INC(cur, delrec);
3142 XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
3143
3144 /* Excise the entries being deleted. */
3145 if (level > 0) {
3146		/* It's a nonleaf; operate on keys and ptrs */
3147 union xfs_btree_key *lkp;
3148 union xfs_btree_ptr *lpp;
3149
3150 lkp = xfs_btree_key_addr(cur, ptr + 1, block);
3151 lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
3152
3153#ifdef DEBUG
3154 for (i = 0; i < numrecs - ptr; i++) {
3155 error = xfs_btree_check_ptr(cur, lpp, i, level);
3156 if (error)
3157 goto error0;
3158 }
3159#endif
3160
3161 if (ptr < numrecs) {
3162 xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
3163 xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
3164 xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
3165 xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
3166 }
3167
3168 /*
3169 * If it's the first record in the block, we'll need to pass a
3170 * key up to the next level (updkey).
3171 */
3172 if (ptr == 1)
3173 keyp = xfs_btree_key_addr(cur, 1, block);
3174 } else {
3175		/* It's a leaf; operate on records */
3176 if (ptr < numrecs) {
3177 xfs_btree_shift_recs(cur,
3178 xfs_btree_rec_addr(cur, ptr + 1, block),
3179 -1, numrecs - ptr);
3180 xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
3181 }
3182
3183 /*
3184 * If it's the first record in the block, we'll need a key
3185 * structure to pass up to the next level (updkey).
3186 */
3187 if (ptr == 1) {
3188 cur->bc_ops->init_key_from_rec(&key,
3189 xfs_btree_rec_addr(cur, 1, block));
3190 keyp = &key;
3191 }
3192 }
3193
3194 /*
3195 * Decrement and log the number of entries in the block.
3196 */
3197 xfs_btree_set_numrecs(block, --numrecs);
3198 xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3199
3200 /*
3201 * If we are tracking the last record in the tree and
3202 * we are at the far right edge of the tree, update it.
3203 */
3204 if (xfs_btree_is_lastrec(cur, block, level)) {
3205 cur->bc_ops->update_lastrec(cur, block, NULL,
3206 ptr, LASTREC_DELREC);
3207 }
3208
3209 /*
3210 * We're at the root level. First, shrink the root block in-memory.
3211 * Try to get rid of the next level down. If we can't then there's
3212 * nothing left to do.
3213 */
3214 if (level == cur->bc_nlevels - 1) {
3215 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3216 xfs_iroot_realloc(cur->bc_private.b.ip, -1,
3217 cur->bc_private.b.whichfork);
3218
3219 error = xfs_btree_kill_iroot(cur);
3220 if (error)
3221 goto error0;
3222
3223 error = xfs_btree_dec_cursor(cur, level, stat);
3224 if (error)
3225 goto error0;
3226 *stat = 1;
3227 return 0;
3228 }
3229
3230 /*
3231 * If this is the root level, and there's only one entry left,
3232 * and it's NOT the leaf level, then we can get rid of this
3233 * level.
3234 */
3235 if (numrecs == 1 && level > 0) {
3236 union xfs_btree_ptr *pp;
3237 /*
3238 * pp is still set to the first pointer in the block.
3239 * Make it the new root of the btree.
3240 */
3241 pp = xfs_btree_ptr_addr(cur, 1, block);
3242 error = cur->bc_ops->kill_root(cur, bp, level, pp);
3243 if (error)
3244 goto error0;
3245 } else if (level > 0) {
3246 error = xfs_btree_dec_cursor(cur, level, stat);
3247 if (error)
3248 goto error0;
3249 }
3250 *stat = 1;
3251 return 0;
3252 }
3253
3254 /*
3255 * If we deleted the leftmost entry in the block, update the
3256 * key values above us in the tree.
3257 */
3258 if (ptr == 1) {
3259 error = xfs_btree_updkey(cur, keyp, level + 1);
3260 if (error)
3261 goto error0;
3262 }
3263
3264 /*
3265 * If the number of records remaining in the block is at least
3266 * the minimum, we're done.
3267 */
3268 if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
3269 error = xfs_btree_dec_cursor(cur, level, stat);
3270 if (error)
3271 goto error0;
3272 return 0;
3273 }
3274
3275 /*
3276 * Otherwise, we have to move some records around to keep the
3277 * tree balanced. Look at the left and right sibling blocks to
3278 * see if we can re-balance by moving only one record.
3279 */
3280 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3281 xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
3282
3283 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3284 /*
3285		 * The root has one child; we need a chance to copy its
3286		 * contents into the root and delete the child. We can't go up
3287		 * to the next level; there's nothing to delete there.
3288 */
3289 if (xfs_btree_ptr_is_null(cur, &rptr) &&
3290 xfs_btree_ptr_is_null(cur, &lptr) &&
3291 level == cur->bc_nlevels - 2) {
3292 error = xfs_btree_kill_iroot(cur);
3293 if (!error)
3294 error = xfs_btree_dec_cursor(cur, level, stat);
3295 if (error)
3296 goto error0;
3297 return 0;
3298 }
3299 }
3300
3301 ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
3302 !xfs_btree_ptr_is_null(cur, &lptr));
3303
3304 /*
3305 * Duplicate the cursor so our btree manipulations here won't
3306 * disrupt the next level up.
3307 */
3308 error = xfs_btree_dup_cursor(cur, &tcur);
3309 if (error)
3310 goto error0;
3311
3312 /*
3313 * If there's a right sibling, see if it's ok to shift an entry
3314 * out of it.
3315 */
3316 if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3317 /*
3318 * Move the temp cursor to the last entry in the next block.
3319 * Actually any entry but the first would suffice.
3320 */
3321 i = xfs_btree_lastrec(tcur, level);
3322 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3323
3324 error = xfs_btree_increment(tcur, level, &i);
3325 if (error)
3326 goto error0;
3327 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3328
3329 i = xfs_btree_lastrec(tcur, level);
3330 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3331
3332 /* Grab a pointer to the block. */
3333 right = xfs_btree_get_block(tcur, level, &rbp);
3334#ifdef DEBUG
3335 error = xfs_btree_check_block(tcur, right, level, rbp);
3336 if (error)
3337 goto error0;
3338#endif
3339 /* Grab the current block number, for future use. */
3340 xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
3341
3342 /*
3343 * If right block is full enough so that removing one entry
3344 * won't make it too empty, and left-shifting an entry out
3345 * of right to us works, we're done.
3346 */
3347 if (xfs_btree_get_numrecs(right) - 1 >=
3348 cur->bc_ops->get_minrecs(tcur, level)) {
3349 error = xfs_btree_lshift(tcur, level, &i);
3350 if (error)
3351 goto error0;
3352 if (i) {
3353 ASSERT(xfs_btree_get_numrecs(block) >=
3354 cur->bc_ops->get_minrecs(tcur, level));
3355
3356 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3357 tcur = NULL;
3358
3359 error = xfs_btree_dec_cursor(cur, level, stat);
3360 if (error)
3361 goto error0;
3362 return 0;
3363 }
3364 }
3365
3366 /*
3367 * Otherwise, grab the number of records in right for
3368 * future reference, and fix up the temp cursor to point
3369 * to our block again (last record).
3370 */
3371 rrecs = xfs_btree_get_numrecs(right);
3372 if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3373 i = xfs_btree_firstrec(tcur, level);
3374 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3375
3376 error = xfs_btree_decrement(tcur, level, &i);
3377 if (error)
3378 goto error0;
3379 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3380 }
3381 }
3382
3383 /*
3384 * If there's a left sibling, see if it's ok to shift an entry
3385 * out of it.
3386 */
3387 if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3388 /*
3389 * Move the temp cursor to the first entry in the
3390 * previous block.
3391 */
3392 i = xfs_btree_firstrec(tcur, level);
3393 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3394
3395 error = xfs_btree_decrement(tcur, level, &i);
3396 if (error)
3397 goto error0;
3398 i = xfs_btree_firstrec(tcur, level);
3399 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
3400
3401 /* Grab a pointer to the block. */
3402 left = xfs_btree_get_block(tcur, level, &lbp);
3403#ifdef DEBUG
3404 error = xfs_btree_check_block(cur, left, level, lbp);
3405 if (error)
3406 goto error0;
3407#endif
3408 /* Grab the current block number, for future use. */
3409 xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
3410
3411 /*
3412 * If left block is full enough so that removing one entry
3413 * won't make it too empty, and right-shifting an entry out
3414 * of left to us works, we're done.
3415 */
3416 if (xfs_btree_get_numrecs(left) - 1 >=
3417 cur->bc_ops->get_minrecs(tcur, level)) {
3418 error = xfs_btree_rshift(tcur, level, &i);
3419 if (error)
3420 goto error0;
3421 if (i) {
3422 ASSERT(xfs_btree_get_numrecs(block) >=
3423 cur->bc_ops->get_minrecs(tcur, level));
3424 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3425 tcur = NULL;
3426 if (level == 0)
3427 cur->bc_ptrs[0]++;
3428 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3429 *stat = 1;
3430 return 0;
3431 }
3432 }
3433
3434 /*
3435		 * Otherwise, grab the number of records in left for
3436 * future reference.
3437 */
3438 lrecs = xfs_btree_get_numrecs(left);
3439 }
3440
3441 /* Delete the temp cursor, we're done with it. */
3442 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3443 tcur = NULL;
3444
3445 /* If here, we need to do a join to keep the tree balanced. */
3446 ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
3447
3448 if (!xfs_btree_ptr_is_null(cur, &lptr) &&
3449 lrecs + xfs_btree_get_numrecs(block) <=
3450 cur->bc_ops->get_maxrecs(cur, level)) {
3451 /*
3452 * Set "right" to be the starting block,
3453 * "left" to be the left neighbor.
3454 */
3455 rptr = cptr;
3456 right = block;
3457 rbp = bp;
3458 error = xfs_btree_read_buf_block(cur, &lptr, level,
3459 0, &left, &lbp);
3460 if (error)
3461 goto error0;
3462
3463 /*
3464 * If that won't work, see if we can join with the right neighbor block.
3465 */
3466 } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
3467 rrecs + xfs_btree_get_numrecs(block) <=
3468 cur->bc_ops->get_maxrecs(cur, level)) {
3469 /*
3470 * Set "left" to be the starting block,
3471 * "right" to be the right neighbor.
3472 */
3473 lptr = cptr;
3474 left = block;
3475 lbp = bp;
3476 error = xfs_btree_read_buf_block(cur, &rptr, level,
3477 0, &right, &rbp);
3478 if (error)
3479 goto error0;
3480
3481 /*
3482 * Otherwise, we can't fix the imbalance.
3483 * Just return. This is probably a logic error, but it's not fatal.
3484 */
3485 } else {
3486 error = xfs_btree_dec_cursor(cur, level, stat);
3487 if (error)
3488 goto error0;
3489 return 0;
3490 }
3491
3492 rrecs = xfs_btree_get_numrecs(right);
3493 lrecs = xfs_btree_get_numrecs(left);
3494
3495 /*
3496 * We're now going to join "left" and "right" by moving all the stuff
3497 * in "right" to "left" and deleting "right".
3498 */
3499 XFS_BTREE_STATS_ADD(cur, moves, rrecs);
3500 if (level > 0) {
3501 /* It's a non-leaf. Move keys and pointers. */
3502 union xfs_btree_key *lkp; /* left btree key */
3503 union xfs_btree_ptr *lpp; /* left address pointer */
3504 union xfs_btree_key *rkp; /* right btree key */
3505 union xfs_btree_ptr *rpp; /* right address pointer */
3506
3507 lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
3508 lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
3509 rkp = xfs_btree_key_addr(cur, 1, right);
3510 rpp = xfs_btree_ptr_addr(cur, 1, right);
3511#ifdef DEBUG
3512 for (i = 1; i < rrecs; i++) {
3513 error = xfs_btree_check_ptr(cur, rpp, i, level);
3514 if (error)
3515 goto error0;
3516 }
3517#endif
3518 xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
3519 xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
3520
3521 xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
3522 xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
3523 } else {
3524 /* It's a leaf. Move records. */
3525 union xfs_btree_rec *lrp; /* left record pointer */
3526 union xfs_btree_rec *rrp; /* right record pointer */
3527
3528 lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
3529 rrp = xfs_btree_rec_addr(cur, 1, right);
3530
3531 xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
3532 xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
3533 }
3534
3535 XFS_BTREE_STATS_INC(cur, join);
3536
3537 /*
3538 * Fix up the number of records and right block pointer in the
3539 * surviving block, and log it.
3540 */
3541 xfs_btree_set_numrecs(left, lrecs + rrecs);
3542	xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB);
3543 xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
3544 xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
3545
3546 /* If there is a right sibling, point it to the remaining block. */
3547 xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
3548 if (!xfs_btree_ptr_is_null(cur, &cptr)) {
3549 error = xfs_btree_read_buf_block(cur, &cptr, level,
3550 0, &rrblock, &rrbp);
3551 if (error)
3552 goto error0;
3553 xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
3554 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
3555 }
3556
3557 /* Free the deleted block. */
3558 error = cur->bc_ops->free_block(cur, rbp);
3559 if (error)
3560 goto error0;
3561 XFS_BTREE_STATS_INC(cur, free);
3562
3563 /*
3564 * If we joined with the left neighbor, set the buffer in the
3565 * cursor to the left block, and fix up the index.
3566 */
3567 if (bp != lbp) {
3568 cur->bc_bufs[level] = lbp;
3569 cur->bc_ptrs[level] += lrecs;
3570 cur->bc_ra[level] = 0;
3571 }
3572 /*
3573 * If we joined with the right neighbor and there's a level above
3574 * us, increment the cursor at that level.
3575 */
3576 else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
3577 (level + 1 < cur->bc_nlevels)) {
3578 error = xfs_btree_increment(cur, level + 1, &i);
3579 if (error)
3580 goto error0;
3581 }
3582
3583 /*
3584 * Readjust the ptr at this level if it's not a leaf, since it's
3585 * still pointing at the deletion point, which makes the cursor
3586 * inconsistent. If this makes the ptr 0, the caller fixes it up.
3587 * We can't use decrement because it would change the next level up.
3588 */
3589 if (level > 0)
3590 cur->bc_ptrs[level]--;
3591
3592 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3593 /* Return value means the next level up has something to do. */
3594 *stat = 2;
3595 return 0;
3596
3597error0:
3598 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3599 if (tcur)
3600 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
3601 return error;
3602}
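/*
 * Editor's note, not part of the original source: the rebalancing above
 * tries, in order, to borrow one entry from the right sibling (lshift via
 * tcur), then from the left sibling (rshift via tcur), and only then
 * merges with whichever sibling fits under get_maxrecs(). Only the merge
 * path sets *stat = 2, telling xfs_btree_delete() to run the deletion
 * again one level up for the now-dead key/ptr.
 */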
3603
3604/*
3605 * Delete the record pointed to by cur.
3606 * The cursor refers to the place where the record was (could be inserted)
3607 * when the operation returns.
3608 */
3609int /* error */
3610xfs_btree_delete(
3611 struct xfs_btree_cur *cur,
3612 int *stat) /* success/failure */
3613{
3614 int error; /* error return value */
3615 int level;
3616 int i;
3617
3618 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3619
3620 /*
3621 * Go up the tree, starting at leaf level.
3622 *
3623 * If 2 is returned then a join was done; go to the next level.
3624 * Otherwise we are done.
3625 */
3626 for (level = 0, i = 2; i == 2; level++) {
3627 error = xfs_btree_delrec(cur, level, &i);
3628 if (error)
3629 goto error0;
3630 }
3631
3632 if (i == 0) {
3633 for (level = 1; level < cur->bc_nlevels; level++) {
3634 if (cur->bc_ptrs[level] == 0) {
3635 error = xfs_btree_decrement(cur, level, &i);
3636 if (error)
3637 goto error0;
3638 break;
3639 }
3640 }
3641 }
3642
3643 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3644 *stat = i;
3645 return 0;
3646error0:
3647 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3648 return error;
3649}
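
A rough sketch of the usual caller pattern for the delete entry point above, assuming a cursor "cur" already built by the btree-specific code; the lookup mode and out-parameters are the ones declared in this patch, error handling is abbreviated and the "out_error" label is assumed context:

	int	stat;	/* 1 = found/succeeded, 0 = not found/failed */
	int	error;

	/* position the cursor at the record to remove */
	error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &stat);
	if (error || stat == 0)
		goto out_error;

	/* the generic code deletes the record and rebalances upward */
	error = xfs_btree_delete(cur, &stat);
	if (error || stat == 0)
		goto out_error;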
3650
3651/*
3652 * Get the data from the pointed-to record.
3653 */
3654int /* error */
3655xfs_btree_get_rec(
3656 struct xfs_btree_cur *cur, /* btree cursor */
3657 union xfs_btree_rec **recp, /* output: btree record */
3658 int *stat) /* output: success/failure */
3659{
3660 struct xfs_btree_block *block; /* btree block */
3661 struct xfs_buf *bp; /* buffer pointer */
3662 int ptr; /* record number */
3663#ifdef DEBUG
3664 int error; /* error return value */
3665#endif
3666
3667 ptr = cur->bc_ptrs[0];
3668 block = xfs_btree_get_block(cur, 0, &bp);
3669
3670#ifdef DEBUG
3671 error = xfs_btree_check_block(cur, block, 0, bp);
3672 if (error)
3673 return error;
3674#endif
3675
3676 /*
3677 * Off the right end or left end, return failure.
3678 */
3679 if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
3680 *stat = 0;
3681 return 0;
3682 }
3683
3684 /*
3685 * Point to the record and extract its data.
3686 */
3687 *recp = xfs_btree_rec_addr(cur, ptr, block);
3688 *stat = 1;
3689 return 0;
3690}
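
A sketch of the record-walk idiom xfs_btree_get_rec() enables, again assuming an initialized cursor; decoding the returned union is left to the btree-specific caller:

	union xfs_btree_rec	*rec;
	int			stat;
	int			error;

	error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, &stat);
	while (!error && stat) {
		error = xfs_btree_get_rec(cur, &rec, &stat);
		if (error || !stat)
			break;
		/* interpret *rec via the matching union member here */
		error = xfs_btree_increment(cur, 0, &stat);
	}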
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 1f528a2a3754..789fffdf8b2f 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -39,39 +39,19 @@ extern kmem_zone_t *xfs_btree_cur_zone;
39#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) 39#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi)
40 40
41/* 41/*
42 * Short form header: space allocation btrees. 42 * Generic btree header.
43 */ 43 *
44typedef struct xfs_btree_sblock { 44 * This is a combination of the actual format used on disk for short and long
45 __be32 bb_magic; /* magic number for block type */ 45 * format btrees. The first three fields are shared by both formats, but
46 __be16 bb_level; /* 0 is a leaf */ 46 * the pointers are different and should be used with care.
47 __be16 bb_numrecs; /* current # of data records */ 47 *
48 __be32 bb_leftsib; /* left sibling block or NULLAGBLOCK */ 48 * To get the size of the actual short or long form headers please use
49 __be32 bb_rightsib; /* right sibling block or NULLAGBLOCK */ 49 * the size macros below. Never use sizeof(xfs_btree_block).
50} xfs_btree_sblock_t;
51
52/*
53 * Long form header: bmap btrees.
54 */
55typedef struct xfs_btree_lblock {
56 __be32 bb_magic; /* magic number for block type */
57 __be16 bb_level; /* 0 is a leaf */
58 __be16 bb_numrecs; /* current # of data records */
59 __be64 bb_leftsib; /* left sibling block or NULLDFSBNO */
60 __be64 bb_rightsib; /* right sibling block or NULLDFSBNO */
61} xfs_btree_lblock_t;
62
63/*
64 * Combined header and structure, used by common code.
65 */ 50 */
66typedef struct xfs_btree_hdr 51struct xfs_btree_block {
67{
68 __be32 bb_magic; /* magic number for block type */ 52 __be32 bb_magic; /* magic number for block type */
69 __be16 bb_level; /* 0 is a leaf */ 53 __be16 bb_level; /* 0 is a leaf */
70 __be16 bb_numrecs; /* current # of data records */ 54 __be16 bb_numrecs; /* current # of data records */
71} xfs_btree_hdr_t;
72
73typedef struct xfs_btree_block {
74 xfs_btree_hdr_t bb_h; /* header */
75 union { 55 union {
76 struct { 56 struct {
77 __be32 bb_leftsib; 57 __be32 bb_leftsib;
@@ -82,7 +62,36 @@ typedef struct xfs_btree_block {
82 __be64 bb_rightsib; 62 __be64 bb_rightsib;
83 } l; /* long form pointers */ 63 } l; /* long form pointers */
84 } bb_u; /* rest */ 64 } bb_u; /* rest */
85} xfs_btree_block_t; 65};
66
67#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */
68#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */
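
The point of these length macros: struct xfs_btree_block always contains the long-form pointer union, so sizeof() overstates a short-form header. A hypothetical helper (name invented here for illustration) would pick the header length from the cursor's XFS_BTREE_LONG_PTRS flag, defined further below:

static inline size_t
xfs_btree_block_len_sketch(struct xfs_btree_cur *cur)
{
	/* 24 bytes with 64-bit sibling pointers, 16 bytes with 32-bit */
	return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
			XFS_BTREE_LBLOCK_LEN : XFS_BTREE_SBLOCK_LEN;
}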
69
70
71/*
72 * Generic key, ptr and record wrapper structures.
73 *
74 * These are disk format structures, and are converted where necessary
75 * by the btree specific code that needs to interpret them.
76 */
77union xfs_btree_ptr {
78 __be32 s; /* short form ptr */
79 __be64 l; /* long form ptr */
80};
81
82union xfs_btree_key {
83 xfs_bmbt_key_t bmbt;
84 xfs_bmdr_key_t bmbr; /* bmbt root block */
85 xfs_alloc_key_t alloc;
86 xfs_inobt_key_t inobt;
87};
88
89union xfs_btree_rec {
90 xfs_bmbt_rec_t bmbt;
91 xfs_bmdr_rec_t bmbr; /* bmbt root block */
92 xfs_alloc_rec_t alloc;
93 xfs_inobt_rec_t inobt;
94};
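
A sketch of how a btree implementation picks its member out of these unions, modeled loosely on the allocation btree (illustrative, not this patch's actual code):

STATIC void
xfs_allocbt_init_key_from_rec_sketch(
	union xfs_btree_key	*key,
	union xfs_btree_rec	*rec)
{
	/* both sides are big-endian disk format, so copy fields verbatim */
	key->alloc.ar_startblock = rec->alloc.ar_startblock;
	key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
}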
86 95
87/* 96/*
88 * For logging record fields. 97 * For logging record fields.
@@ -96,46 +105,131 @@ typedef struct xfs_btree_block {
96#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) 105#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1)
97 106
98/* 107/*
99 * Boolean to select which form of xfs_btree_block_t.bb_u to use.
100 */
101#define XFS_BTREE_LONG_PTRS(btnum) ((btnum) == XFS_BTNUM_BMAP)
102
103/*
104 * Magic numbers for btree blocks. 108 * Magic numbers for btree blocks.
105 */ 109 */
106extern const __uint32_t xfs_magics[]; 110extern const __uint32_t xfs_magics[];
107 111
108/* 112/*
109 * Maximum and minimum records in a btree block. 113 * Generic stats interface
110 * Given block size, type prefix, and leaf flag (0 or 1). 114 */
111 * The divisor below is equivalent to lf ? (e1) : (e2) but that produces 115#define __XFS_BTREE_STATS_INC(type, stat) \
112 * compiler warnings. 116 XFS_STATS_INC(xs_ ## type ## _2_ ## stat)
113 */ 117#define XFS_BTREE_STATS_INC(cur, stat) \
114#define XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf) \ 118do { \
115 ((int)(((bsz) - (uint)sizeof(t ## _block_t)) / \ 119 switch (cur->bc_btnum) { \
116 (((lf) * (uint)sizeof(t ## _rec_t)) + \ 120 case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(abtb, stat); break; \
117 ((1 - (lf)) * \ 121 case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break; \
118 ((uint)sizeof(t ## _key_t) + (uint)sizeof(t ## _ptr_t)))))) 122 case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \
119#define XFS_BTREE_BLOCK_MINRECS(bsz,t,lf) \ 123 case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \
120 (XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf) / 2) 124 case XFS_BTNUM_MAX: ASSERT(0); /* silence gcc */ break; \
121 125 } \
122/* 126} while (0)
123 * Record, key, and pointer address calculation macros. 127
124 * Given block size, type prefix, block pointer, and index of requested entry 128#define __XFS_BTREE_STATS_ADD(type, stat, val) \
125 * (first entry numbered 1). 129 XFS_STATS_ADD(xs_ ## type ## _2_ ## stat, val)
126 */ 130#define XFS_BTREE_STATS_ADD(cur, stat, val) \
127#define XFS_BTREE_REC_ADDR(t,bb,i) \ 131do { \
128 ((t ## _rec_t *)((char *)(bb) + sizeof(t ## _block_t) + \ 132 switch (cur->bc_btnum) { \
129 ((i) - 1) * sizeof(t ## _rec_t))) 133 case XFS_BTNUM_BNO: __XFS_BTREE_STATS_ADD(abtb, stat, val); break; \
130#define XFS_BTREE_KEY_ADDR(t,bb,i) \ 134 case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \
131 ((t ## _key_t *)((char *)(bb) + sizeof(t ## _block_t) + \ 135 case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \
132 ((i) - 1) * sizeof(t ## _key_t))) 136 case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \
133#define XFS_BTREE_PTR_ADDR(t,bb,i,mxr) \ 137 case XFS_BTNUM_MAX: ASSERT(0); /* silence gcc */ break; \
134 ((t ## _ptr_t *)((char *)(bb) + sizeof(t ## _block_t) + \ 138 } \
135 (mxr) * sizeof(t ## _key_t) + ((i) - 1) * sizeof(t ## _ptr_t))) 139} while (0)
136 140
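
Typical call sites in the generic code look like the following; "insrec" and "moves" are existing per-btree stat names:

	XFS_BTREE_STATS_INC(cur, insrec);	/* one record inserted */
	XFS_BTREE_STATS_ADD(cur, moves, rrecs);	/* rrecs records moved */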
137#define XFS_BTREE_MAXLEVELS 8 /* max of all btrees */ 141#define XFS_BTREE_MAXLEVELS 8 /* max of all btrees */
138 142
143struct xfs_btree_ops {
144 /* size of the key and record structures */
145 size_t key_len;
146 size_t rec_len;
147
148 /* cursor operations */
149 struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *);
150 void (*update_cursor)(struct xfs_btree_cur *src,
151 struct xfs_btree_cur *dst);
152
153 /* update btree root pointer */
154 void (*set_root)(struct xfs_btree_cur *cur,
155 union xfs_btree_ptr *nptr, int level_change);
156 int (*kill_root)(struct xfs_btree_cur *cur, struct xfs_buf *bp,
157 int level, union xfs_btree_ptr *newroot);
158
159 /* block allocation / freeing */
160 int (*alloc_block)(struct xfs_btree_cur *cur,
161 union xfs_btree_ptr *start_bno,
162 union xfs_btree_ptr *new_bno,
163 int length, int *stat);
164 int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp);
165
166 /* update last record information */
167 void (*update_lastrec)(struct xfs_btree_cur *cur,
168 struct xfs_btree_block *block,
169 union xfs_btree_rec *rec,
170 int ptr, int reason);
171
172 /* records in block/level */
173 int (*get_minrecs)(struct xfs_btree_cur *cur, int level);
174 int (*get_maxrecs)(struct xfs_btree_cur *cur, int level);
175
176 /* records on disk; matters for the root-in-inode case. */
177 int (*get_dmaxrecs)(struct xfs_btree_cur *cur, int level);
178
179 /* init values of btree structures */
180 void (*init_key_from_rec)(union xfs_btree_key *key,
181 union xfs_btree_rec *rec);
182 void (*init_rec_from_key)(union xfs_btree_key *key,
183 union xfs_btree_rec *rec);
184 void (*init_rec_from_cur)(struct xfs_btree_cur *cur,
185 union xfs_btree_rec *rec);
186 void (*init_ptr_from_cur)(struct xfs_btree_cur *cur,
187 union xfs_btree_ptr *ptr);
188
189 /* difference between key value and cursor value */
190 __int64_t (*key_diff)(struct xfs_btree_cur *cur,
191 union xfs_btree_key *key);
192
193#ifdef DEBUG
194 /* check that k1 is lower than k2 */
195 int (*keys_inorder)(struct xfs_btree_cur *cur,
196 union xfs_btree_key *k1,
197 union xfs_btree_key *k2);
198
199 /* check that r1 is lower than r2 */
200 int (*recs_inorder)(struct xfs_btree_cur *cur,
201 union xfs_btree_rec *r1,
202 union xfs_btree_rec *r2);
203#endif
204
205 /* btree tracing */
206#ifdef XFS_BTREE_TRACE
207 void (*trace_enter)(struct xfs_btree_cur *, const char *,
208 char *, int, int, __psunsigned_t,
209 __psunsigned_t, __psunsigned_t,
210 __psunsigned_t, __psunsigned_t,
211 __psunsigned_t, __psunsigned_t,
212 __psunsigned_t, __psunsigned_t,
213 __psunsigned_t, __psunsigned_t);
214 void (*trace_cursor)(struct xfs_btree_cur *, __uint32_t *,
215 __uint64_t *, __uint64_t *);
216 void (*trace_key)(struct xfs_btree_cur *,
217 union xfs_btree_key *, __uint64_t *,
218 __uint64_t *);
219 void (*trace_record)(struct xfs_btree_cur *,
220 union xfs_btree_rec *, __uint64_t *,
221 __uint64_t *, __uint64_t *);
222#endif
223};
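
A sketch of a per-btree ops table; the names below are invented and only a few hooks are shown, whereas a real table supplies every operation the generic code can call:

static const struct xfs_btree_ops xfs_examplebt_ops_sketch = {
	.key_len		= sizeof(xfs_alloc_key_t),
	.rec_len		= sizeof(xfs_alloc_rec_t),

	.dup_cursor		= xfs_examplebt_dup_cursor,	/* invented */
	.init_key_from_rec	= xfs_allocbt_init_key_from_rec_sketch,
	/* ... remaining hooks elided ... */
};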
224
225/*
226 * Reasons for the update_lastrec method to be called.
227 */
228#define LASTREC_UPDATE 0
229#define LASTREC_INSREC 1
230#define LASTREC_DELREC 2
231
232
139/* 233/*
140 * Btree cursor structure. 234 * Btree cursor structure.
141 * This collects all information needed by the btree code in one place. 235 * This collects all information needed by the btree code in one place.
@@ -144,6 +238,8 @@ typedef struct xfs_btree_cur
144{ 238{
145 struct xfs_trans *bc_tp; /* transaction we're in, if any */ 239 struct xfs_trans *bc_tp; /* transaction we're in, if any */
146 struct xfs_mount *bc_mp; /* file system mount struct */ 240 struct xfs_mount *bc_mp; /* file system mount struct */
241 const struct xfs_btree_ops *bc_ops;
242 uint bc_flags; /* btree features - below */
147 union { 243 union {
148 xfs_alloc_rec_incore_t a; 244 xfs_alloc_rec_incore_t a;
149 xfs_bmbt_irec_t b; 245 xfs_bmbt_irec_t b;
@@ -175,94 +271,40 @@ typedef struct xfs_btree_cur
175 } bc_private; /* per-btree type data */ 271 } bc_private; /* per-btree type data */
176} xfs_btree_cur_t; 272} xfs_btree_cur_t;
177 273
274/* cursor flags */
275#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */
276#define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */
277#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */
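
Generic code tests these feature bits instead of switching on the btree type, roughly like this sketch:

	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
	    level == cur->bc_nlevels - 1) {
		/* root block lives in the inode fork, not in a buffer */
	}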
278
279
178#define XFS_BTREE_NOERROR 0 280#define XFS_BTREE_NOERROR 0
179#define XFS_BTREE_ERROR 1 281#define XFS_BTREE_ERROR 1
180 282
181/* 283/*
182 * Convert from buffer to btree block header. 284 * Convert from buffer to btree block header.
183 */ 285 */
184#define XFS_BUF_TO_BLOCK(bp) ((xfs_btree_block_t *)XFS_BUF_PTR(bp)) 286#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)XFS_BUF_PTR(bp))
185#define XFS_BUF_TO_LBLOCK(bp) ((xfs_btree_lblock_t *)XFS_BUF_PTR(bp))
186#define XFS_BUF_TO_SBLOCK(bp) ((xfs_btree_sblock_t *)XFS_BUF_PTR(bp))
187 287
188 288
189#ifdef __KERNEL__
190
191#ifdef DEBUG
192/* 289/*
193 * Debug routine: check that block header is ok. 290 * Check that block header is ok.
194 */ 291 */
195void 292int
196xfs_btree_check_block( 293xfs_btree_check_block(
197 xfs_btree_cur_t *cur, /* btree cursor */ 294 struct xfs_btree_cur *cur, /* btree cursor */
198 xfs_btree_block_t *block, /* generic btree block pointer */ 295 struct xfs_btree_block *block, /* generic btree block pointer */
199 int level, /* level of the btree block */
200 struct xfs_buf *bp); /* buffer containing block, if any */
201
202/*
203 * Debug routine: check that keys are in the right order.
204 */
205void
206xfs_btree_check_key(
207 xfs_btnum_t btnum, /* btree identifier */
208 void *ak1, /* pointer to left (lower) key */
209 void *ak2); /* pointer to right (higher) key */
210
211/*
212 * Debug routine: check that records are in the right order.
213 */
214void
215xfs_btree_check_rec(
216 xfs_btnum_t btnum, /* btree identifier */
217 void *ar1, /* pointer to left (lower) record */
218 void *ar2); /* pointer to right (higher) record */
219#else
220#define xfs_btree_check_block(a,b,c,d)
221#define xfs_btree_check_key(a,b,c)
222#define xfs_btree_check_rec(a,b,c)
223#endif /* DEBUG */
224
225/*
226 * Checking routine: check that long form block header is ok.
227 */
228int /* error (0 or EFSCORRUPTED) */
229xfs_btree_check_lblock(
230 xfs_btree_cur_t *cur, /* btree cursor */
231 xfs_btree_lblock_t *block, /* btree long form block pointer */
232 int level, /* level of the btree block */ 296 int level, /* level of the btree block */
233 struct xfs_buf *bp); /* buffer containing block, if any */ 297 struct xfs_buf *bp); /* buffer containing block, if any */
234 298
235/* 299/*
236 * Checking routine: check that (long) pointer is ok. 300 * Check that (long) pointer is ok.
237 */ 301 */
238int /* error (0 or EFSCORRUPTED) */ 302int /* error (0 or EFSCORRUPTED) */
239xfs_btree_check_lptr( 303xfs_btree_check_lptr(
240 xfs_btree_cur_t *cur, /* btree cursor */ 304 struct xfs_btree_cur *cur, /* btree cursor */
241 xfs_dfsbno_t ptr, /* btree block disk address */ 305 xfs_dfsbno_t ptr, /* btree block disk address */
242 int level); /* btree block level */ 306 int level); /* btree block level */
243 307
244#define xfs_btree_check_lptr_disk(cur, ptr, level) \
245 xfs_btree_check_lptr(cur, be64_to_cpu(ptr), level)
246
247/*
248 * Checking routine: check that short form block header is ok.
249 */
250int /* error (0 or EFSCORRUPTED) */
251xfs_btree_check_sblock(
252 xfs_btree_cur_t *cur, /* btree cursor */
253 xfs_btree_sblock_t *block, /* btree short form block pointer */
254 int level, /* level of the btree block */
255 struct xfs_buf *bp); /* buffer containing block */
256
257/*
258 * Checking routine: check that (short) pointer is ok.
259 */
260int /* error (0 or EFSCORRUPTED) */
261xfs_btree_check_sptr(
262 xfs_btree_cur_t *cur, /* btree cursor */
263 xfs_agblock_t ptr, /* btree block disk address */
264 int level); /* btree block level */
265
266/* 308/*
267 * Delete the btree cursor. 309 * Delete the btree cursor.
268 */ 310 */
@@ -281,15 +323,6 @@ xfs_btree_dup_cursor(
281 xfs_btree_cur_t **ncur);/* output cursor */ 323 xfs_btree_cur_t **ncur);/* output cursor */
282 324
283/* 325/*
284 * Change the cursor to point to the first record in the current block
285 * at the given level. Other levels are unaffected.
286 */
287int /* success=1, failure=0 */
288xfs_btree_firstrec(
289 xfs_btree_cur_t *cur, /* btree cursor */
290 int level); /* level to change */
291
292/*
293 * Get a buffer for the block, return it with no data read. 326 * Get a buffer for the block, return it with no data read.
294 * Long-form addressing. 327 * Long-form addressing.
295 */ 328 */
@@ -313,20 +346,6 @@ xfs_btree_get_bufs(
313 uint lock); /* lock flags for get_buf */ 346 uint lock); /* lock flags for get_buf */
314 347
315/* 348/*
316 * Allocate a new btree cursor.
317 * The cursor is either for allocation (A) or bmap (B).
318 */
319xfs_btree_cur_t * /* new btree cursor */
320xfs_btree_init_cursor(
321 struct xfs_mount *mp, /* file system mount point */
322 struct xfs_trans *tp, /* transaction pointer */
323 struct xfs_buf *agbp, /* (A only) buffer for agf structure */
324 xfs_agnumber_t agno, /* (A only) allocation group number */
325 xfs_btnum_t btnum, /* btree identifier */
326 struct xfs_inode *ip, /* (B only) inode owning the btree */
327 int whichfork); /* (B only) data/attr fork */
328
329/*
330 * Check for the cursor referring to the last block at the given level. 349 * Check for the cursor referring to the last block at the given level.
331 */ 350 */
332int /* 1=is last block, 0=not last block */ 351int /* 1=is last block, 0=not last block */
@@ -335,15 +354,6 @@ xfs_btree_islastblock(
335 int level); /* level to check */ 354 int level); /* level to check */
336 355
337/* 356/*
338 * Change the cursor to point to the last record in the current block
339 * at the given level. Other levels are unaffected.
340 */
341int /* success=1, failure=0 */
342xfs_btree_lastrec(
343 xfs_btree_cur_t *cur, /* btree cursor */
344 int level); /* level to change */
345
346/*
347 * Compute first and last byte offsets for the fields given. 357 * Compute first and last byte offsets for the fields given.
348 * Interprets the offsets table, which contains struct field offsets. 358 * Interprets the offsets table, which contains struct field offsets.
349 */ 359 */
@@ -404,39 +414,53 @@ xfs_btree_reada_bufs(
404 xfs_extlen_t count); /* count of filesystem blocks */ 414 xfs_extlen_t count); /* count of filesystem blocks */
405 415
406/* 416/*
407 * Read-ahead btree blocks, at the given level. 417 * Set the buffer for level "lev" in the cursor to bp, releasing
408 * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA. 418 * any previous buffer.
409 */ 419 */
410int /* readahead block count */ 420void
411xfs_btree_readahead_core( 421xfs_btree_setbuf(
412 xfs_btree_cur_t *cur, /* btree cursor */ 422 xfs_btree_cur_t *cur, /* btree cursor */
413 int lev, /* level in btree */ 423 int lev, /* level in btree */
414 int lr); /* left/right bits */ 424 struct xfs_buf *bp); /* new buffer to set */
415 425
416static inline int /* readahead block count */
417xfs_btree_readahead(
418 xfs_btree_cur_t *cur, /* btree cursor */
419 int lev, /* level in btree */
420 int lr) /* left/right bits */
421{
422 if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
423 return 0;
424 426
425 return xfs_btree_readahead_core(cur, lev, lr); 427/*
426} 428 * Common btree core entry points.
429 */
430int xfs_btree_increment(struct xfs_btree_cur *, int, int *);
431int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
432int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
433int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
434int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
435int xfs_btree_kill_iroot(struct xfs_btree_cur *);
436int xfs_btree_insert(struct xfs_btree_cur *, int *);
437int xfs_btree_delete(struct xfs_btree_cur *, int *);
438int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
427 439
440/*
441 * Internal btree helpers also used by xfs_bmap.c.
442 */
443void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
444void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int);
428 445
429/* 446/*
430 * Set the buffer for level "lev" in the cursor to bp, releasing 447 * Helpers.
431 * any previous buffer.
432 */ 448 */
433void 449static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block)
434xfs_btree_setbuf( 450{
435 xfs_btree_cur_t *cur, /* btree cursor */ 451 return be16_to_cpu(block->bb_numrecs);
436 int lev, /* level in btree */ 452}
437 struct xfs_buf *bp); /* new buffer to set */ 453
454static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block,
455 __uint16_t numrecs)
456{
457 block->bb_numrecs = cpu_to_be16(numrecs);
458}
438 459
439#endif /* __KERNEL__ */ 460static inline int xfs_btree_get_level(struct xfs_btree_block *block)
461{
462 return be16_to_cpu(block->bb_level);
463}
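
These helpers keep the endian conversion in one place; a typical insert-path fragment might read (sketch):

	int	numrecs = xfs_btree_get_numrecs(block);

	/* the new record has been copied in; bump the on-disk count */
	xfs_btree_set_numrecs(block, numrecs + 1);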
440 464
441 465
442/* 466/*
diff --git a/fs/xfs/xfs_btree_trace.c b/fs/xfs/xfs_btree_trace.c
new file mode 100644
index 000000000000..44ff942a0fda
--- /dev/null
+++ b/fs/xfs/xfs_btree_trace.c
@@ -0,0 +1,249 @@
1/*
2 * Copyright (c) 2008 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_types.h"
20#include "xfs_inum.h"
21#include "xfs_bmap_btree.h"
22#include "xfs_alloc_btree.h"
23#include "xfs_ialloc_btree.h"
24#include "xfs_inode.h"
25#include "xfs_btree.h"
26#include "xfs_btree_trace.h"
27
28STATIC void
29xfs_btree_trace_ptr(
30 struct xfs_btree_cur *cur,
31 union xfs_btree_ptr ptr,
32 __psunsigned_t *high,
33 __psunsigned_t *low)
34{
35 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
36 __u64 val = be64_to_cpu(ptr.l);
37 *high = val >> 32;
38 *low = (int)val;
39 } else {
40 *high = 0;
41 *low = be32_to_cpu(ptr.s);
42 }
43}
44
45/*
46 * Add a trace buffer entry for arguments, for a buffer & 1 integer arg.
47 */
48void
49xfs_btree_trace_argbi(
50 const char *func,
51 struct xfs_btree_cur *cur,
52 struct xfs_buf *b,
53 int i,
54 int line)
55{
56 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGBI,
57 line, (__psunsigned_t)b, i, 0, 0, 0, 0, 0,
58 0, 0, 0, 0);
59}
60
61/*
62 * Add a trace buffer entry for arguments, for a buffer & 2 integer args.
63 */
64void
65xfs_btree_trace_argbii(
66 const char *func,
67 struct xfs_btree_cur *cur,
68 struct xfs_buf *b,
69 int i0,
70 int i1,
71 int line)
72{
73 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGBII,
74 line, (__psunsigned_t)b, i0, i1, 0, 0, 0, 0,
75 0, 0, 0, 0);
76}
77
78/*
79 * Add a trace buffer entry for arguments, for 3 block-length args
80 * and an integer arg.
81 */
82void
83xfs_btree_trace_argfffi(
84 const char *func,
85 struct xfs_btree_cur *cur,
86 xfs_dfiloff_t o,
87 xfs_dfsbno_t b,
88 xfs_dfilblks_t i,
89 int j,
90 int line)
91{
92 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGFFFI,
93 line,
94 o >> 32, (int)o,
95 b >> 32, (int)b,
96 i >> 32, (int)i,
97 (int)j, 0, 0, 0, 0);
98}
99
100/*
101 * Add a trace buffer entry for arguments, for one integer arg.
102 */
103void
104xfs_btree_trace_argi(
105 const char *func,
106 struct xfs_btree_cur *cur,
107 int i,
108 int line)
109{
110 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGI,
111 line, i, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
112}
113
114/*
115 * Add a trace buffer entry for arguments, for int, fsblock, key.
116 */
117void
118xfs_btree_trace_argipk(
119 const char *func,
120 struct xfs_btree_cur *cur,
121 int i,
122 union xfs_btree_ptr ptr,
123 union xfs_btree_key *key,
124 int line)
125{
126 __psunsigned_t high, low;
127 __uint64_t l0, l1;
128
129 xfs_btree_trace_ptr(cur, ptr, &high, &low);
130 cur->bc_ops->trace_key(cur, key, &l0, &l1);
131 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIPK,
132 line, i, high, low,
133 l0 >> 32, (int)l0,
134 l1 >> 32, (int)l1,
135 0, 0, 0, 0);
136}
137
138/*
139 * Add a trace buffer entry for arguments, for int, fsblock, rec.
140 */
141void
142xfs_btree_trace_argipr(
143 const char *func,
144 struct xfs_btree_cur *cur,
145 int i,
146 union xfs_btree_ptr ptr,
147 union xfs_btree_rec *rec,
148 int line)
149{
150 __psunsigned_t high, low;
151 __uint64_t l0, l1, l2;
152
153 xfs_btree_trace_ptr(cur, ptr, &high, &low);
154 cur->bc_ops->trace_record(cur, rec, &l0, &l1, &l2);
155 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIPR,
156 line, i,
157 high, low,
158 l0 >> 32, (int)l0,
159 l1 >> 32, (int)l1,
160 l2 >> 32, (int)l2,
161 0, 0);
162}
163
164/*
165 * Add a trace buffer entry for arguments, for int, key.
166 */
167void
168xfs_btree_trace_argik(
169 const char *func,
170 struct xfs_btree_cur *cur,
171 int i,
172 union xfs_btree_key *key,
173 int line)
174{
175 __uint64_t l0, l1;
176
177 cur->bc_ops->trace_key(cur, key, &l0, &l1);
178 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIK,
179 line, i,
180 l0 >> 32, (int)l0,
181 l1 >> 32, (int)l1,
182 0, 0, 0, 0, 0, 0);
183}
184
185/*
186 * Add a trace buffer entry for arguments, for record.
187 */
188void
189xfs_btree_trace_argr(
190 const char *func,
191 struct xfs_btree_cur *cur,
192 union xfs_btree_rec *rec,
193 int line)
194{
195 __uint64_t l0, l1, l2;
196
197 cur->bc_ops->trace_record(cur, rec, &l0, &l1, &l2);
198 cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGR,
199 line,
200 l0 >> 32, (int)l0,
201 l1 >> 32, (int)l1,
202 l2 >> 32, (int)l2,
203 0, 0, 0, 0, 0);
204}
205
206/*
207 * Add a trace buffer entry for the cursor/operation.
208 */
209void
210xfs_btree_trace_cursor(
211 const char *func,
212 struct xfs_btree_cur *cur,
213 int type,
214 int line)
215{
216 __uint32_t s0;
217 __uint64_t l0, l1;
218 char *s;
219
220 switch (type) {
221 case XBT_ARGS:
222 s = "args";
223 break;
224 case XBT_ENTRY:
225 s = "entry";
226 break;
227 case XBT_ERROR:
228 s = "error";
229 break;
230 case XBT_EXIT:
231 s = "exit";
232 break;
233 default:
234 s = "unknown";
235 break;
236 }
237
238 cur->bc_ops->trace_cursor(cur, &s0, &l0, &l1);
239 cur->bc_ops->trace_enter(cur, func, s, XFS_BTREE_KTRACE_CUR, line,
240 s0,
241 l0 >> 32, (int)l0,
242 l1 >> 32, (int)l1,
243 (__psunsigned_t)cur->bc_bufs[0],
244 (__psunsigned_t)cur->bc_bufs[1],
245 (__psunsigned_t)cur->bc_bufs[2],
246 (__psunsigned_t)cur->bc_bufs[3],
247 (cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1],
248 (cur->bc_ptrs[2] << 16) | cur->bc_ptrs[3]);
249}
diff --git a/fs/xfs/xfs_btree_trace.h b/fs/xfs/xfs_btree_trace.h
new file mode 100644
index 000000000000..b3f5eb3c3c6c
--- /dev/null
+++ b/fs/xfs/xfs_btree_trace.h
@@ -0,0 +1,116 @@
1/*
2 * Copyright (c) 2008 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_BTREE_TRACE_H__
19#define __XFS_BTREE_TRACE_H__
20
21struct xfs_btree_cur;
22struct xfs_buf;
23
24
25/*
26 * Trace hooks.
27 * i,j = integer (32 bit)
28 * b = btree block buffer (xfs_buf_t)
29 * p = btree ptr
30 * r = btree record
31 * k = btree key
32 */
33
34#ifdef XFS_BTREE_TRACE
35
36/*
37 * Trace buffer entry types.
38 */
39#define XFS_BTREE_KTRACE_ARGBI 1
40#define XFS_BTREE_KTRACE_ARGBII 2
41#define XFS_BTREE_KTRACE_ARGFFFI 3
42#define XFS_BTREE_KTRACE_ARGI 4
43#define XFS_BTREE_KTRACE_ARGIPK 5
44#define XFS_BTREE_KTRACE_ARGIPR 6
45#define XFS_BTREE_KTRACE_ARGIK 7
46#define XFS_BTREE_KTRACE_ARGR 8
47#define XFS_BTREE_KTRACE_CUR 9
48
49/*
50 * Sub-types for cursor traces.
51 */
52#define XBT_ARGS 0
53#define XBT_ENTRY 1
54#define XBT_ERROR 2
55#define XBT_EXIT 3
56
57void xfs_btree_trace_argbi(const char *, struct xfs_btree_cur *,
58 struct xfs_buf *, int, int);
59void xfs_btree_trace_argbii(const char *, struct xfs_btree_cur *,
60 struct xfs_buf *, int, int, int);
61void xfs_btree_trace_argfffi(const char *, struct xfs_btree_cur *,
62 xfs_dfiloff_t, xfs_dfsbno_t, xfs_dfilblks_t, int, int);
63void xfs_btree_trace_argi(const char *, struct xfs_btree_cur *, int, int);
64void xfs_btree_trace_argipk(const char *, struct xfs_btree_cur *, int,
65 union xfs_btree_ptr, union xfs_btree_key *, int);
66void xfs_btree_trace_argipr(const char *, struct xfs_btree_cur *, int,
67 union xfs_btree_ptr, union xfs_btree_rec *, int);
68void xfs_btree_trace_argik(const char *, struct xfs_btree_cur *, int,
69 union xfs_btree_key *, int);
70void xfs_btree_trace_argr(const char *, struct xfs_btree_cur *,
71 union xfs_btree_rec *, int);
72void xfs_btree_trace_cursor(const char *, struct xfs_btree_cur *, int, int);
73
74
75#define XFS_ALLOCBT_TRACE_SIZE 4096 /* size of global trace buffer */
76extern ktrace_t *xfs_allocbt_trace_buf;
77
78#define XFS_INOBT_TRACE_SIZE 4096 /* size of global trace buffer */
79extern ktrace_t *xfs_inobt_trace_buf;
80
81#define XFS_BMBT_TRACE_SIZE 4096 /* size of global trace buffer */
82#define XFS_BMBT_KTRACE_SIZE 32 /* size of per-inode trace buffer */
83extern ktrace_t *xfs_bmbt_trace_buf;
84
85
86#define XFS_BTREE_TRACE_ARGBI(c, b, i) \
87 xfs_btree_trace_argbi(__func__, c, b, i, __LINE__)
88#define XFS_BTREE_TRACE_ARGBII(c, b, i, j) \
89 xfs_btree_trace_argbii(__func__, c, b, i, j, __LINE__)
90#define XFS_BTREE_TRACE_ARGFFFI(c, o, b, i, j) \
91 xfs_btree_trace_argfffi(__func__, c, o, b, i, j, __LINE__)
92#define XFS_BTREE_TRACE_ARGI(c, i) \
93 xfs_btree_trace_argi(__func__, c, i, __LINE__)
94#define XFS_BTREE_TRACE_ARGIPK(c, i, p, k) \
95 xfs_btree_trace_argipk(__func__, c, i, p, k, __LINE__)
96#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r) \
97 xfs_btree_trace_argipr(__func__, c, i, p, r, __LINE__)
98#define XFS_BTREE_TRACE_ARGIK(c, i, k) \
99 xfs_btree_trace_argik(__func__, c, i, k, __LINE__)
100#define XFS_BTREE_TRACE_ARGR(c, r) \
101 xfs_btree_trace_argr(__func__, c, r, __LINE__)
102#define XFS_BTREE_TRACE_CURSOR(c, t) \
103 xfs_btree_trace_cursor(__func__, c, t, __LINE__)
104#else
105#define XFS_BTREE_TRACE_ARGBI(c, b, i)
106#define XFS_BTREE_TRACE_ARGBII(c, b, i, j)
107#define XFS_BTREE_TRACE_ARGFFFI(c, o, b, i, j)
108#define XFS_BTREE_TRACE_ARGI(c, i)
109#define XFS_BTREE_TRACE_ARGIPK(c, i, p, s)
110#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r)
111#define XFS_BTREE_TRACE_ARGIK(c, i, k)
112#define XFS_BTREE_TRACE_ARGR(c, r)
113#define XFS_BTREE_TRACE_CURSOR(c, t)
114#endif /* XFS_BTREE_TRACE */
115
116#endif /* __XFS_BTREE_TRACE_H__ */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 002fc2617c8e..92af4098c7e8 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -375,7 +375,7 @@ xfs_buf_item_unpin(
375 xfs_buf_log_item_t *bip, 375 xfs_buf_log_item_t *bip,
376 int stale) 376 int stale)
377{ 377{
378 xfs_mount_t *mp; 378 struct xfs_ail *ailp;
379 xfs_buf_t *bp; 379 xfs_buf_t *bp;
380 int freed; 380 int freed;
381 381
@@ -387,7 +387,7 @@ xfs_buf_item_unpin(
387 xfs_buftrace("XFS_UNPIN", bp); 387 xfs_buftrace("XFS_UNPIN", bp);
388 388
389 freed = atomic_dec_and_test(&bip->bli_refcount); 389 freed = atomic_dec_and_test(&bip->bli_refcount);
390 mp = bip->bli_item.li_mountp; 390 ailp = bip->bli_item.li_ailp;
391 xfs_bunpin(bp); 391 xfs_bunpin(bp);
392 if (freed && stale) { 392 if (freed && stale) {
393 ASSERT(bip->bli_flags & XFS_BLI_STALE); 393 ASSERT(bip->bli_flags & XFS_BLI_STALE);
@@ -399,17 +399,17 @@ xfs_buf_item_unpin(
399 xfs_buftrace("XFS_UNPIN STALE", bp); 399 xfs_buftrace("XFS_UNPIN STALE", bp);
400 /* 400 /*
401 * If we get called here because of an IO error, we may 401 * If we get called here because of an IO error, we may
402 * or may not have the item on the AIL. xfs_trans_delete_ail() 402 * or may not have the item on the AIL. xfs_trans_ail_delete()
403 * will take care of that situation. 403 * will take care of that situation.
404 * xfs_trans_delete_ail() drops the AIL lock. 404 * xfs_trans_ail_delete() drops the AIL lock.
405 */ 405 */
406 if (bip->bli_flags & XFS_BLI_STALE_INODE) { 406 if (bip->bli_flags & XFS_BLI_STALE_INODE) {
407 xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip); 407 xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip);
408 XFS_BUF_SET_FSPRIVATE(bp, NULL); 408 XFS_BUF_SET_FSPRIVATE(bp, NULL);
409 XFS_BUF_CLR_IODONE_FUNC(bp); 409 XFS_BUF_CLR_IODONE_FUNC(bp);
410 } else { 410 } else {
411 spin_lock(&mp->m_ail_lock); 411 spin_lock(&ailp->xa_lock);
412 xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip); 412 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
413 xfs_buf_item_relse(bp); 413 xfs_buf_item_relse(bp);
414 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL); 414 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL);
415 } 415 }
@@ -707,8 +707,8 @@ xfs_buf_item_init(
707 * the first. If we do already have one, there is 707 * the first. If we do already have one, there is
708 * nothing to do here so return. 708 * nothing to do here so return.
709 */ 709 */
710 if (XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *) != mp) 710 if (bp->b_mount != mp)
711 XFS_BUF_SET_FSPRIVATE3(bp, mp); 711 bp->b_mount = mp;
712 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb); 712 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
713 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { 713 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
714 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 714 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
@@ -731,6 +731,7 @@ xfs_buf_item_init(
731 bip->bli_item.li_type = XFS_LI_BUF; 731 bip->bli_item.li_type = XFS_LI_BUF;
732 bip->bli_item.li_ops = &xfs_buf_item_ops; 732 bip->bli_item.li_ops = &xfs_buf_item_ops;
733 bip->bli_item.li_mountp = mp; 733 bip->bli_item.li_mountp = mp;
734 bip->bli_item.li_ailp = mp->m_ail;
734 bip->bli_buf = bp; 735 bip->bli_buf = bp;
735 xfs_buf_hold(bp); 736 xfs_buf_hold(bp);
736 bip->bli_format.blf_type = XFS_LI_BUF; 737 bip->bli_format.blf_type = XFS_LI_BUF;
@@ -997,21 +998,7 @@ xfs_buf_iodone_callbacks(
997 xfs_buf_do_callbacks(bp, lip); 998 xfs_buf_do_callbacks(bp, lip);
998 XFS_BUF_SET_FSPRIVATE(bp, NULL); 999 XFS_BUF_SET_FSPRIVATE(bp, NULL);
999 XFS_BUF_CLR_IODONE_FUNC(bp); 1000 XFS_BUF_CLR_IODONE_FUNC(bp);
1000 1001 xfs_biodone(bp);
1001 /*
1002 * XFS_SHUT flag gets set when we go thru the
1003 * entire buffer cache and deliberately start
1004 * throwing away delayed write buffers.
1005 * Since there's no biowait done on those,
1006 * we should just brelse them.
1007 */
1008 if (XFS_BUF_ISSHUT(bp)) {
1009 XFS_BUF_UNSHUT(bp);
1010 xfs_buf_relse(bp);
1011 } else {
1012 xfs_biodone(bp);
1013 }
1014
1015 return; 1002 return;
1016 } 1003 }
1017 1004
@@ -1122,27 +1109,23 @@ xfs_buf_iodone(
1122 xfs_buf_t *bp, 1109 xfs_buf_t *bp,
1123 xfs_buf_log_item_t *bip) 1110 xfs_buf_log_item_t *bip)
1124{ 1111{
1125 struct xfs_mount *mp; 1112 struct xfs_ail *ailp = bip->bli_item.li_ailp;
1126 1113
1127 ASSERT(bip->bli_buf == bp); 1114 ASSERT(bip->bli_buf == bp);
1128 1115
1129 xfs_buf_rele(bp); 1116 xfs_buf_rele(bp);
1130 mp = bip->bli_item.li_mountp;
1131 1117
1132 /* 1118 /*
1133 * If we are forcibly shutting down, this may well be 1119 * If we are forcibly shutting down, this may well be
1134 * off the AIL already. That's because we simulate the 1120 * off the AIL already. That's because we simulate the
1135 * log-committed callbacks to unpin these buffers. Or we may never 1121 * log-committed callbacks to unpin these buffers. Or we may never
1136 * have put this item on the AIL because the transaction was 1122 * have put this item on the AIL because the transaction was
1137 * aborted forcibly. xfs_trans_delete_ail() takes care of these. 1123 * aborted forcibly. xfs_trans_ail_delete() takes care of these.
1138 * 1124 *
1139 * Either way, AIL is useless if we're forcing a shutdown. 1125 * Either way, AIL is useless if we're forcing a shutdown.
1140 */ 1126 */
1141 spin_lock(&mp->m_ail_lock); 1127 spin_lock(&ailp->xa_lock);
1142 /* 1128 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
1143 * xfs_trans_delete_ail() drops the AIL lock.
1144 */
1145 xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip);
1146 xfs_buf_item_free(bip); 1129 xfs_buf_item_free(bip);
1147} 1130}
1148 1131
diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h
deleted file mode 100644
index d2ce5dd70d87..000000000000
--- a/fs/xfs/xfs_clnt.h
+++ /dev/null
@@ -1,105 +0,0 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_CLNT_H__
19#define __XFS_CLNT_H__
20
21/*
22 * XFS arguments structure, constructed from the arguments we
23 * are passed via the mount system call.
24 *
25 * NOTE: The mount system call is handled differently between
26 * Linux and IRIX. In IRIX we worked work with a binary data
27 * structure coming in across the syscall interface from user
28 * space (the mount userspace knows about each filesystem type
29 * and the set of valid options for it, and converts the users
30 * argument string into a binary structure _before_ making the
31 * system call), and the ABI issues that this implies.
32 *
33 * In Linux, we are passed a comma separated set of options;
34 * ie. a NULL terminated string of characters. Userspace mount
35 * code does not have any knowledge of mount options expected by
36 * each filesystem type and so each filesystem parses its mount
37 * options in kernel space.
38 *
39 * For the Linux port, we kept this structure pretty much intact
40 * and use it internally (because the existing code groks it).
41 */
42struct xfs_mount_args {
43 int flags; /* flags -> see XFSMNT_... macros below */
44 int flags2; /* flags -> see XFSMNT2_... macros below */
45 int logbufs; /* Number of log buffers, -1 to default */
46 int logbufsize; /* Size of log buffers, -1 to default */
47 char fsname[MAXNAMELEN+1]; /* data device name */
48 char rtname[MAXNAMELEN+1]; /* realtime device filename */
49 char logname[MAXNAMELEN+1]; /* journal device filename */
50 char mtpt[MAXNAMELEN+1]; /* filesystem mount point */
51 int sunit; /* stripe unit (BBs) */
52 int swidth; /* stripe width (BBs), multiple of sunit */
53 uchar_t iosizelog; /* log2 of the preferred I/O size */
54 int ihashsize; /* inode hash table size (buckets) */
55};
56
57/*
58 * XFS mount option flags -- args->flags1
59 */
60#define XFSMNT_ATTR2 0x00000001 /* allow ATTR2 EA format */
61#define XFSMNT_WSYNC 0x00000002 /* safe mode nfs mount
62 * compatible */
63#define XFSMNT_INO64 0x00000004 /* move inode numbers up
64 * past 2^32 */
65#define XFSMNT_UQUOTA 0x00000008 /* user quota accounting */
66#define XFSMNT_PQUOTA 0x00000010 /* IRIX prj quota accounting */
67#define XFSMNT_UQUOTAENF 0x00000020 /* user quota limit
68 * enforcement */
69#define XFSMNT_PQUOTAENF 0x00000040 /* IRIX project quota limit
70 * enforcement */
71#define XFSMNT_QUIET 0x00000080 /* don't report mount errors */
72#define XFSMNT_NOALIGN 0x00000200 /* don't allocate at
73 * stripe boundaries*/
74#define XFSMNT_RETERR 0x00000400 /* return error to user */
75#define XFSMNT_NORECOVERY 0x00000800 /* no recovery, implies
76 * read-only mount */
77#define XFSMNT_SHARED 0x00001000 /* shared XFS mount */
78#define XFSMNT_IOSIZE 0x00002000 /* optimize for I/O size */
79#define XFSMNT_OSYNCISOSYNC 0x00004000 /* o_sync is REALLY o_sync */
80 /* (osyncisdsync is default) */
81#define XFSMNT_NOATTR2 0x00008000 /* turn off ATTR2 EA format */
82#define XFSMNT_32BITINODES 0x00200000 /* restrict inodes to 32
83 * bits of address space */
84#define XFSMNT_GQUOTA 0x00400000 /* group quota accounting */
85#define XFSMNT_GQUOTAENF 0x00800000 /* group quota limit
86 * enforcement */
87#define XFSMNT_NOUUID 0x01000000 /* Ignore fs uuid */
88#define XFSMNT_DMAPI 0x02000000 /* enable dmapi/xdsm */
89#define XFSMNT_BARRIER 0x04000000 /* use write barriers */
90#define XFSMNT_IKEEP 0x08000000 /* inode cluster delete */
91#define XFSMNT_SWALLOC 0x10000000 /* turn on stripe width
92 * allocation */
93#define XFSMNT_DIRSYNC 0x40000000 /* sync creat,link,unlink,rename
94 * symlink,mkdir,rmdir,mknod */
95#define XFSMNT_FLAGS2 0x80000000 /* more flags set in flags2 */
96
97/*
98 * XFS mount option flags -- args->flags2
99 */
100#define XFSMNT2_COMPAT_IOSIZE 0x00000001 /* don't report large preferred
101 * I/O size in stat(2) */
102#define XFSMNT2_FILESTREAMS 0x00000002 /* enable the filestreams
103 * allocator */
104
105#endif /* __XFS_CLNT_H__ */
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 8be0b00ede9a..70b710c1792d 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -72,27 +72,7 @@ typedef struct xfs_da_intnode {
72typedef struct xfs_da_node_hdr xfs_da_node_hdr_t; 72typedef struct xfs_da_node_hdr xfs_da_node_hdr_t;
73typedef struct xfs_da_node_entry xfs_da_node_entry_t; 73typedef struct xfs_da_node_entry xfs_da_node_entry_t;
74 74
75#define XFS_DA_MAXHASH ((xfs_dahash_t)-1) /* largest valid hash value */
76
77#define XFS_LBSIZE(mp) (mp)->m_sb.sb_blocksize 75#define XFS_LBSIZE(mp) (mp)->m_sb.sb_blocksize
78#define XFS_LBLOG(mp) (mp)->m_sb.sb_blocklog
79
80#define XFS_DA_MAKE_BNOENTRY(mp,bno,entry) \
81 (((bno) << (mp)->m_dircook_elog) | (entry))
82#define XFS_DA_MAKE_COOKIE(mp,bno,entry,hash) \
83 (((xfs_off_t)XFS_DA_MAKE_BNOENTRY(mp, bno, entry) << 32) | (hash))
84#define XFS_DA_COOKIE_HASH(mp,cookie) ((xfs_dahash_t)cookie)
85#define XFS_DA_COOKIE_BNO(mp,cookie) \
86 ((((xfs_off_t)(cookie) >> 31) == -1LL ? \
87 (xfs_dablk_t)0 : \
88 (xfs_dablk_t)((xfs_off_t)(cookie) >> \
89 ((mp)->m_dircook_elog + 32))))
90#define XFS_DA_COOKIE_ENTRY(mp,cookie) \
91 ((((xfs_off_t)(cookie) >> 31) == -1LL ? \
92 (xfs_dablk_t)0 : \
93 (xfs_dablk_t)(((xfs_off_t)(cookie) >> 32) & \
94 ((1 << (mp)->m_dircook_elog) - 1))))
95
96 76
97/*======================================================================== 77/*========================================================================
98 * Btree searching and modification structure definitions. 78 * Btree searching and modification structure definitions.
@@ -226,9 +206,8 @@ struct xfs_nameops {
226}; 206};
227 207
228 208
229#ifdef __KERNEL__
230/*======================================================================== 209/*========================================================================
231 * Function prototypes for the kernel. 210 * Function prototypes.
232 *========================================================================*/ 211 *========================================================================*/
233 212
234/* 213/*
@@ -289,6 +268,5 @@ xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf);
289 268
290extern struct kmem_zone *xfs_da_state_zone; 269extern struct kmem_zone *xfs_da_state_zone;
291extern struct kmem_zone *xfs_dabuf_zone; 270extern struct kmem_zone *xfs_dabuf_zone;
292#endif /* __KERNEL__ */
293 271
294#endif /* __XFS_DA_BTREE_H__ */ 272#endif /* __XFS_DA_BTREE_H__ */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 75b0cd4da0ea..b4c1ee713492 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -49,9 +49,8 @@
49 */ 49 */
50int 50int
51xfs_swapext( 51xfs_swapext(
52 xfs_swapext_t __user *sxu) 52 xfs_swapext_t *sxp)
53{ 53{
54 xfs_swapext_t *sxp;
55 xfs_inode_t *ip, *tip; 54 xfs_inode_t *ip, *tip;
56 struct file *file, *target_file; 55 struct file *file, *target_file;
57 int error = 0; 56 int error = 0;
@@ -62,11 +61,6 @@ xfs_swapext(
62 goto out; 61 goto out;
63 } 62 }
64 63
65 if (copy_from_user(sxp, sxu, sizeof(xfs_swapext_t))) {
66 error = XFS_ERROR(EFAULT);
67 goto out_free_sxp;
68 }
69
70 /* Pull information for the target fd */ 64 /* Pull information for the target fd */
71 file = fget((int)sxp->sx_fdtarget); 65 file = fget((int)sxp->sx_fdtarget);
72 if (!file) { 66 if (!file) {
diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h
index da178205be68..4f55a6306558 100644
--- a/fs/xfs/xfs_dfrag.h
+++ b/fs/xfs/xfs_dfrag.h
@@ -46,7 +46,7 @@ typedef struct xfs_swapext
46/* 46/*
47 * Syscall interface for xfs_swapext 47 * Syscall interface for xfs_swapext
48 */ 48 */
49int xfs_swapext(struct xfs_swapext __user *sx); 49int xfs_swapext(struct xfs_swapext *sx);
50 50
51int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, 51int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
52 struct xfs_swapext *sxp); 52 struct xfs_swapext *sxp);
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index c9065eaf2a4d..162e8726df5e 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -18,32 +18,29 @@
18#ifndef __XFS_DINODE_H__ 18#ifndef __XFS_DINODE_H__
19#define __XFS_DINODE_H__ 19#define __XFS_DINODE_H__
20 20
21struct xfs_buf; 21#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
22struct xfs_mount; 22#define XFS_DINODE_GOOD_VERSION(v) (((v) == 1 || (v) == 2))
23 23
24#define XFS_DINODE_VERSION_1 1
25#define XFS_DINODE_VERSION_2 2
26#define XFS_DINODE_GOOD_VERSION(v) \
27 (((v) == XFS_DINODE_VERSION_1 || (v) == XFS_DINODE_VERSION_2))
28#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
29
30/*
31 * Disk inode structure.
32 * This is just the header; the inode is expanded to fill a variable size
33 * with the last field expanding. It is split into the core and "other"
34 * because we only need the core part in the in-core inode.
35 */
36typedef struct xfs_timestamp { 24typedef struct xfs_timestamp {
37 __be32 t_sec; /* timestamp seconds */ 25 __be32 t_sec; /* timestamp seconds */
38 __be32 t_nsec; /* timestamp nanoseconds */ 26 __be32 t_nsec; /* timestamp nanoseconds */
39} xfs_timestamp_t; 27} xfs_timestamp_t;
40 28
41/* 29/*
42 * Note: Coordinate changes to this structure with the XFS_DI_* #defines 30 * On-disk inode structure.
43 * below, the offsets table in xfs_ialloc_log_di() and struct xfs_icdinode 31 *
44 * in xfs_inode.h. 32 * This is just the header or "dinode core", the inode is expanded to fill a
33 * variable size, with the leftover area split into a data and an attribute fork.
34 * The format of the data and attribute fork depends on the format of the
35 * inode as indicated by di_format and di_aformat. To access the data and
36 * attribute forks use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros
37 * below.
38 *
39 * There is a very similar struct icdinode in xfs_inode which matches the
40 * layout of the first 96 bytes of this structure, but is kept in native
41 * format instead of big endian.
45 */ 42 */
46typedef struct xfs_dinode_core { 43typedef struct xfs_dinode {
47 __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */ 44 __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */
48 __be16 di_mode; /* mode and type of file */ 45 __be16 di_mode; /* mode and type of file */
49 __u8 di_version; /* inode version */ 46 __u8 di_version; /* inode version */
@@ -69,34 +66,12 @@ typedef struct xfs_dinode_core {
69 __be16 di_dmstate; /* DMIG state info */ 66 __be16 di_dmstate; /* DMIG state info */
70 __be16 di_flags; /* random flags, XFS_DIFLAG_... */ 67 __be16 di_flags; /* random flags, XFS_DIFLAG_... */
71 __be32 di_gen; /* generation number */ 68 __be32 di_gen; /* generation number */
72} xfs_dinode_core_t;
73 69
74#define DI_MAX_FLUSH 0xffff 70 /* di_next_unlinked is the only non-core field in the old dinode */
71 __be32 di_next_unlinked;/* agi unlinked list ptr */
72} __attribute__((packed)) xfs_dinode_t;
75 73
76typedef struct xfs_dinode 74#define DI_MAX_FLUSH 0xffff
77{
78 xfs_dinode_core_t di_core;
79 /*
80 * In adding anything between the core and the union, be
81 * sure to update the macros like XFS_LITINO below and
82 * XFS_BMAP_RBLOCK_DSIZE in xfs_bmap_btree.h.
83 */
84 __be32 di_next_unlinked;/* agi unlinked list ptr */
85 union {
86 xfs_bmdr_block_t di_bmbt; /* btree root block */
87 xfs_bmbt_rec_32_t di_bmx[1]; /* extent list */
88 xfs_dir2_sf_t di_dir2sf; /* shortform directory v2 */
89 char di_c[1]; /* local contents */
90 __be32 di_dev; /* device for S_IFCHR/S_IFBLK */
91 uuid_t di_muuid; /* mount point value */
92 char di_symlink[1]; /* local symbolic link */
93 } di_u;
94 union {
95 xfs_bmdr_block_t di_abmbt; /* btree root block */
96 xfs_bmbt_rec_32_t di_abmx[1]; /* extent list */
97 xfs_attr_shortform_t di_attrsf; /* shortform attribute list */
98 } di_a;
99} xfs_dinode_t;
100 75
101/* 76/*
102 * The 32 bit link count in the inode theoretically maxes out at UINT_MAX. 77 * The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
@@ -107,50 +82,14 @@ typedef struct xfs_dinode
107#define XFS_MAXLINK_1 65535U 82#define XFS_MAXLINK_1 65535U
108 83
109/* 84/*
110 * Bit names for logging disk inodes only
111 */
112#define XFS_DI_MAGIC 0x0000001
113#define XFS_DI_MODE 0x0000002
114#define XFS_DI_VERSION 0x0000004
115#define XFS_DI_FORMAT 0x0000008
116#define XFS_DI_ONLINK 0x0000010
117#define XFS_DI_UID 0x0000020
118#define XFS_DI_GID 0x0000040
119#define XFS_DI_NLINK 0x0000080
120#define XFS_DI_PROJID 0x0000100
121#define XFS_DI_PAD 0x0000200
122#define XFS_DI_ATIME 0x0000400
123#define XFS_DI_MTIME 0x0000800
124#define XFS_DI_CTIME 0x0001000
125#define XFS_DI_SIZE 0x0002000
126#define XFS_DI_NBLOCKS 0x0004000
127#define XFS_DI_EXTSIZE 0x0008000
128#define XFS_DI_NEXTENTS 0x0010000
129#define XFS_DI_NAEXTENTS 0x0020000
130#define XFS_DI_FORKOFF 0x0040000
131#define XFS_DI_AFORMAT 0x0080000
132#define XFS_DI_DMEVMASK 0x0100000
133#define XFS_DI_DMSTATE 0x0200000
134#define XFS_DI_FLAGS 0x0400000
135#define XFS_DI_GEN 0x0800000
136#define XFS_DI_NEXT_UNLINKED 0x1000000
137#define XFS_DI_U 0x2000000
138#define XFS_DI_A 0x4000000
139#define XFS_DI_NUM_BITS 27
140#define XFS_DI_ALL_BITS ((1 << XFS_DI_NUM_BITS) - 1)
141#define XFS_DI_CORE_BITS (XFS_DI_ALL_BITS & ~(XFS_DI_U|XFS_DI_A))
142
143/*
144 * Values for di_format 85 * Values for di_format
145 */ 86 */
146typedef enum xfs_dinode_fmt 87typedef enum xfs_dinode_fmt {
147{ 88 XFS_DINODE_FMT_DEV, /* xfs_dev_t */
148 XFS_DINODE_FMT_DEV, /* CHR, BLK: di_dev */ 89 XFS_DINODE_FMT_LOCAL, /* bulk data */
149 XFS_DINODE_FMT_LOCAL, /* DIR, REG: di_c */ 90 XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */
150 /* LNK: di_symlink */ 91 XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */
151 XFS_DINODE_FMT_EXTENTS, /* DIR, REG, LNK: di_bmx */ 92 XFS_DINODE_FMT_UUID /* uuid_t */
152 XFS_DINODE_FMT_BTREE, /* DIR, REG, LNK: di_bmbt */
153 XFS_DINODE_FMT_UUID /* MNT: di_uuid */
154} xfs_dinode_fmt_t; 93} xfs_dinode_fmt_t;
155 94
156/* 95/*
@@ -166,13 +105,13 @@ typedef enum xfs_dinode_fmt
166 */ 105 */
167#define XFS_LITINO(mp) ((mp)->m_litino) 106#define XFS_LITINO(mp) ((mp)->m_litino)
168#define XFS_BROOT_SIZE_ADJ \ 107#define XFS_BROOT_SIZE_ADJ \
169 (sizeof(xfs_bmbt_block_t) - sizeof(xfs_bmdr_block_t)) 108 (XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t))
170 109
171/* 110/*
172 * Inode data & attribute fork sizes, per inode. 111 * Inode data & attribute fork sizes, per inode.
173 */ 112 */
174#define XFS_DFORK_Q(dip) ((dip)->di_core.di_forkoff != 0) 113#define XFS_DFORK_Q(dip) ((dip)->di_forkoff != 0)
175#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_core.di_forkoff << 3)) 114#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3))
176 115
177#define XFS_DFORK_DSIZE(dip,mp) \ 116#define XFS_DFORK_DSIZE(dip,mp) \
178 (XFS_DFORK_Q(dip) ? \ 117 (XFS_DFORK_Q(dip) ? \
@@ -187,23 +126,42 @@ typedef enum xfs_dinode_fmt
187 XFS_DFORK_DSIZE(dip, mp) : \ 126 XFS_DFORK_DSIZE(dip, mp) : \
188 XFS_DFORK_ASIZE(dip, mp)) 127 XFS_DFORK_ASIZE(dip, mp))
189 128
190#define XFS_DFORK_DPTR(dip) ((dip)->di_u.di_c) 129/*
130 * Return pointers to the data or attribute forks.
131 */
132#define XFS_DFORK_DPTR(dip) \
133 ((char *)(dip) + sizeof(struct xfs_dinode))
191#define XFS_DFORK_APTR(dip) \ 134#define XFS_DFORK_APTR(dip) \
192 ((dip)->di_u.di_c + XFS_DFORK_BOFF(dip)) 135 (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
193#define XFS_DFORK_PTR(dip,w) \ 136#define XFS_DFORK_PTR(dip,w) \
194 ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip)) 137 ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip))
138
195#define XFS_DFORK_FORMAT(dip,w) \ 139#define XFS_DFORK_FORMAT(dip,w) \
196 ((w) == XFS_DATA_FORK ? \ 140 ((w) == XFS_DATA_FORK ? \
197 (dip)->di_core.di_format : \ 141 (dip)->di_format : \
198 (dip)->di_core.di_aformat) 142 (dip)->di_aformat)
199#define XFS_DFORK_NEXTENTS(dip,w) \ 143#define XFS_DFORK_NEXTENTS(dip,w) \
200 ((w) == XFS_DATA_FORK ? \ 144 ((w) == XFS_DATA_FORK ? \
201 be32_to_cpu((dip)->di_core.di_nextents) : \ 145 be32_to_cpu((dip)->di_nextents) : \
202 be16_to_cpu((dip)->di_core.di_anextents)) 146 be16_to_cpu((dip)->di_anextents))
203 147
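
With di_u/di_a gone, fork contents are addressed purely by offset. A sketch of reading local-format (shortform) data such as a symlink target, assuming "dip" points at a validated dinode and "buf" is large enough:

	if (dip->di_format == XFS_DINODE_FMT_LOCAL) {
		/* local data starts immediately after the dinode core */
		memcpy(buf, XFS_DFORK_DPTR(dip),
		       be64_to_cpu(dip->di_size));
	}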
204#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)XFS_BUF_PTR(bp)) 148#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)XFS_BUF_PTR(bp))
205 149
206/* 150/*
151 * For block and character special files the 32bit dev_t is stored at the
152 * beginning of the data fork.
153 */
154static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip)
155{
156 return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip));
157}
158
159static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
160{
161 *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev);
162}
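
Usage sketch for the new accessors, assuming a block or character special inode:

	xfs_dev_t	rdev = 0;

	if (dip->di_format == XFS_DINODE_FMT_DEV)
		rdev = xfs_dinode_get_rdev(dip);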
163
164/*
207 * Values for di_flags 165 * Values for di_flags
208 * There should be a one-to-one correspondence between these flags and the 166 * There should be a one-to-one correspondence between these flags and the
209 * XFS_XFLAG_s. 167 * XFS_XFLAG_s.
diff --git a/fs/xfs/xfs_dir2_sf.h b/fs/xfs/xfs_dir2_sf.h
index deecc9d238f8..6ac44b550d39 100644
--- a/fs/xfs/xfs_dir2_sf.h
+++ b/fs/xfs/xfs_dir2_sf.h
@@ -34,13 +34,6 @@ struct xfs_mount;
34struct xfs_trans; 34struct xfs_trans;
35 35
36/* 36/*
37 * Maximum size of a shortform directory.
38 */
39#define XFS_DIR2_SF_MAX_SIZE \
40 (XFS_DINODE_MAX_SIZE - (uint)sizeof(xfs_dinode_core_t) - \
41 (uint)sizeof(xfs_agino_t))
42
43/*
44 * Inode number stored as 8 8-bit values. 37 * Inode number stored as 8 8-bit values.
45 */ 38 */
46typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t; 39typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
diff --git a/fs/xfs/xfs_dmops.c b/fs/xfs/xfs_dmops.c
index a1e55fb9d5dd..e71e2581c0c3 100644
--- a/fs/xfs/xfs_dmops.c
+++ b/fs/xfs/xfs_dmops.c
@@ -25,7 +25,6 @@
25#include "xfs_inum.h" 25#include "xfs_inum.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_mount.h" 27#include "xfs_mount.h"
28#include "xfs_clnt.h"
29 28
30 29
31static struct xfs_dmops xfs_dmcore_stub = { 30static struct xfs_dmops xfs_dmcore_stub = {
@@ -38,9 +37,9 @@ static struct xfs_dmops xfs_dmcore_stub = {
38}; 37};
39 38
40int 39int
41xfs_dmops_get(struct xfs_mount *mp, struct xfs_mount_args *args) 40xfs_dmops_get(struct xfs_mount *mp)
42{ 41{
43 if (args->flags & XFSMNT_DMAPI) { 42 if (mp->m_flags & XFS_MOUNT_DMAPI) {
44 cmn_err(CE_WARN, 43 cmn_err(CE_WARN,
45 "XFS: dmapi support not available in this kernel."); 44 "XFS: dmapi support not available in this kernel.");
46 return EINVAL; 45 return EINVAL;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index f227ecd1a294..92d5cd5bf4f2 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -153,21 +153,6 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
153} 153}
154#endif /* DEBUG */ 154#endif /* DEBUG */
155 155
156static void
157xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap)
158{
159 if (mp != NULL) {
160 char *newfmt;
161 int len = 16 + mp->m_fsname_len + strlen(fmt);
162
163 newfmt = kmem_alloc(len, KM_SLEEP);
164 sprintf(newfmt, "Filesystem \"%s\": %s", mp->m_fsname, fmt);
165 icmn_err(level, newfmt, ap);
166 kmem_free(newfmt);
167 } else {
168 icmn_err(level, fmt, ap);
169 }
170}
171 156
172void 157void
173xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...) 158xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...)
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 11543f10b0c6..0c93051c4651 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -159,11 +159,15 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
159#define XFS_PTAG_FSBLOCK_ZERO 0x00000080 159#define XFS_PTAG_FSBLOCK_ZERO 0x00000080
160 160
161struct xfs_mount; 161struct xfs_mount;
162/* PRINTFLIKE4 */ 162
163extern void xfs_fs_vcmn_err(int level, struct xfs_mount *mp,
164 char *fmt, va_list ap)
165 __attribute__ ((format (printf, 3, 0)));
163extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp, 166extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp,
164 char *fmt, ...); 167 char *fmt, ...)
165/* PRINTFLIKE3 */ 168 __attribute__ ((format (printf, 4, 5)));
166extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...); 169extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...)
170 __attribute__ ((format (printf, 3, 4)));
167 171
168extern void xfs_hex_dump(void *p, int length); 172extern void xfs_hex_dump(void *p, int length);
169 173
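The format attributes added above let the compiler type-check the variadic arguments against the format string at every call site. A small self-contained demonstration of the same annotation (hypothetical log_err(), not an XFS function):

#include <stdarg.h>
#include <stdio.h>

static void log_err(int level, const char *fmt, ...)
	__attribute__ ((format (printf, 2, 3)));

static void log_err(int level, const char *fmt, ...)
{
	va_list ap;

	fprintf(stderr, "level %d: ", level);
	va_start(ap, fmt);
	vfprintf(stderr, fmt, ap);
	va_end(ap);
	fputc('\n', stderr);
}

int main(void)
{
	log_err(1, "agno %d", 42);	/* OK: %d matches the int argument */
	/* log_err(1, "agno %d", "x");   would draw a -Wformat warning */
	return 0;
}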
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 8aa28f751b2a..05a4bdd4be39 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -108,19 +108,16 @@ xfs_efi_item_pin(xfs_efi_log_item_t *efip)
108STATIC void 108STATIC void
109xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale) 109xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale)
110{ 110{
111 xfs_mount_t *mp; 111 struct xfs_ail *ailp = efip->efi_item.li_ailp;
112 112
113 mp = efip->efi_item.li_mountp; 113 spin_lock(&ailp->xa_lock);
114 spin_lock(&mp->m_ail_lock);
115 if (efip->efi_flags & XFS_EFI_CANCELED) { 114 if (efip->efi_flags & XFS_EFI_CANCELED) {
116 /* 115 /* xfs_trans_ail_delete() drops the AIL lock. */
117 * xfs_trans_delete_ail() drops the AIL lock. 116 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
118 */
119 xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip);
120 xfs_efi_item_free(efip); 117 xfs_efi_item_free(efip);
121 } else { 118 } else {
122 efip->efi_flags |= XFS_EFI_COMMITTED; 119 efip->efi_flags |= XFS_EFI_COMMITTED;
123 spin_unlock(&mp->m_ail_lock); 120 spin_unlock(&ailp->xa_lock);
124 } 121 }
125} 122}
126 123
@@ -134,26 +131,23 @@ xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale)
134STATIC void 131STATIC void
135xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp) 132xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp)
136{ 133{
137 xfs_mount_t *mp; 134 struct xfs_ail *ailp = efip->efi_item.li_ailp;
138 xfs_log_item_desc_t *lidp; 135 xfs_log_item_desc_t *lidp;
139 136
140 mp = efip->efi_item.li_mountp; 137 spin_lock(&ailp->xa_lock);
141 spin_lock(&mp->m_ail_lock);
142 if (efip->efi_flags & XFS_EFI_CANCELED) { 138 if (efip->efi_flags & XFS_EFI_CANCELED) {
143 /* 139 /*
 144 * free the transaction descriptor pointing to this item 140 * free the transaction descriptor pointing to this item
145 */ 141 */
146 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) efip); 142 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) efip);
147 xfs_trans_free_item(tp, lidp); 143 xfs_trans_free_item(tp, lidp);
148 /* 144
149 * pull the item off the AIL. 145 /* xfs_trans_ail_delete() drops the AIL lock. */
150 * xfs_trans_delete_ail() drops the AIL lock. 146 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
151 */
152 xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip);
153 xfs_efi_item_free(efip); 147 xfs_efi_item_free(efip);
154 } else { 148 } else {
155 efip->efi_flags |= XFS_EFI_COMMITTED; 149 efip->efi_flags |= XFS_EFI_COMMITTED;
156 spin_unlock(&mp->m_ail_lock); 150 spin_unlock(&ailp->xa_lock);
157 } 151 }
158} 152}
159 153
@@ -268,6 +262,7 @@ xfs_efi_init(xfs_mount_t *mp,
268 efip->efi_item.li_type = XFS_LI_EFI; 262 efip->efi_item.li_type = XFS_LI_EFI;
269 efip->efi_item.li_ops = &xfs_efi_item_ops; 263 efip->efi_item.li_ops = &xfs_efi_item_ops;
270 efip->efi_item.li_mountp = mp; 264 efip->efi_item.li_mountp = mp;
265 efip->efi_item.li_ailp = mp->m_ail;
271 efip->efi_format.efi_nextents = nextents; 266 efip->efi_format.efi_nextents = nextents;
272 efip->efi_format.efi_id = (__psint_t)(void*)efip; 267 efip->efi_format.efi_id = (__psint_t)(void*)efip;
273 268
@@ -345,25 +340,22 @@ void
345xfs_efi_release(xfs_efi_log_item_t *efip, 340xfs_efi_release(xfs_efi_log_item_t *efip,
346 uint nextents) 341 uint nextents)
347{ 342{
348 xfs_mount_t *mp; 343 struct xfs_ail *ailp = efip->efi_item.li_ailp;
349 int extents_left; 344 int extents_left;
350 345
351 mp = efip->efi_item.li_mountp;
352 ASSERT(efip->efi_next_extent > 0); 346 ASSERT(efip->efi_next_extent > 0);
353 ASSERT(efip->efi_flags & XFS_EFI_COMMITTED); 347 ASSERT(efip->efi_flags & XFS_EFI_COMMITTED);
354 348
355 spin_lock(&mp->m_ail_lock); 349 spin_lock(&ailp->xa_lock);
356 ASSERT(efip->efi_next_extent >= nextents); 350 ASSERT(efip->efi_next_extent >= nextents);
357 efip->efi_next_extent -= nextents; 351 efip->efi_next_extent -= nextents;
358 extents_left = efip->efi_next_extent; 352 extents_left = efip->efi_next_extent;
359 if (extents_left == 0) { 353 if (extents_left == 0) {
360 /* 354 /* xfs_trans_ail_delete() drops the AIL lock. */
361 * xfs_trans_delete_ail() drops the AIL lock. 355 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
362 */
363 xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip);
364 xfs_efi_item_free(efip); 356 xfs_efi_item_free(efip);
365 } else { 357 } else {
366 spin_unlock(&mp->m_ail_lock); 358 spin_unlock(&ailp->xa_lock);
367 } 359 }
368} 360}
369 361
@@ -565,6 +557,7 @@ xfs_efd_init(xfs_mount_t *mp,
565 efdp->efd_item.li_type = XFS_LI_EFD; 557 efdp->efd_item.li_type = XFS_LI_EFD;
566 efdp->efd_item.li_ops = &xfs_efd_item_ops; 558 efdp->efd_item.li_ops = &xfs_efd_item_ops;
567 efdp->efd_item.li_mountp = mp; 559 efdp->efd_item.li_mountp = mp;
560 efdp->efd_item.li_ailp = mp->m_ail;
568 efdp->efd_efip = efip; 561 efdp->efd_efip = efip;
569 efdp->efd_format.efd_nextents = nextents; 562 efdp->efd_format.efd_nextents = nextents;
570 efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; 563 efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
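The pattern above, where xfs_trans_ail_delete() is documented to drop the AIL lock itself, guarantees exactly one unlock on every path. A rough userspace sketch of that hand-off (pthreads stand-in, not the XFS locking code):

#include <pthread.h>

static pthread_mutex_t ail_lock = PTHREAD_MUTEX_INITIALIZER;

/* Like xfs_trans_ail_delete(): called with the lock held, drops it. */
static void ail_delete_locked(void)
{
	/* ... unlink the item from the AIL ... */
	pthread_mutex_unlock(&ail_lock);
}

static void item_unpin(int cancelled)
{
	pthread_mutex_lock(&ail_lock);
	if (cancelled)
		ail_delete_locked();		/* lock released inside */
	else
		pthread_mutex_unlock(&ail_lock);
}

int main(void)
{
	item_unpin(1);
	item_unpin(0);
	return 0;
}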
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 01c0cc88d3f3..f7c06fac8229 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -113,22 +113,14 @@ struct getbmapx {
113#define BMV_IF_ATTRFORK 0x1 /* return attr fork rather than data */ 113#define BMV_IF_ATTRFORK 0x1 /* return attr fork rather than data */
114#define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */ 114#define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */
115#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */ 115#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */
116#define BMV_IF_VALID (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC) 116#define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */
117#ifdef __KERNEL__ 117#define BMV_IF_VALID \
 118#define BMV_IF_EXTENDED 0x40000000 /* getbmapx if set */ 118 (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC)
119#endif
120 119
 121/* bmv_oflags values - returned for each non-header segment */ 120/* bmv_oflags values - returned for each non-header segment */
122#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */ 121#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */
123 122#define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */
124/* Convert getbmap <-> getbmapx - move fields from p1 to p2. */ 123#define BMV_OF_LAST 0x4 /* segment is the last in the file */
125#define GETBMAP_CONVERT(p1,p2) { \
126 p2.bmv_offset = p1.bmv_offset; \
127 p2.bmv_block = p1.bmv_block; \
128 p2.bmv_length = p1.bmv_length; \
129 p2.bmv_count = p1.bmv_count; \
130 p2.bmv_entries = p1.bmv_entries; }
131
132 124
133/* 125/*
134 * Structure for XFS_IOC_FSSETDM. 126 * Structure for XFS_IOC_FSSETDM.
@@ -426,10 +418,6 @@ typedef struct xfs_handle {
426#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS 418#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS
427#define XFS_IOC_SETXFLAGS FS_IOC_SETFLAGS 419#define XFS_IOC_SETXFLAGS FS_IOC_SETFLAGS
428#define XFS_IOC_GETVERSION FS_IOC_GETVERSION 420#define XFS_IOC_GETVERSION FS_IOC_GETVERSION
429/* 32-bit compat counterparts */
430#define XFS_IOC32_GETXFLAGS FS_IOC32_GETFLAGS
431#define XFS_IOC32_SETXFLAGS FS_IOC32_SETFLAGS
432#define XFS_IOC32_GETVERSION FS_IOC32_GETVERSION
433 421
434/* 422/*
435 * ioctl commands that replace IRIX fcntl()'s 423 * ioctl commands that replace IRIX fcntl()'s
@@ -477,8 +465,8 @@ typedef struct xfs_handle {
477#define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection) 465#define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection)
478#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection) 466#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection)
479/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */ 467/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */
480#define XFS_IOC_FREEZE _IOWR('X', 119, int) 468/* XFS_IOC_FREEZE -- FIFREEZE 119 */
481#define XFS_IOC_THAW _IOWR('X', 120, int) 469/* XFS_IOC_THAW -- FITHAW 120 */
482#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) 470#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
483#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) 471#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
484#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq) 472#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
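With BMV_IF_DELALLOC folded into BMV_IF_VALID, a caller passing any unknown input bit can be rejected with a single mask test. A standalone sketch of that validation (flag values mirror the header above; the checking function is illustrative):

#include <stdio.h>

#define BMV_IF_ATTRFORK		0x1
#define BMV_IF_NO_DMAPI_READ	0x2
#define BMV_IF_PREALLOC		0x4
#define BMV_IF_DELALLOC		0x8
#define BMV_IF_VALID \
	(BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC)

static int check_iflags(unsigned int bmv_iflags)
{
	if (bmv_iflags & ~BMV_IF_VALID)
		return -1;	/* the kernel would return XFS_ERROR(EINVAL) */
	return 0;
}

int main(void)
{
	printf("%d %d\n", check_iflags(BMV_IF_DELALLOC), check_iflags(0x10));
	return 0;
}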
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 84583cf73db3..680d0e0ec932 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -126,7 +126,7 @@ xfs_growfs_data_private(
126 xfs_extlen_t agsize; 126 xfs_extlen_t agsize;
127 xfs_extlen_t tmpsize; 127 xfs_extlen_t tmpsize;
128 xfs_alloc_rec_t *arec; 128 xfs_alloc_rec_t *arec;
129 xfs_btree_sblock_t *block; 129 struct xfs_btree_block *block;
130 xfs_buf_t *bp; 130 xfs_buf_t *bp;
131 int bucket; 131 int bucket;
132 int dpct; 132 int dpct;
@@ -251,14 +251,14 @@ xfs_growfs_data_private(
251 bp = xfs_buf_get(mp->m_ddev_targp, 251 bp = xfs_buf_get(mp->m_ddev_targp,
252 XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), 252 XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
253 BTOBB(mp->m_sb.sb_blocksize), 0); 253 BTOBB(mp->m_sb.sb_blocksize), 0);
254 block = XFS_BUF_TO_SBLOCK(bp); 254 block = XFS_BUF_TO_BLOCK(bp);
255 memset(block, 0, mp->m_sb.sb_blocksize); 255 memset(block, 0, mp->m_sb.sb_blocksize);
256 block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); 256 block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC);
257 block->bb_level = 0; 257 block->bb_level = 0;
258 block->bb_numrecs = cpu_to_be16(1); 258 block->bb_numrecs = cpu_to_be16(1);
259 block->bb_leftsib = cpu_to_be32(NULLAGBLOCK); 259 block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
260 block->bb_rightsib = cpu_to_be32(NULLAGBLOCK); 260 block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
261 arec = XFS_BTREE_REC_ADDR(xfs_alloc, block, 1); 261 arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
262 arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); 262 arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
263 arec->ar_blockcount = cpu_to_be32( 263 arec->ar_blockcount = cpu_to_be32(
264 agsize - be32_to_cpu(arec->ar_startblock)); 264 agsize - be32_to_cpu(arec->ar_startblock));
@@ -272,14 +272,14 @@ xfs_growfs_data_private(
272 bp = xfs_buf_get(mp->m_ddev_targp, 272 bp = xfs_buf_get(mp->m_ddev_targp,
273 XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), 273 XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
274 BTOBB(mp->m_sb.sb_blocksize), 0); 274 BTOBB(mp->m_sb.sb_blocksize), 0);
275 block = XFS_BUF_TO_SBLOCK(bp); 275 block = XFS_BUF_TO_BLOCK(bp);
276 memset(block, 0, mp->m_sb.sb_blocksize); 276 memset(block, 0, mp->m_sb.sb_blocksize);
277 block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); 277 block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC);
278 block->bb_level = 0; 278 block->bb_level = 0;
279 block->bb_numrecs = cpu_to_be16(1); 279 block->bb_numrecs = cpu_to_be16(1);
280 block->bb_leftsib = cpu_to_be32(NULLAGBLOCK); 280 block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
281 block->bb_rightsib = cpu_to_be32(NULLAGBLOCK); 281 block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
282 arec = XFS_BTREE_REC_ADDR(xfs_alloc, block, 1); 282 arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
283 arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); 283 arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
284 arec->ar_blockcount = cpu_to_be32( 284 arec->ar_blockcount = cpu_to_be32(
285 agsize - be32_to_cpu(arec->ar_startblock)); 285 agsize - be32_to_cpu(arec->ar_startblock));
@@ -294,13 +294,13 @@ xfs_growfs_data_private(
294 bp = xfs_buf_get(mp->m_ddev_targp, 294 bp = xfs_buf_get(mp->m_ddev_targp,
295 XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), 295 XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
296 BTOBB(mp->m_sb.sb_blocksize), 0); 296 BTOBB(mp->m_sb.sb_blocksize), 0);
297 block = XFS_BUF_TO_SBLOCK(bp); 297 block = XFS_BUF_TO_BLOCK(bp);
298 memset(block, 0, mp->m_sb.sb_blocksize); 298 memset(block, 0, mp->m_sb.sb_blocksize);
299 block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); 299 block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC);
300 block->bb_level = 0; 300 block->bb_level = 0;
301 block->bb_numrecs = 0; 301 block->bb_numrecs = 0;
302 block->bb_leftsib = cpu_to_be32(NULLAGBLOCK); 302 block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
303 block->bb_rightsib = cpu_to_be32(NULLAGBLOCK); 303 block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
304 error = xfs_bwrite(mp, bp); 304 error = xfs_bwrite(mp, bp);
305 if (error) { 305 if (error) {
306 goto error0; 306 goto error0;
@@ -435,6 +435,9 @@ xfs_growfs_data(
435 xfs_growfs_data_t *in) 435 xfs_growfs_data_t *in)
436{ 436{
437 int error; 437 int error;
438
439 if (!capable(CAP_SYS_ADMIN))
440 return XFS_ERROR(EPERM);
438 if (!mutex_trylock(&mp->m_growlock)) 441 if (!mutex_trylock(&mp->m_growlock))
439 return XFS_ERROR(EWOULDBLOCK); 442 return XFS_ERROR(EWOULDBLOCK);
440 error = xfs_growfs_data_private(mp, in); 443 error = xfs_growfs_data_private(mp, in);
@@ -448,6 +451,9 @@ xfs_growfs_log(
448 xfs_growfs_log_t *in) 451 xfs_growfs_log_t *in)
449{ 452{
450 int error; 453 int error;
454
455 if (!capable(CAP_SYS_ADMIN))
456 return XFS_ERROR(EPERM);
451 if (!mutex_trylock(&mp->m_growlock)) 457 if (!mutex_trylock(&mp->m_growlock))
452 return XFS_ERROR(EWOULDBLOCK); 458 return XFS_ERROR(EWOULDBLOCK);
453 error = xfs_growfs_log_private(mp, in); 459 error = xfs_growfs_log_private(mp, in);
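The hunks above add the privilege check before the trylock, so an unprivileged caller is refused cheaply and never touches m_growlock. A userspace sketch of the same ordering (geteuid() standing in for capable(CAP_SYS_ADMIN), pthreads for mutex_trylock; names are illustrative):

#include <errno.h>
#include <pthread.h>
#include <unistd.h>

static pthread_mutex_t growlock = PTHREAD_MUTEX_INITIALIZER;

static int growfs_sketch(void)
{
	int error = 0;

	if (geteuid() != 0)			/* permission check first */
		return EPERM;
	if (pthread_mutex_trylock(&growlock))	/* nonzero: already held */
		return EWOULDBLOCK;
	/* ... grow the filesystem ... */
	pthread_mutex_unlock(&growlock);
	return error;
}

int main(void)
{
	return growfs_sketch();
}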
@@ -589,17 +595,19 @@ out:
589 return 0; 595 return 0;
590} 596}
591 597
592void 598int
593xfs_fs_log_dummy( 599xfs_fs_log_dummy(
594 xfs_mount_t *mp) 600 xfs_mount_t *mp)
595{ 601{
596 xfs_trans_t *tp; 602 xfs_trans_t *tp;
597 xfs_inode_t *ip; 603 xfs_inode_t *ip;
604 int error;
598 605
599 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); 606 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
600 if (xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0)) { 607 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
608 if (error) {
601 xfs_trans_cancel(tp, 0); 609 xfs_trans_cancel(tp, 0);
602 return; 610 return error;
603 } 611 }
604 612
605 ip = mp->m_rootip; 613 ip = mp->m_rootip;
@@ -609,9 +617,10 @@ xfs_fs_log_dummy(
609 xfs_trans_ihold(tp, ip); 617 xfs_trans_ihold(tp, ip);
610 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 618 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
611 xfs_trans_set_sync(tp); 619 xfs_trans_set_sync(tp);
612 xfs_trans_commit(tp, 0); 620 error = xfs_trans_commit(tp, 0);
613 621
614 xfs_iunlock(ip, XFS_ILOCK_EXCL); 622 xfs_iunlock(ip, XFS_ILOCK_EXCL);
623 return error;
615} 624}
616 625
617int 626int
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 300d0c9d61ad..88435e0a77c9 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, 25extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
26 xfs_fsop_resblks_t *outval); 26 xfs_fsop_resblks_t *outval);
27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); 27extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
28extern void xfs_fs_log_dummy(xfs_mount_t *mp); 28extern int xfs_fs_log_dummy(xfs_mount_t *mp);
29 29
30#endif /* __XFS_FSOPS_H__ */ 30#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index aad8c5da38af..e6ebbaeb4dc6 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -41,68 +41,6 @@
41#include "xfs_error.h" 41#include "xfs_error.h"
42#include "xfs_bmap.h" 42#include "xfs_bmap.h"
43 43
44/*
45 * Log specified fields for the inode given by bp and off.
46 */
47STATIC void
48xfs_ialloc_log_di(
49 xfs_trans_t *tp, /* transaction pointer */
50 xfs_buf_t *bp, /* inode buffer */
51 int off, /* index of inode in buffer */
52 int fields) /* bitmask of fields to log */
53{
54 int first; /* first byte number */
55 int ioffset; /* off in bytes */
56 int last; /* last byte number */
57 xfs_mount_t *mp; /* mount point structure */
58 static const short offsets[] = { /* field offsets */
59 /* keep in sync with bits */
60 offsetof(xfs_dinode_core_t, di_magic),
61 offsetof(xfs_dinode_core_t, di_mode),
62 offsetof(xfs_dinode_core_t, di_version),
63 offsetof(xfs_dinode_core_t, di_format),
64 offsetof(xfs_dinode_core_t, di_onlink),
65 offsetof(xfs_dinode_core_t, di_uid),
66 offsetof(xfs_dinode_core_t, di_gid),
67 offsetof(xfs_dinode_core_t, di_nlink),
68 offsetof(xfs_dinode_core_t, di_projid),
69 offsetof(xfs_dinode_core_t, di_pad),
70 offsetof(xfs_dinode_core_t, di_atime),
71 offsetof(xfs_dinode_core_t, di_mtime),
72 offsetof(xfs_dinode_core_t, di_ctime),
73 offsetof(xfs_dinode_core_t, di_size),
74 offsetof(xfs_dinode_core_t, di_nblocks),
75 offsetof(xfs_dinode_core_t, di_extsize),
76 offsetof(xfs_dinode_core_t, di_nextents),
77 offsetof(xfs_dinode_core_t, di_anextents),
78 offsetof(xfs_dinode_core_t, di_forkoff),
79 offsetof(xfs_dinode_core_t, di_aformat),
80 offsetof(xfs_dinode_core_t, di_dmevmask),
81 offsetof(xfs_dinode_core_t, di_dmstate),
82 offsetof(xfs_dinode_core_t, di_flags),
83 offsetof(xfs_dinode_core_t, di_gen),
84 offsetof(xfs_dinode_t, di_next_unlinked),
85 offsetof(xfs_dinode_t, di_u),
86 offsetof(xfs_dinode_t, di_a),
87 sizeof(xfs_dinode_t)
88 };
89
90
91 ASSERT(offsetof(xfs_dinode_t, di_core) == 0);
92 ASSERT((fields & (XFS_DI_U|XFS_DI_A)) == 0);
93 mp = tp->t_mountp;
94 /*
95 * Get the inode-relative first and last bytes for these fields
96 */
97 xfs_btree_offsets(fields, offsets, XFS_DI_NUM_BITS, &first, &last);
98 /*
99 * Convert to buffer offsets and log it.
100 */
101 ioffset = off << mp->m_sb.sb_inodelog;
102 first += ioffset;
103 last += ioffset;
104 xfs_trans_log_buf(tp, bp, first, last);
105}
106 44
107/* 45/*
108 * Allocation group level functions. 46 * Allocation group level functions.
@@ -119,6 +57,102 @@ xfs_ialloc_cluster_alignment(
119} 57}
120 58
121/* 59/*
60 * Lookup the record equal to ino in the btree given by cur.
61 */
62STATIC int /* error */
63xfs_inobt_lookup_eq(
64 struct xfs_btree_cur *cur, /* btree cursor */
65 xfs_agino_t ino, /* starting inode of chunk */
66 __int32_t fcnt, /* free inode count */
67 xfs_inofree_t free, /* free inode mask */
68 int *stat) /* success/failure */
69{
70 cur->bc_rec.i.ir_startino = ino;
71 cur->bc_rec.i.ir_freecount = fcnt;
72 cur->bc_rec.i.ir_free = free;
73 return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
74}
75
76/*
77 * Lookup the first record greater than or equal to ino
78 * in the btree given by cur.
79 */
80int /* error */
81xfs_inobt_lookup_ge(
82 struct xfs_btree_cur *cur, /* btree cursor */
83 xfs_agino_t ino, /* starting inode of chunk */
84 __int32_t fcnt, /* free inode count */
85 xfs_inofree_t free, /* free inode mask */
86 int *stat) /* success/failure */
87{
88 cur->bc_rec.i.ir_startino = ino;
89 cur->bc_rec.i.ir_freecount = fcnt;
90 cur->bc_rec.i.ir_free = free;
91 return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
92}
93
94/*
95 * Lookup the first record less than or equal to ino
96 * in the btree given by cur.
97 */
98int /* error */
99xfs_inobt_lookup_le(
100 struct xfs_btree_cur *cur, /* btree cursor */
101 xfs_agino_t ino, /* starting inode of chunk */
102 __int32_t fcnt, /* free inode count */
103 xfs_inofree_t free, /* free inode mask */
104 int *stat) /* success/failure */
105{
106 cur->bc_rec.i.ir_startino = ino;
107 cur->bc_rec.i.ir_freecount = fcnt;
108 cur->bc_rec.i.ir_free = free;
109 return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
110}
111
112/*
113 * Update the record referred to by cur to the value given
114 * by [ino, fcnt, free].
115 * This either works (return 0) or gets an EFSCORRUPTED error.
116 */
117STATIC int /* error */
118xfs_inobt_update(
119 struct xfs_btree_cur *cur, /* btree cursor */
120 xfs_agino_t ino, /* starting inode of chunk */
121 __int32_t fcnt, /* free inode count */
122 xfs_inofree_t free) /* free inode mask */
123{
124 union xfs_btree_rec rec;
125
126 rec.inobt.ir_startino = cpu_to_be32(ino);
127 rec.inobt.ir_freecount = cpu_to_be32(fcnt);
128 rec.inobt.ir_free = cpu_to_be64(free);
129 return xfs_btree_update(cur, &rec);
130}
131
132/*
133 * Get the data from the pointed-to record.
134 */
135int /* error */
136xfs_inobt_get_rec(
137 struct xfs_btree_cur *cur, /* btree cursor */
138 xfs_agino_t *ino, /* output: starting inode of chunk */
139 __int32_t *fcnt, /* output: number of free inodes */
140 xfs_inofree_t *free, /* output: free inode mask */
141 int *stat) /* output: success/failure */
142{
143 union xfs_btree_rec *rec;
144 int error;
145
146 error = xfs_btree_get_rec(cur, &rec, stat);
147 if (!error && *stat == 1) {
148 *ino = be32_to_cpu(rec->inobt.ir_startino);
149 *fcnt = be32_to_cpu(rec->inobt.ir_freecount);
150 *free = be64_to_cpu(rec->inobt.ir_free);
151 }
152 return error;
153}
154
155/*
122 * Allocate new inodes in the allocation group specified by agbp. 156 * Allocate new inodes in the allocation group specified by agbp.
123 * Return 0 for success, else error code. 157 * Return 0 for success, else error code.
124 */ 158 */
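All of the helpers above follow the btree convention where the return value carries only I/O or corruption errors, while the *stat out-parameter says whether a matching record was found. A self-contained mock of that convention (toy lookup over a sorted array, not the XFS cursor code):

#include <stdio.h>

/* Find the first key >= want; *stat is 1 on a hit, 0 on a clean miss. */
static int lookup_ge(const int *keys, int n, int want, int *stat)
{
	int i;

	*stat = 0;
	for (i = 0; i < n; i++) {
		if (keys[i] >= want) {
			*stat = 1;
			break;
		}
	}
	return 0;	/* no error, even when nothing matched */
}

int main(void)
{
	int keys[] = { 0, 64, 128 };
	int stat;

	if (lookup_ge(keys, 3, 200, &stat))
		return 1;		/* a real error would go here */
	printf("found=%d\n", stat);	/* prints 0: a miss, but no error */
	return 0;
}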
@@ -287,9 +321,9 @@ xfs_ialloc_ag_alloc(
287 * able to use the file system. 321 * able to use the file system.
288 */ 322 */
289 if (xfs_sb_version_hasnlink(&args.mp->m_sb)) 323 if (xfs_sb_version_hasnlink(&args.mp->m_sb))
290 version = XFS_DINODE_VERSION_2; 324 version = 2;
291 else 325 else
292 version = XFS_DINODE_VERSION_1; 326 version = 1;
293 327
294 /* 328 /*
295 * Seed the new inode cluster with a random generation number. This 329 * Seed the new inode cluster with a random generation number. This
@@ -310,18 +344,25 @@ xfs_ialloc_ag_alloc(
310 XFS_BUF_LOCK); 344 XFS_BUF_LOCK);
311 ASSERT(fbuf); 345 ASSERT(fbuf);
312 ASSERT(!XFS_BUF_GETERROR(fbuf)); 346 ASSERT(!XFS_BUF_GETERROR(fbuf));
347
313 /* 348 /*
314 * Set initial values for the inodes in this buffer. 349 * Initialize all inodes in this buffer and then log them.
350 *
351 * XXX: It would be much better if we had just one transaction to
 352 * log a whole cluster of inodes instead of all the individual
353 * transactions causing a lot of log traffic.
315 */ 354 */
316 xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog); 355 xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog);
317 for (i = 0; i < ninodes; i++) { 356 for (i = 0; i < ninodes; i++) {
357 int ioffset = i << args.mp->m_sb.sb_inodelog;
358 uint isize = sizeof(struct xfs_dinode);
359
318 free = XFS_MAKE_IPTR(args.mp, fbuf, i); 360 free = XFS_MAKE_IPTR(args.mp, fbuf, i);
319 free->di_core.di_magic = cpu_to_be16(XFS_DINODE_MAGIC); 361 free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
320 free->di_core.di_version = version; 362 free->di_version = version;
321 free->di_core.di_gen = cpu_to_be32(gen); 363 free->di_gen = cpu_to_be32(gen);
322 free->di_next_unlinked = cpu_to_be32(NULLAGINO); 364 free->di_next_unlinked = cpu_to_be32(NULLAGINO);
323 xfs_ialloc_log_di(tp, fbuf, i, 365 xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
324 XFS_DI_CORE_BITS | XFS_DI_NEXT_UNLINKED);
325 } 366 }
326 xfs_trans_inode_alloc_buf(tp, fbuf); 367 xfs_trans_inode_alloc_buf(tp, fbuf);
327 } 368 }
@@ -335,8 +376,7 @@ xfs_ialloc_ag_alloc(
335 /* 376 /*
336 * Insert records describing the new inode chunk into the btree. 377 * Insert records describing the new inode chunk into the btree.
337 */ 378 */
338 cur = xfs_btree_init_cursor(args.mp, tp, agbp, agno, 379 cur = xfs_inobt_init_cursor(args.mp, tp, agbp, agno);
339 XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
340 for (thisino = newino; 380 for (thisino = newino;
341 thisino < newino + newlen; 381 thisino < newino + newlen;
342 thisino += XFS_INODES_PER_CHUNK) { 382 thisino += XFS_INODES_PER_CHUNK) {
@@ -346,7 +386,7 @@ xfs_ialloc_ag_alloc(
346 return error; 386 return error;
347 } 387 }
348 ASSERT(i == 0); 388 ASSERT(i == 0);
349 if ((error = xfs_inobt_insert(cur, &i))) { 389 if ((error = xfs_btree_insert(cur, &i))) {
350 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 390 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
351 return error; 391 return error;
352 } 392 }
@@ -676,8 +716,7 @@ nextag:
676 */ 716 */
677 agno = tagno; 717 agno = tagno;
678 *IO_agbp = NULL; 718 *IO_agbp = NULL;
679 cur = xfs_btree_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno), 719 cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno));
680 XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
681 /* 720 /*
682 * If pagino is 0 (this is the root inode allocation) use newino. 721 * If pagino is 0 (this is the root inode allocation) use newino.
683 * This must work because we've just allocated some. 722 * This must work because we've just allocated some.
@@ -697,7 +736,7 @@ nextag:
697 goto error0; 736 goto error0;
698 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 737 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
699 freecount += rec.ir_freecount; 738 freecount += rec.ir_freecount;
700 if ((error = xfs_inobt_increment(cur, 0, &i))) 739 if ((error = xfs_btree_increment(cur, 0, &i)))
701 goto error0; 740 goto error0;
702 } while (i == 1); 741 } while (i == 1);
703 742
@@ -741,7 +780,7 @@ nextag:
741 /* 780 /*
742 * Search left with tcur, back up 1 record. 781 * Search left with tcur, back up 1 record.
743 */ 782 */
744 if ((error = xfs_inobt_decrement(tcur, 0, &i))) 783 if ((error = xfs_btree_decrement(tcur, 0, &i)))
745 goto error1; 784 goto error1;
746 doneleft = !i; 785 doneleft = !i;
747 if (!doneleft) { 786 if (!doneleft) {
@@ -755,7 +794,7 @@ nextag:
755 /* 794 /*
756 * Search right with cur, go forward 1 record. 795 * Search right with cur, go forward 1 record.
757 */ 796 */
758 if ((error = xfs_inobt_increment(cur, 0, &i))) 797 if ((error = xfs_btree_increment(cur, 0, &i)))
759 goto error1; 798 goto error1;
760 doneright = !i; 799 doneright = !i;
761 if (!doneright) { 800 if (!doneright) {
@@ -817,7 +856,7 @@ nextag:
817 * further left. 856 * further left.
818 */ 857 */
819 if (useleft) { 858 if (useleft) {
820 if ((error = xfs_inobt_decrement(tcur, 0, 859 if ((error = xfs_btree_decrement(tcur, 0,
821 &i))) 860 &i)))
822 goto error1; 861 goto error1;
823 doneleft = !i; 862 doneleft = !i;
@@ -837,7 +876,7 @@ nextag:
837 * further right. 876 * further right.
838 */ 877 */
839 else { 878 else {
840 if ((error = xfs_inobt_increment(cur, 0, 879 if ((error = xfs_btree_increment(cur, 0,
841 &i))) 880 &i)))
842 goto error1; 881 goto error1;
843 doneright = !i; 882 doneright = !i;
@@ -892,7 +931,7 @@ nextag:
892 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 931 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
893 if (rec.ir_freecount > 0) 932 if (rec.ir_freecount > 0)
894 break; 933 break;
895 if ((error = xfs_inobt_increment(cur, 0, &i))) 934 if ((error = xfs_btree_increment(cur, 0, &i)))
896 goto error0; 935 goto error0;
897 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 936 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
898 } 937 }
@@ -926,7 +965,7 @@ nextag:
926 goto error0; 965 goto error0;
927 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 966 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
928 freecount += rec.ir_freecount; 967 freecount += rec.ir_freecount;
929 if ((error = xfs_inobt_increment(cur, 0, &i))) 968 if ((error = xfs_btree_increment(cur, 0, &i)))
930 goto error0; 969 goto error0;
931 } while (i == 1); 970 } while (i == 1);
932 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) || 971 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
@@ -1022,8 +1061,7 @@ xfs_difree(
1022 /* 1061 /*
1023 * Initialize the cursor. 1062 * Initialize the cursor.
1024 */ 1063 */
1025 cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO, 1064 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
1026 (xfs_inode_t *)0, 0);
1027#ifdef DEBUG 1065#ifdef DEBUG
1028 if (cur->bc_nlevels == 1) { 1066 if (cur->bc_nlevels == 1) {
1029 int freecount = 0; 1067 int freecount = 0;
@@ -1036,7 +1074,7 @@ xfs_difree(
1036 goto error0; 1074 goto error0;
1037 if (i) { 1075 if (i) {
1038 freecount += rec.ir_freecount; 1076 freecount += rec.ir_freecount;
1039 if ((error = xfs_inobt_increment(cur, 0, &i))) 1077 if ((error = xfs_btree_increment(cur, 0, &i)))
1040 goto error0; 1078 goto error0;
1041 } 1079 }
1042 } while (i == 1); 1080 } while (i == 1);
@@ -1098,8 +1136,8 @@ xfs_difree(
1098 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); 1136 xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
1099 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); 1137 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
1100 1138
1101 if ((error = xfs_inobt_delete(cur, &i))) { 1139 if ((error = xfs_btree_delete(cur, &i))) {
1102 cmn_err(CE_WARN, "xfs_difree: xfs_inobt_delete returned an error %d on %s.\n", 1140 cmn_err(CE_WARN, "xfs_difree: xfs_btree_delete returned an error %d on %s.\n",
1103 error, mp->m_fsname); 1141 error, mp->m_fsname);
1104 goto error0; 1142 goto error0;
1105 } 1143 }
@@ -1141,7 +1179,7 @@ xfs_difree(
1141 goto error0; 1179 goto error0;
1142 if (i) { 1180 if (i) {
1143 freecount += rec.ir_freecount; 1181 freecount += rec.ir_freecount;
1144 if ((error = xfs_inobt_increment(cur, 0, &i))) 1182 if ((error = xfs_btree_increment(cur, 0, &i)))
1145 goto error0; 1183 goto error0;
1146 } 1184 }
1147 } while (i == 1); 1185 } while (i == 1);
@@ -1158,36 +1196,28 @@ error0:
1158} 1196}
1159 1197
1160/* 1198/*
1161 * Return the location of the inode in bno/off, for mapping it into a buffer. 1199 * Return the location of the inode in imap, for mapping it into a buffer.
1162 */ 1200 */
1163/*ARGSUSED*/
1164int 1201int
1165xfs_dilocate( 1202xfs_imap(
1166 xfs_mount_t *mp, /* file system mount structure */ 1203 xfs_mount_t *mp, /* file system mount structure */
1167 xfs_trans_t *tp, /* transaction pointer */ 1204 xfs_trans_t *tp, /* transaction pointer */
1168 xfs_ino_t ino, /* inode to locate */ 1205 xfs_ino_t ino, /* inode to locate */
1169 xfs_fsblock_t *bno, /* output: block containing inode */ 1206 struct xfs_imap *imap, /* location map structure */
1170 int *len, /* output: num blocks in inode cluster */ 1207 uint flags) /* flags for inode btree lookup */
1171 int *off, /* output: index in block of inode */
1172 uint flags) /* flags concerning inode lookup */
1173{ 1208{
1174 xfs_agblock_t agbno; /* block number of inode in the alloc group */ 1209 xfs_agblock_t agbno; /* block number of inode in the alloc group */
1175 xfs_buf_t *agbp; /* agi buffer */
1176 xfs_agino_t agino; /* inode number within alloc group */ 1210 xfs_agino_t agino; /* inode number within alloc group */
1177 xfs_agnumber_t agno; /* allocation group number */ 1211 xfs_agnumber_t agno; /* allocation group number */
1178 int blks_per_cluster; /* num blocks per inode cluster */ 1212 int blks_per_cluster; /* num blocks per inode cluster */
1179 xfs_agblock_t chunk_agbno; /* first block in inode chunk */ 1213 xfs_agblock_t chunk_agbno; /* first block in inode chunk */
1180 xfs_agino_t chunk_agino; /* first agino in inode chunk */
1181 __int32_t chunk_cnt; /* count of free inodes in chunk */
1182 xfs_inofree_t chunk_free; /* mask of free inodes in chunk */
1183 xfs_agblock_t cluster_agbno; /* first block in inode cluster */ 1214 xfs_agblock_t cluster_agbno; /* first block in inode cluster */
1184 xfs_btree_cur_t *cur; /* inode btree cursor */
1185 int error; /* error code */ 1215 int error; /* error code */
1186 int i; /* temp state */
1187 int offset; /* index of inode in its buffer */ 1216 int offset; /* index of inode in its buffer */
1188 int offset_agbno; /* blks from chunk start to inode */ 1217 int offset_agbno; /* blks from chunk start to inode */
1189 1218
1190 ASSERT(ino != NULLFSINO); 1219 ASSERT(ino != NULLFSINO);
1220
1191 /* 1221 /*
1192 * Split up the inode number into its parts. 1222 * Split up the inode number into its parts.
1193 */ 1223 */
@@ -1198,24 +1228,24 @@ xfs_dilocate(
1198 ino != XFS_AGINO_TO_INO(mp, agno, agino)) { 1228 ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
1199#ifdef DEBUG 1229#ifdef DEBUG
1200 /* no diagnostics for bulkstat, ino comes from userspace */ 1230 /* no diagnostics for bulkstat, ino comes from userspace */
1201 if (flags & XFS_IMAP_BULKSTAT) 1231 if (flags & XFS_IGET_BULKSTAT)
1202 return XFS_ERROR(EINVAL); 1232 return XFS_ERROR(EINVAL);
1203 if (agno >= mp->m_sb.sb_agcount) { 1233 if (agno >= mp->m_sb.sb_agcount) {
1204 xfs_fs_cmn_err(CE_ALERT, mp, 1234 xfs_fs_cmn_err(CE_ALERT, mp,
1205 "xfs_dilocate: agno (%d) >= " 1235 "xfs_imap: agno (%d) >= "
1206 "mp->m_sb.sb_agcount (%d)", 1236 "mp->m_sb.sb_agcount (%d)",
1207 agno, mp->m_sb.sb_agcount); 1237 agno, mp->m_sb.sb_agcount);
1208 } 1238 }
1209 if (agbno >= mp->m_sb.sb_agblocks) { 1239 if (agbno >= mp->m_sb.sb_agblocks) {
1210 xfs_fs_cmn_err(CE_ALERT, mp, 1240 xfs_fs_cmn_err(CE_ALERT, mp,
1211 "xfs_dilocate: agbno (0x%llx) >= " 1241 "xfs_imap: agbno (0x%llx) >= "
1212 "mp->m_sb.sb_agblocks (0x%lx)", 1242 "mp->m_sb.sb_agblocks (0x%lx)",
1213 (unsigned long long) agbno, 1243 (unsigned long long) agbno,
1214 (unsigned long) mp->m_sb.sb_agblocks); 1244 (unsigned long) mp->m_sb.sb_agblocks);
1215 } 1245 }
1216 if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) { 1246 if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
1217 xfs_fs_cmn_err(CE_ALERT, mp, 1247 xfs_fs_cmn_err(CE_ALERT, mp,
1218 "xfs_dilocate: ino (0x%llx) != " 1248 "xfs_imap: ino (0x%llx) != "
1219 "XFS_AGINO_TO_INO(mp, agno, agino) " 1249 "XFS_AGINO_TO_INO(mp, agno, agino) "
1220 "(0x%llx)", 1250 "(0x%llx)",
1221 ino, XFS_AGINO_TO_INO(mp, agno, agino)); 1251 ino, XFS_AGINO_TO_INO(mp, agno, agino));
@@ -1224,65 +1254,89 @@ xfs_dilocate(
1224#endif /* DEBUG */ 1254#endif /* DEBUG */
1225 return XFS_ERROR(EINVAL); 1255 return XFS_ERROR(EINVAL);
1226 } 1256 }
1227 if ((mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) || 1257
1228 !(flags & XFS_IMAP_LOOKUP)) { 1258 /*
 1259 * If the inode cluster size is the same as or smaller than the
 1260 * blocksize, we get to the buffer by simple arithmetic.
1261 */
1262 if (XFS_INODE_CLUSTER_SIZE(mp) <= mp->m_sb.sb_blocksize) {
1229 offset = XFS_INO_TO_OFFSET(mp, ino); 1263 offset = XFS_INO_TO_OFFSET(mp, ino);
1230 ASSERT(offset < mp->m_sb.sb_inopblock); 1264 ASSERT(offset < mp->m_sb.sb_inopblock);
1231 *bno = XFS_AGB_TO_FSB(mp, agno, agbno); 1265
1232 *off = offset; 1266 imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
1233 *len = 1; 1267 imap->im_len = XFS_FSB_TO_BB(mp, 1);
1268 imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
1234 return 0; 1269 return 0;
1235 } 1270 }
1271
1236 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog; 1272 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
1237 if (*bno != NULLFSBLOCK) { 1273
1274 /*
1275 * If we get a block number passed from bulkstat we can use it to
1276 * find the buffer easily.
1277 */
1278 if (imap->im_blkno) {
1238 offset = XFS_INO_TO_OFFSET(mp, ino); 1279 offset = XFS_INO_TO_OFFSET(mp, ino);
1239 ASSERT(offset < mp->m_sb.sb_inopblock); 1280 ASSERT(offset < mp->m_sb.sb_inopblock);
1240 cluster_agbno = XFS_FSB_TO_AGBNO(mp, *bno); 1281
1241 *off = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + 1282 cluster_agbno = XFS_DADDR_TO_AGBNO(mp, imap->im_blkno);
1242 offset; 1283 offset += (agbno - cluster_agbno) * mp->m_sb.sb_inopblock;
1243 *len = blks_per_cluster; 1284
1285 imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
1286 imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
1244 return 0; 1287 return 0;
1245 } 1288 }
1289
1290 /*
 1291 * If the inode chunks are aligned then use simple arithmetic to
 1292 * find the location. Otherwise we have to do a btree
 1293 * lookup.
1294 */
1246 if (mp->m_inoalign_mask) { 1295 if (mp->m_inoalign_mask) {
1247 offset_agbno = agbno & mp->m_inoalign_mask; 1296 offset_agbno = agbno & mp->m_inoalign_mask;
1248 chunk_agbno = agbno - offset_agbno; 1297 chunk_agbno = agbno - offset_agbno;
1249 } else { 1298 } else {
1299 xfs_btree_cur_t *cur; /* inode btree cursor */
1300 xfs_agino_t chunk_agino; /* first agino in inode chunk */
1301 __int32_t chunk_cnt; /* count of free inodes in chunk */
1302 xfs_inofree_t chunk_free; /* mask of free inodes in chunk */
1303 xfs_buf_t *agbp; /* agi buffer */
1304 int i; /* temp state */
1305
1250 down_read(&mp->m_peraglock); 1306 down_read(&mp->m_peraglock);
1251 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1307 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1252 up_read(&mp->m_peraglock); 1308 up_read(&mp->m_peraglock);
1253 if (error) { 1309 if (error) {
1254#ifdef DEBUG 1310 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1255 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
1256 "xfs_ialloc_read_agi() returned " 1311 "xfs_ialloc_read_agi() returned "
1257 "error %d, agno %d", 1312 "error %d, agno %d",
1258 error, agno); 1313 error, agno);
1259#endif /* DEBUG */
1260 return error; 1314 return error;
1261 } 1315 }
1262 cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO, 1316
1263 (xfs_inode_t *)0, 0); 1317 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
1264 if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) { 1318 error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i);
1265#ifdef DEBUG 1319 if (error) {
1266 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: " 1320 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1267 "xfs_inobt_lookup_le() failed"); 1321 "xfs_inobt_lookup_le() failed");
1268#endif /* DEBUG */
1269 goto error0; 1322 goto error0;
1270 } 1323 }
1271 if ((error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt, 1324
1272 &chunk_free, &i))) { 1325 error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt,
1273#ifdef DEBUG 1326 &chunk_free, &i);
1274 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: " 1327 if (error) {
1328 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1275 "xfs_inobt_get_rec() failed"); 1329 "xfs_inobt_get_rec() failed");
1276#endif /* DEBUG */
1277 goto error0; 1330 goto error0;
1278 } 1331 }
1279 if (i == 0) { 1332 if (i == 0) {
1280#ifdef DEBUG 1333#ifdef DEBUG
1281 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: " 1334 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1282 "xfs_inobt_get_rec() failed"); 1335 "xfs_inobt_get_rec() failed");
1283#endif /* DEBUG */ 1336#endif /* DEBUG */
1284 error = XFS_ERROR(EINVAL); 1337 error = XFS_ERROR(EINVAL);
1285 } 1338 }
1339 error0:
1286 xfs_trans_brelse(tp, agbp); 1340 xfs_trans_brelse(tp, agbp);
1287 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1341 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1288 if (error) 1342 if (error)
@@ -1290,19 +1344,35 @@ xfs_dilocate(
1290 chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino); 1344 chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino);
1291 offset_agbno = agbno - chunk_agbno; 1345 offset_agbno = agbno - chunk_agbno;
1292 } 1346 }
1347
1293 ASSERT(agbno >= chunk_agbno); 1348 ASSERT(agbno >= chunk_agbno);
1294 cluster_agbno = chunk_agbno + 1349 cluster_agbno = chunk_agbno +
1295 ((offset_agbno / blks_per_cluster) * blks_per_cluster); 1350 ((offset_agbno / blks_per_cluster) * blks_per_cluster);
1296 offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + 1351 offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
1297 XFS_INO_TO_OFFSET(mp, ino); 1352 XFS_INO_TO_OFFSET(mp, ino);
1298 *bno = XFS_AGB_TO_FSB(mp, agno, cluster_agbno); 1353
1299 *off = offset; 1354 imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
1300 *len = blks_per_cluster; 1355 imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
1356 imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
1357
1358 /*
1359 * If the inode number maps to a block outside the bounds
 1360 * of the file system then return an error rather than calling
 1361 * read_buf and panicking when we get an error from the
1362 * driver.
1363 */
1364 if ((imap->im_blkno + imap->im_len) >
1365 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
1366 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1367 "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > "
1368 " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)",
1369 (unsigned long long) imap->im_blkno,
1370 (unsigned long long) imap->im_len,
1371 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
1372 return XFS_ERROR(EINVAL);
1373 }
1374
1301 return 0; 1375 return 0;
1302error0:
1303 xfs_trans_brelse(tp, agbp);
1304 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
1305 return error;
1306} 1376}
1307 1377
1308/* 1378/*
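The cluster arithmetic in xfs_imap() above reduces to shifts and integer division once the geometry is known. A worked standalone example with made-up but plausible numbers (4k blocks, 256-byte inodes, 8k inode clusters; the agbno/chunk values are invented for illustration):

#include <stdio.h>

int main(void)
{
	int sb_blocklog = 12, sb_inodelog = 8;
	int sb_inopblock = 1 << (sb_blocklog - sb_inodelog);	/* 16 inodes/block */
	int blks_per_cluster = 8192 >> sb_blocklog;		/* 2 blocks/cluster */

	int agbno = 103, chunk_agbno = 100;	/* as found via alignment/btree */
	int offset_agbno = agbno - chunk_agbno;			/* 3 */
	int cluster_agbno = chunk_agbno +
		(offset_agbno / blks_per_cluster) * blks_per_cluster;	/* 102 */
	int ino_in_blk = 5;			/* what XFS_INO_TO_OFFSET() yields */
	int offset = (agbno - cluster_agbno) * sb_inopblock + ino_in_blk;

	/* im_boffset: byte offset of the inode within the cluster buffer */
	printf("boffset = %d\n", offset << sb_inodelog);	/* 21 * 256 = 5376 */
	return 0;
}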
@@ -1370,70 +1440,95 @@ xfs_ialloc_log_agi(
1370 xfs_trans_log_buf(tp, bp, first, last); 1440 xfs_trans_log_buf(tp, bp, first, last);
1371} 1441}
1372 1442
1443#ifdef DEBUG
1444STATIC void
1445xfs_check_agi_unlinked(
1446 struct xfs_agi *agi)
1447{
1448 int i;
1449
1450 for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
1451 ASSERT(agi->agi_unlinked[i]);
1452}
1453#else
1454#define xfs_check_agi_unlinked(agi)
1455#endif
1456
1373/* 1457/*
1374 * Read in the allocation group header (inode allocation section) 1458 * Read in the allocation group header (inode allocation section)
1375 */ 1459 */
1376int 1460int
1377xfs_ialloc_read_agi( 1461xfs_read_agi(
1378 xfs_mount_t *mp, /* file system mount structure */ 1462 struct xfs_mount *mp, /* file system mount structure */
1379 xfs_trans_t *tp, /* transaction pointer */ 1463 struct xfs_trans *tp, /* transaction pointer */
1380 xfs_agnumber_t agno, /* allocation group number */ 1464 xfs_agnumber_t agno, /* allocation group number */
1381 xfs_buf_t **bpp) /* allocation group hdr buf */ 1465 struct xfs_buf **bpp) /* allocation group hdr buf */
1382{ 1466{
1383 xfs_agi_t *agi; /* allocation group header */ 1467 struct xfs_agi *agi; /* allocation group header */
1384 int agi_ok; /* agi is consistent */ 1468 int agi_ok; /* agi is consistent */
1385 xfs_buf_t *bp; /* allocation group hdr buf */ 1469 int error;
1386 xfs_perag_t *pag; /* per allocation group data */
1387 int error;
1388 1470
1389 ASSERT(agno != NULLAGNUMBER); 1471 ASSERT(agno != NULLAGNUMBER);
1390 error = xfs_trans_read_buf( 1472
1391 mp, tp, mp->m_ddev_targp, 1473 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
1392 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), 1474 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
1393 XFS_FSS_TO_BB(mp, 1), 0, &bp); 1475 XFS_FSS_TO_BB(mp, 1), 0, bpp);
1394 if (error) 1476 if (error)
1395 return error; 1477 return error;
1396 ASSERT(bp && !XFS_BUF_GETERROR(bp)); 1478
1479 ASSERT(*bpp && !XFS_BUF_GETERROR(*bpp));
1480 agi = XFS_BUF_TO_AGI(*bpp);
1397 1481
1398 /* 1482 /*
1399 * Validate the magic number of the agi block. 1483 * Validate the magic number of the agi block.
1400 */ 1484 */
1401 agi = XFS_BUF_TO_AGI(bp); 1485 agi_ok = be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
1402 agi_ok = 1486 XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) &&
1403 be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC && 1487 be32_to_cpu(agi->agi_seqno) == agno;
1404 XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
1405 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, 1488 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
1406 XFS_RANDOM_IALLOC_READ_AGI))) { 1489 XFS_RANDOM_IALLOC_READ_AGI))) {
1407 XFS_CORRUPTION_ERROR("xfs_ialloc_read_agi", XFS_ERRLEVEL_LOW, 1490 XFS_CORRUPTION_ERROR("xfs_read_agi", XFS_ERRLEVEL_LOW,
1408 mp, agi); 1491 mp, agi);
1409 xfs_trans_brelse(tp, bp); 1492 xfs_trans_brelse(tp, *bpp);
1410 return XFS_ERROR(EFSCORRUPTED); 1493 return XFS_ERROR(EFSCORRUPTED);
1411 } 1494 }
1495
1496 XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGI, XFS_AGI_REF);
1497
1498 xfs_check_agi_unlinked(agi);
1499 return 0;
1500}
1501
1502int
1503xfs_ialloc_read_agi(
1504 struct xfs_mount *mp, /* file system mount structure */
1505 struct xfs_trans *tp, /* transaction pointer */
1506 xfs_agnumber_t agno, /* allocation group number */
1507 struct xfs_buf **bpp) /* allocation group hdr buf */
1508{
1509 struct xfs_agi *agi; /* allocation group header */
1510 struct xfs_perag *pag; /* per allocation group data */
1511 int error;
1512
1513 error = xfs_read_agi(mp, tp, agno, bpp);
1514 if (error)
1515 return error;
1516
1517 agi = XFS_BUF_TO_AGI(*bpp);
1412 pag = &mp->m_perag[agno]; 1518 pag = &mp->m_perag[agno];
1519
1413 if (!pag->pagi_init) { 1520 if (!pag->pagi_init) {
1414 pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); 1521 pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
1415 pag->pagi_count = be32_to_cpu(agi->agi_count); 1522 pag->pagi_count = be32_to_cpu(agi->agi_count);
1416 pag->pagi_init = 1; 1523 pag->pagi_init = 1;
1417 } else {
1418 /*
1419 * It's possible for these to be out of sync if
1420 * we are in the middle of a forced shutdown.
1421 */
1422 ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
1423 XFS_FORCED_SHUTDOWN(mp));
1424 } 1524 }
1425 1525
1426#ifdef DEBUG 1526 /*
1427 { 1527 * It's possible for these to be out of sync if
1428 int i; 1528 * we are in the middle of a forced shutdown.
1429 1529 */
1430 for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) 1530 ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
1431 ASSERT(agi->agi_unlinked[i]); 1531 XFS_FORCED_SHUTDOWN(mp));
1432 }
1433#endif
1434
1435 XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGI, XFS_AGI_REF);
1436 *bpp = bp;
1437 return 0; 1532 return 0;
1438} 1533}
1439 1534
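The split above separates reading and validating the AGI from seeding the per-AG cache, so other callers can reuse the validated read alone. A mocked sketch of that layering (toy structures, not the on-disk format):

#include <stdint.h>
#include <stdio.h>

#define AGI_MAGIC 0x58414749u	/* 'XAGI' */

struct agi { uint32_t magic, seqno, freecount; };
struct pag { int init; uint32_t freecount; };

static int read_agi(const struct agi *agi, uint32_t agno)
{
	if (agi->magic != AGI_MAGIC || agi->seqno != agno)
		return -1;	/* the kernel returns EFSCORRUPTED here */
	return 0;
}

static int ialloc_read_agi(const struct agi *agi, uint32_t agno,
			   struct pag *pag)
{
	int error = read_agi(agi, agno);	/* validate first */

	if (error)
		return error;
	if (!pag->init) {			/* then seed the per-AG cache */
		pag->freecount = agi->freecount;
		pag->init = 1;
	}
	return 0;
}

int main(void)
{
	struct agi agi = { AGI_MAGIC, 0, 5 };
	struct pag pag = { 0, 0 };

	printf("%d %u\n", ialloc_read_agi(&agi, 0, &pag), pag.freecount);
	return 0;
}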
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 4e30ec1d13bc..50f558a4e0a8 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -20,6 +20,7 @@
20 20
21struct xfs_buf; 21struct xfs_buf;
22struct xfs_dinode; 22struct xfs_dinode;
23struct xfs_imap;
23struct xfs_mount; 24struct xfs_mount;
24struct xfs_trans; 25struct xfs_trans;
25 26
@@ -56,7 +57,6 @@ static inline int xfs_ialloc_find_free(xfs_inofree_t *fp)
56} 57}
57 58
58 59
59#ifdef __KERNEL__
60/* 60/*
61 * Allocate an inode on disk. 61 * Allocate an inode on disk.
62 * Mode is used to tell whether the new inode will need space, and whether 62 * Mode is used to tell whether the new inode will need space, and whether
@@ -105,17 +105,14 @@ xfs_difree(
105 xfs_ino_t *first_ino); /* first inode in deleted cluster */ 105 xfs_ino_t *first_ino); /* first inode in deleted cluster */
106 106
107/* 107/*
108 * Return the location of the inode in bno/len/off, 108 * Return the location of the inode in imap, for mapping it into a buffer.
109 * for mapping it into a buffer.
110 */ 109 */
111int 110int
112xfs_dilocate( 111xfs_imap(
113 struct xfs_mount *mp, /* file system mount structure */ 112 struct xfs_mount *mp, /* file system mount structure */
114 struct xfs_trans *tp, /* transaction pointer */ 113 struct xfs_trans *tp, /* transaction pointer */
115 xfs_ino_t ino, /* inode to locate */ 114 xfs_ino_t ino, /* inode to locate */
116 xfs_fsblock_t *bno, /* output: block containing inode */ 115 struct xfs_imap *imap, /* location map structure */
117 int *len, /* output: num blocks in cluster*/
118 int *off, /* output: index in block of inode */
119 uint flags); /* flags for inode btree lookup */ 116 uint flags); /* flags for inode btree lookup */
120 117
121/* 118/*
@@ -154,6 +151,24 @@ xfs_ialloc_pagi_init(
154 struct xfs_trans *tp, /* transaction pointer */ 151 struct xfs_trans *tp, /* transaction pointer */
155 xfs_agnumber_t agno); /* allocation group number */ 152 xfs_agnumber_t agno); /* allocation group number */
156 153
157#endif /* __KERNEL__ */ 154/*
155 * Lookup the first record greater than or equal to ino
156 * in the btree given by cur.
157 */
158int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino,
159 __int32_t fcnt, xfs_inofree_t free, int *stat);
160
161/*
162 * Lookup the first record less than or equal to ino
163 * in the btree given by cur.
164 */
165int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino,
166 __int32_t fcnt, xfs_inofree_t free, int *stat);
167
168/*
169 * Get the data from the pointed-to record.
170 */
171extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino,
172 __int32_t *fcnt, xfs_inofree_t *free, int *stat);
158 173
159#endif /* __XFS_IALLOC_H__ */ 174#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 83502f3edef0..99f2408e8d8e 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -35,2044 +35,349 @@
35#include "xfs_dinode.h" 35#include "xfs_dinode.h"
36#include "xfs_inode.h" 36#include "xfs_inode.h"
37#include "xfs_btree.h" 37#include "xfs_btree.h"
38#include "xfs_btree_trace.h"
38#include "xfs_ialloc.h" 39#include "xfs_ialloc.h"
39#include "xfs_alloc.h" 40#include "xfs_alloc.h"
40#include "xfs_error.h" 41#include "xfs_error.h"
41 42
42STATIC void xfs_inobt_log_block(xfs_trans_t *, xfs_buf_t *, int);
43STATIC void xfs_inobt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
44STATIC void xfs_inobt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
45STATIC void xfs_inobt_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
46STATIC int xfs_inobt_lshift(xfs_btree_cur_t *, int, int *);
47STATIC int xfs_inobt_newroot(xfs_btree_cur_t *, int *);
48STATIC int xfs_inobt_rshift(xfs_btree_cur_t *, int, int *);
49STATIC int xfs_inobt_split(xfs_btree_cur_t *, int, xfs_agblock_t *,
50 xfs_inobt_key_t *, xfs_btree_cur_t **, int *);
51STATIC int xfs_inobt_updkey(xfs_btree_cur_t *, xfs_inobt_key_t *, int);
52 43
53/* 44STATIC int
54 * Single level of the xfs_inobt_delete record deletion routine. 45xfs_inobt_get_minrecs(
55 * Delete record pointed to by cur/level. 46 struct xfs_btree_cur *cur,
56 * Remove the record from its block then rebalance the tree. 47 int level)
57 * Return 0 for error, 1 for done, 2 to go on to the next level.
58 */
59STATIC int /* error */
60xfs_inobt_delrec(
61 xfs_btree_cur_t *cur, /* btree cursor */
62 int level, /* level removing record from */
63 int *stat) /* fail/done/go-on */
64{ 48{
65 xfs_buf_t *agbp; /* buffer for a.g. inode header */ 49 return cur->bc_mp->m_inobt_mnr[level != 0];
66 xfs_mount_t *mp; /* mount structure */ 50}
67 xfs_agi_t *agi; /* allocation group inode header */
68 xfs_inobt_block_t *block; /* btree block record/key lives in */
69 xfs_agblock_t bno; /* btree block number */
70 xfs_buf_t *bp; /* buffer for block */
71 int error; /* error return value */
72 int i; /* loop index */
73 xfs_inobt_key_t key; /* kp points here if block is level 0 */
74 xfs_inobt_key_t *kp = NULL; /* pointer to btree keys */
75 xfs_agblock_t lbno; /* left block's block number */
76 xfs_buf_t *lbp; /* left block's buffer pointer */
77 xfs_inobt_block_t *left; /* left btree block */
78 xfs_inobt_key_t *lkp; /* left block key pointer */
79 xfs_inobt_ptr_t *lpp; /* left block address pointer */
80 int lrecs = 0; /* number of records in left block */
81 xfs_inobt_rec_t *lrp; /* left block record pointer */
82 xfs_inobt_ptr_t *pp = NULL; /* pointer to btree addresses */
83 int ptr; /* index in btree block for this rec */
84 xfs_agblock_t rbno; /* right block's block number */
85 xfs_buf_t *rbp; /* right block's buffer pointer */
86 xfs_inobt_block_t *right; /* right btree block */
87 xfs_inobt_key_t *rkp; /* right block key pointer */
88 xfs_inobt_rec_t *rp; /* pointer to btree records */
89 xfs_inobt_ptr_t *rpp; /* right block address pointer */
90 int rrecs = 0; /* number of records in right block */
91 int numrecs;
92 xfs_inobt_rec_t *rrp; /* right block record pointer */
93 xfs_btree_cur_t *tcur; /* temporary btree cursor */
94
95 mp = cur->bc_mp;
96
97 /*
98 * Get the index of the entry being deleted, check for nothing there.
99 */
100 ptr = cur->bc_ptrs[level];
101 if (ptr == 0) {
102 *stat = 0;
103 return 0;
104 }
105
106 /*
107 * Get the buffer & block containing the record or key/ptr.
108 */
109 bp = cur->bc_bufs[level];
110 block = XFS_BUF_TO_INOBT_BLOCK(bp);
111#ifdef DEBUG
112 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
113 return error;
114#endif
115 /*
116 * Fail if we're off the end of the block.
117 */
118 51
119 numrecs = be16_to_cpu(block->bb_numrecs); 52STATIC struct xfs_btree_cur *
120 if (ptr > numrecs) { 53xfs_inobt_dup_cursor(
121 *stat = 0; 54 struct xfs_btree_cur *cur)
122 return 0; 55{
123 } 56 return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp,
124 /* 57 cur->bc_private.a.agbp, cur->bc_private.a.agno);
125 * It's a nonleaf. Excise the key and ptr being deleted, by 58}
126 * sliding the entries past them down one.
127 * Log the changed areas of the block.
128 */
129 if (level > 0) {
130 kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
131 pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
132#ifdef DEBUG
133 for (i = ptr; i < numrecs; i++) {
134 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i]), level)))
135 return error;
136 }
137#endif
138 if (ptr < numrecs) {
139 memmove(&kp[ptr - 1], &kp[ptr],
140 (numrecs - ptr) * sizeof(*kp));
141 memmove(&pp[ptr - 1], &pp[ptr],
142 (numrecs - ptr) * sizeof(*kp));
143 xfs_inobt_log_keys(cur, bp, ptr, numrecs - 1);
144 xfs_inobt_log_ptrs(cur, bp, ptr, numrecs - 1);
145 }
146 }
147 /*
148 * It's a leaf. Excise the record being deleted, by sliding the
149 * entries past it down one. Log the changed areas of the block.
150 */
151 else {
152 rp = XFS_INOBT_REC_ADDR(block, 1, cur);
153 if (ptr < numrecs) {
154 memmove(&rp[ptr - 1], &rp[ptr],
155 (numrecs - ptr) * sizeof(*rp));
156 xfs_inobt_log_recs(cur, bp, ptr, numrecs - 1);
157 }
158 /*
159 * If it's the first record in the block, we'll need a key
160 * structure to pass up to the next level (updkey).
161 */
162 if (ptr == 1) {
163 key.ir_startino = rp->ir_startino;
164 kp = &key;
165 }
166 }
167 /*
168 * Decrement and log the number of entries in the block.
169 */
170 numrecs--;
171 block->bb_numrecs = cpu_to_be16(numrecs);
172 xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
173 /*
174 * Is this the root level? If so, we're almost done.
175 */
176 if (level == cur->bc_nlevels - 1) {
177 /*
178 * If this is the root level,
179 * and there's only one entry left,
180 * and it's NOT the leaf level,
181 * then we can get rid of this level.
182 */
183 if (numrecs == 1 && level > 0) {
184 agbp = cur->bc_private.a.agbp;
185 agi = XFS_BUF_TO_AGI(agbp);
186 /*
187 * pp is still set to the first pointer in the block.
188 * Make it the new root of the btree.
189 */
190 bno = be32_to_cpu(agi->agi_root);
191 agi->agi_root = *pp;
192 be32_add_cpu(&agi->agi_level, -1);
193 /*
194 * Free the block.
195 */
196 if ((error = xfs_free_extent(cur->bc_tp,
197 XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, bno), 1)))
198 return error;
199 xfs_trans_binval(cur->bc_tp, bp);
200 xfs_ialloc_log_agi(cur->bc_tp, agbp,
201 XFS_AGI_ROOT | XFS_AGI_LEVEL);
202 /*
203 * Update the cursor so there's one fewer level.
204 */
205 cur->bc_bufs[level] = NULL;
206 cur->bc_nlevels--;
207 } else if (level > 0 &&
208 (error = xfs_inobt_decrement(cur, level, &i)))
209 return error;
210 *stat = 1;
211 return 0;
212 }
213 /*
214 * If we deleted the leftmost entry in the block, update the
215 * key values above us in the tree.
216 */
217 if (ptr == 1 && (error = xfs_inobt_updkey(cur, kp, level + 1)))
218 return error;
219 /*
220 * If the number of records remaining in the block is at least
221 * the minimum, we're done.
222 */
223 if (numrecs >= XFS_INOBT_BLOCK_MINRECS(level, cur)) {
224 if (level > 0 &&
225 (error = xfs_inobt_decrement(cur, level, &i)))
226 return error;
227 *stat = 1;
228 return 0;
229 }
230 /*
231 * Otherwise, we have to move some records around to keep the
232 * tree balanced. Look at the left and right sibling blocks to
233 * see if we can re-balance by moving only one record.
234 */
235 rbno = be32_to_cpu(block->bb_rightsib);
236 lbno = be32_to_cpu(block->bb_leftsib);
237 bno = NULLAGBLOCK;
238 ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK);
239 /*
240 * Duplicate the cursor so our btree manipulations here won't
241 * disrupt the next level up.
242 */
243 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
244 return error;
245 /*
246 * If there's a right sibling, see if it's ok to shift an entry
247 * out of it.
248 */
249 if (rbno != NULLAGBLOCK) {
250 /*
251 * Move the temp cursor to the last entry in the next block.
252 * Actually any entry but the first would suffice.
253 */
254 i = xfs_btree_lastrec(tcur, level);
255 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
256 if ((error = xfs_inobt_increment(tcur, level, &i)))
257 goto error0;
258 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
259 i = xfs_btree_lastrec(tcur, level);
260 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
261 /*
262 * Grab a pointer to the block.
263 */
264 rbp = tcur->bc_bufs[level];
265 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
266#ifdef DEBUG
267 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
268 goto error0;
269#endif
270 /*
271 * Grab the current block number, for future use.
272 */
273 bno = be32_to_cpu(right->bb_leftsib);
274 /*
275 * If right block is full enough so that removing one entry
276 * won't make it too empty, and left-shifting an entry out
277 * of right to us works, we're done.
278 */
279 if (be16_to_cpu(right->bb_numrecs) - 1 >=
280 XFS_INOBT_BLOCK_MINRECS(level, cur)) {
281 if ((error = xfs_inobt_lshift(tcur, level, &i)))
282 goto error0;
283 if (i) {
284 ASSERT(be16_to_cpu(block->bb_numrecs) >=
285 XFS_INOBT_BLOCK_MINRECS(level, cur));
286 xfs_btree_del_cursor(tcur,
287 XFS_BTREE_NOERROR);
288 if (level > 0 &&
289 (error = xfs_inobt_decrement(cur, level,
290 &i)))
291 return error;
292 *stat = 1;
293 return 0;
294 }
295 }
296 /*
297 * Otherwise, grab the number of records in right for
298 * future reference, and fix up the temp cursor to point
299 * to our block again (last record).
300 */
301 rrecs = be16_to_cpu(right->bb_numrecs);
302 if (lbno != NULLAGBLOCK) {
303 xfs_btree_firstrec(tcur, level);
304 if ((error = xfs_inobt_decrement(tcur, level, &i)))
305 goto error0;
306 }
307 }
308 /*
309 * If there's a left sibling, see if it's ok to shift an entry
310 * out of it.
311 */
312 if (lbno != NULLAGBLOCK) {
313 /*
314 * Move the temp cursor to the first entry in the
315 * previous block.
316 */
317 xfs_btree_firstrec(tcur, level);
318 if ((error = xfs_inobt_decrement(tcur, level, &i)))
319 goto error0;
320 xfs_btree_firstrec(tcur, level);
321 /*
322 * Grab a pointer to the block.
323 */
324 lbp = tcur->bc_bufs[level];
325 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
326#ifdef DEBUG
327 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
328 goto error0;
329#endif
330 /*
331 * Grab the current block number, for future use.
332 */
333 bno = be32_to_cpu(left->bb_rightsib);
334 /*
335 * If left block is full enough so that removing one entry
336 * won't make it too empty, and right-shifting an entry out
337 * of left to us works, we're done.
338 */
339 if (be16_to_cpu(left->bb_numrecs) - 1 >=
340 XFS_INOBT_BLOCK_MINRECS(level, cur)) {
341 if ((error = xfs_inobt_rshift(tcur, level, &i)))
342 goto error0;
343 if (i) {
344 ASSERT(be16_to_cpu(block->bb_numrecs) >=
345 XFS_INOBT_BLOCK_MINRECS(level, cur));
346 xfs_btree_del_cursor(tcur,
347 XFS_BTREE_NOERROR);
348 if (level == 0)
349 cur->bc_ptrs[0]++;
350 *stat = 1;
351 return 0;
352 }
353 }
354 /*
355	 * Otherwise, grab the number of records in left for
356 * future reference.
357 */
358 lrecs = be16_to_cpu(left->bb_numrecs);
359 }
360 /*
361 * Delete the temp cursor, we're done with it.
362 */
363 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
364 /*
365 * If here, we need to do a join to keep the tree balanced.
366 */
367 ASSERT(bno != NULLAGBLOCK);
368 /*
369 * See if we can join with the left neighbor block.
370 */
371 if (lbno != NULLAGBLOCK &&
372 lrecs + numrecs <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
373 /*
374 * Set "right" to be the starting block,
375 * "left" to be the left neighbor.
376 */
377 rbno = bno;
378 right = block;
379 rrecs = be16_to_cpu(right->bb_numrecs);
380 rbp = bp;
381 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
382 cur->bc_private.a.agno, lbno, 0, &lbp,
383 XFS_INO_BTREE_REF)))
384 return error;
385 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
386 lrecs = be16_to_cpu(left->bb_numrecs);
387 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
388 return error;
389 }
390 /*
391 * If that won't work, see if we can join with the right neighbor block.
392 */
393 else if (rbno != NULLAGBLOCK &&
394 rrecs + numrecs <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
395 /*
396 * Set "left" to be the starting block,
397 * "right" to be the right neighbor.
398 */
399 lbno = bno;
400 left = block;
401 lrecs = be16_to_cpu(left->bb_numrecs);
402 lbp = bp;
403 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
404 cur->bc_private.a.agno, rbno, 0, &rbp,
405 XFS_INO_BTREE_REF)))
406 return error;
407 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
408 rrecs = be16_to_cpu(right->bb_numrecs);
409 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
410 return error;
411 }
412 /*
413 * Otherwise, we can't fix the imbalance.
414 * Just return. This is probably a logic error, but it's not fatal.
415 */
416 else {
417 if (level > 0 && (error = xfs_inobt_decrement(cur, level, &i)))
418 return error;
419 *stat = 1;
420 return 0;
421 }
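/*
 * Editor's sketch (not from the patch): the rebalancing decision made
 * above, reduced to arithmetic. minrecs/maxrecs stand in for
 * XFS_INOBT_BLOCK_MINRECS/MAXRECS; all names are hypothetical. Note
 * the preference order: borrowing one record is cheaper than merging,
 * and merging is only legal when the combined records fit one block.
 */
enum fixup { BORROW_RIGHT, BORROW_LEFT, MERGE_LEFT, MERGE_RIGHT, GIVE_UP };

static enum fixup
pick_fixup(int numrecs, int lrecs, int rrecs, int minrecs, int maxrecs,
	   int have_left, int have_right)
{
	if (have_right && rrecs - 1 >= minrecs)
		return BORROW_RIGHT;		/* lshift from the right sibling */
	if (have_left && lrecs - 1 >= minrecs)
		return BORROW_LEFT;		/* rshift from the left sibling */
	if (have_left && lrecs + numrecs <= maxrecs)
		return MERGE_LEFT;
	if (have_right && rrecs + numrecs <= maxrecs)
		return MERGE_RIGHT;
	return GIVE_UP;				/* "probably a logic error" */
}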
422 /*
423 * We're now going to join "left" and "right" by moving all the stuff
424 * in "right" to "left" and deleting "right".
425 */
426 if (level > 0) {
427 /*
428 * It's a non-leaf. Move keys and pointers.
429 */
430 lkp = XFS_INOBT_KEY_ADDR(left, lrecs + 1, cur);
431 lpp = XFS_INOBT_PTR_ADDR(left, lrecs + 1, cur);
432 rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
433 rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
434#ifdef DEBUG
435 for (i = 0; i < rrecs; i++) {
436 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
437 return error;
438 }
439#endif
440 memcpy(lkp, rkp, rrecs * sizeof(*lkp));
441 memcpy(lpp, rpp, rrecs * sizeof(*lpp));
442 xfs_inobt_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
443 xfs_inobt_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
444 } else {
445 /*
446 * It's a leaf. Move records.
447 */
448 lrp = XFS_INOBT_REC_ADDR(left, lrecs + 1, cur);
449 rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
450 memcpy(lrp, rrp, rrecs * sizeof(*lrp));
451 xfs_inobt_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
452 }
453 /*
454 * If we joined with the left neighbor, set the buffer in the
455 * cursor to the left block, and fix up the index.
456 */
457 if (bp != lbp) {
458 xfs_btree_setbuf(cur, level, lbp);
459 cur->bc_ptrs[level] += lrecs;
460 }
461 /*
462 * If we joined with the right neighbor and there's a level above
463 * us, increment the cursor at that level.
464 */
465 else if (level + 1 < cur->bc_nlevels &&
466		 (error = xfs_inobt_increment(cur, level + 1, &i)))
467 return error;
468 /*
469 * Fix up the number of records in the surviving block.
470 */
471 lrecs += rrecs;
472 left->bb_numrecs = cpu_to_be16(lrecs);
473 /*
474 * Fix up the right block pointer in the surviving block, and log it.
475 */
476 left->bb_rightsib = right->bb_rightsib;
477 xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
478 /*
479 * If there is a right sibling now, make it point to the
480 * remaining block.
481 */
482 if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) {
483 xfs_inobt_block_t *rrblock;
484 xfs_buf_t *rrbp;
485		59
486		if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,	60	STATIC void
487			cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0,	61	xfs_inobt_set_root(
488			&rrbp, XFS_INO_BTREE_REF)))	62		struct xfs_btree_cur *cur,
489			return error;	63		union xfs_btree_ptr *nptr,
490		rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp);	64		int inc)	/* level change */
491		if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))	65	{
492			return error;	66		struct xfs_buf *agbp = cur->bc_private.a.agbp;
493		rrblock->bb_leftsib = cpu_to_be32(lbno);	67		struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
494 xfs_inobt_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
495 }
496 /*
497	 * Free the deleted block.
498 */
499 if ((error = xfs_free_extent(cur->bc_tp, XFS_AGB_TO_FSB(mp,
500 cur->bc_private.a.agno, rbno), 1)))
501 return error;
502 xfs_trans_binval(cur->bc_tp, rbp);
503 /*
504 * Readjust the ptr at this level if it's not a leaf, since it's
505 * still pointing at the deletion point, which makes the cursor
506 * inconsistent. If this makes the ptr 0, the caller fixes it up.
507 * We can't use decrement because it would change the next level up.
508 */
509 if (level > 0)
510 cur->bc_ptrs[level]--;
511 /*
512 * Return value means the next level up has something to do.
513 */
514 *stat = 2;
515 return 0;
516 68
517	error0:	69		agi->agi_root = nptr->s;
518		xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);	70		be32_add_cpu(&agi->agi_level, inc);
519		return error;	71		xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL);
520	}	72	}
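/*
 * Editor's note (not from the patch): the right-hand column above is
 * the first of the small per-btree callbacks that this commit trades
 * the open-coded routines on the left for. The shape of that refactor,
 * as a toy: one generic engine calling through a per-tree table of
 * function pointers. Every name below is hypothetical.
 */
struct toy_ops {
	void (*set_root)(void *tree, int newroot, int inc);
	int  (*alloc_block)(void *tree, int *bno);
};

static int				/* 0 on success, else error */
toy_grow_root(void *tree, const struct toy_ops *ops)
{
	int	bno, error;

	error = ops->alloc_block(tree, &bno);	/* tree-specific policy */
	if (error)
		return error;
	ops->set_root(tree, bno, 1);		/* tree grows one level */
	return 0;
}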
521 73
522	/*	74	STATIC int
523	 * Insert one record/level. Return information to the caller	75	xfs_inobt_alloc_block(
524	 * allowing the next level up to proceed if necessary.	76		struct xfs_btree_cur *cur,
525	 */	77		union xfs_btree_ptr *start,
526	STATIC int /* error */	78		union xfs_btree_ptr *new,
527	xfs_inobt_insrec(	79		int length,
528		xfs_btree_cur_t *cur,	/* btree cursor */	80		int *stat)
529		int level,	/* level to insert record at */
530		xfs_agblock_t *bnop,	/* i/o: block number inserted */
531		xfs_inobt_rec_t *recp,	/* i/o: record data inserted */
532		xfs_btree_cur_t **curp,	/* output: new cursor replacing cur */
533		int *stat)	/* success/failure */
534	{	81	{
535 xfs_inobt_block_t *block; /* btree block record/key lives in */ 82 xfs_alloc_arg_t args; /* block allocation args */
536 xfs_buf_t *bp; /* buffer for block */ 83 int error; /* error return value */
537 int error; /* error return value */ 84 xfs_agblock_t sbno = be32_to_cpu(start->s);
538 int i; /* loop index */
539 xfs_inobt_key_t key; /* key value being inserted */
540 xfs_inobt_key_t *kp=NULL; /* pointer to btree keys */
541 xfs_agblock_t nbno; /* block number of allocated block */
542 xfs_btree_cur_t *ncur; /* new cursor to be used at next lvl */
543 xfs_inobt_key_t nkey; /* new key value, from split */
544 xfs_inobt_rec_t nrec; /* new record value, for caller */
545 int numrecs;
546 int optr; /* old ptr value */
547 xfs_inobt_ptr_t *pp; /* pointer to btree addresses */
548 int ptr; /* index in btree block for this rec */
549 xfs_inobt_rec_t *rp=NULL; /* pointer to btree records */
550 85
551 /* 86 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
552 * GCC doesn't understand the (arguably complex) control flow in
553 * this function and complains about uninitialized structure fields
554 * without this.
555 */
556 memset(&nrec, 0, sizeof(nrec));
557 87
558 /* 88 memset(&args, 0, sizeof(args));
559 * If we made it to the root level, allocate a new root block 89 args.tp = cur->bc_tp;
560 * and we're done. 90 args.mp = cur->bc_mp;
561 */ 91 args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno);
562 if (level >= cur->bc_nlevels) { 92 args.minlen = 1;
563 error = xfs_inobt_newroot(cur, &i); 93 args.maxlen = 1;
564 *bnop = NULLAGBLOCK; 94 args.prod = 1;
565 *stat = i; 95 args.type = XFS_ALLOCTYPE_NEAR_BNO;
96
97 error = xfs_alloc_vextent(&args);
98 if (error) {
99 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
566 return error; 100 return error;
567 } 101 }
568 /* 102 if (args.fsbno == NULLFSBLOCK) {
569 * Make a key out of the record data to be inserted, and save it. 103 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
570 */
571 key.ir_startino = recp->ir_startino;
572 optr = ptr = cur->bc_ptrs[level];
573 /*
574 * If we're off the left edge, return failure.
575 */
576 if (ptr == 0) {
577 *stat = 0; 104 *stat = 0;
578 return 0; 105 return 0;
579 } 106 }
580 /* 107 ASSERT(args.len == 1);
581 * Get pointers to the btree buffer and block. 108 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
582 */ 109
583 bp = cur->bc_bufs[level]; 110 new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno));
584 block = XFS_BUF_TO_INOBT_BLOCK(bp);
585 numrecs = be16_to_cpu(block->bb_numrecs);
586#ifdef DEBUG
587 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
588 return error;
589 /*
590 * Check that the new entry is being inserted in the right place.
591 */
592 if (ptr <= numrecs) {
593 if (level == 0) {
594 rp = XFS_INOBT_REC_ADDR(block, ptr, cur);
595 xfs_btree_check_rec(cur->bc_btnum, recp, rp);
596 } else {
597 kp = XFS_INOBT_KEY_ADDR(block, ptr, cur);
598 xfs_btree_check_key(cur->bc_btnum, &key, kp);
599 }
600 }
601#endif
602 nbno = NULLAGBLOCK;
603 ncur = NULL;
604 /*
605 * If the block is full, we can't insert the new entry until we
606 * make the block un-full.
607 */
608 if (numrecs == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
609 /*
610 * First, try shifting an entry to the right neighbor.
611 */
612 if ((error = xfs_inobt_rshift(cur, level, &i)))
613 return error;
614 if (i) {
615 /* nothing */
616 }
617 /*
618 * Next, try shifting an entry to the left neighbor.
619 */
620 else {
621 if ((error = xfs_inobt_lshift(cur, level, &i)))
622 return error;
623 if (i) {
624 optr = ptr = cur->bc_ptrs[level];
625 } else {
626 /*
627 * Next, try splitting the current block
628 * in half. If this works we have to
629 * re-set our variables because
630 * we could be in a different block now.
631 */
632 if ((error = xfs_inobt_split(cur, level, &nbno,
633 &nkey, &ncur, &i)))
634 return error;
635 if (i) {
636 bp = cur->bc_bufs[level];
637 block = XFS_BUF_TO_INOBT_BLOCK(bp);
638#ifdef DEBUG
639 if ((error = xfs_btree_check_sblock(cur,
640 block, level, bp)))
641 return error;
642#endif
643 ptr = cur->bc_ptrs[level];
644 nrec.ir_startino = nkey.ir_startino;
645 } else {
646 /*
647 * Otherwise the insert fails.
648 */
649 *stat = 0;
650 return 0;
651 }
652 }
653 }
654 }
655 /*
656 * At this point we know there's room for our new entry in the block
657 * we're pointing at.
658 */
659 numrecs = be16_to_cpu(block->bb_numrecs);
660 if (level > 0) {
661 /*
662 * It's a non-leaf entry. Make a hole for the new data
663 * in the key and ptr regions of the block.
664 */
665 kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
666 pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
667#ifdef DEBUG
668 for (i = numrecs; i >= ptr; i--) {
669 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i - 1]), level)))
670 return error;
671 }
672#endif
673 memmove(&kp[ptr], &kp[ptr - 1],
674 (numrecs - ptr + 1) * sizeof(*kp));
675 memmove(&pp[ptr], &pp[ptr - 1],
676 (numrecs - ptr + 1) * sizeof(*pp));
677 /*
678 * Now stuff the new data in, bump numrecs and log the new data.
679 */
680#ifdef DEBUG
681 if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
682 return error;
683#endif
684 kp[ptr - 1] = key;
685 pp[ptr - 1] = cpu_to_be32(*bnop);
686 numrecs++;
687 block->bb_numrecs = cpu_to_be16(numrecs);
688 xfs_inobt_log_keys(cur, bp, ptr, numrecs);
689 xfs_inobt_log_ptrs(cur, bp, ptr, numrecs);
690 } else {
691 /*
692 * It's a leaf entry. Make a hole for the new record.
693 */
694 rp = XFS_INOBT_REC_ADDR(block, 1, cur);
695 memmove(&rp[ptr], &rp[ptr - 1],
696 (numrecs - ptr + 1) * sizeof(*rp));
697 /*
698 * Now stuff the new record in, bump numrecs
699 * and log the new data.
700 */
701 rp[ptr - 1] = *recp;
702 numrecs++;
703 block->bb_numrecs = cpu_to_be16(numrecs);
704 xfs_inobt_log_recs(cur, bp, ptr, numrecs);
705 }
706 /*
707 * Log the new number of records in the btree header.
708 */
709 xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
710#ifdef DEBUG
711 /*
712 * Check that the key/record is in the right place, now.
713 */
714 if (ptr < numrecs) {
715 if (level == 0)
716 xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1,
717 rp + ptr);
718 else
719 xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1,
720 kp + ptr);
721 }
722#endif
723 /*
724 * If we inserted at the start of a block, update the parents' keys.
725 */
726 if (optr == 1 && (error = xfs_inobt_updkey(cur, &key, level + 1)))
727 return error;
728 /*
729 * Return the new block number, if any.
730 * If there is one, give back a record value and a cursor too.
731 */
732 *bnop = nbno;
733 if (nbno != NULLAGBLOCK) {
734 *recp = nrec;
735 *curp = ncur;
736 }
737 *stat = 1; 111 *stat = 1;
738 return 0; 112 return 0;
739} 113}
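/*
 * Editor's sketch (not from the patch): the order of escape hatches
 * xfs_inobt_insrec() tries above when the target block is full. The
 * stubs stand in for xfs_inobt_rshift/lshift/split and are
 * hypothetical; each returns 1 once it has freed up a slot.
 */
static int try_rshift(void) { return 0; }	/* push last entry right */
static int try_lshift(void) { return 0; }	/* pull first entry left */
static int try_split(void)  { return 1; }	/* halve the block */

static int				/* 1 if a slot is now available */
make_room(void)
{
	/*
	 * Shifting moves one entry and dirties two existing blocks;
	 * splitting allocates a third block and is the only step that
	 * can still fail (no free space), so it comes last.
	 */
	if (try_rshift())
		return 1;
	if (try_lshift())
		return 1;
	return try_split();
}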
740 114
741/* 115STATIC int
742 * Log header fields from a btree block. 116xfs_inobt_free_block(
743 */ 117 struct xfs_btree_cur *cur,
744STATIC void 118 struct xfs_buf *bp)
745xfs_inobt_log_block(
746 xfs_trans_t *tp, /* transaction pointer */
747 xfs_buf_t *bp, /* buffer containing btree block */
748 int fields) /* mask of fields: XFS_BB_... */
749{ 119{
750 int first; /* first byte offset logged */ 120 xfs_fsblock_t fsbno;
751 int last; /* last byte offset logged */ 121 int error;
752 static const short offsets[] = { /* table of offsets */
753 offsetof(xfs_inobt_block_t, bb_magic),
754 offsetof(xfs_inobt_block_t, bb_level),
755 offsetof(xfs_inobt_block_t, bb_numrecs),
756 offsetof(xfs_inobt_block_t, bb_leftsib),
757 offsetof(xfs_inobt_block_t, bb_rightsib),
758 sizeof(xfs_inobt_block_t)
759 };
760 122
761 xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last); 123 fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp));
762 xfs_trans_log_buf(tp, bp, first, last); 124 error = xfs_free_extent(cur->bc_tp, fsbno, 1);
125 if (error)
126 return error;
127
128 xfs_trans_binval(cur->bc_tp, bp);
129 return error;
763} 130}
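/*
 * Editor's sketch (not from the patch): how a field bitmask plus an
 * offsetof() table becomes the single [first, last] byte range that
 * xfs_trans_log_buf() wants, in the style of the xfs_btree_offsets()
 * call above. hdr_t and mask_to_range() are hypothetical.
 */
#include <stddef.h>
#include <stdint.h>

typedef struct {
	uint32_t	magic;
	uint16_t	level;
	uint16_t	numrecs;
	uint32_t	leftsib;
	uint32_t	rightsib;
} hdr_t;

static const size_t offsets[] = {
	offsetof(hdr_t, magic),
	offsetof(hdr_t, level),
	offsetof(hdr_t, numrecs),
	offsetof(hdr_t, leftsib),
	offsetof(hdr_t, rightsib),
	sizeof(hdr_t)			/* sentinel: end of the last field */
};

static void
mask_to_range(int fields, size_t *first, size_t *last)
{
	int	i;

	*first = sizeof(hdr_t);
	*last = 0;
	for (i = 0; i < 5; i++) {	/* one bit per field, low to high */
		if (!(fields & (1 << i)))
			continue;
		if (offsets[i] < *first)
			*first = offsets[i];
		if (offsets[i + 1] - 1 > *last)
			*last = offsets[i + 1] - 1;
	}
}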
764 131
765/* 132STATIC int
766 * Log keys from a btree block (nonleaf). 133xfs_inobt_get_maxrecs(
767 */ 134 struct xfs_btree_cur *cur,
768STATIC void 135 int level)
769xfs_inobt_log_keys(
770 xfs_btree_cur_t *cur, /* btree cursor */
771 xfs_buf_t *bp, /* buffer containing btree block */
772 int kfirst, /* index of first key to log */
773 int klast) /* index of last key to log */
774{ 136{
775 xfs_inobt_block_t *block; /* btree block to log from */ 137 return cur->bc_mp->m_inobt_mxr[level != 0];
776 int first; /* first byte offset logged */
777 xfs_inobt_key_t *kp; /* key pointer in btree block */
778 int last; /* last byte offset logged */
779
780 block = XFS_BUF_TO_INOBT_BLOCK(bp);
781 kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
782 first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
783 last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
784 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
785} 138}
786 139
787/*
788 * Log block pointer fields from a btree block (nonleaf).
789 */
790STATIC void 140STATIC void
791xfs_inobt_log_ptrs( 141xfs_inobt_init_key_from_rec(
792 xfs_btree_cur_t *cur, /* btree cursor */ 142 union xfs_btree_key *key,
793 xfs_buf_t *bp, /* buffer containing btree block */ 143 union xfs_btree_rec *rec)
794 int pfirst, /* index of first pointer to log */
795 int plast) /* index of last pointer to log */
796{ 144{
797 xfs_inobt_block_t *block; /* btree block to log from */ 145 key->inobt.ir_startino = rec->inobt.ir_startino;
798 int first; /* first byte offset logged */
799 int last; /* last byte offset logged */
800 xfs_inobt_ptr_t *pp; /* block-pointer pointer in btree blk */
801
802 block = XFS_BUF_TO_INOBT_BLOCK(bp);
803 pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
804 first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
805 last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
806 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
807} 146}
808 147
809/*
810 * Log records from a btree block (leaf).
811 */
812STATIC void 148STATIC void
813xfs_inobt_log_recs( 149xfs_inobt_init_rec_from_key(
814 xfs_btree_cur_t *cur, /* btree cursor */ 150 union xfs_btree_key *key,
815 xfs_buf_t *bp, /* buffer containing btree block */ 151 union xfs_btree_rec *rec)
816 int rfirst, /* index of first record to log */
817 int rlast) /* index of last record to log */
818{ 152{
819 xfs_inobt_block_t *block; /* btree block to log from */ 153 rec->inobt.ir_startino = key->inobt.ir_startino;
820 int first; /* first byte offset logged */ 154}
821 int last; /* last byte offset logged */
822 xfs_inobt_rec_t *rp; /* record pointer for btree block */
823 155
824 block = XFS_BUF_TO_INOBT_BLOCK(bp); 156STATIC void
825 rp = XFS_INOBT_REC_ADDR(block, 1, cur); 157xfs_inobt_init_rec_from_cur(
826 first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block); 158 struct xfs_btree_cur *cur,
827 last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block); 159 union xfs_btree_rec *rec)
828 xfs_trans_log_buf(cur->bc_tp, bp, first, last); 160{
161 rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
162 rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
163 rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
829} 164}
830 165
831/* 166/*
832 * Lookup the record. The cursor is made to point to it, based on dir.	167 * initial value of ptr for lookup
833 * Sets *stat to 0 if no such record is found, 1 for success.
834 */ 168 */
835STATIC int /* error */ 169STATIC void
836xfs_inobt_lookup( 170xfs_inobt_init_ptr_from_cur(
837 xfs_btree_cur_t *cur, /* btree cursor */ 171 struct xfs_btree_cur *cur,
838 xfs_lookup_t dir, /* <=, ==, or >= */ 172 union xfs_btree_ptr *ptr)
839 int *stat) /* success/failure */
840{ 173{
841 xfs_agblock_t agbno; /* a.g. relative btree block number */ 174 struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
842 xfs_agnumber_t agno; /* allocation group number */
843 xfs_inobt_block_t *block=NULL; /* current btree block */
844 __int64_t diff; /* difference for the current key */
845 int error; /* error return value */
846 int keyno=0; /* current key number */
847 int level; /* level in the btree */
848 xfs_mount_t *mp; /* file system mount point */
849
850 /*
851 * Get the allocation group header, and the root block number.
852 */
853 mp = cur->bc_mp;
854 {
855 xfs_agi_t *agi; /* a.g. inode header */
856
857 agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
858 agno = be32_to_cpu(agi->agi_seqno);
859 agbno = be32_to_cpu(agi->agi_root);
860 }
861 /*
862 * Iterate over each level in the btree, starting at the root.
863 * For each level above the leaves, find the key we need, based
864 * on the lookup record, then follow the corresponding block
865 * pointer down to the next level.
866 */
867 for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
868 xfs_buf_t *bp; /* buffer pointer for btree block */
869 xfs_daddr_t d; /* disk address of btree block */
870
871 /*
872 * Get the disk address we're looking for.
873 */
874 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
875 /*
876 * If the old buffer at this level is for a different block,
877 * throw it away, otherwise just use it.
878 */
879 bp = cur->bc_bufs[level];
880 if (bp && XFS_BUF_ADDR(bp) != d)
881 bp = NULL;
882 if (!bp) {
883 /*
884 * Need to get a new buffer. Read it, then
885 * set it in the cursor, releasing the old one.
886 */
887 if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
888 agno, agbno, 0, &bp, XFS_INO_BTREE_REF)))
889 return error;
890 xfs_btree_setbuf(cur, level, bp);
891 /*
892 * Point to the btree block, now that we have the buffer
893 */
894 block = XFS_BUF_TO_INOBT_BLOCK(bp);
895 if ((error = xfs_btree_check_sblock(cur, block, level,
896 bp)))
897 return error;
898 } else
899 block = XFS_BUF_TO_INOBT_BLOCK(bp);
900 /*
901 * If we already had a key match at a higher level, we know
902 * we need to use the first entry in this block.
903 */
904 if (diff == 0)
905 keyno = 1;
906 /*
907 * Otherwise we need to search this block. Do a binary search.
908 */
909 else {
910 int high; /* high entry number */
911 xfs_inobt_key_t *kkbase=NULL;/* base of keys in block */
912 xfs_inobt_rec_t *krbase=NULL;/* base of records in block */
913 int low; /* low entry number */
914 175
915 /* 176 ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
916 * Get a pointer to keys or records.
917 */
918 if (level > 0)
919 kkbase = XFS_INOBT_KEY_ADDR(block, 1, cur);
920 else
921 krbase = XFS_INOBT_REC_ADDR(block, 1, cur);
922 /*
923 * Set low and high entry numbers, 1-based.
924 */
925 low = 1;
926 if (!(high = be16_to_cpu(block->bb_numrecs))) {
927 /*
928 * If the block is empty, the tree must
929 * be an empty leaf.
930 */
931 ASSERT(level == 0 && cur->bc_nlevels == 1);
932 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
933 *stat = 0;
934 return 0;
935 }
936 /*
937 * Binary search the block.
938 */
939 while (low <= high) {
940 xfs_agino_t startino; /* key value */
941
942 /*
943 * keyno is average of low and high.
944 */
945 keyno = (low + high) >> 1;
946 /*
947 * Get startino.
948 */
949 if (level > 0) {
950 xfs_inobt_key_t *kkp;
951
952 kkp = kkbase + keyno - 1;
953 startino = be32_to_cpu(kkp->ir_startino);
954 } else {
955 xfs_inobt_rec_t *krp;
956
957 krp = krbase + keyno - 1;
958 startino = be32_to_cpu(krp->ir_startino);
959 }
960 /*
961 * Compute difference to get next direction.
962 */
963 diff = (__int64_t)
964 startino - cur->bc_rec.i.ir_startino;
965 /*
966 * Less than, move right.
967 */
968 if (diff < 0)
969 low = keyno + 1;
970 /*
971 * Greater than, move left.
972 */
973 else if (diff > 0)
974 high = keyno - 1;
975 /*
976 * Equal, we're done.
977 */
978 else
979 break;
980 }
981 }
982 /*
983 * If there are more levels, set up for the next level
984 * by getting the block number and filling in the cursor.
985 */
986 if (level > 0) {
987 /*
988 * If we moved left, need the previous key number,
989 * unless there isn't one.
990 */
991 if (diff > 0 && --keyno < 1)
992 keyno = 1;
993 agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, keyno, cur));
994#ifdef DEBUG
995 if ((error = xfs_btree_check_sptr(cur, agbno, level)))
996 return error;
997#endif
998 cur->bc_ptrs[level] = keyno;
999 }
1000 }
1001 /*
1002 * Done with the search.
1003 * See if we need to adjust the results.
1004 */
1005 if (dir != XFS_LOOKUP_LE && diff < 0) {
1006 keyno++;
1007 /*
1008 * If ge search and we went off the end of the block, but it's
1009 * not the last block, we're in the wrong block.
1010 */
1011 if (dir == XFS_LOOKUP_GE &&
1012 keyno > be16_to_cpu(block->bb_numrecs) &&
1013 be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) {
1014 int i;
1015 177
1016 cur->bc_ptrs[0] = keyno; 178 ptr->s = agi->agi_root;
1017 if ((error = xfs_inobt_increment(cur, 0, &i)))
1018 return error;
1019 ASSERT(i == 1);
1020 *stat = 1;
1021 return 0;
1022 }
1023 }
1024 else if (dir == XFS_LOOKUP_LE && diff > 0)
1025 keyno--;
1026 cur->bc_ptrs[0] = keyno;
1027 /*
1028 * Return if we succeeded or not.
1029 */
1030 if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs))
1031 *stat = 0;
1032 else
1033 *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
1034 return 0;
1035} 179}
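/*
 * Editor's sketch (not from the patch): the 1-based binary search at
 * the heart of xfs_inobt_lookup() above, lifted out of the btree. The
 * sign of the final "diff" is what the <=/==/>= adjustment afterwards
 * keys off; bsearch_1() is a hypothetical name.
 */
#include <stdint.h>

static int				/* 1-based index probed last; 0 if n == 0 */
bsearch_1(const uint32_t *keys, int n, uint32_t want, int64_t *diff)
{
	int	low = 1, high = n, keyno = 0;

	*diff = 1;
	while (low <= high) {
		keyno = (low + high) >> 1;
		*diff = (int64_t)keys[keyno - 1] - (int64_t)want;
		if (*diff < 0)
			low = keyno + 1;	/* probe was too small */
		else if (*diff > 0)
			high = keyno - 1;	/* probe was too large */
		else
			break;			/* exact hit */
	}
	return keyno;
}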
1036 180
1037/* 181STATIC __int64_t
1038 * Move 1 record left from cur/level if possible. 182xfs_inobt_key_diff(
1039 * Update cur to reflect the new path. 183 struct xfs_btree_cur *cur,
1040 */ 184 union xfs_btree_key *key)
1041STATIC int /* error */
1042xfs_inobt_lshift(
1043 xfs_btree_cur_t *cur, /* btree cursor */
1044 int level, /* level to shift record on */
1045 int *stat) /* success/failure */
1046{ 185{
1047 int error; /* error return value */ 186 return (__int64_t)be32_to_cpu(key->inobt.ir_startino) -
1048#ifdef DEBUG 187 cur->bc_rec.i.ir_startino;
1049 int i; /* loop index */
1050#endif
1051 xfs_inobt_key_t key; /* key value for leaf level upward */
1052 xfs_buf_t *lbp; /* buffer for left neighbor block */
1053 xfs_inobt_block_t *left; /* left neighbor btree block */
1054 xfs_inobt_key_t *lkp=NULL; /* key pointer for left block */
1055 xfs_inobt_ptr_t *lpp; /* address pointer for left block */
1056 xfs_inobt_rec_t *lrp=NULL; /* record pointer for left block */
1057 int nrec; /* new number of left block entries */
1058 xfs_buf_t *rbp; /* buffer for right (current) block */
1059 xfs_inobt_block_t *right; /* right (current) btree block */
1060 xfs_inobt_key_t *rkp=NULL; /* key pointer for right block */
1061 xfs_inobt_ptr_t *rpp=NULL; /* address pointer for right block */
1062 xfs_inobt_rec_t *rrp=NULL; /* record pointer for right block */
1063
1064 /*
1065 * Set up variables for this block as "right".
1066 */
1067 rbp = cur->bc_bufs[level];
1068 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
1069#ifdef DEBUG
1070 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
1071 return error;
1072#endif
1073 /*
1074 * If we've got no left sibling then we can't shift an entry left.
1075 */
1076 if (be32_to_cpu(right->bb_leftsib) == NULLAGBLOCK) {
1077 *stat = 0;
1078 return 0;
1079 }
1080 /*
1081 * If the cursor entry is the one that would be moved, don't
1082 * do it... it's too complicated.
1083 */
1084 if (cur->bc_ptrs[level] <= 1) {
1085 *stat = 0;
1086 return 0;
1087 }
1088 /*
1089 * Set up the left neighbor as "left".
1090 */
1091 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1092 cur->bc_private.a.agno, be32_to_cpu(right->bb_leftsib),
1093 0, &lbp, XFS_INO_BTREE_REF)))
1094 return error;
1095 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
1096 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1097 return error;
1098 /*
1099 * If it's full, it can't take another entry.
1100 */
1101 if (be16_to_cpu(left->bb_numrecs) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
1102 *stat = 0;
1103 return 0;
1104 }
1105 nrec = be16_to_cpu(left->bb_numrecs) + 1;
1106 /*
1107 * If non-leaf, copy a key and a ptr to the left block.
1108 */
1109 if (level > 0) {
1110 lkp = XFS_INOBT_KEY_ADDR(left, nrec, cur);
1111 rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
1112 *lkp = *rkp;
1113 xfs_inobt_log_keys(cur, lbp, nrec, nrec);
1114 lpp = XFS_INOBT_PTR_ADDR(left, nrec, cur);
1115 rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
1116#ifdef DEBUG
1117 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level)))
1118 return error;
1119#endif
1120 *lpp = *rpp;
1121 xfs_inobt_log_ptrs(cur, lbp, nrec, nrec);
1122 }
1123 /*
1124 * If leaf, copy a record to the left block.
1125 */
1126 else {
1127 lrp = XFS_INOBT_REC_ADDR(left, nrec, cur);
1128 rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
1129 *lrp = *rrp;
1130 xfs_inobt_log_recs(cur, lbp, nrec, nrec);
1131 }
1132 /*
1133 * Bump and log left's numrecs, decrement and log right's numrecs.
1134 */
1135 be16_add_cpu(&left->bb_numrecs, 1);
1136 xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
1137#ifdef DEBUG
1138 if (level > 0)
1139 xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp);
1140 else
1141 xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp);
1142#endif
1143 be16_add_cpu(&right->bb_numrecs, -1);
1144 xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
1145 /*
1146 * Slide the contents of right down one entry.
1147 */
1148 if (level > 0) {
1149#ifdef DEBUG
1150 for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
1151 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i + 1]),
1152 level)))
1153 return error;
1154 }
1155#endif
1156 memmove(rkp, rkp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
1157 memmove(rpp, rpp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1158 xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1159 xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1160 } else {
1161 memmove(rrp, rrp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1162 xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1163 key.ir_startino = rrp->ir_startino;
1164 rkp = &key;
1165 }
1166 /*
1167 * Update the parent key values of right.
1168 */
1169 if ((error = xfs_inobt_updkey(cur, rkp, level + 1)))
1170 return error;
1171 /*
1172 * Slide the cursor value left one.
1173 */
1174 cur->bc_ptrs[level]--;
1175 *stat = 1;
1176 return 0;
1177} 188}
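/*
 * Editor's sketch (not from the patch): the data movement of
 * xfs_inobt_lshift() above, leaf case, on bare arrays (0-based here).
 * The caller has already checked that the donor can spare a record;
 * the returned value is the donor's new first key, which must then be
 * re-published in the parent via updkey. Names are hypothetical.
 */
#include <string.h>

typedef struct { unsigned int startino; } rec_t;

static unsigned int
lshift_leaf(rec_t *left, int *lrecs, rec_t *right, int *rrecs)
{
	left[(*lrecs)++] = right[0];	/* borrow the donor's first record */
	(*rrecs)--;
	memmove(&right[0], &right[1], *rrecs * sizeof(*right));
	return right[0].startino;	/* donor's first key changed */
}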
1178 189
1179/* 190STATIC int
1180 * Allocate a new root block, fill it in. 191xfs_inobt_kill_root(
1181 */ 192 struct xfs_btree_cur *cur,
1182STATIC int /* error */ 193 struct xfs_buf *bp,
1183xfs_inobt_newroot( 194 int level,
1184 xfs_btree_cur_t *cur, /* btree cursor */ 195 union xfs_btree_ptr *newroot)
1185 int *stat) /* success/failure */
1186{ 196{
1187 xfs_agi_t *agi; /* a.g. inode header */ 197 int error;
1188 xfs_alloc_arg_t args; /* allocation argument structure */
1189 xfs_inobt_block_t *block; /* one half of the old root block */
1190 xfs_buf_t *bp; /* buffer containing block */
1191 int error; /* error return value */
1192 xfs_inobt_key_t *kp; /* btree key pointer */
1193 xfs_agblock_t lbno; /* left block number */
1194 xfs_buf_t *lbp; /* left buffer pointer */
1195 xfs_inobt_block_t *left; /* left btree block */
1196 xfs_buf_t *nbp; /* new (root) buffer */
1197 xfs_inobt_block_t *new; /* new (root) btree block */
1198 int nptr; /* new value for key index, 1 or 2 */
1199 xfs_inobt_ptr_t *pp; /* btree address pointer */
1200 xfs_agblock_t rbno; /* right block number */
1201 xfs_buf_t *rbp; /* right buffer pointer */
1202 xfs_inobt_block_t *right; /* right btree block */
1203 xfs_inobt_rec_t *rp; /* btree record pointer */
1204 198
1205 ASSERT(cur->bc_nlevels < XFS_IN_MAXLEVELS(cur->bc_mp)); 199 XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
200 XFS_BTREE_STATS_INC(cur, killroot);
1206 201
1207 /* 202 /*
1208 * Get a block & a buffer. 203 * Update the root pointer, decreasing the level by 1 and then
204 * free the old root.
1209 */ 205 */
1210 agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); 206 xfs_inobt_set_root(cur, newroot, -1);
1211 args.tp = cur->bc_tp; 207 error = xfs_inobt_free_block(cur, bp);
1212 args.mp = cur->bc_mp; 208 if (error) {
1213 args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, 209 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1214 be32_to_cpu(agi->agi_root));
1215 args.mod = args.minleft = args.alignment = args.total = args.wasdel =
1216 args.isfl = args.userdata = args.minalignslop = 0;
1217 args.minlen = args.maxlen = args.prod = 1;
1218 args.type = XFS_ALLOCTYPE_NEAR_BNO;
1219 if ((error = xfs_alloc_vextent(&args)))
1220 return error; 210 return error;
1221 /*
1222 * None available, we fail.
1223 */
1224 if (args.fsbno == NULLFSBLOCK) {
1225 *stat = 0;
1226 return 0;
1227 }
1228 ASSERT(args.len == 1);
1229 nbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0);
1230 new = XFS_BUF_TO_INOBT_BLOCK(nbp);
1231 /*
1232 * Set the root data in the a.g. inode structure.
1233 */
1234 agi->agi_root = cpu_to_be32(args.agbno);
1235 be32_add_cpu(&agi->agi_level, 1);
1236 xfs_ialloc_log_agi(args.tp, cur->bc_private.a.agbp,
1237 XFS_AGI_ROOT | XFS_AGI_LEVEL);
1238 /*
1239 * At the previous root level there are now two blocks: the old
1240 * root, and the new block generated when it was split.
1241 * We don't know which one the cursor is pointing at, so we
1242 * set up variables "left" and "right" for each case.
1243 */
1244 bp = cur->bc_bufs[cur->bc_nlevels - 1];
1245 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1246#ifdef DEBUG
1247 if ((error = xfs_btree_check_sblock(cur, block, cur->bc_nlevels - 1, bp)))
1248 return error;
1249#endif
1250 if (be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) {
1251 /*
1252 * Our block is left, pick up the right block.
1253 */
1254 lbp = bp;
1255 lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp));
1256 left = block;
1257 rbno = be32_to_cpu(left->bb_rightsib);
1258 if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
1259 rbno, 0, &rbp, XFS_INO_BTREE_REF)))
1260 return error;
1261 bp = rbp;
1262 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
1263 if ((error = xfs_btree_check_sblock(cur, right,
1264 cur->bc_nlevels - 1, rbp)))
1265 return error;
1266 nptr = 1;
1267 } else {
1268 /*
1269 * Our block is right, pick up the left block.
1270 */
1271 rbp = bp;
1272 rbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(rbp));
1273 right = block;
1274 lbno = be32_to_cpu(right->bb_leftsib);
1275 if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
1276 lbno, 0, &lbp, XFS_INO_BTREE_REF)))
1277 return error;
1278 bp = lbp;
1279 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
1280 if ((error = xfs_btree_check_sblock(cur, left,
1281 cur->bc_nlevels - 1, lbp)))
1282 return error;
1283 nptr = 2;
1284 }
1285 /*
1286 * Fill in the new block's btree header and log it.
1287 */
1288 new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
1289 new->bb_level = cpu_to_be16(cur->bc_nlevels);
1290 new->bb_numrecs = cpu_to_be16(2);
1291 new->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1292 new->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1293 xfs_inobt_log_block(args.tp, nbp, XFS_BB_ALL_BITS);
1294 ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK);
1295 /*
1296 * Fill in the key data in the new root.
1297 */
1298 kp = XFS_INOBT_KEY_ADDR(new, 1, cur);
1299 if (be16_to_cpu(left->bb_level) > 0) {
1300 kp[0] = *XFS_INOBT_KEY_ADDR(left, 1, cur);
1301 kp[1] = *XFS_INOBT_KEY_ADDR(right, 1, cur);
1302 } else {
1303 rp = XFS_INOBT_REC_ADDR(left, 1, cur);
1304 kp[0].ir_startino = rp->ir_startino;
1305 rp = XFS_INOBT_REC_ADDR(right, 1, cur);
1306 kp[1].ir_startino = rp->ir_startino;
1307 } 211 }
1308 xfs_inobt_log_keys(cur, nbp, 1, 2);
1309 /*
1310 * Fill in the pointer data in the new root.
1311 */
1312 pp = XFS_INOBT_PTR_ADDR(new, 1, cur);
1313 pp[0] = cpu_to_be32(lbno);
1314 pp[1] = cpu_to_be32(rbno);
1315 xfs_inobt_log_ptrs(cur, nbp, 1, 2);
1316 /*
1317 * Fix up the cursor.
1318 */
1319 xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
1320 cur->bc_ptrs[cur->bc_nlevels] = nptr;
1321 cur->bc_nlevels++;
1322 *stat = 1;
1323 return 0;
1324}
1325 212
1326/* 213 XFS_BTREE_STATS_INC(cur, free);
1327 * Move 1 record right from cur/level if possible.
1328 * Update cur to reflect the new path.
1329 */
1330STATIC int /* error */
1331xfs_inobt_rshift(
1332 xfs_btree_cur_t *cur, /* btree cursor */
1333 int level, /* level to shift record on */
1334 int *stat) /* success/failure */
1335{
1336 int error; /* error return value */
1337 int i; /* loop index */
1338 xfs_inobt_key_t key; /* key value for leaf level upward */
1339 xfs_buf_t *lbp; /* buffer for left (current) block */
1340 xfs_inobt_block_t *left; /* left (current) btree block */
1341 xfs_inobt_key_t *lkp; /* key pointer for left block */
1342 xfs_inobt_ptr_t *lpp; /* address pointer for left block */
1343 xfs_inobt_rec_t *lrp; /* record pointer for left block */
1344 xfs_buf_t *rbp; /* buffer for right neighbor block */
1345 xfs_inobt_block_t *right; /* right neighbor btree block */
1346 xfs_inobt_key_t *rkp; /* key pointer for right block */
1347 xfs_inobt_ptr_t *rpp; /* address pointer for right block */
1348 xfs_inobt_rec_t *rrp=NULL; /* record pointer for right block */
1349 xfs_btree_cur_t *tcur; /* temporary cursor */
1350 214
1351 /* 215 cur->bc_bufs[level] = NULL;
1352 * Set up variables for this block as "left". 216 cur->bc_nlevels--;
1353 */ 217
1354 lbp = cur->bc_bufs[level]; 218 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1355 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
1356#ifdef DEBUG
1357 if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
1358 return error;
1359#endif
1360 /*
1361 * If we've got no right sibling then we can't shift an entry right.
1362 */
1363 if (be32_to_cpu(left->bb_rightsib) == NULLAGBLOCK) {
1364 *stat = 0;
1365 return 0;
1366 }
1367 /*
1368 * If the cursor entry is the one that would be moved, don't
1369 * do it... it's too complicated.
1370 */
1371 if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) {
1372 *stat = 0;
1373 return 0;
1374 }
1375 /*
1376 * Set up the right neighbor as "right".
1377 */
1378 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1379 cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib),
1380 0, &rbp, XFS_INO_BTREE_REF)))
1381 return error;
1382 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
1383 if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
1384 return error;
1385 /*
1386 * If it's full, it can't take another entry.
1387 */
1388 if (be16_to_cpu(right->bb_numrecs) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
1389 *stat = 0;
1390 return 0;
1391 }
1392 /*
1393 * Make a hole at the start of the right neighbor block, then
1394 * copy the last left block entry to the hole.
1395 */
1396 if (level > 0) {
1397 lkp = XFS_INOBT_KEY_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
1398 lpp = XFS_INOBT_PTR_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
1399 rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
1400 rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
1401#ifdef DEBUG
1402 for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) {
1403 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
1404 return error;
1405 }
1406#endif
1407 memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
1408 memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1409#ifdef DEBUG
1410 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level)))
1411 return error;
1412#endif
1413 *rkp = *lkp;
1414 *rpp = *lpp;
1415 xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1416 xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1417 } else {
1418 lrp = XFS_INOBT_REC_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
1419 rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
1420 memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1421 *rrp = *lrp;
1422 xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
1423 key.ir_startino = rrp->ir_startino;
1424 rkp = &key;
1425 }
1426 /*
1427 * Decrement and log left's numrecs, bump and log right's numrecs.
1428 */
1429 be16_add_cpu(&left->bb_numrecs, -1);
1430 xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
1431 be16_add_cpu(&right->bb_numrecs, 1);
1432#ifdef DEBUG
1433 if (level > 0)
1434 xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1);
1435 else
1436 xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1);
1437#endif
1438 xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
1439 /*
1440 * Using a temporary cursor, update the parent key values of the
1441 * block on the right.
1442 */
1443 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
1444 return error;
1445 xfs_btree_lastrec(tcur, level);
1446 if ((error = xfs_inobt_increment(tcur, level, &i)) ||
1447 (error = xfs_inobt_updkey(tcur, rkp, level + 1))) {
1448 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
1449 return error;
1450 }
1451 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
1452 *stat = 1;
1453 return 0; 219 return 0;
1454} 220}
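/*
 * Editor's sketch (not from the patch): the mirror-image move of
 * xfs_inobt_rshift() above, leaf case (0-based arrays). Opening the
 * hole with memmove() before copying is what makes the in-place shift
 * safe; as with lshift, the receiver's new first key must be pushed
 * up to the parent afterwards. Names are hypothetical.
 */
#include <string.h>

typedef struct { unsigned int startino; } rec_t;

static unsigned int
rshift_leaf(rec_t *left, int *lrecs, rec_t *right, int *rrecs)
{
	memmove(&right[1], &right[0], *rrecs * sizeof(*right));
	right[0] = left[--(*lrecs)];	/* donate our last record */
	(*rrecs)++;
	return right[0].startino;	/* receiver's first key changed */
}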
1455 221
1456/*
1457 * Split cur/level block in half.
1458 * Return new block number and its first record (to be inserted into parent).
1459 */
1460STATIC int /* error */
1461xfs_inobt_split(
1462 xfs_btree_cur_t *cur, /* btree cursor */
1463 int level, /* level to split */
1464 xfs_agblock_t *bnop, /* output: block number allocated */
1465 xfs_inobt_key_t *keyp, /* output: first key of new block */
1466 xfs_btree_cur_t **curp, /* output: new cursor */
1467 int *stat) /* success/failure */
1468{
1469 xfs_alloc_arg_t args; /* allocation argument structure */
1470 int error; /* error return value */
1471 int i; /* loop index/record number */
1472 xfs_agblock_t lbno; /* left (current) block number */
1473 xfs_buf_t *lbp; /* buffer for left block */
1474 xfs_inobt_block_t *left; /* left (current) btree block */
1475 xfs_inobt_key_t *lkp; /* left btree key pointer */
1476 xfs_inobt_ptr_t *lpp; /* left btree address pointer */
1477 xfs_inobt_rec_t *lrp; /* left btree record pointer */
1478 xfs_buf_t *rbp; /* buffer for right block */
1479 xfs_inobt_block_t *right; /* right (new) btree block */
1480 xfs_inobt_key_t *rkp; /* right btree key pointer */
1481 xfs_inobt_ptr_t *rpp; /* right btree address pointer */
1482 xfs_inobt_rec_t *rrp; /* right btree record pointer */
1483
1484 /*
1485 * Set up left block (current one).
1486 */
1487 lbp = cur->bc_bufs[level];
1488 args.tp = cur->bc_tp;
1489 args.mp = cur->bc_mp;
1490 lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp));
1491 /*
1492 * Allocate the new block.
1493 * If we can't do it, we're toast. Give up.
1494 */
1495 args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, lbno);
1496 args.mod = args.minleft = args.alignment = args.total = args.wasdel =
1497 args.isfl = args.userdata = args.minalignslop = 0;
1498 args.minlen = args.maxlen = args.prod = 1;
1499 args.type = XFS_ALLOCTYPE_NEAR_BNO;
1500 if ((error = xfs_alloc_vextent(&args)))
1501 return error;
1502 if (args.fsbno == NULLFSBLOCK) {
1503 *stat = 0;
1504 return 0;
1505 }
1506 ASSERT(args.len == 1);
1507 rbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0);
1508 /*
1509 * Set up the new block as "right".
1510 */
1511 right = XFS_BUF_TO_INOBT_BLOCK(rbp);
1512 /*
1513 * "Left" is the current (according to the cursor) block.
1514 */
1515 left = XFS_BUF_TO_INOBT_BLOCK(lbp);
1516#ifdef DEBUG 222#ifdef DEBUG
1517 if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) 223STATIC int
1518 return error; 224xfs_inobt_keys_inorder(
1519#endif 225 struct xfs_btree_cur *cur,
1520 /* 226 union xfs_btree_key *k1,
1521 * Fill in the btree header for the new block. 227 union xfs_btree_key *k2)
1522 */ 228{
1523 right->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]); 229 return be32_to_cpu(k1->inobt.ir_startino) <
1524 right->bb_level = left->bb_level; 230 be32_to_cpu(k2->inobt.ir_startino);
1525 right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2);
1526 /*
1527 * Make sure that if there's an odd number of entries now, that
1528 * each new block will have the same number of entries.
1529 */
1530 if ((be16_to_cpu(left->bb_numrecs) & 1) &&
1531 cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1)
1532 be16_add_cpu(&right->bb_numrecs, 1);
1533 i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1;
1534 /*
1535 * For non-leaf blocks, copy keys and addresses over to the new block.
1536 */
1537 if (level > 0) {
1538 lkp = XFS_INOBT_KEY_ADDR(left, i, cur);
1539 lpp = XFS_INOBT_PTR_ADDR(left, i, cur);
1540 rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
1541 rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
1542#ifdef DEBUG
1543 for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
1544 if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level)))
1545 return error;
1546 }
1547#endif
1548 memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
1549 memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
1550 xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1551 xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1552 *keyp = *rkp;
1553 }
1554 /*
1555 * For leaf blocks, copy records over to the new block.
1556 */
1557 else {
1558 lrp = XFS_INOBT_REC_ADDR(left, i, cur);
1559 rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
1560 memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
1561 xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
1562 keyp->ir_startino = rrp->ir_startino;
1563 }
1564 /*
1565 * Find the left block number by looking in the buffer.
1566 * Adjust numrecs, sibling pointers.
1567 */
1568 be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
1569 right->bb_rightsib = left->bb_rightsib;
1570 left->bb_rightsib = cpu_to_be32(args.agbno);
1571 right->bb_leftsib = cpu_to_be32(lbno);
1572 xfs_inobt_log_block(args.tp, rbp, XFS_BB_ALL_BITS);
1573 xfs_inobt_log_block(args.tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
1574 /*
1575 * If there's a block to the new block's right, make that block
1576 * point back to right instead of to left.
1577 */
1578 if (be32_to_cpu(right->bb_rightsib) != NULLAGBLOCK) {
1579 xfs_inobt_block_t *rrblock; /* rr btree block */
1580 xfs_buf_t *rrbp; /* buffer for rrblock */
1581
1582 if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
1583 be32_to_cpu(right->bb_rightsib), 0, &rrbp,
1584 XFS_INO_BTREE_REF)))
1585 return error;
1586 rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp);
1587 if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
1588 return error;
1589 rrblock->bb_leftsib = cpu_to_be32(args.agbno);
1590 xfs_inobt_log_block(args.tp, rrbp, XFS_BB_LEFTSIB);
1591 }
1592 /*
1593 * If the cursor is really in the right block, move it there.
1594 * If it's just pointing past the last entry in left, then we'll
1595 * insert there, so don't change anything in that case.
1596 */
1597 if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) {
1598 xfs_btree_setbuf(cur, level, rbp);
1599 cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs);
1600 }
1601 /*
1602 * If there are more levels, we'll need another cursor which refers
1603 * the right block, no matter where this cursor was.
1604 */
1605 if (level + 1 < cur->bc_nlevels) {
1606 if ((error = xfs_btree_dup_cursor(cur, curp)))
1607 return error;
1608 (*curp)->bc_ptrs[level + 1]++;
1609 }
1610 *bnop = args.agbno;
1611 *stat = 1;
1612 return 0;
1613} 231}
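/*
 * Editor's sketch (not from the patch): the split-point arithmetic of
 * xfs_inobt_split() above. split_count() is a hypothetical name; it
 * returns how many records move to the new right block.
 */
static int
split_count(int lrecs, int ins_ptr)
{
	int	rrecs = lrecs / 2;

	/*
	 * With an odd count, move one extra record right when the
	 * pending insert will land in the left half, so both halves
	 * are equally full once the insert actually happens.
	 */
	if ((lrecs & 1) && ins_ptr <= rrecs + 1)
		rrecs++;
	return rrecs;
}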
1614 232
1615/* 233STATIC int
1616 * Update keys at all levels from here to the root along the cursor's path. 234xfs_inobt_recs_inorder(
1617 */ 235 struct xfs_btree_cur *cur,
1618STATIC int /* error */ 236 union xfs_btree_rec *r1,
1619xfs_inobt_updkey( 237 union xfs_btree_rec *r2)
1620 xfs_btree_cur_t *cur, /* btree cursor */
1621 xfs_inobt_key_t *keyp, /* new key value to update to */
1622 int level) /* starting level for update */
1623{ 238{
1624 int ptr; /* index of key in block */ 239 return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <=
1625 240 be32_to_cpu(r2->inobt.ir_startino);
1626 /*
1627 * Go up the tree from this level toward the root.
1628 * At each level, update the key value to the value input.
1629 * Stop when we reach a level where the cursor isn't pointing
1630 * at the first entry in the block.
1631 */
1632 for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1633 xfs_buf_t *bp; /* buffer for block */
1634 xfs_inobt_block_t *block; /* btree block */
1635#ifdef DEBUG
1636 int error; /* error return value */
1637#endif
1638 xfs_inobt_key_t *kp; /* ptr to btree block keys */
1639
1640 bp = cur->bc_bufs[level];
1641 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1642#ifdef DEBUG
1643 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
1644 return error;
1645#endif
1646 ptr = cur->bc_ptrs[level];
1647 kp = XFS_INOBT_KEY_ADDR(block, ptr, cur);
1648 *kp = *keyp;
1649 xfs_inobt_log_keys(cur, bp, ptr, ptr);
1650 }
1651 return 0;
1652} 241}
242#endif /* DEBUG */
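/*
 * Editor's sketch (not from the patch): the propagation loop of
 * xfs_inobt_updkey() above, on flat arrays. A new leftmost key is
 * written into the parent entry at every level, but only keeps
 * climbing while the cursor sits on entry 1 (1-based) of each block;
 * all names are hypothetical.
 */
static void
updkey(unsigned int keys[][64], const int ptrs[], int nlevels,
       int level, unsigned int newkey)
{
	int	ptr;

	for (ptr = 1; ptr == 1 && level < nlevels; level++) {
		ptr = ptrs[level];		/* cursor slot at this level */
		keys[level][ptr - 1] = newkey;
	}
}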
1653 243
1654/* 244#ifdef XFS_BTREE_TRACE
1655 * Externally visible routines. 245ktrace_t *xfs_inobt_trace_buf;
1656 */
1657 246
1658/* 247STATIC void
1659 * Decrement cursor by one record at the level. 248xfs_inobt_trace_enter(
1660 * For nonzero levels the leaf-ward information is untouched. 249 struct xfs_btree_cur *cur,
1661 */ 250 const char *func,
1662int /* error */ 251 char *s,
1663xfs_inobt_decrement( 252 int type,
1664 xfs_btree_cur_t *cur, /* btree cursor */ 253 int line,
1665 int level, /* level in btree, 0 is leaf */ 254 __psunsigned_t a0,
1666 int *stat) /* success/failure */ 255 __psunsigned_t a1,
256 __psunsigned_t a2,
257 __psunsigned_t a3,
258 __psunsigned_t a4,
259 __psunsigned_t a5,
260 __psunsigned_t a6,
261 __psunsigned_t a7,
262 __psunsigned_t a8,
263 __psunsigned_t a9,
264 __psunsigned_t a10)
1667{ 265{
1668 xfs_inobt_block_t *block; /* btree block */ 266 ktrace_enter(xfs_inobt_trace_buf, (void *)(__psint_t)type,
1669 int error; 267 (void *)func, (void *)s, NULL, (void *)cur,
1670 int lev; /* btree level */ 268 (void *)a0, (void *)a1, (void *)a2, (void *)a3,
1671 269 (void *)a4, (void *)a5, (void *)a6, (void *)a7,
1672 ASSERT(level < cur->bc_nlevels); 270 (void *)a8, (void *)a9, (void *)a10);
1673 /*
1674 * Read-ahead to the left at this level.
1675 */
1676 xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1677 /*
1678 * Decrement the ptr at this level. If we're still in the block
1679 * then we're done.
1680 */
1681 if (--cur->bc_ptrs[level] > 0) {
1682 *stat = 1;
1683 return 0;
1684 }
1685 /*
1686 * Get a pointer to the btree block.
1687 */
1688 block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[level]);
1689#ifdef DEBUG
1690 if ((error = xfs_btree_check_sblock(cur, block, level,
1691 cur->bc_bufs[level])))
1692 return error;
1693#endif
1694 /*
1695 * If we just went off the left edge of the tree, return failure.
1696 */
1697 if (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK) {
1698 *stat = 0;
1699 return 0;
1700 }
1701 /*
1702 * March up the tree decrementing pointers.
1703 * Stop when we don't go off the left edge of a block.
1704 */
1705 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1706 if (--cur->bc_ptrs[lev] > 0)
1707 break;
1708 /*
1709 * Read-ahead the left block, we're going to read it
1710 * in the next loop.
1711 */
1712 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1713 }
1714 /*
1715 * If we went off the root then we are seriously confused.
1716 */
1717 ASSERT(lev < cur->bc_nlevels);
1718 /*
1719 * Now walk back down the tree, fixing up the cursor's buffer
1720 * pointers and key numbers.
1721 */
1722 for (block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]); lev > level; ) {
1723 xfs_agblock_t agbno; /* block number of btree block */
1724 xfs_buf_t *bp; /* buffer containing btree block */
1725
1726 agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
1727 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
1728 cur->bc_private.a.agno, agbno, 0, &bp,
1729 XFS_INO_BTREE_REF)))
1730 return error;
1731 lev--;
1732 xfs_btree_setbuf(cur, lev, bp);
1733 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1734 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
1735 return error;
1736 cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs);
1737 }
1738 *stat = 1;
1739 return 0;
1740} 271}
1741 272
1742/* 273STATIC void
1743 * Delete the record pointed to by cur. 274xfs_inobt_trace_cursor(
1744 * On return, the cursor refers to the place where the record was	275	struct xfs_btree_cur *cur,
1745 * (i.e., where it could be re-inserted).	276	__uint32_t *s0,
1746 */ 277 __uint64_t *l0,
1747int /* error */ 278 __uint64_t *l1)
1748xfs_inobt_delete(
1749 xfs_btree_cur_t *cur, /* btree cursor */
1750 int *stat) /* success/failure */
1751{ 279{
1752 int error; 280 *s0 = cur->bc_private.a.agno;
1753 int i; /* result code */ 281 *l0 = cur->bc_rec.i.ir_startino;
1754 int level; /* btree level */ 282 *l1 = cur->bc_rec.i.ir_free;
1755
1756 /*
1757 * Go up the tree, starting at leaf level.
1758 * If 2 is returned then a join was done; go to the next level.
1759 * Otherwise we are done.
1760 */
1761 for (level = 0, i = 2; i == 2; level++) {
1762 if ((error = xfs_inobt_delrec(cur, level, &i)))
1763 return error;
1764 }
1765 if (i == 0) {
1766 for (level = 1; level < cur->bc_nlevels; level++) {
1767 if (cur->bc_ptrs[level] == 0) {
1768 if ((error = xfs_inobt_decrement(cur, level, &i)))
1769 return error;
1770 break;
1771 }
1772 }
1773 }
1774 *stat = i;
1775 return 0;
1776} 283}
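/*
 * Editor's note (not from the patch): the three-valued *stat protocol
 * that the xfs_inobt_delete() loop above is built around. The kernel
 * code uses bare 0/1/2; the enum names are hypothetical.
 */
enum delrec_stat {
	DELREC_NOTFOUND	= 0,	/* no record at the cursor; report failure */
	DELREC_DONE	= 1,	/* record removed, tree still balanced; stop */
	DELREC_JOINED	= 2,	/* two blocks merged, so the key/ptr entry
				 * for the freed block must be deleted from
				 * the level above: run delrec there too */
};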
1777 284
1778 285STATIC void
1779/* 286xfs_inobt_trace_key(
1780 * Get the data from the pointed-to record. 287 struct xfs_btree_cur *cur,
1781 */ 288 union xfs_btree_key *key,
1782int /* error */ 289 __uint64_t *l0,
1783xfs_inobt_get_rec( 290 __uint64_t *l1)
1784 xfs_btree_cur_t *cur, /* btree cursor */
1785 xfs_agino_t *ino, /* output: starting inode of chunk */
1786 __int32_t *fcnt, /* output: number of free inodes */
1787 xfs_inofree_t *free, /* output: free inode mask */
1788 int *stat) /* output: success/failure */
1789{ 291{
1790 xfs_inobt_block_t *block; /* btree block */ 292 *l0 = be32_to_cpu(key->inobt.ir_startino);
1791 xfs_buf_t *bp; /* buffer containing btree block */ 293 *l1 = 0;
1792#ifdef DEBUG
1793 int error; /* error return value */
1794#endif
1795 int ptr; /* record number */
1796 xfs_inobt_rec_t *rec; /* record data */
1797
1798 bp = cur->bc_bufs[0];
1799 ptr = cur->bc_ptrs[0];
1800 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1801#ifdef DEBUG
1802 if ((error = xfs_btree_check_sblock(cur, block, 0, bp)))
1803 return error;
1804#endif
1805 /*
1806 * Off the right end or left end, return failure.
1807 */
1808 if (ptr > be16_to_cpu(block->bb_numrecs) || ptr <= 0) {
1809 *stat = 0;
1810 return 0;
1811 }
1812 /*
1813 * Point to the record and extract its data.
1814 */
1815 rec = XFS_INOBT_REC_ADDR(block, ptr, cur);
1816 *ino = be32_to_cpu(rec->ir_startino);
1817 *fcnt = be32_to_cpu(rec->ir_freecount);
1818 *free = be64_to_cpu(rec->ir_free);
1819 *stat = 1;
1820 return 0;
1821} 294}
1822 295
1823/* 296STATIC void
1824 * Increment cursor by one record at the level. 297xfs_inobt_trace_record(
1825 * For nonzero levels the leaf-ward information is untouched. 298 struct xfs_btree_cur *cur,
1826 */ 299 union xfs_btree_rec *rec,
1827int /* error */ 300 __uint64_t *l0,
1828xfs_inobt_increment( 301 __uint64_t *l1,
1829 xfs_btree_cur_t *cur, /* btree cursor */ 302 __uint64_t *l2)
1830 int level, /* level in btree, 0 is leaf */
1831 int *stat) /* success/failure */
1832{ 303{
1833 xfs_inobt_block_t *block; /* btree block */ 304 *l0 = be32_to_cpu(rec->inobt.ir_startino);
1834 xfs_buf_t *bp; /* buffer containing btree block */ 305 *l1 = be32_to_cpu(rec->inobt.ir_freecount);
1835 int error; /* error return value */ 306 *l2 = be64_to_cpu(rec->inobt.ir_free);
1836 int lev; /* btree level */ 307}
308#endif /* XFS_BTREE_TRACE */
309
310static const struct xfs_btree_ops xfs_inobt_ops = {
311 .rec_len = sizeof(xfs_inobt_rec_t),
312 .key_len = sizeof(xfs_inobt_key_t),
313
314 .dup_cursor = xfs_inobt_dup_cursor,
315 .set_root = xfs_inobt_set_root,
316 .kill_root = xfs_inobt_kill_root,
317 .alloc_block = xfs_inobt_alloc_block,
318 .free_block = xfs_inobt_free_block,
319 .get_minrecs = xfs_inobt_get_minrecs,
320 .get_maxrecs = xfs_inobt_get_maxrecs,
321 .init_key_from_rec = xfs_inobt_init_key_from_rec,
322 .init_rec_from_key = xfs_inobt_init_rec_from_key,
323 .init_rec_from_cur = xfs_inobt_init_rec_from_cur,
324 .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur,
325 .key_diff = xfs_inobt_key_diff,
1837 326
1838 ASSERT(level < cur->bc_nlevels);
1839 /*
1840 * Read-ahead to the right at this level.
1841 */
1842 xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1843 /*
1844 * Get a pointer to the btree block.
1845 */
1846 bp = cur->bc_bufs[level];
1847 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1848#ifdef DEBUG
1849 if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
1850 return error;
1851#endif
1852 /*
1853 * Increment the ptr at this level. If we're still in the block
1854 * then we're done.
1855 */
1856 if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) {
1857 *stat = 1;
1858 return 0;
1859 }
1860 /*
1861 * If we just went off the right edge of the tree, return failure.
1862 */
1863 if (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK) {
1864 *stat = 0;
1865 return 0;
1866 }
1867 /*
1868 * March up the tree incrementing pointers.
1869 * Stop when we don't go off the right edge of a block.
1870 */
1871 for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1872 bp = cur->bc_bufs[lev];
1873 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1874#ifdef DEBUG 327#ifdef DEBUG
1875 if ((error = xfs_btree_check_sblock(cur, block, lev, bp))) 328 .keys_inorder = xfs_inobt_keys_inorder,
1876 return error; 329 .recs_inorder = xfs_inobt_recs_inorder,
1877#endif 330#endif
1878 if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs))
1879 break;
1880 /*
1881 * Read-ahead the right block, we're going to read it
1882 * in the next loop.
1883 */
1884 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1885 }
1886 /*
1887 * If we went off the root then we are seriously confused.
1888 */
1889 ASSERT(lev < cur->bc_nlevels);
1890 /*
1891 * Now walk back down the tree, fixing up the cursor's buffer
1892 * pointers and key numbers.
1893 */
1894 for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_INOBT_BLOCK(bp);
1895 lev > level; ) {
1896 xfs_agblock_t agbno; /* block number of btree block */
1897 331
1898 agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur)); 332#ifdef XFS_BTREE_TRACE
1899 if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, 333 .trace_enter = xfs_inobt_trace_enter,
1900 cur->bc_private.a.agno, agbno, 0, &bp, 334 .trace_cursor = xfs_inobt_trace_cursor,
1901 XFS_INO_BTREE_REF))) 335 .trace_key = xfs_inobt_trace_key,
1902 return error; 336 .trace_record = xfs_inobt_trace_record,
1903 lev--; 337#endif
1904 xfs_btree_setbuf(cur, lev, bp); 338};
1905 block = XFS_BUF_TO_INOBT_BLOCK(bp);
1906 if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
1907 return error;
1908 cur->bc_ptrs[lev] = 1;
1909 }
1910 *stat = 1;
1911 return 0;
1912}
1913 339
1914/* 340/*
1915 * Insert the current record at the point referenced by cur. 341 * Allocate a new inode btree cursor.
1916 * The cursor may be inconsistent on return if splits have been done.
1917 */ 342 */
1918int /* error */ 343struct xfs_btree_cur * /* new inode btree cursor */
1919xfs_inobt_insert( 344xfs_inobt_init_cursor(
1920 xfs_btree_cur_t *cur, /* btree cursor */ 345 struct xfs_mount *mp, /* file system mount point */
1921 int *stat) /* success/failure */ 346 struct xfs_trans *tp, /* transaction pointer */
347 struct xfs_buf *agbp, /* buffer for agi structure */
348 xfs_agnumber_t agno) /* allocation group number */
1922{ 349{
1923 int error; /* error return value */ 350 struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
1924 int i; /* result value, 0 for failure */ 351 struct xfs_btree_cur *cur;
1925 int level; /* current level number in btree */
1926 xfs_agblock_t nbno; /* new block number (split result) */
1927 xfs_btree_cur_t *ncur; /* new cursor (split result) */
1928 xfs_inobt_rec_t nrec; /* record being inserted this level */
1929 xfs_btree_cur_t *pcur; /* previous level's cursor */
1930 352
1931 level = 0; 353 cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
1932 nbno = NULLAGBLOCK;
1933 nrec.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
1934 nrec.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
1935 nrec.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
1936 ncur = NULL;
1937 pcur = cur;
1938 /*
1939 * Loop going up the tree, starting at the leaf level.
1940 * Stop when we don't get a split block, that must mean that
1941 * the insert is finished with this level.
1942 */
1943 do {
1944 /*
1945 * Insert nrec/nbno into this level of the tree.
1946 * Note if we fail, nbno will be null.
1947 */
1948 if ((error = xfs_inobt_insrec(pcur, level++, &nbno, &nrec, &ncur,
1949 &i))) {
1950 if (pcur != cur)
1951 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
1952 return error;
1953 }
1954 /*
1955 * See if the cursor we just used is trash.
1956 * Can't trash the caller's cursor, but otherwise we should
1957 * if ncur is a new cursor or we're about to be done.
1958 */
1959 if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) {
1960 cur->bc_nlevels = pcur->bc_nlevels;
1961 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
1962 }
1963 /*
1964 * If we got a new cursor, switch to it.
1965 */
1966 if (ncur) {
1967 pcur = ncur;
1968 ncur = NULL;
1969 }
1970 } while (nbno != NULLAGBLOCK);
1971 *stat = i;
1972 return 0;
1973}
1974 354
1975/* 355 cur->bc_tp = tp;
1976 * Lookup the record equal to ino in the btree given by cur. 356 cur->bc_mp = mp;
1977 */ 357 cur->bc_nlevels = be32_to_cpu(agi->agi_level);
1978int /* error */ 358 cur->bc_btnum = XFS_BTNUM_INO;
1979xfs_inobt_lookup_eq( 359 cur->bc_blocklog = mp->m_sb.sb_blocklog;
1980 xfs_btree_cur_t *cur, /* btree cursor */
1981 xfs_agino_t ino, /* starting inode of chunk */
1982 __int32_t fcnt, /* free inode count */
1983 xfs_inofree_t free, /* free inode mask */
1984 int *stat) /* success/failure */
1985{
1986 cur->bc_rec.i.ir_startino = ino;
1987 cur->bc_rec.i.ir_freecount = fcnt;
1988 cur->bc_rec.i.ir_free = free;
1989 return xfs_inobt_lookup(cur, XFS_LOOKUP_EQ, stat);
1990}
1991 360
1992/* 361 cur->bc_ops = &xfs_inobt_ops;
1993 * Lookup the first record greater than or equal to ino
1994 * in the btree given by cur.
1995 */
1996int /* error */
1997xfs_inobt_lookup_ge(
1998 xfs_btree_cur_t *cur, /* btree cursor */
1999 xfs_agino_t ino, /* starting inode of chunk */
2000 __int32_t fcnt, /* free inode count */
2001 xfs_inofree_t free, /* free inode mask */
2002 int *stat) /* success/failure */
2003{
2004 cur->bc_rec.i.ir_startino = ino;
2005 cur->bc_rec.i.ir_freecount = fcnt;
2006 cur->bc_rec.i.ir_free = free;
2007 return xfs_inobt_lookup(cur, XFS_LOOKUP_GE, stat);
2008}
2009 362
2010/* 363 cur->bc_private.a.agbp = agbp;
2011 * Lookup the first record less than or equal to ino 364 cur->bc_private.a.agno = agno;
2012 * in the btree given by cur. 365
2013 */ 366 return cur;
2014int /* error */
2015xfs_inobt_lookup_le(
2016 xfs_btree_cur_t *cur, /* btree cursor */
2017 xfs_agino_t ino, /* starting inode of chunk */
2018 __int32_t fcnt, /* free inode count */
2019 xfs_inofree_t free, /* free inode mask */
2020 int *stat) /* success/failure */
2021{
2022 cur->bc_rec.i.ir_startino = ino;
2023 cur->bc_rec.i.ir_freecount = fcnt;
2024 cur->bc_rec.i.ir_free = free;
2025 return xfs_inobt_lookup(cur, XFS_LOOKUP_LE, stat);
2026} 367}
2027 368
2028/* 369/*
2029 * Update the record referred to by cur, to the value given 370 * Calculate number of records in an inobt btree block.
2030 * by [ino, fcnt, free].
2031 * This either works (return 0) or gets an EFSCORRUPTED error.
2032 */ 371 */
2033int /* error */ 372int
2034xfs_inobt_update( 373xfs_inobt_maxrecs(
2035 xfs_btree_cur_t *cur, /* btree cursor */ 374 struct xfs_mount *mp,
2036 xfs_agino_t ino, /* starting inode of chunk */ 375 int blocklen,
2037 __int32_t fcnt, /* free inode count */ 376 int leaf)
2038 xfs_inofree_t free) /* free inode mask */
2039{ 377{
2040 xfs_inobt_block_t *block; /* btree block to update */ 378 blocklen -= XFS_INOBT_BLOCK_LEN(mp);
2041 xfs_buf_t *bp; /* buffer containing btree block */
2042 int error; /* error return value */
2043 int ptr; /* current record number (updating) */
2044 xfs_inobt_rec_t *rp; /* pointer to updated record */
2045 379
2046 /* 380 if (leaf)
2047 * Pick up the current block. 381 return blocklen / sizeof(xfs_inobt_rec_t);
2048 */ 382 return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
2049 bp = cur->bc_bufs[0];
2050 block = XFS_BUF_TO_INOBT_BLOCK(bp);
2051#ifdef DEBUG
2052 if ((error = xfs_btree_check_sblock(cur, block, 0, bp)))
2053 return error;
2054#endif
2055 /*
2056 * Get the address of the rec to be updated.
2057 */
2058 ptr = cur->bc_ptrs[0];
2059 rp = XFS_INOBT_REC_ADDR(block, ptr, cur);
2060 /*
2061 * Fill in the new contents and log them.
2062 */
2063 rp->ir_startino = cpu_to_be32(ino);
2064 rp->ir_freecount = cpu_to_be32(fcnt);
2065 rp->ir_free = cpu_to_be64(free);
2066 xfs_inobt_log_recs(cur, bp, ptr, ptr);
2067 /*
2068 * Updating first record in leaf. Pass new key value up to our parent.
2069 */
2070 if (ptr == 1) {
2071 xfs_inobt_key_t key; /* key containing [ino] */
2072
2073 key.ir_startino = cpu_to_be32(ino);
2074 if ((error = xfs_inobt_updkey(cur, &key, 1)))
2075 return error;
2076 }
2077 return 0;
2078} 383}
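The removed per-tree functions above (get_rec, increment, insert, the lookups, update) are not lost: they are subsumed by the generic fs/xfs/xfs_btree.c code, which dispatches through the xfs_inobt_ops table. What follows is a minimal userspace sketch of that ops-table pattern only; all names, sizes, and the single callback are illustrative, not the kernel API.

#include <stdio.h>

struct cur;				/* generic cursor, forward declared */

struct btree_ops {
	int	rec_len;		/* size of one on-disk record */
	int	(*get_maxrecs)(struct cur *cur, int level);
};

struct cur {
	const struct btree_ops	*ops;	/* per-tree-flavour vtable */
	int			blocklen;
};

/* inode-btree flavour: fixed-size records fill the block */
static int inobt_get_maxrecs(struct cur *cur, int level)
{
	return cur->blocklen / cur->ops->rec_len;
}

static const struct btree_ops inobt_ops = {
	.rec_len	= 16,
	.get_maxrecs	= inobt_get_maxrecs,
};

int main(void)
{
	struct cur cur = { .ops = &inobt_ops, .blocklen = 4096 };

	/* generic code only ever calls through cur->ops */
	printf("max records per block: %d\n",
	       cur.ops->get_maxrecs(&cur, 0));
	return 0;
}

The design choice mirrored here is that one generic walk/insert/delete implementation can serve every btree flavour once record length and the few format-specific operations are parameterised.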
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index 8efc4a5b8b92..37e5dd01a577 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -24,7 +24,6 @@
 
 struct xfs_buf;
 struct xfs_btree_cur;
-struct xfs_btree_sblock;
 struct xfs_mount;
 
 /*
@@ -70,11 +69,6 @@ typedef struct xfs_inobt_key {
 /* btree pointer type */
 typedef __be32 xfs_inobt_ptr_t;
 
-/* btree block header type */
-typedef struct xfs_btree_sblock xfs_inobt_block_t;
-
-#define XFS_BUF_TO_INOBT_BLOCK(bp)	((xfs_inobt_block_t *)XFS_BUF_PTR(bp))
-
 /*
  * Bit manipulations for ir_free.
  */
@@ -85,14 +79,6 @@ typedef struct xfs_btree_sblock xfs_inobt_block_t;
 #define	XFS_INOBT_CLR_FREE(rp,i)	((rp)->ir_free &= ~XFS_INOBT_MASK(i))
 
 /*
- * Real block structures have a size equal to the disk block size.
- */
-#define	XFS_INOBT_BLOCK_MAXRECS(lev,cur) ((cur)->bc_mp->m_inobt_mxr[lev != 0])
-#define	XFS_INOBT_BLOCK_MINRECS(lev,cur) ((cur)->bc_mp->m_inobt_mnr[lev != 0])
-#define	XFS_INOBT_IS_LAST_REC(cur)	\
-	((cur)->bc_ptrs[0] == be16_to_cpu(XFS_BUF_TO_INOBT_BLOCK((cur)->bc_bufs[0])->bb_numrecs))
-
-/*
  * Maximum number of inode btree levels.
  */
 #define	XFS_IN_MAXLEVELS(mp)		((mp)->m_in_maxlevels)
@@ -104,75 +90,38 @@ typedef struct xfs_btree_sblock xfs_inobt_block_t;
 #define	XFS_PREALLOC_BLOCKS(mp)	((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
 
 /*
- * Record, key, and pointer address macros for btree blocks.
- */
-#define XFS_INOBT_REC_ADDR(bb,i,cur) \
-	(XFS_BTREE_REC_ADDR(xfs_inobt, bb, i))
-
-#define XFS_INOBT_KEY_ADDR(bb,i,cur) \
-	(XFS_BTREE_KEY_ADDR(xfs_inobt, bb, i))
-
-#define XFS_INOBT_PTR_ADDR(bb,i,cur) \
-	(XFS_BTREE_PTR_ADDR(xfs_inobt, bb, \
-				i, XFS_INOBT_BLOCK_MAXRECS(1, cur)))
-
-/*
- * Decrement cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-extern int xfs_inobt_decrement(struct xfs_btree_cur *cur, int level, int *stat);
-
-/*
- * Delete the record pointed to by cur.
- * The cursor refers to the place where the record was (could be inserted)
- * when the operation returns.
- */
-extern int xfs_inobt_delete(struct xfs_btree_cur *cur, int *stat);
-
-/*
- * Get the data from the pointed-to record.
- */
-extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino,
-			     __int32_t *fcnt, xfs_inofree_t *free, int *stat);
-
-/*
- * Increment cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-extern int xfs_inobt_increment(struct xfs_btree_cur *cur, int level, int *stat);
-
-/*
- * Insert the current record at the point referenced by cur.
- * The cursor may be inconsistent on return if splits have been done.
- */
-extern int xfs_inobt_insert(struct xfs_btree_cur *cur, int *stat);
-
-/*
- * Lookup the record equal to ino in the btree given by cur.
- */
-extern int xfs_inobt_lookup_eq(struct xfs_btree_cur *cur, xfs_agino_t ino,
-		__int32_t fcnt, xfs_inofree_t free, int *stat);
-
-/*
- * Lookup the first record greater than or equal to ino
- * in the btree given by cur.
- */
-extern int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino,
-		__int32_t fcnt, xfs_inofree_t free, int *stat);
-
-/*
- * Lookup the first record less than or equal to ino
- * in the btree given by cur.
+ * Btree block header size depends on a superblock flag.
+ *
+ * (not quite yet, but soon)
  */
-extern int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino,
-		__int32_t fcnt, xfs_inofree_t free, int *stat);
+#define XFS_INOBT_BLOCK_LEN(mp)	XFS_BTREE_SBLOCK_LEN
 
 /*
- * Update the record referred to by cur, to the value given
- * by [ino, fcnt, free].
- * This either works (return 0) or gets an EFSCORRUPTED error.
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
  */
-extern int xfs_inobt_update(struct xfs_btree_cur *cur, xfs_agino_t ino,
-		__int32_t fcnt, xfs_inofree_t free);
+#define XFS_INOBT_REC_ADDR(mp, block, index) \
+	((xfs_inobt_rec_t *) \
+		((char *)(block) + \
+		 XFS_INOBT_BLOCK_LEN(mp) + \
+		 (((index) - 1) * sizeof(xfs_inobt_rec_t))))
+
+#define XFS_INOBT_KEY_ADDR(mp, block, index) \
+	((xfs_inobt_key_t *) \
+		((char *)(block) + \
+		 XFS_INOBT_BLOCK_LEN(mp) + \
+		 ((index) - 1) * sizeof(xfs_inobt_key_t)))
+
+#define XFS_INOBT_PTR_ADDR(mp, block, index, maxrecs) \
+	((xfs_inobt_ptr_t *) \
+		((char *)(block) + \
+		 XFS_INOBT_BLOCK_LEN(mp) + \
+		 (maxrecs) * sizeof(xfs_inobt_key_t) + \
+		 ((index) - 1) * sizeof(xfs_inobt_ptr_t)))
+
+extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
+		struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t);
+extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
 
 #endif	/* __XFS_IALLOC_BTREE_H__ */
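The new address macros above locate entries by plain byte arithmetic from the end of the block header: leaf blocks pack records back to back, and node blocks pack maxrecs keys before the pointer array. A small compilable sketch of the same arithmetic follows; the header length and record layout here are invented for illustration, not taken from the on-disk format.

#include <stdint.h>
#include <stdio.h>

#define BLOCK_HDR_LEN	16	/* stand-in for XFS_INOBT_BLOCK_LEN(mp) */

struct rec { uint32_t startino; uint32_t freecount; uint64_t free; };
struct key { uint32_t startino; };
typedef uint32_t ptr_t;

/* 1-based index, as in the kernel macros */
static struct rec *rec_addr(void *block, int index)
{
	return (struct rec *)((char *)block + BLOCK_HDR_LEN +
			      (index - 1) * sizeof(struct rec));
}

/* pointers live after the full key array in a node block */
static ptr_t *ptr_addr(void *block, int index, int maxrecs)
{
	return (ptr_t *)((char *)block + BLOCK_HDR_LEN +
			 maxrecs * sizeof(struct key) +
			 (index - 1) * sizeof(ptr_t));
}

int main(void)
{
	char block[4096] = { 0 };

	printf("rec 1 at offset %td, rec 2 at %td, ptr 1 at %td\n",
	       (char *)rec_addr(block, 1) - block,
	       (char *)rec_addr(block, 2) - block,
	       (char *)ptr_addr(block, 1, 100) - block);
	return 0;
}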
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index e229e9e001c2..e2fb6210d4c5 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -38,281 +38,283 @@
 #include "xfs_ialloc.h"
 #include "xfs_quota.h"
 #include "xfs_utils.h"
+#include "xfs_trans_priv.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_btree_trace.h"
+#include "xfs_dir2_trace.h"
+
 
 /*
- * Look up an inode by number in the given file system.
- * The inode is looked up in the cache held in each AG.
- * If the inode is found in the cache, attach it to the provided
- * vnode.
- *
- * If it is not in core, read it in from the file system's device,
- * add it to the cache and attach the provided vnode.
- *
- * The inode is locked according to the value of the lock_flags parameter.
- * This flag parameter indicates how and if the inode's IO lock and inode lock
- * should be taken.
- *
- * mp -- the mount point structure for the current file system.  It points
- *       to the inode hash table.
- * tp -- a pointer to the current transaction if there is one.  This is
- *       simply passed through to the xfs_iread() call.
- * ino -- the number of the inode desired.  This is the unique identifier
- *        within the file system for the inode being requested.
- * lock_flags -- flags indicating how to lock the inode.  See the comment
- *		 for xfs_ilock() for a list of valid values.
- * bno -- the block number starting the buffer containing the inode,
- *	  if known (as by bulkstat), else 0.
+ * Allocate and initialise an xfs_inode.
  */
-STATIC int
-xfs_iget_core(
-	struct inode	*inode,
-	xfs_mount_t	*mp,
-	xfs_trans_t	*tp,
-	xfs_ino_t	ino,
-	uint		flags,
-	uint		lock_flags,
-	xfs_inode_t	**ipp,
-	xfs_daddr_t	bno)
+STATIC struct xfs_inode *
+xfs_inode_alloc(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino)
 {
-	struct inode	*old_inode;
-	xfs_inode_t	*ip;
-	xfs_inode_t	*iq;
-	int		error;
-	unsigned long	first_index, mask;
-	xfs_perag_t	*pag;
-	xfs_agino_t	agino;
+	struct xfs_inode	*ip;
 
-	/* the radix tree exists only in inode capable AGs */
-	if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi)
-		return EINVAL;
+	/*
+	 * if this didn't occur in transactions, we could use
+	 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
+	 * code up to do this anyway.
+	 */
+	ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
+	if (!ip)
+		return NULL;
 
-	/* get the perag structure and ensure that it's inode capable */
-	pag = xfs_get_perag(mp, ino);
-	if (!pag->pagi_inodeok)
-		return EINVAL;
-	ASSERT(pag->pag_ici_init);
-	agino = XFS_INO_TO_AGINO(mp, ino);
+	ASSERT(atomic_read(&ip->i_iocount) == 0);
+	ASSERT(atomic_read(&ip->i_pincount) == 0);
+	ASSERT(!spin_is_locked(&ip->i_flags_lock));
+	ASSERT(completion_done(&ip->i_flush));
 
-again:
-	read_lock(&pag->pag_ici_lock);
-	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+	/*
+	 * initialise the VFS inode here to get failures
+	 * out of the way early.
+	 */
+	if (!inode_init_always(mp->m_super, VFS_I(ip))) {
+		kmem_zone_free(xfs_inode_zone, ip);
+		return NULL;
+	}
+
+	/* initialise the xfs inode */
+	ip->i_ino = ino;
+	ip->i_mount = mp;
+	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
+	ip->i_afp = NULL;
+	memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
+	ip->i_flags = 0;
+	ip->i_update_core = 0;
+	ip->i_update_size = 0;
+	ip->i_delayed_blks = 0;
+	memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
+	ip->i_size = 0;
+	ip->i_new_size = 0;
+
+	/*
+	 * Initialize inode's trace buffers.
+	 */
+#ifdef	XFS_INODE_TRACE
+	ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS);
+#endif
+#ifdef XFS_BMAP_TRACE
+	ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS);
+#endif
+#ifdef XFS_BTREE_TRACE
+	ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS);
+#endif
+#ifdef XFS_RW_TRACE
+	ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_NOFS);
+#endif
+#ifdef XFS_ILOCK_TRACE
+	ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_NOFS);
+#endif
+#ifdef XFS_DIR2_TRACE
+	ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
+#endif
+
+	return ip;
+}
+
+/*
+ * Check the validity of the inode we just found it the cache
+ */
+static int
+xfs_iget_cache_hit(
+	struct xfs_perag	*pag,
+	struct xfs_inode	*ip,
+	int			flags,
+	int			lock_flags) __releases(pag->pag_ici_lock)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	int			error = EAGAIN;
+
+	/*
+	 * If INEW is set this inode is being set up
+	 * If IRECLAIM is set this inode is being torn down
+	 * Pause and try again.
+	 */
+	if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) {
+		XFS_STATS_INC(xs_ig_frecycle);
+		goto out_error;
+	}
+
+	/* If IRECLAIMABLE is set, we've torn down the vfs inode part */
+	if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
 
-	if (ip != NULL) {
 		/*
-		 * If INEW is set this inode is being set up
-		 * we need to pause and try again.
+		 * If lookup is racing with unlink, then we should return an
+		 * error immediately so we don't remove it from the reclaim
+		 * list and potentially leak the inode.
 		 */
-		if (xfs_iflags_test(ip, XFS_INEW)) {
-			read_unlock(&pag->pag_ici_lock);
-			delay(1);
-			XFS_STATS_INC(xs_ig_frecycle);
-
-			goto again;
+		if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
+			error = ENOENT;
+			goto out_error;
 		}
 
-		old_inode = ip->i_vnode;
-		if (old_inode == NULL) {
-			/*
-			 * If IRECLAIM is set this inode is
-			 * on its way out of the system,
-			 * we need to pause and try again.
-			 */
-			if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
-				read_unlock(&pag->pag_ici_lock);
-				delay(1);
-				XFS_STATS_INC(xs_ig_frecycle);
-
-				goto again;
-			}
-			ASSERT(xfs_iflags_test(ip, XFS_IRECLAIMABLE));
-
-			/*
-			 * If lookup is racing with unlink, then we
-			 * should return an error immediately so we
-			 * don't remove it from the reclaim list and
-			 * potentially leak the inode.
-			 */
-			if ((ip->i_d.di_mode == 0) &&
-			    !(flags & XFS_IGET_CREATE)) {
-				read_unlock(&pag->pag_ici_lock);
-				xfs_put_perag(mp, pag);
-				return ENOENT;
-			}
-
-			xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
-
-			XFS_STATS_INC(xs_ig_found);
-			xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
-			read_unlock(&pag->pag_ici_lock);
-
-			XFS_MOUNT_ILOCK(mp);
-			list_del_init(&ip->i_reclaim);
-			XFS_MOUNT_IUNLOCK(mp);
-
-			goto finish_inode;
-
-		} else if (inode != old_inode) {
-			/* The inode is being torn down, pause and
-			 * try again.
-			 */
-			if (old_inode->i_state & (I_FREEING | I_CLEAR)) {
-				read_unlock(&pag->pag_ici_lock);
-				delay(1);
-				XFS_STATS_INC(xs_ig_frecycle);
-
-				goto again;
-			}
-/* Chances are the other vnode (the one in the inode) is being torn
-* down right now, and we landed on top of it. Question is, what do
-* we do? Unhook the old inode and hook up the new one?
-*/
-			cmn_err(CE_PANIC,
-		"xfs_iget_core: ambiguous vns: vp/0x%p, invp/0x%p",
-					old_inode, inode);
-		}
+		xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
 
 		/*
-		 * Inode cache hit
+		 * We need to re-initialise the VFS inode as it has been
+		 * 'freed' by the VFS. Do this here so we can deal with
+		 * errors cleanly, then tag it so it can be set up correctly
+		 * later.
 		 */
-		read_unlock(&pag->pag_ici_lock);
-		XFS_STATS_INC(xs_ig_found);
-
-finish_inode:
-		if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
-			xfs_put_perag(mp, pag);
-			return ENOENT;
+		if (!inode_init_always(mp->m_super, VFS_I(ip))) {
+			error = ENOMEM;
+			goto out_error;
 		}
 
-		if (lock_flags != 0)
-			xfs_ilock(ip, lock_flags);
+		/*
+		 * We must set the XFS_INEW flag before clearing the
+		 * XFS_IRECLAIMABLE flag so that if a racing lookup does
+		 * not find the XFS_IRECLAIMABLE above but has the igrab()
+		 * below succeed we can safely check XFS_INEW to detect
+		 * that this inode is still being initialised.
+		 */
+		xfs_iflags_set(ip, XFS_INEW);
+		xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
+
+		/* clear the radix tree reclaim flag as well. */
+		__xfs_inode_clear_reclaim_tag(mp, pag, ip);
+	} else if (!igrab(VFS_I(ip))) {
+		/* If the VFS inode is being torn down, pause and try again. */
+		XFS_STATS_INC(xs_ig_frecycle);
+		goto out_error;
+	} else if (xfs_iflags_test(ip, XFS_INEW)) {
+		/*
+		 * We are racing with another cache hit that is
+		 * currently recycling this inode out of the XFS_IRECLAIMABLE
+		 * state. Wait for the initialisation to complete before
+		 * continuing.
+		 */
+		wait_on_inode(VFS_I(ip));
+	}
 
-		xfs_iflags_clear(ip, XFS_ISTALE);
-		xfs_itrace_exit_tag(ip, "xfs_iget.found");
-		goto return_ip;
+	if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+		error = ENOENT;
+		iput(VFS_I(ip));
+		goto out_error;
 	}
 
-	/*
-	 * Inode cache miss
-	 */
+	/* We've got a live one. */
 	read_unlock(&pag->pag_ici_lock);
-	XFS_STATS_INC(xs_ig_missed);
 
-	/*
-	 * Read the disk inode attributes into a new inode structure and get
-	 * a new vnode for it. This should also initialize i_ino and i_mount.
-	 */
-	error = xfs_iread(mp, tp, ino, &ip, bno,
-			  (flags & XFS_IGET_BULKSTAT) ? XFS_IMAP_BULKSTAT : 0);
-	if (error) {
-		xfs_put_perag(mp, pag);
-		return error;
-	}
+	if (lock_flags != 0)
+		xfs_ilock(ip, lock_flags);
 
-	xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
+	xfs_iflags_clear(ip, XFS_ISTALE);
+	xfs_itrace_exit_tag(ip, "xfs_iget.found");
+	XFS_STATS_INC(xs_ig_found);
+	return 0;
+
+out_error:
+	read_unlock(&pag->pag_ici_lock);
+	return error;
+}
 
 
-	mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
-		     "xfsino", ip->i_ino);
-	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
-	init_waitqueue_head(&ip->i_ipin_wait);
-	atomic_set(&ip->i_pincount, 0);
+static int
+xfs_iget_cache_miss(
+	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
+	xfs_trans_t		*tp,
+	xfs_ino_t		ino,
+	struct xfs_inode	**ipp,
+	xfs_daddr_t		bno,
+	int			flags,
+	int			lock_flags) __releases(pag->pag_ici_lock)
+{
+	struct xfs_inode	*ip;
+	int			error;
+	unsigned long		first_index, mask;
+	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
 
-	/*
-	 * Because we want to use a counting completion, complete
-	 * the flush completion once to allow a single access to
-	 * the flush completion without blocking.
-	 */
-	init_completion(&ip->i_flush);
-	complete(&ip->i_flush);
+	ip = xfs_inode_alloc(mp, ino);
+	if (!ip)
+		return ENOMEM;
 
-	if (lock_flags)
-		xfs_ilock(ip, lock_flags);
+	error = xfs_iread(mp, tp, ip, bno, flags);
+	if (error)
+		goto out_destroy;
+
+	xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
 
 	if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
-		xfs_idestroy(ip);
-		xfs_put_perag(mp, pag);
-		return ENOENT;
+		error = ENOENT;
+		goto out_destroy;
 	}
 
+	if (lock_flags)
+		xfs_ilock(ip, lock_flags);
+
 	/*
 	 * Preload the radix tree so we can insert safely under the
-	 * write spinlock.
+	 * write spinlock. Note that we cannot sleep inside the preload
+	 * region.
 	 */
 	if (radix_tree_preload(GFP_KERNEL)) {
-		xfs_idestroy(ip);
-		delay(1);
-		goto again;
+		error = EAGAIN;
+		goto out_unlock;
 	}
+
 	mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
 	first_index = agino & mask;
 	write_lock(&pag->pag_ici_lock);
-	/*
-	 * insert the new inode
-	 */
+
+	/* insert the new inode */
 	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
 	if (unlikely(error)) {
-		BUG_ON(error != -EEXIST);
-		write_unlock(&pag->pag_ici_lock);
-		radix_tree_preload_end();
-		xfs_idestroy(ip);
+		WARN_ON(error != -EEXIST);
 		XFS_STATS_INC(xs_ig_dup);
-		goto again;
+		error = EAGAIN;
+		goto out_preload_end;
 	}
 
-	/*
-	 * These values _must_ be set before releasing the radix tree lock!
-	 */
+	/* These values _must_ be set before releasing the radix tree lock! */
 	ip->i_udquot = ip->i_gdquot = NULL;
 	xfs_iflags_set(ip, XFS_INEW);
 
 	write_unlock(&pag->pag_ici_lock);
 	radix_tree_preload_end();
-
-	/*
-	 * Link ip to its mount and thread it on the mount's inode list.
-	 */
-	XFS_MOUNT_ILOCK(mp);
-	if ((iq = mp->m_inodes)) {
-		ASSERT(iq->i_mprev->i_mnext == iq);
-		ip->i_mprev = iq->i_mprev;
-		iq->i_mprev->i_mnext = ip;
-		iq->i_mprev = ip;
-		ip->i_mnext = iq;
-	} else {
-		ip->i_mnext = ip;
-		ip->i_mprev = ip;
-	}
-	mp->m_inodes = ip;
-
-	XFS_MOUNT_IUNLOCK(mp);
-	xfs_put_perag(mp, pag);
-
- return_ip:
-	ASSERT(ip->i_df.if_ext_max ==
-	       XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
-
-	xfs_iflags_set(ip, XFS_IMODIFIED);
 	*ipp = ip;
-
-	/*
-	 * Set up the Linux with the Linux inode.
-	 */
-	ip->i_vnode = inode;
-	inode->i_private = ip;
-
-	/*
-	 * If we have a real type for an on-disk inode, we can set ops(&unlock)
-	 * now.	 If it's a new inode being created, xfs_ialloc will handle it.
-	 */
-	if (ip->i_d.di_mode != 0)
-		xfs_setup_inode(ip);
 	return 0;
-}
 
+out_preload_end:
+	write_unlock(&pag->pag_ici_lock);
+	radix_tree_preload_end();
+out_unlock:
+	if (lock_flags)
+		xfs_iunlock(ip, lock_flags);
+out_destroy:
+	xfs_destroy_inode(ip);
+	return error;
+}
 
 /*
- * The 'normal' internal xfs_iget, if needed it will
- * 'allocate', or 'get', the vnode.
+ * Look up an inode by number in the given file system.
+ * The inode is looked up in the cache held in each AG.
+ * If the inode is found in the cache, initialise the vfs inode
+ * if necessary.
+ *
+ * If it is not in core, read it in from the file system's device,
+ * add it to the cache and initialise the vfs inode.
+ *
+ * The inode is locked according to the value of the lock_flags parameter.
+ * This flag parameter indicates how and if the inode's IO lock and inode lock
+ * should be taken.
+ *
+ * mp -- the mount point structure for the current file system.  It points
+ *       to the inode hash table.
+ * tp -- a pointer to the current transaction if there is one.  This is
+ *       simply passed through to the xfs_iread() call.
+ * ino -- the number of the inode desired.  This is the unique identifier
+ *        within the file system for the inode being requested.
+ * lock_flags -- flags indicating how to lock the inode.  See the comment
+ *		 for xfs_ilock() for a list of valid values.
+ * bno -- the block number starting the buffer containing the inode,
+ *	  if known (as by bulkstat), else 0.
  */
 int
 xfs_iget(
@@ -324,61 +326,64 @@ xfs_iget(
 	xfs_inode_t	**ipp,
 	xfs_daddr_t	bno)
 {
-	struct inode	*inode;
 	xfs_inode_t	*ip;
 	int		error;
+	xfs_perag_t	*pag;
+	xfs_agino_t	agino;
 
-	XFS_STATS_INC(xs_ig_attempts);
+	/* the radix tree exists only in inode capable AGs */
+	if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi)
+		return EINVAL;
 
-retry:
-	inode = iget_locked(mp->m_super, ino);
-	if (!inode)
-		/* If we got no inode we are out of memory */
-		return ENOMEM;
+	/* get the perag structure and ensure that it's inode capable */
+	pag = xfs_get_perag(mp, ino);
+	if (!pag->pagi_inodeok)
+		return EINVAL;
+	ASSERT(pag->pag_ici_init);
+	agino = XFS_INO_TO_AGINO(mp, ino);
 
-	if (inode->i_state & I_NEW) {
-		XFS_STATS_INC(vn_active);
-		XFS_STATS_INC(vn_alloc);
-
-		error = xfs_iget_core(inode, mp, tp, ino, flags,
-				lock_flags, ipp, bno);
-		if (error) {
-			make_bad_inode(inode);
-			if (inode->i_state & I_NEW)
-				unlock_new_inode(inode);
-			iput(inode);
-		}
-		return error;
+again:
+	error = 0;
+	read_lock(&pag->pag_ici_lock);
+	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+
+	if (ip) {
+		error = xfs_iget_cache_hit(pag, ip, flags, lock_flags);
+		if (error)
+			goto out_error_or_again;
+	} else {
+		read_unlock(&pag->pag_ici_lock);
+		XFS_STATS_INC(xs_ig_missed);
+
+		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, bno,
+							flags, lock_flags);
+		if (error)
+			goto out_error_or_again;
 	}
+	xfs_put_perag(mp, pag);
 
+	*ipp = ip;
+
+	ASSERT(ip->i_df.if_ext_max ==
+	       XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
 	/*
-	 * If the inode is not fully constructed due to
-	 * filehandle mismatches wait for the inode to go
-	 * away and try again.
-	 *
-	 * iget_locked will call __wait_on_freeing_inode
-	 * to wait for the inode to go away.
+	 * If we have a real type for an on-disk inode, we can set ops(&unlock)
+	 * now.	 If it's a new inode being created, xfs_ialloc will handle it.
 	 */
-	if (is_bad_inode(inode)) {
-		iput(inode);
-		delay(1);
-		goto retry;
-	}
+	if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
+		xfs_setup_inode(ip);
+	return 0;
 
-	ip = XFS_I(inode);
-	if (!ip) {
-		iput(inode);
+out_error_or_again:
+	if (error == EAGAIN) {
 		delay(1);
-		goto retry;
+		goto again;
 	}
-
-	if (lock_flags != 0)
-		xfs_ilock(ip, lock_flags);
-	XFS_STATS_INC(xs_ig_found);
-	*ipp = ip;
-	return 0;
+	xfs_put_perag(mp, pag);
+	return error;
 }
 
+
 /*
  * Look for the inode corresponding to the given ino in the hash table.
  * If it is there and its i_transp pointer matches tp, return it.
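The rewritten xfs_iget above funnels all transient races (inode mid-setup or mid-teardown, a lost radix-tree insert race) into a single EAGAIN return that the caller turns into delay-and-retry, instead of each path open-coding its own delay-and-goto. A hedged userspace sketch of that convention follows, with stand-in functions rather than the kernel helpers.

#include <errno.h>
#include <stdio.h>
#include <unistd.h>

static int attempts;

/* pretend the first two lookups race with a state transition */
static int cache_lookup(int ino)
{
	if (attempts++ < 2)
		return EAGAIN;
	return 0;
}

static int iget(int ino)
{
	int error;
again:
	error = cache_lookup(ino);
	if (error == EAGAIN) {
		usleep(1000);	/* the kernel code uses delay(1) */
		goto again;
	}
	return error;
}

int main(void)
{
	printf("iget -> %d after %d attempts\n", iget(42), attempts);
	return 0;
}

The point of the refactor is that only one function owns the back-off policy; the helpers merely report "try again".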
@@ -444,99 +449,109 @@ xfs_iput_new(
 	IRELE(ip);
 }
 
-
 /*
- * This routine embodies the part of the reclaim code that pulls
- * the inode from the inode hash table and the mount structure's
- * inode list.
- * This should only be called from xfs_reclaim().
+ * This is called free all the memory associated with an inode.
+ * It must free the inode itself and any buffers allocated for
+ * if_extents/if_data and if_broot. It must also free the lock
+ * associated with the inode.
+ *
+ * Note: because we don't initialise everything on reallocation out
+ * of the zone, we must ensure we nullify everything correctly before
+ * freeing the structure.
  */
 void
-xfs_ireclaim(xfs_inode_t *ip)
+xfs_ireclaim(
+	struct xfs_inode	*ip)
 {
-	/*
-	 * Remove from old hash list and mount list.
-	 */
-	XFS_STATS_INC(xs_ig_reclaims);
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_perag	*pag;
 
-	xfs_iextract(ip);
-
-	/*
-	 * Here we do a spurious inode lock in order to coordinate with
-	 * xfs_sync().	This is because xfs_sync() references the inodes
-	 * in the mount list without taking references on the corresponding
-	 * vnodes.  We make that OK here by ensuring that we wait until
-	 * the inode is unlocked in xfs_sync() before we go ahead and
-	 * free it.  We get both the regular lock and the io lock because
-	 * the xfs_sync() code may need to drop the regular one but will
-	 * still hold the io lock.
-	 */
-	xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-
-	/*
-	 * Release dquots (and their references) if any. An inode may escape
-	 * xfs_inactive and get here via vn_alloc->vn_reclaim path.
-	 */
-	XFS_QM_DQDETACH(ip->i_mount, ip);
-
-	/*
-	 * Pull our behavior descriptor from the vnode chain.
-	 */
-	if (ip->i_vnode) {
-		ip->i_vnode->i_private = NULL;
-		ip->i_vnode = NULL;
-	}
+	XFS_STATS_INC(xs_ig_reclaims);
 
 	/*
-	 * Free all memory associated with the inode.
+	 * Remove the inode from the per-AG radix tree.  It doesn't matter
+	 * if it was never added to it because radix_tree_delete can deal
+	 * with that case just fine.
 	 */
-	xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-	xfs_idestroy(ip);
-}
-
-/*
- * This routine removes an about-to-be-destroyed inode from
- * all of the lists in which it is located with the exception
- * of the behavior chain.
- */
-void
-xfs_iextract(
-	xfs_inode_t	*ip)
-{
-	xfs_mount_t	*mp = ip->i_mount;
-	xfs_perag_t	*pag = xfs_get_perag(mp, ip->i_ino);
-	xfs_inode_t	*iq;
-
+	pag = xfs_get_perag(mp, ip->i_ino);
 	write_lock(&pag->pag_ici_lock);
 	radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino));
 	write_unlock(&pag->pag_ici_lock);
 	xfs_put_perag(mp, pag);
 
 	/*
-	 * Remove from mount's inode list.
+	 * Here we do an (almost) spurious inode lock in order to coordinate
+	 * with inode cache radix tree lookups.  This is because the lookup
+	 * can reference the inodes in the cache without taking references.
+	 *
+	 * We make that OK here by ensuring that we wait until the inode is
+	 * unlocked after the lookup before we go ahead and free it.  We get
+	 * both the ilock and the iolock because the code may need to drop the
+	 * ilock one but will still hold the iolock.
 	 */
-	XFS_MOUNT_ILOCK(mp);
-	ASSERT((ip->i_mnext != NULL) && (ip->i_mprev != NULL));
-	iq = ip->i_mnext;
-	iq->i_mprev = ip->i_mprev;
-	ip->i_mprev->i_mnext = iq;
-
+	xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 	/*
-	 * Fix up the head pointer if it points to the inode being deleted.
+	 * Release dquots (and their references) if any.
 	 */
-	if (mp->m_inodes == ip) {
-		if (ip == iq) {
-			mp->m_inodes = NULL;
-		} else {
-			mp->m_inodes = iq;
-		}
+	XFS_QM_DQDETACH(ip->i_mount, ip);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+
+	switch (ip->i_d.di_mode & S_IFMT) {
+	case S_IFREG:
+	case S_IFDIR:
+	case S_IFLNK:
+		xfs_idestroy_fork(ip, XFS_DATA_FORK);
+		break;
 	}
 
-	/* Deal with the deleted inodes list */
-	list_del_init(&ip->i_reclaim);
+	if (ip->i_afp)
+		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
 
-	mp->m_ireclaims++;
-	XFS_MOUNT_IUNLOCK(mp);
+#ifdef XFS_INODE_TRACE
+	ktrace_free(ip->i_trace);
+#endif
+#ifdef XFS_BMAP_TRACE
+	ktrace_free(ip->i_xtrace);
+#endif
+#ifdef XFS_BTREE_TRACE
+	ktrace_free(ip->i_btrace);
+#endif
+#ifdef XFS_RW_TRACE
+	ktrace_free(ip->i_rwtrace);
+#endif
+#ifdef XFS_ILOCK_TRACE
+	ktrace_free(ip->i_lock_trace);
+#endif
+#ifdef XFS_DIR2_TRACE
+	ktrace_free(ip->i_dir_trace);
+#endif
+	if (ip->i_itemp) {
+		/*
+		 * Only if we are shutting down the fs will we see an
+		 * inode still in the AIL. If it is there, we should remove
+		 * it to prevent a use-after-free from occurring.
+		 */
+		xfs_log_item_t	*lip = &ip->i_itemp->ili_item;
+		struct xfs_ail	*ailp = lip->li_ailp;
+
+		ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
+		       XFS_FORCED_SHUTDOWN(ip->i_mount));
+		if (lip->li_flags & XFS_LI_IN_AIL) {
+			spin_lock(&ailp->xa_lock);
+			if (lip->li_flags & XFS_LI_IN_AIL)
+				xfs_trans_ail_delete(ailp, lip);
+			else
+				spin_unlock(&ailp->xa_lock);
+		}
+		xfs_inode_item_destroy(ip);
+		ip->i_itemp = NULL;
+	}
+	/* asserts to verify all state is correct here */
+	ASSERT(atomic_read(&ip->i_iocount) == 0);
+	ASSERT(atomic_read(&ip->i_pincount) == 0);
+	ASSERT(!spin_is_locked(&ip->i_flags_lock));
+	ASSERT(completion_done(&ip->i_flush));
+	kmem_zone_free(xfs_inode_zone, ip);
 }
 
 /*
@@ -737,7 +752,7 @@ xfs_iunlock(
 	 * it is in the AIL and anyone is waiting on it.  Don't do
 	 * this if the caller has asked us not to.
 	 */
-		xfs_trans_unlocked_item(ip->i_mount,
+		xfs_trans_unlocked_item(ip->i_itemp->ili_item.li_ailp,
 					(xfs_log_item_t*)(ip->i_itemp));
 	}
 	xfs_ilock_trace(ip, 3, lock_flags, (inst_t *)__return_address);
@@ -790,3 +805,51 @@ xfs_isilocked(
 }
 #endif
 
+#ifdef	XFS_INODE_TRACE
+
+#define KTRACE_ENTER(ip, vk, s, line, ra)			\
+	ktrace_enter((ip)->i_trace,				\
+/*  0 */		(void *)(__psint_t)(vk),		\
+/*  1 */		(void *)(s),				\
+/*  2 */		(void *)(__psint_t) line,		\
+/*  3 */		(void *)(__psint_t)atomic_read(&VFS_I(ip)->i_count), \
+/*  4 */		(void *)(ra),				\
+/*  5 */		NULL,					\
+/*  6 */		(void *)(__psint_t)current_cpu(),	\
+/*  7 */		(void *)(__psint_t)current_pid(),	\
+/*  8 */		(void *)__return_address,		\
+/*  9 */		NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+
+/*
+ * Vnode tracing code.
+ */
+void
+_xfs_itrace_entry(xfs_inode_t *ip, const char *func, inst_t *ra)
+{
+	KTRACE_ENTER(ip, INODE_KTRACE_ENTRY, func, 0, ra);
+}
+
+void
+_xfs_itrace_exit(xfs_inode_t *ip, const char *func, inst_t *ra)
+{
+	KTRACE_ENTER(ip, INODE_KTRACE_EXIT, func, 0, ra);
+}
+
+void
+xfs_itrace_hold(xfs_inode_t *ip, char *file, int line, inst_t *ra)
+{
+	KTRACE_ENTER(ip, INODE_KTRACE_HOLD, file, line, ra);
+}
+
+void
+_xfs_itrace_ref(xfs_inode_t *ip, char *file, int line, inst_t *ra)
+{
+	KTRACE_ENTER(ip, INODE_KTRACE_REF, file, line, ra);
+}
+
+void
+xfs_itrace_rele(xfs_inode_t *ip, char *file, int line, inst_t *ra)
+{
+	KTRACE_ENTER(ip, INODE_KTRACE_RELE, file, line, ra);
+}
+#endif	/* XFS_INODE_TRACE */
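The KTRACE_ENTER macro added above logs each event into a fixed-size per-inode ktrace buffer that can later be dumped from a debugger. As an illustration only, a trace ring of this general shape can be sketched in a few lines of C; the size and fields here are invented, not the ktrace layout.

#include <stdio.h>

#define TRACE_SIZE	8	/* stand-in for INODE_TRACE_SIZE */

struct trace_entry { const char *func; int line; };

static struct trace_entry ring[TRACE_SIZE];
static unsigned int ring_index;

/* store events in a circular buffer; old entries are overwritten */
static void trace_enter(const char *func, int line)
{
	ring[ring_index++ % TRACE_SIZE] = (struct trace_entry){ func, line };
}

int main(void)
{
	for (int i = 0; i < 10; i++)
		trace_enter("xfs_iget", i);

	/* after wrapping, the ring holds only the most recent events */
	for (unsigned int i = 0; i < TRACE_SIZE; i++)
		printf("%s:%d\n", ring[i].func, ring[i].line);
	return 0;
}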
diff --git a/fs/xfs/xfs_imap.h b/fs/xfs/xfs_imap.h
deleted file mode 100644
index d36450003983..000000000000
--- a/fs/xfs/xfs_imap.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_IMAP_H__
-#define __XFS_IMAP_H__
-
-/*
- * This is the structure passed to xfs_imap() to map
- * an inode number to its on disk location.
- */
-typedef struct xfs_imap {
-	xfs_daddr_t	im_blkno;	/* starting BB of inode chunk */
-	uint		im_len;		/* length in BBs of inode chunk */
-	xfs_agblock_t	im_agblkno;	/* logical block of inode chunk in ag */
-	ushort		im_ioffset;	/* inode offset in block in "inodes" */
-	ushort		im_boffset;	/* inode offset in block in bytes */
-} xfs_imap_t;
-
-#ifdef __KERNEL__
-struct xfs_mount;
-struct xfs_trans;
-int	xfs_imap(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
-		 xfs_imap_t *, uint);
-#endif
-
-#endif	/* __XFS_IMAP_H__ */
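The deleted header's xfs_imap structure recorded where an inode lives on disk; after this change the same information travels with the inode itself as ip->i_imap. As a rough illustration of what such a map holds, here is a sketch with a made-up fixed geometry; real XFS derives the mapping from the superblock and inode clusters, not from constants like these.

#include <stdint.h>
#include <stdio.h>

#define BLOCK_SIZE	4096	/* illustrative, not sb_blocksize */
#define INODE_SIZE	256	/* illustrative, not sb_inodesize */
#define INODES_PER_BLOCK (BLOCK_SIZE / INODE_SIZE)

struct imap {
	uint64_t	im_blkno;	/* block holding the inode */
	uint32_t	im_boffset;	/* byte offset within that block */
};

/* map an inode number to its (pretend) on-disk location */
static struct imap imap_lookup(uint64_t ino)
{
	struct imap imap;

	imap.im_blkno = ino / INODES_PER_BLOCK;
	imap.im_boffset = (ino % INODES_PER_BLOCK) * INODE_SIZE;
	return imap;
}

int main(void)
{
	struct imap imap = imap_lookup(35);

	printf("ino 35 -> block %llu offset %u\n",
	       (unsigned long long)imap.im_blkno, imap.im_boffset);
	return 0;
}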
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index a391b955df01..5a5e035e5d38 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -23,7 +23,6 @@
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
-#include "xfs_imap.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
 #include "xfs_sb.h"
@@ -41,6 +40,7 @@
 #include "xfs_buf_item.h"
 #include "xfs_inode_item.h"
 #include "xfs_btree.h"
+#include "xfs_btree_trace.h"
 #include "xfs_alloc.h"
 #include "xfs_ialloc.h"
 #include "xfs_bmap.h"
@@ -133,10 +133,10 @@ STATIC int
 xfs_imap_to_bp(
 	xfs_mount_t	*mp,
 	xfs_trans_t	*tp,
-	xfs_imap_t	*imap,
+	struct xfs_imap	*imap,
 	xfs_buf_t	**bpp,
 	uint		buf_flags,
-	uint		imap_flags)
+	uint		iget_flags)
 {
 	int		error;
 	int		i;
@@ -173,12 +173,12 @@ xfs_imap_to_bp(
 
 		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
 					(i << mp->m_sb.sb_inodelog));
-		di_ok = be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC &&
-			XFS_DINODE_GOOD_VERSION(dip->di_core.di_version);
+		di_ok = be16_to_cpu(dip->di_magic) == XFS_DINODE_MAGIC &&
+			XFS_DINODE_GOOD_VERSION(dip->di_version);
 		if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
 						XFS_ERRTAG_ITOBP_INOTOBP,
 						XFS_RANDOM_ITOBP_INOTOBP))) {
-			if (imap_flags & XFS_IMAP_BULKSTAT) {
+			if (iget_flags & XFS_IGET_BULKSTAT) {
 				xfs_trans_brelse(tp, bp);
 				return XFS_ERROR(EINVAL);
 			}
@@ -190,7 +190,7 @@ xfs_imap_to_bp(
190 "daddr %lld #%d (magic=%x)", 190 "daddr %lld #%d (magic=%x)",
191 XFS_BUFTARG_NAME(mp->m_ddev_targp), 191 XFS_BUFTARG_NAME(mp->m_ddev_targp),
192 (unsigned long long)imap->im_blkno, i, 192 (unsigned long long)imap->im_blkno, i,
193 be16_to_cpu(dip->di_core.di_magic)); 193 be16_to_cpu(dip->di_magic));
194#endif 194#endif
195 xfs_trans_brelse(tp, bp); 195 xfs_trans_brelse(tp, bp);
196 return XFS_ERROR(EFSCORRUPTED); 196 return XFS_ERROR(EFSCORRUPTED);
@@ -221,25 +221,26 @@ xfs_imap_to_bp(
  * Use xfs_imap() to determine the size and location of the
  * buffer to read from disk.
  */
-STATIC int
+int
 xfs_inotobp(
 	xfs_mount_t	*mp,
 	xfs_trans_t	*tp,
 	xfs_ino_t	ino,
 	xfs_dinode_t	**dipp,
 	xfs_buf_t	**bpp,
-	int		*offset)
+	int		*offset,
+	uint		imap_flags)
 {
-	xfs_imap_t	imap;
+	struct xfs_imap	imap;
 	xfs_buf_t	*bp;
 	int		error;
 
 	imap.im_blkno = 0;
-	error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP);
+	error = xfs_imap(mp, tp, ino, &imap, imap_flags);
 	if (error)
 		return error;
 
-	error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, 0);
+	error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags);
 	if (error)
 		return error;
 
@@ -260,15 +261,11 @@ xfs_inotobp(
  * If a non-zero error is returned, then the contents of bpp and
  * dipp are undefined.
  *
- * If the inode is new and has not yet been initialized, use xfs_imap()
- * to determine the size and location of the buffer to read from disk.
- * If the inode has already been mapped to its buffer and read in once,
- * then use the mapping information stored in the inode rather than
- * calling xfs_imap().  This allows us to avoid the overhead of looking
- * at the inode btree for small block file systems (see xfs_dilocate()).
- * We can tell whether the inode has been mapped in before by comparing
- * its disk block address to 0. Only uninitialized inodes will have
- * 0 for the disk block address.
+ * The inode is expected to already been mapped to its buffer and read
+ * in once, thus we can use the mapping information stored in the inode
+ * rather than calling xfs_imap().  This allows us to avoid the overhead
+ * of looking at the inode btree for small block file systems
+ * (see xfs_imap()).
  */
 int
 xfs_itobp(
@@ -277,40 +274,14 @@ xfs_itobp(
 	xfs_inode_t	*ip,
 	xfs_dinode_t	**dipp,
 	xfs_buf_t	**bpp,
-	xfs_daddr_t	bno,
-	uint		imap_flags,
 	uint		buf_flags)
 {
-	xfs_imap_t	imap;
 	xfs_buf_t	*bp;
 	int		error;
 
-	if (ip->i_blkno == (xfs_daddr_t)0) {
-		imap.im_blkno = bno;
-		error = xfs_imap(mp, tp, ip->i_ino, &imap,
-				 XFS_IMAP_LOOKUP | imap_flags);
-		if (error)
-			return error;
+	ASSERT(ip->i_imap.im_blkno != 0);
 
-		/*
-		 * Fill in the fields in the inode that will be used to
-		 * map the inode to its buffer from now on.
-		 */
-		ip->i_blkno = imap.im_blkno;
-		ip->i_len = imap.im_len;
-		ip->i_boffset = imap.im_boffset;
-	} else {
-		/*
-		 * We've already mapped the inode once, so just use the
-		 * mapping that we saved the first time.
-		 */
-		imap.im_blkno = ip->i_blkno;
-		imap.im_len = ip->i_len;
-		imap.im_boffset = ip->i_boffset;
-	}
-	ASSERT(bno == 0 || bno == imap.im_blkno);
-
-	error = xfs_imap_to_bp(mp, tp, &imap, &bp, buf_flags, imap_flags);
+	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, buf_flags, 0);
 	if (error)
 		return error;
 
@@ -321,7 +292,7 @@ xfs_itobp(
 		return EAGAIN;
 	}
 
-	*dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
+	*dipp = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
 	*bpp = bp;
 	return 0;
 }
@@ -348,26 +319,26 @@ xfs_iformat(
 		XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
 	error = 0;
 
-	if (unlikely(be32_to_cpu(dip->di_core.di_nextents) +
-		     be16_to_cpu(dip->di_core.di_anextents) >
-		     be64_to_cpu(dip->di_core.di_nblocks))) {
+	if (unlikely(be32_to_cpu(dip->di_nextents) +
+		     be16_to_cpu(dip->di_anextents) >
+		     be64_to_cpu(dip->di_nblocks))) {
 		xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
 			"corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
 			(unsigned long long)ip->i_ino,
-			(int)(be32_to_cpu(dip->di_core.di_nextents) +
-			    be16_to_cpu(dip->di_core.di_anextents)),
+			(int)(be32_to_cpu(dip->di_nextents) +
+			    be16_to_cpu(dip->di_anextents)),
 			(unsigned long long)
-				be64_to_cpu(dip->di_core.di_nblocks));
+				be64_to_cpu(dip->di_nblocks));
 		XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
 				     ip->i_mount, dip);
 		return XFS_ERROR(EFSCORRUPTED);
 	}
 
-	if (unlikely(dip->di_core.di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
+	if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
 		xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
 			"corrupt dinode %Lu, forkoff = 0x%x.",
 			(unsigned long long)ip->i_ino,
-			dip->di_core.di_forkoff);
+			dip->di_forkoff);
 		XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
 				     ip->i_mount, dip);
 		return XFS_ERROR(EFSCORRUPTED);
@@ -378,25 +349,25 @@ xfs_iformat(
 	case S_IFCHR:
 	case S_IFBLK:
 	case S_IFSOCK:
-		if (unlikely(dip->di_core.di_format != XFS_DINODE_FMT_DEV)) {
+		if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
 			XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
 					      ip->i_mount, dip);
 			return XFS_ERROR(EFSCORRUPTED);
 		}
 		ip->i_d.di_size = 0;
 		ip->i_size = 0;
-		ip->i_df.if_u2.if_rdev = be32_to_cpu(dip->di_u.di_dev);
+		ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
 		break;
 
 	case S_IFREG:
 	case S_IFLNK:
 	case S_IFDIR:
-		switch (dip->di_core.di_format) {
+		switch (dip->di_format) {
 		case XFS_DINODE_FMT_LOCAL:
 			/*
 			 * no local regular files yet
 			 */
-			if (unlikely((be16_to_cpu(dip->di_core.di_mode) & S_IFMT) == S_IFREG)) {
+			if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) {
 				xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
 					"corrupt inode %Lu "
 					"(local format for regular file).",
@@ -407,7 +378,7 @@ xfs_iformat(
 				return XFS_ERROR(EFSCORRUPTED);
 			}
 
-			di_size = be64_to_cpu(dip->di_core.di_size);
+			di_size = be64_to_cpu(dip->di_size);
 			if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
 				xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
 					"corrupt inode %Lu "
@@ -449,7 +420,7 @@ xfs_iformat(
 	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
 	ip->i_afp->if_ext_max =
 		XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
-	switch (dip->di_core.di_aformat) {
+	switch (dip->di_aformat) {
 	case XFS_DINODE_FMT_LOCAL:
 		atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
 		size = be16_to_cpu(atp->hdr.totsize);
@@ -621,7 +592,7 @@ xfs_iformat_btree(
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
 	size = XFS_BMAP_BROOT_SPACE(dfp);
-	nrecs = XFS_BMAP_BROOT_NUMRECS(dfp);
+	nrecs = be16_to_cpu(dfp->bb_numrecs);
 
 	/*
 	 * blow out if -- fork has less extents than can fit in
@@ -649,8 +620,9 @@ xfs_iformat_btree(
 	 * Copy and convert from the on-disk structure
 	 * to the in-memory structure.
 	 */
-	xfs_bmdr_to_bmbt(dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
-		ifp->if_broot, size);
+	xfs_bmdr_to_bmbt(ip->i_mount, dfp,
+			 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
+			 ifp->if_broot, size);
 	ifp->if_flags &= ~XFS_IFEXTENTS;
 	ifp->if_flags |= XFS_IFBROOT;
 
@@ -660,7 +632,7 @@ xfs_iformat_btree(
 void
 xfs_dinode_from_disk(
 	xfs_icdinode_t		*to,
-	xfs_dinode_core_t	*from)
+	xfs_dinode_t		*from)
 {
 	to->di_magic = be16_to_cpu(from->di_magic);
 	to->di_mode = be16_to_cpu(from->di_mode);
@@ -694,7 +666,7 @@ xfs_dinode_from_disk(
 
 void
 xfs_dinode_to_disk(
-	xfs_dinode_core_t	*to,
+	xfs_dinode_t		*to,
 	xfs_icdinode_t		*from)
 {
 	to->di_magic = cpu_to_be16(from->di_magic);
@@ -781,93 +753,57 @@ uint
 xfs_dic2xflags(
 	xfs_dinode_t		*dip)
 {
-	xfs_dinode_core_t	*dic = &dip->di_core;
-
-	return _xfs_dic2xflags(be16_to_cpu(dic->di_flags)) |
+	return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) |
 		(XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
 }
 
 /*
- * Given a mount structure and an inode number, return a pointer
- * to a newly allocated in-core inode corresponding to the given
- * inode number.
- *
- * Initialize the inode's attributes and extent pointers if it
- * already has them (it will not if the inode has no links).
+ * Read the disk inode attributes into the in-core inode structure.
  */
 int
 xfs_iread(
 	xfs_mount_t	*mp,
 	xfs_trans_t	*tp,
-	xfs_ino_t	ino,
-	xfs_inode_t	**ipp,
+	xfs_inode_t	*ip,
 	xfs_daddr_t	bno,
-	uint		imap_flags)
+	uint		iget_flags)
 {
 	xfs_buf_t	*bp;
 	xfs_dinode_t	*dip;
-	xfs_inode_t	*ip;
 	int		error;
 
-	ASSERT(xfs_inode_zone != NULL);
-
-	ip = kmem_zone_zalloc(xfs_inode_zone, KM_SLEEP);
-	ip->i_ino = ino;
-	ip->i_mount = mp;
-	atomic_set(&ip->i_iocount, 0);
-	spin_lock_init(&ip->i_flags_lock);
-
 	/*
-	 * Get pointer's to the on-disk inode and the buffer containing it.
-	 * If the inode number refers to a block outside the file system
-	 * then xfs_itobp() will return NULL.  In this case we should
-	 * return NULL as well.  Set i_blkno to 0 so that xfs_itobp() will
-	 * know that this is a new incore inode.
+	 * Fill in the location information in the in-core inode.
 	 */
-	error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags, XFS_BUF_LOCK);
-	if (error) {
-		kmem_zone_free(xfs_inode_zone, ip);
+	ip->i_imap.im_blkno = bno;
+	error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
+	if (error)
 		return error;
-	}
+	ASSERT(bno == 0 || bno == ip->i_imap.im_blkno);
 
 	/*
-	 * Initialize inode's trace buffers.
-	 * Do this before xfs_iformat in case it adds entries.
+	 * Get pointers to the on-disk inode and the buffer containing it.
 	 */
-#ifdef	XFS_INODE_TRACE
-	ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS);
-#endif
-#ifdef XFS_BMAP_TRACE
-	ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS);
-#endif
-#ifdef XFS_BMBT_TRACE
-	ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS);
-#endif
-#ifdef XFS_RW_TRACE
-	ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_NOFS);
-#endif
-#ifdef XFS_ILOCK_TRACE
-	ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_NOFS);
-#endif
-#ifdef XFS_DIR2_TRACE
-	ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
-#endif
+	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp,
+			       XFS_BUF_LOCK, iget_flags);
+	if (error)
+		return error;
+	dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
 
 	/*
 	 * If we got something that isn't an inode it means someone
 	 * (nfs or dmi) has a stale handle.
 	 */
-	if (be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC) {
-		kmem_zone_free(xfs_inode_zone, ip);
-		xfs_trans_brelse(tp, bp);
+	if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) {
 #ifdef DEBUG
 		xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
-				"dip->di_core.di_magic (0x%x) != "
+				"dip->di_magic (0x%x) != "
 				"XFS_DINODE_MAGIC (0x%x)",
-				be16_to_cpu(dip->di_core.di_magic),
+				be16_to_cpu(dip->di_magic),
 				XFS_DINODE_MAGIC);
 #endif /* DEBUG */
-		return XFS_ERROR(EINVAL);
+		error = XFS_ERROR(EINVAL);
+		goto out_brelse;
 	}
 
 	/*
@@ -877,24 +813,22 @@ xfs_iread(
877 * specific information. 813 * specific information.
878 * Otherwise, just get the truly permanent information. 814 * Otherwise, just get the truly permanent information.
879 */ 815 */
880 if (dip->di_core.di_mode) { 816 if (dip->di_mode) {
881 xfs_dinode_from_disk(&ip->i_d, &dip->di_core); 817 xfs_dinode_from_disk(&ip->i_d, dip);
882 error = xfs_iformat(ip, dip); 818 error = xfs_iformat(ip, dip);
883 if (error) { 819 if (error) {
884 kmem_zone_free(xfs_inode_zone, ip);
885 xfs_trans_brelse(tp, bp);
886#ifdef DEBUG 820#ifdef DEBUG
887 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " 821 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
888 "xfs_iformat() returned error %d", 822 "xfs_iformat() returned error %d",
889 error); 823 error);
890#endif /* DEBUG */ 824#endif /* DEBUG */
891 return error; 825 goto out_brelse;
892 } 826 }
893 } else { 827 } else {
894 ip->i_d.di_magic = be16_to_cpu(dip->di_core.di_magic); 828 ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
895 ip->i_d.di_version = dip->di_core.di_version; 829 ip->i_d.di_version = dip->di_version;
896 ip->i_d.di_gen = be32_to_cpu(dip->di_core.di_gen); 830 ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
897 ip->i_d.di_flushiter = be16_to_cpu(dip->di_core.di_flushiter); 831 ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
898 /* 832 /*
899 * Make sure to pull in the mode here as well in 833 * Make sure to pull in the mode here as well in
900 * case the inode is released without being used. 834 * case the inode is released without being used.
@@ -911,8 +845,6 @@ xfs_iread(
911 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); 845 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
912 } 846 }
913 847
914 INIT_LIST_HEAD(&ip->i_reclaim);
915
916 /* 848 /*
917 * The inode format changed when we moved the link count and 849 * The inode format changed when we moved the link count and
918 * made it 32 bits long. If this is an old format inode, 850 * made it 32 bits long. If this is an old format inode,
@@ -924,7 +856,7 @@ xfs_iread(
924 * the new format. We don't change the version number so that we 856 * the new format. We don't change the version number so that we
925 * can distinguish this from a real new format inode. 857 * can distinguish this from a real new format inode.
926 */ 858 */
927 if (ip->i_d.di_version == XFS_DINODE_VERSION_1) { 859 if (ip->i_d.di_version == 1) {
928 ip->i_d.di_nlink = ip->i_d.di_onlink; 860 ip->i_d.di_nlink = ip->i_d.di_onlink;
929 ip->i_d.di_onlink = 0; 861 ip->i_d.di_onlink = 0;
930 ip->i_d.di_projid = 0; 862 ip->i_d.di_projid = 0;
@@ -938,7 +870,7 @@ xfs_iread(
938 * around for a while. This helps to keep recently accessed 870 * around for a while. This helps to keep recently accessed
939 * meta-data in-core longer. 871 * meta-data in-core longer.
940 */ 872 */
941 XFS_BUF_SET_REF(bp, XFS_INO_REF); 873 XFS_BUF_SET_REF(bp, XFS_INO_REF);
942 874
943 /* 875 /*
944 * Use xfs_trans_brelse() to release the buffer containing the 876 * Use xfs_trans_brelse() to release the buffer containing the
@@ -953,9 +885,9 @@ xfs_iread(
953 * to worry about the inode being changed just because we released 885 * to worry about the inode being changed just because we released
954 * the buffer. 886 * the buffer.
955 */ 887 */
888 out_brelse:
956 xfs_trans_brelse(tp, bp); 889 xfs_trans_brelse(tp, bp);
957 *ipp = ip; 890 return error;
958 return 0;
959} 891}
960 892
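With the zone allocation, trace-buffer setup and error-path frees gone, xfs_iread() no longer owns the inode's lifetime. A hedged sketch of the new calling convention; the allocating helper named here (xfs_inode_alloc) lives in the inode cache outside this diff, so treat the exact names as assumptions:

    /* caller allocates; xfs_iread() only fills in the in-core inode */
    ip = xfs_inode_alloc(mp, ino);		/* assumed cache-side helper */
    if (!ip)
    	return ENOMEM;
    error = xfs_iread(mp, tp, ip, bno, iget_flags);
    if (error) {
    	/* teardown is the caller's job now; see xfs_destroy_inode() below */
    	xfs_destroy_inode(ip);
    	return error;
    }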
961/* 893/*
@@ -1049,6 +981,7 @@ xfs_ialloc(
1049 uint flags; 981 uint flags;
1050 int error; 982 int error;
1051 timespec_t tv; 983 timespec_t tv;
984 int filestreams = 0;
1052 985
1053 /* 986 /*
1054 * Call the space management code to pick 987 * Call the space management code to pick
@@ -1056,9 +989,8 @@ xfs_ialloc(
1056 */ 989 */
1057 error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc, 990 error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
1058 ialloc_context, call_again, &ino); 991 ialloc_context, call_again, &ino);
1059 if (error != 0) { 992 if (error)
1060 return error; 993 return error;
1061 }
1062 if (*call_again || ino == NULLFSINO) { 994 if (*call_again || ino == NULLFSINO) {
1063 *ipp = NULL; 995 *ipp = NULL;
1064 return 0; 996 return 0;
@@ -1072,9 +1004,8 @@ xfs_ialloc(
1072 */ 1004 */
1073 error = xfs_trans_iget(tp->t_mountp, tp, ino, 1005 error = xfs_trans_iget(tp->t_mountp, tp, ino,
1074 XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip); 1006 XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
1075 if (error != 0) { 1007 if (error)
1076 return error; 1008 return error;
1077 }
1078 ASSERT(ip != NULL); 1009 ASSERT(ip != NULL);
1079 1010
1080 ip->i_d.di_mode = (__uint16_t)mode; 1011 ip->i_d.di_mode = (__uint16_t)mode;
@@ -1093,8 +1024,8 @@ xfs_ialloc(
1093 * here rather than here and in the flush/logging code. 1024 * here rather than here and in the flush/logging code.
1094 */ 1025 */
1095 if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) && 1026 if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) &&
1096 ip->i_d.di_version == XFS_DINODE_VERSION_1) { 1027 ip->i_d.di_version == 1) {
1097 ip->i_d.di_version = XFS_DINODE_VERSION_2; 1028 ip->i_d.di_version = 2;
1098 /* 1029 /*
1099 * We've already zeroed the old link count, the projid field, 1030 * We've already zeroed the old link count, the projid field,
1100 * and the pad field. 1031 * and the pad field.
@@ -1104,7 +1035,7 @@ xfs_ialloc(
1104 /* 1035 /*
1105 * Project ids won't be stored on disk if we are using a version 1 inode. 1036 * Project ids won't be stored on disk if we are using a version 1 inode.
1106 */ 1037 */
1107 if ((prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1)) 1038 if ((prid != 0) && (ip->i_d.di_version == 1))
1108 xfs_bump_ino_vers2(tp, ip); 1039 xfs_bump_ino_vers2(tp, ip);
1109 1040
1110 if (pip && XFS_INHERIT_GID(pip)) { 1041 if (pip && XFS_INHERIT_GID(pip)) {
@@ -1155,13 +1086,12 @@ xfs_ialloc(
1155 flags |= XFS_ILOG_DEV; 1086 flags |= XFS_ILOG_DEV;
1156 break; 1087 break;
1157 case S_IFREG: 1088 case S_IFREG:
1158 if (pip && xfs_inode_is_filestream(pip)) { 1089 /*
1159 error = xfs_filestream_associate(pip, ip); 1090 * we can't set up filestreams until after the VFS inode
1160 if (error < 0) 1091 * is set up properly.
1161 return -error; 1092 */
1162 if (!error) 1093 if (pip && xfs_inode_is_filestream(pip))
1163 xfs_iflags_set(ip, XFS_IFILESTREAM); 1094 filestreams = 1;
1164 }
1165 /* fall through */ 1095 /* fall through */
1166 case S_IFDIR: 1096 case S_IFDIR:
1167 if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { 1097 if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
@@ -1227,6 +1157,15 @@ xfs_ialloc(
1227 /* now that we have an i_mode we can setup inode ops and unlock */ 1157 /* now that we have an i_mode we can setup inode ops and unlock */
1228 xfs_setup_inode(ip); 1158 xfs_setup_inode(ip);
1229 1159
1160 /* now we have set up the vfs inode we can associate the filestream */
1161 if (filestreams) {
1162 error = xfs_filestream_associate(pip, ip);
1163 if (error < 0)
1164 return -error;
1165 if (!error)
1166 xfs_iflags_set(ip, XFS_IFILESTREAM);
1167 }
1168
1230 *ipp = ip; 1169 *ipp = ip;
1231 return 0; 1170 return 0;
1232} 1171}
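A plausible reading of why the association had to move: filestream tracking pins the inodes involved through the VFS, which is only safe once xfs_setup_inode() has initialized the embedded VFS inode. The IHOLD() macro from the header changes further down asserts exactly that precondition:

    IHOLD(ip);	/* ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) before atomic_inc() */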
@@ -1383,8 +1322,8 @@ xfs_itrunc_trace(
1383 * direct I/O with the truncate operation. Also, because we hold 1322 * direct I/O with the truncate operation. Also, because we hold
1384 * the IOLOCK in exclusive mode, we prevent new direct I/Os from being 1323 * the IOLOCK in exclusive mode, we prevent new direct I/Os from being
1385 * started until the truncate completes and drops the lock. Essentially, 1324 * started until the truncate completes and drops the lock. Essentially,
1386 * the vn_iowait() call forms an I/O barrier that provides strict ordering 1325 * the xfs_ioend_wait() call forms an I/O barrier that provides strict
1387 * between direct I/Os and the truncate operation. 1326 * ordering between direct I/Os and the truncate operation.
1388 * 1327 *
1389 * The flags parameter can have either the value XFS_ITRUNC_DEFINITE 1328 * The flags parameter can have either the value XFS_ITRUNC_DEFINITE
1390 * or XFS_ITRUNC_MAYBE. The XFS_ITRUNC_MAYBE value should be used 1329 * or XFS_ITRUNC_MAYBE. The XFS_ITRUNC_MAYBE value should be used
@@ -1415,7 +1354,7 @@ xfs_itruncate_start(
1415 1354
1416 /* wait for the completion of any pending DIOs */ 1355 /* wait for the completion of any pending DIOs */
1417 if (new_size == 0 || new_size < ip->i_size) 1356 if (new_size == 0 || new_size < ip->i_size)
1418 vn_iowait(ip); 1357 xfs_ioend_wait(ip);
1419 1358
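xfs_ioend_wait() is defined outside this diff. A minimal sketch of the barrier it provides, assuming a wait queue reachable from the inode (the real helper may instead hash inodes onto a shared wait-queue table):

    static inline void ioend_wait_sketch(struct xfs_inode *ip,
    				     wait_queue_head_t *wq)
    {
    	/* sleep until every outstanding ioend drops i_iocount to zero */
    	wait_event(*wq, atomic_read(&ip->i_iocount) == 0);
    }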
1420 /* 1359 /*
1421 * Call toss_pages or flushinval_pages to get rid of pages 1360 * Call toss_pages or flushinval_pages to get rid of pages
@@ -1726,8 +1665,14 @@ xfs_itruncate_finish(
1726 xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 1665 xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1727 xfs_trans_ihold(ntp, ip); 1666 xfs_trans_ihold(ntp, ip);
1728 1667
1729 if (!error) 1668 if (error)
1730 error = xfs_trans_reserve(ntp, 0, 1669 return error;
1670 /*
1671 * transaction commit worked ok so we can drop the extra ticket
1672 * reference that we gained in xfs_trans_dup()
1673 */
1674 xfs_log_ticket_put(ntp->t_ticket);
1675 error = xfs_trans_reserve(ntp, 0,
1731 XFS_ITRUNCATE_LOG_RES(mp), 0, 1676 XFS_ITRUNCATE_LOG_RES(mp), 0,
1732 XFS_TRANS_PERM_LOG_RES, 1677 XFS_TRANS_PERM_LOG_RES,
1733 XFS_ITRUNCATE_LOG_COUNT); 1678 XFS_ITRUNCATE_LOG_COUNT);
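Context for the new xfs_log_ticket_put() call: this sits in the transaction-roll idiom, where xfs_trans_dup() takes an extra reference on the log ticket so the permanent log reservation survives the commit. A condensed sketch of the surrounding flow (simplified, not verbatim):

    ntp = xfs_trans_dup(tp);		/* gains an extra ticket reference */
    error = xfs_trans_commit(tp, 0);
    /* ... rejoin and re-hold the inode, as in the hunk above ... */
    if (error)
    	return error;
    xfs_log_ticket_put(ntp->t_ticket);	/* drop the xfs_trans_dup() reference */
    error = xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
    			  XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT);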
@@ -1781,13 +1726,10 @@ xfs_iunlink(
1781 xfs_dinode_t *dip; 1726 xfs_dinode_t *dip;
1782 xfs_buf_t *agibp; 1727 xfs_buf_t *agibp;
1783 xfs_buf_t *ibp; 1728 xfs_buf_t *ibp;
1784 xfs_agnumber_t agno;
1785 xfs_daddr_t agdaddr;
1786 xfs_agino_t agino; 1729 xfs_agino_t agino;
1787 short bucket_index; 1730 short bucket_index;
1788 int offset; 1731 int offset;
1789 int error; 1732 int error;
1790 int agi_ok;
1791 1733
1792 ASSERT(ip->i_d.di_nlink == 0); 1734 ASSERT(ip->i_d.di_nlink == 0);
1793 ASSERT(ip->i_d.di_mode != 0); 1735 ASSERT(ip->i_d.di_mode != 0);
@@ -1795,31 +1737,15 @@ xfs_iunlink(
1795 1737
1796 mp = tp->t_mountp; 1738 mp = tp->t_mountp;
1797 1739
1798 agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
1799 agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
1800
1801 /* 1740 /*
1802 * Get the agi buffer first. It ensures lock ordering 1741 * Get the agi buffer first. It ensures lock ordering
1803 * on the list. 1742 * on the list.
1804 */ 1743 */
1805 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr, 1744 error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
1806 XFS_FSS_TO_BB(mp, 1), 0, &agibp);
1807 if (error) 1745 if (error)
1808 return error; 1746 return error;
1809
1810 /*
1811 * Validate the magic number of the agi block.
1812 */
1813 agi = XFS_BUF_TO_AGI(agibp); 1747 agi = XFS_BUF_TO_AGI(agibp);
1814 agi_ok = 1748
1815 be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
1816 XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
1817 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK,
1818 XFS_RANDOM_IUNLINK))) {
1819 XFS_CORRUPTION_ERROR("xfs_iunlink", XFS_ERRLEVEL_LOW, mp, agi);
1820 xfs_trans_brelse(tp, agibp);
1821 return XFS_ERROR(EFSCORRUPTED);
1822 }
1823 /* 1749 /*
1824 * Get the index into the agi hash table for the 1750 * Get the index into the agi hash table for the
1825 * list this inode will go on. 1751 * list this inode will go on.
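The open-coded AGI read and validation deleted above is what xfs_read_agi() now encapsulates. A sketch assembled from the removed lines (the real helper lives in xfs_ialloc.c, outside this diff):

    int
    xfs_read_agi(xfs_mount_t *mp, xfs_trans_t *tp, xfs_agnumber_t agno,
    	     xfs_buf_t **bpp)
    {
    	xfs_agi_t	*agi;
    	int		error;

    	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
    			XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
    			XFS_FSS_TO_BB(mp, 1), 0, bpp);
    	if (error)
    		return error;

    	/* validate the magic number and version of the agi block */
    	agi = XFS_BUF_TO_AGI(*bpp);
    	if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC ||
    	    !XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) {
    		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi);
    		xfs_trans_brelse(tp, *bpp);
    		return XFS_ERROR(EFSCORRUPTED);
    	}
    	return 0;
    }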
@@ -1837,14 +1763,14 @@ xfs_iunlink(
1837 * Here we put the head pointer into our next pointer, 1763 * Here we put the head pointer into our next pointer,
1838 * and then we fall through to point the head at us. 1764 * and then we fall through to point the head at us.
1839 */ 1765 */
1840 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); 1766 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
1841 if (error) 1767 if (error)
1842 return error; 1768 return error;
1843 1769
1844 ASSERT(be32_to_cpu(dip->di_next_unlinked) == NULLAGINO); 1770 ASSERT(be32_to_cpu(dip->di_next_unlinked) == NULLAGINO);
1845 /* both on-disk, don't endian flip twice */ 1771 /* both on-disk, don't endian flip twice */
1846 dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; 1772 dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
1847 offset = ip->i_boffset + 1773 offset = ip->i_imap.im_boffset +
1848 offsetof(xfs_dinode_t, di_next_unlinked); 1774 offsetof(xfs_dinode_t, di_next_unlinked);
1849 xfs_trans_inode_buf(tp, ibp); 1775 xfs_trans_inode_buf(tp, ibp);
1850 xfs_trans_log_buf(tp, ibp, offset, 1776 xfs_trans_log_buf(tp, ibp, offset,
@@ -1879,7 +1805,6 @@ xfs_iunlink_remove(
1879 xfs_buf_t *agibp; 1805 xfs_buf_t *agibp;
1880 xfs_buf_t *ibp; 1806 xfs_buf_t *ibp;
1881 xfs_agnumber_t agno; 1807 xfs_agnumber_t agno;
1882 xfs_daddr_t agdaddr;
1883 xfs_agino_t agino; 1808 xfs_agino_t agino;
1884 xfs_agino_t next_agino; 1809 xfs_agino_t next_agino;
1885 xfs_buf_t *last_ibp; 1810 xfs_buf_t *last_ibp;
@@ -1887,45 +1812,20 @@ xfs_iunlink_remove(
1887 short bucket_index; 1812 short bucket_index;
1888 int offset, last_offset = 0; 1813 int offset, last_offset = 0;
1889 int error; 1814 int error;
1890 int agi_ok;
1891 1815
1892 /*
1893 * First pull the on-disk inode from the AGI unlinked list.
1894 */
1895 mp = tp->t_mountp; 1816 mp = tp->t_mountp;
1896
1897 agno = XFS_INO_TO_AGNO(mp, ip->i_ino); 1817 agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
1898 agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
1899 1818
1900 /* 1819 /*
1901 * Get the agi buffer first. It ensures lock ordering 1820 * Get the agi buffer first. It ensures lock ordering
1902 * on the list. 1821 * on the list.
1903 */ 1822 */
1904 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr, 1823 error = xfs_read_agi(mp, tp, agno, &agibp);
1905 XFS_FSS_TO_BB(mp, 1), 0, &agibp); 1824 if (error)
1906 if (error) {
1907 cmn_err(CE_WARN,
1908 "xfs_iunlink_remove: xfs_trans_read_buf() returned an error %d on %s. Returning error.",
1909 error, mp->m_fsname);
1910 return error; 1825 return error;
1911 } 1826
1912 /*
1913 * Validate the magic number of the agi block.
1914 */
1915 agi = XFS_BUF_TO_AGI(agibp); 1827 agi = XFS_BUF_TO_AGI(agibp);
1916 agi_ok = 1828
1917 be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
1918 XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
1919 if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK_REMOVE,
1920 XFS_RANDOM_IUNLINK_REMOVE))) {
1921 XFS_CORRUPTION_ERROR("xfs_iunlink_remove", XFS_ERRLEVEL_LOW,
1922 mp, agi);
1923 xfs_trans_brelse(tp, agibp);
1924 cmn_err(CE_WARN,
1925 "xfs_iunlink_remove: XFS_TEST_ERROR() returned an error on %s. Returning EFSCORRUPTED.",
1926 mp->m_fsname);
1927 return XFS_ERROR(EFSCORRUPTED);
1928 }
1929 /* 1829 /*
1930 * Get the index into the agi hash table for the 1830 * Get the index into the agi hash table for the
1931 * list this inode will go on. 1831 * list this inode will go on.
@@ -1945,7 +1845,7 @@ xfs_iunlink_remove(
1945 * of dealing with the buffer when there is no need to 1845 * of dealing with the buffer when there is no need to
1946 * change it. 1846 * change it.
1947 */ 1847 */
1948 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); 1848 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
1949 if (error) { 1849 if (error) {
1950 cmn_err(CE_WARN, 1850 cmn_err(CE_WARN,
1951 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1851 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
@@ -1956,7 +1856,7 @@ xfs_iunlink_remove(
1956 ASSERT(next_agino != 0); 1856 ASSERT(next_agino != 0);
1957 if (next_agino != NULLAGINO) { 1857 if (next_agino != NULLAGINO) {
1958 dip->di_next_unlinked = cpu_to_be32(NULLAGINO); 1858 dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
1959 offset = ip->i_boffset + 1859 offset = ip->i_imap.im_boffset +
1960 offsetof(xfs_dinode_t, di_next_unlinked); 1860 offsetof(xfs_dinode_t, di_next_unlinked);
1961 xfs_trans_inode_buf(tp, ibp); 1861 xfs_trans_inode_buf(tp, ibp);
1962 xfs_trans_log_buf(tp, ibp, offset, 1862 xfs_trans_log_buf(tp, ibp, offset,
@@ -1992,7 +1892,7 @@ xfs_iunlink_remove(
1992 } 1892 }
1993 next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino); 1893 next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
1994 error = xfs_inotobp(mp, tp, next_ino, &last_dip, 1894 error = xfs_inotobp(mp, tp, next_ino, &last_dip,
1995 &last_ibp, &last_offset); 1895 &last_ibp, &last_offset, 0);
1996 if (error) { 1896 if (error) {
1997 cmn_err(CE_WARN, 1897 cmn_err(CE_WARN,
1998 "xfs_iunlink_remove: xfs_inotobp() returned an error %d on %s. Returning error.", 1898 "xfs_iunlink_remove: xfs_inotobp() returned an error %d on %s. Returning error.",
@@ -2007,7 +1907,7 @@ xfs_iunlink_remove(
2007 * Now last_ibp points to the buffer previous to us on 1907 * Now last_ibp points to the buffer previous to us on
2008 * the unlinked list. Pull us from the list. 1908 * the unlinked list. Pull us from the list.
2009 */ 1909 */
2010 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); 1910 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
2011 if (error) { 1911 if (error) {
2012 cmn_err(CE_WARN, 1912 cmn_err(CE_WARN,
2013 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1913 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
@@ -2019,7 +1919,7 @@ xfs_iunlink_remove(
2019 ASSERT(next_agino != agino); 1919 ASSERT(next_agino != agino);
2020 if (next_agino != NULLAGINO) { 1920 if (next_agino != NULLAGINO) {
2021 dip->di_next_unlinked = cpu_to_be32(NULLAGINO); 1921 dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
2022 offset = ip->i_boffset + 1922 offset = ip->i_imap.im_boffset +
2023 offsetof(xfs_dinode_t, di_next_unlinked); 1923 offsetof(xfs_dinode_t, di_next_unlinked);
2024 xfs_trans_inode_buf(tp, ibp); 1924 xfs_trans_inode_buf(tp, ibp);
2025 xfs_trans_log_buf(tp, ibp, offset, 1925 xfs_trans_log_buf(tp, ibp, offset,
@@ -2160,9 +2060,9 @@ xfs_ifree_cluster(
2160 iip = (xfs_inode_log_item_t *)lip; 2060 iip = (xfs_inode_log_item_t *)lip;
2161 ASSERT(iip->ili_logged == 1); 2061 ASSERT(iip->ili_logged == 1);
2162 lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done; 2062 lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
2163 spin_lock(&mp->m_ail_lock); 2063 xfs_trans_ail_copy_lsn(mp->m_ail,
2164 iip->ili_flush_lsn = iip->ili_item.li_lsn; 2064 &iip->ili_flush_lsn,
2165 spin_unlock(&mp->m_ail_lock); 2065 &iip->ili_item.li_lsn);
2166 xfs_iflags_set(iip->ili_inode, XFS_ISTALE); 2066 xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
2167 pre_flushed++; 2067 pre_flushed++;
2168 } 2068 }
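Both LSN copies in this function now go through xfs_trans_ail_copy_lsn(), defined with the AIL code outside this diff. Roughly the following, with the lock moved from mp->m_ail_lock into the AIL structure itself (field names inferred from the call site; 64-bit builds can skip the lock, since an aligned 8-byte store is atomic there):

    void
    xfs_trans_ail_copy_lsn(struct xfs_ail *ailp, xfs_lsn_t *dst, xfs_lsn_t *src)
    {
    	ASSERT(sizeof(xfs_lsn_t) == 8);	/* don't lock if it shrinks */
    	spin_lock(&ailp->xa_lock);
    	*dst = *src;
    	spin_unlock(&ailp->xa_lock);
    }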
@@ -2183,9 +2083,8 @@ xfs_ifree_cluster(
2183 iip->ili_last_fields = iip->ili_format.ilf_fields; 2083 iip->ili_last_fields = iip->ili_format.ilf_fields;
2184 iip->ili_format.ilf_fields = 0; 2084 iip->ili_format.ilf_fields = 0;
2185 iip->ili_logged = 1; 2085 iip->ili_logged = 1;
2186 spin_lock(&mp->m_ail_lock); 2086 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
2187 iip->ili_flush_lsn = iip->ili_item.li_lsn; 2087 &iip->ili_item.li_lsn);
2188 spin_unlock(&mp->m_ail_lock);
2189 2088
2190 xfs_buf_attach_iodone(bp, 2089 xfs_buf_attach_iodone(bp,
2191 (void(*)(xfs_buf_t*,xfs_log_item_t*)) 2090 (void(*)(xfs_buf_t*,xfs_log_item_t*))
@@ -2263,7 +2162,7 @@ xfs_ifree(
2263 2162
2264 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2163 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2265 2164
2266 error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); 2165 error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
2267 if (error) 2166 if (error)
2268 return error; 2167 return error;
2269 2168
@@ -2279,7 +2178,7 @@ xfs_ifree(
2279 * This is a temporary hack that would require a proper fix 2178 * This is a temporary hack that would require a proper fix
2280 * in the future. 2179 * in the future.
2281 */ 2180 */
2282 dip->di_core.di_mode = 0; 2181 dip->di_mode = 0;
2283 2182
2284 if (delete) { 2183 if (delete) {
2285 xfs_ifree_cluster(ip, tp, first_ino); 2184 xfs_ifree_cluster(ip, tp, first_ino);
@@ -2312,9 +2211,10 @@ xfs_iroot_realloc(
2312 int rec_diff, 2211 int rec_diff,
2313 int whichfork) 2212 int whichfork)
2314{ 2213{
2214 struct xfs_mount *mp = ip->i_mount;
2315 int cur_max; 2215 int cur_max;
2316 xfs_ifork_t *ifp; 2216 xfs_ifork_t *ifp;
2317 xfs_bmbt_block_t *new_broot; 2217 struct xfs_btree_block *new_broot;
2318 int new_max; 2218 int new_max;
2319 size_t new_size; 2219 size_t new_size;
2320 char *np; 2220 char *np;
@@ -2335,8 +2235,7 @@ xfs_iroot_realloc(
2335 */ 2235 */
2336 if (ifp->if_broot_bytes == 0) { 2236 if (ifp->if_broot_bytes == 0) {
2337 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff); 2237 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
2338 ifp->if_broot = (xfs_bmbt_block_t*)kmem_alloc(new_size, 2238 ifp->if_broot = kmem_alloc(new_size, KM_SLEEP);
2339 KM_SLEEP);
2340 ifp->if_broot_bytes = (int)new_size; 2239 ifp->if_broot_bytes = (int)new_size;
2341 return; 2240 return;
2342 } 2241 }
@@ -2347,18 +2246,16 @@ xfs_iroot_realloc(
2347 * location. The records don't change location because 2246 * location. The records don't change location because
2348 * they are kept butted up against the btree block header. 2247 * they are kept butted up against the btree block header.
2349 */ 2248 */
2350 cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes); 2249 cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
2351 new_max = cur_max + rec_diff; 2250 new_max = cur_max + rec_diff;
2352 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); 2251 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
2353 ifp->if_broot = (xfs_bmbt_block_t *) 2252 ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
2354 kmem_realloc(ifp->if_broot,
2355 new_size,
2356 (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */ 2253 (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
2357 KM_SLEEP); 2254 KM_SLEEP);
2358 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, 2255 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2359 ifp->if_broot_bytes); 2256 ifp->if_broot_bytes);
2360 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, 2257 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2361 (int)new_size); 2258 (int)new_size);
2362 ifp->if_broot_bytes = (int)new_size; 2259 ifp->if_broot_bytes = (int)new_size;
2363 ASSERT(ifp->if_broot_bytes <= 2260 ASSERT(ifp->if_broot_bytes <=
2364 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); 2261 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
@@ -2372,7 +2269,7 @@ xfs_iroot_realloc(
2372 * records, just get rid of the root and clear the status bit. 2269 * records, just get rid of the root and clear the status bit.
2373 */ 2270 */
2374 ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); 2271 ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
2375 cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes); 2272 cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
2376 new_max = cur_max + rec_diff; 2273 new_max = cur_max + rec_diff;
2377 ASSERT(new_max >= 0); 2274 ASSERT(new_max >= 0);
2378 if (new_max > 0) 2275 if (new_max > 0)
@@ -2380,11 +2277,11 @@ xfs_iroot_realloc(
2380 else 2277 else
2381 new_size = 0; 2278 new_size = 0;
2382 if (new_size > 0) { 2279 if (new_size > 0) {
2383 new_broot = (xfs_bmbt_block_t *)kmem_alloc(new_size, KM_SLEEP); 2280 new_broot = kmem_alloc(new_size, KM_SLEEP);
2384 /* 2281 /*
2385 * First copy over the btree block header. 2282 * First copy over the btree block header.
2386 */ 2283 */
2387 memcpy(new_broot, ifp->if_broot, sizeof(xfs_bmbt_block_t)); 2284 memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN);
2388 } else { 2285 } else {
2389 new_broot = NULL; 2286 new_broot = NULL;
2390 ifp->if_flags &= ~XFS_IFBROOT; 2287 ifp->if_flags &= ~XFS_IFBROOT;
@@ -2397,18 +2294,16 @@ xfs_iroot_realloc(
2397 /* 2294 /*
2398 * First copy the records. 2295 * First copy the records.
2399 */ 2296 */
2400 op = (char *)XFS_BMAP_BROOT_REC_ADDR(ifp->if_broot, 1, 2297 op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
2401 ifp->if_broot_bytes); 2298 np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
2402 np = (char *)XFS_BMAP_BROOT_REC_ADDR(new_broot, 1,
2403 (int)new_size);
2404 memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t)); 2299 memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
2405 2300
2406 /* 2301 /*
2407 * Then copy the pointers. 2302 * Then copy the pointers.
2408 */ 2303 */
2409 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, 2304 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2410 ifp->if_broot_bytes); 2305 ifp->if_broot_bytes);
2411 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(new_broot, 1, 2306 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
2412 (int)new_size); 2307 (int)new_size);
2413 memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); 2308 memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
2414 } 2309 }
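XFS_BMAP_BROOT_MAXRECS was a pure macro over the buffer size; its replacement xfs_bmbt_maxrecs() (in xfs_bmap_btree.c, outside this diff) also distinguishes leaf from node blocks. A sketch for the long-format blocks used here, with the header length taken from the XFS_BTREE_LBLOCK_LEN memcpy above:

    int
    xfs_bmbt_maxrecs(struct xfs_mount *mp, int blocklen, int leaf)
    {
    	blocklen -= XFS_BTREE_LBLOCK_LEN;	/* skip the block header */

    	if (leaf)
    		return blocklen / sizeof(xfs_bmbt_rec_t);
    	/* interior (node) blocks hold key/pointer pairs instead */
    	return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t));
    }

Both call sites above pass leaf == 0, sizing the incore broot as a node block.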
@@ -2511,64 +2406,6 @@ xfs_idata_realloc(
2511 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); 2406 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
2512} 2407}
2513 2408
2514
2515
2516
2517/*
2518 * Map inode to disk block and offset.
2519 *
2520 * mp -- the mount point structure for the current file system
2521 * tp -- the current transaction
2522 * ino -- the inode number of the inode to be located
2523 * imap -- this structure is filled in with the information necessary
2524 * to retrieve the given inode from disk
2525 * flags -- flags to pass to xfs_dilocate indicating whether or not
2526 * lookups in the inode btree were OK or not
2527 */
2528int
2529xfs_imap(
2530 xfs_mount_t *mp,
2531 xfs_trans_t *tp,
2532 xfs_ino_t ino,
2533 xfs_imap_t *imap,
2534 uint flags)
2535{
2536 xfs_fsblock_t fsbno;
2537 int len;
2538 int off;
2539 int error;
2540
2541 fsbno = imap->im_blkno ?
2542 XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK;
2543 error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags);
2544 if (error)
2545 return error;
2546
2547 imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno);
2548 imap->im_len = XFS_FSB_TO_BB(mp, len);
2549 imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno);
2550 imap->im_ioffset = (ushort)off;
2551 imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog);
2552
2553 /*
2554 * If the inode number maps to a block outside the bounds
2555 * of the file system then return NULL rather than calling
2556 * read_buf and panicing when we get an error from the
2557 * driver.
2558 */
2559 if ((imap->im_blkno + imap->im_len) >
2560 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
2561 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
2562 "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > "
2563 " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)",
2564 (unsigned long long) imap->im_blkno,
2565 (unsigned long long) imap->im_len,
2566 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
2567 return EINVAL;
2568 }
2569 return 0;
2570}
2571
2572void 2409void
2573xfs_idestroy_fork( 2410xfs_idestroy_fork(
2574 xfs_inode_t *ip, 2411 xfs_inode_t *ip,
@@ -2613,70 +2450,6 @@ xfs_idestroy_fork(
2613} 2450}
2614 2451
2615/* 2452/*
2616 * This is called to free all the memory associated with an inode.
2617 * It must free the inode itself and any buffers allocated for
2618 * if_extents/if_data and if_broot. It must also free the lock
2619 * associated with the inode.
2620 */
2621void
2622xfs_idestroy(
2623 xfs_inode_t *ip)
2624{
2625 switch (ip->i_d.di_mode & S_IFMT) {
2626 case S_IFREG:
2627 case S_IFDIR:
2628 case S_IFLNK:
2629 xfs_idestroy_fork(ip, XFS_DATA_FORK);
2630 break;
2631 }
2632 if (ip->i_afp)
2633 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
2634 mrfree(&ip->i_lock);
2635 mrfree(&ip->i_iolock);
2636
2637#ifdef XFS_INODE_TRACE
2638 ktrace_free(ip->i_trace);
2639#endif
2640#ifdef XFS_BMAP_TRACE
2641 ktrace_free(ip->i_xtrace);
2642#endif
2643#ifdef XFS_BMBT_TRACE
2644 ktrace_free(ip->i_btrace);
2645#endif
2646#ifdef XFS_RW_TRACE
2647 ktrace_free(ip->i_rwtrace);
2648#endif
2649#ifdef XFS_ILOCK_TRACE
2650 ktrace_free(ip->i_lock_trace);
2651#endif
2652#ifdef XFS_DIR2_TRACE
2653 ktrace_free(ip->i_dir_trace);
2654#endif
2655 if (ip->i_itemp) {
2656 /*
2657 * Only if we are shutting down the fs will we see an
2658 * inode still in the AIL. If it is there, we should remove
2659 * it to prevent a use-after-free from occurring.
2660 */
2661 xfs_mount_t *mp = ip->i_mount;
2662 xfs_log_item_t *lip = &ip->i_itemp->ili_item;
2663
2664 ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
2665 XFS_FORCED_SHUTDOWN(ip->i_mount));
2666 if (lip->li_flags & XFS_LI_IN_AIL) {
2667 spin_lock(&mp->m_ail_lock);
2668 if (lip->li_flags & XFS_LI_IN_AIL)
2669 xfs_trans_delete_ail(mp, lip);
2670 else
2671 spin_unlock(&mp->m_ail_lock);
2672 }
2673 xfs_inode_item_destroy(ip);
2674 }
2675 kmem_zone_free(xfs_inode_zone, ip);
2676}
2677
2678
2679/* 2452/*
2680 * Increment the pin count of the given inode. 2453 * Increment the pin count of the given inode.
2681 * This value is protected by ipinlock spinlock in the mount structure. 2454 * This value is protected by ipinlock spinlock in the mount structure.
2682 */ 2455 */
@@ -2880,7 +2653,7 @@ xfs_iflush_fork(
2880 ASSERT(ifp->if_broot_bytes <= 2653 ASSERT(ifp->if_broot_bytes <=
2881 (XFS_IFORK_SIZE(ip, whichfork) + 2654 (XFS_IFORK_SIZE(ip, whichfork) +
2882 XFS_BROOT_SIZE_ADJ)); 2655 XFS_BROOT_SIZE_ADJ));
2883 xfs_bmbt_to_bmdr(ifp->if_broot, ifp->if_broot_bytes, 2656 xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
2884 (xfs_bmdr_block_t *)cp, 2657 (xfs_bmdr_block_t *)cp,
2885 XFS_DFORK_SIZE(dip, mp, whichfork)); 2658 XFS_DFORK_SIZE(dip, mp, whichfork));
2886 } 2659 }
@@ -2889,15 +2662,16 @@ xfs_iflush_fork(
2889 case XFS_DINODE_FMT_DEV: 2662 case XFS_DINODE_FMT_DEV:
2890 if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { 2663 if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
2891 ASSERT(whichfork == XFS_DATA_FORK); 2664 ASSERT(whichfork == XFS_DATA_FORK);
2892 dip->di_u.di_dev = cpu_to_be32(ip->i_df.if_u2.if_rdev); 2665 xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
2893 } 2666 }
2894 break; 2667 break;
2895 2668
2896 case XFS_DINODE_FMT_UUID: 2669 case XFS_DINODE_FMT_UUID:
2897 if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) { 2670 if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
2898 ASSERT(whichfork == XFS_DATA_FORK); 2671 ASSERT(whichfork == XFS_DATA_FORK);
2899 memcpy(&dip->di_u.di_muuid, &ip->i_df.if_u2.if_uuid, 2672 memcpy(XFS_DFORK_DPTR(dip),
2900 sizeof(uuid_t)); 2673 &ip->i_df.if_u2.if_uuid,
2674 sizeof(uuid_t));
2901 } 2675 }
2902 break; 2676 break;
2903 2677
@@ -3030,7 +2804,6 @@ cluster_corrupt_out:
3030 XFS_BUF_CLR_BDSTRAT_FUNC(bp); 2804 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
3031 XFS_BUF_UNDONE(bp); 2805 XFS_BUF_UNDONE(bp);
3032 XFS_BUF_STALE(bp); 2806 XFS_BUF_STALE(bp);
3033 XFS_BUF_SHUT(bp);
3034 XFS_BUF_ERROR(bp,EIO); 2807 XFS_BUF_ERROR(bp,EIO);
3035 xfs_biodone(bp); 2808 xfs_biodone(bp);
3036 } else { 2809 } else {
@@ -3172,7 +2945,7 @@ xfs_iflush(
3172 /* 2945 /*
3173 * Get the buffer containing the on-disk inode. 2946 * Get the buffer containing the on-disk inode.
3174 */ 2947 */
3175 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0, 2948 error = xfs_itobp(mp, NULL, ip, &dip, &bp,
3176 noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK); 2949 noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK);
3177 if (error || !bp) { 2950 if (error || !bp) {
3178 xfs_ifunlock(ip); 2951 xfs_ifunlock(ip);
@@ -3253,7 +3026,7 @@ xfs_iflush_int(
3253 } 3026 }
3254 3027
3255 /* set *dip = inode's place in the buffer */ 3028 /* set *dip = inode's place in the buffer */
3256 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_boffset); 3029 dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
3257 3030
3258 /* 3031 /*
3259 * Clear i_update_core before copying out the data. 3032 * Clear i_update_core before copying out the data.
@@ -3275,11 +3048,11 @@ xfs_iflush_int(
3275 */ 3048 */
3276 xfs_synchronize_atime(ip); 3049 xfs_synchronize_atime(ip);
3277 3050
3278 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC, 3051 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC,
3279 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { 3052 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
3280 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 3053 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3281 "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p", 3054 "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p",
3282 ip->i_ino, be16_to_cpu(dip->di_core.di_magic), dip); 3055 ip->i_ino, be16_to_cpu(dip->di_magic), dip);
3283 goto corrupt_out; 3056 goto corrupt_out;
3284 } 3057 }
3285 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, 3058 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
@@ -3342,7 +3115,7 @@ xfs_iflush_int(
3342 * because if the inode is dirty at all the core must 3115 * because if the inode is dirty at all the core must
3343 * be. 3116 * be.
3344 */ 3117 */
3345 xfs_dinode_to_disk(&dip->di_core, &ip->i_d); 3118 xfs_dinode_to_disk(dip, &ip->i_d);
3346 3119
3347 /* Wrap, we never let the log put out DI_MAX_FLUSH */ 3120 /* Wrap, we never let the log put out DI_MAX_FLUSH */
3348 if (ip->i_d.di_flushiter == DI_MAX_FLUSH) 3121 if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
@@ -3354,28 +3127,27 @@ xfs_iflush_int(
3354 * convert back to the old inode format. If the superblock version 3127 * convert back to the old inode format. If the superblock version
3355 * has been updated, then make the conversion permanent. 3128 * has been updated, then make the conversion permanent.
3356 */ 3129 */
3357 ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 || 3130 ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
3358 xfs_sb_version_hasnlink(&mp->m_sb)); 3131 if (ip->i_d.di_version == 1) {
3359 if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
3360 if (!xfs_sb_version_hasnlink(&mp->m_sb)) { 3132 if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
3361 /* 3133 /*
3362 * Convert it back. 3134 * Convert it back.
3363 */ 3135 */
3364 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); 3136 ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
3365 dip->di_core.di_onlink = cpu_to_be16(ip->i_d.di_nlink); 3137 dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink);
3366 } else { 3138 } else {
3367 /* 3139 /*
3368 * The superblock version has already been bumped, 3140 * The superblock version has already been bumped,
3369 * so just make the conversion to the new inode 3141 * so just make the conversion to the new inode
3370 * format permanent. 3142 * format permanent.
3371 */ 3143 */
3372 ip->i_d.di_version = XFS_DINODE_VERSION_2; 3144 ip->i_d.di_version = 2;
3373 dip->di_core.di_version = XFS_DINODE_VERSION_2; 3145 dip->di_version = 2;
3374 ip->i_d.di_onlink = 0; 3146 ip->i_d.di_onlink = 0;
3375 dip->di_core.di_onlink = 0; 3147 dip->di_onlink = 0;
3376 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 3148 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
3377 memset(&(dip->di_core.di_pad[0]), 0, 3149 memset(&(dip->di_pad[0]), 0,
3378 sizeof(dip->di_core.di_pad)); 3150 sizeof(dip->di_pad));
3379 ASSERT(ip->i_d.di_projid == 0); 3151 ASSERT(ip->i_d.di_projid == 0);
3380 } 3152 }
3381 } 3153 }
@@ -3418,10 +3190,8 @@ xfs_iflush_int(
3418 iip->ili_format.ilf_fields = 0; 3190 iip->ili_format.ilf_fields = 0;
3419 iip->ili_logged = 1; 3191 iip->ili_logged = 1;
3420 3192
3421 ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */ 3193 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
3422 spin_lock(&mp->m_ail_lock); 3194 &iip->ili_item.li_lsn);
3423 iip->ili_flush_lsn = iip->ili_item.li_lsn;
3424 spin_unlock(&mp->m_ail_lock);
3425 3195
3426 /* 3196 /*
3427 * Attach the function xfs_iflush_done to the inode's 3197 * Attach the function xfs_iflush_done to the inode's
@@ -3459,45 +3229,8 @@ corrupt_out:
3459} 3229}
3460 3230
3461 3231
3462/*
3463 * Flush all inactive inodes in mp.
3464 */
3465void
3466xfs_iflush_all(
3467 xfs_mount_t *mp)
3468{
3469 xfs_inode_t *ip;
3470
3471 again:
3472 XFS_MOUNT_ILOCK(mp);
3473 ip = mp->m_inodes;
3474 if (ip == NULL)
3475 goto out;
3476
3477 do {
3478 /* Make sure we skip markers inserted by sync */
3479 if (ip->i_mount == NULL) {
3480 ip = ip->i_mnext;
3481 continue;
3482 }
3483
3484 if (!VFS_I(ip)) {
3485 XFS_MOUNT_IUNLOCK(mp);
3486 xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC);
3487 goto again;
3488 }
3489
3490 ASSERT(vn_count(VFS_I(ip)) == 0);
3491
3492 ip = ip->i_mnext;
3493 } while (ip != mp->m_inodes);
3494 out:
3495 XFS_MOUNT_IUNLOCK(mp);
3496}
3497 3232
3498#ifdef XFS_ILOCK_TRACE 3233#ifdef XFS_ILOCK_TRACE
3499ktrace_t *xfs_ilock_trace_buf;
3500
3501void 3234void
3502xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra) 3235xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra)
3503{ 3236{
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 6be310d41daf..1f175fa34b22 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -19,8 +19,7 @@
19#define __XFS_INODE_H__ 19#define __XFS_INODE_H__
20 20
21struct xfs_dinode; 21struct xfs_dinode;
22struct xfs_dinode_core; 22struct xfs_inode;
23
24 23
25/* 24/*
26 * Fork identifiers. 25 * Fork identifiers.
@@ -63,7 +62,7 @@ typedef struct xfs_ext_irec {
63typedef struct xfs_ifork { 62typedef struct xfs_ifork {
64 int if_bytes; /* bytes in if_u1 */ 63 int if_bytes; /* bytes in if_u1 */
65 int if_real_bytes; /* bytes allocated in if_u1 */ 64 int if_real_bytes; /* bytes allocated in if_u1 */
66 xfs_bmbt_block_t *if_broot; /* file's incore btree root */ 65 struct xfs_btree_block *if_broot; /* file's incore btree root */
67 short if_broot_bytes; /* bytes allocated for root */ 66 short if_broot_bytes; /* bytes allocated for root */
68 unsigned char if_flags; /* per-fork flags */ 67 unsigned char if_flags; /* per-fork flags */
69 unsigned char if_ext_max; /* max # of extent records */ 68 unsigned char if_ext_max; /* max # of extent records */
@@ -84,52 +83,14 @@ typedef struct xfs_ifork {
84} xfs_ifork_t; 83} xfs_ifork_t;
85 84
86/* 85/*
87 * Flags for xfs_ichgtime(). 86 * Inode location information. Stored in the inode and passed to
87 * xfs_imap_to_bp() to get a buffer and dinode for a given inode.
88 */ 88 */
89#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ 89struct xfs_imap {
90#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ 90 xfs_daddr_t im_blkno; /* starting BB of inode chunk */
91 91 ushort im_len; /* length in BBs of inode chunk */
92/* 92 ushort im_boffset; /* inode offset in block in bytes */
93 * Per-fork incore inode flags. 93};
94 */
95#define XFS_IFINLINE 0x01 /* Inline data is read in */
96#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */
97#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */
98#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */
99
100/*
101 * Flags for xfs_itobp(), xfs_imap() and xfs_dilocate().
102 */
103#define XFS_IMAP_LOOKUP 0x1
104#define XFS_IMAP_BULKSTAT 0x2
105
106#ifdef __KERNEL__
107struct bhv_desc;
108struct cred;
109struct ktrace;
110struct xfs_buf;
111struct xfs_bmap_free;
112struct xfs_bmbt_irec;
113struct xfs_bmbt_block;
114struct xfs_inode;
115struct xfs_inode_log_item;
116struct xfs_mount;
117struct xfs_trans;
118struct xfs_dquot;
119
120#if defined(XFS_ILOCK_TRACE)
121#define XFS_ILOCK_KTRACE_SIZE 32
122extern ktrace_t *xfs_ilock_trace_buf;
123extern void xfs_ilock_trace(struct xfs_inode *, int, unsigned int, inst_t *);
124#else
125#define xfs_ilock_trace(i,n,f,ra)
126#endif
127
128typedef struct dm_attrs_s {
129 __uint32_t da_dmevmask; /* DMIG event mask */
130 __uint16_t da_dmstate; /* DMIG state info */
131 __uint16_t da_pad; /* DMIG extra padding */
132} dm_attrs_t;
133 94
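Usage of the new location cache, condensed from the xfs_iread() hunks earlier in this diff (a sketch, not a verbatim excerpt):

    struct xfs_imap	*imap = &ip->i_imap;

    imap->im_blkno = bno;		/* 0 == location not yet known */
    error = xfs_imap(mp, tp, ip->i_ino, imap, iget_flags);
    if (error)
    	return error;

    error = xfs_imap_to_bp(mp, tp, imap, &bp, XFS_BUF_LOCK, iget_flags);
    if (error)
    	return error;
    dip = (xfs_dinode_t *)xfs_buf_offset(bp, imap->im_boffset);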
134/* 95/*
135 * This is the xfs in-core inode structure. 96 * This is the xfs in-core inode structure.
@@ -160,7 +121,7 @@ typedef struct xfs_ictimestamp {
160} xfs_ictimestamp_t; 121} xfs_ictimestamp_t;
161 122
162/* 123/*
163 * NOTE: This structure must be kept identical to struct xfs_dinode_core 124 * NOTE: This structure must be kept identical to struct xfs_dinode
164 * in xfs_dinode.h except for the endianness annotations. 125 * in xfs_dinode.h except for the endianness annotations.
165 */ 126 */
166typedef struct xfs_icdinode { 127typedef struct xfs_icdinode {
@@ -191,27 +152,97 @@ typedef struct xfs_icdinode {
191 __uint32_t di_gen; /* generation number */ 152 __uint32_t di_gen; /* generation number */
192} xfs_icdinode_t; 153} xfs_icdinode_t;
193 154
194typedef struct { 155/*
195 struct xfs_inode *ip_mnext; /* next inode in mount list */ 156 * Flags for xfs_ichgtime().
196 struct xfs_inode *ip_mprev; /* ptr to prev inode */ 157 */
197 struct xfs_mount *ip_mount; /* fs mount struct ptr */ 158#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */
198} xfs_iptr_t; 159#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */
160
161/*
162 * Per-fork incore inode flags.
163 */
164#define XFS_IFINLINE 0x01 /* Inline data is read in */
165#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */
166#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */
167#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */
168
169/*
170 * Fork handling.
171 */
172
173#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0)
174#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3))
175
176#define XFS_IFORK_PTR(ip,w) \
177 ((w) == XFS_DATA_FORK ? \
178 &(ip)->i_df : \
179 (ip)->i_afp)
180#define XFS_IFORK_DSIZE(ip) \
181 (XFS_IFORK_Q(ip) ? \
182 XFS_IFORK_BOFF(ip) : \
183 XFS_LITINO((ip)->i_mount))
184#define XFS_IFORK_ASIZE(ip) \
185 (XFS_IFORK_Q(ip) ? \
186 XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : \
187 0)
188#define XFS_IFORK_SIZE(ip,w) \
189 ((w) == XFS_DATA_FORK ? \
190 XFS_IFORK_DSIZE(ip) : \
191 XFS_IFORK_ASIZE(ip))
192#define XFS_IFORK_FORMAT(ip,w) \
193 ((w) == XFS_DATA_FORK ? \
194 (ip)->i_d.di_format : \
195 (ip)->i_d.di_aformat)
196#define XFS_IFORK_FMT_SET(ip,w,n) \
197 ((w) == XFS_DATA_FORK ? \
198 ((ip)->i_d.di_format = (n)) : \
199 ((ip)->i_d.di_aformat = (n)))
200#define XFS_IFORK_NEXTENTS(ip,w) \
201 ((w) == XFS_DATA_FORK ? \
202 (ip)->i_d.di_nextents : \
203 (ip)->i_d.di_anextents)
204#define XFS_IFORK_NEXT_SET(ip,w,n) \
205 ((w) == XFS_DATA_FORK ? \
206 ((ip)->i_d.di_nextents = (n)) : \
207 ((ip)->i_d.di_anextents = (n)))
208
209
210
211#ifdef __KERNEL__
212
213struct bhv_desc;
214struct cred;
215struct ktrace;
216struct xfs_buf;
217struct xfs_bmap_free;
218struct xfs_bmbt_irec;
219struct xfs_inode_log_item;
220struct xfs_mount;
221struct xfs_trans;
222struct xfs_dquot;
223
224#if defined(XFS_ILOCK_TRACE)
225#define XFS_ILOCK_KTRACE_SIZE 32
226extern void xfs_ilock_trace(struct xfs_inode *, int, unsigned int, inst_t *);
227#else
228#define xfs_ilock_trace(i,n,f,ra)
229#endif
230
231typedef struct dm_attrs_s {
232 __uint32_t da_dmevmask; /* DMIG event mask */
233 __uint16_t da_dmstate; /* DMIG state info */
234 __uint16_t da_pad; /* DMIG extra padding */
235} dm_attrs_t;
199 236
200typedef struct xfs_inode { 237typedef struct xfs_inode {
201 /* Inode linking and identification information. */ 238 /* Inode linking and identification information. */
202 struct xfs_inode *i_mnext; /* next inode in mount list */
203 struct xfs_inode *i_mprev; /* ptr to prev inode */
204 struct xfs_mount *i_mount; /* fs mount struct ptr */ 239 struct xfs_mount *i_mount; /* fs mount struct ptr */
205 struct list_head i_reclaim; /* reclaim list */
206 struct inode *i_vnode; /* vnode backpointer */
207 struct xfs_dquot *i_udquot; /* user dquot */ 240 struct xfs_dquot *i_udquot; /* user dquot */
208 struct xfs_dquot *i_gdquot; /* group dquot */ 241 struct xfs_dquot *i_gdquot; /* group dquot */
209 242
210 /* Inode location stuff */ 243 /* Inode location stuff */
211 xfs_ino_t i_ino; /* inode number (agno/agino)*/ 244 xfs_ino_t i_ino; /* inode number (agno/agino)*/
212 xfs_daddr_t i_blkno; /* blkno of inode buffer */ 245 struct xfs_imap i_imap; /* location for xfs_imap() */
213 ushort i_len; /* len of inode buffer */
214 ushort i_boffset; /* off of inode in buffer */
215 246
216 /* Extent information. */ 247 /* Extent information. */
217 xfs_ifork_t *i_afp; /* attribute fork pointer */ 248 xfs_ifork_t *i_afp; /* attribute fork pointer */
@@ -230,7 +261,6 @@ typedef struct xfs_inode {
230 unsigned short i_flags; /* see defined flags below */ 261 unsigned short i_flags; /* see defined flags below */
231 unsigned char i_update_core; /* timestamps/size is dirty */ 262 unsigned char i_update_core; /* timestamps/size is dirty */
232 unsigned char i_update_size; /* di_size field is dirty */ 263 unsigned char i_update_size; /* di_size field is dirty */
233 unsigned int i_gen; /* generation count */
234 unsigned int i_delayed_blks; /* count of delay alloc blks */ 264 unsigned int i_delayed_blks; /* count of delay alloc blks */
235 265
236 xfs_icdinode_t i_d; /* most of ondisk inode */ 266 xfs_icdinode_t i_d; /* most of ondisk inode */
@@ -238,6 +268,10 @@ typedef struct xfs_inode {
238 xfs_fsize_t i_size; /* in-memory size */ 268 xfs_fsize_t i_size; /* in-memory size */
239 xfs_fsize_t i_new_size; /* size when write completes */ 269 xfs_fsize_t i_new_size; /* size when write completes */
240 atomic_t i_iocount; /* outstanding I/O count */ 270 atomic_t i_iocount; /* outstanding I/O count */
271
272 /* VFS inode */
273 struct inode i_vnode; /* embedded VFS inode */
274
241 /* Trace buffers per inode. */ 275 /* Trace buffers per inode. */
242#ifdef XFS_INODE_TRACE 276#ifdef XFS_INODE_TRACE
243 struct ktrace *i_trace; /* general inode trace */ 277 struct ktrace *i_trace; /* general inode trace */
@@ -245,7 +279,7 @@ typedef struct xfs_inode {
245#ifdef XFS_BMAP_TRACE 279#ifdef XFS_BMAP_TRACE
246 struct ktrace *i_xtrace; /* inode extent list trace */ 280 struct ktrace *i_xtrace; /* inode extent list trace */
247#endif 281#endif
248#ifdef XFS_BMBT_TRACE 282#ifdef XFS_BTREE_TRACE
249 struct ktrace *i_btrace; /* inode bmap btree trace */ 283 struct ktrace *i_btrace; /* inode bmap btree trace */
250#endif 284#endif
251#ifdef XFS_RW_TRACE 285#ifdef XFS_RW_TRACE
@@ -265,13 +299,30 @@ typedef struct xfs_inode {
265/* Convert from vfs inode to xfs inode */ 299/* Convert from vfs inode to xfs inode */
266static inline struct xfs_inode *XFS_I(struct inode *inode) 300static inline struct xfs_inode *XFS_I(struct inode *inode)
267{ 301{
268 return (struct xfs_inode *)inode->i_private; 302 return container_of(inode, struct xfs_inode, i_vnode);
269} 303}
270 304
271/* convert from xfs inode to vfs inode */ 305/* convert from xfs inode to vfs inode */
272static inline struct inode *VFS_I(struct xfs_inode *ip) 306static inline struct inode *VFS_I(struct xfs_inode *ip)
273{ 307{
274 return (struct inode *)ip->i_vnode; 308 return &ip->i_vnode;
309}
310
311/*
312 * Get rid of a partially initialized inode.
313 *
314 * We have to go through destroy_inode to make sure allocations
315 * from inode_init_always like the security data are undone.
316 *
317 * We mark the inode bad so that it takes the short cut in
318 * the reclaim path instead of going through the flush path
319 * which doesn't make sense for an inode that has never seen the
320 * light of day.
321 */
322static inline void xfs_destroy_inode(struct xfs_inode *ip)
323{
324 make_bad_inode(VFS_I(ip));
325 return destroy_inode(VFS_I(ip));
275} 326}
276 327
277/* 328/*
@@ -327,65 +378,36 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
327 spin_unlock(&ip->i_flags_lock); 378 spin_unlock(&ip->i_flags_lock);
328 return ret; 379 return ret;
329} 380}
330#endif /* __KERNEL__ */
331
332 381
333/* 382/*
334 * Fork handling. 383 * Manage the i_flush queue embedded in the inode. This completion
384 * queue synchronizes processes attempting to flush the in-core
385 * inode back to disk.
335 */ 386 */
387static inline void xfs_iflock(xfs_inode_t *ip)
388{
389 wait_for_completion(&ip->i_flush);
390}
336 391
337#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0) 392static inline int xfs_iflock_nowait(xfs_inode_t *ip)
338#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3)) 393{
339 394 return try_wait_for_completion(&ip->i_flush);
340#define XFS_IFORK_PTR(ip,w) \ 395}
341 ((w) == XFS_DATA_FORK ? \
342 &(ip)->i_df : \
343 (ip)->i_afp)
344#define XFS_IFORK_DSIZE(ip) \
345 (XFS_IFORK_Q(ip) ? \
346 XFS_IFORK_BOFF(ip) : \
347 XFS_LITINO((ip)->i_mount))
348#define XFS_IFORK_ASIZE(ip) \
349 (XFS_IFORK_Q(ip) ? \
350 XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : \
351 0)
352#define XFS_IFORK_SIZE(ip,w) \
353 ((w) == XFS_DATA_FORK ? \
354 XFS_IFORK_DSIZE(ip) : \
355 XFS_IFORK_ASIZE(ip))
356#define XFS_IFORK_FORMAT(ip,w) \
357 ((w) == XFS_DATA_FORK ? \
358 (ip)->i_d.di_format : \
359 (ip)->i_d.di_aformat)
360#define XFS_IFORK_FMT_SET(ip,w,n) \
361 ((w) == XFS_DATA_FORK ? \
362 ((ip)->i_d.di_format = (n)) : \
363 ((ip)->i_d.di_aformat = (n)))
364#define XFS_IFORK_NEXTENTS(ip,w) \
365 ((w) == XFS_DATA_FORK ? \
366 (ip)->i_d.di_nextents : \
367 (ip)->i_d.di_anextents)
368#define XFS_IFORK_NEXT_SET(ip,w,n) \
369 ((w) == XFS_DATA_FORK ? \
370 ((ip)->i_d.di_nextents = (n)) : \
371 ((ip)->i_d.di_anextents = (n)))
372 396
373#ifdef __KERNEL__ 397static inline void xfs_ifunlock(xfs_inode_t *ip)
398{
399 complete(&ip->i_flush);
400}
374 401
375/* 402/*
376 * In-core inode flags. 403 * In-core inode flags.
377 */ 404 */
378#define XFS_IGRIO 0x0001 /* inode used for guaranteed rate i/o */ 405#define XFS_IRECLAIM 0x0001 /* we have started reclaiming this inode */
379#define XFS_IUIOSZ 0x0002 /* inode i/o sizes have been explicitly set */ 406#define XFS_ISTALE 0x0002 /* inode has been staled */
380#define XFS_IQUIESCE 0x0004 /* we have started quiescing for this inode */ 407#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
381#define XFS_IRECLAIM 0x0008 /* we have started reclaiming this inode */ 408#define XFS_INEW 0x0008 /* inode has just been allocated */
382#define XFS_ISTALE 0x0010 /* inode has been staled */ 409#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */
383#define XFS_IRECLAIMABLE 0x0020 /* inode can be reclaimed */ 410#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */
384#define XFS_INEW 0x0040
385#define XFS_IFILESTREAM 0x0080 /* inode is in a filestream directory */
386#define XFS_IMODIFIED 0x0100 /* XFS inode state possibly differs */
387 /* to the Linux inode state. */
388#define XFS_ITRUNCATED 0x0200 /* truncated down so flush-on-close */
389 411
390/* 412/*
391 * Flags for inode locking. 413 * Flags for inode locking.
@@ -460,16 +482,8 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
460 ((pip)->i_d.di_mode & S_ISGID)) 482 ((pip)->i_d.di_mode & S_ISGID))
461 483
462/* 484/*
463 * Flags for xfs_iget()
464 */
465#define XFS_IGET_CREATE 0x1
466#define XFS_IGET_BULKSTAT 0x2
467
468/*
469 * xfs_iget.c prototypes. 485 * xfs_iget.c prototypes.
470 */ 486 */
471void xfs_ihash_init(struct xfs_mount *);
472void xfs_ihash_free(struct xfs_mount *);
473xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t, 487xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t,
474 struct xfs_trans *); 488 struct xfs_trans *);
475int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, 489int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
@@ -484,25 +498,13 @@ int xfs_isilocked(xfs_inode_t *, uint);
484uint xfs_ilock_map_shared(xfs_inode_t *); 498uint xfs_ilock_map_shared(xfs_inode_t *);
485void xfs_iunlock_map_shared(xfs_inode_t *, uint); 499void xfs_iunlock_map_shared(xfs_inode_t *, uint);
486void xfs_ireclaim(xfs_inode_t *); 500void xfs_ireclaim(xfs_inode_t *);
487int xfs_finish_reclaim(xfs_inode_t *, int, int);
488int xfs_finish_reclaim_all(struct xfs_mount *, int);
489 501
490/* 502/*
491 * xfs_inode.c prototypes. 503 * xfs_inode.c prototypes.
492 */ 504 */
493int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
494 xfs_inode_t *, struct xfs_dinode **, struct xfs_buf **,
495 xfs_daddr_t, uint, uint);
496int xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
497 xfs_inode_t **, xfs_daddr_t, uint);
498int xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int);
499int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, 505int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t,
500 xfs_nlink_t, xfs_dev_t, cred_t *, xfs_prid_t, 506 xfs_nlink_t, xfs_dev_t, cred_t *, xfs_prid_t,
501 int, struct xfs_buf **, boolean_t *, xfs_inode_t **); 507 int, struct xfs_buf **, boolean_t *, xfs_inode_t **);
502void xfs_dinode_from_disk(struct xfs_icdinode *,
503 struct xfs_dinode_core *);
504void xfs_dinode_to_disk(struct xfs_dinode_core *,
505 struct xfs_icdinode *);
506 508
507uint xfs_ip2xflags(struct xfs_inode *); 509uint xfs_ip2xflags(struct xfs_inode *);
508uint xfs_dic2xflags(struct xfs_dinode *); 510uint xfs_dic2xflags(struct xfs_dinode *);
@@ -513,17 +515,10 @@ int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *,
513 xfs_fsize_t, int, int); 515 xfs_fsize_t, int, int);
514int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); 516int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
515 517
516void xfs_idestroy_fork(xfs_inode_t *, int);
517void xfs_idestroy(xfs_inode_t *);
518void xfs_idata_realloc(xfs_inode_t *, int, int);
519void xfs_iextract(xfs_inode_t *);
520void xfs_iext_realloc(xfs_inode_t *, int, int); 518void xfs_iext_realloc(xfs_inode_t *, int, int);
521void xfs_iroot_realloc(xfs_inode_t *, int, int);
522void xfs_ipin(xfs_inode_t *); 519void xfs_ipin(xfs_inode_t *);
523void xfs_iunpin(xfs_inode_t *); 520void xfs_iunpin(xfs_inode_t *);
524int xfs_iextents_copy(xfs_inode_t *, xfs_bmbt_rec_t *, int);
525int xfs_iflush(xfs_inode_t *, uint); 521int xfs_iflush(xfs_inode_t *, uint);
526void xfs_iflush_all(struct xfs_mount *);
527void xfs_ichgtime(xfs_inode_t *, int); 522void xfs_ichgtime(xfs_inode_t *, int);
528xfs_fsize_t xfs_file_last_byte(xfs_inode_t *); 523xfs_fsize_t xfs_file_last_byte(xfs_inode_t *);
529void xfs_lock_inodes(xfs_inode_t **, int, uint); 524void xfs_lock_inodes(xfs_inode_t **, int, uint);
@@ -532,6 +527,77 @@ void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
532void xfs_synchronize_atime(xfs_inode_t *); 527void xfs_synchronize_atime(xfs_inode_t *);
533void xfs_mark_inode_dirty_sync(xfs_inode_t *); 528void xfs_mark_inode_dirty_sync(xfs_inode_t *);
534 529
530#if defined(XFS_INODE_TRACE)
531
532#define INODE_TRACE_SIZE 16 /* number of trace entries */
533#define INODE_KTRACE_ENTRY 1
534#define INODE_KTRACE_EXIT 2
535#define INODE_KTRACE_HOLD 3
536#define INODE_KTRACE_REF 4
537#define INODE_KTRACE_RELE 5
538
539extern void _xfs_itrace_entry(struct xfs_inode *, const char *, inst_t *);
540extern void _xfs_itrace_exit(struct xfs_inode *, const char *, inst_t *);
541extern void xfs_itrace_hold(struct xfs_inode *, char *, int, inst_t *);
542extern void _xfs_itrace_ref(struct xfs_inode *, char *, int, inst_t *);
543extern void xfs_itrace_rele(struct xfs_inode *, char *, int, inst_t *);
544#define xfs_itrace_entry(ip) \
545 _xfs_itrace_entry(ip, __func__, (inst_t *)__return_address)
546#define xfs_itrace_exit(ip) \
547 _xfs_itrace_exit(ip, __func__, (inst_t *)__return_address)
548#define xfs_itrace_exit_tag(ip, tag) \
549 _xfs_itrace_exit(ip, tag, (inst_t *)__return_address)
550#define xfs_itrace_ref(ip) \
551 _xfs_itrace_ref(ip, __FILE__, __LINE__, (inst_t *)__return_address)
552
553#else
554#define xfs_itrace_entry(a)
555#define xfs_itrace_exit(a)
556#define xfs_itrace_exit_tag(a, b)
557#define xfs_itrace_hold(a, b, c, d)
558#define xfs_itrace_ref(a)
559#define xfs_itrace_rele(a, b, c, d)
560#endif
561
562#define IHOLD(ip) \
563do { \
564 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
565 atomic_inc(&(VFS_I(ip)->i_count)); \
566 xfs_itrace_hold((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
567} while (0)
568
569#define IRELE(ip) \
570do { \
571 xfs_itrace_rele((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
572 iput(VFS_I(ip)); \
573} while (0)
574
575#endif /* __KERNEL__ */
576
577/*
578 * Flags for xfs_iget()
579 */
580#define XFS_IGET_CREATE 0x1
581#define XFS_IGET_BULKSTAT 0x2
582
583int xfs_inotobp(struct xfs_mount *, struct xfs_trans *,
584 xfs_ino_t, struct xfs_dinode **,
585 struct xfs_buf **, int *, uint);
586int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
587 struct xfs_inode *, struct xfs_dinode **,
588 struct xfs_buf **, uint);
589int xfs_iread(struct xfs_mount *, struct xfs_trans *,
590 struct xfs_inode *, xfs_daddr_t, uint);
591void xfs_dinode_from_disk(struct xfs_icdinode *,
592 struct xfs_dinode *);
593void xfs_dinode_to_disk(struct xfs_dinode *,
594 struct xfs_icdinode *);
595void xfs_idestroy_fork(struct xfs_inode *, int);
596void xfs_idata_realloc(struct xfs_inode *, int, int);
597void xfs_iroot_realloc(struct xfs_inode *, int, int);
598int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
599int xfs_iextents_copy(struct xfs_inode *, xfs_bmbt_rec_t *, int);
600
535xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t); 601xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t);
536void xfs_iext_insert(xfs_ifork_t *, xfs_extnum_t, xfs_extnum_t, 602void xfs_iext_insert(xfs_ifork_t *, xfs_extnum_t, xfs_extnum_t,
537 xfs_bmbt_irec_t *); 603 xfs_bmbt_irec_t *);
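
[Editor's note on the hunk above] The new IHOLD()/IRELE() macros pin an XFS inode by taking a reference on the embedded Linux inode's i_count rather than keeping a private counter, and the ASSERT guards against "resurrecting" an inode whose count has already hit zero. A minimal user-space model of that pattern (every type and name below is a toy stand-in, not the kernel's):

	#include <assert.h>
	#include <stdatomic.h>
	#include <stdio.h>

	/* Toy stand-ins for struct inode / xfs_inode; not the kernel types. */
	struct vfs_inode { atomic_int i_count; };
	struct toy_xfs_inode { struct vfs_inode vnode; };

	#define VFS_I(ip) (&(ip)->vnode)

	/* Model of IHOLD: only legal on an inode that already has a reference. */
	static void ihold(struct toy_xfs_inode *ip)
	{
		assert(atomic_load(&VFS_I(ip)->i_count) > 0);
		atomic_fetch_add(&VFS_I(ip)->i_count, 1);
	}

	/* Model of IRELE: drop the reference taken by ihold(). */
	static void irele(struct toy_xfs_inode *ip)
	{
		atomic_fetch_sub(&VFS_I(ip)->i_count, 1);
	}

	int main(void)
	{
		struct toy_xfs_inode ino = { .vnode = { .i_count = 1 } };

		ihold(&ino);
		irele(&ino);
		printf("refcount back to %d\n", atomic_load(&ino.vnode.i_count));
		return 0;
	}
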
@@ -561,7 +627,8 @@ void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int);
561#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) 627#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
562 628
563#ifdef DEBUG 629#ifdef DEBUG
564void xfs_isize_check(struct xfs_mount *, xfs_inode_t *, xfs_fsize_t); 630void xfs_isize_check(struct xfs_mount *, struct xfs_inode *,
631 xfs_fsize_t);
565#else /* DEBUG */ 632#else /* DEBUG */
566#define xfs_isize_check(mp, ip, isize) 633#define xfs_isize_check(mp, ip, isize)
567#endif /* DEBUG */ 634#endif /* DEBUG */
@@ -576,26 +643,4 @@ extern struct kmem_zone *xfs_ifork_zone;
576extern struct kmem_zone *xfs_inode_zone; 643extern struct kmem_zone *xfs_inode_zone;
577extern struct kmem_zone *xfs_ili_zone; 644extern struct kmem_zone *xfs_ili_zone;
578 645
579/*
580 * Manage the i_flush queue embedded in the inode. This completion
581 * queue synchronizes processes attempting to flush the in-core
582 * inode back to disk.
583 */
584static inline void xfs_iflock(xfs_inode_t *ip)
585{
586 wait_for_completion(&ip->i_flush);
587}
588
589static inline int xfs_iflock_nowait(xfs_inode_t *ip)
590{
591 return try_wait_for_completion(&ip->i_flush);
592}
593
594static inline void xfs_ifunlock(xfs_inode_t *ip)
595{
596 complete(&ip->i_flush);
597}
598
599#endif /* __KERNEL__ */
600
601#endif /* __XFS_INODE_H__ */ 646#endif /* __XFS_INODE_H__ */
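
[Editor's note on the xfs_inode.h diff above] The deleted xfs_iflock()/xfs_iflock_nowait()/xfs_ifunlock() helpers treated a completion as the inode flush lock: lock by waiting on the completion, trylock via try_wait_for_completion(), unlock by completing it. They are presumably relocated rather than dropped, since xfs_iflush() still depends on the flush lock. A rough user-space analogue of the completion-as-lock idiom, using a binary semaphore (illustrative only):

	#include <semaphore.h>
	#include <stdio.h>

	/* A completion that starts "done" behaves like a free binary lock. */
	static sem_t i_flush;

	static void iflock(void)        { sem_wait(&i_flush); }
	static int  iflock_nowait(void) { return sem_trywait(&i_flush) == 0; }
	static void ifunlock(void)      { sem_post(&i_flush); }

	int main(void)
	{
		sem_init(&i_flush, 0, 1);	/* initially complete == unlocked */
		iflock();			/* take the flush lock */
		printf("nowait while held: %d\n", iflock_nowait()); /* 0: busy */
		ifunlock();
		sem_destroy(&i_flush);
		return 0;
	}
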
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 97c7452e2620..977c4aec587e 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -281,7 +281,7 @@ xfs_inode_item_format(
281 xfs_mark_inode_dirty_sync(ip); 281 xfs_mark_inode_dirty_sync(ip);
282 282
283 vecp->i_addr = (xfs_caddr_t)&ip->i_d; 283 vecp->i_addr = (xfs_caddr_t)&ip->i_d;
284 vecp->i_len = sizeof(xfs_dinode_core_t); 284 vecp->i_len = sizeof(struct xfs_icdinode);
285 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE); 285 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE);
286 vecp++; 286 vecp++;
287 nvecs++; 287 nvecs++;
@@ -296,9 +296,8 @@ xfs_inode_item_format(
296 * has a new version number, then we don't bother converting back. 296 * has a new version number, then we don't bother converting back.
297 */ 297 */
298 mp = ip->i_mount; 298 mp = ip->i_mount;
299 ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 || 299 ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
300 xfs_sb_version_hasnlink(&mp->m_sb)); 300 if (ip->i_d.di_version == 1) {
301 if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
302 if (!xfs_sb_version_hasnlink(&mp->m_sb)) { 301 if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
303 /* 302 /*
304 * Convert it back. 303 * Convert it back.
@@ -311,7 +310,7 @@ xfs_inode_item_format(
311 * so just make the conversion to the new inode 310 * so just make the conversion to the new inode
312 * format permanent. 311 * format permanent.
313 */ 312 */
314 ip->i_d.di_version = XFS_DINODE_VERSION_2; 313 ip->i_d.di_version = 2;
315 ip->i_d.di_onlink = 0; 314 ip->i_d.di_onlink = 0;
316 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 315 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
317 } 316 }
@@ -932,6 +931,7 @@ xfs_inode_item_init(
932 iip->ili_item.li_type = XFS_LI_INODE; 931 iip->ili_item.li_type = XFS_LI_INODE;
933 iip->ili_item.li_ops = &xfs_inode_item_ops; 932 iip->ili_item.li_ops = &xfs_inode_item_ops;
934 iip->ili_item.li_mountp = mp; 933 iip->ili_item.li_mountp = mp;
934 iip->ili_item.li_ailp = mp->m_ail;
935 iip->ili_inode = ip; 935 iip->ili_inode = ip;
936 936
937 /* 937 /*
@@ -942,9 +942,9 @@ xfs_inode_item_init(
942 942
943 iip->ili_format.ilf_type = XFS_LI_INODE; 943 iip->ili_format.ilf_type = XFS_LI_INODE;
944 iip->ili_format.ilf_ino = ip->i_ino; 944 iip->ili_format.ilf_ino = ip->i_ino;
945 iip->ili_format.ilf_blkno = ip->i_blkno; 945 iip->ili_format.ilf_blkno = ip->i_imap.im_blkno;
946 iip->ili_format.ilf_len = ip->i_len; 946 iip->ili_format.ilf_len = ip->i_imap.im_len;
947 iip->ili_format.ilf_boffset = ip->i_boffset; 947 iip->ili_format.ilf_boffset = ip->i_imap.im_boffset;
948} 948}
949 949
950/* 950/*
@@ -976,9 +976,8 @@ xfs_iflush_done(
976 xfs_buf_t *bp, 976 xfs_buf_t *bp,
977 xfs_inode_log_item_t *iip) 977 xfs_inode_log_item_t *iip)
978{ 978{
979 xfs_inode_t *ip; 979 xfs_inode_t *ip = iip->ili_inode;
980 980 struct xfs_ail *ailp = iip->ili_item.li_ailp;
981 ip = iip->ili_inode;
982 981
983 /* 982 /*
984 * We only want to pull the item from the AIL if it is 983 * We only want to pull the item from the AIL if it is
@@ -991,15 +990,12 @@ xfs_iflush_done(
991 */ 990 */
992 if (iip->ili_logged && 991 if (iip->ili_logged &&
993 (iip->ili_item.li_lsn == iip->ili_flush_lsn)) { 992 (iip->ili_item.li_lsn == iip->ili_flush_lsn)) {
994 spin_lock(&ip->i_mount->m_ail_lock); 993 spin_lock(&ailp->xa_lock);
995 if (iip->ili_item.li_lsn == iip->ili_flush_lsn) { 994 if (iip->ili_item.li_lsn == iip->ili_flush_lsn) {
996 /* 995 /* xfs_trans_ail_delete() drops the AIL lock. */
997 * xfs_trans_delete_ail() drops the AIL lock. 996 xfs_trans_ail_delete(ailp, (xfs_log_item_t*)iip);
998 */
999 xfs_trans_delete_ail(ip->i_mount,
1000 (xfs_log_item_t*)iip);
1001 } else { 997 } else {
1002 spin_unlock(&ip->i_mount->m_ail_lock); 998 spin_unlock(&ailp->xa_lock);
1003 } 999 }
1004 } 1000 }
1005 1001
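
[Editor's note on the hunk above] xfs_iflush_done() follows a classic double-checked pattern against the new per-AIL lock: test the LSN unlocked, take ailp->xa_lock, re-test under the lock, and let the delete helper drop the lock on the success path. A compact model of the locking protocol only (no real AIL, one flag stands in for the item):

	#include <pthread.h>
	#include <stdio.h>

	typedef long lsn_t;

	struct toy_ail { pthread_mutex_t xa_lock; };

	/* Model of xfs_trans_ail_delete(): removes the item and DROPS xa_lock. */
	static void ail_delete(struct toy_ail *ailp, lsn_t *item_lsn)
	{
		*item_lsn = 0;			/* "remove" the item */
		pthread_mutex_unlock(&ailp->xa_lock);
	}

	static void iflush_done(struct toy_ail *ailp, lsn_t *item_lsn, lsn_t flush_lsn)
	{
		if (*item_lsn != flush_lsn)	/* cheap unlocked pre-check */
			return;
		pthread_mutex_lock(&ailp->xa_lock);
		if (*item_lsn == flush_lsn)	/* re-check under the lock */
			ail_delete(ailp, item_lsn);	/* drops xa_lock */
		else
			pthread_mutex_unlock(&ailp->xa_lock);
	}

	int main(void)
	{
		struct toy_ail ail = { PTHREAD_MUTEX_INITIALIZER };
		lsn_t lsn = 42;

		iflush_done(&ail, &lsn, 42);
		printf("item lsn after flush: %ld\n", lsn);
		return 0;
	}
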
@@ -1031,21 +1027,20 @@ void
1031xfs_iflush_abort( 1027xfs_iflush_abort(
1032 xfs_inode_t *ip) 1028 xfs_inode_t *ip)
1033{ 1029{
1034 xfs_inode_log_item_t *iip; 1030 xfs_inode_log_item_t *iip = ip->i_itemp;
1035 xfs_mount_t *mp; 1031 xfs_mount_t *mp;
1036 1032
1037 iip = ip->i_itemp; 1033 iip = ip->i_itemp;
1038 mp = ip->i_mount; 1034 mp = ip->i_mount;
1039 if (iip) { 1035 if (iip) {
1036 struct xfs_ail *ailp = iip->ili_item.li_ailp;
1040 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { 1037 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
1041 spin_lock(&mp->m_ail_lock); 1038 spin_lock(&ailp->xa_lock);
1042 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { 1039 if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
1043 /* 1040 /* xfs_trans_ail_delete() drops the AIL lock. */
1044 * xfs_trans_delete_ail() drops the AIL lock. 1041 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)iip);
1045 */
1046 xfs_trans_delete_ail(mp, (xfs_log_item_t *)iip);
1047 } else 1042 } else
1048 spin_unlock(&mp->m_ail_lock); 1043 spin_unlock(&ailp->xa_lock);
1049 } 1044 }
1050 iip->ili_logged = 0; 1045 iip->ili_logged = 0;
1051 /* 1046 /*
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 40513077ab36..1ff04cc323ad 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -112,6 +112,24 @@ typedef struct xfs_inode_log_format_64 {
112#define XFS_ILI_IOLOCKED_ANY (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED) 112#define XFS_ILI_IOLOCKED_ANY (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED)
113 113
114 114
115#define XFS_ILOG_FBROOT(w) xfs_ilog_fbroot(w)
116static inline int xfs_ilog_fbroot(int w)
117{
118 return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
119}
120
121#define XFS_ILOG_FEXT(w) xfs_ilog_fext(w)
122static inline int xfs_ilog_fext(int w)
123{
124 return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
125}
126
127#define XFS_ILOG_FDATA(w) xfs_ilog_fdata(w)
128static inline int xfs_ilog_fdata(int w)
129{
130 return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
131}
132
115#ifdef __KERNEL__ 133#ifdef __KERNEL__
116 134
117struct xfs_buf; 135struct xfs_buf;
@@ -148,26 +166,6 @@ typedef struct xfs_inode_log_item {
148} xfs_inode_log_item_t; 166} xfs_inode_log_item_t;
149 167
150 168
151#define XFS_ILOG_FDATA(w) xfs_ilog_fdata(w)
152static inline int xfs_ilog_fdata(int w)
153{
154 return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
155}
156
157#endif /* __KERNEL__ */
158
159#define XFS_ILOG_FBROOT(w) xfs_ilog_fbroot(w)
160static inline int xfs_ilog_fbroot(int w)
161{
162 return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
163}
164
165#define XFS_ILOG_FEXT(w) xfs_ilog_fext(w)
166static inline int xfs_ilog_fext(int w)
167{
168 return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
169}
170
171static inline int xfs_inode_clean(xfs_inode_t *ip) 169static inline int xfs_inode_clean(xfs_inode_t *ip)
172{ 170{
173 return (!ip->i_itemp || 171 return (!ip->i_itemp ||
@@ -175,9 +173,6 @@ static inline int xfs_inode_clean(xfs_inode_t *ip)
175 !ip->i_update_core; 173 !ip->i_update_core;
176} 174}
177 175
178
179#ifdef __KERNEL__
180
181extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); 176extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
182extern void xfs_inode_item_destroy(struct xfs_inode *); 177extern void xfs_inode_item_destroy(struct xfs_inode *);
183extern void xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *); 178extern void xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *);
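
[Editor's note on the xfs_inode_item.h diff above] The relocated xfs_ilog_f*() helpers all share one shape: pick the data-fork or attr-fork variant of a log flag based on which fork is being logged, and hoisting them out of the #ifdef __KERNEL__ region makes them usable by non-kernel consumers of this header. A self-contained rendering of the pattern (flag values invented for the demo, not the real XFS_ILOG_* constants):

	#include <stdio.h>

	/* Demo flag values; the real XFS_ILOG_* constants live in the header. */
	#define DEMO_DATA_FORK	0
	#define DEMO_ATTR_FORK	1
	#define DEMO_ILOG_DDATA	0x040
	#define DEMO_ILOG_ADATA	0x200

	/* Same shape as xfs_ilog_fdata(): select the per-fork flag. */
	static inline int demo_ilog_fdata(int w)
	{
		return w == DEMO_DATA_FORK ? DEMO_ILOG_DDATA : DEMO_ILOG_ADATA;
	}

	int main(void)
	{
		printf("data fork flag: 0x%x\n", demo_ilog_fdata(DEMO_DATA_FORK));
		printf("attr fork flag: 0x%x\n", demo_ilog_fdata(DEMO_ATTR_FORK));
		return 0;
	}
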
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 67f22b2b44b3..911062cf73a6 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -290,7 +290,6 @@ STATIC int
290xfs_iomap_eof_align_last_fsb( 290xfs_iomap_eof_align_last_fsb(
291 xfs_mount_t *mp, 291 xfs_mount_t *mp,
292 xfs_inode_t *ip, 292 xfs_inode_t *ip,
293 xfs_fsize_t isize,
294 xfs_extlen_t extsize, 293 xfs_extlen_t extsize,
295 xfs_fileoff_t *last_fsb) 294 xfs_fileoff_t *last_fsb)
296{ 295{
@@ -306,14 +305,14 @@ xfs_iomap_eof_align_last_fsb(
306 * stripe width and we are allocating past the allocation eof. 305 * stripe width and we are allocating past the allocation eof.
307 */ 306 */
308 else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) && 307 else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) &&
309 (isize >= XFS_FSB_TO_B(mp, mp->m_swidth))) 308 (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth)))
310 new_last_fsb = roundup_64(*last_fsb, mp->m_swidth); 309 new_last_fsb = roundup_64(*last_fsb, mp->m_swidth);
311 /* 310 /*
312 * Round up the allocation request to a stripe unit (m_dalign) boundary 311 * Round up the allocation request to a stripe unit (m_dalign) boundary
313 * if the file size is >= stripe unit size, and we are allocating past 312 * if the file size is >= stripe unit size, and we are allocating past
314 * the allocation eof. 313 * the allocation eof.
315 */ 314 */
316 else if (mp->m_dalign && (isize >= XFS_FSB_TO_B(mp, mp->m_dalign))) 315 else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign)))
317 new_last_fsb = roundup_64(*last_fsb, mp->m_dalign); 316 new_last_fsb = roundup_64(*last_fsb, mp->m_dalign);
318 317
319 /* 318 /*
@@ -403,7 +402,6 @@ xfs_iomap_write_direct(
403 xfs_filblks_t count_fsb, resaligned; 402 xfs_filblks_t count_fsb, resaligned;
404 xfs_fsblock_t firstfsb; 403 xfs_fsblock_t firstfsb;
405 xfs_extlen_t extsz, temp; 404 xfs_extlen_t extsz, temp;
406 xfs_fsize_t isize;
407 int nimaps; 405 int nimaps;
408 int bmapi_flag; 406 int bmapi_flag;
409 int quota_flag; 407 int quota_flag;
@@ -426,15 +424,10 @@ xfs_iomap_write_direct(
426 rt = XFS_IS_REALTIME_INODE(ip); 424 rt = XFS_IS_REALTIME_INODE(ip);
427 extsz = xfs_get_extsz_hint(ip); 425 extsz = xfs_get_extsz_hint(ip);
428 426
429 isize = ip->i_size;
430 if (ip->i_new_size > isize)
431 isize = ip->i_new_size;
432
433 offset_fsb = XFS_B_TO_FSBT(mp, offset); 427 offset_fsb = XFS_B_TO_FSBT(mp, offset);
434 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); 428 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
435 if ((offset + count) > isize) { 429 if ((offset + count) > ip->i_size) {
436 error = xfs_iomap_eof_align_last_fsb(mp, ip, isize, extsz, 430 error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
437 &last_fsb);
438 if (error) 431 if (error)
439 goto error_out; 432 goto error_out;
440 } else { 433 } else {
@@ -559,7 +552,6 @@ STATIC int
559xfs_iomap_eof_want_preallocate( 552xfs_iomap_eof_want_preallocate(
560 xfs_mount_t *mp, 553 xfs_mount_t *mp,
561 xfs_inode_t *ip, 554 xfs_inode_t *ip,
562 xfs_fsize_t isize,
563 xfs_off_t offset, 555 xfs_off_t offset,
564 size_t count, 556 size_t count,
565 int ioflag, 557 int ioflag,
@@ -573,7 +565,7 @@ xfs_iomap_eof_want_preallocate(
573 int n, error, imaps; 565 int n, error, imaps;
574 566
575 *prealloc = 0; 567 *prealloc = 0;
576 if ((ioflag & BMAPI_SYNC) || (offset + count) <= isize) 568 if ((ioflag & BMAPI_SYNC) || (offset + count) <= ip->i_size)
577 return 0; 569 return 0;
578 570
579 /* 571 /*
@@ -617,7 +609,6 @@ xfs_iomap_write_delay(
617 xfs_fileoff_t ioalign; 609 xfs_fileoff_t ioalign;
618 xfs_fsblock_t firstblock; 610 xfs_fsblock_t firstblock;
619 xfs_extlen_t extsz; 611 xfs_extlen_t extsz;
620 xfs_fsize_t isize;
621 int nimaps; 612 int nimaps;
622 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; 613 xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
623 int prealloc, fsynced = 0; 614 int prealloc, fsynced = 0;
@@ -637,11 +628,7 @@ xfs_iomap_write_delay(
637 offset_fsb = XFS_B_TO_FSBT(mp, offset); 628 offset_fsb = XFS_B_TO_FSBT(mp, offset);
638 629
639retry: 630retry:
640 isize = ip->i_size; 631 error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
641 if (ip->i_new_size > isize)
642 isize = ip->i_new_size;
643
644 error = xfs_iomap_eof_want_preallocate(mp, ip, isize, offset, count,
645 ioflag, imap, XFS_WRITE_IMAPS, &prealloc); 632 ioflag, imap, XFS_WRITE_IMAPS, &prealloc);
646 if (error) 633 if (error)
647 return error; 634 return error;
@@ -655,8 +642,7 @@ retry:
655 } 642 }
656 643
657 if (prealloc || extsz) { 644 if (prealloc || extsz) {
658 error = xfs_iomap_eof_align_last_fsb(mp, ip, isize, extsz, 645 error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
659 &last_fsb);
660 if (error) 646 if (error)
661 return error; 647 return error;
662 } 648 }
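
[Editor's note on the xfs_iomap.c diff above] The isize parameter is dropped from xfs_iomap_eof_align_last_fsb() and xfs_iomap_eof_want_preallocate(); both now read ip->i_size directly instead of the caller-computed max of i_size and i_new_size. The alignment itself is a plain round-up of the last block to the stripe unit or stripe width once the file is at least that large. A sketch of just the arithmetic (plain integers, not the XFS types):

	#include <stdio.h>

	typedef unsigned long long u64;

	/* Round x up to the next multiple of y (y > 0), like roundup_64(). */
	static u64 roundup_u64(u64 x, u64 y)
	{
		return ((x + y - 1) / y) * y;
	}

	/* Align an EOF allocation: only kicks in past a size threshold. */
	static u64 align_last_fsb(u64 last_fsb, u64 file_size,
				  u64 stripe_fsb, u64 stripe_bytes)
	{
		if (stripe_fsb && file_size >= stripe_bytes)
			return roundup_u64(last_fsb, stripe_fsb);
		return last_fsb;
	}

	int main(void)
	{
		/* e.g. stripe unit of 16 blocks, file already past one stripe */
		printf("%llu\n", align_last_fsb(103, 1 << 20, 16, 64 * 1024));
		return 0;
	}
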
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index cf6754a3c5b3..e19d0a8d5618 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -69,7 +69,7 @@ xfs_bulkstat_one_iget(
69 } 69 }
70 70
71 ASSERT(ip != NULL); 71 ASSERT(ip != NULL);
72 ASSERT(ip->i_blkno != (xfs_daddr_t)0); 72 ASSERT(ip->i_imap.im_blkno != 0);
73 73
74 dic = &ip->i_d; 74 dic = &ip->i_d;
75 75
@@ -125,13 +125,9 @@ STATIC void
125xfs_bulkstat_one_dinode( 125xfs_bulkstat_one_dinode(
126 xfs_mount_t *mp, /* mount point for filesystem */ 126 xfs_mount_t *mp, /* mount point for filesystem */
127 xfs_ino_t ino, /* inode number to get data for */ 127 xfs_ino_t ino, /* inode number to get data for */
128 xfs_dinode_t *dip, /* dinode inode pointer */ 128 xfs_dinode_t *dic, /* dinode inode pointer */
129 xfs_bstat_t *buf) /* return buffer */ 129 xfs_bstat_t *buf) /* return buffer */
130{ 130{
131 xfs_dinode_core_t *dic; /* dinode core info pointer */
132
133 dic = &dip->di_core;
134
135 /* 131 /*
136 * The inode format changed when we moved the link count and 132 * The inode format changed when we moved the link count and
137 * made it 32 bits long. If this is an old format inode, 133 * made it 32 bits long. If this is an old format inode,
@@ -143,7 +139,7 @@ xfs_bulkstat_one_dinode(
143 * the new format. We don't change the version number so that we 139 * the new format. We don't change the version number so that we
144 * can distinguish this from a real new format inode. 140 * can distinguish this from a real new format inode.
145 */ 141 */
146 if (dic->di_version == XFS_DINODE_VERSION_1) { 142 if (dic->di_version == 1) {
147 buf->bs_nlink = be16_to_cpu(dic->di_onlink); 143 buf->bs_nlink = be16_to_cpu(dic->di_onlink);
148 buf->bs_projid = 0; 144 buf->bs_projid = 0;
149 } else { 145 } else {
@@ -162,7 +158,7 @@ xfs_bulkstat_one_dinode(
162 buf->bs_mtime.tv_nsec = be32_to_cpu(dic->di_mtime.t_nsec); 158 buf->bs_mtime.tv_nsec = be32_to_cpu(dic->di_mtime.t_nsec);
163 buf->bs_ctime.tv_sec = be32_to_cpu(dic->di_ctime.t_sec); 159 buf->bs_ctime.tv_sec = be32_to_cpu(dic->di_ctime.t_sec);
164 buf->bs_ctime.tv_nsec = be32_to_cpu(dic->di_ctime.t_nsec); 160 buf->bs_ctime.tv_nsec = be32_to_cpu(dic->di_ctime.t_nsec);
165 buf->bs_xflags = xfs_dic2xflags(dip); 161 buf->bs_xflags = xfs_dic2xflags(dic);
166 buf->bs_extsize = be32_to_cpu(dic->di_extsize) << mp->m_sb.sb_blocklog; 162 buf->bs_extsize = be32_to_cpu(dic->di_extsize) << mp->m_sb.sb_blocklog;
167 buf->bs_extents = be32_to_cpu(dic->di_nextents); 163 buf->bs_extents = be32_to_cpu(dic->di_nextents);
168 buf->bs_gen = be32_to_cpu(dic->di_gen); 164 buf->bs_gen = be32_to_cpu(dic->di_gen);
@@ -173,7 +169,7 @@ xfs_bulkstat_one_dinode(
173 169
174 switch (dic->di_format) { 170 switch (dic->di_format) {
175 case XFS_DINODE_FMT_DEV: 171 case XFS_DINODE_FMT_DEV:
176 buf->bs_rdev = be32_to_cpu(dip->di_u.di_dev); 172 buf->bs_rdev = xfs_dinode_get_rdev(dic);
177 buf->bs_blksize = BLKDEV_IOSIZE; 173 buf->bs_blksize = BLKDEV_IOSIZE;
178 buf->bs_blocks = 0; 174 buf->bs_blocks = 0;
179 break; 175 break;
@@ -192,27 +188,34 @@ xfs_bulkstat_one_dinode(
192 } 188 }
193} 189}
194 190
191/* Return 0 on success or positive error */
195STATIC int 192STATIC int
196xfs_bulkstat_one_fmt( 193xfs_bulkstat_one_fmt(
197 void __user *ubuffer, 194 void __user *ubuffer,
195 int ubsize,
196 int *ubused,
198 const xfs_bstat_t *buffer) 197 const xfs_bstat_t *buffer)
199{ 198{
199 if (ubsize < sizeof(*buffer))
200 return XFS_ERROR(ENOMEM);
200 if (copy_to_user(ubuffer, buffer, sizeof(*buffer))) 201 if (copy_to_user(ubuffer, buffer, sizeof(*buffer)))
201 return -EFAULT; 202 return XFS_ERROR(EFAULT);
202 return sizeof(*buffer); 203 if (ubused)
204 *ubused = sizeof(*buffer);
205 return 0;
203} 206}
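
[Editor's note on the hunk above] The reworked xfs_bulkstat_one_fmt() pins down the formatter contract: verify the remaining user buffer space first (ENOMEM if full), report EFAULT on a failed copy, record the bytes consumed through *ubused, and return 0 on success rather than a byte count. A user-space sketch of a formatter obeying that contract, with memcpy standing in for copy_to_user():

	#include <errno.h>
	#include <stdio.h>
	#include <string.h>

	struct demo_bstat { long ino; long size; };	/* toy xfs_bstat_t */

	/* Formatter contract: 0 on success, positive errno on failure. */
	static int demo_bulkstat_fmt(void *ubuffer, int ubsize, int *ubused,
				     const struct demo_bstat *buffer)
	{
		if (ubsize < (int)sizeof(*buffer))
			return ENOMEM;			/* caller's buffer is full */
		memcpy(ubuffer, buffer, sizeof(*buffer)); /* copy_to_user() in-kernel */
		if (ubused)
			*ubused = sizeof(*buffer);	/* bytes consumed */
		return 0;
	}

	int main(void)
	{
		struct demo_bstat in = { 128, 4096 }, out;
		int used = 0;

		if (demo_bulkstat_fmt(&out, sizeof(out), &used, &in) == 0)
			printf("ino %ld, %d bytes used\n", out.ino, used);
		return 0;
	}
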
204 207
205/* 208/*
206 * Return stat information for one inode. 209 * Return stat information for one inode.
207 * Return 0 if ok, else errno. 210 * Return 0 if ok, else errno.
208 */ 211 */
209int /* error status */ 212int /* error status */
210xfs_bulkstat_one( 213xfs_bulkstat_one_int(
211 xfs_mount_t *mp, /* mount point for filesystem */ 214 xfs_mount_t *mp, /* mount point for filesystem */
212 xfs_ino_t ino, /* inode number to get data for */ 215 xfs_ino_t ino, /* inode number to get data for */
213 void __user *buffer, /* buffer to place output in */ 216 void __user *buffer, /* buffer to place output in */
214 int ubsize, /* size of buffer */ 217 int ubsize, /* size of buffer */
215 void *private_data, /* my private data */ 218 bulkstat_one_fmt_pf formatter, /* formatter, copy to user */
216 xfs_daddr_t bno, /* starting bno of inode cluster */ 219 xfs_daddr_t bno, /* starting bno of inode cluster */
217 int *ubused, /* bytes used by me */ 220 int *ubused, /* bytes used by me */
218 void *dibuff, /* on-disk inode buffer */ 221 void *dibuff, /* on-disk inode buffer */
@@ -221,15 +224,12 @@ xfs_bulkstat_one(
221 xfs_bstat_t *buf; /* return buffer */ 224 xfs_bstat_t *buf; /* return buffer */
222 int error = 0; /* error value */ 225 int error = 0; /* error value */
223 xfs_dinode_t *dip; /* dinode inode pointer */ 226 xfs_dinode_t *dip; /* dinode inode pointer */
224 bulkstat_one_fmt_pf formatter = private_data ? : xfs_bulkstat_one_fmt;
225 227
226 dip = (xfs_dinode_t *)dibuff; 228 dip = (xfs_dinode_t *)dibuff;
227 *stat = BULKSTAT_RV_NOTHING; 229 *stat = BULKSTAT_RV_NOTHING;
228 230
229 if (!buffer || xfs_internal_inum(mp, ino)) 231 if (!buffer || xfs_internal_inum(mp, ino))
230 return XFS_ERROR(EINVAL); 232 return XFS_ERROR(EINVAL);
231 if (ubsize < sizeof(*buf))
232 return XFS_ERROR(ENOMEM);
233 233
234 buf = kmem_alloc(sizeof(*buf), KM_SLEEP); 234 buf = kmem_alloc(sizeof(*buf), KM_SLEEP);
235 235
@@ -244,21 +244,34 @@ xfs_bulkstat_one(
244 xfs_bulkstat_one_dinode(mp, ino, dip, buf); 244 xfs_bulkstat_one_dinode(mp, ino, dip, buf);
245 } 245 }
246 246
247 error = formatter(buffer, buf); 247 error = formatter(buffer, ubsize, ubused, buf);
248 if (error < 0) { 248 if (error)
249 error = EFAULT;
250 goto out_free; 249 goto out_free;
251 }
252 250
253 *stat = BULKSTAT_RV_DIDONE; 251 *stat = BULKSTAT_RV_DIDONE;
254 if (ubused)
255 *ubused = error;
256 252
257 out_free: 253 out_free:
258 kmem_free(buf); 254 kmem_free(buf);
259 return error; 255 return error;
260} 256}
261 257
258int
259xfs_bulkstat_one(
260 xfs_mount_t *mp, /* mount point for filesystem */
261 xfs_ino_t ino, /* inode number to get data for */
262 void __user *buffer, /* buffer to place output in */
263 int ubsize, /* size of buffer */
264 void *private_data, /* my private data */
265 xfs_daddr_t bno, /* starting bno of inode cluster */
266 int *ubused, /* bytes used by me */
267 void *dibuff, /* on-disk inode buffer */
268 int *stat) /* BULKSTAT_RV_... */
269{
270 return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
271 xfs_bulkstat_one_fmt, bno,
272 ubused, dibuff, stat);
273}
274
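
[Editor's note on the hunk above] xfs_bulkstat_one() becomes a thin shim that forwards to xfs_bulkstat_one_int() with the default formatter bound, presumably so other callers (the compat ioctl layer, for one) can supply their own formatter. The shape is ordinary callback currying; a toy version:

	#include <stdio.h>

	typedef int (*fmt_pf)(char *dst, int n, const char *src);

	static int default_fmt(char *dst, int n, const char *src)
	{
		return snprintf(dst, n, "[%s]", src) < n ? 0 : -1;
	}

	/* The generic worker takes the formatter explicitly... */
	static int stat_one_int(char *dst, int n, const char *src, fmt_pf fmt)
	{
		return fmt(dst, n, src);
	}

	/* ...and the historical entry point just binds the default. */
	static int stat_one(char *dst, int n, const char *src)
	{
		return stat_one_int(dst, n, src, default_fmt);
	}

	int main(void)
	{
		char buf[32];

		if (stat_one(buf, sizeof(buf), "ino 128") == 0)
			puts(buf);
		return 0;
	}
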
262/* 275/*
263 * Test to see whether we can use the ondisk inode directly, based 276 * Test to see whether we can use the ondisk inode directly, based
264 * on the given bulkstat flags, filling in dipp accordingly. 277 * on the given bulkstat flags, filling in dipp accordingly.
@@ -287,19 +300,19 @@ xfs_bulkstat_use_dinode(
287 * to disk yet. This is a temporary hack that would require a proper 300 * to disk yet. This is a temporary hack that would require a proper
288 * fix in the future. 301 * fix in the future.
289 */ 302 */
290 if (be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC || 303 if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
291 !XFS_DINODE_GOOD_VERSION(dip->di_core.di_version) || 304 !XFS_DINODE_GOOD_VERSION(dip->di_version) ||
292 !dip->di_core.di_mode) 305 !dip->di_mode)
293 return 0; 306 return 0;
294 if (flags & BULKSTAT_FG_QUICK) { 307 if (flags & BULKSTAT_FG_QUICK) {
295 *dipp = dip; 308 *dipp = dip;
296 return 1; 309 return 1;
297 } 310 }
298 /* BULKSTAT_FG_INLINE: if attr fork is local, or not there, use it */ 311 /* BULKSTAT_FG_INLINE: if attr fork is local, or not there, use it */
299 aformat = dip->di_core.di_aformat; 312 aformat = dip->di_aformat;
300 if ((XFS_DFORK_Q(dip) == 0) || 313 if ((XFS_DFORK_Q(dip) == 0) ||
301 (aformat == XFS_DINODE_FMT_LOCAL) || 314 (aformat == XFS_DINODE_FMT_LOCAL) ||
302 (aformat == XFS_DINODE_FMT_EXTENTS && !dip->di_core.di_anextents)) { 315 (aformat == XFS_DINODE_FMT_EXTENTS && !dip->di_anextents)) {
303 *dipp = dip; 316 *dipp = dip;
304 return 1; 317 return 1;
305 } 318 }
@@ -359,7 +372,6 @@ xfs_bulkstat(
359 int ubused; /* bytes used by formatter */ 372 int ubused; /* bytes used by formatter */
360 xfs_buf_t *bp; /* ptr to on-disk inode cluster buf */ 373 xfs_buf_t *bp; /* ptr to on-disk inode cluster buf */
361 xfs_dinode_t *dip; /* ptr into bp for specific inode */ 374 xfs_dinode_t *dip; /* ptr into bp for specific inode */
362 xfs_inode_t *ip; /* ptr to in-core inode struct */
363 375
364 /* 376 /*
365 * Get the last inode value, see if there's nothing to do. 377 * Get the last inode value, see if there's nothing to do.
@@ -416,8 +428,7 @@ xfs_bulkstat(
416 /* 428 /*
417 * Allocate and initialize a btree cursor for ialloc btree. 429 * Allocate and initialize a btree cursor for ialloc btree.
418 */ 430 */
419 cur = xfs_btree_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_INO, 431 cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
420 (xfs_inode_t *)0, 0);
421 irbp = irbuf; 432 irbp = irbuf;
422 irbufend = irbuf + nirbuf; 433 irbufend = irbuf + nirbuf;
423 end_of_ag = 0; 434 end_of_ag = 0;
@@ -472,7 +483,7 @@ xfs_bulkstat(
472 * In any case, increment to the next record. 483 * In any case, increment to the next record.
473 */ 484 */
474 if (!error) 485 if (!error)
475 error = xfs_inobt_increment(cur, 0, &tmp); 486 error = xfs_btree_increment(cur, 0, &tmp);
476 } else { 487 } else {
477 /* 488 /*
478 * Start of ag. Lookup the first inode chunk. 489 * Start of ag. Lookup the first inode chunk.
@@ -539,7 +550,7 @@ xfs_bulkstat(
539 * Set agino to after this chunk and bump the cursor. 550 * Set agino to after this chunk and bump the cursor.
540 */ 551 */
541 agino = gino + XFS_INODES_PER_CHUNK; 552 agino = gino + XFS_INODES_PER_CHUNK;
542 error = xfs_inobt_increment(cur, 0, &tmp); 553 error = xfs_btree_increment(cur, 0, &tmp);
543 cond_resched(); 554 cond_resched();
544 } 555 }
545 /* 556 /*
@@ -586,6 +597,8 @@ xfs_bulkstat(
586 597
587 if (flags & (BULKSTAT_FG_QUICK | 598 if (flags & (BULKSTAT_FG_QUICK |
588 BULKSTAT_FG_INLINE)) { 599 BULKSTAT_FG_INLINE)) {
600 int offset;
601
589 ino = XFS_AGINO_TO_INO(mp, agno, 602 ino = XFS_AGINO_TO_INO(mp, agno,
590 agino); 603 agino);
591 bno = XFS_AGB_TO_DADDR(mp, agno, 604 bno = XFS_AGB_TO_DADDR(mp, agno,
@@ -594,21 +607,15 @@ xfs_bulkstat(
594 /* 607 /*
595 * Get the inode cluster buffer 608 * Get the inode cluster buffer
596 */ 609 */
597 ASSERT(xfs_inode_zone != NULL);
598 ip = kmem_zone_zalloc(xfs_inode_zone,
599 KM_SLEEP);
600 ip->i_ino = ino;
601 ip->i_mount = mp;
602 spin_lock_init(&ip->i_flags_lock);
603 if (bp) 610 if (bp)
604 xfs_buf_relse(bp); 611 xfs_buf_relse(bp);
605 error = xfs_itobp(mp, NULL, ip, 612
606 &dip, &bp, bno, 613 error = xfs_inotobp(mp, NULL, ino, &dip,
607 XFS_IMAP_BULKSTAT, 614 &bp, &offset,
608 XFS_BUF_LOCK); 615 XFS_IGET_BULKSTAT);
616
609 if (!error) 617 if (!error)
610 clustidx = ip->i_boffset / mp->m_sb.sb_inodesize; 618 clustidx = offset / mp->m_sb.sb_inodesize;
611 kmem_zone_free(xfs_inode_zone, ip);
612 if (XFS_TEST_ERROR(error != 0, 619 if (XFS_TEST_ERROR(error != 0,
613 mp, XFS_ERRTAG_BULKSTAT_READ_CHUNK, 620 mp, XFS_ERRTAG_BULKSTAT_READ_CHUNK,
614 XFS_RANDOM_BULKSTAT_READ_CHUNK)) { 621 XFS_RANDOM_BULKSTAT_READ_CHUNK)) {
@@ -842,8 +849,7 @@ xfs_inumbers(
842 agino = 0; 849 agino = 0;
843 continue; 850 continue;
844 } 851 }
845 cur = xfs_btree_init_cursor(mp, NULL, agbp, agno, 852 cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
846 XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
847 error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp); 853 error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp);
848 if (error) { 854 if (error) {
849 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 855 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
@@ -887,7 +893,7 @@ xfs_inumbers(
887 bufidx = 0; 893 bufidx = 0;
888 } 894 }
889 if (left) { 895 if (left) {
890 error = xfs_inobt_increment(cur, 0, &tmp); 896 error = xfs_btree_increment(cur, 0, &tmp);
891 if (error) { 897 if (error) {
892 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 898 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
893 cur = NULL; 899 cur = NULL;
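
[Editor's note on the xfs_itable.c diff above] The cursor calls switch to the generic btree API: xfs_inobt_init_cursor() drops the unused inode/whichfork arguments of the old xfs_btree_init_cursor(), and xfs_btree_increment() replaces xfs_inobt_increment(). The resulting iteration keeps the usual cursor shape (lookup, consume record, increment until *stat goes to zero). A toy model of that loop over an array instead of a btree:

	#include <stdio.h>

	/* Toy cursor over an array, standing in for a btree cursor. */
	struct toy_cur { const int *recs; int nrecs; int pos; };

	static struct toy_cur cur_init(const int *recs, int nrecs)
	{
		return (struct toy_cur){ .recs = recs, .nrecs = nrecs, .pos = 0 };
	}

	/* Like xfs_btree_increment(): *stat = 1 while a record remains. */
	static int cur_increment(struct toy_cur *cur, int *stat)
	{
		*stat = ++cur->pos < cur->nrecs;
		return 0;			/* 0 == no error */
	}

	int main(void)
	{
		const int chunks[] = { 64, 128, 192 };
		struct toy_cur cur = cur_init(chunks, 3);
		int stat = 1;			/* as if the lookup succeeded */

		while (stat) {
			printf("chunk at agino %d\n", cur.recs[cur.pos]);
			if (cur_increment(&cur, &stat))
				break;
		}
		return 0;
	}
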
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index a1f18fce9b70..1fb04e7deb61 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -71,9 +71,23 @@ xfs_bulkstat_single(
71 71
72typedef int (*bulkstat_one_fmt_pf)( /* used size in bytes or negative error */ 72typedef int (*bulkstat_one_fmt_pf)( /* returns 0 or positive error */
73 void __user *ubuffer, /* buffer to write to */ 73 void __user *ubuffer, /* buffer to write to */
74 int ubsize, /* remaining user buffer sz */
75 int *ubused, /* bytes used by formatter */
74 const xfs_bstat_t *buffer); /* buffer to read from */ 76 const xfs_bstat_t *buffer); /* buffer to read from */
75 77
76int 78int
79xfs_bulkstat_one_int(
80 xfs_mount_t *mp,
81 xfs_ino_t ino,
82 void __user *buffer,
83 int ubsize,
84 bulkstat_one_fmt_pf formatter,
85 xfs_daddr_t bno,
86 int *ubused,
87 void *dibuff,
88 int *stat);
89
90int
77xfs_bulkstat_one( 91xfs_bulkstat_one(
78 xfs_mount_t *mp, 92 xfs_mount_t *mp,
79 xfs_ino_t ino, 93 xfs_ino_t ino,
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3608a0f0a5f6..f4726f702a9e 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -100,12 +100,11 @@ STATIC void xlog_ungrant_log_space(xlog_t *log,
100 100
101 101
102/* local ticket functions */ 102/* local ticket functions */
103STATIC xlog_ticket_t *xlog_ticket_get(xlog_t *log, 103STATIC xlog_ticket_t *xlog_ticket_alloc(xlog_t *log,
104 int unit_bytes, 104 int unit_bytes,
105 int count, 105 int count,
106 char clientid, 106 char clientid,
107 uint flags); 107 uint flags);
108STATIC void xlog_ticket_put(xlog_t *log, xlog_ticket_t *ticket);
109 108
110#if defined(DEBUG) 109#if defined(DEBUG)
111STATIC void xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr); 110STATIC void xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr);
@@ -360,7 +359,7 @@ xfs_log_done(xfs_mount_t *mp,
360 */ 359 */
361 xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)"); 360 xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)");
362 xlog_ungrant_log_space(log, ticket); 361 xlog_ungrant_log_space(log, ticket);
363 xlog_ticket_put(log, ticket); 362 xfs_log_ticket_put(ticket);
364 } else { 363 } else {
365 xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)"); 364 xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)");
366 xlog_regrant_reserve_log_space(log, ticket); 365 xlog_regrant_reserve_log_space(log, ticket);
@@ -514,7 +513,7 @@ xfs_log_reserve(xfs_mount_t *mp,
514 retval = xlog_regrant_write_log_space(log, internal_ticket); 513 retval = xlog_regrant_write_log_space(log, internal_ticket);
515 } else { 514 } else {
516 /* may sleep if need to allocate more tickets */ 515 /* may sleep if need to allocate more tickets */
517 internal_ticket = xlog_ticket_get(log, unit_bytes, cnt, 516 internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt,
518 client, flags); 517 client, flags);
519 if (!internal_ticket) 518 if (!internal_ticket)
520 return XFS_ERROR(ENOMEM); 519 return XFS_ERROR(ENOMEM);
@@ -572,12 +571,12 @@ xfs_log_mount(
572 /* 571 /*
573 * Initialize the AIL now we have a log. 572 * Initialize the AIL now we have a log.
574 */ 573 */
575 spin_lock_init(&mp->m_ail_lock);
576 error = xfs_trans_ail_init(mp); 574 error = xfs_trans_ail_init(mp);
577 if (error) { 575 if (error) {
578 cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error); 576 cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error);
579 goto error; 577 goto error;
580 } 578 }
579 mp->m_log->l_ailp = mp->m_ail;
581 580
582 /* 581 /*
583 * skip log recovery on a norecovery mount. pretend it all 582 * skip log recovery on a norecovery mount. pretend it all
@@ -730,8 +729,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
730 spin_lock(&log->l_icloglock); 729 spin_lock(&log->l_icloglock);
731 iclog = log->l_iclog; 730 iclog = log->l_iclog;
732 atomic_inc(&iclog->ic_refcnt); 731 atomic_inc(&iclog->ic_refcnt);
733 spin_unlock(&log->l_icloglock);
734 xlog_state_want_sync(log, iclog); 732 xlog_state_want_sync(log, iclog);
733 spin_unlock(&log->l_icloglock);
735 error = xlog_state_release_iclog(log, iclog); 734 error = xlog_state_release_iclog(log, iclog);
736 735
737 spin_lock(&log->l_icloglock); 736 spin_lock(&log->l_icloglock);
@@ -749,7 +748,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
749 if (tic) { 748 if (tic) {
750 xlog_trace_loggrant(log, tic, "unmount rec"); 749 xlog_trace_loggrant(log, tic, "unmount rec");
751 xlog_ungrant_log_space(log, tic); 750 xlog_ungrant_log_space(log, tic);
752 xlog_ticket_put(log, tic); 751 xfs_log_ticket_put(tic);
753 } 752 }
754 } else { 753 } else {
755 /* 754 /*
@@ -768,9 +767,9 @@ xfs_log_unmount_write(xfs_mount_t *mp)
768 spin_lock(&log->l_icloglock); 767 spin_lock(&log->l_icloglock);
769 iclog = log->l_iclog; 768 iclog = log->l_iclog;
770 atomic_inc(&iclog->ic_refcnt); 769 atomic_inc(&iclog->ic_refcnt);
771 spin_unlock(&log->l_icloglock);
772 770
773 xlog_state_want_sync(log, iclog); 771 xlog_state_want_sync(log, iclog);
772 spin_unlock(&log->l_icloglock);
774 error = xlog_state_release_iclog(log, iclog); 773 error = xlog_state_release_iclog(log, iclog);
775 774
776 spin_lock(&log->l_icloglock); 775 spin_lock(&log->l_icloglock);
@@ -906,7 +905,7 @@ xfs_log_move_tail(xfs_mount_t *mp,
906int 905int
907xfs_log_need_covered(xfs_mount_t *mp) 906xfs_log_need_covered(xfs_mount_t *mp)
908{ 907{
909 int needed = 0, gen; 908 int needed = 0;
910 xlog_t *log = mp->m_log; 909 xlog_t *log = mp->m_log;
911 910
912 if (!xfs_fs_writable(mp)) 911 if (!xfs_fs_writable(mp))
@@ -915,7 +914,7 @@ xfs_log_need_covered(xfs_mount_t *mp)
915 spin_lock(&log->l_icloglock); 914 spin_lock(&log->l_icloglock);
916 if (((log->l_covered_state == XLOG_STATE_COVER_NEED) || 915 if (((log->l_covered_state == XLOG_STATE_COVER_NEED) ||
917 (log->l_covered_state == XLOG_STATE_COVER_NEED2)) 916 (log->l_covered_state == XLOG_STATE_COVER_NEED2))
918 && !xfs_trans_first_ail(mp, &gen) 917 && !xfs_trans_ail_tail(log->l_ailp)
919 && xlog_iclogs_empty(log)) { 918 && xlog_iclogs_empty(log)) {
920 if (log->l_covered_state == XLOG_STATE_COVER_NEED) 919 if (log->l_covered_state == XLOG_STATE_COVER_NEED)
921 log->l_covered_state = XLOG_STATE_COVER_DONE; 920 log->l_covered_state = XLOG_STATE_COVER_DONE;
@@ -952,7 +951,7 @@ xlog_assign_tail_lsn(xfs_mount_t *mp)
952 xfs_lsn_t tail_lsn; 951 xfs_lsn_t tail_lsn;
953 xlog_t *log = mp->m_log; 952 xlog_t *log = mp->m_log;
954 953
955 tail_lsn = xfs_trans_tail_ail(mp); 954 tail_lsn = xfs_trans_ail_tail(mp->m_ail);
956 spin_lock(&log->l_grant_lock); 955 spin_lock(&log->l_grant_lock);
957 if (tail_lsn != 0) { 956 if (tail_lsn != 0) {
958 log->l_tail_lsn = tail_lsn; 957 log->l_tail_lsn = tail_lsn;
@@ -1030,12 +1029,6 @@ xlog_iodone(xfs_buf_t *bp)
1030 ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long) 2); 1029 ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long) 2);
1031 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); 1030 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
1032 aborted = 0; 1031 aborted = 0;
1033
1034 /*
1035 * Some versions of cpp barf on the recursive definition of
1036 * ic_log -> hic_fields.ic_log and expand ic_log twice when
1037 * it is passed through two macros. Workaround broken cpp.
1038 */
1039 l = iclog->ic_log; 1032 l = iclog->ic_log;
1040 1033
1041 /* 1034 /*
@@ -1302,7 +1295,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1302 XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb); 1295 XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
1303 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); 1296 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
1304 iclog->ic_bp = bp; 1297 iclog->ic_bp = bp;
1305 iclog->hic_data = bp->b_addr; 1298 iclog->ic_data = bp->b_addr;
1306#ifdef DEBUG 1299#ifdef DEBUG
1307 log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header); 1300 log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);
1308#endif 1301#endif
@@ -1322,7 +1315,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1322 atomic_set(&iclog->ic_refcnt, 0); 1315 atomic_set(&iclog->ic_refcnt, 0);
1323 spin_lock_init(&iclog->ic_callback_lock); 1316 spin_lock_init(&iclog->ic_callback_lock);
1324 iclog->ic_callback_tail = &(iclog->ic_callback); 1317 iclog->ic_callback_tail = &(iclog->ic_callback);
1325 iclog->ic_datap = (char *)iclog->hic_data + log->l_iclog_hsize; 1318 iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
1326 1319
1327 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); 1320 ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
1328 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); 1321 ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
@@ -1446,7 +1439,7 @@ xlog_grant_push_ail(xfs_mount_t *mp,
1446 */ 1439 */
1447 if (threshold_lsn && 1440 if (threshold_lsn &&
1448 !XLOG_FORCED_SHUTDOWN(log)) 1441 !XLOG_FORCED_SHUTDOWN(log))
1449 xfs_trans_push_ail(mp, threshold_lsn); 1442 xfs_trans_ail_push(log->l_ailp, threshold_lsn);
1450} /* xlog_grant_push_ail */ 1443} /* xlog_grant_push_ail */
1451 1444
1452 1445
@@ -1991,7 +1984,9 @@ xlog_write(xfs_mount_t * mp,
1991 if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { 1984 if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
1992 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); 1985 xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
1993 record_cnt = data_cnt = 0; 1986 record_cnt = data_cnt = 0;
1987 spin_lock(&log->l_icloglock);
1994 xlog_state_want_sync(log, iclog); 1988 xlog_state_want_sync(log, iclog);
1989 spin_unlock(&log->l_icloglock);
1995 if (commit_iclog) { 1990 if (commit_iclog) {
1996 ASSERT(flags & XLOG_COMMIT_TRANS); 1991 ASSERT(flags & XLOG_COMMIT_TRANS);
1997 *commit_iclog = iclog; 1992 *commit_iclog = iclog;
@@ -3200,7 +3195,7 @@ try_again:
3200STATIC void 3195STATIC void
3201xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) 3196xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
3202{ 3197{
3203 spin_lock(&log->l_icloglock); 3198 ASSERT(spin_is_locked(&log->l_icloglock));
3204 3199
3205 if (iclog->ic_state == XLOG_STATE_ACTIVE) { 3200 if (iclog->ic_state == XLOG_STATE_ACTIVE) {
3206 xlog_state_switch_iclogs(log, iclog, 0); 3201 xlog_state_switch_iclogs(log, iclog, 0);
@@ -3208,10 +3203,7 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
3208 ASSERT(iclog->ic_state & 3203 ASSERT(iclog->ic_state &
3209 (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR)); 3204 (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR));
3210 } 3205 }
3211 3206}
3212 spin_unlock(&log->l_icloglock);
3213} /* xlog_state_want_sync */
3214
3215 3207
3216 3208
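
[Editor's note on the hunks above] xlog_state_want_sync() no longer takes and drops l_icloglock itself; every caller now acquires the lock around the call, and the helper merely asserts ownership. That is the standard fix when a caller needs the state change to be atomic with surrounding locked work. A minimal rendering of the new discipline (the trylock-based assert is just a user-space stand-in for spin_is_locked()):

	#include <assert.h>
	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t icloglock = PTHREAD_MUTEX_INITIALIZER;
	static int ic_state;			/* 0 = ACTIVE, 1 = WANT_SYNC */

	/* After the change: caller owns the lock; we only assert and mutate. */
	static void want_sync_locked(void)
	{
		/* trylock on a held mutex returns EBUSY: lock is owned */
		assert(pthread_mutex_trylock(&icloglock) != 0);
		if (ic_state == 0)
			ic_state = 1;
	}

	int main(void)
	{
		pthread_mutex_lock(&icloglock);
		want_sync_locked();		/* atomic with other locked work */
		pthread_mutex_unlock(&icloglock);
		printf("state: %d\n", ic_state);
		return 0;
	}
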
3217/***************************************************************************** 3209/*****************************************************************************
@@ -3222,22 +3214,33 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
3222 */ 3214 */
3223 3215
3224/* 3216/*
3225 * Free a used ticket. 3217 * Free a used ticket when its refcount falls to zero.
3226 */ 3218 */
3227STATIC void 3219void
3228xlog_ticket_put(xlog_t *log, 3220xfs_log_ticket_put(
3229 xlog_ticket_t *ticket) 3221 xlog_ticket_t *ticket)
3230{ 3222{
3231 sv_destroy(&ticket->t_wait); 3223 ASSERT(atomic_read(&ticket->t_ref) > 0);
3232 kmem_zone_free(xfs_log_ticket_zone, ticket); 3224 if (atomic_dec_and_test(&ticket->t_ref)) {
3233} /* xlog_ticket_put */ 3225 sv_destroy(&ticket->t_wait);
3226 kmem_zone_free(xfs_log_ticket_zone, ticket);
3227 }
3228}
3234 3229
3230xlog_ticket_t *
3231xfs_log_ticket_get(
3232 xlog_ticket_t *ticket)
3233{
3234 ASSERT(atomic_read(&ticket->t_ref) > 0);
3235 atomic_inc(&ticket->t_ref);
3236 return ticket;
3237}
3235 3238
3236/* 3239/*
3237 * Allocate and initialise a new log ticket. 3240 * Allocate and initialise a new log ticket.
3238 */ 3241 */
3239STATIC xlog_ticket_t * 3242STATIC xlog_ticket_t *
3240xlog_ticket_get(xlog_t *log, 3243xlog_ticket_alloc(xlog_t *log,
3241 int unit_bytes, 3244 int unit_bytes,
3242 int cnt, 3245 int cnt,
3243 char client, 3246 char client,
@@ -3308,6 +3311,7 @@ xlog_ticket_get(xlog_t *log,
3308 unit_bytes += 2*BBSIZE; 3311 unit_bytes += 2*BBSIZE;
3309 } 3312 }
3310 3313
3314 atomic_set(&tic->t_ref, 1);
3311 tic->t_unit_res = unit_bytes; 3315 tic->t_unit_res = unit_bytes;
3312 tic->t_curr_res = unit_bytes; 3316 tic->t_curr_res = unit_bytes;
3313 tic->t_cnt = cnt; 3317 tic->t_cnt = cnt;
@@ -3323,7 +3327,7 @@ xlog_ticket_get(xlog_t *log,
3323 xlog_tic_reset_res(tic); 3327 xlog_tic_reset_res(tic);
3324 3328
3325 return tic; 3329 return tic;
3326} /* xlog_ticket_get */ 3330}
3327 3331
3328 3332
3329/****************************************************************************** 3333/******************************************************************************
@@ -3452,7 +3456,7 @@ xlog_verify_iclog(xlog_t *log,
3452 ptr = iclog->ic_datap; 3456 ptr = iclog->ic_datap;
3453 base_ptr = ptr; 3457 base_ptr = ptr;
3454 ophead = (xlog_op_header_t *)ptr; 3458 ophead = (xlog_op_header_t *)ptr;
3455 xhdr = (xlog_in_core_2_t *)&iclog->ic_header; 3459 xhdr = iclog->ic_data;
3456 for (i = 0; i < len; i++) { 3460 for (i = 0; i < len; i++) {
3457 ophead = (xlog_op_header_t *)ptr; 3461 ophead = (xlog_op_header_t *)ptr;
3458 3462
@@ -3558,7 +3562,8 @@ xfs_log_force_umount(
3558 if (!log || 3562 if (!log ||
3559 log->l_flags & XLOG_ACTIVE_RECOVERY) { 3563 log->l_flags & XLOG_ACTIVE_RECOVERY) {
3560 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; 3564 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
3561 XFS_BUF_DONE(mp->m_sb_bp); 3565 if (mp->m_sb_bp)
3566 XFS_BUF_DONE(mp->m_sb_bp);
3562 return 0; 3567 return 0;
3563 } 3568 }
3564 3569
@@ -3579,7 +3584,9 @@ xfs_log_force_umount(
3579 spin_lock(&log->l_icloglock); 3584 spin_lock(&log->l_icloglock);
3580 spin_lock(&log->l_grant_lock); 3585 spin_lock(&log->l_grant_lock);
3581 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; 3586 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
3582 XFS_BUF_DONE(mp->m_sb_bp); 3587 if (mp->m_sb_bp)
3588 XFS_BUF_DONE(mp->m_sb_bp);
3589
3583 /* 3590 /*
3584 * This flag is sort of redundant because of the mount flag, but 3591 * This flag is sort of redundant because of the mount flag, but
3585 * it's good to maintain the separation between the log and the rest 3592 * it's good to maintain the separation between the log and the rest
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index d47b91f10822..8a3e84e900a3 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -134,6 +134,7 @@ typedef struct xfs_log_callback {
134#ifdef __KERNEL__ 134#ifdef __KERNEL__
135/* Log manager interfaces */ 135/* Log manager interfaces */
136struct xfs_mount; 136struct xfs_mount;
137struct xlog_ticket;
137xfs_lsn_t xfs_log_done(struct xfs_mount *mp, 138xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
138 xfs_log_ticket_t ticket, 139 xfs_log_ticket_t ticket,
139 void **iclog, 140 void **iclog,
@@ -177,6 +178,9 @@ int xfs_log_need_covered(struct xfs_mount *mp);
177 178
178void xlog_iodone(struct xfs_buf *); 179void xlog_iodone(struct xfs_buf *);
179 180
181struct xlog_ticket * xfs_log_ticket_get(struct xlog_ticket *ticket);
182void xfs_log_ticket_put(struct xlog_ticket *ticket);
183
180#endif 184#endif
181 185
182 186
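
[Editor's note on the xfs_log.c/xfs_log.h diffs above] The new t_ref field and the xfs_log_ticket_get()/xfs_log_ticket_put() pair turn log tickets into refcounted objects: xlog_ticket_alloc() starts the count at one, get bumps it, and the final put destroys the ticket. A user-space model of the same lifetime rule:

	#include <assert.h>
	#include <stdatomic.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct toy_ticket { atomic_int t_ref; int unit_res; };

	static struct toy_ticket *ticket_alloc(int unit_res)
	{
		struct toy_ticket *tic = malloc(sizeof(*tic));

		if (!tic)
			abort();
		atomic_init(&tic->t_ref, 1);	/* allocation owns one reference */
		tic->unit_res = unit_res;
		return tic;
	}

	static struct toy_ticket *ticket_get(struct toy_ticket *tic)
	{
		assert(atomic_load(&tic->t_ref) > 0);
		atomic_fetch_add(&tic->t_ref, 1);
		return tic;
	}

	static void ticket_put(struct toy_ticket *tic)
	{
		assert(atomic_load(&tic->t_ref) > 0);
		if (atomic_fetch_sub(&tic->t_ref, 1) == 1)
			free(tic);		/* last reference: destroy */
	}

	int main(void)
	{
		struct toy_ticket *tic = ticket_alloc(4096);

		ticket_get(tic);	/* e.g. a permanent-reservation holder */
		ticket_put(tic);
		ticket_put(tic);	/* final put frees */
		puts("ticket released");
		return 0;
	}
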
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index e7d8f84443fa..654167be0efb 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -245,6 +245,7 @@ typedef struct xlog_ticket {
245 struct xlog_ticket *t_next; /* :4|8 */ 245 struct xlog_ticket *t_next; /* :4|8 */
246 struct xlog_ticket *t_prev; /* :4|8 */ 246 struct xlog_ticket *t_prev; /* :4|8 */
247 xlog_tid_t t_tid; /* transaction identifier : 4 */ 247 xlog_tid_t t_tid; /* transaction identifier : 4 */
248 atomic_t t_ref; /* ticket reference count : 4 */
248 int t_curr_res; /* current reservation in bytes : 4 */ 249 int t_curr_res; /* current reservation in bytes : 4 */
249 int t_unit_res; /* unit reservation in bytes : 4 */ 250 int t_unit_res; /* unit reservation in bytes : 4 */
250 char t_ocnt; /* original count : 1 */ 251 char t_ocnt; /* original count : 1 */
@@ -309,6 +310,16 @@ typedef struct xlog_rec_ext_header {
309} xlog_rec_ext_header_t; 310} xlog_rec_ext_header_t;
310 311
311#ifdef __KERNEL__ 312#ifdef __KERNEL__
313
314/*
315 * Quite misnamed, because this union lays out the actual on-disk log buffer.
316 */
317typedef union xlog_in_core2 {
318 xlog_rec_header_t hic_header;
319 xlog_rec_ext_header_t hic_xheader;
320 char hic_sector[XLOG_HEADER_SIZE];
321} xlog_in_core_2_t;
322
312/* 323/*
313 * - A log record header is 512 bytes. There is plenty of room to grow the 324 * - A log record header is 512 bytes. There is plenty of room to grow the
314 * xlog_rec_header_t into the reserved space. 325 * xlog_rec_header_t into the reserved space.
@@ -338,7 +349,7 @@ typedef struct xlog_rec_ext_header {
338 * We'll put all the read-only and l_icloglock fields in the first cacheline, 349 * We'll put all the read-only and l_icloglock fields in the first cacheline,
339 * and move everything else out to subsequent cachelines. 350 * and move everything else out to subsequent cachelines.
340 */ 351 */
341typedef struct xlog_iclog_fields { 352typedef struct xlog_in_core {
342 sv_t ic_force_wait; 353 sv_t ic_force_wait;
343 sv_t ic_write_wait; 354 sv_t ic_write_wait;
344 struct xlog_in_core *ic_next; 355 struct xlog_in_core *ic_next;
@@ -361,41 +372,11 @@ typedef struct xlog_iclog_fields {
361 372
362 /* reference counts need their own cacheline */ 373 /* reference counts need their own cacheline */
363 atomic_t ic_refcnt ____cacheline_aligned_in_smp; 374 atomic_t ic_refcnt ____cacheline_aligned_in_smp;
364} xlog_iclog_fields_t; 375 xlog_in_core_2_t *ic_data;
365 376#define ic_header ic_data->hic_header
366typedef union xlog_in_core2 {
367 xlog_rec_header_t hic_header;
368 xlog_rec_ext_header_t hic_xheader;
369 char hic_sector[XLOG_HEADER_SIZE];
370} xlog_in_core_2_t;
371
372typedef struct xlog_in_core {
373 xlog_iclog_fields_t hic_fields;
374 xlog_in_core_2_t *hic_data;
375} xlog_in_core_t; 377} xlog_in_core_t;
376 378
377/* 379/*
378 * Defines to save our code from this glop.
379 */
380#define ic_force_wait hic_fields.ic_force_wait
381#define ic_write_wait hic_fields.ic_write_wait
382#define ic_next hic_fields.ic_next
383#define ic_prev hic_fields.ic_prev
384#define ic_bp hic_fields.ic_bp
385#define ic_log hic_fields.ic_log
386#define ic_callback hic_fields.ic_callback
387#define ic_callback_lock hic_fields.ic_callback_lock
388#define ic_callback_tail hic_fields.ic_callback_tail
389#define ic_trace hic_fields.ic_trace
390#define ic_size hic_fields.ic_size
391#define ic_offset hic_fields.ic_offset
392#define ic_refcnt hic_fields.ic_refcnt
393#define ic_bwritecnt hic_fields.ic_bwritecnt
394#define ic_state hic_fields.ic_state
395#define ic_datap hic_fields.ic_datap
396#define ic_header hic_data->hic_header
397
398/*
399 * The reservation head lsn is not made up of a cycle number and block number. 380 * The reservation head lsn is not made up of a cycle number and block number.
400 * Instead, it uses a cycle number and byte number. Logs don't expect to 381 * Instead, it uses a cycle number and byte number. Logs don't expect to
401 * overflow 31 bits worth of byte offset, so using a byte number will mean 382 * overflow 31 bits worth of byte offset, so using a byte number will mean
@@ -404,6 +385,7 @@ typedef struct xlog_in_core {
404typedef struct log { 385typedef struct log {
405 /* The following fields don't need locking */ 386 /* The following fields don't need locking */
406 struct xfs_mount *l_mp; /* mount point */ 387 struct xfs_mount *l_mp; /* mount point */
388 struct xfs_ail *l_ailp; /* AIL log is working with */
407 struct xfs_buf *l_xbuf; /* extra buffer for log 389 struct xfs_buf *l_xbuf; /* extra buffer for log
408 * wrapping */ 390 * wrapping */
409 struct xfs_buftarg *l_targ; /* buftarg of log */ 391 struct xfs_buftarg *l_targ; /* buftarg of log */
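
[Editor's note on the xfs_log_priv.h diff above] The xlog_in_core rework folds the old hic_fields indirection away: the iclog struct now carries its fields directly plus an ic_data pointer to the (admittedly misnamed) xlog_in_core2 union, which is simply the on-disk layout of one log buffer sector, and the ic_header accessor macro survives so existing code keeps compiling. The union-as-wire-format idiom in isolation (sizes and field names invented for the demo):

	#include <stdint.h>
	#include <stdio.h>

	#define DEMO_SECTOR_SIZE 512

	struct demo_rec_header  { uint32_t magic; uint32_t cycle; };
	struct demo_rec_xheader { uint32_t cycle_data[8]; };

	/* One union spans the whole sector, whichever header is in use. */
	typedef union demo_in_core2 {
		struct demo_rec_header  hic_header;
		struct demo_rec_xheader hic_xheader;
		char                    hic_sector[DEMO_SECTOR_SIZE];
	} demo_in_core2_t;

	struct demo_iclog {
		demo_in_core2_t *ic_data;
	/* same trick as the kernel header: accessor macro for old users */
	#define ic_header ic_data->hic_header
	};

	int main(void)
	{
		static demo_in_core2_t buf;
		struct demo_iclog iclog = { .ic_data = &buf };

		iclog.ic_header.cycle = 7;	/* via the accessor macro */
		printf("sector bytes: %zu, cycle: %u\n",
		       sizeof(buf.hic_sector), (unsigned)iclog.ic_header.cycle);
		return 0;
	}
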
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 70e3ba32e6be..35cca98bd94c 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -36,7 +36,6 @@
36#include "xfs_dinode.h" 36#include "xfs_dinode.h"
37#include "xfs_inode.h" 37#include "xfs_inode.h"
38#include "xfs_inode_item.h" 38#include "xfs_inode_item.h"
39#include "xfs_imap.h"
40#include "xfs_alloc.h" 39#include "xfs_alloc.h"
41#include "xfs_ialloc.h" 40#include "xfs_ialloc.h"
42#include "xfs_log_priv.h" 41#include "xfs_log_priv.h"
@@ -54,10 +53,8 @@ STATIC void xlog_recover_insert_item_backq(xlog_recover_item_t **q,
54 xlog_recover_item_t *item); 53 xlog_recover_item_t *item);
55#if defined(DEBUG) 54#if defined(DEBUG)
56STATIC void xlog_recover_check_summary(xlog_t *); 55STATIC void xlog_recover_check_summary(xlog_t *);
57STATIC void xlog_recover_check_ail(xfs_mount_t *, xfs_log_item_t *, int);
58#else 56#else
59#define xlog_recover_check_summary(log) 57#define xlog_recover_check_summary(log)
60#define xlog_recover_check_ail(mp, lip, gen)
61#endif 58#endif
62 59
63 60
@@ -270,21 +267,16 @@ STATIC void
270xlog_recover_iodone( 267xlog_recover_iodone(
271 struct xfs_buf *bp) 268 struct xfs_buf *bp)
272{ 269{
273 xfs_mount_t *mp;
274
275 ASSERT(XFS_BUF_FSPRIVATE(bp, void *));
276
277 if (XFS_BUF_GETERROR(bp)) { 270 if (XFS_BUF_GETERROR(bp)) {
278 /* 271 /*
279 * We're not going to bother about retrying 272 * We're not going to bother about retrying
280 * this during recovery. One strike! 273 * this during recovery. One strike!
281 */ 274 */
282 mp = XFS_BUF_FSPRIVATE(bp, xfs_mount_t *);
283 xfs_ioerror_alert("xlog_recover_iodone", 275 xfs_ioerror_alert("xlog_recover_iodone",
284 mp, bp, XFS_BUF_ADDR(bp)); 276 bp->b_mount, bp, XFS_BUF_ADDR(bp));
285 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 277 xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
286 } 278 }
287 XFS_BUF_SET_FSPRIVATE(bp, NULL); 279 bp->b_mount = NULL;
288 XFS_BUF_CLR_IODONE_FUNC(bp); 280 XFS_BUF_CLR_IODONE_FUNC(bp);
289 xfs_biodone(bp); 281 xfs_biodone(bp);
290} 282}
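
[Editor's note on the hunk above] xlog_recover_iodone() now reads the mount from the buffer's b_mount field instead of the opaque fsprivate slot, and the recovery policy is deliberately blunt: any I/O error during replay alerts and shuts the filesystem down, one strike. A sketch of that shape (stub types; the real buffer and shutdown calls are XFS internals):

	#include <stdio.h>

	struct toy_mount { const char *name; };
	struct toy_buf { int b_error; struct toy_mount *b_mount; };

	static void force_shutdown(struct toy_mount *mp)
	{
		fprintf(stderr, "shutting down %s after replay I/O error\n",
			mp->name);
	}

	/* Model of xlog_recover_iodone(): no retries during recovery. */
	static void recover_iodone(struct toy_buf *bp)
	{
		if (bp->b_error)
			force_shutdown(bp->b_mount);	/* one strike */
		bp->b_mount = NULL;			/* drop the back-pointer */
	}

	int main(void)
	{
		struct toy_mount mp = { "demo-fs" };
		struct toy_buf bp = { .b_error = 5 /* EIO */, .b_mount = &mp };

		recover_iodone(&bp);
		return 0;
	}
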
@@ -2228,9 +2220,8 @@ xlog_recover_do_buffer_trans(
2228 XFS_BUF_STALE(bp); 2220 XFS_BUF_STALE(bp);
2229 error = xfs_bwrite(mp, bp); 2221 error = xfs_bwrite(mp, bp);
2230 } else { 2222 } else {
2231 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || 2223 ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2232 XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); 2224 bp->b_mount = mp;
2233 XFS_BUF_SET_FSPRIVATE(bp, mp);
2234 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2225 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2235 xfs_bdwrite(mp, bp); 2226 xfs_bdwrite(mp, bp);
2236 } 2227 }
@@ -2247,7 +2238,6 @@ xlog_recover_do_inode_trans(
2247 xfs_inode_log_format_t *in_f; 2238 xfs_inode_log_format_t *in_f;
2248 xfs_mount_t *mp; 2239 xfs_mount_t *mp;
2249 xfs_buf_t *bp; 2240 xfs_buf_t *bp;
2250 xfs_imap_t imap;
2251 xfs_dinode_t *dip; 2241 xfs_dinode_t *dip;
2252 xfs_ino_t ino; 2242 xfs_ino_t ino;
2253 int len; 2243 int len;
@@ -2275,54 +2265,35 @@ xlog_recover_do_inode_trans(
2275 } 2265 }
2276 ino = in_f->ilf_ino; 2266 ino = in_f->ilf_ino;
2277 mp = log->l_mp; 2267 mp = log->l_mp;
2278 if (ITEM_TYPE(item) == XFS_LI_INODE) {
2279 imap.im_blkno = (xfs_daddr_t)in_f->ilf_blkno;
2280 imap.im_len = in_f->ilf_len;
2281 imap.im_boffset = in_f->ilf_boffset;
2282 } else {
2283 /*
2284 * It's an old inode format record. We don't know where
2285 * its cluster is located on disk, and we can't allow
2286 * xfs_imap() to figure it out because the inode btrees
2287 * are not ready to be used. Therefore do not pass the
2288 * XFS_IMAP_LOOKUP flag to xfs_imap(). This will give
2289 * us only the single block in which the inode lives
2290 * rather than its cluster, so we must make sure to
2291 * invalidate the buffer when we write it out below.
2292 */
2293 imap.im_blkno = 0;
2294 error = xfs_imap(log->l_mp, NULL, ino, &imap, 0);
2295 if (error)
2296 goto error;
2297 }
2298 2268
2299 /* 2269 /*
2300 * Inode buffers can be freed, look out for it, 2270 * Inode buffers can be freed, look out for it,
2301 * and do not replay the inode. 2271 * and do not replay the inode.
2302 */ 2272 */
2303 if (xlog_check_buffer_cancelled(log, imap.im_blkno, imap.im_len, 0)) { 2273 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
2274 in_f->ilf_len, 0)) {
2304 error = 0; 2275 error = 0;
2305 goto error; 2276 goto error;
2306 } 2277 }
2307 2278
2308 bp = xfs_buf_read_flags(mp->m_ddev_targp, imap.im_blkno, imap.im_len, 2279 bp = xfs_buf_read_flags(mp->m_ddev_targp, in_f->ilf_blkno,
2309 XFS_BUF_LOCK); 2280 in_f->ilf_len, XFS_BUF_LOCK);
2310 if (XFS_BUF_ISERROR(bp)) { 2281 if (XFS_BUF_ISERROR(bp)) {
2311 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, 2282 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
2312 bp, imap.im_blkno); 2283 bp, in_f->ilf_blkno);
2313 error = XFS_BUF_GETERROR(bp); 2284 error = XFS_BUF_GETERROR(bp);
2314 xfs_buf_relse(bp); 2285 xfs_buf_relse(bp);
2315 goto error; 2286 goto error;
2316 } 2287 }
2317 error = 0; 2288 error = 0;
2318 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); 2289 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
2319 dip = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); 2290 dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
2320 2291
2321 /* 2292 /*
2322 * Make sure the place we're flushing out to really looks 2293 * Make sure the place we're flushing out to really looks
2323 * like an inode! 2294 * like an inode!
2324 */ 2295 */
2325 if (unlikely(be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC)) { 2296 if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) {
2326 xfs_buf_relse(bp); 2297 xfs_buf_relse(bp);
2327 xfs_fs_cmn_err(CE_ALERT, mp, 2298 xfs_fs_cmn_err(CE_ALERT, mp,
2328 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", 2299 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
@@ -2345,12 +2316,12 @@ xlog_recover_do_inode_trans(
2345 } 2316 }
2346 2317
2347 /* Skip replay when the on disk inode is newer than the log one */ 2318 /* Skip replay when the on disk inode is newer than the log one */
2348 if (dicp->di_flushiter < be16_to_cpu(dip->di_core.di_flushiter)) { 2319 if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
2349 /* 2320 /*
2350 * Deal with the wrap case: a flushiter of DI_MAX_FLUSH 2321 * Deal with the wrap case: a flushiter of DI_MAX_FLUSH
2351 * compares less than smaller, post-wrap values 2322 * compares less than smaller, post-wrap values
2352 */ 2323 */
2353 if (be16_to_cpu(dip->di_core.di_flushiter) == DI_MAX_FLUSH && 2324 if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
2354 dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) { 2325 dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
2355 /* do nothing */ 2326 /* do nothing */
2356 } else { 2327 } else {
@@ -2410,7 +2381,7 @@ xlog_recover_do_inode_trans(
2410 error = EFSCORRUPTED; 2381 error = EFSCORRUPTED;
2411 goto error; 2382 goto error;
2412 } 2383 }
2413 if (unlikely(item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t))) { 2384 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
2414 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)", 2385 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)",
2415 XFS_ERRLEVEL_LOW, mp, dicp); 2386 XFS_ERRLEVEL_LOW, mp, dicp);
2416 xfs_buf_relse(bp); 2387 xfs_buf_relse(bp);
@@ -2422,23 +2393,24 @@ xlog_recover_do_inode_trans(
2422 } 2393 }
2423 2394
2424 /* The core is in in-core format */ 2395 /* The core is in in-core format */
2425 xfs_dinode_to_disk(&dip->di_core, 2396 xfs_dinode_to_disk(dip, (xfs_icdinode_t *)item->ri_buf[1].i_addr);
2426 (xfs_icdinode_t *)item->ri_buf[1].i_addr);
2427 2397
2428 /* the rest is in on-disk format */ 2398 /* the rest is in on-disk format */
2429 if (item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t)) { 2399 if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) {
2430 memcpy((xfs_caddr_t) dip + sizeof(xfs_dinode_core_t), 2400 memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode),
2431 item->ri_buf[1].i_addr + sizeof(xfs_dinode_core_t), 2401 item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode),
2432 item->ri_buf[1].i_len - sizeof(xfs_dinode_core_t)); 2402 item->ri_buf[1].i_len - sizeof(struct xfs_icdinode));
2433 } 2403 }
2434 2404
2435 fields = in_f->ilf_fields; 2405 fields = in_f->ilf_fields;
2436 switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) { 2406 switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
2437 case XFS_ILOG_DEV: 2407 case XFS_ILOG_DEV:
2438 dip->di_u.di_dev = cpu_to_be32(in_f->ilf_u.ilfu_rdev); 2408 xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
2439 break; 2409 break;
2440 case XFS_ILOG_UUID: 2410 case XFS_ILOG_UUID:
2441 dip->di_u.di_muuid = in_f->ilf_u.ilfu_uuid; 2411 memcpy(XFS_DFORK_DPTR(dip),
2412 &in_f->ilf_u.ilfu_uuid,
2413 sizeof(uuid_t));
2442 break; 2414 break;
2443 } 2415 }
2444 2416
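
The di_u union accesses above give way to XFS_DFORK_DPTR() because, once the inode core is flattened into struct xfs_dinode, fork data simply starts right behind the fixed core. A hedged sketch of that layout arithmetic (the toy struct below is illustrative, not the real on-disk layout):

	#include <stddef.h>

	/* Toy on-disk inode: a fixed core followed by variable fork data. */
	struct toy_dinode {
		unsigned short	di_magic;
		unsigned char	di_version;
		unsigned char	di_pad[93];	/* rest of the fixed core */
	};

	/* The data fork begins immediately after the fixed core, which is
	 * essentially what XFS_DFORK_DPTR() computes in the patched code. */
	static void *dfork_ptr(struct toy_dinode *dip)
	{
		return (char *)dip + sizeof(*dip);
	}
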
@@ -2454,12 +2426,12 @@ xlog_recover_do_inode_trans(
2454 switch (fields & XFS_ILOG_DFORK) { 2426 switch (fields & XFS_ILOG_DFORK) {
2455 case XFS_ILOG_DDATA: 2427 case XFS_ILOG_DDATA:
2456 case XFS_ILOG_DEXT: 2428 case XFS_ILOG_DEXT:
2457 memcpy(&dip->di_u, src, len); 2429 memcpy(XFS_DFORK_DPTR(dip), src, len);
2458 break; 2430 break;
2459 2431
2460 case XFS_ILOG_DBROOT: 2432 case XFS_ILOG_DBROOT:
2461 xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len, 2433 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
2462 &(dip->di_u.di_bmbt), 2434 (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
2463 XFS_DFORK_DSIZE(dip, mp)); 2435 XFS_DFORK_DSIZE(dip, mp));
2464 break; 2436 break;
2465 2437
@@ -2496,8 +2468,8 @@ xlog_recover_do_inode_trans(
2496 2468
2497 case XFS_ILOG_ABROOT: 2469 case XFS_ILOG_ABROOT:
2498 dest = XFS_DFORK_APTR(dip); 2470 dest = XFS_DFORK_APTR(dip);
2499 xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len, 2471 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
2500 (xfs_bmdr_block_t*)dest, 2472 len, (xfs_bmdr_block_t*)dest,
2501 XFS_DFORK_ASIZE(dip, mp)); 2473 XFS_DFORK_ASIZE(dip, mp));
2502 break; 2474 break;
2503 2475
@@ -2512,9 +2484,8 @@ xlog_recover_do_inode_trans(
2512 2484
2513write_inode_buffer: 2485write_inode_buffer:
2514 if (ITEM_TYPE(item) == XFS_LI_INODE) { 2486 if (ITEM_TYPE(item) == XFS_LI_INODE) {
2515 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || 2487 ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2516 XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); 2488 bp->b_mount = mp;
2517 XFS_BUF_SET_FSPRIVATE(bp, mp);
2518 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2489 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2519 xfs_bdwrite(mp, bp); 2490 xfs_bdwrite(mp, bp);
2520 } else { 2491 } else {
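
The XFS_BUF_FSPRIVATE() conversion above is the mechanical half of a type cleanup: an untyped private slot read back through casting macros becomes a plain typed back pointer. Reduced to its essentials (hypothetical, simplified structures):

	struct mount;			/* opaque for the sketch */

	/* Before: an anonymous slot, recovered via a casting macro. */
	struct buf_old {
		void	*b_fspriv;
	};
	#define BUF_FSPRIVATE(bp, type)	((type)(bp)->b_fspriv)

	/* After: a typed field; no casts, and misuse fails to compile. */
	struct buf_new {
		struct mount	*b_mount;
	};
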
@@ -2645,9 +2616,8 @@ xlog_recover_do_dquot_trans(
2645 memcpy(ddq, recddq, item->ri_buf[1].i_len); 2616 memcpy(ddq, recddq, item->ri_buf[1].i_len);
2646 2617
2647 ASSERT(dq_f->qlf_size == 2); 2618 ASSERT(dq_f->qlf_size == 2);
2648 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || 2619 ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2649 XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); 2620 bp->b_mount = mp;
2650 XFS_BUF_SET_FSPRIVATE(bp, mp);
2651 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2621 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2652 xfs_bdwrite(mp, bp); 2622 xfs_bdwrite(mp, bp);
2653 2623
@@ -2689,11 +2659,11 @@ xlog_recover_do_efi_trans(
2689 efip->efi_next_extent = efi_formatp->efi_nextents; 2659 efip->efi_next_extent = efi_formatp->efi_nextents;
2690 efip->efi_flags |= XFS_EFI_COMMITTED; 2660 efip->efi_flags |= XFS_EFI_COMMITTED;
2691 2661
2692 spin_lock(&mp->m_ail_lock); 2662 spin_lock(&log->l_ailp->xa_lock);
2693 /* 2663 /*
2694 * xfs_trans_update_ail() drops the AIL lock. 2664 * xfs_trans_ail_update() drops the AIL lock.
2695 */ 2665 */
2696 xfs_trans_update_ail(mp, (xfs_log_item_t *)efip, lsn); 2666 xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn);
2697 return 0; 2667 return 0;
2698} 2668}
2699 2669
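
xfs_trans_ail_update() files the recovered EFI into the AIL at its commit LSN. A userspace toy of that ordered insert, assuming nothing more than a singly linked list kept sorted by LSN:

	#include <stddef.h>

	/* Toy AIL: items sorted by ascending LSN, so the list head is the
	 * oldest item and defines the tail of the log. */
	struct toy_item {
		unsigned long long	lsn;
		struct toy_item		*next;
	};

	static void toy_ail_insert(struct toy_item **head, struct toy_item *item)
	{
		while (*head && (*head)->lsn <= item->lsn)
			head = &(*head)->next;
		item->next = *head;
		*head = item;
	}
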
@@ -2712,12 +2682,12 @@ xlog_recover_do_efd_trans(
2712 xlog_recover_item_t *item, 2682 xlog_recover_item_t *item,
2713 int pass) 2683 int pass)
2714{ 2684{
2715 xfs_mount_t *mp;
2716 xfs_efd_log_format_t *efd_formatp; 2685 xfs_efd_log_format_t *efd_formatp;
2717 xfs_efi_log_item_t *efip = NULL; 2686 xfs_efi_log_item_t *efip = NULL;
2718 xfs_log_item_t *lip; 2687 xfs_log_item_t *lip;
2719 int gen;
2720 __uint64_t efi_id; 2688 __uint64_t efi_id;
2689 struct xfs_ail_cursor cur;
2690 struct xfs_ail *ailp = log->l_ailp;
2721 2691
2722 if (pass == XLOG_RECOVER_PASS1) { 2692 if (pass == XLOG_RECOVER_PASS1) {
2723 return; 2693 return;
@@ -2734,25 +2704,26 @@ xlog_recover_do_efd_trans(
2734 * Search for the efi with the id in the efd format structure 2704 * Search for the efi with the id in the efd format structure
2735 * in the AIL. 2705 * in the AIL.
2736 */ 2706 */
2737 mp = log->l_mp; 2707 spin_lock(&ailp->xa_lock);
2738 spin_lock(&mp->m_ail_lock); 2708 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2739 lip = xfs_trans_first_ail(mp, &gen);
2740 while (lip != NULL) { 2709 while (lip != NULL) {
2741 if (lip->li_type == XFS_LI_EFI) { 2710 if (lip->li_type == XFS_LI_EFI) {
2742 efip = (xfs_efi_log_item_t *)lip; 2711 efip = (xfs_efi_log_item_t *)lip;
2743 if (efip->efi_format.efi_id == efi_id) { 2712 if (efip->efi_format.efi_id == efi_id) {
2744 /* 2713 /*
2745 * xfs_trans_delete_ail() drops the 2714 * xfs_trans_ail_delete() drops the
2746 * AIL lock. 2715 * AIL lock.
2747 */ 2716 */
2748 xfs_trans_delete_ail(mp, lip); 2717 xfs_trans_ail_delete(ailp, lip);
2749 xfs_efi_item_free(efip); 2718 xfs_efi_item_free(efip);
2750 return; 2719 spin_lock(&ailp->xa_lock);
2720 break;
2751 } 2721 }
2752 } 2722 }
2753 lip = xfs_trans_next_ail(mp, lip, &gen, NULL); 2723 lip = xfs_trans_ail_cursor_next(ailp, &cur);
2754 } 2724 }
2755 spin_unlock(&mp->m_ail_lock); 2725 xfs_trans_ail_cursor_done(ailp, &cur);
2726 spin_unlock(&ailp->xa_lock);
2756} 2727}
2757 2728
2758/* 2729/*
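
The EFD search above is the first user of the cursor walk that replaces the old (mp, gen) generation-counted traversal. The calling pattern, assembled from the calls in this patch (an outline of usage, not the cursor implementation):

	spin_lock(&ailp->xa_lock);
	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
	while (lip != NULL) {
		/* inspect lip; a delete drops and re-takes xa_lock */
		lip = xfs_trans_ail_cursor_next(ailp, &cur);
	}
	xfs_trans_ail_cursor_done(ailp, &cur);
	spin_unlock(&ailp->xa_lock);
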
@@ -3036,33 +3007,6 @@ abort_error:
3036} 3007}
3037 3008
3038/* 3009/*
3039 * Verify that once we've encountered something other than an EFI
3040 * in the AIL that there are no more EFIs in the AIL.
3041 */
3042#if defined(DEBUG)
3043STATIC void
3044xlog_recover_check_ail(
3045 xfs_mount_t *mp,
3046 xfs_log_item_t *lip,
3047 int gen)
3048{
3049 int orig_gen = gen;
3050
3051 do {
3052 ASSERT(lip->li_type != XFS_LI_EFI);
3053 lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
3054 /*
3055 * The check will be bogus if we restart from the
3056 * beginning of the AIL, so ASSERT that we don't.
3057 * We never should since we're holding the AIL lock
3058 * the entire time.
3059 */
3060 ASSERT(gen == orig_gen);
3061 } while (lip != NULL);
3062}
3063#endif /* DEBUG */
3064
3065/*
3066 * When this is called, all of the EFIs which did not have 3010 * When this is called, all of the EFIs which did not have
3067 * corresponding EFDs should be in the AIL. What we do now 3011 * corresponding EFDs should be in the AIL. What we do now
3068 * is free the extents associated with each one. 3012 * is free the extents associated with each one.
@@ -3086,20 +3030,23 @@ xlog_recover_process_efis(
3086{ 3030{
3087 xfs_log_item_t *lip; 3031 xfs_log_item_t *lip;
3088 xfs_efi_log_item_t *efip; 3032 xfs_efi_log_item_t *efip;
3089 int gen;
3090 xfs_mount_t *mp;
3091 int error = 0; 3033 int error = 0;
3034 struct xfs_ail_cursor cur;
3035 struct xfs_ail *ailp;
3092 3036
3093 mp = log->l_mp; 3037 ailp = log->l_ailp;
3094 spin_lock(&mp->m_ail_lock); 3038 spin_lock(&ailp->xa_lock);
3095 3039 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3096 lip = xfs_trans_first_ail(mp, &gen);
3097 while (lip != NULL) { 3040 while (lip != NULL) {
3098 /* 3041 /*
3099 * We're done when we see something other than an EFI. 3042 * We're done when we see something other than an EFI.
3043 * There should be no EFIs left in the AIL now.
3100 */ 3044 */
3101 if (lip->li_type != XFS_LI_EFI) { 3045 if (lip->li_type != XFS_LI_EFI) {
3102 xlog_recover_check_ail(mp, lip, gen); 3046#ifdef DEBUG
3047 for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
3048 ASSERT(lip->li_type != XFS_LI_EFI);
3049#endif
3103 break; 3050 break;
3104 } 3051 }
3105 3052
@@ -3108,18 +3055,20 @@ xlog_recover_process_efis(
3108 */ 3055 */
3109 efip = (xfs_efi_log_item_t *)lip; 3056 efip = (xfs_efi_log_item_t *)lip;
3110 if (efip->efi_flags & XFS_EFI_RECOVERED) { 3057 if (efip->efi_flags & XFS_EFI_RECOVERED) {
3111 lip = xfs_trans_next_ail(mp, lip, &gen, NULL); 3058 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3112 continue; 3059 continue;
3113 } 3060 }
3114 3061
3115 spin_unlock(&mp->m_ail_lock); 3062 spin_unlock(&ailp->xa_lock);
3116 error = xlog_recover_process_efi(mp, efip); 3063 error = xlog_recover_process_efi(log->l_mp, efip);
3064 spin_lock(&ailp->xa_lock);
3117 if (error) 3065 if (error)
3118 return error; 3066 goto out;
3119 spin_lock(&mp->m_ail_lock); 3067 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3120 lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
3121 } 3068 }
3122 spin_unlock(&mp->m_ail_lock); 3069out:
3070 xfs_trans_ail_cursor_done(ailp, &cur);
3071 spin_unlock(&ailp->xa_lock);
3123 return error; 3072 return error;
3124} 3073}
3125 3074
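
The cursor is also what lets this loop drop xa_lock around the transaction-running xlog_recover_process_efi() call and resume the walk afterwards. The generic shape of that idiom, sketched with hypothetical names:

	lock(l);
	it = cursor_first(l, &cur);
	while (it != NULL) {
		unlock(l);		/* cannot sleep under a spinlock */
		err = process(it);	/* may sleep */
		lock(l);
		if (err)
			break;		/* cursor preserves the position */
		it = cursor_next(l, &cur);
	}
	cursor_done(l, &cur);
	unlock(l);
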
@@ -3140,19 +3089,16 @@ xlog_recover_clear_agi_bucket(
3140 int error; 3089 int error;
3141 3090
3142 tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); 3091 tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
3143 error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0); 3092 error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp),
3144 if (!error) 3093 0, 0, 0);
3145 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
3146 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
3147 XFS_FSS_TO_BB(mp, 1), 0, &agibp);
3148 if (error) 3094 if (error)
3149 goto out_abort; 3095 goto out_abort;
3150 3096
3151 error = EINVAL; 3097 error = xfs_read_agi(mp, tp, agno, &agibp);
3152 agi = XFS_BUF_TO_AGI(agibp); 3098 if (error)
3153 if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC)
3154 goto out_abort; 3099 goto out_abort;
3155 3100
3101 agi = XFS_BUF_TO_AGI(agibp);
3156 agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); 3102 agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
3157 offset = offsetof(xfs_agi_t, agi_unlinked) + 3103 offset = offsetof(xfs_agi_t, agi_unlinked) +
3158 (sizeof(xfs_agino_t) * bucket); 3104 (sizeof(xfs_agino_t) * bucket);
@@ -3172,6 +3118,62 @@ out_error:
3172 return; 3118 return;
3173} 3119}
3174 3120
3121STATIC xfs_agino_t
3122xlog_recover_process_one_iunlink(
3123 struct xfs_mount *mp,
3124 xfs_agnumber_t agno,
3125 xfs_agino_t agino,
3126 int bucket)
3127{
3128 struct xfs_buf *ibp;
3129 struct xfs_dinode *dip;
3130 struct xfs_inode *ip;
3131 xfs_ino_t ino;
3132 int error;
3133
3134 ino = XFS_AGINO_TO_INO(mp, agno, agino);
3135 error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0);
3136 if (error)
3137 goto fail;
3138
3139 /*
3140 * Get the on disk inode to find the next inode in the bucket.
3141 */
3142 error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XFS_BUF_LOCK);
3143 if (error)
3144 goto fail_iput;
3145
3146 ASSERT(ip->i_d.di_nlink == 0);
3147 ASSERT(ip->i_d.di_mode != 0);
3148
3149 /* setup for the next pass */
3150 agino = be32_to_cpu(dip->di_next_unlinked);
3151 xfs_buf_relse(ibp);
3152
3153 /*
3154 * Prevent any DMAPI event from being sent when the reference on
3155 * the inode is dropped.
3156 */
3157 ip->i_d.di_dmevmask = 0;
3158
3159 IRELE(ip);
3160 return agino;
3161
3162 fail_iput:
3163 IRELE(ip);
3164 fail:
3165 /*
3166 * We can't read in the inode this bucket points to, or this inode
3167 * is messed up. Just ditch this bucket of inodes. We will lose
3168 * some inodes and space, but at least we won't hang.
3169 *
3170 * Call xlog_recover_clear_agi_bucket() to perform a transaction to
3171 * clear the inode pointer in the bucket.
3172 */
3173 xlog_recover_clear_agi_bucket(mp, agno, bucket);
3174 return NULLAGINO;
3175}
3176
3175/* 3177/*
3176 * xlog_iunlink_recover 3178 * xlog_iunlink_recover
3177 * 3179 *
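
The helper factored out above walks one AGI hash bucket: each on-disk inode's di_next_unlinked field names the next unlinked inode, and NULLAGINO terminates the chain. A toy model of draining one bucket, with an array standing in for on-disk inodes:

	#include <stdint.h>

	#define TOY_NULLAGINO	((uint32_t)-1)	/* stands in for NULLAGINO */

	/* next_unlinked[i] plays the role of inode i's di_next_unlinked. */
	static uint32_t next_unlinked[64];

	static void drain_bucket(uint32_t head)
	{
		uint32_t agino = head;

		while (agino != TOY_NULLAGINO) {
			/* "process" inode agino, then step to its successor,
			 * as xlog_recover_process_one_iunlink() does */
			agino = next_unlinked[agino];
		}
	}
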
@@ -3192,11 +3194,7 @@ xlog_recover_process_iunlinks(
3192 xfs_agnumber_t agno; 3194 xfs_agnumber_t agno;
3193 xfs_agi_t *agi; 3195 xfs_agi_t *agi;
3194 xfs_buf_t *agibp; 3196 xfs_buf_t *agibp;
3195 xfs_buf_t *ibp;
3196 xfs_dinode_t *dip;
3197 xfs_inode_t *ip;
3198 xfs_agino_t agino; 3197 xfs_agino_t agino;
3199 xfs_ino_t ino;
3200 int bucket; 3198 int bucket;
3201 int error; 3199 int error;
3202 uint mp_dmevmask; 3200 uint mp_dmevmask;
@@ -3213,22 +3211,21 @@ xlog_recover_process_iunlinks(
3213 /* 3211 /*
3214 * Find the agi for this ag. 3212 * Find the agi for this ag.
3215 */ 3213 */
3216 agibp = xfs_buf_read(mp->m_ddev_targp, 3214 error = xfs_read_agi(mp, NULL, agno, &agibp);
3217 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), 3215 if (error) {
3218 XFS_FSS_TO_BB(mp, 1), 0); 3216 /*
3219 if (XFS_BUF_ISERROR(agibp)) { 3217 * AGI is corrupt. Don't process it.
3220 xfs_ioerror_alert("xlog_recover_process_iunlinks(#1)", 3218 *
3221 log->l_mp, agibp, 3219 * We should probably mark the filesystem as corrupt
3222 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp))); 3220 * after we've recovered all the AGs we can.
3221 */
3222 continue;
3223 } 3223 }
3224 agi = XFS_BUF_TO_AGI(agibp); 3224 agi = XFS_BUF_TO_AGI(agibp);
3225 ASSERT(XFS_AGI_MAGIC == be32_to_cpu(agi->agi_magicnum));
3226 3225
3227 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { 3226 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
3228
3229 agino = be32_to_cpu(agi->agi_unlinked[bucket]); 3227 agino = be32_to_cpu(agi->agi_unlinked[bucket]);
3230 while (agino != NULLAGINO) { 3228 while (agino != NULLAGINO) {
3231
3232 /* 3229 /*
3233 * Release the agi buffer so that it can 3230 * Release the agi buffer so that it can
3234 * be acquired in the normal course of the 3231 * be acquired in the normal course of the
@@ -3236,87 +3233,17 @@ xlog_recover_process_iunlinks(
3236 */ 3233 */
3237 xfs_buf_relse(agibp); 3234 xfs_buf_relse(agibp);
3238 3235
3239 ino = XFS_AGINO_TO_INO(mp, agno, agino); 3236 agino = xlog_recover_process_one_iunlink(mp,
3240 error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0); 3237 agno, agino, bucket);
3241 ASSERT(error || (ip != NULL));
3242
3243 if (!error) {
3244 /*
3245 * Get the on disk inode to find the
3246 * next inode in the bucket.
3247 */
3248 error = xfs_itobp(mp, NULL, ip, &dip,
3249 &ibp, 0, 0,
3250 XFS_BUF_LOCK);
3251 ASSERT(error || (dip != NULL));
3252 }
3253
3254 if (!error) {
3255 ASSERT(ip->i_d.di_nlink == 0);
3256
3257 /* setup for the next pass */
3258 agino = be32_to_cpu(
3259 dip->di_next_unlinked);
3260 xfs_buf_relse(ibp);
3261 /*
3262 * Prevent any DMAPI event from
3263 * being sent when the
3264 * reference on the inode is
3265 * dropped.
3266 */
3267 ip->i_d.di_dmevmask = 0;
3268
3269 /*
3270 * If this is a new inode, handle
3271 * it specially. Otherwise,
3272 * just drop our reference to the
3273 * inode. If there are no
3274 * other references, this will
3275 * send the inode to
3276 * xfs_inactive() which will
3277 * truncate the file and free
3278 * the inode.
3279 */
3280 if (ip->i_d.di_mode == 0)
3281 xfs_iput_new(ip, 0);
3282 else
3283 IRELE(ip);
3284 } else {
3285 /*
3286 * We can't read in the inode
3287 * this bucket points to, or
3288 * this inode is messed up. Just
3289 * ditch this bucket of inodes. We
3290 * will lose some inodes and space,
3291 * but at least we won't hang. Call
3292 * xlog_recover_clear_agi_bucket()
3293 * to perform a transaction to clear
3294 * the inode pointer in the bucket.
3295 */
3296 xlog_recover_clear_agi_bucket(mp, agno,
3297 bucket);
3298
3299 agino = NULLAGINO;
3300 }
3301 3238
3302 /* 3239 /*
3303 * Reacquire the AGI buffer and continue around 3240 * Reacquire the AGI buffer and continue around
3304 * the loop. 3241 * the loop. This should never fail as we know
3242 * the buffer was good earlier on.
3305 */ 3243 */
3306 agibp = xfs_buf_read(mp->m_ddev_targp, 3244 error = xfs_read_agi(mp, NULL, agno, &agibp);
3307 XFS_AG_DADDR(mp, agno, 3245 ASSERT(error == 0);
3308 XFS_AGI_DADDR(mp)),
3309 XFS_FSS_TO_BB(mp, 1), 0);
3310 if (XFS_BUF_ISERROR(agibp)) {
3311 xfs_ioerror_alert(
3312 "xlog_recover_process_iunlinks(#2)",
3313 log->l_mp, agibp,
3314 XFS_AG_DADDR(mp, agno,
3315 XFS_AGI_DADDR(mp)));
3316 }
3317 agi = XFS_BUF_TO_AGI(agibp); 3246 agi = XFS_BUF_TO_AGI(agibp);
3318 ASSERT(XFS_AGI_MAGIC == be32_to_cpu(
3319 agi->agi_magicnum));
3320 } 3247 }
3321 } 3248 }
3322 3249
@@ -3367,7 +3294,6 @@ xlog_pack_data(
3367 int size = iclog->ic_offset + roundoff; 3294 int size = iclog->ic_offset + roundoff;
3368 __be32 cycle_lsn; 3295 __be32 cycle_lsn;
3369 xfs_caddr_t dp; 3296 xfs_caddr_t dp;
3370 xlog_in_core_2_t *xhdr;
3371 3297
3372 xlog_pack_data_checksum(log, iclog, size); 3298 xlog_pack_data_checksum(log, iclog, size);
3373 3299
@@ -3382,7 +3308,8 @@ xlog_pack_data(
3382 } 3308 }
3383 3309
3384 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 3310 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3385 xhdr = (xlog_in_core_2_t *)&iclog->ic_header; 3311 xlog_in_core_2_t *xhdr = iclog->ic_data;
3312
3386 for ( ; i < BTOBB(size); i++) { 3313 for ( ; i < BTOBB(size); i++) {
3387 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3314 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3388 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3315 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -3440,7 +3367,6 @@ xlog_unpack_data(
3440 xlog_t *log) 3367 xlog_t *log)
3441{ 3368{
3442 int i, j, k; 3369 int i, j, k;
3443 xlog_in_core_2_t *xhdr;
3444 3370
3445 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && 3371 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
3446 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { 3372 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
@@ -3449,7 +3375,7 @@ xlog_unpack_data(
3449 } 3375 }
3450 3376
3451 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 3377 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3452 xhdr = (xlog_in_core_2_t *)rhead; 3378 xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
3453 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { 3379 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
3454 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3380 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3455 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3381 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
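
Both xlog_pack_data() and xlog_unpack_data() map a basic-block index onto an extended-header slot with the same j/k arithmetic. With the real constants (BBSIZE is 512 and XLOG_HEADER_CYCLE_SIZE is 32768), each extended header covers 64 blocks:

	#define BBSIZE			512	/* basic block size, bytes */
	#define XLOG_HEADER_CYCLE_SIZE	32768	/* cycle data per log header */

	/* For v2 logs, block i's cycle word lives in header j, slot k. */
	static void cycle_slot(int i, int *j, int *k)
	{
		*j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);	/* header */
		*k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);	/* slot */
	}
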
@@ -4003,11 +3929,8 @@ xlog_recover_check_summary(
4003{ 3929{
4004 xfs_mount_t *mp; 3930 xfs_mount_t *mp;
4005 xfs_agf_t *agfp; 3931 xfs_agf_t *agfp;
4006 xfs_agi_t *agip;
4007 xfs_buf_t *agfbp; 3932 xfs_buf_t *agfbp;
4008 xfs_buf_t *agibp; 3933 xfs_buf_t *agibp;
4009 xfs_daddr_t agfdaddr;
4010 xfs_daddr_t agidaddr;
4011 xfs_buf_t *sbbp; 3934 xfs_buf_t *sbbp;
4012#ifdef XFS_LOUD_RECOVERY 3935#ifdef XFS_LOUD_RECOVERY
4013 xfs_sb_t *sbp; 3936 xfs_sb_t *sbp;
@@ -4016,6 +3939,7 @@ xlog_recover_check_summary(
4016 __uint64_t freeblks; 3939 __uint64_t freeblks;
4017 __uint64_t itotal; 3940 __uint64_t itotal;
4018 __uint64_t ifree; 3941 __uint64_t ifree;
3942 int error;
4019 3943
4020 mp = log->l_mp; 3944 mp = log->l_mp;
4021 3945
@@ -4023,37 +3947,27 @@ xlog_recover_check_summary(
4023 itotal = 0LL; 3947 itotal = 0LL;
4024 ifree = 0LL; 3948 ifree = 0LL;
4025 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 3949 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
4026 agfdaddr = XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)); 3950 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
4027 agfbp = xfs_buf_read(mp->m_ddev_targp, agfdaddr, 3951 if (error) {
4028 XFS_FSS_TO_BB(mp, 1), 0); 3952 xfs_fs_cmn_err(CE_ALERT, mp,
4029 if (XFS_BUF_ISERROR(agfbp)) { 3953 "xlog_recover_check_summary(agf)"
4030 xfs_ioerror_alert("xlog_recover_check_summary(agf)", 3954 "agf read failed agno %d error %d",
4031 mp, agfbp, agfdaddr); 3955 agno, error);
4032 } 3956 } else {
4033 agfp = XFS_BUF_TO_AGF(agfbp); 3957 agfp = XFS_BUF_TO_AGF(agfbp);
4034 ASSERT(XFS_AGF_MAGIC == be32_to_cpu(agfp->agf_magicnum)); 3958 freeblks += be32_to_cpu(agfp->agf_freeblks) +
4035 ASSERT(XFS_AGF_GOOD_VERSION(be32_to_cpu(agfp->agf_versionnum))); 3959 be32_to_cpu(agfp->agf_flcount);
4036 ASSERT(be32_to_cpu(agfp->agf_seqno) == agno); 3960 xfs_buf_relse(agfbp);
4037
4038 freeblks += be32_to_cpu(agfp->agf_freeblks) +
4039 be32_to_cpu(agfp->agf_flcount);
4040 xfs_buf_relse(agfbp);
4041
4042 agidaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
4043 agibp = xfs_buf_read(mp->m_ddev_targp, agidaddr,
4044 XFS_FSS_TO_BB(mp, 1), 0);
4045 if (XFS_BUF_ISERROR(agibp)) {
4046 xfs_ioerror_alert("xlog_recover_check_summary(agi)",
4047 mp, agibp, agidaddr);
4048 } 3961 }
4049 agip = XFS_BUF_TO_AGI(agibp);
4050 ASSERT(XFS_AGI_MAGIC == be32_to_cpu(agip->agi_magicnum));
4051 ASSERT(XFS_AGI_GOOD_VERSION(be32_to_cpu(agip->agi_versionnum)));
4052 ASSERT(be32_to_cpu(agip->agi_seqno) == agno);
4053 3962
4054 itotal += be32_to_cpu(agip->agi_count); 3963 error = xfs_read_agi(mp, NULL, agno, &agibp);
4055 ifree += be32_to_cpu(agip->agi_freecount); 3964 if (!error) {
4056 xfs_buf_relse(agibp); 3965 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
3966
3967 itotal += be32_to_cpu(agi->agi_count);
3968 ifree += be32_to_cpu(agi->agi_freecount);
3969 xfs_buf_relse(agibp);
3970 }
4057 } 3971 }
4058 3972
4059 sbbp = xfs_getsb(mp, 0); 3973 sbbp = xfs_getsb(mp, 0);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 15f5dd22fbb2..3c97c6463a4e 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -567,8 +567,6 @@ xfs_readsb(xfs_mount_t *mp, int flags)
567STATIC void 567STATIC void
568xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp) 568xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
569{ 569{
570 int i;
571
572 mp->m_agfrotor = mp->m_agirotor = 0; 570 mp->m_agfrotor = mp->m_agirotor = 0;
573 spin_lock_init(&mp->m_agirotor_lock); 571 spin_lock_init(&mp->m_agirotor_lock);
574 mp->m_maxagi = mp->m_sb.sb_agcount; 572 mp->m_maxagi = mp->m_sb.sb_agcount;
@@ -577,12 +575,10 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
577 mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; 575 mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
578 mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1; 576 mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
579 mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog; 577 mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
580 mp->m_litino = sbp->sb_inodesize - 578 mp->m_litino = sbp->sb_inodesize - sizeof(struct xfs_dinode);
581 ((uint)sizeof(xfs_dinode_core_t) + (uint)sizeof(xfs_agino_t));
582 mp->m_blockmask = sbp->sb_blocksize - 1; 579 mp->m_blockmask = sbp->sb_blocksize - 1;
583 mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; 580 mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
584 mp->m_blockwmask = mp->m_blockwsize - 1; 581 mp->m_blockwmask = mp->m_blockwsize - 1;
585 INIT_LIST_HEAD(&mp->m_del_inodes);
586 582
587 /* 583 /*
588 * Setup for attributes, in case they get created. 584 * Setup for attributes, in case they get created.
@@ -605,24 +601,20 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
605 } 601 }
606 ASSERT(mp->m_attroffset < XFS_LITINO(mp)); 602 ASSERT(mp->m_attroffset < XFS_LITINO(mp));
607 603
608 for (i = 0; i < 2; i++) { 604 mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
609 mp->m_alloc_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize, 605 mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
610 xfs_alloc, i == 0); 606 mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
611 mp->m_alloc_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize, 607 mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
612 xfs_alloc, i == 0); 608
613 } 609 mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
614 for (i = 0; i < 2; i++) { 610 mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
615 mp->m_bmap_dmxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize, 611 mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2;
616 xfs_bmbt, i == 0); 612 mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2;
617 mp->m_bmap_dmnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize, 613
618 xfs_bmbt, i == 0); 614 mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
619 } 615 mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
620 for (i = 0; i < 2; i++) { 616 mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
621 mp->m_inobt_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize, 617 mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
622 xfs_inobt, i == 0);
623 mp->m_inobt_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
624 xfs_inobt, i == 0);
625 }
626 618
627 mp->m_bsize = XFS_FSB_TO_BB(mp, 1); 619 mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
628 mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, 620 mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
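
The unrolled assignments above all follow one convention: index 0 holds the leaf-block record limit, index 1 the node-block limit, and the minimum is always half the maximum (the half-full b-tree invariant). As a sketch, with maxrecs standing in for the per-btree helpers such as xfs_allocbt_maxrecs():

	static void fill_limits(unsigned int blocksize,
				unsigned int (*maxrecs)(unsigned int, int),
				unsigned int mxr[2], unsigned int mnr[2])
	{
		mxr[0] = maxrecs(blocksize, 1);	/* leaf blocks */
		mxr[1] = maxrecs(blocksize, 0);	/* node blocks */
		mnr[0] = mxr[0] / 2;		/* keep blocks half full */
		mnr[1] = mxr[1] / 2;
	}
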
@@ -1228,6 +1220,16 @@ xfs_unmountfs(
1228 __uint64_t resblks; 1220 __uint64_t resblks;
1229 int error; 1221 int error;
1230 1222
1223 /*
1224 * Release the dquots that the root inode, rbmino and rsumino might be holding,
1225 * and release the quota inodes.
1226 */
1227 XFS_QM_UNMOUNT(mp);
1228
1229 if (mp->m_rbmip)
1230 IRELE(mp->m_rbmip);
1231 if (mp->m_rsumip)
1232 IRELE(mp->m_rsumip);
1231 IRELE(mp->m_rootip); 1233 IRELE(mp->m_rootip);
1232 1234
1233 /* 1235 /*
@@ -1241,7 +1243,7 @@ xfs_unmountfs(
1241 * need to force the log first. 1243 * need to force the log first.
1242 */ 1244 */
1243 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); 1245 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
1244 xfs_iflush_all(mp); 1246 xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_ASYNC);
1245 1247
1246 XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING); 1248 XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING);
1247 1249
@@ -1288,11 +1290,6 @@ xfs_unmountfs(
1288 xfs_unmountfs_wait(mp); /* wait for async bufs */ 1290 xfs_unmountfs_wait(mp); /* wait for async bufs */
1289 xfs_log_unmount(mp); /* Done! No more fs ops. */ 1291 xfs_log_unmount(mp); /* Done! No more fs ops. */
1290 1292
1291 /*
1292 * All inodes from this mount point should be freed.
1293 */
1294 ASSERT(mp->m_inodes == NULL);
1295
1296 if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) 1293 if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0)
1297 uuid_table_remove(&mp->m_sb.sb_uuid); 1294 uuid_table_remove(&mp->m_sb.sb_uuid);
1298 1295
@@ -1365,24 +1362,6 @@ xfs_log_sbcount(
1365 return error; 1362 return error;
1366} 1363}
1367 1364
1368STATIC void
1369xfs_mark_shared_ro(
1370 xfs_mount_t *mp,
1371 xfs_buf_t *bp)
1372{
1373 xfs_dsb_t *sb = XFS_BUF_TO_SBP(bp);
1374 __uint16_t version;
1375
1376 if (!(sb->sb_flags & XFS_SBF_READONLY))
1377 sb->sb_flags |= XFS_SBF_READONLY;
1378
1379 version = be16_to_cpu(sb->sb_versionnum);
1380 if ((version & XFS_SB_VERSION_NUMBITS) != XFS_SB_VERSION_4 ||
1381 !(version & XFS_SB_VERSION_SHAREDBIT))
1382 version |= XFS_SB_VERSION_SHAREDBIT;
1383 sb->sb_versionnum = cpu_to_be16(version);
1384}
1385
1386int 1365int
1387xfs_unmountfs_writesb(xfs_mount_t *mp) 1366xfs_unmountfs_writesb(xfs_mount_t *mp)
1388{ 1367{
@@ -1398,12 +1377,6 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
1398 1377
1399 sbp = xfs_getsb(mp, 0); 1378 sbp = xfs_getsb(mp, 0);
1400 1379
1401 /*
1402 * mark shared-readonly if desired
1403 */
1404 if (mp->m_mk_sharedro)
1405 xfs_mark_shared_ro(mp, sbp);
1406
1407 XFS_BUF_UNDONE(sbp); 1380 XFS_BUF_UNDONE(sbp);
1408 XFS_BUF_UNREAD(sbp); 1381 XFS_BUF_UNREAD(sbp);
1409 XFS_BUF_UNDELAYWRITE(sbp); 1382 XFS_BUF_UNDELAYWRITE(sbp);
@@ -1415,8 +1388,6 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
1415 if (error) 1388 if (error)
1416 xfs_ioerror_alert("xfs_unmountfs_writesb", 1389 xfs_ioerror_alert("xfs_unmountfs_writesb",
1417 mp, sbp, XFS_BUF_ADDR(sbp)); 1390 mp, sbp, XFS_BUF_ADDR(sbp));
1418 if (error && mp->m_mk_sharedro)
1419 xfs_fs_cmn_err(CE_ALERT, mp, "Superblock write error detected while unmounting. Filesystem may not be marked shared readonly");
1420 xfs_buf_relse(sbp); 1391 xfs_buf_relse(sbp);
1421 } 1392 }
1422 return error; 1393 return error;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index f3c1024b1241..c1e028467327 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -18,7 +18,6 @@
18#ifndef __XFS_MOUNT_H__ 18#ifndef __XFS_MOUNT_H__
19#define __XFS_MOUNT_H__ 19#define __XFS_MOUNT_H__
20 20
21
22typedef struct xfs_trans_reservations { 21typedef struct xfs_trans_reservations {
23 uint tr_write; /* extent alloc trans */ 22 uint tr_write; /* extent alloc trans */
24 uint tr_itruncate; /* truncate trans */ 23 uint tr_itruncate; /* truncate trans */
@@ -44,14 +43,16 @@ typedef struct xfs_trans_reservations {
44} xfs_trans_reservations_t; 43} xfs_trans_reservations_t;
45 44
46#ifndef __KERNEL__ 45#ifndef __KERNEL__
47/* 46
48 * Moved here from xfs_ag.h to avoid reordering header files
49 */
50#define XFS_DADDR_TO_AGNO(mp,d) \ 47#define XFS_DADDR_TO_AGNO(mp,d) \
51 ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks)) 48 ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks))
52#define XFS_DADDR_TO_AGBNO(mp,d) \ 49#define XFS_DADDR_TO_AGBNO(mp,d) \
53 ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks)) 50 ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks))
54#else 51
52#else /* __KERNEL__ */
53
54#include "xfs_sync.h"
55
55struct cred; 56struct cred;
56struct log; 57struct log;
57struct xfs_mount_args; 58struct xfs_mount_args;
@@ -62,6 +63,7 @@ struct xfs_extdelta;
62struct xfs_swapext; 63struct xfs_swapext;
63struct xfs_mru_cache; 64struct xfs_mru_cache;
64struct xfs_nameops; 65struct xfs_nameops;
66struct xfs_ail;
65 67
66/* 68/*
67 * Prototypes and functions for the Data Migration subsystem. 69 * Prototypes and functions for the Data Migration subsystem.
@@ -115,7 +117,7 @@ struct xfs_quotainfo;
115 117
116typedef int (*xfs_qminit_t)(struct xfs_mount *, uint *, uint *); 118typedef int (*xfs_qminit_t)(struct xfs_mount *, uint *, uint *);
117typedef int (*xfs_qmmount_t)(struct xfs_mount *, uint, uint); 119typedef int (*xfs_qmmount_t)(struct xfs_mount *, uint, uint);
118typedef int (*xfs_qmunmount_t)(struct xfs_mount *); 120typedef void (*xfs_qmunmount_t)(struct xfs_mount *);
119typedef void (*xfs_qmdone_t)(struct xfs_mount *); 121typedef void (*xfs_qmdone_t)(struct xfs_mount *);
120typedef void (*xfs_dqrele_t)(struct xfs_dquot *); 122typedef void (*xfs_dqrele_t)(struct xfs_dquot *);
121typedef int (*xfs_dqattach_t)(struct xfs_inode *, uint); 123typedef int (*xfs_dqattach_t)(struct xfs_inode *, uint);
@@ -132,7 +134,7 @@ typedef struct xfs_dquot * (*xfs_dqvopchown_t)(
132 struct xfs_dquot **, struct xfs_dquot *); 134 struct xfs_dquot **, struct xfs_dquot *);
133typedef int (*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *, 135typedef int (*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *,
134 struct xfs_dquot *, struct xfs_dquot *, uint); 136 struct xfs_dquot *, struct xfs_dquot *, uint);
135typedef void (*xfs_dqstatvfs_t)(struct xfs_inode *, bhv_statvfs_t *); 137typedef void (*xfs_dqstatvfs_t)(struct xfs_inode *, struct kstatfs *);
136typedef int (*xfs_dqsync_t)(struct xfs_mount *, int flags); 138typedef int (*xfs_dqsync_t)(struct xfs_mount *, int flags);
137typedef int (*xfs_quotactl_t)(struct xfs_mount *, int, int, xfs_caddr_t); 139typedef int (*xfs_quotactl_t)(struct xfs_mount *, int, int, xfs_caddr_t);
138 140
@@ -223,18 +225,10 @@ extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
223#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0) 225#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0)
224#endif 226#endif
225 227
226typedef struct xfs_ail {
227 struct list_head xa_ail;
228 uint xa_gen;
229 struct task_struct *xa_task;
230 xfs_lsn_t xa_target;
231} xfs_ail_t;
232
233typedef struct xfs_mount { 228typedef struct xfs_mount {
234 struct super_block *m_super; 229 struct super_block *m_super;
235 xfs_tid_t m_tid; /* next unused tid for fs */ 230 xfs_tid_t m_tid; /* next unused tid for fs */
236 spinlock_t m_ail_lock; /* fs AIL mutex */ 231 struct xfs_ail *m_ail; /* fs active log item list */
237 xfs_ail_t m_ail; /* fs active log item list */
238 xfs_sb_t m_sb; /* copy of fs superblock */ 232 xfs_sb_t m_sb; /* copy of fs superblock */
239 spinlock_t m_sb_lock; /* sb counter lock */ 233 spinlock_t m_sb_lock; /* sb counter lock */
240 struct xfs_buf *m_sb_bp; /* buffer for superblock */ 234 struct xfs_buf *m_sb_bp; /* buffer for superblock */
@@ -247,10 +241,6 @@ typedef struct xfs_mount {
247 xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */ 241 xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */
248 spinlock_t m_agirotor_lock;/* .. and lock protecting it */ 242 spinlock_t m_agirotor_lock;/* .. and lock protecting it */
249 xfs_agnumber_t m_maxagi; /* highest inode alloc group */ 243 xfs_agnumber_t m_maxagi; /* highest inode alloc group */
250 struct xfs_inode *m_inodes; /* active inode list */
251 struct list_head m_del_inodes; /* inodes to reclaim */
252 mutex_t m_ilock; /* inode list mutex */
253 uint m_ireclaims; /* count of calls to reclaim*/
254 uint m_readio_log; /* min read size log bytes */ 244 uint m_readio_log; /* min read size log bytes */
255 uint m_readio_blocks; /* min read size blocks */ 245 uint m_readio_blocks; /* min read size blocks */
256 uint m_writeio_log; /* min write size log bytes */ 246 uint m_writeio_log; /* min write size log bytes */
@@ -267,7 +257,6 @@ typedef struct xfs_mount {
267 xfs_buftarg_t *m_ddev_targp; /* saves taking the address */ 257 xfs_buftarg_t *m_ddev_targp; /* saves taking the address */
268 xfs_buftarg_t *m_logdev_targp;/* ptr to log device */ 258 xfs_buftarg_t *m_logdev_targp;/* ptr to log device */
269 xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */ 259 xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */
270 __uint8_t m_dircook_elog; /* log d-cookie entry bits */
271 __uint8_t m_blkbit_log; /* blocklog + NBBY */ 260 __uint8_t m_blkbit_log; /* blocklog + NBBY */
272 __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ 261 __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */
273 __uint8_t m_agno_log; /* log #ag's */ 262 __uint8_t m_agno_log; /* log #ag's */
@@ -276,12 +265,12 @@ typedef struct xfs_mount {
276 uint m_blockmask; /* sb_blocksize-1 */ 265 uint m_blockmask; /* sb_blocksize-1 */
277 uint m_blockwsize; /* sb_blocksize in words */ 266 uint m_blockwsize; /* sb_blocksize in words */
278 uint m_blockwmask; /* blockwsize-1 */ 267 uint m_blockwmask; /* blockwsize-1 */
279 uint m_alloc_mxr[2]; /* XFS_ALLOC_BLOCK_MAXRECS */ 268 uint m_alloc_mxr[2]; /* max alloc btree records */
280 uint m_alloc_mnr[2]; /* XFS_ALLOC_BLOCK_MINRECS */ 269 uint m_alloc_mnr[2]; /* min alloc btree records */
281 uint m_bmap_dmxr[2]; /* XFS_BMAP_BLOCK_DMAXRECS */ 270 uint m_bmap_dmxr[2]; /* max bmap btree records */
282 uint m_bmap_dmnr[2]; /* XFS_BMAP_BLOCK_DMINRECS */ 271 uint m_bmap_dmnr[2]; /* min bmap btree records */
283 uint m_inobt_mxr[2]; /* XFS_INOBT_BLOCK_MAXRECS */ 272 uint m_inobt_mxr[2]; /* max inobt btree records */
284 uint m_inobt_mnr[2]; /* XFS_INOBT_BLOCK_MINRECS */ 273 uint m_inobt_mnr[2]; /* min inobt btree records */
285 uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ 274 uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
286 uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ 275 uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
287 uint m_in_maxlevels; /* XFS_IN_MAXLEVELS */ 276 uint m_in_maxlevels; /* XFS_IN_MAXLEVELS */
@@ -312,9 +301,6 @@ typedef struct xfs_mount {
312 int m_sinoalign; /* stripe unit inode alignment */ 301 int m_sinoalign; /* stripe unit inode alignment */
313 int m_attr_magicpct;/* 37% of the blocksize */ 302 int m_attr_magicpct;/* 37% of the blocksize */
314 int m_dir_magicpct; /* 37% of the dir blocksize */ 303 int m_dir_magicpct; /* 37% of the dir blocksize */
315 __uint8_t m_mk_sharedro; /* mark shared ro on unmount */
316 __uint8_t m_inode_quiesce;/* call quiesce on new inodes.
317 field governed by m_ilock */
318 __uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ 304 __uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
319 const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */ 305 const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */
320 int m_dirblksize; /* directory block sz--bytes */ 306 int m_dirblksize; /* directory block sz--bytes */
@@ -362,7 +348,6 @@ typedef struct xfs_mount {
362#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */ 348#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */
363#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */ 349#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */
364#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */ 350#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */
365#define XFS_MOUNT_SHARED (1ULL << 11) /* shared mount */
366#define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */ 351#define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */
367#define XFS_MOUNT_OSYNCISOSYNC (1ULL << 13) /* o_sync is REALLY o_sync */ 352#define XFS_MOUNT_OSYNCISOSYNC (1ULL << 13) /* o_sync is REALLY o_sync */
368 /* osyncisdsync is now default*/ 353 /* osyncisdsync is now default*/
@@ -439,6 +424,16 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
439#define xfs_force_shutdown(m,f) \ 424#define xfs_force_shutdown(m,f) \
440 xfs_do_force_shutdown(m, f, __FILE__, __LINE__) 425 xfs_do_force_shutdown(m, f, __FILE__, __LINE__)
441 426
427#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */
428#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */
429#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */
430#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */
431#define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */
432#define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */
433
434#define xfs_test_for_freeze(mp) ((mp)->m_super->s_frozen)
435#define xfs_wait_for_freeze(mp,l) vfs_check_frozen((mp)->m_super, (l))
436
442/* 437/*
443 * Flags for xfs_mountfs 438 * Flags for xfs_mountfs
444 */ 439 */
@@ -508,14 +503,12 @@ typedef struct xfs_mod_sb {
508#define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock)) 503#define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock))
509#define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock)) 504#define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock))
510 505
511extern void xfs_mod_sb(xfs_trans_t *, __int64_t);
512extern int xfs_log_sbcount(xfs_mount_t *, uint); 506extern int xfs_log_sbcount(xfs_mount_t *, uint);
513extern int xfs_mountfs(xfs_mount_t *mp); 507extern int xfs_mountfs(xfs_mount_t *mp);
514extern void xfs_mountfs_check_barriers(xfs_mount_t *mp); 508extern void xfs_mountfs_check_barriers(xfs_mount_t *mp);
515 509
516extern void xfs_unmountfs(xfs_mount_t *); 510extern void xfs_unmountfs(xfs_mount_t *);
517extern int xfs_unmountfs_writesb(xfs_mount_t *); 511extern int xfs_unmountfs_writesb(xfs_mount_t *);
518extern int xfs_unmount_flush(xfs_mount_t *, int);
519extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); 512extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
520extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t, 513extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t,
521 int64_t, int); 514 int64_t, int);
@@ -525,20 +518,20 @@ extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
525extern int xfs_readsb(xfs_mount_t *, int); 518extern int xfs_readsb(xfs_mount_t *, int);
526extern void xfs_freesb(xfs_mount_t *); 519extern void xfs_freesb(xfs_mount_t *);
527extern int xfs_fs_writable(xfs_mount_t *); 520extern int xfs_fs_writable(xfs_mount_t *);
528extern int xfs_syncsub(xfs_mount_t *, int, int *);
529extern int xfs_sync_inodes(xfs_mount_t *, int, int *);
530extern xfs_agnumber_t xfs_initialize_perag(xfs_mount_t *, xfs_agnumber_t);
531extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
532extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
533extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); 521extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
534 522
535extern int xfs_dmops_get(struct xfs_mount *, struct xfs_mount_args *); 523extern int xfs_dmops_get(struct xfs_mount *);
536extern void xfs_dmops_put(struct xfs_mount *); 524extern void xfs_dmops_put(struct xfs_mount *);
537extern int xfs_qmops_get(struct xfs_mount *, struct xfs_mount_args *); 525extern int xfs_qmops_get(struct xfs_mount *);
538extern void xfs_qmops_put(struct xfs_mount *); 526extern void xfs_qmops_put(struct xfs_mount *);
539 527
540extern struct xfs_dmops xfs_dmcore_xfs; 528extern struct xfs_dmops xfs_dmcore_xfs;
541 529
542#endif /* __KERNEL__ */ 530#endif /* __KERNEL__ */
543 531
532extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
533extern xfs_agnumber_t xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t);
534extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
535extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
536
544#endif /* __XFS_MOUNT_H__ */ 537#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c
index a294e58db8dd..27f80581520a 100644
--- a/fs/xfs/xfs_qmops.c
+++ b/fs/xfs/xfs_qmops.c
@@ -28,7 +28,6 @@
28#include "xfs_mount.h" 28#include "xfs_mount.h"
29#include "xfs_quota.h" 29#include "xfs_quota.h"
30#include "xfs_error.h" 30#include "xfs_error.h"
31#include "xfs_clnt.h"
32 31
33 32
34STATIC struct xfs_dquot * 33STATIC struct xfs_dquot *
@@ -131,9 +130,9 @@ static struct xfs_qmops xfs_qmcore_stub = {
131}; 130};
132 131
133int 132int
134xfs_qmops_get(struct xfs_mount *mp, struct xfs_mount_args *args) 133xfs_qmops_get(struct xfs_mount *mp)
135{ 134{
136 if (args->flags & (XFSMNT_UQUOTA | XFSMNT_PQUOTA | XFSMNT_GQUOTA)) { 135 if (XFS_IS_QUOTA_RUNNING(mp)) {
137#ifdef CONFIG_XFS_QUOTA 136#ifdef CONFIG_XFS_QUOTA
138 mp->m_qm_ops = &xfs_qmcore_xfs; 137 mp->m_qm_ops = &xfs_qmcore_xfs;
139#else 138#else
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 12c4ec775af8..48965ecaa155 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -84,11 +84,9 @@ typedef struct xfs_dqblk {
84#define XFS_DQ_USER 0x0001 /* a user quota */ 84#define XFS_DQ_USER 0x0001 /* a user quota */
85#define XFS_DQ_PROJ 0x0002 /* project quota */ 85#define XFS_DQ_PROJ 0x0002 /* project quota */
86#define XFS_DQ_GROUP 0x0004 /* a group quota */ 86#define XFS_DQ_GROUP 0x0004 /* a group quota */
87#define XFS_DQ_FLOCKED 0x0008 /* flush lock taken */ 87#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */
88#define XFS_DQ_DIRTY 0x0010 /* dquot is dirty */ 88#define XFS_DQ_WANT 0x0010 /* for lookup/reclaim race */
89#define XFS_DQ_WANT 0x0020 /* for lookup/reclaim race */ 89#define XFS_DQ_INACTIVE 0x0020 /* dq off mplist & hashlist */
90#define XFS_DQ_INACTIVE 0x0040 /* dq off mplist & hashlist */
91#define XFS_DQ_MARKER 0x0080 /* sentinel */
92 90
93#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) 91#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
94 92
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index c903130be7fd..86471bb40fd4 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -42,31 +42,6 @@
42 42
43 43
44/* 44/*
45 * Given an array of up to 4 inode pointers, unlock the pointed to inodes.
46 * If there are fewer than 4 entries in the array, the empty entries will
47 * be at the end and will have NULL pointers in them.
48 */
49STATIC void
50xfs_rename_unlock4(
51 xfs_inode_t **i_tab,
52 uint lock_mode)
53{
54 int i;
55
56 xfs_iunlock(i_tab[0], lock_mode);
57 for (i = 1; i < 4; i++) {
58 if (i_tab[i] == NULL)
59 break;
60
61 /*
62 * Watch out for duplicate entries in the table.
63 */
64 if (i_tab[i] != i_tab[i-1])
65 xfs_iunlock(i_tab[i], lock_mode);
66 }
67}
68
69/*
70 * Enter all inodes for a rename transaction into a sorted array. 45 * Enter all inodes for a rename transaction into a sorted array.
71 */ 46 */
72STATIC void 47STATIC void
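
Sorting the rename inodes before locking them is a deadlock-avoidance device: if every task takes inode locks in ascending inode-number order, two concurrent renames can never each hold a lock the other needs. A userspace illustration of the ordering step (toy types; the kernel sorts up to four xfs_inode pointers the same way):

	#include <stdlib.h>

	struct toy_inode {
		unsigned long long	ino;
	};

	/* qsort comparator: ascending inode number fixes the lock order. */
	static int by_ino(const void *a, const void *b)
	{
		const struct toy_inode *x = *(const struct toy_inode * const *)a;
		const struct toy_inode *y = *(const struct toy_inode * const *)b;

		return (x->ino > y->ino) - (x->ino < y->ino);
	}

	/* usage: qsort(i_tab, num, sizeof(i_tab[0]), by_ino); then take
	 * the locks in array order. */
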
@@ -205,19 +180,6 @@ xfs_rename(
205 xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); 180 xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
206 181
207 /* 182 /*
208 * If we are using project inheritance, we only allow renames
209 * into our tree when the project IDs are the same; else the
210 * tree quota mechanism would be circumvented.
211 */
212 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
213 (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) {
214 error = XFS_ERROR(EXDEV);
215 xfs_rename_unlock4(inodes, XFS_ILOCK_EXCL);
216 xfs_trans_cancel(tp, cancel_flags);
217 goto std_return;
218 }
219
220 /*
221 * Join all the inodes to the transaction. From this point on, 183 * Join all the inodes to the transaction. From this point on,
222 * we can rely on either trans_commit or trans_cancel to unlock 184 * we can rely on either trans_commit or trans_cancel to unlock
223 * them. Note that we need to add a vnode reference to the 185 * them. Note that we need to add a vnode reference to the
@@ -242,6 +204,17 @@ xfs_rename(
242 } 204 }
243 205
244 /* 206 /*
207 * If we are using project inheritance, we only allow renames
208 * into our tree when the project IDs are the same; else the
209 * tree quota mechanism would be circumvented.
210 */
211 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
212 (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) {
213 error = XFS_ERROR(EXDEV);
214 goto error_return;
215 }
216
217 /*
245 * Set up the target. 218 * Set up the target.
246 */ 219 */
247 if (target_ip == NULL) { 220 if (target_ip == NULL) {
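
Moving the project-ID check below the join is what makes the shorter error path legal: once the inodes are joined to the transaction, xfs_trans_cancel() unlocks them, so the hand-rolled unlock helper can go. The two shapes, taken from the hunks above:

	/* before the move: unlock by hand, then cancel */
	error = XFS_ERROR(EXDEV);
	xfs_rename_unlock4(inodes, XFS_ILOCK_EXCL);
	xfs_trans_cancel(tp, cancel_flags);
	goto std_return;

	/* after the move: cancel unlocks the joined inodes */
	error = XFS_ERROR(EXDEV);
	goto error_return;
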
@@ -367,19 +340,11 @@ xfs_rename(
367 &first_block, &free_list, spaceres); 340 &first_block, &free_list, spaceres);
368 if (error) 341 if (error)
369 goto abort_return; 342 goto abort_return;
370 xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
371 343
372 /* 344 xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
373 * Update the generation counts on all the directory inodes
374 * that we're modifying.
375 */
376 src_dp->i_gen++;
377 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); 345 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
378 346 if (new_parent)
379 if (new_parent) {
380 target_dp->i_gen++;
381 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); 347 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
382 }
383 348
384 /* 349 /*
385 * If this is a synchronous mount, make sure that the 350 * If this is a synchronous mount, make sure that the
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index e2f68de16159..edf12c7b834c 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -85,7 +85,6 @@ xfs_growfs_rt_alloc(
85{ 85{
86 xfs_fileoff_t bno; /* block number in file */ 86 xfs_fileoff_t bno; /* block number in file */
87 xfs_buf_t *bp; /* temporary buffer for zeroing */ 87 xfs_buf_t *bp; /* temporary buffer for zeroing */
88 int cancelflags; /* flags for xfs_trans_cancel */
89 int committed; /* transaction committed flag */ 88 int committed; /* transaction committed flag */
90 xfs_daddr_t d; /* disk block address */ 89 xfs_daddr_t d; /* disk block address */
91 int error; /* error return value */ 90 int error; /* error return value */
@@ -96,15 +95,16 @@ xfs_growfs_rt_alloc(
96 xfs_bmbt_irec_t map; /* block map output */ 95 xfs_bmbt_irec_t map; /* block map output */
97 int nmap; /* number of block maps */ 96 int nmap; /* number of block maps */
98 int resblks; /* space reservation */ 97 int resblks; /* space reservation */
99 xfs_trans_t *tp; /* transaction pointer */
100 98
101 /* 99 /*
102 * Allocate space to the file, as necessary. 100 * Allocate space to the file, as necessary.
103 */ 101 */
104 while (oblocks < nblocks) { 102 while (oblocks < nblocks) {
103 int cancelflags = 0;
104 xfs_trans_t *tp;
105
105 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); 106 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
106 resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks); 107 resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
107 cancelflags = 0;
108 /* 108 /*
109 * Reserve space & log for one extent added to the file. 109 * Reserve space & log for one extent added to the file.
110 */ 110 */
@@ -171,7 +171,9 @@ xfs_growfs_rt_alloc(
171 mp->m_bsize, 0); 171 mp->m_bsize, 0);
172 if (bp == NULL) { 172 if (bp == NULL) {
173 error = XFS_ERROR(EIO); 173 error = XFS_ERROR(EIO);
174 goto error_cancel; 174error_cancel:
175 xfs_trans_cancel(tp, cancelflags);
176 goto error;
175 } 177 }
176 memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize); 178 memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize);
177 xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); 179 xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
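
The error_cancel label above lands inside the if body, which is legal C: later failure sites jump back into that block so the cancellation runs exactly once. The shape of the idiom, with hypothetical helpers:

	static int step1(void), step2(void);	/* hypothetical */
	static void cancel(void);		/* hypothetical */

	static int do_work(void)
	{
		int error;

		if (step1() != 0) {
			error = -1;
	error_cancel:
			cancel();		/* single cancellation point */
			return error;
		}
		if (step2() != 0) {
			error = -2;
			goto error_cancel;	/* jumps back into the block */
		}
		return 0;
	}
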
@@ -188,8 +190,6 @@ xfs_growfs_rt_alloc(
188 oblocks = map.br_startoff + map.br_blockcount; 190 oblocks = map.br_startoff + map.br_blockcount;
189 } 191 }
190 return 0; 192 return 0;
191error_cancel:
192 xfs_trans_cancel(tp, cancelflags);
193error: 193error:
194 return error; 194 return error;
195} 195}
@@ -1856,7 +1856,6 @@ xfs_growfs_rt(
1856{ 1856{
1857 xfs_rtblock_t bmbno; /* bitmap block number */ 1857 xfs_rtblock_t bmbno; /* bitmap block number */
1858 xfs_buf_t *bp; /* temporary buffer */ 1858 xfs_buf_t *bp; /* temporary buffer */
1859 int cancelflags; /* flags for xfs_trans_cancel */
1860 int error; /* error return value */ 1859 int error; /* error return value */
1861 xfs_inode_t *ip; /* bitmap inode, used as lock */ 1860 xfs_inode_t *ip; /* bitmap inode, used as lock */
1862 xfs_mount_t *nmp; /* new (fake) mount structure */ 1861 xfs_mount_t *nmp; /* new (fake) mount structure */
@@ -1872,13 +1871,13 @@ xfs_growfs_rt(
1872 xfs_extlen_t rsumblocks; /* current number of rt summary blks */ 1871 xfs_extlen_t rsumblocks; /* current number of rt summary blks */
1873 xfs_sb_t *sbp; /* old superblock */ 1872 xfs_sb_t *sbp; /* old superblock */
1874 xfs_fsblock_t sumbno; /* summary block number */ 1873 xfs_fsblock_t sumbno; /* summary block number */
1875 xfs_trans_t *tp; /* transaction pointer */
1876 1874
1877 sbp = &mp->m_sb; 1875 sbp = &mp->m_sb;
1878 cancelflags = 0;
1879 /* 1876 /*
1880 * Initial error checking. 1877 * Initial error checking.
1881 */ 1878 */
1879 if (!capable(CAP_SYS_ADMIN))
1880 return XFS_ERROR(EPERM);
1882 if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL || 1881 if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL ||
1883 (nrblocks = in->newblocks) <= sbp->sb_rblocks || 1882 (nrblocks = in->newblocks) <= sbp->sb_rblocks ||
1884 (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize))) 1883 (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize)))
@@ -1942,6 +1941,9 @@ xfs_growfs_rt(
1942 ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0); 1941 ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0);
1943 bmbno < nrbmblocks; 1942 bmbno < nrbmblocks;
1944 bmbno++) { 1943 bmbno++) {
1944 xfs_trans_t *tp;
1945 int cancelflags = 0;
1946
1945 *nmp = *mp; 1947 *nmp = *mp;
1946 nsbp = &nmp->m_sb; 1948 nsbp = &nmp->m_sb;
1947 /* 1949 /*
@@ -1967,16 +1969,15 @@ xfs_growfs_rt(
1967 * Start a transaction, get the log reservation. 1969 * Start a transaction, get the log reservation.
1968 */ 1970 */
1969 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE); 1971 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE);
1970 cancelflags = 0;
1971 if ((error = xfs_trans_reserve(tp, 0, 1972 if ((error = xfs_trans_reserve(tp, 0,
1972 XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0))) 1973 XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0)))
1973 break; 1974 goto error_cancel;
1974 /* 1975 /*
1975 * Lock out other callers by grabbing the bitmap inode lock. 1976 * Lock out other callers by grabbing the bitmap inode lock.
1976 */ 1977 */
1977 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, 1978 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
1978 XFS_ILOCK_EXCL, &ip))) 1979 XFS_ILOCK_EXCL, &ip)))
1979 break; 1980 goto error_cancel;
1980 ASSERT(ip == mp->m_rbmip); 1981 ASSERT(ip == mp->m_rbmip);
1981 /* 1982 /*
1982 * Update the bitmap inode's size. 1983 * Update the bitmap inode's size.
@@ -1990,7 +1991,7 @@ xfs_growfs_rt(
1990 */ 1991 */
1991 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0, 1992 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0,
1992 XFS_ILOCK_EXCL, &ip))) 1993 XFS_ILOCK_EXCL, &ip)))
1993 break; 1994 goto error_cancel;
1994 ASSERT(ip == mp->m_rsumip); 1995 ASSERT(ip == mp->m_rsumip);
1995 /* 1996 /*
1996 * Update the summary inode's size. 1997 * Update the summary inode's size.
@@ -2005,7 +2006,7 @@ xfs_growfs_rt(
2005 mp->m_rsumlevels != nmp->m_rsumlevels) { 2006 mp->m_rsumlevels != nmp->m_rsumlevels) {
2006 error = xfs_rtcopy_summary(mp, nmp, tp); 2007 error = xfs_rtcopy_summary(mp, nmp, tp);
2007 if (error) 2008 if (error)
2008 break; 2009 goto error_cancel;
2009 } 2010 }
2010 /* 2011 /*
2011 * Update superblock fields. 2012 * Update superblock fields.
@@ -2031,8 +2032,11 @@ xfs_growfs_rt(
2031 bp = NULL; 2032 bp = NULL;
2032 error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents, 2033 error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents,
2033 nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno); 2034 nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno);
2034 if (error) 2035 if (error) {
2036error_cancel:
2037 xfs_trans_cancel(tp, cancelflags);
2035 break; 2038 break;
2039 }
2036 /* 2040 /*
2037 * Mark more blocks free in the superblock. 2041 * Mark more blocks free in the superblock.
2038 */ 2042 */
@@ -2045,15 +2049,10 @@ xfs_growfs_rt(
2045 mp->m_rsumsize = nrsumsize; 2049 mp->m_rsumsize = nrsumsize;
2046 2050
2047 error = xfs_trans_commit(tp, 0); 2051 error = xfs_trans_commit(tp, 0);
2048 if (error) { 2052 if (error)
2049 tp = NULL;
2050 break; 2053 break;
2051 }
2052 } 2054 }
2053 2055
2054 if (error && tp)
2055 xfs_trans_cancel(tp, cancelflags);
2056
2057 /* 2056 /*
2058 * Free the fake mp structure. 2057 * Free the fake mp structure.
2059 */ 2058 */
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index 3a82576dde9a..36f3a21c54d2 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -406,7 +406,7 @@ xfs_bwrite(
406 * XXXsup how does this work for quotas. 406 * XXXsup how does this work for quotas.
407 */ 407 */
408 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb); 408 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
409 XFS_BUF_SET_FSPRIVATE3(bp, mp); 409 bp->b_mount = mp;
410 XFS_BUF_WRITE(bp); 410 XFS_BUF_WRITE(bp);
411 411
412 if ((error = XFS_bwrite(bp))) { 412 if ((error = XFS_bwrite(bp))) {
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 3f8cf1587f4c..1ed71916e4c9 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -79,6 +79,7 @@ struct xfs_mount;
79#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */ 79#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */
80#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 80#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004
81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ 81#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
82#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
82 83
83#define XFS_SB_VERSION2_OKREALFBITS \ 84#define XFS_SB_VERSION2_OKREALFBITS \
84 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ 85 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
@@ -296,30 +297,34 @@ typedef enum {
296 297
297#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS) 298#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
298 299
299#ifdef __KERNEL__
300static inline int xfs_sb_good_version(xfs_sb_t *sbp) 300static inline int xfs_sb_good_version(xfs_sb_t *sbp)
301{ 301{
302 return (((sbp->sb_versionnum >= XFS_SB_VERSION_1) && \ 302 /* We always support versions 1-3 */
303 (sbp->sb_versionnum <= XFS_SB_VERSION_3)) || \ 303 if (sbp->sb_versionnum >= XFS_SB_VERSION_1 &&
304 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 304 sbp->sb_versionnum <= XFS_SB_VERSION_3)
305 !((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) || \ 305 return 1;
306 ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && \ 306
307 (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS))) && \ 307 /* We support version 4 if all feature bits are supported */
308 (sbp->sb_shared_vn <= XFS_SB_MAX_SHARED_VN))); 308 if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) {
309} 309 if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) ||
310 ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
311 (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS)))
312 return 0;
313
314#ifdef __KERNEL__
315 if (sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN)
316 return 0;
310#else 317#else
311static inline int xfs_sb_good_version(xfs_sb_t *sbp) 318 if ((sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) &&
312{ 319 sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN)
313 return (((sbp->sb_versionnum >= XFS_SB_VERSION_1) && \ 320 return 0;
314 (sbp->sb_versionnum <= XFS_SB_VERSION_3)) || \ 321#endif
315 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 322
316 !((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) || \ 323 return 1;
317 ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && \ 324 }
318 (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS))) && \ 325
319 (!(sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) || \ 326 return 0;
320 (sbp->sb_shared_vn <= XFS_SB_MAX_SHARED_VN))));
321} 327}
322#endif /* __KERNEL__ */
323 328
324/* 329/*
325 * Detect a mismatched features2 field. Older kernels read/wrote 330 * Detect a mismatched features2 field. Older kernels read/wrote
@@ -332,123 +337,127 @@ static inline int xfs_sb_has_mismatched_features2(xfs_sb_t *sbp)
332 337
333static inline unsigned xfs_sb_version_tonew(unsigned v) 338static inline unsigned xfs_sb_version_tonew(unsigned v)
334{ 339{
335 return ((((v) == XFS_SB_VERSION_1) ? \ 340 if (v == XFS_SB_VERSION_1)
336 0 : \ 341 return XFS_SB_VERSION_4;
337 (((v) == XFS_SB_VERSION_2) ? \ 342
338 XFS_SB_VERSION_ATTRBIT : \ 343 if (v == XFS_SB_VERSION_2)
339 (XFS_SB_VERSION_ATTRBIT | XFS_SB_VERSION_NLINKBIT))) | \ 344 return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT;
340 XFS_SB_VERSION_4); 345
346 return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT |
347 XFS_SB_VERSION_NLINKBIT;
341} 348}
342 349
343static inline unsigned xfs_sb_version_toold(unsigned v) 350static inline unsigned xfs_sb_version_toold(unsigned v)
344{ 351{
345 return (((v) & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT)) ? \ 352 if (v & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT))
346 0 : \ 353 return 0;
347 (((v) & XFS_SB_VERSION_NLINKBIT) ? \ 354 if (v & XFS_SB_VERSION_NLINKBIT)
348 XFS_SB_VERSION_3 : \ 355 return XFS_SB_VERSION_3;
349 (((v) & XFS_SB_VERSION_ATTRBIT) ? \ 356 if (v & XFS_SB_VERSION_ATTRBIT)
350 XFS_SB_VERSION_2 : \ 357 return XFS_SB_VERSION_2;
351 XFS_SB_VERSION_1))); 358 return XFS_SB_VERSION_1;
352} 359}
353 360
354static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp) 361static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp)
355{ 362{
356 return ((sbp)->sb_versionnum == XFS_SB_VERSION_2) || \ 363 return sbp->sb_versionnum == XFS_SB_VERSION_2 ||
357 ((sbp)->sb_versionnum == XFS_SB_VERSION_3) || \ 364 sbp->sb_versionnum == XFS_SB_VERSION_3 ||
358 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 365 (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
359 ((sbp)->sb_versionnum & XFS_SB_VERSION_ATTRBIT)); 366 (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT));
360} 367}
361 368
362static inline void xfs_sb_version_addattr(xfs_sb_t *sbp) 369static inline void xfs_sb_version_addattr(xfs_sb_t *sbp)
363{ 370{
364 (sbp)->sb_versionnum = (((sbp)->sb_versionnum == XFS_SB_VERSION_1) ? \ 371 if (sbp->sb_versionnum == XFS_SB_VERSION_1)
365 XFS_SB_VERSION_2 : \ 372 sbp->sb_versionnum = XFS_SB_VERSION_2;
366 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) ? \ 373 else if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
367 ((sbp)->sb_versionnum | XFS_SB_VERSION_ATTRBIT) : \ 374 sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
368 (XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT))); 375 else
376 sbp->sb_versionnum = XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT;
369} 377}
370 378
371static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp) 379static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp)
372{ 380{
373 return ((sbp)->sb_versionnum == XFS_SB_VERSION_3) || \ 381 return sbp->sb_versionnum == XFS_SB_VERSION_3 ||
374 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 382 (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
375 ((sbp)->sb_versionnum & XFS_SB_VERSION_NLINKBIT)); 383 (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT));
376} 384}
377 385
378static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp) 386static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp)
379{ 387{
380 (sbp)->sb_versionnum = ((sbp)->sb_versionnum <= XFS_SB_VERSION_2 ? \ 388 if (sbp->sb_versionnum <= XFS_SB_VERSION_2)
381 XFS_SB_VERSION_3 : \ 389 sbp->sb_versionnum = XFS_SB_VERSION_3;
382 ((sbp)->sb_versionnum | XFS_SB_VERSION_NLINKBIT)); 390 else
391 sbp->sb_versionnum |= XFS_SB_VERSION_NLINKBIT;
383} 392}
384 393
385static inline int xfs_sb_version_hasquota(xfs_sb_t *sbp) 394static inline int xfs_sb_version_hasquota(xfs_sb_t *sbp)
386{ 395{
387 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 396 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
388 ((sbp)->sb_versionnum & XFS_SB_VERSION_QUOTABIT); 397 (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
389} 398}
390 399
391static inline void xfs_sb_version_addquota(xfs_sb_t *sbp) 400static inline void xfs_sb_version_addquota(xfs_sb_t *sbp)
392{ 401{
393 (sbp)->sb_versionnum = \ 402 if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
394 (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 ? \ 403 sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
395 ((sbp)->sb_versionnum | XFS_SB_VERSION_QUOTABIT) : \ 404 else
396 (xfs_sb_version_tonew((sbp)->sb_versionnum) | \ 405 sbp->sb_versionnum = xfs_sb_version_tonew(sbp->sb_versionnum) |
397 XFS_SB_VERSION_QUOTABIT)); 406 XFS_SB_VERSION_QUOTABIT;
398} 407}
399 408
400static inline int xfs_sb_version_hasalign(xfs_sb_t *sbp) 409static inline int xfs_sb_version_hasalign(xfs_sb_t *sbp)
401{ 410{
402 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 411 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
403 ((sbp)->sb_versionnum & XFS_SB_VERSION_ALIGNBIT); 412 (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT);
404} 413}
405 414
406static inline int xfs_sb_version_hasdalign(xfs_sb_t *sbp) 415static inline int xfs_sb_version_hasdalign(xfs_sb_t *sbp)
407{ 416{
408 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 417 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
409 ((sbp)->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); 418 (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
410} 419}
411 420
412static inline int xfs_sb_version_hasshared(xfs_sb_t *sbp) 421static inline int xfs_sb_version_hasshared(xfs_sb_t *sbp)
413{ 422{
414 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 423 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
415 ((sbp)->sb_versionnum & XFS_SB_VERSION_SHAREDBIT); 424 (sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT);
416} 425}
417 426
418static inline int xfs_sb_version_hasdirv2(xfs_sb_t *sbp) 427static inline int xfs_sb_version_hasdirv2(xfs_sb_t *sbp)
419{ 428{
420 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 429 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
421 ((sbp)->sb_versionnum & XFS_SB_VERSION_DIRV2BIT); 430 (sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
422} 431}
423 432
424static inline int xfs_sb_version_haslogv2(xfs_sb_t *sbp) 433static inline int xfs_sb_version_haslogv2(xfs_sb_t *sbp)
425{ 434{
426 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 435 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
427 ((sbp)->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); 436 (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
428} 437}
429 438
430static inline int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp) 439static inline int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp)
431{ 440{
432 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 441 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
433 ((sbp)->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT); 442 (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
434} 443}
435 444
436static inline int xfs_sb_version_hassector(xfs_sb_t *sbp) 445static inline int xfs_sb_version_hassector(xfs_sb_t *sbp)
437{ 446{
438 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 447 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
439 ((sbp)->sb_versionnum & XFS_SB_VERSION_SECTORBIT); 448 (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
440} 449}
441 450
442static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp) 451static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp)
443{ 452{
444 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 453 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
445 (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT); 454 (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
446} 455}
447 456
448static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp) 457static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp)
449{ 458{
450 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ 459 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
451 ((sbp)->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); 460 (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
452} 461}
453 462
454/* 463/*
@@ -463,22 +472,20 @@ static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp)
463 472
464static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp) 473static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp)
465{ 474{
466 return (xfs_sb_version_hasmorebits(sbp) && \ 475 return xfs_sb_version_hasmorebits(sbp) &&
467 ((sbp)->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)); 476 (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT);
468} 477}
469 478
470static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp) 479static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp)
471{ 480{
472 return (xfs_sb_version_hasmorebits(sbp)) && \ 481 return xfs_sb_version_hasmorebits(sbp) &&
473 ((sbp)->sb_features2 & XFS_SB_VERSION2_ATTR2BIT); 482 (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT);
474} 483}
475 484
476static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp) 485static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp)
477{ 486{
478 ((sbp)->sb_versionnum = \ 487 sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
479 ((sbp)->sb_versionnum | XFS_SB_VERSION_MOREBITSBIT), \ 488 sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
480 ((sbp)->sb_features2 = \
481 ((sbp)->sb_features2 | XFS_SB_VERSION2_ATTR2BIT)));
482} 489}
483 490
484static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp) 491static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp)
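The rewritten predicates in the hunk above all share one shape: confirm the superblock is a v4 superblock, then test a feature bit. Below is a minimal user-space sketch of that shape; the constants and struct are simplified stand-ins, not the real xfs_sb_t layout or bit values.

#include <stdio.h>

/* Illustrative stand-ins only -- not the real XFS values or struct. */
#define SB_VERSION_NUMBITS	0x000f
#define SB_VERSION_4		4
#define SB_VERSION_QUOTABIT	0x0040

struct sb {
	unsigned short	sb_versionnum;
};

static int sb_version_num(const struct sb *sbp)
{
	return sbp->sb_versionnum & SB_VERSION_NUMBITS;
}

/* Same shape as the rewritten xfs_sb_version_hasquota(): v4 plus a bit. */
static int sb_version_hasquota(const struct sb *sbp)
{
	return sb_version_num(sbp) == SB_VERSION_4 &&
	       (sbp->sb_versionnum & SB_VERSION_QUOTABIT);
}

int main(void)
{
	struct sb sb = { .sb_versionnum = SB_VERSION_4 | SB_VERSION_QUOTABIT };

	printf("hasquota: %d\n", sb_version_hasquota(&sb) != 0);
	return 0;
}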
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 4e1c22a23be5..8570b826fedd 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -290,7 +290,7 @@ xfs_trans_dup(
290 ASSERT(tp->t_ticket != NULL); 290 ASSERT(tp->t_ticket != NULL);
291 291
292 ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE); 292 ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE);
293 ntp->t_ticket = tp->t_ticket; 293 ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket);
294 ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used; 294 ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used;
295 tp->t_blk_res = tp->t_blk_res_used; 295 tp->t_blk_res = tp->t_blk_res_used;
296 ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used; 296 ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
@@ -1260,6 +1260,13 @@ xfs_trans_roll(
1260 trans = *tpp; 1260 trans = *tpp;
1261 1261
1262 /* 1262 /*
1263 * transaction commit worked ok so we can drop the extra ticket
1264 * reference that we gained in xfs_trans_dup()
1265 */
1266 xfs_log_ticket_put(trans->t_ticket);
1267
1268
1269 /*
 1263 * Reserve space in the log for the next transaction. 1270 * Reserve space in the log for the next transaction.
1264 * This also pushes items in the "AIL", the list of logged items, 1271 * This also pushes items in the "AIL", the list of logged items,
1265 * out to disk if they are taking up space at the tail of the log 1272 * out to disk if they are taking up space at the tail of the log
@@ -1383,11 +1390,12 @@ xfs_trans_chunk_committed(
1383 xfs_log_item_desc_t *lidp; 1390 xfs_log_item_desc_t *lidp;
1384 xfs_log_item_t *lip; 1391 xfs_log_item_t *lip;
1385 xfs_lsn_t item_lsn; 1392 xfs_lsn_t item_lsn;
1386 struct xfs_mount *mp;
1387 int i; 1393 int i;
1388 1394
1389 lidp = licp->lic_descs; 1395 lidp = licp->lic_descs;
1390 for (i = 0; i < licp->lic_unused; i++, lidp++) { 1396 for (i = 0; i < licp->lic_unused; i++, lidp++) {
1397 struct xfs_ail *ailp;
1398
1391 if (xfs_lic_isfree(licp, i)) { 1399 if (xfs_lic_isfree(licp, i)) {
1392 continue; 1400 continue;
1393 } 1401 }
@@ -1424,19 +1432,19 @@ xfs_trans_chunk_committed(
1424 * This would cause the earlier transaction to fail 1432 * This would cause the earlier transaction to fail
1425 * the test below. 1433 * the test below.
1426 */ 1434 */
1427 mp = lip->li_mountp; 1435 ailp = lip->li_ailp;
1428 spin_lock(&mp->m_ail_lock); 1436 spin_lock(&ailp->xa_lock);
1429 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) { 1437 if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
1430 /* 1438 /*
1431 * This will set the item's lsn to item_lsn 1439 * This will set the item's lsn to item_lsn
1432 * and update the position of the item in 1440 * and update the position of the item in
1433 * the AIL. 1441 * the AIL.
1434 * 1442 *
1435 * xfs_trans_update_ail() drops the AIL lock. 1443 * xfs_trans_ail_update() drops the AIL lock.
1436 */ 1444 */
1437 xfs_trans_update_ail(mp, lip, item_lsn); 1445 xfs_trans_ail_update(ailp, lip, item_lsn);
1438 } else { 1446 } else {
1439 spin_unlock(&mp->m_ail_lock); 1447 spin_unlock(&ailp->xa_lock);
1440 } 1448 }
1441 1449
1442 /* 1450 /*
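The ticket hunks above replace a bare pointer copy with a counted reference: xfs_trans_dup() now takes a reference via xfs_log_ticket_get(), and xfs_trans_roll() drops it once the commit of the old transaction has succeeded. A toy sketch of that get/put discipline follows; the names are hypothetical and a plain int stands in for the kernel's atomic ticket refcount.

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for the log ticket: just a refcount, no atomics. */
struct ticket {
	int	t_ref;
};

static struct ticket *ticket_get(struct ticket *t)
{
	t->t_ref++;		/* analogue of xfs_log_ticket_get() */
	return t;
}

static void ticket_put(struct ticket *t)
{
	if (--t->t_ref == 0)	/* analogue of xfs_log_ticket_put() */
		free(t);
}

int main(void)
{
	struct ticket *tic = malloc(sizeof(*tic));

	if (!tic)
		return 1;
	tic->t_ref = 1;			/* owned by the original transaction */

	/* xfs_trans_dup(): the duplicate takes its own reference... */
	struct ticket *dup = ticket_get(tic);

	/* ...so committing the original can drop one reference without
	 * freeing the ticket out from under the duplicate. */
	ticket_put(tic);
	printf("refs held after old commit: %d\n", dup->t_ref);

	/* xfs_trans_roll()/xfs_dir_ialloc(): drop the extra reference
	 * gained in the dup once the commit is known to have worked. */
	ticket_put(dup);
	return 0;
}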
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 74c80bd2b0ec..d6fe4a88d79f 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -18,6 +18,8 @@
18#ifndef __XFS_TRANS_H__ 18#ifndef __XFS_TRANS_H__
19#define __XFS_TRANS_H__ 19#define __XFS_TRANS_H__
20 20
21struct xfs_log_item;
22
21/* 23/*
22 * This is the structure written in the log at the head of 24 * This is the structure written in the log at the head of
23 * every transaction. It identifies the type and id of the 25 * every transaction. It identifies the type and id of the
@@ -98,76 +100,6 @@ typedef struct xfs_trans_header {
98#define XFS_TRANS_TYPE_MAX 41 100#define XFS_TRANS_TYPE_MAX 41
99/* new transaction types need to be reflected in xfs_logprint(8) */ 101/* new transaction types need to be reflected in xfs_logprint(8) */
100 102
101
102#ifdef __KERNEL__
103struct xfs_buf;
104struct xfs_buftarg;
105struct xfs_efd_log_item;
106struct xfs_efi_log_item;
107struct xfs_inode;
108struct xfs_item_ops;
109struct xfs_log_iovec;
110struct xfs_log_item;
111struct xfs_log_item_desc;
112struct xfs_mount;
113struct xfs_trans;
114struct xfs_dquot_acct;
115
116typedef struct xfs_log_item {
117 struct list_head li_ail; /* AIL pointers */
118 xfs_lsn_t li_lsn; /* last on-disk lsn */
119 struct xfs_log_item_desc *li_desc; /* ptr to current desc*/
120 struct xfs_mount *li_mountp; /* ptr to fs mount */
121 uint li_type; /* item type */
122 uint li_flags; /* misc flags */
123 struct xfs_log_item *li_bio_list; /* buffer item list */
124 void (*li_cb)(struct xfs_buf *,
125 struct xfs_log_item *);
126 /* buffer item iodone */
127 /* callback func */
128 struct xfs_item_ops *li_ops; /* function list */
129} xfs_log_item_t;
130
131#define XFS_LI_IN_AIL 0x1
132#define XFS_LI_ABORTED 0x2
133
134typedef struct xfs_item_ops {
135 uint (*iop_size)(xfs_log_item_t *);
136 void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
137 void (*iop_pin)(xfs_log_item_t *);
138 void (*iop_unpin)(xfs_log_item_t *, int);
139 void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
140 uint (*iop_trylock)(xfs_log_item_t *);
141 void (*iop_unlock)(xfs_log_item_t *);
142 xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
143 void (*iop_push)(xfs_log_item_t *);
144 void (*iop_pushbuf)(xfs_log_item_t *);
145 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
146} xfs_item_ops_t;
147
148#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip)
149#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp)
150#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip)
151#define IOP_UNPIN(ip, flags) (*(ip)->li_ops->iop_unpin)(ip, flags)
152#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
153#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip)
154#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip)
155#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
156#define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip)
157#define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip)
158#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
159
160/*
161 * Return values for the IOP_TRYLOCK() routines.
162 */
163#define XFS_ITEM_SUCCESS 0
164#define XFS_ITEM_PINNED 1
165#define XFS_ITEM_LOCKED 2
166#define XFS_ITEM_FLUSHING 3
167#define XFS_ITEM_PUSHBUF 4
168
169#endif /* __KERNEL__ */
170
171/* 103/*
172 * This structure is used to track log items associated with 104 * This structure is used to track log items associated with
173 * a transaction. It points to the log item and keeps some 105 * a transaction. It points to the log item and keeps some
@@ -176,7 +108,7 @@ typedef struct xfs_item_ops {
176 * once we get to commit processing (see xfs_trans_commit()). 108 * once we get to commit processing (see xfs_trans_commit()).
177 */ 109 */
178typedef struct xfs_log_item_desc { 110typedef struct xfs_log_item_desc {
179 xfs_log_item_t *lid_item; 111 struct xfs_log_item *lid_item;
180 ushort lid_size; 112 ushort lid_size;
181 unsigned char lid_flags; 113 unsigned char lid_flags;
182 unsigned char lid_index; 114 unsigned char lid_index;
@@ -276,94 +208,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
276 (xfs_caddr_t)(((xfs_log_item_chunk_t*)0)->lic_descs)); 208 (xfs_caddr_t)(((xfs_log_item_chunk_t*)0)->lic_descs));
277} 209}
278 210
279#ifdef __KERNEL__
280/*
281 * This structure is used to maintain a list of block ranges that have been
282 * freed in the transaction. The ranges are listed in the perag[] busy list
283 * between when they're freed and the transaction is committed to disk.
284 */
285
286typedef struct xfs_log_busy_slot {
287 xfs_agnumber_t lbc_ag;
288 ushort lbc_idx; /* index in perag.busy[] */
289} xfs_log_busy_slot_t;
290
291#define XFS_LBC_NUM_SLOTS 31
292typedef struct xfs_log_busy_chunk {
293 struct xfs_log_busy_chunk *lbc_next;
294 uint lbc_free; /* free slots bitmask */
295 ushort lbc_unused; /* first unused */
296 xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS];
297} xfs_log_busy_chunk_t;
298
299#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1)
300#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1)
301
302#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK)
303#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot)))
304#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)]))
305#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK)
306#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
307
308/*
309 * This is the type of function which can be given to xfs_trans_callback()
310 * to be called upon the transaction's commit to disk.
311 */
312typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *);
313
314/*
315 * This is the structure maintained for every active transaction.
316 */
317typedef struct xfs_trans {
318 unsigned int t_magic; /* magic number */
319 xfs_log_callback_t t_logcb; /* log callback struct */
320 unsigned int t_type; /* transaction type */
321 unsigned int t_log_res; /* amt of log space resvd */
322 unsigned int t_log_count; /* count for perm log res */
323 unsigned int t_blk_res; /* # of blocks resvd */
324 unsigned int t_blk_res_used; /* # of resvd blocks used */
325 unsigned int t_rtx_res; /* # of rt extents resvd */
326 unsigned int t_rtx_res_used; /* # of resvd rt extents used */
327 xfs_log_ticket_t t_ticket; /* log mgr ticket */
328 xfs_lsn_t t_lsn; /* log seq num of start of
329 * transaction. */
330 xfs_lsn_t t_commit_lsn; /* log seq num of end of
331 * transaction. */
332 struct xfs_mount *t_mountp; /* ptr to fs mount struct */
333 struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */
334 xfs_trans_callback_t t_callback; /* transaction callback */
335 void *t_callarg; /* callback arg */
336 unsigned int t_flags; /* misc flags */
337 int64_t t_icount_delta; /* superblock icount change */
338 int64_t t_ifree_delta; /* superblock ifree change */
339 int64_t t_fdblocks_delta; /* superblock fdblocks chg */
340 int64_t t_res_fdblocks_delta; /* on-disk only chg */
341 int64_t t_frextents_delta;/* superblock freextents chg*/
342 int64_t t_res_frextents_delta; /* on-disk only chg */
343#ifdef DEBUG
344 int64_t t_ag_freeblks_delta; /* debugging counter */
345 int64_t t_ag_flist_delta; /* debugging counter */
346 int64_t t_ag_btree_delta; /* debugging counter */
347#endif
348 int64_t t_dblocks_delta;/* superblock dblocks change */
349 int64_t t_agcount_delta;/* superblock agcount change */
350 int64_t t_imaxpct_delta;/* superblock imaxpct change */
351 int64_t t_rextsize_delta;/* superblock rextsize chg */
352 int64_t t_rbmblocks_delta;/* superblock rbmblocks chg */
353 int64_t t_rblocks_delta;/* superblock rblocks change */
354 int64_t t_rextents_delta;/* superblocks rextents chg */
355 int64_t t_rextslog_delta;/* superblocks rextslog chg */
356 unsigned int t_items_free; /* log item descs free */
357 xfs_log_item_chunk_t t_items; /* first log item desc chunk */
358 xfs_trans_header_t t_header; /* header for in-log trans */
359 unsigned int t_busy_free; /* busy descs free */
360 xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */
361 unsigned long t_pflags; /* saved process flags state */
362} xfs_trans_t;
363
364#endif /* __KERNEL__ */
365
366
367#define XFS_TRANS_MAGIC 0x5452414E /* 'TRAN' */ 211#define XFS_TRANS_MAGIC 0x5452414E /* 'TRAN' */
368/* 212/*
369 * Values for t_flags. 213 * Values for t_flags.
@@ -906,6 +750,157 @@ typedef struct xfs_trans {
906#define XFS_DQUOT_REF 1 750#define XFS_DQUOT_REF 1
907 751
908#ifdef __KERNEL__ 752#ifdef __KERNEL__
753
754struct xfs_buf;
755struct xfs_buftarg;
756struct xfs_efd_log_item;
757struct xfs_efi_log_item;
758struct xfs_inode;
759struct xfs_item_ops;
760struct xfs_log_iovec;
761struct xfs_log_item_desc;
762struct xfs_mount;
763struct xfs_trans;
764struct xfs_dquot_acct;
765
766typedef struct xfs_log_item {
767 struct list_head li_ail; /* AIL pointers */
768 xfs_lsn_t li_lsn; /* last on-disk lsn */
769 struct xfs_log_item_desc *li_desc; /* ptr to current desc*/
770 struct xfs_mount *li_mountp; /* ptr to fs mount */
771 struct xfs_ail *li_ailp; /* ptr to AIL */
772 uint li_type; /* item type */
773 uint li_flags; /* misc flags */
774 struct xfs_log_item *li_bio_list; /* buffer item list */
775 void (*li_cb)(struct xfs_buf *,
776 struct xfs_log_item *);
777 /* buffer item iodone */
778 /* callback func */
779 struct xfs_item_ops *li_ops; /* function list */
780} xfs_log_item_t;
781
782#define XFS_LI_IN_AIL 0x1
783#define XFS_LI_ABORTED 0x2
784
785typedef struct xfs_item_ops {
786 uint (*iop_size)(xfs_log_item_t *);
787 void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
788 void (*iop_pin)(xfs_log_item_t *);
789 void (*iop_unpin)(xfs_log_item_t *, int);
790 void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
791 uint (*iop_trylock)(xfs_log_item_t *);
792 void (*iop_unlock)(xfs_log_item_t *);
793 xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
794 void (*iop_push)(xfs_log_item_t *);
795 void (*iop_pushbuf)(xfs_log_item_t *);
796 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
797} xfs_item_ops_t;
798
799#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip)
800#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp)
801#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip)
802#define IOP_UNPIN(ip, flags) (*(ip)->li_ops->iop_unpin)(ip, flags)
803#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
804#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip)
805#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip)
806#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
807#define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip)
808#define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip)
809#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
810
811/*
812 * Return values for the IOP_TRYLOCK() routines.
813 */
814#define XFS_ITEM_SUCCESS 0
815#define XFS_ITEM_PINNED 1
816#define XFS_ITEM_LOCKED 2
817#define XFS_ITEM_FLUSHING 3
818#define XFS_ITEM_PUSHBUF 4
819
820/*
821 * This structure is used to maintain a list of block ranges that have been
822 * freed in the transaction. The ranges are listed in the perag[] busy list
823 * between when they're freed and the transaction is committed to disk.
824 */
825
826typedef struct xfs_log_busy_slot {
827 xfs_agnumber_t lbc_ag;
828 ushort lbc_idx; /* index in perag.busy[] */
829} xfs_log_busy_slot_t;
830
831#define XFS_LBC_NUM_SLOTS 31
832typedef struct xfs_log_busy_chunk {
833 struct xfs_log_busy_chunk *lbc_next;
834 uint lbc_free; /* free slots bitmask */
835 ushort lbc_unused; /* first unused */
836 xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS];
837} xfs_log_busy_chunk_t;
838
839#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1)
840#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1)
841
842#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK)
843#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot)))
844#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)]))
845#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK)
846#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
847
848/*
849 * This is the type of function which can be given to xfs_trans_callback()
850 * to be called upon the transaction's commit to disk.
851 */
852typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *);
853
854/*
855 * This is the structure maintained for every active transaction.
856 */
857typedef struct xfs_trans {
858 unsigned int t_magic; /* magic number */
859 xfs_log_callback_t t_logcb; /* log callback struct */
860 unsigned int t_type; /* transaction type */
861 unsigned int t_log_res; /* amt of log space resvd */
862 unsigned int t_log_count; /* count for perm log res */
863 unsigned int t_blk_res; /* # of blocks resvd */
864 unsigned int t_blk_res_used; /* # of resvd blocks used */
865 unsigned int t_rtx_res; /* # of rt extents resvd */
866 unsigned int t_rtx_res_used; /* # of resvd rt extents used */
867 xfs_log_ticket_t t_ticket; /* log mgr ticket */
868 xfs_lsn_t t_lsn; /* log seq num of start of
869 * transaction. */
870 xfs_lsn_t t_commit_lsn; /* log seq num of end of
871 * transaction. */
872 struct xfs_mount *t_mountp; /* ptr to fs mount struct */
873 struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */
874 xfs_trans_callback_t t_callback; /* transaction callback */
875 void *t_callarg; /* callback arg */
876 unsigned int t_flags; /* misc flags */
877 int64_t t_icount_delta; /* superblock icount change */
878 int64_t t_ifree_delta; /* superblock ifree change */
879 int64_t t_fdblocks_delta; /* superblock fdblocks chg */
880 int64_t t_res_fdblocks_delta; /* on-disk only chg */
881 int64_t t_frextents_delta;/* superblock freextents chg*/
882 int64_t t_res_frextents_delta; /* on-disk only chg */
883#ifdef DEBUG
884 int64_t t_ag_freeblks_delta; /* debugging counter */
885 int64_t t_ag_flist_delta; /* debugging counter */
886 int64_t t_ag_btree_delta; /* debugging counter */
887#endif
888 int64_t t_dblocks_delta;/* superblock dblocks change */
889 int64_t t_agcount_delta;/* superblock agcount change */
890 int64_t t_imaxpct_delta;/* superblock imaxpct change */
891 int64_t t_rextsize_delta;/* superblock rextsize chg */
892 int64_t t_rbmblocks_delta;/* superblock rbmblocks chg */
893 int64_t t_rblocks_delta;/* superblock rblocks change */
894 int64_t t_rextents_delta;/* superblocks rextents chg */
895 int64_t t_rextslog_delta;/* superblocks rextslog chg */
896 unsigned int t_items_free; /* log item descs free */
897 xfs_log_item_chunk_t t_items; /* first log item desc chunk */
898 xfs_trans_header_t t_header; /* header for in-log trans */
899 unsigned int t_busy_free; /* busy descs free */
900 xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */
901 unsigned long t_pflags; /* saved process flags state */
902} xfs_trans_t;
903
909/* 904/*
910 * XFS transaction mechanism exported interfaces that are 905 * XFS transaction mechanism exported interfaces that are
911 * actually macros. 906 * actually macros.
@@ -928,7 +923,6 @@ typedef struct xfs_trans {
928/* 923/*
929 * XFS transaction mechanism exported interfaces. 924 * XFS transaction mechanism exported interfaces.
930 */ 925 */
931void xfs_trans_init(struct xfs_mount *);
932xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); 926xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint);
933xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint); 927xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint);
934xfs_trans_t *xfs_trans_dup(xfs_trans_t *); 928xfs_trans_t *xfs_trans_dup(xfs_trans_t *);
@@ -975,13 +969,8 @@ int _xfs_trans_commit(xfs_trans_t *,
975 int *); 969 int *);
976#define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL) 970#define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL)
977void xfs_trans_cancel(xfs_trans_t *, int); 971void xfs_trans_cancel(xfs_trans_t *, int);
978int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
979int xfs_trans_ail_init(struct xfs_mount *); 972int xfs_trans_ail_init(struct xfs_mount *);
980void xfs_trans_ail_destroy(struct xfs_mount *); 973void xfs_trans_ail_destroy(struct xfs_mount *);
981void xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t);
982xfs_lsn_t xfs_trans_tail_ail(struct xfs_mount *);
983void xfs_trans_unlocked_item(struct xfs_mount *,
984 xfs_log_item_t *);
985xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, 974xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
986 xfs_agnumber_t ag, 975 xfs_agnumber_t ag,
987 xfs_extlen_t idx); 976 xfs_extlen_t idx);
@@ -990,4 +979,7 @@ extern kmem_zone_t *xfs_trans_zone;
990 979
991#endif /* __KERNEL__ */ 980#endif /* __KERNEL__ */
992 981
982void xfs_trans_init(struct xfs_mount *);
983int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
984
993#endif /* __XFS_TRANS_H__ */ 985#endif /* __XFS_TRANS_H__ */
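Most of the xfs_trans.h churn above is motion rather than new logic: the kernel-only log item and transaction definitions move below the shared on-disk definitions, and the two prototypes userspace tools also need, xfs_trans_init() and xfs_trans_roll(), move outside the #ifdef __KERNEL__ guard. A compilable toy of that layout convention, with hypothetical names (build with -D__KERNEL__ for the kernel-side view):

#include <stdio.h>

/* 1. Shared, on-disk style definitions: visible to kernel and userspace. */
struct toy_header {
	unsigned int	magic;
};

#ifdef __KERNEL__
/* 2. Kernel-only runtime state stays inside the guard. */
struct toy_runtime {
	struct toy_header	hdr;
	void			*private_data;
};
#endif /* __KERNEL__ */

/* 3. Helpers needed on both sides sit outside the guard. */
static void toy_init(struct toy_header *h)
{
	h->magic = 0x5452414e;	/* 'TRAN', as in XFS_TRANS_MAGIC */
}

int main(void)
{
	struct toy_header h;

	toy_init(&h);
	printf("magic: 0x%x\n", h.magic);
	return 0;
}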
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 1f77c00af566..2d47f10f8bed 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -1,5 +1,6 @@
1/* 1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * Copyright (c) 2008 Dave Chinner
3 * All Rights Reserved. 4 * All Rights Reserved.
4 * 5 *
5 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
@@ -28,13 +29,13 @@
28#include "xfs_trans_priv.h" 29#include "xfs_trans_priv.h"
29#include "xfs_error.h" 30#include "xfs_error.h"
30 31
31STATIC void xfs_ail_insert(xfs_ail_t *, xfs_log_item_t *); 32STATIC void xfs_ail_insert(struct xfs_ail *, xfs_log_item_t *);
32STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_t *, xfs_log_item_t *); 33STATIC xfs_log_item_t * xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
33STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_t *); 34STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *);
34STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_t *, xfs_log_item_t *); 35STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *);
35 36
36#ifdef DEBUG 37#ifdef DEBUG
37STATIC void xfs_ail_check(xfs_ail_t *, xfs_log_item_t *); 38STATIC void xfs_ail_check(struct xfs_ail *, xfs_log_item_t *);
38#else 39#else
39#define xfs_ail_check(a,l) 40#define xfs_ail_check(a,l)
40#endif /* DEBUG */ 41#endif /* DEBUG */
@@ -50,20 +51,20 @@ STATIC void xfs_ail_check(xfs_ail_t *, xfs_log_item_t *);
50 * lsn of the last item in the AIL. 51 * lsn of the last item in the AIL.
51 */ 52 */
52xfs_lsn_t 53xfs_lsn_t
53xfs_trans_tail_ail( 54xfs_trans_ail_tail(
54 xfs_mount_t *mp) 55 struct xfs_ail *ailp)
55{ 56{
56 xfs_lsn_t lsn; 57 xfs_lsn_t lsn;
57 xfs_log_item_t *lip; 58 xfs_log_item_t *lip;
58 59
59 spin_lock(&mp->m_ail_lock); 60 spin_lock(&ailp->xa_lock);
60 lip = xfs_ail_min(&mp->m_ail); 61 lip = xfs_ail_min(ailp);
61 if (lip == NULL) { 62 if (lip == NULL) {
62 lsn = (xfs_lsn_t)0; 63 lsn = (xfs_lsn_t)0;
63 } else { 64 } else {
64 lsn = lip->li_lsn; 65 lsn = lip->li_lsn;
65 } 66 }
66 spin_unlock(&mp->m_ail_lock); 67 spin_unlock(&ailp->xa_lock);
67 68
68 return lsn; 69 return lsn;
69} 70}
@@ -85,16 +86,125 @@ xfs_trans_tail_ail(
85 * any of the objects, so the lock is not needed. 86 * any of the objects, so the lock is not needed.
86 */ 87 */
87void 88void
88xfs_trans_push_ail( 89xfs_trans_ail_push(
89 xfs_mount_t *mp, 90 struct xfs_ail *ailp,
90 xfs_lsn_t threshold_lsn) 91 xfs_lsn_t threshold_lsn)
91{ 92{
92 xfs_log_item_t *lip; 93 xfs_log_item_t *lip;
94
95 lip = xfs_ail_min(ailp);
96 if (lip && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
97 if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0)
98 xfsaild_wakeup(ailp, threshold_lsn);
99 }
100}
101
102/*
103 * AIL traversal cursor initialisation.
104 *
105 * The cursor keeps track of where our current traversal is up
 106 * to by tracking the next item in the list for us. However, for
107 * this to be safe, removing an object from the AIL needs to invalidate
 108 * any cursor that points to it. Hence the traversal cursor needs to
109 * be linked to the struct xfs_ail so that deletion can search all the
110 * active cursors for invalidation.
111 *
112 * We don't link the push cursor because it is embedded in the struct
113 * xfs_ail and hence easily findable.
114 */
115STATIC void
116xfs_trans_ail_cursor_init(
117 struct xfs_ail *ailp,
118 struct xfs_ail_cursor *cur)
119{
120 cur->item = NULL;
121 if (cur == &ailp->xa_cursors)
122 return;
123
124 cur->next = ailp->xa_cursors.next;
125 ailp->xa_cursors.next = cur;
126}
127
128/*
129 * Set the cursor to the next item, because when we look
130 * up the cursor the current item may have been freed.
131 */
132STATIC void
133xfs_trans_ail_cursor_set(
134 struct xfs_ail *ailp,
135 struct xfs_ail_cursor *cur,
136 struct xfs_log_item *lip)
137{
138 if (lip)
139 cur->item = xfs_ail_next(ailp, lip);
140}
141
142/*
143 * Get the next item in the traversal and advance the cursor.
 144 * If the cursor was invalidated (indicated by a lip of 1),
145 * restart the traversal.
146 */
147struct xfs_log_item *
148xfs_trans_ail_cursor_next(
149 struct xfs_ail *ailp,
150 struct xfs_ail_cursor *cur)
151{
152 struct xfs_log_item *lip = cur->item;
153
154 if ((__psint_t)lip & 1)
155 lip = xfs_ail_min(ailp);
156 xfs_trans_ail_cursor_set(ailp, cur, lip);
157 return lip;
158}
159
160/*
161 * Now that the traversal is complete, we need to remove the cursor
162 * from the list of traversing cursors. Avoid removing the embedded
 163 * push cursor, but use the fact it is always present to make the
164 * list deletion simple.
165 */
166void
167xfs_trans_ail_cursor_done(
168 struct xfs_ail *ailp,
169 struct xfs_ail_cursor *done)
170{
171 struct xfs_ail_cursor *prev = NULL;
172 struct xfs_ail_cursor *cur;
173
174 done->item = NULL;
175 if (done == &ailp->xa_cursors)
176 return;
177 prev = &ailp->xa_cursors;
178 for (cur = prev->next; cur; prev = cur, cur = prev->next) {
179 if (cur == done) {
180 prev->next = cur->next;
181 break;
182 }
183 }
184 ASSERT(cur);
185}
186
187/*
188 * Invalidate any cursor that is pointing to this item. This is
189 * called when an item is removed from the AIL. Any cursor pointing
190 * to this object is now invalid and the traversal needs to be
191 * terminated so it doesn't reference a freed object. We set the
192 * cursor item to a value of 1 so we can distinguish between an
193 * invalidation and the end of the list when getting the next item
194 * from the cursor.
195 */
196STATIC void
197xfs_trans_ail_cursor_clear(
198 struct xfs_ail *ailp,
199 struct xfs_log_item *lip)
200{
201 struct xfs_ail_cursor *cur;
93 202
94 lip = xfs_ail_min(&mp->m_ail); 203 /* need to search all cursors */
95 if (lip && !XFS_FORCED_SHUTDOWN(mp)) { 204 for (cur = &ailp->xa_cursors; cur; cur = cur->next) {
96 if (XFS_LSN_CMP(threshold_lsn, mp->m_ail.xa_target) > 0) 205 if (cur->item == lip)
97 xfsaild_wakeup(mp, threshold_lsn); 206 cur->item = (struct xfs_log_item *)
207 ((__psint_t)cur->item | 1);
98 } 208 }
99} 209}
100 210
@@ -103,25 +213,27 @@ xfs_trans_push_ail(
103 * Return the current tree generation number for use 213 * Return the current tree generation number for use
104 * in calls to xfs_trans_next_ail(). 214 * in calls to xfs_trans_next_ail().
105 */ 215 */
106STATIC xfs_log_item_t * 216xfs_log_item_t *
107xfs_trans_first_push_ail( 217xfs_trans_ail_cursor_first(
108 xfs_mount_t *mp, 218 struct xfs_ail *ailp,
109 int *gen, 219 struct xfs_ail_cursor *cur,
110 xfs_lsn_t lsn) 220 xfs_lsn_t lsn)
111{ 221{
112 xfs_log_item_t *lip; 222 xfs_log_item_t *lip;
113 223
114 lip = xfs_ail_min(&mp->m_ail); 224 xfs_trans_ail_cursor_init(ailp, cur);
115 *gen = (int)mp->m_ail.xa_gen; 225 lip = xfs_ail_min(ailp);
116 if (lsn == 0) 226 if (lsn == 0)
117 return lip; 227 goto out;
118 228
119 list_for_each_entry(lip, &mp->m_ail.xa_ail, li_ail) { 229 list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
120 if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0) 230 if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0)
121 return lip; 231 goto out;
122 } 232 }
123 233 lip = NULL;
124 return NULL; 234out:
235 xfs_trans_ail_cursor_set(ailp, cur, lip);
236 return lip;
125} 237}
126 238
127/* 239/*
@@ -129,29 +241,29 @@ xfs_trans_first_push_ail(
129 */ 241 */
130long 242long
131xfsaild_push( 243xfsaild_push(
132 xfs_mount_t *mp, 244 struct xfs_ail *ailp,
133 xfs_lsn_t *last_lsn) 245 xfs_lsn_t *last_lsn)
134{ 246{
135 long tout = 1000; /* milliseconds */ 247 long tout = 1000; /* milliseconds */
136 xfs_lsn_t last_pushed_lsn = *last_lsn; 248 xfs_lsn_t last_pushed_lsn = *last_lsn;
137 xfs_lsn_t target = mp->m_ail.xa_target; 249 xfs_lsn_t target = ailp->xa_target;
138 xfs_lsn_t lsn; 250 xfs_lsn_t lsn;
139 xfs_log_item_t *lip; 251 xfs_log_item_t *lip;
140 int gen;
141 int restarts;
142 int flush_log, count, stuck; 252 int flush_log, count, stuck;
253 xfs_mount_t *mp = ailp->xa_mount;
254 struct xfs_ail_cursor *cur = &ailp->xa_cursors;
143 255
144#define XFS_TRANS_PUSH_AIL_RESTARTS 10 256 spin_lock(&ailp->xa_lock);
145 257 xfs_trans_ail_cursor_init(ailp, cur);
146 spin_lock(&mp->m_ail_lock); 258 lip = xfs_trans_ail_cursor_first(ailp, cur, *last_lsn);
147 lip = xfs_trans_first_push_ail(mp, &gen, *last_lsn);
148 if (!lip || XFS_FORCED_SHUTDOWN(mp)) { 259 if (!lip || XFS_FORCED_SHUTDOWN(mp)) {
149 /* 260 /*
150 * AIL is empty or our push has reached the end. 261 * AIL is empty or our push has reached the end.
151 */ 262 */
152 spin_unlock(&mp->m_ail_lock); 263 xfs_trans_ail_cursor_done(ailp, cur);
264 spin_unlock(&ailp->xa_lock);
153 last_pushed_lsn = 0; 265 last_pushed_lsn = 0;
154 goto out; 266 return tout;
155 } 267 }
156 268
157 XFS_STATS_INC(xs_push_ail); 269 XFS_STATS_INC(xs_push_ail);
@@ -169,7 +281,7 @@ xfsaild_push(
169 */ 281 */
170 tout = 10; 282 tout = 10;
171 lsn = lip->li_lsn; 283 lsn = lip->li_lsn;
172 flush_log = stuck = count = restarts = 0; 284 flush_log = stuck = count = 0;
173 while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) { 285 while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) {
174 int lock_result; 286 int lock_result;
175 /* 287 /*
@@ -184,7 +296,7 @@ xfsaild_push(
184 * skip to the next item in the list. 296 * skip to the next item in the list.
185 */ 297 */
186 lock_result = IOP_TRYLOCK(lip); 298 lock_result = IOP_TRYLOCK(lip);
187 spin_unlock(&mp->m_ail_lock); 299 spin_unlock(&ailp->xa_lock);
188 switch (lock_result) { 300 switch (lock_result) {
189 case XFS_ITEM_SUCCESS: 301 case XFS_ITEM_SUCCESS:
190 XFS_STATS_INC(xs_push_ail_success); 302 XFS_STATS_INC(xs_push_ail_success);
@@ -221,7 +333,7 @@ xfsaild_push(
221 break; 333 break;
222 } 334 }
223 335
224 spin_lock(&mp->m_ail_lock); 336 spin_lock(&ailp->xa_lock);
225 /* should we bother continuing? */ 337 /* should we bother continuing? */
226 if (XFS_FORCED_SHUTDOWN(mp)) 338 if (XFS_FORCED_SHUTDOWN(mp))
227 break; 339 break;
@@ -244,14 +356,13 @@ xfsaild_push(
244 if (stuck > 100) 356 if (stuck > 100)
245 break; 357 break;
246 358
247 lip = xfs_trans_next_ail(mp, lip, &gen, &restarts); 359 lip = xfs_trans_ail_cursor_next(ailp, cur);
248 if (lip == NULL) 360 if (lip == NULL)
249 break; 361 break;
250 if (restarts > XFS_TRANS_PUSH_AIL_RESTARTS)
251 break;
252 lsn = lip->li_lsn; 362 lsn = lip->li_lsn;
253 } 363 }
254 spin_unlock(&mp->m_ail_lock); 364 xfs_trans_ail_cursor_done(ailp, cur);
365 spin_unlock(&ailp->xa_lock);
255 366
256 if (flush_log) { 367 if (flush_log) {
257 /* 368 /*
@@ -274,8 +385,7 @@ xfsaild_push(
274 */ 385 */
275 tout += 20; 386 tout += 20;
276 last_pushed_lsn = 0; 387 last_pushed_lsn = 0;
277 } else if ((restarts > XFS_TRANS_PUSH_AIL_RESTARTS) || 388 } else if ((stuck * 100) / count > 90) {
278 ((stuck * 100) / count > 90)) {
279 /* 389 /*
280 * Either there is a lot of contention on the AIL or we 390 * Either there is a lot of contention on the AIL or we
281 * are stuck due to operations in progress. "Stuck" in this 391 * are stuck due to operations in progress. "Stuck" in this
@@ -287,7 +397,6 @@ xfsaild_push(
287 */ 397 */
288 tout += 10; 398 tout += 10;
289 } 399 }
290out:
291 *last_lsn = last_pushed_lsn; 400 *last_lsn = last_pushed_lsn;
292 return tout; 401 return tout;
293} /* xfsaild_push */ 402} /* xfsaild_push */
@@ -303,7 +412,7 @@ out:
303 */ 412 */
304void 413void
305xfs_trans_unlocked_item( 414xfs_trans_unlocked_item(
306 xfs_mount_t *mp, 415 struct xfs_ail *ailp,
307 xfs_log_item_t *lip) 416 xfs_log_item_t *lip)
308{ 417{
309 xfs_log_item_t *min_lip; 418 xfs_log_item_t *min_lip;
@@ -315,7 +424,7 @@ xfs_trans_unlocked_item(
315 * over some potentially valid data. 424 * over some potentially valid data.
316 */ 425 */
317 if (!(lip->li_flags & XFS_LI_IN_AIL) || 426 if (!(lip->li_flags & XFS_LI_IN_AIL) ||
318 XFS_FORCED_SHUTDOWN(mp)) { 427 XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
319 return; 428 return;
320 } 429 }
321 430
@@ -331,10 +440,10 @@ xfs_trans_unlocked_item(
331 * the call to xfs_log_move_tail() doesn't do anything if there's 440 * the call to xfs_log_move_tail() doesn't do anything if there's
332 * not enough free space to wake people up so we're safe calling it. 441 * not enough free space to wake people up so we're safe calling it.
333 */ 442 */
334 min_lip = xfs_ail_min(&mp->m_ail); 443 min_lip = xfs_ail_min(ailp);
335 444
336 if (min_lip == lip) 445 if (min_lip == lip)
337 xfs_log_move_tail(mp, 1); 446 xfs_log_move_tail(ailp->xa_mount, 1);
338} /* xfs_trans_unlocked_item */ 447} /* xfs_trans_unlocked_item */
339 448
340 449
@@ -347,41 +456,37 @@ xfs_trans_unlocked_item(
347 * we move in the AIL is the minimum one, update the tail lsn in the 456 * we move in the AIL is the minimum one, update the tail lsn in the
348 * log manager. 457 * log manager.
349 * 458 *
350 * Increment the AIL's generation count to indicate that the tree
351 * has changed.
352 *
353 * This function must be called with the AIL lock held. The lock 459 * This function must be called with the AIL lock held. The lock
354 * is dropped before returning. 460 * is dropped before returning.
355 */ 461 */
356void 462void
357xfs_trans_update_ail( 463xfs_trans_ail_update(
358 xfs_mount_t *mp, 464 struct xfs_ail *ailp,
359 xfs_log_item_t *lip, 465 xfs_log_item_t *lip,
360 xfs_lsn_t lsn) __releases(mp->m_ail_lock) 466 xfs_lsn_t lsn) __releases(ailp->xa_lock)
361{ 467{
362 xfs_log_item_t *dlip=NULL; 468 xfs_log_item_t *dlip = NULL;
363 xfs_log_item_t *mlip; /* ptr to minimum lip */ 469 xfs_log_item_t *mlip; /* ptr to minimum lip */
364 470
365 mlip = xfs_ail_min(&mp->m_ail); 471 mlip = xfs_ail_min(ailp);
366 472
367 if (lip->li_flags & XFS_LI_IN_AIL) { 473 if (lip->li_flags & XFS_LI_IN_AIL) {
368 dlip = xfs_ail_delete(&mp->m_ail, lip); 474 dlip = xfs_ail_delete(ailp, lip);
369 ASSERT(dlip == lip); 475 ASSERT(dlip == lip);
476 xfs_trans_ail_cursor_clear(ailp, dlip);
370 } else { 477 } else {
371 lip->li_flags |= XFS_LI_IN_AIL; 478 lip->li_flags |= XFS_LI_IN_AIL;
372 } 479 }
373 480
374 lip->li_lsn = lsn; 481 lip->li_lsn = lsn;
375 482 xfs_ail_insert(ailp, lip);
376 xfs_ail_insert(&mp->m_ail, lip);
377 mp->m_ail.xa_gen++;
378 483
379 if (mlip == dlip) { 484 if (mlip == dlip) {
380 mlip = xfs_ail_min(&mp->m_ail); 485 mlip = xfs_ail_min(ailp);
381 spin_unlock(&mp->m_ail_lock); 486 spin_unlock(&ailp->xa_lock);
382 xfs_log_move_tail(mp, mlip->li_lsn); 487 xfs_log_move_tail(ailp->xa_mount, mlip->li_lsn);
383 } else { 488 } else {
384 spin_unlock(&mp->m_ail_lock); 489 spin_unlock(&ailp->xa_lock);
385 } 490 }
386 491
387 492
@@ -403,29 +508,30 @@ xfs_trans_update_ail(
403 * is dropped before returning. 508 * is dropped before returning.
404 */ 509 */
405void 510void
406xfs_trans_delete_ail( 511xfs_trans_ail_delete(
407 xfs_mount_t *mp, 512 struct xfs_ail *ailp,
408 xfs_log_item_t *lip) __releases(mp->m_ail_lock) 513 xfs_log_item_t *lip) __releases(ailp->xa_lock)
409{ 514{
410 xfs_log_item_t *dlip; 515 xfs_log_item_t *dlip;
411 xfs_log_item_t *mlip; 516 xfs_log_item_t *mlip;
412 517
413 if (lip->li_flags & XFS_LI_IN_AIL) { 518 if (lip->li_flags & XFS_LI_IN_AIL) {
414 mlip = xfs_ail_min(&mp->m_ail); 519 mlip = xfs_ail_min(ailp);
415 dlip = xfs_ail_delete(&mp->m_ail, lip); 520 dlip = xfs_ail_delete(ailp, lip);
416 ASSERT(dlip == lip); 521 ASSERT(dlip == lip);
522 xfs_trans_ail_cursor_clear(ailp, dlip);
417 523
418 524
419 lip->li_flags &= ~XFS_LI_IN_AIL; 525 lip->li_flags &= ~XFS_LI_IN_AIL;
420 lip->li_lsn = 0; 526 lip->li_lsn = 0;
421 mp->m_ail.xa_gen++;
422 527
423 if (mlip == dlip) { 528 if (mlip == dlip) {
424 mlip = xfs_ail_min(&mp->m_ail); 529 mlip = xfs_ail_min(ailp);
425 spin_unlock(&mp->m_ail_lock); 530 spin_unlock(&ailp->xa_lock);
426 xfs_log_move_tail(mp, (mlip ? mlip->li_lsn : 0)); 531 xfs_log_move_tail(ailp->xa_mount,
532 (mlip ? mlip->li_lsn : 0));
427 } else { 533 } else {
428 spin_unlock(&mp->m_ail_lock); 534 spin_unlock(&ailp->xa_lock);
429 } 535 }
430 } 536 }
431 else { 537 else {
@@ -433,13 +539,13 @@ xfs_trans_delete_ail(
433 * If the file system is not being shutdown, we are in 539 * If the file system is not being shutdown, we are in
434 * serious trouble if we get to this stage. 540 * serious trouble if we get to this stage.
435 */ 541 */
436 if (XFS_FORCED_SHUTDOWN(mp)) 542 struct xfs_mount *mp = ailp->xa_mount;
437 spin_unlock(&mp->m_ail_lock); 543
438 else { 544 spin_unlock(&ailp->xa_lock);
545 if (!XFS_FORCED_SHUTDOWN(mp)) {
439 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp, 546 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
440 "%s: attempting to delete a log item that is not in the AIL", 547 "%s: attempting to delete a log item that is not in the AIL",
441 __func__); 548 __func__);
442 spin_unlock(&mp->m_ail_lock);
443 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 549 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
444 } 550 }
445 } 551 }
@@ -448,56 +554,6 @@ xfs_trans_delete_ail(
448 554
449 555
450/* 556/*
451 * Return the item in the AIL with the smallest lsn.
452 * Return the current tree generation number for use
453 * in calls to xfs_trans_next_ail().
454 */
455xfs_log_item_t *
456xfs_trans_first_ail(
457 xfs_mount_t *mp,
458 int *gen)
459{
460 xfs_log_item_t *lip;
461
462 lip = xfs_ail_min(&mp->m_ail);
463 *gen = (int)mp->m_ail.xa_gen;
464
465 return lip;
466}
467
468/*
469 * If the generation count of the tree has not changed since the
470 * caller last took something from the AIL, then return the elmt
471 * in the tree which follows the one given. If the count has changed,
472 * then return the minimum elmt of the AIL and bump the restarts counter
473 * if one is given.
474 */
475xfs_log_item_t *
476xfs_trans_next_ail(
477 xfs_mount_t *mp,
478 xfs_log_item_t *lip,
479 int *gen,
480 int *restarts)
481{
482 xfs_log_item_t *nlip;
483
484 ASSERT(mp && lip && gen);
485 if (mp->m_ail.xa_gen == *gen) {
486 nlip = xfs_ail_next(&mp->m_ail, lip);
487 } else {
488 nlip = xfs_ail_min(&mp->m_ail);
489 *gen = (int)mp->m_ail.xa_gen;
490 if (restarts != NULL) {
491 XFS_STATS_INC(xs_push_ail_restarts);
492 (*restarts)++;
493 }
494 }
495
496 return (nlip);
497}
498
499
500/*
501 * The active item list (AIL) is a doubly linked list of log 557 * The active item list (AIL) is a doubly linked list of log
502 * items sorted by ascending lsn. The base of the list is 558 * items sorted by ascending lsn. The base of the list is
503 * a forw/back pointer pair embedded in the xfs mount structure. 559 * a forw/back pointer pair embedded in the xfs mount structure.
@@ -515,15 +571,35 @@ int
515xfs_trans_ail_init( 571xfs_trans_ail_init(
516 xfs_mount_t *mp) 572 xfs_mount_t *mp)
517{ 573{
518 INIT_LIST_HEAD(&mp->m_ail.xa_ail); 574 struct xfs_ail *ailp;
519 return xfsaild_start(mp); 575 int error;
576
577 ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL);
578 if (!ailp)
579 return ENOMEM;
580
581 ailp->xa_mount = mp;
582 INIT_LIST_HEAD(&ailp->xa_ail);
583 spin_lock_init(&ailp->xa_lock);
584 error = xfsaild_start(ailp);
585 if (error)
586 goto out_free_ailp;
587 mp->m_ail = ailp;
588 return 0;
589
590out_free_ailp:
591 kmem_free(ailp);
592 return error;
520} 593}
521 594
522void 595void
523xfs_trans_ail_destroy( 596xfs_trans_ail_destroy(
524 xfs_mount_t *mp) 597 xfs_mount_t *mp)
525{ 598{
526 xfsaild_stop(mp); 599 struct xfs_ail *ailp = mp->m_ail;
600
601 xfsaild_stop(ailp);
602 kmem_free(ailp);
527} 603}
528 604
529/* 605/*
@@ -534,7 +610,7 @@ xfs_trans_ail_destroy(
534 */ 610 */
535STATIC void 611STATIC void
536xfs_ail_insert( 612xfs_ail_insert(
537 xfs_ail_t *ailp, 613 struct xfs_ail *ailp,
538 xfs_log_item_t *lip) 614 xfs_log_item_t *lip)
539/* ARGSUSED */ 615/* ARGSUSED */
540{ 616{
@@ -568,7 +644,7 @@ xfs_ail_insert(
568/*ARGSUSED*/ 644/*ARGSUSED*/
569STATIC xfs_log_item_t * 645STATIC xfs_log_item_t *
570xfs_ail_delete( 646xfs_ail_delete(
571 xfs_ail_t *ailp, 647 struct xfs_ail *ailp,
572 xfs_log_item_t *lip) 648 xfs_log_item_t *lip)
573/* ARGSUSED */ 649/* ARGSUSED */
574{ 650{
@@ -585,7 +661,7 @@ xfs_ail_delete(
585 */ 661 */
586STATIC xfs_log_item_t * 662STATIC xfs_log_item_t *
587xfs_ail_min( 663xfs_ail_min(
588 xfs_ail_t *ailp) 664 struct xfs_ail *ailp)
589/* ARGSUSED */ 665/* ARGSUSED */
590{ 666{
591 if (list_empty(&ailp->xa_ail)) 667 if (list_empty(&ailp->xa_ail))
@@ -601,7 +677,7 @@ xfs_ail_min(
601 */ 677 */
602STATIC xfs_log_item_t * 678STATIC xfs_log_item_t *
603xfs_ail_next( 679xfs_ail_next(
604 xfs_ail_t *ailp, 680 struct xfs_ail *ailp,
605 xfs_log_item_t *lip) 681 xfs_log_item_t *lip)
606/* ARGSUSED */ 682/* ARGSUSED */
607{ 683{
@@ -617,7 +693,7 @@ xfs_ail_next(
617 */ 693 */
618STATIC void 694STATIC void
619xfs_ail_check( 695xfs_ail_check(
620 xfs_ail_t *ailp, 696 struct xfs_ail *ailp,
621 xfs_log_item_t *lip) 697 xfs_log_item_t *lip)
622{ 698{
623 xfs_log_item_t *prev_lip; 699 xfs_log_item_t *prev_lip;
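The cursor machinery above replaces the old generation-count restarts: when an item is removed, xfs_trans_ail_cursor_clear() sets the low bit of any cursor pointing at it, and xfs_trans_ail_cursor_next() treats a tagged pointer as "restart from the AIL minimum". A minimal user-space sketch of that pointer-tagging trick, with hypothetical names and uintptr_t in place of the kernel's __psint_t:

#include <stdint.h>
#include <stdio.h>

struct item {
	int	id;
};

/* Invalidate a cursor by setting the low bit of its item pointer --
 * the same trick xfs_trans_ail_cursor_clear() plays, relying on log
 * items never being allocated at odd addresses. */
static struct item *tag_invalid(struct item *p)
{
	return (struct item *)((uintptr_t)p | 1);
}

static int is_invalid(const struct item *p)
{
	return ((uintptr_t)p & 1) != 0;
}

int main(void)
{
	struct item a = { 42 };
	struct item *cursor = &a;

	/* The item under the cursor gets removed from the list... */
	cursor = tag_invalid(cursor);

	/* ...so the next traversal step sees the tag and restarts. */
	if (is_invalid(cursor))
		printf("cursor invalidated; restart from the AIL minimum\n");
	else
		printf("cursor still valid: %d\n", cursor->id);
	return 0;
}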
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 4e855b5ced66..8ee2f8c8b0a6 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -527,9 +527,8 @@ xfs_trans_brelse(xfs_trans_t *tp,
527 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 527 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
528 if (lip->li_type == XFS_LI_BUF) { 528 if (lip->li_type == XFS_LI_BUF) {
529 bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*); 529 bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*);
530 xfs_trans_unlocked_item( 530 xfs_trans_unlocked_item(bip->bli_item.li_ailp,
531 bip->bli_item.li_mountp, 531 lip);
532 lip);
533 } 532 }
534 } 533 }
535 xfs_buf_relse(bp); 534 xfs_buf_relse(bp);
@@ -626,7 +625,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
626 * tell the AIL that the buffer is being unlocked. 625 * tell the AIL that the buffer is being unlocked.
627 */ 626 */
628 if (bip != NULL) { 627 if (bip != NULL) {
629 xfs_trans_unlocked_item(bip->bli_item.li_mountp, 628 xfs_trans_unlocked_item(bip->bli_item.li_ailp,
630 (xfs_log_item_t*)bip); 629 (xfs_log_item_t*)bip);
631 } 630 }
632 631
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 2a1c0f071f91..23d276af2e0c 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -85,7 +85,6 @@ xfs_trans_iget(
85{ 85{
86 int error; 86 int error;
87 xfs_inode_t *ip; 87 xfs_inode_t *ip;
88 xfs_inode_log_item_t *iip;
89 88
90 /* 89 /*
91 * If the transaction pointer is NULL, just call the normal 90 * If the transaction pointer is NULL, just call the normal
@@ -138,34 +137,7 @@ xfs_trans_iget(
138 } 137 }
139 ASSERT(ip != NULL); 138 ASSERT(ip != NULL);
140 139
141 /* 140 xfs_trans_ijoin(tp, ip, lock_flags);
142 * Get a log_item_desc to point at the new item.
143 */
144 if (ip->i_itemp == NULL)
145 xfs_inode_item_init(ip, mp);
146 iip = ip->i_itemp;
147 (void) xfs_trans_add_item(tp, (xfs_log_item_t *)(iip));
148
149 xfs_trans_inode_broot_debug(ip);
150
151 /*
152 * If the IO lock has been acquired, mark that in
153 * the inode log item so we'll know to unlock it
154 * when the transaction commits.
155 */
156 ASSERT(iip->ili_flags == 0);
157 if (lock_flags & XFS_IOLOCK_EXCL) {
158 iip->ili_flags |= XFS_ILI_IOLOCKED_EXCL;
159 } else if (lock_flags & XFS_IOLOCK_SHARED) {
160 iip->ili_flags |= XFS_ILI_IOLOCKED_SHARED;
161 }
162
163 /*
164 * Initialize i_transp so we can find it with xfs_inode_incore()
165 * above.
166 */
167 ip->i_transp = tp;
168
169 *ipp = ip; 141 *ipp = ip;
170 return 0; 142 return 0;
171} 143}
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index 3c666e8317f8..e110bf57d7f4 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -22,6 +22,14 @@
22#include "xfs_inum.h" 22#include "xfs_inum.h"
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_trans_priv.h" 24#include "xfs_trans_priv.h"
 25/* XXX: from here down needed until struct xfs_trans has its own ailp */
26#include "xfs_bit.h"
27#include "xfs_buf_item.h"
28#include "xfs_sb.h"
29#include "xfs_ag.h"
30#include "xfs_dir2.h"
31#include "xfs_dmapi.h"
32#include "xfs_mount.h"
25 33
26STATIC int xfs_trans_unlock_chunk(xfs_log_item_chunk_t *, 34STATIC int xfs_trans_unlock_chunk(xfs_log_item_chunk_t *,
27 int, int, xfs_lsn_t); 35 int, int, xfs_lsn_t);
@@ -79,6 +87,7 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
79 lidp->lid_size = 0; 87 lidp->lid_size = 0;
80 lip->li_desc = lidp; 88 lip->li_desc = lidp;
81 lip->li_mountp = tp->t_mountp; 89 lip->li_mountp = tp->t_mountp;
90 lip->li_ailp = tp->t_mountp->m_ail;
82 return lidp; 91 return lidp;
83 } 92 }
84 93
@@ -120,6 +129,7 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
120 lidp->lid_size = 0; 129 lidp->lid_size = 0;
121 lip->li_desc = lidp; 130 lip->li_desc = lidp;
122 lip->li_mountp = tp->t_mountp; 131 lip->li_mountp = tp->t_mountp;
132 lip->li_ailp = tp->t_mountp->m_ail;
123 return lidp; 133 return lidp;
124} 134}
125 135
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 3c748c456ed4..73e2ad397432 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -44,25 +44,93 @@ xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
44 xfs_extlen_t idx); 44 xfs_extlen_t idx);
45 45
46/* 46/*
47 * From xfs_trans_ail.c 47 * AIL traversal cursor.
48 *
49 * Rather than using a generation number for detecting changes in the ail, use
50 * a cursor that is protected by the ail lock. The aild cursor exists in the
51 * struct xfs_ail, but other traversals can declare it on the stack and link it
52 * to the ail list.
53 *
 54 * When an object is deleted from or moved in the AIL, the cursor list is
55 * searched to see if the object is a designated cursor item. If it is, it is
56 * deleted from the cursor so that the next time the cursor is used traversal
57 * will return to the start.
58 *
59 * This means a traversal colliding with a removal will cause a restart of the
60 * list scan, rather than any insertion or deletion anywhere in the list. The
61 * low bit of the item pointer is set if the cursor has been invalidated so
62 * that we can tell the difference between invalidation and reaching the end
63 * of the list to trigger traversal restarts.
48 */ 64 */
49void xfs_trans_update_ail(struct xfs_mount *mp, 65struct xfs_ail_cursor {
50 struct xfs_log_item *lip, xfs_lsn_t lsn) 66 struct xfs_ail_cursor *next;
51 __releases(mp->m_ail_lock); 67 struct xfs_log_item *item;
52void xfs_trans_delete_ail(struct xfs_mount *mp, 68};
53 struct xfs_log_item *lip)
54 __releases(mp->m_ail_lock);
55struct xfs_log_item *xfs_trans_first_ail(struct xfs_mount *, int *);
56struct xfs_log_item *xfs_trans_next_ail(struct xfs_mount *,
57 struct xfs_log_item *, int *, int *);
58 69
70/*
71 * Private AIL structures.
72 *
73 * Eventually we need to drive the locking in here as well.
74 */
75struct xfs_ail {
76 struct xfs_mount *xa_mount;
77 struct list_head xa_ail;
78 uint xa_gen;
79 struct task_struct *xa_task;
80 xfs_lsn_t xa_target;
81 struct xfs_ail_cursor xa_cursors;
82 spinlock_t xa_lock;
83};
59 84
60/* 85/*
61 * AIL push thread support 86 * From xfs_trans_ail.c
62 */ 87 */
63long xfsaild_push(struct xfs_mount *, xfs_lsn_t *); 88void xfs_trans_ail_update(struct xfs_ail *ailp,
64void xfsaild_wakeup(struct xfs_mount *, xfs_lsn_t); 89 struct xfs_log_item *lip, xfs_lsn_t lsn)
65int xfsaild_start(struct xfs_mount *); 90 __releases(ailp->xa_lock);
66void xfsaild_stop(struct xfs_mount *); 91void xfs_trans_ail_delete(struct xfs_ail *ailp,
92 struct xfs_log_item *lip)
93 __releases(ailp->xa_lock);
94void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t);
95void xfs_trans_unlocked_item(struct xfs_ail *,
96 xfs_log_item_t *);
97
98xfs_lsn_t xfs_trans_ail_tail(struct xfs_ail *ailp);
99
100struct xfs_log_item *xfs_trans_ail_cursor_first(struct xfs_ail *ailp,
101 struct xfs_ail_cursor *cur,
102 xfs_lsn_t lsn);
103struct xfs_log_item *xfs_trans_ail_cursor_next(struct xfs_ail *ailp,
104 struct xfs_ail_cursor *cur);
105void xfs_trans_ail_cursor_done(struct xfs_ail *ailp,
106 struct xfs_ail_cursor *cur);
107
108long xfsaild_push(struct xfs_ail *, xfs_lsn_t *);
109void xfsaild_wakeup(struct xfs_ail *, xfs_lsn_t);
110int xfsaild_start(struct xfs_ail *);
111void xfsaild_stop(struct xfs_ail *);
67 112
113#if BITS_PER_LONG != 64
114static inline void
115xfs_trans_ail_copy_lsn(
116 struct xfs_ail *ailp,
117 xfs_lsn_t *dst,
118 xfs_lsn_t *src)
119{
120 ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */
121 spin_lock(&ailp->xa_lock);
122 *dst = *src;
123 spin_unlock(&ailp->xa_lock);
124}
125#else
126static inline void
127xfs_trans_ail_copy_lsn(
128 struct xfs_ail *ailp,
129 xfs_lsn_t *dst,
130 xfs_lsn_t *src)
131{
132 ASSERT(sizeof(xfs_lsn_t) == 8);
133 *dst = *src;
134}
135#endif
68#endif /* __XFS_TRANS_PRIV_H__ */ 136#endif /* __XFS_TRANS_PRIV_H__ */
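A minimal sketch of how a traversal might use the cursor API declared above, assuming the locking discipline described in the comment block (the cursor list and the AIL itself are protected by xa_lock); the walker function here is hypothetical, not part of the patch:

	STATIC void
	example_ail_walk(
		struct xfs_ail		*ailp,
		xfs_lsn_t		lsn)
	{
		struct xfs_ail_cursor	cur;
		struct xfs_log_item	*lip;

		spin_lock(&ailp->xa_lock);
		/* position the cursor at the first item at or beyond lsn */
		lip = xfs_trans_ail_cursor_first(ailp, &cur, lsn);
		while (lip != NULL) {
			/*
			 * Examine lip here; if it is removed concurrently
			 * the cursor is invalidated and _next restarts the
			 * scan, per the comment above. Do not sleep with
			 * xa_lock held.
			 */
			lip = xfs_trans_ail_cursor_next(ailp, &cur);
		}
		xfs_trans_ail_cursor_done(ailp, &cur);
		spin_unlock(&ailp->xa_lock);
	}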
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 35d4d414bcc2..fcc2285d03ed 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -172,6 +172,12 @@ xfs_dir_ialloc(
172 *ipp = NULL; 172 *ipp = NULL;
173 return code; 173 return code;
174 } 174 }
175
176 /*
177 * transaction commit worked ok so we can drop the extra ticket
178 * reference that we gained in xfs_trans_dup()
179 */
180 xfs_log_ticket_put(tp->t_ticket);
175 code = xfs_trans_reserve(tp, 0, log_res, 0, 181 code = xfs_trans_reserve(tp, 0, log_res, 0,
176 XFS_TRANS_PERM_LOG_RES, log_count); 182 XFS_TRANS_PERM_LOG_RES, log_count);
177 /* 183 /*
@@ -268,9 +274,9 @@ xfs_bump_ino_vers2(
268 xfs_mount_t *mp; 274 xfs_mount_t *mp;
269 275
270 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 276 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
271 ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1); 277 ASSERT(ip->i_d.di_version == 1);
272 278
273 ip->i_d.di_version = XFS_DINODE_VERSION_2; 279 ip->i_d.di_version = 2;
274 ip->i_d.di_onlink = 0; 280 ip->i_d.di_onlink = 0;
275 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 281 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
276 mp = tp->t_mountp; 282 mp = tp->t_mountp;
@@ -302,7 +308,7 @@ xfs_bumplink(
302 ASSERT(ip->i_d.di_nlink > 0); 308 ASSERT(ip->i_d.di_nlink > 0);
303 ip->i_d.di_nlink++; 309 ip->i_d.di_nlink++;
304 inc_nlink(VFS_I(ip)); 310 inc_nlink(VFS_I(ip));
305 if ((ip->i_d.di_version == XFS_DINODE_VERSION_1) && 311 if ((ip->i_d.di_version == 1) &&
306 (ip->i_d.di_nlink > XFS_MAXLINK_1)) { 312 (ip->i_d.di_nlink > XFS_MAXLINK_1)) {
307 /* 313 /*
308 * The inode has increased its number of links beyond 314 * The inode has increased its number of links beyond
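The xfs_log_ticket_put() hunk above is one instance of the transaction "roll" pattern: xfs_trans_dup() takes an extra reference on the permanent log ticket, so once the commit of the old transaction succeeds that reference must be dropped before re-reserving. A hedged sketch of the pattern, with a hypothetical helper name and error unwinding elided:

	STATIC int
	example_trans_roll(
		struct xfs_trans	**tpp,
		uint			log_res,
		uint			log_count)
	{
		struct xfs_trans	*ntp;
		int			error;

		/* the duplicate carries the permanent reservation forward */
		ntp = xfs_trans_dup(*tpp);
		error = xfs_trans_commit(*tpp, 0);
		*tpp = ntp;
		if (error)
			return error;

		/*
		 * commit worked, so drop the extra ticket reference that
		 * we gained in xfs_trans_dup()
		 */
		xfs_log_ticket_put(ntp->t_ticket);

		return xfs_trans_reserve(ntp, 0, log_res, 0,
					 XFS_TRANS_PERM_LOG_RES, log_count);
	}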
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
deleted file mode 100644
index 439dd3939dda..000000000000
--- a/fs/xfs/xfs_vfsops.c
+++ /dev/null
@@ -1,757 +0,0 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h"
30#include "xfs_da_btree.h"
31#include "xfs_bmap_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_alloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h"
37#include "xfs_inode.h"
38#include "xfs_inode_item.h"
39#include "xfs_btree.h"
40#include "xfs_alloc.h"
41#include "xfs_ialloc.h"
42#include "xfs_quota.h"
43#include "xfs_error.h"
44#include "xfs_bmap.h"
45#include "xfs_rw.h"
46#include "xfs_buf_item.h"
47#include "xfs_log_priv.h"
48#include "xfs_dir2_trace.h"
49#include "xfs_extfree_item.h"
50#include "xfs_acl.h"
51#include "xfs_attr.h"
52#include "xfs_clnt.h"
53#include "xfs_mru_cache.h"
54#include "xfs_filestream.h"
55#include "xfs_fsops.h"
56#include "xfs_vnodeops.h"
57#include "xfs_vfsops.h"
58#include "xfs_utils.h"
59
60
61STATIC void
62xfs_quiesce_fs(
63 xfs_mount_t *mp)
64{
65 int count = 0, pincount;
66
67 xfs_flush_buftarg(mp->m_ddev_targp, 0);
68 xfs_finish_reclaim_all(mp, 0);
69
70 /* This loop must run at least twice.
71 * The first instance of the loop will flush
72 * most metadata, but that will generate more
73 * metadata (typically directory updates),
74 * which then must be flushed and logged before
75 * we can write the unmount record.
76 */
77 do {
78 xfs_syncsub(mp, SYNC_INODE_QUIESCE, NULL);
79 pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
80 if (!pincount) {
81 delay(50);
82 count++;
83 }
84 } while (count < 2);
85}
86
87/*
88 * Second stage of a quiesce. The data is already synced; now we have to take
89 * care of the metadata. New transactions are already blocked, so we need to
90 * wait for any remaining transactions to drain out before proceeding.
91 */
92void
93xfs_attr_quiesce(
94 xfs_mount_t *mp)
95{
96 int error = 0;
97
98 /* wait for all modifications to complete */
99 while (atomic_read(&mp->m_active_trans) > 0)
100 delay(100);
101
102 /* flush inodes and push all remaining buffers out to disk */
103 xfs_quiesce_fs(mp);
104
105 ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0);
106
107 /* Push the superblock and write an unmount record */
108 error = xfs_log_sbcount(mp, 1);
109 if (error)
110 xfs_fs_cmn_err(CE_WARN, mp,
111 "xfs_attr_quiesce: failed to log sb changes. "
112 "Frozen image may not be consistent.");
113 xfs_log_unmount_write(mp);
114 xfs_unmountfs_writesb(mp);
115}
116
117/*
118 * xfs_unmount_flush implements a set of flush operations on special
119 * inodes, which are kept as a separate set of operations so that
120 * they can be called as part of the relocation process.
121 */
122int
123xfs_unmount_flush(
124 xfs_mount_t *mp, /* Mount structure we are getting
125 rid of. */
126 int relocation) /* Called from vfs relocation. */
127{
128 xfs_inode_t *rip = mp->m_rootip;
129 xfs_inode_t *rbmip;
130 xfs_inode_t *rsumip = NULL;
131 int error;
132
133 xfs_ilock(rip, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
134 xfs_iflock(rip);
135
136 /*
137 * Flush out the real time inodes.
138 */
139 if ((rbmip = mp->m_rbmip) != NULL) {
140 xfs_ilock(rbmip, XFS_ILOCK_EXCL);
141 xfs_iflock(rbmip);
142 error = xfs_iflush(rbmip, XFS_IFLUSH_SYNC);
143 xfs_iunlock(rbmip, XFS_ILOCK_EXCL);
144
145 if (error == EFSCORRUPTED)
146 goto fscorrupt_out;
147
148 ASSERT(vn_count(VFS_I(rbmip)) == 1);
149
150 rsumip = mp->m_rsumip;
151 xfs_ilock(rsumip, XFS_ILOCK_EXCL);
152 xfs_iflock(rsumip);
153 error = xfs_iflush(rsumip, XFS_IFLUSH_SYNC);
154 xfs_iunlock(rsumip, XFS_ILOCK_EXCL);
155
156 if (error == EFSCORRUPTED)
157 goto fscorrupt_out;
158
159 ASSERT(vn_count(VFS_I(rsumip)) == 1);
160 }
161
162 /*
163 * Synchronously flush root inode to disk
164 */
165 error = xfs_iflush(rip, XFS_IFLUSH_SYNC);
166 if (error == EFSCORRUPTED)
167 goto fscorrupt_out2;
168
169 if (vn_count(VFS_I(rip)) != 1 && !relocation) {
170 xfs_iunlock(rip, XFS_ILOCK_EXCL);
171 return XFS_ERROR(EBUSY);
172 }
173
174 /*
175 * Release the dquots that the root inode, rbmino and rsumino might be
176 * holding, then flush and purge the quota inodes.
177 */
178 error = XFS_QM_UNMOUNT(mp);
179 if (error == EFSCORRUPTED)
180 goto fscorrupt_out2;
181
182 if (rbmip) {
183 IRELE(rbmip);
184 IRELE(rsumip);
185 }
186
187 xfs_iunlock(rip, XFS_ILOCK_EXCL);
188 return 0;
189
190fscorrupt_out:
191 xfs_ifunlock(rip);
192
193fscorrupt_out2:
194 xfs_iunlock(rip, XFS_ILOCK_EXCL);
195
196 return XFS_ERROR(EFSCORRUPTED);
197}
198
199/*
200 * xfs_sync flushes any pending I/O to the filesystem.
201 *
202 * This routine is called by vfs_sync() to make sure that things make it
203 * out to disk eventually, on sync() system calls to flush out everything,
204 * and when the file system is unmounted. For the vfs_sync() case, all
205 * we really need to do is sync out the log to make all of our meta-data
206 * updates permanent (except for timestamps). For calls from pflushd(),
207 * dirty pages are kept moving by calling pdflush() on the inodes
208 * containing them. We also flush the inodes that we can lock without
209 * sleeping and the superblock if we can lock it without sleeping from
210 * vfs_sync() so that items at the tail of the log are always moving out.
211 *
212 * Flags:
213 * SYNC_BDFLUSH - We're being called from vfs_sync() so we don't want
214 * to sleep if we can help it. All we really need
215 * to do is ensure that the log is synced at least
216 * periodically. We also push the inodes and
217 * superblock if we can lock them without sleeping
218 * and they are not pinned.
219 * SYNC_ATTR - We need to flush the inodes. If SYNC_BDFLUSH is not
220 * set, then we really want to lock each inode and flush
221 * it.
222 * SYNC_WAIT - All the flushes that take place in this call should
223 * be synchronous.
224 * SYNC_DELWRI - This tells us to push dirty pages associated with
225 * inodes. SYNC_WAIT and SYNC_BDFLUSH are used to
226 * determine if they should be flushed sync, async, or
227 * delwri.
228 * SYNC_CLOSE - This flag is passed when the system is being
229 * unmounted. We should sync and invalidate everything.
230 * SYNC_FSDATA - This indicates that the caller would like to make
231 * sure the superblock is safe on disk. We can ensure
232 * this by simply making sure the log gets flushed
233 * if SYNC_BDFLUSH is set, and by actually writing it
234 * out otherwise.
235 * SYNC_IOWAIT - The caller wants us to wait for all data I/O to complete
236 * before we return (including direct I/O). Forms the drain
237 * side of the write barrier needed to safely quiesce the
238 * filesystem.
239 *
240 */
241int
242xfs_sync(
243 xfs_mount_t *mp,
244 int flags)
245{
246 int error;
247
248 /*
249 * Get the Quota Manager to flush the dquots.
250 *
251 * If XFS quota support is not enabled or this filesystem
252 * instance does not use quotas XFS_QM_DQSYNC will always
253 * return zero.
254 */
255 error = XFS_QM_DQSYNC(mp, flags);
256 if (error) {
257 /*
258 * If we got an IO error, we will be shutting down.
259 * So, there's nothing more for us to do here.
260 */
261 ASSERT(error != EIO || XFS_FORCED_SHUTDOWN(mp));
262 if (XFS_FORCED_SHUTDOWN(mp))
263 return XFS_ERROR(error);
264 }
265
266 if (flags & SYNC_IOWAIT)
267 xfs_filestream_flush(mp);
268
269 return xfs_syncsub(mp, flags, NULL);
270}
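Hypothetical illustrations of the flag combinations documented above; these callers are sketches, not code from this tree. A periodic sync avoids sleeping, while a freeze-style sync waits for everything, including direct I/O:

	/* periodic sync: keep the log and superblock moving, don't sleep */
	error = xfs_sync(mp, SYNC_FSDATA | SYNC_BDFLUSH | SYNC_ATTR);

	/* freeze/quiesce: flush dirty data and wait for all I/O to drain */
	error = xfs_sync(mp, SYNC_DELWRI | SYNC_WAIT | SYNC_IOWAIT);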
271
272/*
273 * xfs sync routine for internal use
274 *
275 * This routine supports all of the flags defined for the generic vfs_sync
276 * interface as explained above under xfs_sync.
277 *
278 */
279int
280xfs_sync_inodes(
281 xfs_mount_t *mp,
282 int flags,
283 int *bypassed)
284{
285 xfs_inode_t *ip = NULL;
286 struct inode *vp = NULL;
287 int error;
288 int last_error;
289 uint64_t fflag;
290 uint lock_flags;
291 uint base_lock_flags;
292 boolean_t mount_locked;
293 boolean_t vnode_refed;
294 int preempt;
295 xfs_iptr_t *ipointer;
296#ifdef DEBUG
297 boolean_t ipointer_in = B_FALSE;
298
299#define IPOINTER_SET ipointer_in = B_TRUE
300#define IPOINTER_CLR ipointer_in = B_FALSE
301#else
302#define IPOINTER_SET
303#define IPOINTER_CLR
304#endif
305
306
307/* Insert a marker record into the inode list after inode ip. The list
308 * must be locked when this is called. After the call the list will no
309 * longer be locked.
310 */
311#define IPOINTER_INSERT(ip, mp) { \
312 ASSERT(ipointer_in == B_FALSE); \
313 ipointer->ip_mnext = ip->i_mnext; \
314 ipointer->ip_mprev = ip; \
315 ip->i_mnext = (xfs_inode_t *)ipointer; \
316 ipointer->ip_mnext->i_mprev = (xfs_inode_t *)ipointer; \
317 preempt = 0; \
318 XFS_MOUNT_IUNLOCK(mp); \
319 mount_locked = B_FALSE; \
320 IPOINTER_SET; \
321 }
322
323/* Remove the marker from the inode list. If the marker was the only item
324 * in the list then there are no remaining inodes and we should zero out
325 * the whole list. If we are the current head of the list then move the head
326 * past us.
327 */
328#define IPOINTER_REMOVE(ip, mp) { \
329 ASSERT(ipointer_in == B_TRUE); \
330 if (ipointer->ip_mnext != (xfs_inode_t *)ipointer) { \
331 ip = ipointer->ip_mnext; \
332 ip->i_mprev = ipointer->ip_mprev; \
333 ipointer->ip_mprev->i_mnext = ip; \
334 if (mp->m_inodes == (xfs_inode_t *)ipointer) { \
335 mp->m_inodes = ip; \
336 } \
337 } else { \
338 ASSERT(mp->m_inodes == (xfs_inode_t *)ipointer); \
339 mp->m_inodes = NULL; \
340 ip = NULL; \
341 } \
342 IPOINTER_CLR; \
343 }
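The IPOINTER macros above implement a classic marker technique: park a dummy record in the list before dropping the lock, then resume the walk from the marker. A generic sketch of the same idea using a standard list_head, for illustration only (the XFS code uses its own i_mnext/i_mprev links, and a reference on the entry being worked on is assumed):

	#include <linux/list.h>
	#include <linux/spinlock.h>

	static void
	example_marker_walk(struct list_head *head, spinlock_t *lock)
	{
		struct list_head	marker;
		struct list_head	*pos;

		spin_lock(lock);
		pos = head->next;
		while (pos != head) {
			/* park a marker after the current entry */
			list_add(&marker, pos);
			spin_unlock(lock);

			/* ... sleeping work on the entry at 'pos' ... */

			spin_lock(lock);
			pos = marker.next;	/* resume after the marker */
			list_del(&marker);
		}
		spin_unlock(lock);
	}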
344
345#define XFS_PREEMPT_MASK 0x7f
346
347 ASSERT(!(flags & SYNC_BDFLUSH));
348
349 if (bypassed)
350 *bypassed = 0;
351 if (mp->m_flags & XFS_MOUNT_RDONLY)
352 return 0;
353 error = 0;
354 last_error = 0;
355 preempt = 0;
356
357 /* Allocate a reference marker */
358 ipointer = (xfs_iptr_t *)kmem_zalloc(sizeof(xfs_iptr_t), KM_SLEEP);
359
360 fflag = XFS_B_ASYNC; /* default is don't wait */
361 if (flags & SYNC_DELWRI)
362 fflag = XFS_B_DELWRI;
363 if (flags & SYNC_WAIT)
364 fflag = 0; /* synchronous overrides all */
365
366 base_lock_flags = XFS_ILOCK_SHARED;
367 if (flags & (SYNC_DELWRI | SYNC_CLOSE)) {
368 /*
369 * We need the I/O lock if we're going to call any of
370 * the flush/inval routines.
371 */
372 base_lock_flags |= XFS_IOLOCK_SHARED;
373 }
374
375 XFS_MOUNT_ILOCK(mp);
376
377 ip = mp->m_inodes;
378
379 mount_locked = B_TRUE;
380 vnode_refed = B_FALSE;
381
382 IPOINTER_CLR;
383
384 do {
385 ASSERT(ipointer_in == B_FALSE);
386 ASSERT(vnode_refed == B_FALSE);
387
388 lock_flags = base_lock_flags;
389
390 /*
391 * There were no inodes in the list, just break out
392 * of the loop.
393 */
394 if (ip == NULL) {
395 break;
396 }
397
398 /*
399 * We found another sync thread marker - skip it
400 */
401 if (ip->i_mount == NULL) {
402 ip = ip->i_mnext;
403 continue;
404 }
405
406 vp = VFS_I(ip);
407
408 /*
409 * If the vnode is gone then this inode is being torn down;
410 * call reclaim if it is flushed, else let the regular flush
411 * code deal with it later in the loop.
412 */
413
414 if (vp == NULL) {
415 /* Skip ones already in reclaim */
416 if (ip->i_flags & XFS_IRECLAIM) {
417 ip = ip->i_mnext;
418 continue;
419 }
420 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) {
421 ip = ip->i_mnext;
422 } else if ((xfs_ipincount(ip) == 0) &&
423 xfs_iflock_nowait(ip)) {
424 IPOINTER_INSERT(ip, mp);
425
426 xfs_finish_reclaim(ip, 1,
427 XFS_IFLUSH_DELWRI_ELSE_ASYNC);
428
429 XFS_MOUNT_ILOCK(mp);
430 mount_locked = B_TRUE;
431 IPOINTER_REMOVE(ip, mp);
432 } else {
433 xfs_iunlock(ip, XFS_ILOCK_EXCL);
434 ip = ip->i_mnext;
435 }
436 continue;
437 }
438
439 if (VN_BAD(vp)) {
440 ip = ip->i_mnext;
441 continue;
442 }
443
444 if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) {
445 XFS_MOUNT_IUNLOCK(mp);
446 kmem_free(ipointer);
447 return 0;
448 }
449
450 /*
451 * Try to lock without sleeping. We're out of order with
452 * the inode list lock here, so if we fail we need to drop
453 * the mount lock and try again. If we're called from
454 * bdflush() here, then don't bother.
455 *
456 * The inode lock here actually coordinates with the
457 * almost spurious inode lock in xfs_ireclaim() to prevent
458 * the vnode we handle here without a reference from
459 * being freed while we reference it. If we lock the inode
460 * while it's on the mount list here, then the spurious inode
461 * lock in xfs_ireclaim() after the inode is pulled from
462 * the mount list will sleep until we release it here.
463 * This keeps the vnode from being freed while we reference
464 * it.
465 */
466 if (xfs_ilock_nowait(ip, lock_flags) == 0) {
467 if (vp == NULL) {
468 ip = ip->i_mnext;
469 continue;
470 }
471
472 vp = vn_grab(vp);
473 if (vp == NULL) {
474 ip = ip->i_mnext;
475 continue;
476 }
477
478 IPOINTER_INSERT(ip, mp);
479 xfs_ilock(ip, lock_flags);
480
481 ASSERT(vp == VFS_I(ip));
482 ASSERT(ip->i_mount == mp);
483
484 vnode_refed = B_TRUE;
485 }
486
487 /* From here on in the loop we may have a marker record
488 * in the inode list.
489 */
490
491 /*
492 * If we have to flush data or wait for I/O completion
493 * we need to drop the ilock that we currently hold.
494 * If we need to drop the lock, insert a marker if we
495 * have not already done so.
496 */
497 if ((flags & (SYNC_CLOSE|SYNC_IOWAIT)) ||
498 ((flags & SYNC_DELWRI) && VN_DIRTY(vp))) {
499 if (mount_locked) {
500 IPOINTER_INSERT(ip, mp);
501 }
502 xfs_iunlock(ip, XFS_ILOCK_SHARED);
503
504 if (flags & SYNC_CLOSE) {
505 /* Shutdown case. Flush and invalidate. */
506 if (XFS_FORCED_SHUTDOWN(mp))
507 xfs_tosspages(ip, 0, -1,
508 FI_REMAPF);
509 else
510 error = xfs_flushinval_pages(ip,
511 0, -1, FI_REMAPF);
512 } else if ((flags & SYNC_DELWRI) && VN_DIRTY(vp)) {
513 error = xfs_flush_pages(ip, 0,
514 -1, fflag, FI_NONE);
515 }
516
517 /*
518 * When freezing, we need to wait for all I/O (including direct
519 * I/O) to complete to ensure no further data modification can take
520 * place after this point.
521 */
522 if (flags & SYNC_IOWAIT)
523 vn_iowait(ip);
524
525 xfs_ilock(ip, XFS_ILOCK_SHARED);
526 }
527
528 if ((flags & SYNC_ATTR) &&
529 (ip->i_update_core ||
530 (ip->i_itemp && ip->i_itemp->ili_format.ilf_fields))) {
531 if (mount_locked)
532 IPOINTER_INSERT(ip, mp);
533
534 if (flags & SYNC_WAIT) {
535 xfs_iflock(ip);
536 error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
537
538 /*
539 * If we can't acquire the flush lock, then the inode
540 * is already being flushed so don't bother waiting.
541 *
542 * If we can lock it then do a delwri flush so we can
543 * combine multiple inode flushes in each disk write.
544 */
545 } else if (xfs_iflock_nowait(ip)) {
546 error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
547 } else if (bypassed) {
548 (*bypassed)++;
549 }
550 }
551
552 if (lock_flags != 0) {
553 xfs_iunlock(ip, lock_flags);
554 }
555
556 if (vnode_refed) {
557 /*
558 * If we had to take a reference on the vnode
559 * above, then wait until after we've unlocked
560 * the inode to release the reference. This is
561 * because we can be already holding the inode
562 * lock when IRELE() calls xfs_inactive().
563 *
564 * Make sure to drop the mount lock before calling
565 * IRELE() so that we don't trip over ourselves if
566 * we have to go for the mount lock again in the
567 * inactive code.
568 */
569 if (mount_locked) {
570 IPOINTER_INSERT(ip, mp);
571 }
572
573 IRELE(ip);
574
575 vnode_refed = B_FALSE;
576 }
577
578 if (error) {
579 last_error = error;
580 }
581
582 /*
583 * bail out if the filesystem is corrupted.
584 */
585 if (error == EFSCORRUPTED) {
586 if (!mount_locked) {
587 XFS_MOUNT_ILOCK(mp);
588 IPOINTER_REMOVE(ip, mp);
589 }
590 XFS_MOUNT_IUNLOCK(mp);
591 ASSERT(ipointer_in == B_FALSE);
592 kmem_free(ipointer);
593 return XFS_ERROR(error);
594 }
595
596 /* Let other threads have a chance at the mount lock
597 * if we have looped many times without dropping the
598 * lock.
599 */
600 if ((++preempt & XFS_PREEMPT_MASK) == 0) {
601 if (mount_locked) {
602 IPOINTER_INSERT(ip, mp);
603 }
604 }
605
606 if (mount_locked == B_FALSE) {
607 XFS_MOUNT_ILOCK(mp);
608 mount_locked = B_TRUE;
609 IPOINTER_REMOVE(ip, mp);
610 continue;
611 }
612
613 ASSERT(ipointer_in == B_FALSE);
614 ip = ip->i_mnext;
615
616 } while (ip != mp->m_inodes);
617
618 XFS_MOUNT_IUNLOCK(mp);
619
620 ASSERT(ipointer_in == B_FALSE);
621
622 kmem_free(ipointer);
623 return XFS_ERROR(last_error);
624}
625
626/*
627 * xfs sync routine for internal use
628 *
629 * This routine supports all of the flags defined for the generic vfs_sync
630 * interface as explained above under xfs_sync.
631 *
632 */
633int
634xfs_syncsub(
635 xfs_mount_t *mp,
636 int flags,
637 int *bypassed)
638{
639 int error = 0;
640 int last_error = 0;
641 uint log_flags = XFS_LOG_FORCE;
642 xfs_buf_t *bp;
643 xfs_buf_log_item_t *bip;
644
645 /*
646 * Sync out the log. This ensures that the log is periodically
647 * flushed even if there is not enough activity to fill it up.
648 */
649 if (flags & SYNC_WAIT)
650 log_flags |= XFS_LOG_SYNC;
651
652 xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
653
654 if (flags & (SYNC_ATTR|SYNC_DELWRI)) {
655 if (flags & SYNC_BDFLUSH)
656 xfs_finish_reclaim_all(mp, 1);
657 else
658 error = xfs_sync_inodes(mp, flags, bypassed);
659 }
660
661 /*
662 * Flushing out dirty data above probably generated more
663 * log activity, so if this isn't vfs_sync() then flush
664 * the log again.
665 */
666 if (flags & SYNC_DELWRI) {
667 xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
668 }
669
670 if (flags & SYNC_FSDATA) {
671 /*
672 * If this is vfs_sync() then only sync the superblock
673 * if we can lock it without sleeping and it is not pinned.
674 */
675 if (flags & SYNC_BDFLUSH) {
676 bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
677 if (bp != NULL) {
678 bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*);
679 if ((bip != NULL) &&
680 xfs_buf_item_dirty(bip)) {
681 if (!(XFS_BUF_ISPINNED(bp))) {
682 XFS_BUF_ASYNC(bp);
683 error = xfs_bwrite(mp, bp);
684 } else {
685 xfs_buf_relse(bp);
686 }
687 } else {
688 xfs_buf_relse(bp);
689 }
690 }
691 } else {
692 bp = xfs_getsb(mp, 0);
693 /*
694 * If the buffer is pinned then push on the log so
695 * we won't get stuck waiting in the write for
696 * someone, maybe ourselves, to flush the log.
697 * Even though we just pushed the log above, we
698 * did not have the superblock buffer locked at
699 * that point so it can become pinned in between
700 * there and here.
701 */
702 if (XFS_BUF_ISPINNED(bp))
703 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
704 if (flags & SYNC_WAIT)
705 XFS_BUF_UNASYNC(bp);
706 else
707 XFS_BUF_ASYNC(bp);
708 error = xfs_bwrite(mp, bp);
709 }
710 if (error) {
711 last_error = error;
712 }
713 }
714
715 /*
716 * Now check to see if the log needs a "dummy" transaction.
717 */
718 if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) {
719 xfs_trans_t *tp;
720 xfs_inode_t *ip;
721
722 /*
723 * Put a dummy transaction in the log to tell
724 * recovery that all others are OK.
725 */
726 tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
727 if ((error = xfs_trans_reserve(tp, 0,
728 XFS_ICHANGE_LOG_RES(mp),
729 0, 0, 0))) {
730 xfs_trans_cancel(tp, 0);
731 return error;
732 }
733
734 ip = mp->m_rootip;
735 xfs_ilock(ip, XFS_ILOCK_EXCL);
736
737 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
738 xfs_trans_ihold(tp, ip);
739 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
740 error = xfs_trans_commit(tp, 0);
741 xfs_iunlock(ip, XFS_ILOCK_EXCL);
742 xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
743 }
744
745 /*
746 * When shutting down, we need to ensure that the AIL is pushed
747 * to disk or the filesystem can appear corrupt from the PROM.
748 */
749 if ((flags & (SYNC_CLOSE|SYNC_WAIT)) == (SYNC_CLOSE|SYNC_WAIT)) {
750 XFS_bflush(mp->m_ddev_targp);
751 if (mp->m_rtdev_targp) {
752 XFS_bflush(mp->m_rtdev_targp);
753 }
754 }
755
756 return XFS_ERROR(last_error);
757}
diff --git a/fs/xfs/xfs_vfsops.h b/fs/xfs/xfs_vfsops.h
deleted file mode 100644
index a74b05087da4..000000000000
--- a/fs/xfs/xfs_vfsops.h
+++ /dev/null
@@ -1,16 +0,0 @@
1#ifndef _XFS_VFSOPS_H
2#define _XFS_VFSOPS_H 1
3
4struct cred;
5struct xfs_fid;
6struct inode;
7struct kstatfs;
8struct xfs_mount;
9struct xfs_mount_args;
10
11int xfs_sync(struct xfs_mount *mp, int flags);
12void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
13 int lnnum);
14void xfs_attr_quiesce(struct xfs_mount *mp);
15
16#endif /* _XFS_VFSOPS_H */
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 8b6812f66a15..f07bf8768c3a 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -54,33 +54,10 @@
54#include "xfs_vnodeops.h" 54#include "xfs_vnodeops.h"
55 55
56int 56int
57xfs_open(
58 xfs_inode_t *ip)
59{
60 int mode;
61
62 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
63 return XFS_ERROR(EIO);
64
65 /*
66 * If it's a directory with any blocks, read-ahead block 0
67 * as we're almost certain to have the next operation be a read there.
68 */
69 if (S_ISDIR(ip->i_d.di_mode) && ip->i_d.di_nextents > 0) {
70 mode = xfs_ilock_map_shared(ip);
71 if (ip->i_d.di_nextents > 0)
72 (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
73 xfs_iunlock(ip, mode);
74 }
75 return 0;
76}
77
78int
79xfs_setattr( 57xfs_setattr(
80 struct xfs_inode *ip, 58 struct xfs_inode *ip,
81 struct iattr *iattr, 59 struct iattr *iattr,
82 int flags, 60 int flags)
83 cred_t *credp)
84{ 61{
85 xfs_mount_t *mp = ip->i_mount; 62 xfs_mount_t *mp = ip->i_mount;
86 struct inode *inode = VFS_I(ip); 63 struct inode *inode = VFS_I(ip);
@@ -93,7 +70,6 @@ xfs_setattr(
93 gid_t gid=0, igid=0; 70 gid_t gid=0, igid=0;
94 int timeflags = 0; 71 int timeflags = 0;
95 struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; 72 struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2;
96 int file_owner;
97 int need_iolock = 1; 73 int need_iolock = 1;
98 74
99 xfs_itrace_entry(ip); 75 xfs_itrace_entry(ip);
@@ -104,6 +80,10 @@ xfs_setattr(
104 if (XFS_FORCED_SHUTDOWN(mp)) 80 if (XFS_FORCED_SHUTDOWN(mp))
105 return XFS_ERROR(EIO); 81 return XFS_ERROR(EIO);
106 82
83 code = -inode_change_ok(inode, iattr);
84 if (code)
85 return code;
86
107 olddquot1 = olddquot2 = NULL; 87 olddquot1 = olddquot2 = NULL;
108 udqp = gdqp = NULL; 88 udqp = gdqp = NULL;
109 89
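The new inode_change_ok() call above moves the generic ownership and permission checks, previously open-coded further down in xfs_setattr(), into a single VFS helper that runs before any XFS-specific work. A condensed, hypothetical view of the resulting division of labor; inode_change_ok() returns a negative errno, which is negated into XFS's positive error convention:

	STATIC int
	example_setattr(
		struct xfs_inode	*ip,
		struct iattr		*iattr)
	{
		int	code;

		/* generic VFS validation: owner, mode and uid/gid rules */
		code = -inode_change_ok(VFS_I(ip), iattr);
		if (code)
			return code;	/* positive XFS-style errno */

		/* ... XFS-specific work: quotas, transactions, size ... */
		return 0;
	}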
@@ -181,62 +161,8 @@ xfs_setattr(
181 161
182 xfs_ilock(ip, lock_flags); 162 xfs_ilock(ip, lock_flags);
183 163
184 /* boolean: are we the file owner? */
185 file_owner = (current_fsuid() == ip->i_d.di_uid);
186
187 /*
188 * Change various properties of a file.
189 * Only the owner or users with CAP_FOWNER
190 * capability may do these things.
191 */
192 if (mask & (ATTR_MODE|ATTR_UID|ATTR_GID)) {
193 /*
194 * CAP_FOWNER overrides the following restrictions:
195 *
196 * The user ID of the calling process must be equal
197 * to the file owner ID, except in cases where the
198 * CAP_FSETID capability is applicable.
199 */
200 if (!file_owner && !capable(CAP_FOWNER)) {
201 code = XFS_ERROR(EPERM);
202 goto error_return;
203 }
204
205 /*
206 * CAP_FSETID overrides the following restrictions:
207 *
208 * The effective user ID of the calling process shall match
209 * the file owner when setting the set-user-ID and
210 * set-group-ID bits on that file.
211 *
212 * The effective group ID or one of the supplementary group
213 * IDs of the calling process shall match the group owner of
214 * the file when setting the set-group-ID bit on that file
215 */
216 if (mask & ATTR_MODE) {
217 mode_t m = 0;
218
219 if ((iattr->ia_mode & S_ISUID) && !file_owner)
220 m |= S_ISUID;
221 if ((iattr->ia_mode & S_ISGID) &&
222 !in_group_p((gid_t)ip->i_d.di_gid))
223 m |= S_ISGID;
224#if 0
225 /* Linux allows this, Irix doesn't. */
226 if ((iattr->ia_mode & S_ISVTX) && !S_ISDIR(ip->i_d.di_mode))
227 m |= S_ISVTX;
228#endif
229 if (m && !capable(CAP_FSETID))
230 iattr->ia_mode &= ~m;
231 }
232 }
233
234 /* 164 /*
235 * Change file ownership. Must be the owner or privileged. 165 * Change file ownership. Must be the owner or privileged.
236 * If the system was configured with the "restricted_chown"
237 * option, the owner is not permitted to give away the file,
238 * and can change the group id only to a group of which he
239 * or she is a member.
240 */ 166 */
241 if (mask & (ATTR_UID|ATTR_GID)) { 167 if (mask & (ATTR_UID|ATTR_GID)) {
242 /* 168 /*
@@ -251,23 +177,6 @@ xfs_setattr(
251 uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; 177 uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
252 178
253 /* 179 /*
254 * CAP_CHOWN overrides the following restrictions:
255 *
256 * If _POSIX_CHOWN_RESTRICTED is defined, this capability
257 * shall override the restriction that a process cannot
258 * change the user ID of a file it owns and the restriction
259 * that the group ID supplied to the chown() function
260 * shall be equal to either the group ID or one of the
261 * supplementary group IDs of the calling process.
262 */
263 if (restricted_chown &&
264 (iuid != uid || (igid != gid &&
265 !in_group_p((gid_t)gid))) &&
266 !capable(CAP_CHOWN)) {
267 code = XFS_ERROR(EPERM);
268 goto error_return;
269 }
270 /*
271 * Do a quota reservation only if uid/gid is actually 180 * Do a quota reservation only if uid/gid is actually
272 * going to change. 181 * going to change.
273 */ 182 */
@@ -304,36 +213,22 @@ xfs_setattr(
304 code = XFS_ERROR(EINVAL); 213 code = XFS_ERROR(EINVAL);
305 goto error_return; 214 goto error_return;
306 } 215 }
216
307 /* 217 /*
308 * Make sure that the dquots are attached to the inode. 218 * Make sure that the dquots are attached to the inode.
309 */ 219 */
310 if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED))) 220 code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED);
221 if (code)
311 goto error_return; 222 goto error_return;
312 }
313
314 /*
315 * Change file access or modified times.
316 */
317 if (mask & (ATTR_ATIME|ATTR_MTIME)) {
318 if (!file_owner) {
319 if ((mask & (ATTR_MTIME_SET|ATTR_ATIME_SET)) &&
320 !capable(CAP_FOWNER)) {
321 code = XFS_ERROR(EPERM);
322 goto error_return;
323 }
324 }
325 }
326 223
327 /* 224 /*
328 * Now we can make the changes. Before we join the inode 225 * Now we can make the changes. Before we join the inode
329 * to the transaction, if ATTR_SIZE is set then take care of 226 * to the transaction, if ATTR_SIZE is set then take care of
330 * the part of the truncation that must be done without the 227 * the part of the truncation that must be done without the
331 * inode lock. This needs to be done before joining the inode 228 * inode lock. This needs to be done before joining the inode
332 * to the transaction, because the inode cannot be unlocked 229 * to the transaction, because the inode cannot be unlocked
333 * once it is a part of the transaction. 230 * once it is a part of the transaction.
334 */ 231 */
335 if (mask & ATTR_SIZE) {
336 code = 0;
337 if (iattr->ia_size > ip->i_size) { 232 if (iattr->ia_size > ip->i_size) {
338 /* 233 /*
339 * Do the first part of growing a file: zero any data 234 * Do the first part of growing a file: zero any data
@@ -366,7 +261,7 @@ xfs_setattr(
366 } 261 }
367 262
368 /* wait for all I/O to complete */ 263 /* wait for all I/O to complete */
369 vn_iowait(ip); 264 xfs_ioend_wait(ip);
370 265
371 if (!code) 266 if (!code)
372 code = xfs_itruncate_data(ip, iattr->ia_size); 267 code = xfs_itruncate_data(ip, iattr->ia_size);
@@ -388,17 +283,10 @@ xfs_setattr(
388 } 283 }
389 commit_flags = XFS_TRANS_RELEASE_LOG_RES; 284 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
390 xfs_ilock(ip, XFS_ILOCK_EXCL); 285 xfs_ilock(ip, XFS_ILOCK_EXCL);
391 }
392 286
393 if (tp) {
394 xfs_trans_ijoin(tp, ip, lock_flags); 287 xfs_trans_ijoin(tp, ip, lock_flags);
395 xfs_trans_ihold(tp, ip); 288 xfs_trans_ihold(tp, ip);
396 }
397 289
398 /*
399 * Truncate file. Must have write permission and not be a directory.
400 */
401 if (mask & ATTR_SIZE) {
402 /* 290 /*
403 * Only change the c/mtime if we are changing the size 291 * Only change the c/mtime if we are changing the size
404 * or we are explicitly asked to change it. This handles 292 * or we are explicitly asked to change it. This handles
@@ -438,28 +326,13 @@ xfs_setattr(
438 */ 326 */
439 xfs_iflags_set(ip, XFS_ITRUNCATED); 327 xfs_iflags_set(ip, XFS_ITRUNCATED);
440 } 328 }
441 } 329 } else if (tp) {
442 330 xfs_trans_ijoin(tp, ip, lock_flags);
443 /* 331 xfs_trans_ihold(tp, ip);
444 * Change file access modes.
445 */
446 if (mask & ATTR_MODE) {
447 ip->i_d.di_mode &= S_IFMT;
448 ip->i_d.di_mode |= iattr->ia_mode & ~S_IFMT;
449
450 inode->i_mode &= S_IFMT;
451 inode->i_mode |= iattr->ia_mode & ~S_IFMT;
452
453 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
454 timeflags |= XFS_ICHGTIME_CHG;
455 } 332 }
456 333
457 /* 334 /*
458 * Change file ownership. Must be the owner or privileged. 335 * Change file ownership. Must be the owner or privileged.
459 * If the system was configured with the "restricted_chown"
460 * option, the owner is not permitted to give away the file,
461 * and can change the group id only to a group of which he
462 * or she is a member.
463 */ 336 */
464 if (mask & (ATTR_UID|ATTR_GID)) { 337 if (mask & (ATTR_UID|ATTR_GID)) {
465 /* 338 /*
@@ -503,6 +376,24 @@ xfs_setattr(
503 timeflags |= XFS_ICHGTIME_CHG; 376 timeflags |= XFS_ICHGTIME_CHG;
504 } 377 }
505 378
379 /*
380 * Change file access modes.
381 */
382 if (mask & ATTR_MODE) {
383 umode_t mode = iattr->ia_mode;
384
385 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
386 mode &= ~S_ISGID;
387
388 ip->i_d.di_mode &= S_IFMT;
389 ip->i_d.di_mode |= mode & ~S_IFMT;
390
391 inode->i_mode &= S_IFMT;
392 inode->i_mode |= mode & ~S_IFMT;
393
394 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
395 timeflags |= XFS_ICHGTIME_CHG;
396 }
506 397
507 /* 398 /*
508 * Change file access or modified times. 399 * Change file access or modified times.
@@ -713,7 +604,7 @@ xfs_fsync(
713 return XFS_ERROR(EIO); 604 return XFS_ERROR(EIO);
714 605
715 /* capture size updates in I/O completion before writing the inode. */ 606 /* capture size updates in I/O completion before writing the inode. */
716 error = filemap_fdatawait(VFS_I(ip)->i_mapping); 607 error = xfs_wait_on_pages(ip, 0, -1);
717 if (error) 608 if (error)
718 return XFS_ERROR(error); 609 return XFS_ERROR(error);
719 610
@@ -1029,6 +920,12 @@ xfs_inactive_symlink_rmt(
1029 goto error0; 920 goto error0;
1030 } 921 }
1031 /* 922 /*
923 * transaction commit worked ok so we can drop the extra ticket
924 * reference that we gained in xfs_trans_dup()
925 */
926 xfs_log_ticket_put(tp->t_ticket);
927
928 /*
1032 * Remove the memory for extent descriptions (just bookkeeping). 929 * Remove the memory for extent descriptions (just bookkeeping).
1033 */ 930 */
1034 if (ip->i_df.if_bytes) 931 if (ip->i_df.if_bytes)
@@ -1625,8 +1522,6 @@ xfs_create(
1625 xfs_trans_set_sync(tp); 1522 xfs_trans_set_sync(tp);
1626 } 1523 }
1627 1524
1628 dp->i_gen++;
1629
1630 /* 1525 /*
1631 * Attach the dquot(s) to the inodes and modify them incore. 1526 * Attach the dquot(s) to the inodes and modify them incore.
1632 * These ids of the inode couldn't have changed since the new 1527 * These ids of the inode couldn't have changed since the new
@@ -1993,13 +1888,6 @@ xfs_remove(
1993 } 1888 }
1994 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 1889 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1995 1890
1996 /*
1997 * Bump the in memory generation count on the parent
1998 * directory so that other can know that it has changed.
1999 */
2000 dp->i_gen++;
2001 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2002
2003 if (is_dir) { 1891 if (is_dir) {
2004 /* 1892 /*
2005 * Drop the link from ip's "..". 1893 * Drop the link from ip's "..".
@@ -2009,7 +1897,7 @@ xfs_remove(
2009 goto out_bmap_cancel; 1897 goto out_bmap_cancel;
2010 1898
2011 /* 1899 /*
2012 * Drop the link from dp to ip. 1900 * Drop the "." link from ip to self.
2013 */ 1901 */
2014 error = xfs_droplink(tp, ip); 1902 error = xfs_droplink(tp, ip);
2015 if (error) 1903 if (error)
@@ -2017,14 +1905,14 @@ xfs_remove(
2017 } else { 1905 } else {
2018 /* 1906 /*
2019 * When removing a non-directory we need to log the parent 1907 * When removing a non-directory we need to log the parent
2020 * inode here for the i_gen update. For a directory this is 1908 * inode here. For a directory this is done implicitly
2021 * done implicitly by the xfs_droplink call for the ".." entry. 1909 * by the xfs_droplink call for the ".." entry.
2022 */ 1910 */
2023 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 1911 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2024 } 1912 }
2025 1913
2026 /* 1914 /*
2027 * Drop the "." link from ip to self. 1915 * Drop the link from dp to ip.
2028 */ 1916 */
2029 error = xfs_droplink(tp, ip); 1917 error = xfs_droplink(tp, ip);
2030 if (error) 1918 if (error)
@@ -2178,7 +2066,6 @@ xfs_link(
2178 if (error) 2066 if (error)
2179 goto abort_return; 2067 goto abort_return;
2180 xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2068 xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2181 tdp->i_gen++;
2182 xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); 2069 xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
2183 2070
2184 error = xfs_bumplink(tp, sip); 2071 error = xfs_bumplink(tp, sip);
@@ -2355,18 +2242,10 @@ xfs_mkdir(
2355 } 2242 }
2356 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2243 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2357 2244
2358 /*
2359 * Bump the in memory version number of the parent directory
2360 * so that other processes accessing it will recognize that
2361 * the directory has changed.
2362 */
2363 dp->i_gen++;
2364
2365 error = xfs_dir_init(tp, cdp, dp); 2245 error = xfs_dir_init(tp, cdp, dp);
2366 if (error) 2246 if (error)
2367 goto error2; 2247 goto error2;
2368 2248
2369 cdp->i_gen = 1;
2370 error = xfs_bumplink(tp, dp); 2249 error = xfs_bumplink(tp, dp);
2371 if (error) 2250 if (error)
2372 goto error2; 2251 goto error2;
@@ -2653,13 +2532,6 @@ xfs_symlink(
2653 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 2532 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2654 2533
2655 /* 2534 /*
2656 * Bump the in memory version number of the parent directory
2657 * so that other processes accessing it will recognize that
2658 * the directory has changed.
2659 */
2660 dp->i_gen++;
2661
2662 /*
2663 * If this is a synchronous mount, make sure that the 2535 * If this is a synchronous mount, make sure that the
2664 * symlink transaction goes to disk before returning to 2536 * symlink transaction goes to disk before returning to
2665 * the user. 2537 * the user.
@@ -2809,7 +2681,7 @@ xfs_reclaim(
2809 return 0; 2681 return 0;
2810 } 2682 }
2811 2683
2812 vn_iowait(ip); 2684 xfs_ioend_wait(ip);
2813 2685
2814 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); 2686 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
2815 2687
@@ -2833,122 +2705,10 @@ xfs_reclaim(
2833 if (!ip->i_update_core && (ip->i_itemp == NULL)) { 2705 if (!ip->i_update_core && (ip->i_itemp == NULL)) {
2834 xfs_ilock(ip, XFS_ILOCK_EXCL); 2706 xfs_ilock(ip, XFS_ILOCK_EXCL);
2835 xfs_iflock(ip); 2707 xfs_iflock(ip);
2836 return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC); 2708 xfs_iflags_set(ip, XFS_IRECLAIMABLE);
2837 } else { 2709 return xfs_reclaim_inode(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
2838 xfs_mount_t *mp = ip->i_mount;
2839
2840 /* Protect sync and unpin from us */
2841 XFS_MOUNT_ILOCK(mp);
2842 spin_lock(&ip->i_flags_lock);
2843 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
2844 VFS_I(ip)->i_private = NULL;
2845 ip->i_vnode = NULL;
2846 spin_unlock(&ip->i_flags_lock);
2847 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
2848 XFS_MOUNT_IUNLOCK(mp);
2849 }
2850 return 0;
2851}
2852
2853int
2854xfs_finish_reclaim(
2855 xfs_inode_t *ip,
2856 int locked,
2857 int sync_mode)
2858{
2859 xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
2860 struct inode *vp = VFS_I(ip);
2861
2862 if (vp && VN_BAD(vp))
2863 goto reclaim;
2864
2865 /* The hash lock here protects a thread in xfs_iget_core from
2866 * racing with us on linking the inode back with a vnode.
2867 * Once we have the XFS_IRECLAIM flag set it will not touch
2868 * us.
2869 */
2870 write_lock(&pag->pag_ici_lock);
2871 spin_lock(&ip->i_flags_lock);
2872 if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
2873 (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) && vp == NULL)) {
2874 spin_unlock(&ip->i_flags_lock);
2875 write_unlock(&pag->pag_ici_lock);
2876 if (locked) {
2877 xfs_ifunlock(ip);
2878 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2879 }
2880 return 1;
2881 }
2882 __xfs_iflags_set(ip, XFS_IRECLAIM);
2883 spin_unlock(&ip->i_flags_lock);
2884 write_unlock(&pag->pag_ici_lock);
2885 xfs_put_perag(ip->i_mount, pag);
2886
2887 /*
2888 * If the inode is still dirty, then flush it out. If the inode
2889 * is not in the AIL, then it will be OK to flush it delwri as
2890 * long as xfs_iflush() does not keep any references to the inode.
2891 * We leave that decision up to xfs_iflush() since it has the
2892 * knowledge of whether it's OK to simply do a delwri flush of
2893 * the inode or whether we need to wait until the inode is
2894 * pulled from the AIL.
2895 * We get the flush lock regardless, though, just to make sure
2896 * we don't free it while it is being flushed.
2897 */
2898 if (!locked) {
2899 xfs_ilock(ip, XFS_ILOCK_EXCL);
2900 xfs_iflock(ip);
2901 } 2710 }
2902 2711 xfs_inode_set_reclaim_tag(ip);
2903 /*
2904 * In the case of a forced shutdown we rely on xfs_iflush() to
2905 * wait for the inode to be unpinned before returning an error.
2906 */
2907 if (xfs_iflush(ip, sync_mode) == 0) {
2908 /* synchronize with xfs_iflush_done */
2909 xfs_iflock(ip);
2910 xfs_ifunlock(ip);
2911 }
2912
2913 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2914
2915 reclaim:
2916 xfs_ireclaim(ip);
2917 return 0;
2918}
2919
2920int
2921xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
2922{
2923 int purged;
2924 xfs_inode_t *ip, *n;
2925 int done = 0;
2926
2927 while (!done) {
2928 purged = 0;
2929 XFS_MOUNT_ILOCK(mp);
2930 list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) {
2931 if (noblock) {
2932 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
2933 continue;
2934 if (xfs_ipincount(ip) ||
2935 !xfs_iflock_nowait(ip)) {
2936 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2937 continue;
2938 }
2939 }
2940 XFS_MOUNT_IUNLOCK(mp);
2941 if (xfs_finish_reclaim(ip, noblock,
2942 XFS_IFLUSH_DELWRI_ELSE_ASYNC))
2943 delay(1);
2944 purged = 1;
2945 break;
2946 }
2947
2948 done = !purged;
2949 }
2950
2951 XFS_MOUNT_IUNLOCK(mp);
2952 return 0; 2712 return 0;
2953} 2713}
2954 2714
@@ -3197,6 +2957,8 @@ xfs_zero_remaining_bytes(
3197 bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize, 2957 bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
3198 XFS_IS_REALTIME_INODE(ip) ? 2958 XFS_IS_REALTIME_INODE(ip) ?
3199 mp->m_rtdev_targp : mp->m_ddev_targp); 2959 mp->m_rtdev_targp : mp->m_ddev_targp);
2960 if (!bp)
2961 return XFS_ERROR(ENOMEM);
3200 2962
3201 for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { 2963 for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
3202 offset_fsb = XFS_B_TO_FSBT(mp, offset); 2964 offset_fsb = XFS_B_TO_FSBT(mp, offset);
@@ -3312,7 +3074,8 @@ xfs_free_file_space(
3312 need_iolock = 0; 3074 need_iolock = 0;
3313 if (need_iolock) { 3075 if (need_iolock) {
3314 xfs_ilock(ip, XFS_IOLOCK_EXCL); 3076 xfs_ilock(ip, XFS_IOLOCK_EXCL);
3315 vn_iowait(ip); /* wait for the completion of any pending DIOs */ 3077 /* wait for the completion of any pending DIOs */
3078 xfs_ioend_wait(ip);
3316 } 3079 }
3317 3080
3318 rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); 3081 rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
@@ -3474,7 +3237,6 @@ xfs_change_file_space(
3474 int cmd, 3237 int cmd,
3475 xfs_flock64_t *bf, 3238 xfs_flock64_t *bf,
3476 xfs_off_t offset, 3239 xfs_off_t offset,
3477 cred_t *credp,
3478 int attr_flags) 3240 int attr_flags)
3479{ 3241{
3480 xfs_mount_t *mp = ip->i_mount; 3242 xfs_mount_t *mp = ip->i_mount;
@@ -3562,7 +3324,7 @@ xfs_change_file_space(
3562 iattr.ia_valid = ATTR_SIZE; 3324 iattr.ia_valid = ATTR_SIZE;
3563 iattr.ia_size = startoffset; 3325 iattr.ia_size = startoffset;
3564 3326
3565 error = xfs_setattr(ip, &iattr, attr_flags, credp); 3327 error = xfs_setattr(ip, &iattr, attr_flags);
3566 3328
3567 if (error) 3329 if (error)
3568 return error; 3330 return error;
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 7b0c2ab88333..76df328c61b4 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -14,9 +14,7 @@ struct xfs_inode;
14struct xfs_iomap; 14struct xfs_iomap;
15 15
16 16
17int xfs_open(struct xfs_inode *ip); 17int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags);
18int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags,
19 cred_t *credp);
20#define XFS_ATTR_DMI 0x01 /* invocation from a DMI function */ 18#define XFS_ATTR_DMI 0x01 /* invocation from a DMI function */
21#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */ 19#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */
22#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */ 20#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */
@@ -44,8 +42,7 @@ int xfs_inode_flush(struct xfs_inode *ip, int flags);
44int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); 42int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
45int xfs_reclaim(struct xfs_inode *ip); 43int xfs_reclaim(struct xfs_inode *ip);
46int xfs_change_file_space(struct xfs_inode *ip, int cmd, 44int xfs_change_file_space(struct xfs_inode *ip, int cmd,
47 xfs_flock64_t *bf, xfs_off_t offset, 45 xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);
48 cred_t *credp, int attr_flags);
49int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, 46int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
50 struct xfs_inode *src_ip, struct xfs_inode *target_dp, 47 struct xfs_inode *src_ip, struct xfs_inode *target_dp,
51 struct xfs_name *target_name, struct xfs_inode *target_ip); 48 struct xfs_name *target_name, struct xfs_inode *target_ip);
@@ -56,8 +53,6 @@ int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value,
56int xfs_attr_remove(struct xfs_inode *dp, const char *name, int flags); 53int xfs_attr_remove(struct xfs_inode *dp, const char *name, int flags);
57int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, 54int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
58 int flags, struct attrlist_cursor_kern *cursor); 55 int flags, struct attrlist_cursor_kern *cursor);
59int xfs_ioctl(struct xfs_inode *ip, struct file *filp,
60 int ioflags, unsigned int cmd, void __user *arg);
61ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb, 56ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb,
62 const struct iovec *iovp, unsigned int segs, 57 const struct iovec *iovp, unsigned int segs,
63 loff_t *offset, int ioflags); 58 loff_t *offset, int ioflags);
@@ -78,5 +73,6 @@ int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first,
78 xfs_off_t last, int fiopt); 73 xfs_off_t last, int fiopt);
79int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first, 74int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first,
80 xfs_off_t last, uint64_t flags, int fiopt); 75 xfs_off_t last, uint64_t flags, int fiopt);
76int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last);
81 77
82#endif /* _XFS_VNODEOPS_H */ 78#endif /* _XFS_VNODEOPS_H */