Diffstat (limited to 'fs')
169 files changed, 11223 insertions, 4407 deletions
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index e0a85dbeeb88..a6665f37f456 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h | |||
@@ -53,6 +53,7 @@ struct adfs_dir_ops { | |||
53 | int (*update)(struct adfs_dir *dir, struct object_info *obj); | 53 | int (*update)(struct adfs_dir *dir, struct object_info *obj); |
54 | int (*create)(struct adfs_dir *dir, struct object_info *obj); | 54 | int (*create)(struct adfs_dir *dir, struct object_info *obj); |
55 | int (*remove)(struct adfs_dir *dir, struct object_info *obj); | 55 | int (*remove)(struct adfs_dir *dir, struct object_info *obj); |
56 | int (*sync)(struct adfs_dir *dir); | ||
56 | void (*free)(struct adfs_dir *dir); | 57 | void (*free)(struct adfs_dir *dir); |
57 | }; | 58 | }; |
58 | 59 | ||
@@ -90,7 +91,8 @@ extern const struct dentry_operations adfs_dentry_operations; | |||
90 | extern struct adfs_dir_ops adfs_f_dir_ops; | 91 | extern struct adfs_dir_ops adfs_f_dir_ops; |
91 | extern struct adfs_dir_ops adfs_fplus_dir_ops; | 92 | extern struct adfs_dir_ops adfs_fplus_dir_ops; |
92 | 93 | ||
93 | extern int adfs_dir_update(struct super_block *sb, struct object_info *obj); | 94 | extern int adfs_dir_update(struct super_block *sb, struct object_info *obj, |
95 | int wait); | ||
94 | 96 | ||
95 | /* file.c */ | 97 | /* file.c */ |
96 | extern const struct inode_operations adfs_file_inode_operations; | 98 | extern const struct inode_operations adfs_file_inode_operations; |
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index e867ccf37246..4d4073447d1a 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c | |||
@@ -83,7 +83,7 @@ out: | |||
83 | } | 83 | } |
84 | 84 | ||
85 | int | 85 | int |
86 | adfs_dir_update(struct super_block *sb, struct object_info *obj) | 86 | adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait) |
87 | { | 87 | { |
88 | int ret = -EINVAL; | 88 | int ret = -EINVAL; |
89 | #ifdef CONFIG_ADFS_FS_RW | 89 | #ifdef CONFIG_ADFS_FS_RW |
@@ -106,6 +106,12 @@ adfs_dir_update(struct super_block *sb, struct object_info *obj) | |||
106 | ret = ops->update(&dir, obj); | 106 | ret = ops->update(&dir, obj); |
107 | write_unlock(&adfs_dir_lock); | 107 | write_unlock(&adfs_dir_lock); |
108 | 108 | ||
109 | if (wait) { | ||
110 | int err = ops->sync(&dir); | ||
111 | if (!ret) | ||
112 | ret = err; | ||
113 | } | ||
114 | |||
109 | ops->free(&dir); | 115 | ops->free(&dir); |
110 | out: | 116 | out: |
111 | #endif | 117 | #endif |
@@ -199,7 +205,7 @@ const struct file_operations adfs_dir_operations = { | |||
199 | .read = generic_read_dir, | 205 | .read = generic_read_dir, |
200 | .llseek = generic_file_llseek, | 206 | .llseek = generic_file_llseek, |
201 | .readdir = adfs_readdir, | 207 | .readdir = adfs_readdir, |
202 | .fsync = file_fsync, | 208 | .fsync = simple_fsync, |
203 | }; | 209 | }; |
204 | 210 | ||
205 | static int | 211 | static int |
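The two .fsync conversions in adfs (here and in fs/adfs/file.c below) drop the old generic file_fsync() helper, which also rewrote the superblock under lock_super() and flushed the whole block device, in favour of the lighter simple_fsync() plus the new per-directory ->sync() hook. simple_fsync() itself is added elsewhere in this series (fs/libfs.c); a sketch of what it is expected to do for a filesystem like this, reconstructed rather than quoted, looks roughly like:

	/* Sketch of simple_fsync(); illustrative, not the fs/libfs.c text. */
	int simple_fsync(struct file *file, struct dentry *dentry, int datasync)
	{
		struct writeback_control wbc = {
			.sync_mode = WB_SYNC_ALL,
			.nr_to_write = 0,	/* metadata only */
		};
		struct inode *inode = dentry->d_inode;
		int err, ret;

		/* write and wait on the dirty buffers attached to the inode */
		ret = sync_mapping_buffers(inode->i_mapping);
		if (!(inode->i_state & I_DIRTY))
			return ret;
		if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
			return ret;

		/* then write the inode itself, synchronously */
		err = sync_inode(inode, &wbc);
		if (!ret)
			ret = err;
		return ret;
	}

For adfs the synchronous inode write ends up in adfs_write_inode(..., wait) and, from there, in the new directory ->sync() operation shown in dir_f.c and dir_fplus.c below.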
diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c index ea7df2146921..31df6adf0de6 100644 --- a/fs/adfs/dir_f.c +++ b/fs/adfs/dir_f.c | |||
@@ -437,6 +437,22 @@ bad_dir: | |||
437 | #endif | 437 | #endif |
438 | } | 438 | } |
439 | 439 | ||
440 | static int | ||
441 | adfs_f_sync(struct adfs_dir *dir) | ||
442 | { | ||
443 | int err = 0; | ||
444 | int i; | ||
445 | |||
446 | for (i = dir->nr_buffers - 1; i >= 0; i--) { | ||
447 | struct buffer_head *bh = dir->bh[i]; | ||
448 | sync_dirty_buffer(bh); | ||
449 | if (buffer_req(bh) && !buffer_uptodate(bh)) | ||
450 | err = -EIO; | ||
451 | } | ||
452 | |||
453 | return err; | ||
454 | } | ||
455 | |||
440 | static void | 456 | static void |
441 | adfs_f_free(struct adfs_dir *dir) | 457 | adfs_f_free(struct adfs_dir *dir) |
442 | { | 458 | { |
@@ -456,5 +472,6 @@ struct adfs_dir_ops adfs_f_dir_ops = { | |||
456 | .setpos = adfs_f_setpos, | 472 | .setpos = adfs_f_setpos, |
457 | .getnext = adfs_f_getnext, | 473 | .getnext = adfs_f_getnext, |
458 | .update = adfs_f_update, | 474 | .update = adfs_f_update, |
475 | .sync = adfs_f_sync, | ||
459 | .free = adfs_f_free | 476 | .free = adfs_f_free |
460 | }; | 477 | }; |
diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c index 1ec644e32df9..139e0f345f18 100644 --- a/fs/adfs/dir_fplus.c +++ b/fs/adfs/dir_fplus.c | |||
@@ -161,6 +161,22 @@ out: | |||
161 | return ret; | 161 | return ret; |
162 | } | 162 | } |
163 | 163 | ||
164 | static int | ||
165 | adfs_fplus_sync(struct adfs_dir *dir) | ||
166 | { | ||
167 | int err = 0; | ||
168 | int i; | ||
169 | |||
170 | for (i = dir->nr_buffers - 1; i >= 0; i--) { | ||
171 | struct buffer_head *bh = dir->bh[i]; | ||
172 | sync_dirty_buffer(bh); | ||
173 | if (buffer_req(bh) && !buffer_uptodate(bh)) | ||
174 | err = -EIO; | ||
175 | } | ||
176 | |||
177 | return err; | ||
178 | } | ||
179 | |||
164 | static void | 180 | static void |
165 | adfs_fplus_free(struct adfs_dir *dir) | 181 | adfs_fplus_free(struct adfs_dir *dir) |
166 | { | 182 | { |
@@ -175,5 +191,6 @@ struct adfs_dir_ops adfs_fplus_dir_ops = { | |||
175 | .read = adfs_fplus_read, | 191 | .read = adfs_fplus_read, |
176 | .setpos = adfs_fplus_setpos, | 192 | .setpos = adfs_fplus_setpos, |
177 | .getnext = adfs_fplus_getnext, | 193 | .getnext = adfs_fplus_getnext, |
194 | .sync = adfs_fplus_sync, | ||
178 | .free = adfs_fplus_free | 195 | .free = adfs_fplus_free |
179 | }; | 196 | }; |
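adfs_f_sync() and adfs_fplus_sync() above share one buffer-head idiom that also reappears later in this series (for example in bfs_write_inode()): sync_dirty_buffer() submits the buffer and waits for the write, and a buffer that had I/O requested (buffer_req) but is no longer uptodate means that write failed. The common step, factored into a helper purely for explanation (not part of the patch):

	#include <linux/buffer_head.h>

	/* Write one dirty buffer synchronously and report an I/O error. */
	static int sync_bh_and_check(struct buffer_head *bh)
	{
		sync_dirty_buffer(bh);		/* submit and wait */
		if (buffer_req(bh) && !buffer_uptodate(bh))
			return -EIO;		/* I/O was issued but failed */
		return 0;
	}

The first error seen is remembered and returned after all buffers have been pushed, so one bad block does not stop the remaining directory buffers from being written.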
diff --git a/fs/adfs/file.c b/fs/adfs/file.c index 36e381c6a99a..8224d54a2afb 100644 --- a/fs/adfs/file.c +++ b/fs/adfs/file.c | |||
@@ -30,7 +30,7 @@ const struct file_operations adfs_file_operations = { | |||
30 | .read = do_sync_read, | 30 | .read = do_sync_read, |
31 | .aio_read = generic_file_aio_read, | 31 | .aio_read = generic_file_aio_read, |
32 | .mmap = generic_file_mmap, | 32 | .mmap = generic_file_mmap, |
33 | .fsync = file_fsync, | 33 | .fsync = simple_fsync, |
34 | .write = do_sync_write, | 34 | .write = do_sync_write, |
35 | .aio_write = generic_file_aio_write, | 35 | .aio_write = generic_file_aio_write, |
36 | .splice_read = generic_file_splice_read, | 36 | .splice_read = generic_file_splice_read, |
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index e647200262a2..05b3a677201d 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c | |||
@@ -376,7 +376,7 @@ out: | |||
376 | * The adfs-specific inode data has already been updated by | 376 | * The adfs-specific inode data has already been updated by |
377 | * adfs_notify_change() | 377 | * adfs_notify_change() |
378 | */ | 378 | */ |
379 | int adfs_write_inode(struct inode *inode, int unused) | 379 | int adfs_write_inode(struct inode *inode, int wait) |
380 | { | 380 | { |
381 | struct super_block *sb = inode->i_sb; | 381 | struct super_block *sb = inode->i_sb; |
382 | struct object_info obj; | 382 | struct object_info obj; |
@@ -391,7 +391,7 @@ int adfs_write_inode(struct inode *inode, int unused) | |||
391 | obj.attr = ADFS_I(inode)->attr; | 391 | obj.attr = ADFS_I(inode)->attr; |
392 | obj.size = inode->i_size; | 392 | obj.size = inode->i_size; |
393 | 393 | ||
394 | ret = adfs_dir_update(sb, &obj); | 394 | ret = adfs_dir_update(sb, &obj, wait); |
395 | unlock_kernel(); | 395 | unlock_kernel(); |
396 | return ret; | 396 | return ret; |
397 | } | 397 | } |
diff --git a/fs/adfs/map.c b/fs/adfs/map.c index 92ab4fbc2031..568081b93f73 100644 --- a/fs/adfs/map.c +++ b/fs/adfs/map.c | |||
@@ -62,7 +62,7 @@ static DEFINE_RWLOCK(adfs_map_lock); | |||
62 | #define GET_FRAG_ID(_map,_start,_idmask) \ | 62 | #define GET_FRAG_ID(_map,_start,_idmask) \ |
63 | ({ \ | 63 | ({ \ |
64 | unsigned char *_m = _map + (_start >> 3); \ | 64 | unsigned char *_m = _map + (_start >> 3); \ |
65 | u32 _frag = get_unaligned((u32 *)_m); \ | 65 | u32 _frag = get_unaligned_le32(_m); \ |
66 | _frag >>= (_start & 7); \ | 66 | _frag >>= (_start & 7); \ |
67 | _frag & _idmask; \ | 67 | _frag & _idmask; \ |
68 | }) | 68 | }) |
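The map.c change is an endianness fix as much as a cleanup: get_unaligned((u32 *)_m) reads the four bytes in host byte order, which only matches the little-endian on-disk map on little-endian CPUs, while get_unaligned_le32() is explicit about the disk format. Assuming the usual <asm/unaligned.h> helpers, the two forms relate as follows (sketch):

	#include <asm/unaligned.h>

	unsigned char *m = map + (start >> 3);	/* as in GET_FRAG_ID's _map/_start */

	/* Old: host-endian read of a possibly unaligned u32 (wrong on big endian). */
	u32 old_frag = get_unaligned((u32 *)m);

	/* New: explicit little-endian read, correct on any host; equivalent to
	 * le32_to_cpu(get_unaligned((__le32 *)m)). */
	u32 new_frag = get_unaligned_le32(m);

On little-endian machines (the common case for ADFS media) the generated value is unchanged.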
diff --git a/fs/adfs/super.c b/fs/adfs/super.c index dd9becca4241..0ec5aaf47aa7 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c | |||
@@ -132,11 +132,15 @@ static void adfs_put_super(struct super_block *sb) | |||
132 | int i; | 132 | int i; |
133 | struct adfs_sb_info *asb = ADFS_SB(sb); | 133 | struct adfs_sb_info *asb = ADFS_SB(sb); |
134 | 134 | ||
135 | lock_kernel(); | ||
136 | |||
135 | for (i = 0; i < asb->s_map_size; i++) | 137 | for (i = 0; i < asb->s_map_size; i++) |
136 | brelse(asb->s_map[i].dm_bh); | 138 | brelse(asb->s_map[i].dm_bh); |
137 | kfree(asb->s_map); | 139 | kfree(asb->s_map); |
138 | kfree(asb); | 140 | kfree(asb); |
139 | sb->s_fs_info = NULL; | 141 | sb->s_fs_info = NULL; |
142 | |||
143 | unlock_kernel(); | ||
140 | } | 144 | } |
141 | 145 | ||
142 | static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt) | 146 | static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt) |
diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 1a2d5e3c7f4e..e511dc621a2e 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h | |||
@@ -182,6 +182,7 @@ extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dent | |||
182 | 182 | ||
183 | void affs_free_prealloc(struct inode *inode); | 183 | void affs_free_prealloc(struct inode *inode); |
184 | extern void affs_truncate(struct inode *); | 184 | extern void affs_truncate(struct inode *); |
185 | int affs_file_fsync(struct file *, struct dentry *, int); | ||
185 | 186 | ||
186 | /* dir.c */ | 187 | /* dir.c */ |
187 | 188 | ||
diff --git a/fs/affs/dir.c b/fs/affs/dir.c index 7b36904dbeac..8ca8f3a55599 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c | |||
@@ -21,7 +21,7 @@ const struct file_operations affs_dir_operations = { | |||
21 | .read = generic_read_dir, | 21 | .read = generic_read_dir, |
22 | .llseek = generic_file_llseek, | 22 | .llseek = generic_file_llseek, |
23 | .readdir = affs_readdir, | 23 | .readdir = affs_readdir, |
24 | .fsync = file_fsync, | 24 | .fsync = affs_file_fsync, |
25 | }; | 25 | }; |
26 | 26 | ||
27 | /* | 27 | /* |
diff --git a/fs/affs/file.c b/fs/affs/file.c index 9246cb4aa018..184e55c1c9ba 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c | |||
@@ -34,7 +34,7 @@ const struct file_operations affs_file_operations = { | |||
34 | .mmap = generic_file_mmap, | 34 | .mmap = generic_file_mmap, |
35 | .open = affs_file_open, | 35 | .open = affs_file_open, |
36 | .release = affs_file_release, | 36 | .release = affs_file_release, |
37 | .fsync = file_fsync, | 37 | .fsync = affs_file_fsync, |
38 | .splice_read = generic_file_splice_read, | 38 | .splice_read = generic_file_splice_read, |
39 | }; | 39 | }; |
40 | 40 | ||
@@ -915,3 +915,15 @@ affs_truncate(struct inode *inode) | |||
915 | } | 915 | } |
916 | affs_free_prealloc(inode); | 916 | affs_free_prealloc(inode); |
917 | } | 917 | } |
918 | |||
919 | int affs_file_fsync(struct file *filp, struct dentry *dentry, int datasync) | ||
920 | { | ||
921 | struct inode * inode = dentry->d_inode; | ||
922 | int ret, err; | ||
923 | |||
924 | ret = write_inode_now(inode, 0); | ||
925 | err = sync_blockdev(inode->i_sb->s_bdev); | ||
926 | if (!ret) | ||
927 | ret = err; | ||
928 | return ret; | ||
929 | } | ||
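affs_file_fsync() is essentially the generic helper it replaces minus the superblock step; the root-block update is now done from ->write_super()/->sync_fs() via affs_commit_super() in fs/affs/super.c below. For comparison, the file_fsync() being retired looked roughly like this (reconstructed from the fs/sync.c of this era, so treat it as a sketch rather than a quotation):

	int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
	{
		struct inode *inode = dentry->d_inode;
		struct super_block *sb = inode->i_sb;
		int ret, err;

		/* sync the inode to buffers */
		ret = write_inode_now(inode, 0);

		/* sync the superblock to buffers */
		lock_super(sb);
		if (sb->s_dirt && sb->s_op->write_super)
			sb->s_op->write_super(sb);
		unlock_super(sb);

		/* .. finally sync the buffers to disk */
		err = sync_blockdev(sb->s_bdev);
		if (!ret)
			ret = err;
		return ret;
	}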
diff --git a/fs/affs/super.c b/fs/affs/super.c index 63f5183f263b..104fdcb3a7fc 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/parser.h> | 16 | #include <linux/parser.h> |
17 | #include <linux/magic.h> | 17 | #include <linux/magic.h> |
18 | #include <linux/sched.h> | 18 | #include <linux/sched.h> |
19 | #include <linux/smp_lock.h> | ||
19 | #include "affs.h" | 20 | #include "affs.h" |
20 | 21 | ||
21 | extern struct timezone sys_tz; | 22 | extern struct timezone sys_tz; |
@@ -24,49 +25,67 @@ static int affs_statfs(struct dentry *dentry, struct kstatfs *buf); | |||
24 | static int affs_remount (struct super_block *sb, int *flags, char *data); | 25 | static int affs_remount (struct super_block *sb, int *flags, char *data); |
25 | 26 | ||
26 | static void | 27 | static void |
28 | affs_commit_super(struct super_block *sb, int clean) | ||
29 | { | ||
30 | struct affs_sb_info *sbi = AFFS_SB(sb); | ||
31 | struct buffer_head *bh = sbi->s_root_bh; | ||
32 | struct affs_root_tail *tail = AFFS_ROOT_TAIL(sb, bh); | ||
33 | |||
34 | tail->bm_flag = cpu_to_be32(clean); | ||
35 | secs_to_datestamp(get_seconds(), &tail->disk_change); | ||
36 | affs_fix_checksum(sb, bh); | ||
37 | mark_buffer_dirty(bh); | ||
38 | } | ||
39 | |||
40 | static void | ||
27 | affs_put_super(struct super_block *sb) | 41 | affs_put_super(struct super_block *sb) |
28 | { | 42 | { |
29 | struct affs_sb_info *sbi = AFFS_SB(sb); | 43 | struct affs_sb_info *sbi = AFFS_SB(sb); |
30 | pr_debug("AFFS: put_super()\n"); | 44 | pr_debug("AFFS: put_super()\n"); |
31 | 45 | ||
32 | if (!(sb->s_flags & MS_RDONLY)) { | 46 | lock_kernel(); |
33 | AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag = cpu_to_be32(1); | 47 | |
34 | secs_to_datestamp(get_seconds(), | 48 | if (!(sb->s_flags & MS_RDONLY)) |
35 | &AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->disk_change); | 49 | affs_commit_super(sb, 1); |
36 | affs_fix_checksum(sb, sbi->s_root_bh); | ||
37 | mark_buffer_dirty(sbi->s_root_bh); | ||
38 | } | ||
39 | 50 | ||
40 | kfree(sbi->s_prefix); | 51 | kfree(sbi->s_prefix); |
41 | affs_free_bitmap(sb); | 52 | affs_free_bitmap(sb); |
42 | affs_brelse(sbi->s_root_bh); | 53 | affs_brelse(sbi->s_root_bh); |
43 | kfree(sbi); | 54 | kfree(sbi); |
44 | sb->s_fs_info = NULL; | 55 | sb->s_fs_info = NULL; |
45 | return; | 56 | |
57 | unlock_kernel(); | ||
46 | } | 58 | } |
47 | 59 | ||
48 | static void | 60 | static void |
49 | affs_write_super(struct super_block *sb) | 61 | affs_write_super(struct super_block *sb) |
50 | { | 62 | { |
51 | int clean = 2; | 63 | int clean = 2; |
52 | struct affs_sb_info *sbi = AFFS_SB(sb); | ||
53 | 64 | ||
65 | lock_super(sb); | ||
54 | if (!(sb->s_flags & MS_RDONLY)) { | 66 | if (!(sb->s_flags & MS_RDONLY)) { |
55 | // if (sbi->s_bitmap[i].bm_bh) { | 67 | // if (sbi->s_bitmap[i].bm_bh) { |
56 | // if (buffer_dirty(sbi->s_bitmap[i].bm_bh)) { | 68 | // if (buffer_dirty(sbi->s_bitmap[i].bm_bh)) { |
57 | // clean = 0; | 69 | // clean = 0; |
58 | AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag = cpu_to_be32(clean); | 70 | affs_commit_super(sb, clean); |
59 | secs_to_datestamp(get_seconds(), | ||
60 | &AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->disk_change); | ||
61 | affs_fix_checksum(sb, sbi->s_root_bh); | ||
62 | mark_buffer_dirty(sbi->s_root_bh); | ||
63 | sb->s_dirt = !clean; /* redo until bitmap synced */ | 71 | sb->s_dirt = !clean; /* redo until bitmap synced */ |
64 | } else | 72 | } else |
65 | sb->s_dirt = 0; | 73 | sb->s_dirt = 0; |
74 | unlock_super(sb); | ||
66 | 75 | ||
67 | pr_debug("AFFS: write_super() at %lu, clean=%d\n", get_seconds(), clean); | 76 | pr_debug("AFFS: write_super() at %lu, clean=%d\n", get_seconds(), clean); |
68 | } | 77 | } |
69 | 78 | ||
79 | static int | ||
80 | affs_sync_fs(struct super_block *sb, int wait) | ||
81 | { | ||
82 | lock_super(sb); | ||
83 | affs_commit_super(sb, 2); | ||
84 | sb->s_dirt = 0; | ||
85 | unlock_super(sb); | ||
86 | return 0; | ||
87 | } | ||
88 | |||
70 | static struct kmem_cache * affs_inode_cachep; | 89 | static struct kmem_cache * affs_inode_cachep; |
71 | 90 | ||
72 | static struct inode *affs_alloc_inode(struct super_block *sb) | 91 | static struct inode *affs_alloc_inode(struct super_block *sb) |
@@ -124,6 +143,7 @@ static const struct super_operations affs_sops = { | |||
124 | .clear_inode = affs_clear_inode, | 143 | .clear_inode = affs_clear_inode, |
125 | .put_super = affs_put_super, | 144 | .put_super = affs_put_super, |
126 | .write_super = affs_write_super, | 145 | .write_super = affs_write_super, |
146 | .sync_fs = affs_sync_fs, | ||
127 | .statfs = affs_statfs, | 147 | .statfs = affs_statfs, |
128 | .remount_fs = affs_remount, | 148 | .remount_fs = affs_remount, |
129 | .show_options = generic_show_options, | 149 | .show_options = generic_show_options, |
@@ -507,6 +527,7 @@ affs_remount(struct super_block *sb, int *flags, char *data) | |||
507 | kfree(new_opts); | 527 | kfree(new_opts); |
508 | return -EINVAL; | 528 | return -EINVAL; |
509 | } | 529 | } |
530 | lock_kernel(); | ||
510 | replace_mount_options(sb, new_opts); | 531 | replace_mount_options(sb, new_opts); |
511 | 532 | ||
512 | sbi->s_flags = mount_flags; | 533 | sbi->s_flags = mount_flags; |
@@ -514,8 +535,10 @@ affs_remount(struct super_block *sb, int *flags, char *data) | |||
514 | sbi->s_uid = uid; | 535 | sbi->s_uid = uid; |
515 | sbi->s_gid = gid; | 536 | sbi->s_gid = gid; |
516 | 537 | ||
517 | if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) | 538 | if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { |
539 | unlock_kernel(); | ||
518 | return 0; | 540 | return 0; |
541 | } | ||
519 | if (*flags & MS_RDONLY) { | 542 | if (*flags & MS_RDONLY) { |
520 | sb->s_dirt = 1; | 543 | sb->s_dirt = 1; |
521 | while (sb->s_dirt) | 544 | while (sb->s_dirt) |
@@ -524,6 +547,7 @@ affs_remount(struct super_block *sb, int *flags, char *data) | |||
524 | } else | 547 | } else |
525 | res = affs_init_bitmap(sb, flags); | 548 | res = affs_init_bitmap(sb, flags); |
526 | 549 | ||
550 | unlock_kernel(); | ||
527 | return res; | 551 | return res; |
528 | } | 552 | } |
529 | 553 | ||
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 2b9e2d03a390..c52be53f6946 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c | |||
@@ -244,7 +244,7 @@ static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd) | |||
244 | case -EBUSY: | 244 | case -EBUSY: |
245 | /* someone else made a mount here whilst we were busy */ | 245 | /* someone else made a mount here whilst we were busy */ |
246 | while (d_mountpoint(nd->path.dentry) && | 246 | while (d_mountpoint(nd->path.dentry) && |
247 | follow_down(&nd->path.mnt, &nd->path.dentry)) | 247 | follow_down(&nd->path)) |
248 | ; | 248 | ; |
249 | err = 0; | 249 | err = 0; |
250 | default: | 250 | default: |
diff --git a/fs/afs/super.c b/fs/afs/super.c index 76828e5f8a39..ad0514d0115f 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c | |||
@@ -440,8 +440,12 @@ static void afs_put_super(struct super_block *sb) | |||
440 | 440 | ||
441 | _enter(""); | 441 | _enter(""); |
442 | 442 | ||
443 | lock_kernel(); | ||
444 | |||
443 | afs_put_volume(as->volume); | 445 | afs_put_volume(as->volume); |
444 | 446 | ||
447 | unlock_kernel(); | ||
448 | |||
445 | _leave(""); | 449 | _leave(""); |
446 | } | 450 | } |
447 | 451 | ||
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c index 4eb4d8dfb2f1..2316e944a109 100644 --- a/fs/autofs/dirhash.c +++ b/fs/autofs/dirhash.c | |||
@@ -85,13 +85,12 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb, | |||
85 | } | 85 | } |
86 | path.mnt = mnt; | 86 | path.mnt = mnt; |
87 | path_get(&path); | 87 | path_get(&path); |
88 | if (!follow_down(&path.mnt, &path.dentry)) { | 88 | if (!follow_down(&path)) { |
89 | path_put(&path); | 89 | path_put(&path); |
90 | DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name)); | 90 | DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name)); |
91 | continue; | 91 | continue; |
92 | } | 92 | } |
93 | while (d_mountpoint(path.dentry) && | 93 | while (d_mountpoint(path.dentry) && follow_down(&path)); |
94 | follow_down(&path.mnt, &path.dentry)) | ||
95 | ; | 94 | ; |
96 | umount_ok = may_umount(path.mnt); | 95 | umount_ok = may_umount(path.mnt); |
97 | path_put(&path); | 96 | path_put(&path); |
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index b7ff33c63101..8f7cdde41733 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h | |||
@@ -223,12 +223,12 @@ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify); | |||
223 | int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int); | 223 | int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int); |
224 | void autofs4_catatonic_mode(struct autofs_sb_info *); | 224 | void autofs4_catatonic_mode(struct autofs_sb_info *); |
225 | 225 | ||
226 | static inline int autofs4_follow_mount(struct vfsmount **mnt, struct dentry **dentry) | 226 | static inline int autofs4_follow_mount(struct path *path) |
227 | { | 227 | { |
228 | int res = 0; | 228 | int res = 0; |
229 | 229 | ||
230 | while (d_mountpoint(*dentry)) { | 230 | while (d_mountpoint(path->dentry)) { |
231 | int followed = follow_down(mnt, dentry); | 231 | int followed = follow_down(path); |
232 | if (!followed) | 232 | if (!followed) |
233 | break; | 233 | break; |
234 | res = 1; | 234 | res = 1; |
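autofs4_follow_mount() shows the VFS interface change that runs through the rest of this series: follow_down() now takes a single struct path instead of separate vfsmount/dentry pointers, so callers pin and release both objects together with path_get()/path_put(). The typical caller conversion, distilled from the hunks in this diff (sketch; mnt and dentry stand for whatever the caller already holds):

	/*
	 * Before: int follow_down(struct vfsmount **mnt, struct dentry **dentry);
	 *
	 *	mntget(mnt); dget(dentry);
	 *	while (d_mountpoint(dentry) && follow_down(&mnt, &dentry))
	 *		;
	 *	dput(dentry); mntput(mnt);
	 *
	 * After: int follow_down(struct path *path);
	 */
	struct path path = { .mnt = mnt, .dentry = dentry };

	path_get(&path);
	while (d_mountpoint(path.dentry) && follow_down(&path))
		;
	/* ... use path.mnt / path.dentry ... */
	path_put(&path);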
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 84168c0dcc2d..f3da2eb51f56 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c | |||
@@ -192,77 +192,42 @@ static int autofs_dev_ioctl_protosubver(struct file *fp, | |||
192 | return 0; | 192 | return 0; |
193 | } | 193 | } |
194 | 194 | ||
195 | /* | 195 | static int find_autofs_mount(const char *pathname, |
196 | * Walk down the mount stack looking for an autofs mount that | 196 | struct path *res, |
197 | * has the requested device number (aka. new_encode_dev(sb->s_dev). | 197 | int test(struct path *path, void *data), |
198 | */ | 198 | void *data) |
199 | static int autofs_dev_ioctl_find_super(struct nameidata *nd, dev_t devno) | ||
200 | { | 199 | { |
201 | struct dentry *dentry; | 200 | struct path path; |
202 | struct inode *inode; | 201 | int err = kern_path(pathname, 0, &path); |
203 | struct super_block *sb; | 202 | if (err) |
204 | dev_t s_dev; | 203 | return err; |
205 | unsigned int err; | ||
206 | |||
207 | err = -ENOENT; | 204 | err = -ENOENT; |
208 | 205 | while (path.dentry == path.mnt->mnt_root) { | |
209 | /* Lookup the dentry name at the base of our mount point */ | 206 | if (path.mnt->mnt_sb->s_magic == AUTOFS_SUPER_MAGIC) { |
210 | dentry = d_lookup(nd->path.dentry, &nd->last); | 207 | if (test(&path, data)) { |
211 | if (!dentry) | 208 | path_get(&path); |
212 | goto out; | 209 | if (!err) /* already found some */ |
213 | 210 | path_put(res); | |
214 | dput(nd->path.dentry); | 211 | *res = path; |
215 | nd->path.dentry = dentry; | ||
216 | |||
217 | /* And follow the mount stack looking for our autofs mount */ | ||
218 | while (follow_down(&nd->path.mnt, &nd->path.dentry)) { | ||
219 | inode = nd->path.dentry->d_inode; | ||
220 | if (!inode) | ||
221 | break; | ||
222 | |||
223 | sb = inode->i_sb; | ||
224 | s_dev = new_encode_dev(sb->s_dev); | ||
225 | if (devno == s_dev) { | ||
226 | if (sb->s_magic == AUTOFS_SUPER_MAGIC) { | ||
227 | err = 0; | 212 | err = 0; |
228 | break; | ||
229 | } | 213 | } |
230 | } | 214 | } |
215 | if (!follow_up(&path)) | ||
216 | break; | ||
231 | } | 217 | } |
232 | out: | 218 | path_put(&path); |
233 | return err; | 219 | return err; |
234 | } | 220 | } |
235 | 221 | ||
236 | /* | 222 | static int test_by_dev(struct path *path, void *p) |
237 | * Walk down the mount stack looking for an autofs mount that | ||
238 | * has the requested mount type (ie. indirect, direct or offset). | ||
239 | */ | ||
240 | static int autofs_dev_ioctl_find_sbi_type(struct nameidata *nd, unsigned int type) | ||
241 | { | 223 | { |
242 | struct dentry *dentry; | 224 | return path->mnt->mnt_sb->s_dev == *(dev_t *)p; |
243 | struct autofs_info *ino; | 225 | } |
244 | unsigned int err; | ||
245 | |||
246 | err = -ENOENT; | ||
247 | |||
248 | /* Lookup the dentry name at the base of our mount point */ | ||
249 | dentry = d_lookup(nd->path.dentry, &nd->last); | ||
250 | if (!dentry) | ||
251 | goto out; | ||
252 | |||
253 | dput(nd->path.dentry); | ||
254 | nd->path.dentry = dentry; | ||
255 | 226 | ||
256 | /* And follow the mount stack looking for our autofs mount */ | 227 | static int test_by_type(struct path *path, void *p) |
257 | while (follow_down(&nd->path.mnt, &nd->path.dentry)) { | 228 | { |
258 | ino = autofs4_dentry_ino(nd->path.dentry); | 229 | struct autofs_info *ino = autofs4_dentry_ino(path->dentry); |
259 | if (ino && ino->sbi->type & type) { | 230 | return ino && ino->sbi->type & *(unsigned *)p; |
260 | err = 0; | ||
261 | break; | ||
262 | } | ||
263 | } | ||
264 | out: | ||
265 | return err; | ||
266 | } | 231 | } |
267 | 232 | ||
268 | static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file) | 233 | static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file) |
@@ -283,31 +248,25 @@ static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file) | |||
283 | * Open a file descriptor on the autofs mount point corresponding | 248 | * Open a file descriptor on the autofs mount point corresponding |
284 | * to the given path and device number (aka. new_encode_dev(sb->s_dev)). | 249 | * to the given path and device number (aka. new_encode_dev(sb->s_dev)). |
285 | */ | 250 | */ |
286 | static int autofs_dev_ioctl_open_mountpoint(const char *path, dev_t devid) | 251 | static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid) |
287 | { | 252 | { |
288 | struct file *filp; | ||
289 | struct nameidata nd; | ||
290 | int err, fd; | 253 | int err, fd; |
291 | 254 | ||
292 | fd = get_unused_fd(); | 255 | fd = get_unused_fd(); |
293 | if (likely(fd >= 0)) { | 256 | if (likely(fd >= 0)) { |
294 | /* Get nameidata of the parent directory */ | 257 | struct file *filp; |
295 | err = path_lookup(path, LOOKUP_PARENT, &nd); | 258 | struct path path; |
259 | |||
260 | err = find_autofs_mount(name, &path, test_by_dev, &devid); | ||
296 | if (err) | 261 | if (err) |
297 | goto out; | 262 | goto out; |
298 | 263 | ||
299 | /* | 264 | /* |
300 | * Search down, within the parent, looking for an | 265 | * Find autofs super block that has the device number |
301 | * autofs super block that has the device number | ||
302 | * corresponding to the autofs fs we want to open. | 266 | * corresponding to the autofs fs we want to open. |
303 | */ | 267 | */ |
304 | err = autofs_dev_ioctl_find_super(&nd, devid); | ||
305 | if (err) { | ||
306 | path_put(&nd.path); | ||
307 | goto out; | ||
308 | } | ||
309 | 268 | ||
310 | filp = dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY, | 269 | filp = dentry_open(path.dentry, path.mnt, O_RDONLY, |
311 | current_cred()); | 270 | current_cred()); |
312 | if (IS_ERR(filp)) { | 271 | if (IS_ERR(filp)) { |
313 | err = PTR_ERR(filp); | 272 | err = PTR_ERR(filp); |
@@ -340,7 +299,7 @@ static int autofs_dev_ioctl_openmount(struct file *fp, | |||
340 | param->ioctlfd = -1; | 299 | param->ioctlfd = -1; |
341 | 300 | ||
342 | path = param->path; | 301 | path = param->path; |
343 | devid = param->openmount.devid; | 302 | devid = new_decode_dev(param->openmount.devid); |
344 | 303 | ||
345 | err = 0; | 304 | err = 0; |
346 | fd = autofs_dev_ioctl_open_mountpoint(path, devid); | 305 | fd = autofs_dev_ioctl_open_mountpoint(path, devid); |
@@ -475,8 +434,7 @@ static int autofs_dev_ioctl_requester(struct file *fp, | |||
475 | struct autofs_dev_ioctl *param) | 434 | struct autofs_dev_ioctl *param) |
476 | { | 435 | { |
477 | struct autofs_info *ino; | 436 | struct autofs_info *ino; |
478 | struct nameidata nd; | 437 | struct path path; |
479 | const char *path; | ||
480 | dev_t devid; | 438 | dev_t devid; |
481 | int err = -ENOENT; | 439 | int err = -ENOENT; |
482 | 440 | ||
@@ -485,32 +443,24 @@ static int autofs_dev_ioctl_requester(struct file *fp, | |||
485 | goto out; | 443 | goto out; |
486 | } | 444 | } |
487 | 445 | ||
488 | path = param->path; | 446 | devid = sbi->sb->s_dev; |
489 | devid = new_encode_dev(sbi->sb->s_dev); | ||
490 | 447 | ||
491 | param->requester.uid = param->requester.gid = -1; | 448 | param->requester.uid = param->requester.gid = -1; |
492 | 449 | ||
493 | /* Get nameidata of the parent directory */ | 450 | err = find_autofs_mount(param->path, &path, test_by_dev, &devid); |
494 | err = path_lookup(path, LOOKUP_PARENT, &nd); | ||
495 | if (err) | 451 | if (err) |
496 | goto out; | 452 | goto out; |
497 | 453 | ||
498 | err = autofs_dev_ioctl_find_super(&nd, devid); | 454 | ino = autofs4_dentry_ino(path.dentry); |
499 | if (err) | ||
500 | goto out_release; | ||
501 | |||
502 | ino = autofs4_dentry_ino(nd.path.dentry); | ||
503 | if (ino) { | 455 | if (ino) { |
504 | err = 0; | 456 | err = 0; |
505 | autofs4_expire_wait(nd.path.dentry); | 457 | autofs4_expire_wait(path.dentry); |
506 | spin_lock(&sbi->fs_lock); | 458 | spin_lock(&sbi->fs_lock); |
507 | param->requester.uid = ino->uid; | 459 | param->requester.uid = ino->uid; |
508 | param->requester.gid = ino->gid; | 460 | param->requester.gid = ino->gid; |
509 | spin_unlock(&sbi->fs_lock); | 461 | spin_unlock(&sbi->fs_lock); |
510 | } | 462 | } |
511 | 463 | path_put(&path); | |
512 | out_release: | ||
513 | path_put(&nd.path); | ||
514 | out: | 464 | out: |
515 | return err; | 465 | return err; |
516 | } | 466 | } |
@@ -569,8 +519,8 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, | |||
569 | struct autofs_sb_info *sbi, | 519 | struct autofs_sb_info *sbi, |
570 | struct autofs_dev_ioctl *param) | 520 | struct autofs_dev_ioctl *param) |
571 | { | 521 | { |
572 | struct nameidata nd; | 522 | struct path path; |
573 | const char *path; | 523 | const char *name; |
574 | unsigned int type; | 524 | unsigned int type; |
575 | unsigned int devid, magic; | 525 | unsigned int devid, magic; |
576 | int err = -ENOENT; | 526 | int err = -ENOENT; |
@@ -580,71 +530,46 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, | |||
580 | goto out; | 530 | goto out; |
581 | } | 531 | } |
582 | 532 | ||
583 | path = param->path; | 533 | name = param->path; |
584 | type = param->ismountpoint.in.type; | 534 | type = param->ismountpoint.in.type; |
585 | 535 | ||
586 | param->ismountpoint.out.devid = devid = 0; | 536 | param->ismountpoint.out.devid = devid = 0; |
587 | param->ismountpoint.out.magic = magic = 0; | 537 | param->ismountpoint.out.magic = magic = 0; |
588 | 538 | ||
589 | if (!fp || param->ioctlfd == -1) { | 539 | if (!fp || param->ioctlfd == -1) { |
590 | if (autofs_type_any(type)) { | 540 | if (autofs_type_any(type)) |
591 | struct super_block *sb; | 541 | err = kern_path(name, LOOKUP_FOLLOW, &path); |
592 | 542 | else | |
593 | err = path_lookup(path, LOOKUP_FOLLOW, &nd); | 543 | err = find_autofs_mount(name, &path, test_by_type, &type); |
594 | if (err) | 544 | if (err) |
595 | goto out; | 545 | goto out; |
596 | 546 | devid = new_encode_dev(path.mnt->mnt_sb->s_dev); | |
597 | sb = nd.path.dentry->d_sb; | ||
598 | devid = new_encode_dev(sb->s_dev); | ||
599 | } else { | ||
600 | struct autofs_info *ino; | ||
601 | |||
602 | err = path_lookup(path, LOOKUP_PARENT, &nd); | ||
603 | if (err) | ||
604 | goto out; | ||
605 | |||
606 | err = autofs_dev_ioctl_find_sbi_type(&nd, type); | ||
607 | if (err) | ||
608 | goto out_release; | ||
609 | |||
610 | ino = autofs4_dentry_ino(nd.path.dentry); | ||
611 | devid = autofs4_get_dev(ino->sbi); | ||
612 | } | ||
613 | |||
614 | err = 0; | 547 | err = 0; |
615 | if (nd.path.dentry->d_inode && | 548 | if (path.dentry->d_inode && |
616 | nd.path.mnt->mnt_root == nd.path.dentry) { | 549 | path.mnt->mnt_root == path.dentry) { |
617 | err = 1; | 550 | err = 1; |
618 | magic = nd.path.dentry->d_inode->i_sb->s_magic; | 551 | magic = path.dentry->d_inode->i_sb->s_magic; |
619 | } | 552 | } |
620 | } else { | 553 | } else { |
621 | dev_t dev = autofs4_get_dev(sbi); | 554 | dev_t dev = sbi->sb->s_dev; |
622 | 555 | ||
623 | err = path_lookup(path, LOOKUP_PARENT, &nd); | 556 | err = find_autofs_mount(name, &path, test_by_dev, &dev); |
624 | if (err) | 557 | if (err) |
625 | goto out; | 558 | goto out; |
626 | 559 | ||
627 | err = autofs_dev_ioctl_find_super(&nd, dev); | 560 | devid = new_encode_dev(dev); |
628 | if (err) | ||
629 | goto out_release; | ||
630 | |||
631 | devid = dev; | ||
632 | 561 | ||
633 | err = have_submounts(nd.path.dentry); | 562 | err = have_submounts(path.dentry); |
634 | 563 | ||
635 | if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) { | 564 | if (path.mnt->mnt_mountpoint != path.mnt->mnt_root) { |
636 | if (follow_down(&nd.path.mnt, &nd.path.dentry)) { | 565 | if (follow_down(&path)) |
637 | struct inode *inode = nd.path.dentry->d_inode; | 566 | magic = path.mnt->mnt_sb->s_magic; |
638 | magic = inode->i_sb->s_magic; | ||
639 | } | ||
640 | } | 567 | } |
641 | } | 568 | } |
642 | 569 | ||
643 | param->ismountpoint.out.devid = devid; | 570 | param->ismountpoint.out.devid = devid; |
644 | param->ismountpoint.out.magic = magic; | 571 | param->ismountpoint.out.magic = magic; |
645 | 572 | path_put(&path); | |
646 | out_release: | ||
647 | path_put(&nd.path); | ||
648 | out: | 573 | out: |
649 | return err; | 574 | return err; |
650 | } | 575 | } |
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 3077d8f16523..aa39ae83f019 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c | |||
@@ -48,19 +48,19 @@ static inline int autofs4_can_expire(struct dentry *dentry, | |||
48 | static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) | 48 | static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) |
49 | { | 49 | { |
50 | struct dentry *top = dentry; | 50 | struct dentry *top = dentry; |
51 | struct path path = {.mnt = mnt, .dentry = dentry}; | ||
51 | int status = 1; | 52 | int status = 1; |
52 | 53 | ||
53 | DPRINTK("dentry %p %.*s", | 54 | DPRINTK("dentry %p %.*s", |
54 | dentry, (int)dentry->d_name.len, dentry->d_name.name); | 55 | dentry, (int)dentry->d_name.len, dentry->d_name.name); |
55 | 56 | ||
56 | mntget(mnt); | 57 | path_get(&path); |
57 | dget(dentry); | ||
58 | 58 | ||
59 | if (!follow_down(&mnt, &dentry)) | 59 | if (!follow_down(&path)) |
60 | goto done; | 60 | goto done; |
61 | 61 | ||
62 | if (is_autofs4_dentry(dentry)) { | 62 | if (is_autofs4_dentry(path.dentry)) { |
63 | struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); | 63 | struct autofs_sb_info *sbi = autofs4_sbi(path.dentry->d_sb); |
64 | 64 | ||
65 | /* This is an autofs submount, we can't expire it */ | 65 | /* This is an autofs submount, we can't expire it */ |
66 | if (autofs_type_indirect(sbi->type)) | 66 | if (autofs_type_indirect(sbi->type)) |
@@ -70,7 +70,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) | |||
70 | * Otherwise it's an offset mount and we need to check | 70 | * Otherwise it's an offset mount and we need to check |
71 | * if we can umount its mount, if there is one. | 71 | * if we can umount its mount, if there is one. |
72 | */ | 72 | */ |
73 | if (!d_mountpoint(dentry)) { | 73 | if (!d_mountpoint(path.dentry)) { |
74 | status = 0; | 74 | status = 0; |
75 | goto done; | 75 | goto done; |
76 | } | 76 | } |
@@ -86,8 +86,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) | |||
86 | status = 0; | 86 | status = 0; |
87 | done: | 87 | done: |
88 | DPRINTK("returning = %d", status); | 88 | DPRINTK("returning = %d", status); |
89 | dput(dentry); | 89 | path_put(&path); |
90 | mntput(mnt); | ||
91 | return status; | 90 | return status; |
92 | } | 91 | } |
93 | 92 | ||
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index e383bf0334f1..b96a3c57359d 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c | |||
@@ -181,7 +181,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) | |||
181 | nd->flags); | 181 | nd->flags); |
182 | /* | 182 | /* |
183 | * For an expire of a covered direct or offset mount we need | 183 | * For an expire of a covered direct or offset mount we need |
184 | * to beeak out of follow_down() at the autofs mount trigger | 184 | * to break out of follow_down() at the autofs mount trigger |
185 | * (d_mounted--), so we can see the expiring flag, and manage | 185 | * (d_mounted--), so we can see the expiring flag, and manage |
186 | * the blocking and following here until the expire is completed. | 186 | * the blocking and following here until the expire is completed. |
187 | */ | 187 | */ |
@@ -190,7 +190,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) | |||
190 | if (ino->flags & AUTOFS_INF_EXPIRING) { | 190 | if (ino->flags & AUTOFS_INF_EXPIRING) { |
191 | spin_unlock(&sbi->fs_lock); | 191 | spin_unlock(&sbi->fs_lock); |
192 | /* Follow down to our covering mount. */ | 192 | /* Follow down to our covering mount. */ |
193 | if (!follow_down(&nd->path.mnt, &nd->path.dentry)) | 193 | if (!follow_down(&nd->path)) |
194 | goto done; | 194 | goto done; |
195 | goto follow; | 195 | goto follow; |
196 | } | 196 | } |
@@ -230,8 +230,7 @@ follow: | |||
230 | * to follow it. | 230 | * to follow it. |
231 | */ | 231 | */ |
232 | if (d_mountpoint(dentry)) { | 232 | if (d_mountpoint(dentry)) { |
233 | if (!autofs4_follow_mount(&nd->path.mnt, | 233 | if (!autofs4_follow_mount(&nd->path)) { |
234 | &nd->path.dentry)) { | ||
235 | status = -ENOENT; | 234 | status = -ENOENT; |
236 | goto out_error; | 235 | goto out_error; |
237 | } | 236 | } |
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 76afd0d6b86c..9367b6297d84 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c | |||
@@ -737,6 +737,8 @@ parse_options(char *options, befs_mount_options * opts) | |||
737 | static void | 737 | static void |
738 | befs_put_super(struct super_block *sb) | 738 | befs_put_super(struct super_block *sb) |
739 | { | 739 | { |
740 | lock_kernel(); | ||
741 | |||
740 | kfree(BEFS_SB(sb)->mount_opts.iocharset); | 742 | kfree(BEFS_SB(sb)->mount_opts.iocharset); |
741 | BEFS_SB(sb)->mount_opts.iocharset = NULL; | 743 | BEFS_SB(sb)->mount_opts.iocharset = NULL; |
742 | 744 | ||
@@ -747,7 +749,8 @@ befs_put_super(struct super_block *sb) | |||
747 | 749 | ||
748 | kfree(sb->s_fs_info); | 750 | kfree(sb->s_fs_info); |
749 | sb->s_fs_info = NULL; | 751 | sb->s_fs_info = NULL; |
750 | return; | 752 | |
753 | unlock_kernel(); | ||
751 | } | 754 | } |
752 | 755 | ||
753 | /* Allocate private field of the superblock, fill it. | 756 | /* Allocate private field of the superblock, fill it. |
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 4dd1b623f937..54bd07d44e68 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c | |||
@@ -79,7 +79,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir) | |||
79 | const struct file_operations bfs_dir_operations = { | 79 | const struct file_operations bfs_dir_operations = { |
80 | .read = generic_read_dir, | 80 | .read = generic_read_dir, |
81 | .readdir = bfs_readdir, | 81 | .readdir = bfs_readdir, |
82 | .fsync = file_fsync, | 82 | .fsync = simple_fsync, |
83 | .llseek = generic_file_llseek, | 83 | .llseek = generic_file_llseek, |
84 | }; | 84 | }; |
85 | 85 | ||
@@ -205,7 +205,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry) | |||
205 | inode->i_nlink = 1; | 205 | inode->i_nlink = 1; |
206 | } | 206 | } |
207 | de->ino = 0; | 207 | de->ino = 0; |
208 | mark_buffer_dirty(bh); | 208 | mark_buffer_dirty_inode(bh, dir); |
209 | dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; | 209 | dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; |
210 | mark_inode_dirty(dir); | 210 | mark_inode_dirty(dir); |
211 | inode->i_ctime = dir->i_ctime; | 211 | inode->i_ctime = dir->i_ctime; |
@@ -267,7 +267,7 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
267 | new_inode->i_ctime = CURRENT_TIME_SEC; | 267 | new_inode->i_ctime = CURRENT_TIME_SEC; |
268 | inode_dec_link_count(new_inode); | 268 | inode_dec_link_count(new_inode); |
269 | } | 269 | } |
270 | mark_buffer_dirty(old_bh); | 270 | mark_buffer_dirty_inode(old_bh, old_dir); |
271 | error = 0; | 271 | error = 0; |
272 | 272 | ||
273 | end_rename: | 273 | end_rename: |
@@ -320,7 +320,7 @@ static int bfs_add_entry(struct inode *dir, const unsigned char *name, | |||
320 | for (i = 0; i < BFS_NAMELEN; i++) | 320 | for (i = 0; i < BFS_NAMELEN; i++) |
321 | de->name[i] = | 321 | de->name[i] = |
322 | (i < namelen) ? name[i] : 0; | 322 | (i < namelen) ? name[i] : 0; |
323 | mark_buffer_dirty(bh); | 323 | mark_buffer_dirty_inode(bh, dir); |
324 | brelse(bh); | 324 | brelse(bh); |
325 | return 0; | 325 | return 0; |
326 | } | 326 | } |
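The bfs/dir.c hunks swap mark_buffer_dirty() for mark_buffer_dirty_inode() on directory-entry buffers. The _inode variant additionally associates the buffer with the directory inode's mapping, so an fsync of the directory, which now goes through simple_fsync() and hence sync_mapping_buffers(), can find and flush exactly these buffers instead of leaving them to background writeback. The intended pairing, in outline (update_dirent() is a hypothetical stand-in for the block edit):

	#include <linux/buffer_head.h>

	static void example_dir_update(struct inode *dir, struct buffer_head *bh)
	{
		update_dirent(bh);			/* hypothetical helper */
		mark_buffer_dirty_inode(bh, dir);	/* dirty + attach to dir's mapping */
		brelse(bh);
	}

	/* Later: fsync(dirfd) -> simple_fsync() -> sync_mapping_buffers()
	 * writes out the buffers attached above and waits for them. */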
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index cc4062d12ca2..6f60336c6628 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c | |||
@@ -30,6 +30,7 @@ MODULE_LICENSE("GPL"); | |||
30 | #define dprintf(x...) | 30 | #define dprintf(x...) |
31 | #endif | 31 | #endif |
32 | 32 | ||
33 | static void bfs_write_super(struct super_block *s); | ||
33 | void dump_imap(const char *prefix, struct super_block *s); | 34 | void dump_imap(const char *prefix, struct super_block *s); |
34 | 35 | ||
35 | struct inode *bfs_iget(struct super_block *sb, unsigned long ino) | 36 | struct inode *bfs_iget(struct super_block *sb, unsigned long ino) |
@@ -97,14 +98,15 @@ error: | |||
97 | return ERR_PTR(-EIO); | 98 | return ERR_PTR(-EIO); |
98 | } | 99 | } |
99 | 100 | ||
100 | static int bfs_write_inode(struct inode *inode, int unused) | 101 | static int bfs_write_inode(struct inode *inode, int wait) |
101 | { | 102 | { |
103 | struct bfs_sb_info *info = BFS_SB(inode->i_sb); | ||
102 | unsigned int ino = (u16)inode->i_ino; | 104 | unsigned int ino = (u16)inode->i_ino; |
103 | unsigned long i_sblock; | 105 | unsigned long i_sblock; |
104 | struct bfs_inode *di; | 106 | struct bfs_inode *di; |
105 | struct buffer_head *bh; | 107 | struct buffer_head *bh; |
106 | int block, off; | 108 | int block, off; |
107 | struct bfs_sb_info *info = BFS_SB(inode->i_sb); | 109 | int err = 0; |
108 | 110 | ||
109 | dprintf("ino=%08x\n", ino); | 111 | dprintf("ino=%08x\n", ino); |
110 | 112 | ||
@@ -145,9 +147,14 @@ static int bfs_write_inode(struct inode *inode, int unused) | |||
145 | di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1); | 147 | di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1); |
146 | 148 | ||
147 | mark_buffer_dirty(bh); | 149 | mark_buffer_dirty(bh); |
150 | if (wait) { | ||
151 | sync_dirty_buffer(bh); | ||
152 | if (buffer_req(bh) && !buffer_uptodate(bh)) | ||
153 | err = -EIO; | ||
154 | } | ||
148 | brelse(bh); | 155 | brelse(bh); |
149 | mutex_unlock(&info->bfs_lock); | 156 | mutex_unlock(&info->bfs_lock); |
150 | return 0; | 157 | return err; |
151 | } | 158 | } |
152 | 159 | ||
153 | static void bfs_delete_inode(struct inode *inode) | 160 | static void bfs_delete_inode(struct inode *inode) |
@@ -209,6 +216,26 @@ static void bfs_delete_inode(struct inode *inode) | |||
209 | clear_inode(inode); | 216 | clear_inode(inode); |
210 | } | 217 | } |
211 | 218 | ||
219 | static int bfs_sync_fs(struct super_block *sb, int wait) | ||
220 | { | ||
221 | struct bfs_sb_info *info = BFS_SB(sb); | ||
222 | |||
223 | mutex_lock(&info->bfs_lock); | ||
224 | mark_buffer_dirty(info->si_sbh); | ||
225 | sb->s_dirt = 0; | ||
226 | mutex_unlock(&info->bfs_lock); | ||
227 | |||
228 | return 0; | ||
229 | } | ||
230 | |||
231 | static void bfs_write_super(struct super_block *sb) | ||
232 | { | ||
233 | if (!(sb->s_flags & MS_RDONLY)) | ||
234 | bfs_sync_fs(sb, 1); | ||
235 | else | ||
236 | sb->s_dirt = 0; | ||
237 | } | ||
238 | |||
212 | static void bfs_put_super(struct super_block *s) | 239 | static void bfs_put_super(struct super_block *s) |
213 | { | 240 | { |
214 | struct bfs_sb_info *info = BFS_SB(s); | 241 | struct bfs_sb_info *info = BFS_SB(s); |
@@ -216,11 +243,18 @@ static void bfs_put_super(struct super_block *s) | |||
216 | if (!info) | 243 | if (!info) |
217 | return; | 244 | return; |
218 | 245 | ||
246 | lock_kernel(); | ||
247 | |||
248 | if (s->s_dirt) | ||
249 | bfs_write_super(s); | ||
250 | |||
219 | brelse(info->si_sbh); | 251 | brelse(info->si_sbh); |
220 | mutex_destroy(&info->bfs_lock); | 252 | mutex_destroy(&info->bfs_lock); |
221 | kfree(info->si_imap); | 253 | kfree(info->si_imap); |
222 | kfree(info); | 254 | kfree(info); |
223 | s->s_fs_info = NULL; | 255 | s->s_fs_info = NULL; |
256 | |||
257 | unlock_kernel(); | ||
224 | } | 258 | } |
225 | 259 | ||
226 | static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf) | 260 | static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf) |
@@ -240,17 +274,6 @@ static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
240 | return 0; | 274 | return 0; |
241 | } | 275 | } |
242 | 276 | ||
243 | static void bfs_write_super(struct super_block *s) | ||
244 | { | ||
245 | struct bfs_sb_info *info = BFS_SB(s); | ||
246 | |||
247 | mutex_lock(&info->bfs_lock); | ||
248 | if (!(s->s_flags & MS_RDONLY)) | ||
249 | mark_buffer_dirty(info->si_sbh); | ||
250 | s->s_dirt = 0; | ||
251 | mutex_unlock(&info->bfs_lock); | ||
252 | } | ||
253 | |||
254 | static struct kmem_cache *bfs_inode_cachep; | 277 | static struct kmem_cache *bfs_inode_cachep; |
255 | 278 | ||
256 | static struct inode *bfs_alloc_inode(struct super_block *sb) | 279 | static struct inode *bfs_alloc_inode(struct super_block *sb) |
@@ -298,6 +321,7 @@ static const struct super_operations bfs_sops = { | |||
298 | .delete_inode = bfs_delete_inode, | 321 | .delete_inode = bfs_delete_inode, |
299 | .put_super = bfs_put_super, | 322 | .put_super = bfs_put_super, |
300 | .write_super = bfs_write_super, | 323 | .write_super = bfs_write_super, |
324 | .sync_fs = bfs_sync_fs, | ||
301 | .statfs = bfs_statfs, | 325 | .statfs = bfs_statfs, |
302 | }; | 326 | }; |
303 | 327 | ||
diff --git a/fs/block_dev.c b/fs/block_dev.c index 931f6b8c4b2f..3a6d4fb2a329 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c | |||
@@ -176,17 +176,22 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | |||
176 | iov, offset, nr_segs, blkdev_get_blocks, NULL); | 176 | iov, offset, nr_segs, blkdev_get_blocks, NULL); |
177 | } | 177 | } |
178 | 178 | ||
179 | int __sync_blockdev(struct block_device *bdev, int wait) | ||
180 | { | ||
181 | if (!bdev) | ||
182 | return 0; | ||
183 | if (!wait) | ||
184 | return filemap_flush(bdev->bd_inode->i_mapping); | ||
185 | return filemap_write_and_wait(bdev->bd_inode->i_mapping); | ||
186 | } | ||
187 | |||
179 | /* | 188 | /* |
180 | * Write out and wait upon all the dirty data associated with a block | 189 | * Write out and wait upon all the dirty data associated with a block |
181 | * device via its mapping. Does not take the superblock lock. | 190 | * device via its mapping. Does not take the superblock lock. |
182 | */ | 191 | */ |
183 | int sync_blockdev(struct block_device *bdev) | 192 | int sync_blockdev(struct block_device *bdev) |
184 | { | 193 | { |
185 | int ret = 0; | 194 | return __sync_blockdev(bdev, 1); |
186 | |||
187 | if (bdev) | ||
188 | ret = filemap_write_and_wait(bdev->bd_inode->i_mapping); | ||
189 | return ret; | ||
190 | } | 195 | } |
191 | EXPORT_SYMBOL(sync_blockdev); | 196 | EXPORT_SYMBOL(sync_blockdev); |
192 | 197 | ||
@@ -199,7 +204,7 @@ int fsync_bdev(struct block_device *bdev) | |||
199 | { | 204 | { |
200 | struct super_block *sb = get_super(bdev); | 205 | struct super_block *sb = get_super(bdev); |
201 | if (sb) { | 206 | if (sb) { |
202 | int res = fsync_super(sb); | 207 | int res = sync_filesystem(sb); |
203 | drop_super(sb); | 208 | drop_super(sb); |
204 | return res; | 209 | return res; |
205 | } | 210 | } |
@@ -241,7 +246,7 @@ struct super_block *freeze_bdev(struct block_device *bdev) | |||
241 | sb->s_frozen = SB_FREEZE_WRITE; | 246 | sb->s_frozen = SB_FREEZE_WRITE; |
242 | smp_wmb(); | 247 | smp_wmb(); |
243 | 248 | ||
244 | __fsync_super(sb); | 249 | sync_filesystem(sb); |
245 | 250 | ||
246 | sb->s_frozen = SB_FREEZE_TRANS; | 251 | sb->s_frozen = SB_FREEZE_TRANS; |
247 | smp_wmb(); | 252 | smp_wmb(); |
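__sync_blockdev(bdev, wait) lets callers choose between merely starting writeback (filemap_flush) and writing plus waiting (filemap_write_and_wait); sync_blockdev() keeps its old semantics as the wait = 1 case. fsync_bdev() and freeze_bdev() now call sync_filesystem(), which replaces fsync_super()/__fsync_super() elsewhere in this series. Pieced together from how it is used here, its shape is roughly the following (sketch only, hence the _sketch name; the real helper lives in fs/sync.c):

	int sync_filesystem_sketch(struct super_block *sb)
	{
		__sync_blockdev(sb->s_bdev, 0);		/* kick off async writeback   */
		/* ... write back the superblock's dirty inodes ... */
		if (sb->s_op->sync_fs)
			sb->s_op->sync_fs(sb, 1);	/* fs-specific metadata, wait */
		return __sync_blockdev(sb->s_bdev, 1);	/* flush and wait on the rest */
	}

This is why several filesystems in this diff (affs, bfs) grow a ->sync_fs() method while their ->write_super() shrinks to the periodic s_dirt case.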
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 94212844a9bc..a35eb36b32fd 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile | |||
@@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ | |||
6 | transaction.o inode.o file.o tree-defrag.o \ | 6 | transaction.o inode.o file.o tree-defrag.o \ |
7 | extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ | 7 | extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ |
8 | extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ | 8 | extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ |
9 | ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ | 9 | export.o tree-log.o acl.o free-space-cache.o zlib.o \ |
10 | compression.o delayed-ref.o | 10 | compression.o delayed-ref.o relocation.o |
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index cbba000dccbe..603972576f0f 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c | |||
@@ -351,9 +351,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) | |||
351 | return 0; | 351 | return 0; |
352 | } | 352 | } |
353 | 353 | ||
354 | int btrfs_check_acl(struct inode *inode, int mask) | ||
355 | { | ||
356 | return 0; | ||
357 | } | ||
358 | |||
359 | #endif /* CONFIG_FS_POSIX_ACL */ | 354 | #endif /* CONFIG_FS_POSIX_ACL */ |
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 502c3d61de62..7f88628a1a72 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c | |||
@@ -294,10 +294,10 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) | |||
294 | INIT_LIST_HEAD(&worker->worker_list); | 294 | INIT_LIST_HEAD(&worker->worker_list); |
295 | spin_lock_init(&worker->lock); | 295 | spin_lock_init(&worker->lock); |
296 | atomic_set(&worker->num_pending, 0); | 296 | atomic_set(&worker->num_pending, 0); |
297 | worker->workers = workers; | ||
297 | worker->task = kthread_run(worker_loop, worker, | 298 | worker->task = kthread_run(worker_loop, worker, |
298 | "btrfs-%s-%d", workers->name, | 299 | "btrfs-%s-%d", workers->name, |
299 | workers->num_workers + i); | 300 | workers->num_workers + i); |
300 | worker->workers = workers; | ||
301 | if (IS_ERR(worker->task)) { | 301 | if (IS_ERR(worker->task)) { |
302 | kfree(worker); | 302 | kfree(worker); |
303 | ret = PTR_ERR(worker->task); | 303 | ret = PTR_ERR(worker->task); |
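The async-thread.c hunk only moves one assignment, but the ordering matters: kthread_run() can start worker_loop() immediately, and the worker thread presumably dereferences worker->workers as soon as it runs, so the field has to be published before the thread is created (the hunk itself carries no rationale, so this is an inference from the code movement):

	worker->workers = workers;	/* publish before the thread can run */
	worker->task = kthread_run(worker_loop, worker, "btrfs-%s-%d",
				   workers->name, workers->num_workers + i);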
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b30986f00b9d..acb4f3517582 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h | |||
@@ -72,6 +72,9 @@ struct btrfs_inode { | |||
72 | */ | 72 | */ |
73 | struct list_head ordered_operations; | 73 | struct list_head ordered_operations; |
74 | 74 | ||
75 | /* node for the red-black tree that links inodes in subvolume root */ | ||
76 | struct rb_node rb_node; | ||
77 | |||
75 | /* the space_info for where this inode's data allocations are done */ | 78 | /* the space_info for where this inode's data allocations are done */ |
76 | struct btrfs_space_info *space_info; | 79 | struct btrfs_space_info *space_info; |
77 | 80 | ||
@@ -154,5 +157,4 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size) | |||
154 | BTRFS_I(inode)->disk_i_size = size; | 157 | BTRFS_I(inode)->disk_i_size = size; |
155 | } | 158 | } |
156 | 159 | ||
157 | |||
158 | #endif | 160 | #endif |
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index ab07627084f1..de1e2fd32080 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c | |||
@@ -123,7 +123,7 @@ static int check_compressed_csum(struct inode *inode, | |||
123 | u32 csum; | 123 | u32 csum; |
124 | u32 *cb_sum = &cb->sums; | 124 | u32 *cb_sum = &cb->sums; |
125 | 125 | ||
126 | if (btrfs_test_flag(inode, NODATASUM)) | 126 | if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) |
127 | return 0; | 127 | return 0; |
128 | 128 | ||
129 | for (i = 0; i < cb->nr_pages; i++) { | 129 | for (i = 0; i < cb->nr_pages; i++) { |
@@ -670,7 +670,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, | |||
670 | */ | 670 | */ |
671 | atomic_inc(&cb->pending_bios); | 671 | atomic_inc(&cb->pending_bios); |
672 | 672 | ||
673 | if (!btrfs_test_flag(inode, NODATASUM)) { | 673 | if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { |
674 | btrfs_lookup_bio_sums(root, inode, comp_bio, | 674 | btrfs_lookup_bio_sums(root, inode, comp_bio, |
675 | sums); | 675 | sums); |
676 | } | 676 | } |
@@ -697,7 +697,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, | |||
697 | ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); | 697 | ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); |
698 | BUG_ON(ret); | 698 | BUG_ON(ret); |
699 | 699 | ||
700 | if (!btrfs_test_flag(inode, NODATASUM)) | 700 | if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) |
701 | btrfs_lookup_bio_sums(root, inode, comp_bio, sums); | 701 | btrfs_lookup_bio_sums(root, inode, comp_bio, sums); |
702 | 702 | ||
703 | ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); | 703 | ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); |
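The compression.c hunks open-code what btrfs_test_flag(inode, NODATASUM) used to hide. The old helper is removed elsewhere in the series; judging by the call sites converted here, it was a token-pasting macro along these lines (an assumption, not shown in this diff):

	/* Presumed definition of the retired helper (assumption): */
	#define btrfs_test_flag(inode, flag) \
		(BTRFS_I(inode)->flags & BTRFS_INODE_##flag)

	/* New form, spelled out at each call site: */
	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
		return 0;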
diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h deleted file mode 100644 index 6e1b3de36700..000000000000 --- a/fs/btrfs/crc32c.h +++ /dev/null | |||
@@ -1,29 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2008 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #ifndef __BTRFS_CRC32C__ | ||
20 | #define __BTRFS_CRC32C__ | ||
21 | #include <linux/crc32c.h> | ||
22 | |||
23 | /* | ||
24 | * this file used to do more for selecting the HW version of crc32c, | ||
25 | * perhaps it will one day again soon. | ||
26 | */ | ||
27 | #define btrfs_crc32c(seed, data, length) crc32c(seed, data, length) | ||
28 | #endif | ||
29 | |||
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index fedf8b9f03a2..60a45f3a4e91 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c | |||
@@ -197,14 +197,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, | |||
197 | u32 nritems; | 197 | u32 nritems; |
198 | int ret = 0; | 198 | int ret = 0; |
199 | int level; | 199 | int level; |
200 | struct btrfs_root *new_root; | 200 | struct btrfs_disk_key disk_key; |
201 | |||
202 | new_root = kmalloc(sizeof(*new_root), GFP_NOFS); | ||
203 | if (!new_root) | ||
204 | return -ENOMEM; | ||
205 | |||
206 | memcpy(new_root, root, sizeof(*new_root)); | ||
207 | new_root->root_key.objectid = new_root_objectid; | ||
208 | 201 | ||
209 | WARN_ON(root->ref_cows && trans->transid != | 202 | WARN_ON(root->ref_cows && trans->transid != |
210 | root->fs_info->running_transaction->transid); | 203 | root->fs_info->running_transaction->transid); |
@@ -212,28 +205,37 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, | |||
212 | 205 | ||
213 | level = btrfs_header_level(buf); | 206 | level = btrfs_header_level(buf); |
214 | nritems = btrfs_header_nritems(buf); | 207 | nritems = btrfs_header_nritems(buf); |
208 | if (level == 0) | ||
209 | btrfs_item_key(buf, &disk_key, 0); | ||
210 | else | ||
211 | btrfs_node_key(buf, &disk_key, 0); | ||
215 | 212 | ||
216 | cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0, | 213 | cow = btrfs_alloc_free_block(trans, root, buf->len, 0, |
217 | new_root_objectid, trans->transid, | 214 | new_root_objectid, &disk_key, level, |
218 | level, buf->start, 0); | 215 | buf->start, 0); |
219 | if (IS_ERR(cow)) { | 216 | if (IS_ERR(cow)) |
220 | kfree(new_root); | ||
221 | return PTR_ERR(cow); | 217 | return PTR_ERR(cow); |
222 | } | ||
223 | 218 | ||
224 | copy_extent_buffer(cow, buf, 0, 0, cow->len); | 219 | copy_extent_buffer(cow, buf, 0, 0, cow->len); |
225 | btrfs_set_header_bytenr(cow, cow->start); | 220 | btrfs_set_header_bytenr(cow, cow->start); |
226 | btrfs_set_header_generation(cow, trans->transid); | 221 | btrfs_set_header_generation(cow, trans->transid); |
227 | btrfs_set_header_owner(cow, new_root_objectid); | 222 | btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); |
228 | btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); | 223 | btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | |
224 | BTRFS_HEADER_FLAG_RELOC); | ||
225 | if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) | ||
226 | btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); | ||
227 | else | ||
228 | btrfs_set_header_owner(cow, new_root_objectid); | ||
229 | 229 | ||
230 | write_extent_buffer(cow, root->fs_info->fsid, | 230 | write_extent_buffer(cow, root->fs_info->fsid, |
231 | (unsigned long)btrfs_header_fsid(cow), | 231 | (unsigned long)btrfs_header_fsid(cow), |
232 | BTRFS_FSID_SIZE); | 232 | BTRFS_FSID_SIZE); |
233 | 233 | ||
234 | WARN_ON(btrfs_header_generation(buf) > trans->transid); | 234 | WARN_ON(btrfs_header_generation(buf) > trans->transid); |
235 | ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL); | 235 | if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) |
236 | kfree(new_root); | 236 | ret = btrfs_inc_ref(trans, root, cow, 1); |
237 | else | ||
238 | ret = btrfs_inc_ref(trans, root, cow, 0); | ||
237 | 239 | ||
238 | if (ret) | 240 | if (ret) |
239 | return ret; | 241 | return ret; |
@@ -244,6 +246,125 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, | |||
244 | } | 246 | } |
245 | 247 | ||
246 | /* | 248 | /* |
249 | * check if the tree block can be shared by multiple trees | ||
250 | */ | ||
251 | int btrfs_block_can_be_shared(struct btrfs_root *root, | ||
252 | struct extent_buffer *buf) | ||
253 | { | ||
254 | /* | ||
255 | * Tree blocks not in reference counted trees and tree roots | ||
256 | * are never shared. If a block was allocated after the last | ||
257 | * snapshot and the block was not allocated by tree relocation, | ||
258 | * we know the block is not shared. | ||
259 | */ | ||
260 | if (root->ref_cows && | ||
261 | buf != root->node && buf != root->commit_root && | ||
262 | (btrfs_header_generation(buf) <= | ||
263 | btrfs_root_last_snapshot(&root->root_item) || | ||
264 | btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) | ||
265 | return 1; | ||
266 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
267 | if (root->ref_cows && | ||
268 | btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) | ||
269 | return 1; | ||
270 | #endif | ||
271 | return 0; | ||
272 | } | ||
273 | |||
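A concrete reading of the rule above (illustrative numbers, not from the patch):

	/*
	 * Example: a reference counted tree whose last snapshot was taken in
	 * transaction 100.  A block rewritten in transaction 105 is reachable
	 * only from the live tree, so it is not shared (return 0).  A block
	 * last written in transaction 95 may still be pointed to by the
	 * snapshot, so it must be treated as shareable (return 1), as must
	 * any block carrying BTRFS_HEADER_FLAG_RELOC regardless of generation.
	 */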
274 | static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, | ||
275 | struct btrfs_root *root, | ||
276 | struct extent_buffer *buf, | ||
277 | struct extent_buffer *cow) | ||
278 | { | ||
279 | u64 refs; | ||
280 | u64 owner; | ||
281 | u64 flags; | ||
282 | u64 new_flags = 0; | ||
283 | int ret; | ||
284 | |||
285 | /* | ||
286 | * Backrefs update rules: | ||
287 | * | ||
288 | * Always use full backrefs for extent pointers in tree block | ||
289 | * allocated by tree relocation. | ||
290 | * | ||
291 | * If a shared tree block is no longer referenced by its owner | ||
292 | * tree (btrfs_header_owner(buf) == root->root_key.objectid), | ||
293 | * use full backrefs for extent pointers in tree block. | ||
294 | * | ||
295 | * If a tree block is being relocated | ||
296 | * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID), | ||
297 | * use full backrefs for extent pointers in tree block. | ||
298 | * The reason for this is that some operations (such as drop tree) | ||
299 | * are only allowed for blocks that use full backrefs. | ||
300 | */ | ||
301 | |||
302 | if (btrfs_block_can_be_shared(root, buf)) { | ||
303 | ret = btrfs_lookup_extent_info(trans, root, buf->start, | ||
304 | buf->len, &refs, &flags); | ||
305 | BUG_ON(ret); | ||
306 | BUG_ON(refs == 0); | ||
307 | } else { | ||
308 | refs = 1; | ||
309 | if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || | ||
310 | btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) | ||
311 | flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; | ||
312 | else | ||
313 | flags = 0; | ||
314 | } | ||
315 | |||
316 | owner = btrfs_header_owner(buf); | ||
317 | BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID && | ||
318 | !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); | ||
319 | |||
320 | if (refs > 1) { | ||
321 | if ((owner == root->root_key.objectid || | ||
322 | root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && | ||
323 | !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { | ||
324 | ret = btrfs_inc_ref(trans, root, buf, 1); | ||
325 | BUG_ON(ret); | ||
326 | |||
327 | if (root->root_key.objectid == | ||
328 | BTRFS_TREE_RELOC_OBJECTID) { | ||
329 | ret = btrfs_dec_ref(trans, root, buf, 0); | ||
330 | BUG_ON(ret); | ||
331 | ret = btrfs_inc_ref(trans, root, cow, 1); | ||
332 | BUG_ON(ret); | ||
333 | } | ||
334 | new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; | ||
335 | } else { | ||
336 | |||
337 | if (root->root_key.objectid == | ||
338 | BTRFS_TREE_RELOC_OBJECTID) | ||
339 | ret = btrfs_inc_ref(trans, root, cow, 1); | ||
340 | else | ||
341 | ret = btrfs_inc_ref(trans, root, cow, 0); | ||
342 | BUG_ON(ret); | ||
343 | } | ||
344 | if (new_flags != 0) { | ||
345 | ret = btrfs_set_disk_extent_flags(trans, root, | ||
346 | buf->start, | ||
347 | buf->len, | ||
348 | new_flags, 0); | ||
349 | BUG_ON(ret); | ||
350 | } | ||
351 | } else { | ||
352 | if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { | ||
353 | if (root->root_key.objectid == | ||
354 | BTRFS_TREE_RELOC_OBJECTID) | ||
355 | ret = btrfs_inc_ref(trans, root, cow, 1); | ||
356 | else | ||
357 | ret = btrfs_inc_ref(trans, root, cow, 0); | ||
358 | BUG_ON(ret); | ||
359 | ret = btrfs_dec_ref(trans, root, buf, 1); | ||
360 | BUG_ON(ret); | ||
361 | } | ||
362 | clean_tree_block(trans, root, buf); | ||
363 | } | ||
364 | return 0; | ||
365 | } | ||
366 | |||
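A rough summary of the cases handled above (my paraphrase, hedged; the function itself is authoritative for the reloc-tree special cases):

	/*
	 * refs > 1, COWing in the block's owner tree (or the reloc tree) while
	 *   the extent still uses indirect backrefs: convert it to full
	 *   backrefs with btrfs_inc_ref(buf, 1) and record
	 *   BTRFS_BLOCK_FLAG_FULL_BACKREF via btrfs_set_disk_extent_flags().
	 * refs > 1, any other case: only the new copy needs a reference,
	 *   btrfs_inc_ref(cow, ...).
	 * refs == 1: the old block is about to be dropped; if its extent
	 *   carried full backrefs, move them to the copy (btrfs_inc_ref(cow))
	 *   and drop buf's (btrfs_dec_ref(buf, 1)); either way, finish with
	 *   clean_tree_block(buf).
	 */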
367 | /* | ||
247 | * does the dirty work in cow of a single block. The parent block (if | 368 | * does the dirty work in cow of a single block. The parent block (if |
248 | * supplied) is updated to point to the new cow copy. The new buffer is marked | 369 | * supplied) is updated to point to the new cow copy. The new buffer is marked |
249 | * dirty and returned locked. If you modify the block it needs to be marked | 370 | * dirty and returned locked. If you modify the block it needs to be marked |
@@ -262,34 +383,39 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, | |||
262 | struct extent_buffer **cow_ret, | 383 | struct extent_buffer **cow_ret, |
263 | u64 search_start, u64 empty_size) | 384 | u64 search_start, u64 empty_size) |
264 | { | 385 | { |
265 | u64 parent_start; | 386 | struct btrfs_disk_key disk_key; |
266 | struct extent_buffer *cow; | 387 | struct extent_buffer *cow; |
267 | u32 nritems; | ||
268 | int ret = 0; | ||
269 | int level; | 388 | int level; |
270 | int unlock_orig = 0; | 389 | int unlock_orig = 0; |
390 | u64 parent_start; | ||
271 | 391 | ||
272 | if (*cow_ret == buf) | 392 | if (*cow_ret == buf) |
273 | unlock_orig = 1; | 393 | unlock_orig = 1; |
274 | 394 | ||
275 | btrfs_assert_tree_locked(buf); | 395 | btrfs_assert_tree_locked(buf); |
276 | 396 | ||
277 | if (parent) | ||
278 | parent_start = parent->start; | ||
279 | else | ||
280 | parent_start = 0; | ||
281 | |||
282 | WARN_ON(root->ref_cows && trans->transid != | 397 | WARN_ON(root->ref_cows && trans->transid != |
283 | root->fs_info->running_transaction->transid); | 398 | root->fs_info->running_transaction->transid); |
284 | WARN_ON(root->ref_cows && trans->transid != root->last_trans); | 399 | WARN_ON(root->ref_cows && trans->transid != root->last_trans); |
285 | 400 | ||
286 | level = btrfs_header_level(buf); | 401 | level = btrfs_header_level(buf); |
287 | nritems = btrfs_header_nritems(buf); | ||
288 | 402 | ||
289 | cow = btrfs_alloc_free_block(trans, root, buf->len, | 403 | if (level == 0) |
290 | parent_start, root->root_key.objectid, | 404 | btrfs_item_key(buf, &disk_key, 0); |
291 | trans->transid, level, | 405 | else |
292 | search_start, empty_size); | 406 | btrfs_node_key(buf, &disk_key, 0); |
407 | |||
408 | if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { | ||
409 | if (parent) | ||
410 | parent_start = parent->start; | ||
411 | else | ||
412 | parent_start = 0; | ||
413 | } else | ||
414 | parent_start = 0; | ||
415 | |||
416 | cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, | ||
417 | root->root_key.objectid, &disk_key, | ||
418 | level, search_start, empty_size); | ||
293 | if (IS_ERR(cow)) | 419 | if (IS_ERR(cow)) |
294 | return PTR_ERR(cow); | 420 | return PTR_ERR(cow); |
295 | 421 | ||
@@ -298,83 +424,53 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, | |||
298 | copy_extent_buffer(cow, buf, 0, 0, cow->len); | 424 | copy_extent_buffer(cow, buf, 0, 0, cow->len); |
299 | btrfs_set_header_bytenr(cow, cow->start); | 425 | btrfs_set_header_bytenr(cow, cow->start); |
300 | btrfs_set_header_generation(cow, trans->transid); | 426 | btrfs_set_header_generation(cow, trans->transid); |
301 | btrfs_set_header_owner(cow, root->root_key.objectid); | 427 | btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); |
302 | btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); | 428 | btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | |
429 | BTRFS_HEADER_FLAG_RELOC); | ||
430 | if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) | ||
431 | btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); | ||
432 | else | ||
433 | btrfs_set_header_owner(cow, root->root_key.objectid); | ||
303 | 434 | ||
304 | write_extent_buffer(cow, root->fs_info->fsid, | 435 | write_extent_buffer(cow, root->fs_info->fsid, |
305 | (unsigned long)btrfs_header_fsid(cow), | 436 | (unsigned long)btrfs_header_fsid(cow), |
306 | BTRFS_FSID_SIZE); | 437 | BTRFS_FSID_SIZE); |
307 | 438 | ||
308 | WARN_ON(btrfs_header_generation(buf) > trans->transid); | 439 | update_ref_for_cow(trans, root, buf, cow); |
309 | if (btrfs_header_generation(buf) != trans->transid) { | ||
310 | u32 nr_extents; | ||
311 | ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents); | ||
312 | if (ret) | ||
313 | return ret; | ||
314 | |||
315 | ret = btrfs_cache_ref(trans, root, buf, nr_extents); | ||
316 | WARN_ON(ret); | ||
317 | } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) { | ||
318 | /* | ||
319 | * There are only two places that can drop reference to | ||
320 | * tree blocks owned by living reloc trees, one is here, | ||
321 | * the other place is btrfs_drop_subtree. In both places, | ||
322 | * we check reference count while tree block is locked. | ||
323 | * Furthermore, if reference count is one, it won't get | ||
324 | * increased by someone else. | ||
325 | */ | ||
326 | u32 refs; | ||
327 | ret = btrfs_lookup_extent_ref(trans, root, buf->start, | ||
328 | buf->len, &refs); | ||
329 | BUG_ON(ret); | ||
330 | if (refs == 1) { | ||
331 | ret = btrfs_update_ref(trans, root, buf, cow, | ||
332 | 0, nritems); | ||
333 | clean_tree_block(trans, root, buf); | ||
334 | } else { | ||
335 | ret = btrfs_inc_ref(trans, root, buf, cow, NULL); | ||
336 | } | ||
337 | BUG_ON(ret); | ||
338 | } else { | ||
339 | ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems); | ||
340 | if (ret) | ||
341 | return ret; | ||
342 | clean_tree_block(trans, root, buf); | ||
343 | } | ||
344 | |||
345 | if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { | ||
346 | ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start); | ||
347 | WARN_ON(ret); | ||
348 | } | ||
349 | 440 | ||
350 | if (buf == root->node) { | 441 | if (buf == root->node) { |
351 | WARN_ON(parent && parent != buf); | 442 | WARN_ON(parent && parent != buf); |
443 | if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || | ||
444 | btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) | ||
445 | parent_start = buf->start; | ||
446 | else | ||
447 | parent_start = 0; | ||
352 | 448 | ||
353 | spin_lock(&root->node_lock); | 449 | spin_lock(&root->node_lock); |
354 | root->node = cow; | 450 | root->node = cow; |
355 | extent_buffer_get(cow); | 451 | extent_buffer_get(cow); |
356 | spin_unlock(&root->node_lock); | 452 | spin_unlock(&root->node_lock); |
357 | 453 | ||
358 | if (buf != root->commit_root) { | 454 | btrfs_free_extent(trans, root, buf->start, buf->len, |
359 | btrfs_free_extent(trans, root, buf->start, | 455 | parent_start, root->root_key.objectid, |
360 | buf->len, buf->start, | 456 | level, 0); |
361 | root->root_key.objectid, | ||
362 | btrfs_header_generation(buf), | ||
363 | level, 1); | ||
364 | } | ||
365 | free_extent_buffer(buf); | 457 | free_extent_buffer(buf); |
366 | add_root_to_dirty_list(root); | 458 | add_root_to_dirty_list(root); |
367 | } else { | 459 | } else { |
460 | if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) | ||
461 | parent_start = parent->start; | ||
462 | else | ||
463 | parent_start = 0; | ||
464 | |||
465 | WARN_ON(trans->transid != btrfs_header_generation(parent)); | ||
368 | btrfs_set_node_blockptr(parent, parent_slot, | 466 | btrfs_set_node_blockptr(parent, parent_slot, |
369 | cow->start); | 467 | cow->start); |
370 | WARN_ON(trans->transid == 0); | ||
371 | btrfs_set_node_ptr_generation(parent, parent_slot, | 468 | btrfs_set_node_ptr_generation(parent, parent_slot, |
372 | trans->transid); | 469 | trans->transid); |
373 | btrfs_mark_buffer_dirty(parent); | 470 | btrfs_mark_buffer_dirty(parent); |
374 | WARN_ON(btrfs_header_generation(parent) != trans->transid); | ||
375 | btrfs_free_extent(trans, root, buf->start, buf->len, | 471 | btrfs_free_extent(trans, root, buf->start, buf->len, |
376 | parent_start, btrfs_header_owner(parent), | 472 | parent_start, root->root_key.objectid, |
377 | btrfs_header_generation(parent), level, 1); | 473 | level, 0); |
378 | } | 474 | } |
379 | if (unlock_orig) | 475 | if (unlock_orig) |
380 | btrfs_tree_unlock(buf); | 476 | btrfs_tree_unlock(buf); |
@@ -384,6 +480,18 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, | |||
384 | return 0; | 480 | return 0; |
385 | } | 481 | } |
386 | 482 | ||
483 | static inline int should_cow_block(struct btrfs_trans_handle *trans, | ||
484 | struct btrfs_root *root, | ||
485 | struct extent_buffer *buf) | ||
486 | { | ||
487 | if (btrfs_header_generation(buf) == trans->transid && | ||
488 | !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && | ||
489 | !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && | ||
490 | btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) | ||
491 | return 0; | ||
492 | return 1; | ||
493 | } | ||
494 | |||
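In other words (a hedged restatement, not text from the patch), COW is skipped only when all three conditions hold:

	/*
	 *  - the block was already COWed in the running transaction
	 *    (header generation == trans->transid), and
	 *  - it has not been written out yet (no BTRFS_HEADER_FLAG_WRITTEN), and
	 *  - it is not a reloc-tagged block being modified by a tree other
	 *    than the relocation tree.
	 */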
387 | /* | 495 | /* |
388 | * cows a single block, see __btrfs_cow_block for the real work. | 496 | * cows a single block, see __btrfs_cow_block for the real work. |
389 | * This version of it has extra checks so that a block isn't cow'd more than | 497 | * This version of it has extra checks so that a block isn't cow'd more than |
@@ -411,9 +519,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, | |||
411 | WARN_ON(1); | 519 | WARN_ON(1); |
412 | } | 520 | } |
413 | 521 | ||
414 | if (btrfs_header_generation(buf) == trans->transid && | 522 | if (!should_cow_block(trans, root, buf)) { |
415 | btrfs_header_owner(buf) == root->root_key.objectid && | ||
416 | !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { | ||
417 | *cow_ret = buf; | 523 | *cow_ret = buf; |
418 | return 0; | 524 | return 0; |
419 | } | 525 | } |
@@ -469,7 +575,7 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) | |||
469 | /* | 575 | /* |
470 | * same as comp_keys only with two btrfs_key's | 576 | * same as comp_keys only with two btrfs_key's |
471 | */ | 577 | */ |
472 | static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) | 578 | int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) |
473 | { | 579 | { |
474 | if (k1->objectid > k2->objectid) | 580 | if (k1->objectid > k2->objectid) |
475 | return 1; | 581 | return 1; |
@@ -845,6 +951,12 @@ static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, | |||
845 | return -1; | 951 | return -1; |
846 | } | 952 | } |
847 | 953 | ||
954 | int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, | ||
955 | int level, int *slot) | ||
956 | { | ||
957 | return bin_search(eb, key, level, slot); | ||
958 | } | ||
959 | |||
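btrfs_bin_search() simply exports the existing bin_search() helper. A hypothetical caller sketch (eb and ino are assumed to be in scope; names are illustrative):

	struct btrfs_key key;
	int slot, ret;

	key.objectid = ino;			/* assumed inode number */
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_bin_search(eb, &key, btrfs_header_level(eb), &slot);
	/* ret == 0: exact match at 'slot'; ret == 1: 'slot' is the insert position. */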
848 | /* given a node and slot number, this reads the blocks it points to. The | 960 | /* given a node and slot number, this reads the blocks it points to. The |
849 | * extent buffer is returned with a reference taken (but unlocked). | 961 | * extent buffer is returned with a reference taken (but unlocked). |
850 | * NULL is returned on error. | 962 | * NULL is returned on error. |
@@ -921,13 +1033,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, | |||
921 | root->node = child; | 1033 | root->node = child; |
922 | spin_unlock(&root->node_lock); | 1034 | spin_unlock(&root->node_lock); |
923 | 1035 | ||
924 | ret = btrfs_update_extent_ref(trans, root, child->start, | ||
925 | child->len, | ||
926 | mid->start, child->start, | ||
927 | root->root_key.objectid, | ||
928 | trans->transid, level - 1); | ||
929 | BUG_ON(ret); | ||
930 | |||
931 | add_root_to_dirty_list(root); | 1036 | add_root_to_dirty_list(root); |
932 | btrfs_tree_unlock(child); | 1037 | btrfs_tree_unlock(child); |
933 | 1038 | ||
@@ -938,9 +1043,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, | |||
938 | /* once for the path */ | 1043 | /* once for the path */ |
939 | free_extent_buffer(mid); | 1044 | free_extent_buffer(mid); |
940 | ret = btrfs_free_extent(trans, root, mid->start, mid->len, | 1045 | ret = btrfs_free_extent(trans, root, mid->start, mid->len, |
941 | mid->start, root->root_key.objectid, | 1046 | 0, root->root_key.objectid, level, 1); |
942 | btrfs_header_generation(mid), | ||
943 | level, 1); | ||
944 | /* once for the root ptr */ | 1047 | /* once for the root ptr */ |
945 | free_extent_buffer(mid); | 1048 | free_extent_buffer(mid); |
946 | return ret; | 1049 | return ret; |
@@ -949,8 +1052,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, | |||
949 | BTRFS_NODEPTRS_PER_BLOCK(root) / 4) | 1052 | BTRFS_NODEPTRS_PER_BLOCK(root) / 4) |
950 | return 0; | 1053 | return 0; |
951 | 1054 | ||
952 | if (trans->transaction->delayed_refs.flushing && | 1055 | if (btrfs_header_nritems(mid) > 2) |
953 | btrfs_header_nritems(mid) > 2) | ||
954 | return 0; | 1056 | return 0; |
955 | 1057 | ||
956 | if (btrfs_header_nritems(mid) < 2) | 1058 | if (btrfs_header_nritems(mid) < 2) |
@@ -998,7 +1100,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, | |||
998 | ret = wret; | 1100 | ret = wret; |
999 | if (btrfs_header_nritems(right) == 0) { | 1101 | if (btrfs_header_nritems(right) == 0) { |
1000 | u64 bytenr = right->start; | 1102 | u64 bytenr = right->start; |
1001 | u64 generation = btrfs_header_generation(parent); | ||
1002 | u32 blocksize = right->len; | 1103 | u32 blocksize = right->len; |
1003 | 1104 | ||
1004 | clean_tree_block(trans, root, right); | 1105 | clean_tree_block(trans, root, right); |
@@ -1010,9 +1111,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, | |||
1010 | if (wret) | 1111 | if (wret) |
1011 | ret = wret; | 1112 | ret = wret; |
1012 | wret = btrfs_free_extent(trans, root, bytenr, | 1113 | wret = btrfs_free_extent(trans, root, bytenr, |
1013 | blocksize, parent->start, | 1114 | blocksize, 0, |
1014 | btrfs_header_owner(parent), | 1115 | root->root_key.objectid, |
1015 | generation, level, 1); | 1116 | level, 0); |
1016 | if (wret) | 1117 | if (wret) |
1017 | ret = wret; | 1118 | ret = wret; |
1018 | } else { | 1119 | } else { |
@@ -1047,7 +1148,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, | |||
1047 | } | 1148 | } |
1048 | if (btrfs_header_nritems(mid) == 0) { | 1149 | if (btrfs_header_nritems(mid) == 0) { |
1049 | /* we've managed to empty the middle node, drop it */ | 1150 | /* we've managed to empty the middle node, drop it */ |
1050 | u64 root_gen = btrfs_header_generation(parent); | ||
1051 | u64 bytenr = mid->start; | 1151 | u64 bytenr = mid->start; |
1052 | u32 blocksize = mid->len; | 1152 | u32 blocksize = mid->len; |
1053 | 1153 | ||
@@ -1059,9 +1159,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, | |||
1059 | if (wret) | 1159 | if (wret) |
1060 | ret = wret; | 1160 | ret = wret; |
1061 | wret = btrfs_free_extent(trans, root, bytenr, blocksize, | 1161 | wret = btrfs_free_extent(trans, root, bytenr, blocksize, |
1062 | parent->start, | 1162 | 0, root->root_key.objectid, |
1063 | btrfs_header_owner(parent), | 1163 | level, 0); |
1064 | root_gen, level, 1); | ||
1065 | if (wret) | 1164 | if (wret) |
1066 | ret = wret; | 1165 | ret = wret; |
1067 | } else { | 1166 | } else { |
@@ -1437,7 +1536,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) | |||
1437 | { | 1536 | { |
1438 | int i; | 1537 | int i; |
1439 | 1538 | ||
1440 | if (path->keep_locks || path->lowest_level) | 1539 | if (path->keep_locks) |
1441 | return; | 1540 | return; |
1442 | 1541 | ||
1443 | for (i = level; i < BTRFS_MAX_LEVEL; i++) { | 1542 | for (i = level; i < BTRFS_MAX_LEVEL; i++) { |
@@ -1552,7 +1651,7 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, | |||
1552 | } | 1651 | } |
1553 | b = p->nodes[level]; | 1652 | b = p->nodes[level]; |
1554 | } else if (ins_len < 0 && btrfs_header_nritems(b) < | 1653 | } else if (ins_len < 0 && btrfs_header_nritems(b) < |
1555 | BTRFS_NODEPTRS_PER_BLOCK(root) / 4) { | 1654 | BTRFS_NODEPTRS_PER_BLOCK(root) / 2) { |
1556 | int sret; | 1655 | int sret; |
1557 | 1656 | ||
1558 | sret = reada_for_balance(root, p, level); | 1657 | sret = reada_for_balance(root, p, level); |
@@ -1614,10 +1713,17 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root | |||
1614 | lowest_unlock = 2; | 1713 | lowest_unlock = 2; |
1615 | 1714 | ||
1616 | again: | 1715 | again: |
1617 | if (p->skip_locking) | 1716 | if (p->search_commit_root) { |
1618 | b = btrfs_root_node(root); | 1717 | b = root->commit_root; |
1619 | else | 1718 | extent_buffer_get(b); |
1620 | b = btrfs_lock_root_node(root); | 1719 | if (!p->skip_locking) |
1720 | btrfs_tree_lock(b); | ||
1721 | } else { | ||
1722 | if (p->skip_locking) | ||
1723 | b = btrfs_root_node(root); | ||
1724 | else | ||
1725 | b = btrfs_lock_root_node(root); | ||
1726 | } | ||
1621 | 1727 | ||
1622 | while (b) { | 1728 | while (b) { |
1623 | level = btrfs_header_level(b); | 1729 | level = btrfs_header_level(b); |
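The new search_commit_root path flag lets read-only lookups run against the last committed root instead of the live tree. A hypothetical caller sketch, assuming a read-only search with no transaction handle (key and root are assumed to be in scope):

	struct btrfs_path *path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->search_commit_root = 1;	/* walk root->commit_root */
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	/* ... read items ... */
	btrfs_free_path(path);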
@@ -1638,11 +1744,9 @@ again: | |||
1638 | * then we don't want to set the path blocking, | 1744 | * then we don't want to set the path blocking, |
1639 | * so we test it here | 1745 | * so we test it here |
1640 | */ | 1746 | */ |
1641 | if (btrfs_header_generation(b) == trans->transid && | 1747 | if (!should_cow_block(trans, root, b)) |
1642 | btrfs_header_owner(b) == root->root_key.objectid && | ||
1643 | !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { | ||
1644 | goto cow_done; | 1748 | goto cow_done; |
1645 | } | 1749 | |
1646 | btrfs_set_path_blocking(p); | 1750 | btrfs_set_path_blocking(p); |
1647 | 1751 | ||
1648 | wret = btrfs_cow_block(trans, root, b, | 1752 | wret = btrfs_cow_block(trans, root, b, |
@@ -1764,138 +1868,6 @@ done: | |||
1764 | return ret; | 1868 | return ret; |
1765 | } | 1869 | } |
1766 | 1870 | ||
1767 | int btrfs_merge_path(struct btrfs_trans_handle *trans, | ||
1768 | struct btrfs_root *root, | ||
1769 | struct btrfs_key *node_keys, | ||
1770 | u64 *nodes, int lowest_level) | ||
1771 | { | ||
1772 | struct extent_buffer *eb; | ||
1773 | struct extent_buffer *parent; | ||
1774 | struct btrfs_key key; | ||
1775 | u64 bytenr; | ||
1776 | u64 generation; | ||
1777 | u32 blocksize; | ||
1778 | int level; | ||
1779 | int slot; | ||
1780 | int key_match; | ||
1781 | int ret; | ||
1782 | |||
1783 | eb = btrfs_lock_root_node(root); | ||
1784 | ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb); | ||
1785 | BUG_ON(ret); | ||
1786 | |||
1787 | btrfs_set_lock_blocking(eb); | ||
1788 | |||
1789 | parent = eb; | ||
1790 | while (1) { | ||
1791 | level = btrfs_header_level(parent); | ||
1792 | if (level == 0 || level <= lowest_level) | ||
1793 | break; | ||
1794 | |||
1795 | ret = bin_search(parent, &node_keys[lowest_level], level, | ||
1796 | &slot); | ||
1797 | if (ret && slot > 0) | ||
1798 | slot--; | ||
1799 | |||
1800 | bytenr = btrfs_node_blockptr(parent, slot); | ||
1801 | if (nodes[level - 1] == bytenr) | ||
1802 | break; | ||
1803 | |||
1804 | blocksize = btrfs_level_size(root, level - 1); | ||
1805 | generation = btrfs_node_ptr_generation(parent, slot); | ||
1806 | btrfs_node_key_to_cpu(eb, &key, slot); | ||
1807 | key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key)); | ||
1808 | |||
1809 | if (generation == trans->transid) { | ||
1810 | eb = read_tree_block(root, bytenr, blocksize, | ||
1811 | generation); | ||
1812 | btrfs_tree_lock(eb); | ||
1813 | btrfs_set_lock_blocking(eb); | ||
1814 | } | ||
1815 | |||
1816 | /* | ||
1817 | * if node keys match and node pointer hasn't been modified | ||
1818 | * in the running transaction, we can merge the path. for | ||
1819 | * blocks owned by reloc trees, the node pointer check is | ||
1820 | * skipped, this is because these blocks are fully controlled | ||
1821 | * by the space balance code, no one else can modify them. | ||
1822 | */ | ||
1823 | if (!nodes[level - 1] || !key_match || | ||
1824 | (generation == trans->transid && | ||
1825 | btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) { | ||
1826 | if (level == 1 || level == lowest_level + 1) { | ||
1827 | if (generation == trans->transid) { | ||
1828 | btrfs_tree_unlock(eb); | ||
1829 | free_extent_buffer(eb); | ||
1830 | } | ||
1831 | break; | ||
1832 | } | ||
1833 | |||
1834 | if (generation != trans->transid) { | ||
1835 | eb = read_tree_block(root, bytenr, blocksize, | ||
1836 | generation); | ||
1837 | btrfs_tree_lock(eb); | ||
1838 | btrfs_set_lock_blocking(eb); | ||
1839 | } | ||
1840 | |||
1841 | ret = btrfs_cow_block(trans, root, eb, parent, slot, | ||
1842 | &eb); | ||
1843 | BUG_ON(ret); | ||
1844 | |||
1845 | if (root->root_key.objectid == | ||
1846 | BTRFS_TREE_RELOC_OBJECTID) { | ||
1847 | if (!nodes[level - 1]) { | ||
1848 | nodes[level - 1] = eb->start; | ||
1849 | memcpy(&node_keys[level - 1], &key, | ||
1850 | sizeof(node_keys[0])); | ||
1851 | } else { | ||
1852 | WARN_ON(1); | ||
1853 | } | ||
1854 | } | ||
1855 | |||
1856 | btrfs_tree_unlock(parent); | ||
1857 | free_extent_buffer(parent); | ||
1858 | parent = eb; | ||
1859 | continue; | ||
1860 | } | ||
1861 | |||
1862 | btrfs_set_node_blockptr(parent, slot, nodes[level - 1]); | ||
1863 | btrfs_set_node_ptr_generation(parent, slot, trans->transid); | ||
1864 | btrfs_mark_buffer_dirty(parent); | ||
1865 | |||
1866 | ret = btrfs_inc_extent_ref(trans, root, | ||
1867 | nodes[level - 1], | ||
1868 | blocksize, parent->start, | ||
1869 | btrfs_header_owner(parent), | ||
1870 | btrfs_header_generation(parent), | ||
1871 | level - 1); | ||
1872 | BUG_ON(ret); | ||
1873 | |||
1874 | /* | ||
1875 | * If the block was created in the running transaction, | ||
1876 | * it's possible this is the last reference to it, so we | ||
1877 | * should drop the subtree. | ||
1878 | */ | ||
1879 | if (generation == trans->transid) { | ||
1880 | ret = btrfs_drop_subtree(trans, root, eb, parent); | ||
1881 | BUG_ON(ret); | ||
1882 | btrfs_tree_unlock(eb); | ||
1883 | free_extent_buffer(eb); | ||
1884 | } else { | ||
1885 | ret = btrfs_free_extent(trans, root, bytenr, | ||
1886 | blocksize, parent->start, | ||
1887 | btrfs_header_owner(parent), | ||
1888 | btrfs_header_generation(parent), | ||
1889 | level - 1, 1); | ||
1890 | BUG_ON(ret); | ||
1891 | } | ||
1892 | break; | ||
1893 | } | ||
1894 | btrfs_tree_unlock(parent); | ||
1895 | free_extent_buffer(parent); | ||
1896 | return 0; | ||
1897 | } | ||
1898 | |||
1899 | /* | 1871 | /* |
1900 | * adjust the pointers going up the tree, starting at level | 1872 | * adjust the pointers going up the tree, starting at level |
1901 | * making sure the right key of each node points to 'key'. | 1873 | * making sure the right key of each node points to 'key'. |
@@ -2021,9 +1993,6 @@ static int push_node_left(struct btrfs_trans_handle *trans, | |||
2021 | btrfs_mark_buffer_dirty(src); | 1993 | btrfs_mark_buffer_dirty(src); |
2022 | btrfs_mark_buffer_dirty(dst); | 1994 | btrfs_mark_buffer_dirty(dst); |
2023 | 1995 | ||
2024 | ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items); | ||
2025 | BUG_ON(ret); | ||
2026 | |||
2027 | return ret; | 1996 | return ret; |
2028 | } | 1997 | } |
2029 | 1998 | ||
@@ -2083,9 +2052,6 @@ static int balance_node_right(struct btrfs_trans_handle *trans, | |||
2083 | btrfs_mark_buffer_dirty(src); | 2052 | btrfs_mark_buffer_dirty(src); |
2084 | btrfs_mark_buffer_dirty(dst); | 2053 | btrfs_mark_buffer_dirty(dst); |
2085 | 2054 | ||
2086 | ret = btrfs_update_ref(trans, root, src, dst, 0, push_items); | ||
2087 | BUG_ON(ret); | ||
2088 | |||
2089 | return ret; | 2055 | return ret; |
2090 | } | 2056 | } |
2091 | 2057 | ||
@@ -2105,7 +2071,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, | |||
2105 | struct extent_buffer *c; | 2071 | struct extent_buffer *c; |
2106 | struct extent_buffer *old; | 2072 | struct extent_buffer *old; |
2107 | struct btrfs_disk_key lower_key; | 2073 | struct btrfs_disk_key lower_key; |
2108 | int ret; | ||
2109 | 2074 | ||
2110 | BUG_ON(path->nodes[level]); | 2075 | BUG_ON(path->nodes[level]); |
2111 | BUG_ON(path->nodes[level-1] != root->node); | 2076 | BUG_ON(path->nodes[level-1] != root->node); |
@@ -2117,16 +2082,17 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, | |||
2117 | btrfs_node_key(lower, &lower_key, 0); | 2082 | btrfs_node_key(lower, &lower_key, 0); |
2118 | 2083 | ||
2119 | c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, | 2084 | c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, |
2120 | root->root_key.objectid, trans->transid, | 2085 | root->root_key.objectid, &lower_key, |
2121 | level, root->node->start, 0); | 2086 | level, root->node->start, 0); |
2122 | if (IS_ERR(c)) | 2087 | if (IS_ERR(c)) |
2123 | return PTR_ERR(c); | 2088 | return PTR_ERR(c); |
2124 | 2089 | ||
2125 | memset_extent_buffer(c, 0, 0, root->nodesize); | 2090 | memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); |
2126 | btrfs_set_header_nritems(c, 1); | 2091 | btrfs_set_header_nritems(c, 1); |
2127 | btrfs_set_header_level(c, level); | 2092 | btrfs_set_header_level(c, level); |
2128 | btrfs_set_header_bytenr(c, c->start); | 2093 | btrfs_set_header_bytenr(c, c->start); |
2129 | btrfs_set_header_generation(c, trans->transid); | 2094 | btrfs_set_header_generation(c, trans->transid); |
2095 | btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV); | ||
2130 | btrfs_set_header_owner(c, root->root_key.objectid); | 2096 | btrfs_set_header_owner(c, root->root_key.objectid); |
2131 | 2097 | ||
2132 | write_extent_buffer(c, root->fs_info->fsid, | 2098 | write_extent_buffer(c, root->fs_info->fsid, |
@@ -2151,12 +2117,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, | |||
2151 | root->node = c; | 2117 | root->node = c; |
2152 | spin_unlock(&root->node_lock); | 2118 | spin_unlock(&root->node_lock); |
2153 | 2119 | ||
2154 | ret = btrfs_update_extent_ref(trans, root, lower->start, | ||
2155 | lower->len, lower->start, c->start, | ||
2156 | root->root_key.objectid, | ||
2157 | trans->transid, level - 1); | ||
2158 | BUG_ON(ret); | ||
2159 | |||
2160 | /* the super has an extra ref to root->node */ | 2120 | /* the super has an extra ref to root->node */ |
2161 | free_extent_buffer(old); | 2121 | free_extent_buffer(old); |
2162 | 2122 | ||
@@ -2233,7 +2193,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, | |||
2233 | ret = insert_new_root(trans, root, path, level + 1); | 2193 | ret = insert_new_root(trans, root, path, level + 1); |
2234 | if (ret) | 2194 | if (ret) |
2235 | return ret; | 2195 | return ret; |
2236 | } else if (!trans->transaction->delayed_refs.flushing) { | 2196 | } else { |
2237 | ret = push_nodes_for_insert(trans, root, path, level); | 2197 | ret = push_nodes_for_insert(trans, root, path, level); |
2238 | c = path->nodes[level]; | 2198 | c = path->nodes[level]; |
2239 | if (!ret && btrfs_header_nritems(c) < | 2199 | if (!ret && btrfs_header_nritems(c) < |
@@ -2244,20 +2204,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans, | |||
2244 | } | 2204 | } |
2245 | 2205 | ||
2246 | c_nritems = btrfs_header_nritems(c); | 2206 | c_nritems = btrfs_header_nritems(c); |
2207 | mid = (c_nritems + 1) / 2; | ||
2208 | btrfs_node_key(c, &disk_key, mid); | ||
2247 | 2209 | ||
2248 | split = btrfs_alloc_free_block(trans, root, root->nodesize, | 2210 | split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, |
2249 | path->nodes[level + 1]->start, | ||
2250 | root->root_key.objectid, | 2211 | root->root_key.objectid, |
2251 | trans->transid, level, c->start, 0); | 2212 | &disk_key, level, c->start, 0); |
2252 | if (IS_ERR(split)) | 2213 | if (IS_ERR(split)) |
2253 | return PTR_ERR(split); | 2214 | return PTR_ERR(split); |
2254 | 2215 | ||
2255 | btrfs_set_header_flags(split, btrfs_header_flags(c)); | 2216 | memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); |
2256 | btrfs_set_header_level(split, btrfs_header_level(c)); | 2217 | btrfs_set_header_level(split, btrfs_header_level(c)); |
2257 | btrfs_set_header_bytenr(split, split->start); | 2218 | btrfs_set_header_bytenr(split, split->start); |
2258 | btrfs_set_header_generation(split, trans->transid); | 2219 | btrfs_set_header_generation(split, trans->transid); |
2220 | btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV); | ||
2259 | btrfs_set_header_owner(split, root->root_key.objectid); | 2221 | btrfs_set_header_owner(split, root->root_key.objectid); |
2260 | btrfs_set_header_flags(split, 0); | ||
2261 | write_extent_buffer(split, root->fs_info->fsid, | 2222 | write_extent_buffer(split, root->fs_info->fsid, |
2262 | (unsigned long)btrfs_header_fsid(split), | 2223 | (unsigned long)btrfs_header_fsid(split), |
2263 | BTRFS_FSID_SIZE); | 2224 | BTRFS_FSID_SIZE); |
@@ -2265,7 +2226,6 @@ static noinline int split_node(struct btrfs_trans_handle *trans, | |||
2265 | (unsigned long)btrfs_header_chunk_tree_uuid(split), | 2226 | (unsigned long)btrfs_header_chunk_tree_uuid(split), |
2266 | BTRFS_UUID_SIZE); | 2227 | BTRFS_UUID_SIZE); |
2267 | 2228 | ||
2268 | mid = (c_nritems + 1) / 2; | ||
2269 | 2229 | ||
2270 | copy_extent_buffer(split, c, | 2230 | copy_extent_buffer(split, c, |
2271 | btrfs_node_key_ptr_offset(0), | 2231 | btrfs_node_key_ptr_offset(0), |
@@ -2278,16 +2238,12 @@ static noinline int split_node(struct btrfs_trans_handle *trans, | |||
2278 | btrfs_mark_buffer_dirty(c); | 2238 | btrfs_mark_buffer_dirty(c); |
2279 | btrfs_mark_buffer_dirty(split); | 2239 | btrfs_mark_buffer_dirty(split); |
2280 | 2240 | ||
2281 | btrfs_node_key(split, &disk_key, 0); | ||
2282 | wret = insert_ptr(trans, root, path, &disk_key, split->start, | 2241 | wret = insert_ptr(trans, root, path, &disk_key, split->start, |
2283 | path->slots[level + 1] + 1, | 2242 | path->slots[level + 1] + 1, |
2284 | level + 1); | 2243 | level + 1); |
2285 | if (wret) | 2244 | if (wret) |
2286 | ret = wret; | 2245 | ret = wret; |
2287 | 2246 | ||
2288 | ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid); | ||
2289 | BUG_ON(ret); | ||
2290 | |||
2291 | if (path->slots[level] >= mid) { | 2247 | if (path->slots[level] >= mid) { |
2292 | path->slots[level] -= mid; | 2248 | path->slots[level] -= mid; |
2293 | btrfs_tree_unlock(c); | 2249 | btrfs_tree_unlock(c); |
@@ -2360,7 +2316,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, | |||
2360 | u32 right_nritems; | 2316 | u32 right_nritems; |
2361 | u32 data_end; | 2317 | u32 data_end; |
2362 | u32 this_item_size; | 2318 | u32 this_item_size; |
2363 | int ret; | ||
2364 | 2319 | ||
2365 | if (empty) | 2320 | if (empty) |
2366 | nr = 0; | 2321 | nr = 0; |
@@ -2473,9 +2428,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, | |||
2473 | btrfs_mark_buffer_dirty(left); | 2428 | btrfs_mark_buffer_dirty(left); |
2474 | btrfs_mark_buffer_dirty(right); | 2429 | btrfs_mark_buffer_dirty(right); |
2475 | 2430 | ||
2476 | ret = btrfs_update_ref(trans, root, left, right, 0, push_items); | ||
2477 | BUG_ON(ret); | ||
2478 | |||
2479 | btrfs_item_key(right, &disk_key, 0); | 2431 | btrfs_item_key(right, &disk_key, 0); |
2480 | btrfs_set_node_key(upper, &disk_key, slot + 1); | 2432 | btrfs_set_node_key(upper, &disk_key, slot + 1); |
2481 | btrfs_mark_buffer_dirty(upper); | 2433 | btrfs_mark_buffer_dirty(upper); |
@@ -2720,10 +2672,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, | |||
2720 | if (right_nritems) | 2672 | if (right_nritems) |
2721 | btrfs_mark_buffer_dirty(right); | 2673 | btrfs_mark_buffer_dirty(right); |
2722 | 2674 | ||
2723 | ret = btrfs_update_ref(trans, root, right, left, | ||
2724 | old_left_nritems, push_items); | ||
2725 | BUG_ON(ret); | ||
2726 | |||
2727 | btrfs_item_key(right, &disk_key, 0); | 2675 | btrfs_item_key(right, &disk_key, 0); |
2728 | wret = fixup_low_keys(trans, root, path, &disk_key, 1); | 2676 | wret = fixup_low_keys(trans, root, path, &disk_key, 1); |
2729 | if (wret) | 2677 | if (wret) |
@@ -2880,9 +2828,6 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, | |||
2880 | btrfs_mark_buffer_dirty(l); | 2828 | btrfs_mark_buffer_dirty(l); |
2881 | BUG_ON(path->slots[0] != slot); | 2829 | BUG_ON(path->slots[0] != slot); |
2882 | 2830 | ||
2883 | ret = btrfs_update_ref(trans, root, l, right, 0, nritems); | ||
2884 | BUG_ON(ret); | ||
2885 | |||
2886 | if (mid <= slot) { | 2831 | if (mid <= slot) { |
2887 | btrfs_tree_unlock(path->nodes[0]); | 2832 | btrfs_tree_unlock(path->nodes[0]); |
2888 | free_extent_buffer(path->nodes[0]); | 2833 | free_extent_buffer(path->nodes[0]); |
@@ -2911,6 +2856,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, | |||
2911 | struct btrfs_path *path, int data_size, | 2856 | struct btrfs_path *path, int data_size, |
2912 | int extend) | 2857 | int extend) |
2913 | { | 2858 | { |
2859 | struct btrfs_disk_key disk_key; | ||
2914 | struct extent_buffer *l; | 2860 | struct extent_buffer *l; |
2915 | u32 nritems; | 2861 | u32 nritems; |
2916 | int mid; | 2862 | int mid; |
@@ -2918,12 +2864,11 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, | |||
2918 | struct extent_buffer *right; | 2864 | struct extent_buffer *right; |
2919 | int ret = 0; | 2865 | int ret = 0; |
2920 | int wret; | 2866 | int wret; |
2921 | int double_split; | 2867 | int split; |
2922 | int num_doubles = 0; | 2868 | int num_doubles = 0; |
2923 | 2869 | ||
2924 | /* first try to make some room by pushing left and right */ | 2870 | /* first try to make some room by pushing left and right */ |
2925 | if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY && | 2871 | if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { |
2926 | !trans->transaction->delayed_refs.flushing) { | ||
2927 | wret = push_leaf_right(trans, root, path, data_size, 0); | 2872 | wret = push_leaf_right(trans, root, path, data_size, 0); |
2928 | if (wret < 0) | 2873 | if (wret < 0) |
2929 | return wret; | 2874 | return wret; |
@@ -2945,16 +2890,53 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, | |||
2945 | return ret; | 2890 | return ret; |
2946 | } | 2891 | } |
2947 | again: | 2892 | again: |
2948 | double_split = 0; | 2893 | split = 1; |
2949 | l = path->nodes[0]; | 2894 | l = path->nodes[0]; |
2950 | slot = path->slots[0]; | 2895 | slot = path->slots[0]; |
2951 | nritems = btrfs_header_nritems(l); | 2896 | nritems = btrfs_header_nritems(l); |
2952 | mid = (nritems + 1) / 2; | 2897 | mid = (nritems + 1) / 2; |
2953 | 2898 | ||
2954 | right = btrfs_alloc_free_block(trans, root, root->leafsize, | 2899 | if (mid <= slot) { |
2955 | path->nodes[1]->start, | 2900 | if (nritems == 1 || |
2901 | leaf_space_used(l, mid, nritems - mid) + data_size > | ||
2902 | BTRFS_LEAF_DATA_SIZE(root)) { | ||
2903 | if (slot >= nritems) { | ||
2904 | split = 0; | ||
2905 | } else { | ||
2906 | mid = slot; | ||
2907 | if (mid != nritems && | ||
2908 | leaf_space_used(l, mid, nritems - mid) + | ||
2909 | data_size > BTRFS_LEAF_DATA_SIZE(root)) { | ||
2910 | split = 2; | ||
2911 | } | ||
2912 | } | ||
2913 | } | ||
2914 | } else { | ||
2915 | if (leaf_space_used(l, 0, mid) + data_size > | ||
2916 | BTRFS_LEAF_DATA_SIZE(root)) { | ||
2917 | if (!extend && data_size && slot == 0) { | ||
2918 | split = 0; | ||
2919 | } else if ((extend || !data_size) && slot == 0) { | ||
2920 | mid = 1; | ||
2921 | } else { | ||
2922 | mid = slot; | ||
2923 | if (mid != nritems && | ||
2924 | leaf_space_used(l, mid, nritems - mid) + | ||
2925 | data_size > BTRFS_LEAF_DATA_SIZE(root)) { | ||
2926 | split = 2; | ||
2927 | } | ||
2928 | } | ||
2929 | } | ||
2930 | } | ||
2931 | |||
2932 | if (split == 0) | ||
2933 | btrfs_cpu_key_to_disk(&disk_key, ins_key); | ||
2934 | else | ||
2935 | btrfs_item_key(l, &disk_key, mid); | ||
2936 | |||
2937 | right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, | ||
2956 | root->root_key.objectid, | 2938 | root->root_key.objectid, |
2957 | trans->transid, 0, l->start, 0); | 2939 | &disk_key, 0, l->start, 0); |
2958 | if (IS_ERR(right)) { | 2940 | if (IS_ERR(right)) { |
2959 | BUG_ON(1); | 2941 | BUG_ON(1); |
2960 | return PTR_ERR(right); | 2942 | return PTR_ERR(right); |
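A rough reading of the 'split' selector computed above (hedged paraphrase, not from the patch):

	/*
	 * split == 0: no existing items move; the empty right leaf is linked
	 *             in and receives only the item being inserted.
	 * split == 1: the usual case; items [mid, nritems) are copied to the
	 *             new right leaf.
	 * split == 2: even after splitting at 'mid' the target half cannot
	 *             hold the new item, so a second split pass is taken
	 *             (num_doubles guards against looping).
	 */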
@@ -2963,6 +2945,7 @@ again: | |||
2963 | memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); | 2945 | memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); |
2964 | btrfs_set_header_bytenr(right, right->start); | 2946 | btrfs_set_header_bytenr(right, right->start); |
2965 | btrfs_set_header_generation(right, trans->transid); | 2947 | btrfs_set_header_generation(right, trans->transid); |
2948 | btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV); | ||
2966 | btrfs_set_header_owner(right, root->root_key.objectid); | 2949 | btrfs_set_header_owner(right, root->root_key.objectid); |
2967 | btrfs_set_header_level(right, 0); | 2950 | btrfs_set_header_level(right, 0); |
2968 | write_extent_buffer(right, root->fs_info->fsid, | 2951 | write_extent_buffer(right, root->fs_info->fsid, |
@@ -2973,79 +2956,47 @@ again: | |||
2973 | (unsigned long)btrfs_header_chunk_tree_uuid(right), | 2956 | (unsigned long)btrfs_header_chunk_tree_uuid(right), |
2974 | BTRFS_UUID_SIZE); | 2957 | BTRFS_UUID_SIZE); |
2975 | 2958 | ||
2976 | if (mid <= slot) { | 2959 | if (split == 0) { |
2977 | if (nritems == 1 || | 2960 | if (mid <= slot) { |
2978 | leaf_space_used(l, mid, nritems - mid) + data_size > | 2961 | btrfs_set_header_nritems(right, 0); |
2979 | BTRFS_LEAF_DATA_SIZE(root)) { | 2962 | wret = insert_ptr(trans, root, path, |
2980 | if (slot >= nritems) { | 2963 | &disk_key, right->start, |
2981 | struct btrfs_disk_key disk_key; | 2964 | path->slots[1] + 1, 1); |
2982 | 2965 | if (wret) | |
2983 | btrfs_cpu_key_to_disk(&disk_key, ins_key); | 2966 | ret = wret; |
2984 | btrfs_set_header_nritems(right, 0); | ||
2985 | wret = insert_ptr(trans, root, path, | ||
2986 | &disk_key, right->start, | ||
2987 | path->slots[1] + 1, 1); | ||
2988 | if (wret) | ||
2989 | ret = wret; | ||
2990 | 2967 | ||
2991 | btrfs_tree_unlock(path->nodes[0]); | 2968 | btrfs_tree_unlock(path->nodes[0]); |
2992 | free_extent_buffer(path->nodes[0]); | 2969 | free_extent_buffer(path->nodes[0]); |
2993 | path->nodes[0] = right; | 2970 | path->nodes[0] = right; |
2994 | path->slots[0] = 0; | 2971 | path->slots[0] = 0; |
2995 | path->slots[1] += 1; | 2972 | path->slots[1] += 1; |
2996 | btrfs_mark_buffer_dirty(right); | 2973 | } else { |
2997 | return ret; | 2974 | btrfs_set_header_nritems(right, 0); |
2998 | } | 2975 | wret = insert_ptr(trans, root, path, |
2999 | mid = slot; | 2976 | &disk_key, |
3000 | if (mid != nritems && | 2977 | right->start, |
3001 | leaf_space_used(l, mid, nritems - mid) + | 2978 | path->slots[1], 1); |
3002 | data_size > BTRFS_LEAF_DATA_SIZE(root)) { | 2979 | if (wret) |
3003 | double_split = 1; | 2980 | ret = wret; |
3004 | } | 2981 | btrfs_tree_unlock(path->nodes[0]); |
3005 | } | 2982 | free_extent_buffer(path->nodes[0]); |
3006 | } else { | 2983 | path->nodes[0] = right; |
3007 | if (leaf_space_used(l, 0, mid) + data_size > | 2984 | path->slots[0] = 0; |
3008 | BTRFS_LEAF_DATA_SIZE(root)) { | 2985 | if (path->slots[1] == 0) { |
3009 | if (!extend && data_size && slot == 0) { | 2986 | wret = fixup_low_keys(trans, root, |
3010 | struct btrfs_disk_key disk_key; | 2987 | path, &disk_key, 1); |
3011 | |||
3012 | btrfs_cpu_key_to_disk(&disk_key, ins_key); | ||
3013 | btrfs_set_header_nritems(right, 0); | ||
3014 | wret = insert_ptr(trans, root, path, | ||
3015 | &disk_key, | ||
3016 | right->start, | ||
3017 | path->slots[1], 1); | ||
3018 | if (wret) | 2988 | if (wret) |
3019 | ret = wret; | 2989 | ret = wret; |
3020 | btrfs_tree_unlock(path->nodes[0]); | ||
3021 | free_extent_buffer(path->nodes[0]); | ||
3022 | path->nodes[0] = right; | ||
3023 | path->slots[0] = 0; | ||
3024 | if (path->slots[1] == 0) { | ||
3025 | wret = fixup_low_keys(trans, root, | ||
3026 | path, &disk_key, 1); | ||
3027 | if (wret) | ||
3028 | ret = wret; | ||
3029 | } | ||
3030 | btrfs_mark_buffer_dirty(right); | ||
3031 | return ret; | ||
3032 | } else if ((extend || !data_size) && slot == 0) { | ||
3033 | mid = 1; | ||
3034 | } else { | ||
3035 | mid = slot; | ||
3036 | if (mid != nritems && | ||
3037 | leaf_space_used(l, mid, nritems - mid) + | ||
3038 | data_size > BTRFS_LEAF_DATA_SIZE(root)) { | ||
3039 | double_split = 1; | ||
3040 | } | ||
3041 | } | 2990 | } |
3042 | } | 2991 | } |
2992 | btrfs_mark_buffer_dirty(right); | ||
2993 | return ret; | ||
3043 | } | 2994 | } |
3044 | 2995 | ||
3045 | ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems); | 2996 | ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems); |
3046 | BUG_ON(ret); | 2997 | BUG_ON(ret); |
3047 | 2998 | ||
3048 | if (double_split) { | 2999 | if (split == 2) { |
3049 | BUG_ON(num_doubles != 0); | 3000 | BUG_ON(num_doubles != 0); |
3050 | num_doubles++; | 3001 | num_doubles++; |
3051 | goto again; | 3002 | goto again; |
@@ -3447,7 +3398,7 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, | |||
3447 | /* figure out how many keys we can insert in here */ | 3398 | /* figure out how many keys we can insert in here */ |
3448 | total_data = data_size[0]; | 3399 | total_data = data_size[0]; |
3449 | for (i = 1; i < nr; i++) { | 3400 | for (i = 1; i < nr; i++) { |
3450 | if (comp_cpu_keys(&found_key, cpu_key + i) <= 0) | 3401 | if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0) |
3451 | break; | 3402 | break; |
3452 | total_data += data_size[i]; | 3403 | total_data += data_size[i]; |
3453 | } | 3404 | } |
@@ -3745,9 +3696,7 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, | |||
3745 | 3696 | ||
3746 | /* | 3697 | /* |
3747 | * a helper function to delete the leaf pointed to by path->slots[1] and | 3698 | * a helper function to delete the leaf pointed to by path->slots[1] and |
3748 | * path->nodes[1]. bytenr is the node block pointer, but since the callers | 3699 | * path->nodes[1]. |
3749 | * already know it, it is faster to have them pass it down than to | ||
3750 | * read it out of the node again. | ||
3751 | * | 3700 | * |
3752 | * This deletes the pointer in path->nodes[1] and frees the leaf | 3701 | * This deletes the pointer in path->nodes[1] and frees the leaf |
3753 | * block extent. zero is returned if it all worked out, < 0 otherwise. | 3702 | * block extent. zero is returned if it all worked out, < 0 otherwise. |
@@ -3755,15 +3704,14 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, | |||
3755 | * The path must have already been setup for deleting the leaf, including | 3704 | * The path must have already been setup for deleting the leaf, including |
3756 | * all the proper balancing. path->nodes[1] must be locked. | 3705 | * all the proper balancing. path->nodes[1] must be locked. |
3757 | */ | 3706 | */ |
3758 | noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, | 3707 | static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, |
3759 | struct btrfs_root *root, | 3708 | struct btrfs_root *root, |
3760 | struct btrfs_path *path, u64 bytenr) | 3709 | struct btrfs_path *path, |
3710 | struct extent_buffer *leaf) | ||
3761 | { | 3711 | { |
3762 | int ret; | 3712 | int ret; |
3763 | u64 root_gen = btrfs_header_generation(path->nodes[1]); | ||
3764 | u64 parent_start = path->nodes[1]->start; | ||
3765 | u64 parent_owner = btrfs_header_owner(path->nodes[1]); | ||
3766 | 3713 | ||
3714 | WARN_ON(btrfs_header_generation(leaf) != trans->transid); | ||
3767 | ret = del_ptr(trans, root, path, 1, path->slots[1]); | 3715 | ret = del_ptr(trans, root, path, 1, path->slots[1]); |
3768 | if (ret) | 3716 | if (ret) |
3769 | return ret; | 3717 | return ret; |
@@ -3774,10 +3722,8 @@ noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, | |||
3774 | */ | 3722 | */ |
3775 | btrfs_unlock_up_safe(path, 0); | 3723 | btrfs_unlock_up_safe(path, 0); |
3776 | 3724 | ||
3777 | ret = btrfs_free_extent(trans, root, bytenr, | 3725 | ret = btrfs_free_extent(trans, root, leaf->start, leaf->len, |
3778 | btrfs_level_size(root, 0), | 3726 | 0, root->root_key.objectid, 0, 0); |
3779 | parent_start, parent_owner, | ||
3780 | root_gen, 0, 1); | ||
3781 | return ret; | 3727 | return ret; |
3782 | } | 3728 | } |
3783 | /* | 3729 | /* |
@@ -3845,7 +3791,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, | |||
3845 | if (leaf == root->node) { | 3791 | if (leaf == root->node) { |
3846 | btrfs_set_header_level(leaf, 0); | 3792 | btrfs_set_header_level(leaf, 0); |
3847 | } else { | 3793 | } else { |
3848 | ret = btrfs_del_leaf(trans, root, path, leaf->start); | 3794 | ret = btrfs_del_leaf(trans, root, path, leaf); |
3849 | BUG_ON(ret); | 3795 | BUG_ON(ret); |
3850 | } | 3796 | } |
3851 | } else { | 3797 | } else { |
@@ -3861,8 +3807,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, | |||
3861 | } | 3807 | } |
3862 | 3808 | ||
3863 | /* delete the leaf if it is mostly empty */ | 3809 | /* delete the leaf if it is mostly empty */ |
3864 | if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 && | 3810 | if (used < BTRFS_LEAF_DATA_SIZE(root) / 2) { |
3865 | !trans->transaction->delayed_refs.flushing) { | ||
3866 | /* push_leaf_left fixes the path. | 3811 | /* push_leaf_left fixes the path. |
3867 | * make sure the path still points to our leaf | 3812 | * make sure the path still points to our leaf |
3868 | * for possible call to del_ptr below | 3813 | * for possible call to del_ptr below |
@@ -3884,8 +3829,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, | |||
3884 | 3829 | ||
3885 | if (btrfs_header_nritems(leaf) == 0) { | 3830 | if (btrfs_header_nritems(leaf) == 0) { |
3886 | path->slots[1] = slot; | 3831 | path->slots[1] = slot; |
3887 | ret = btrfs_del_leaf(trans, root, path, | 3832 | ret = btrfs_del_leaf(trans, root, path, leaf); |
3888 | leaf->start); | ||
3889 | BUG_ON(ret); | 3833 | BUG_ON(ret); |
3890 | free_extent_buffer(leaf); | 3834 | free_extent_buffer(leaf); |
3891 | } else { | 3835 | } else { |
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4414a5d9983a..03441a99ea38 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h | |||
@@ -45,6 +45,8 @@ struct btrfs_ordered_sum; | |||
45 | 45 | ||
46 | #define BTRFS_MAX_LEVEL 8 | 46 | #define BTRFS_MAX_LEVEL 8 |
47 | 47 | ||
48 | #define BTRFS_COMPAT_EXTENT_TREE_V0 | ||
49 | |||
48 | /* | 50 | /* |
49 | * files bigger than this get some pre-flushing when they are added | 51 | * files bigger than this get some pre-flushing when they are added |
50 | * to the ordered operations list. That way we limit the total | 52 | * to the ordered operations list. That way we limit the total |
@@ -267,7 +269,18 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) | |||
267 | } | 269 | } |
268 | 270 | ||
269 | #define BTRFS_FSID_SIZE 16 | 271 | #define BTRFS_FSID_SIZE 16 |
270 | #define BTRFS_HEADER_FLAG_WRITTEN (1 << 0) | 272 | #define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) |
273 | #define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) | ||
274 | #define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) | ||
275 | #define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) | ||
276 | |||
277 | #define BTRFS_BACKREF_REV_MAX 256 | ||
278 | #define BTRFS_BACKREF_REV_SHIFT 56 | ||
279 | #define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \ | ||
280 | BTRFS_BACKREF_REV_SHIFT) | ||
281 | |||
282 | #define BTRFS_OLD_BACKREF_REV 0 | ||
283 | #define BTRFS_MIXED_BACKREF_REV 1 | ||
271 | 284 | ||
272 | /* | 285 | /* |
273 | * every tree block (leaf or node) starts with this header. | 286 | * every tree block (leaf or node) starts with this header. |
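The BTRFS_BACKREF_REV_* macros above reserve the top eight bits of the 64-bit header flags word for a backref-format revision (0 = old scheme, 1 = mixed backrefs). A minimal sketch of the packing, assuming plain u64 flag words (helper names are hypothetical):

	static inline int example_backref_rev(u64 flags)
	{
		return (int)((flags & BTRFS_BACKREF_REV_MASK) >>
			     BTRFS_BACKREF_REV_SHIFT);
	}

	static inline u64 example_set_backref_rev(u64 flags, int rev)
	{
		flags &= ~BTRFS_BACKREF_REV_MASK;
		return flags | ((u64)rev << BTRFS_BACKREF_REV_SHIFT);
	}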
@@ -296,7 +309,6 @@ struct btrfs_header { | |||
296 | sizeof(struct btrfs_item) - \ | 309 | sizeof(struct btrfs_item) - \ |
297 | sizeof(struct btrfs_file_extent_item)) | 310 | sizeof(struct btrfs_file_extent_item)) |
298 | 311 | ||
299 | #define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) | ||
300 | 312 | ||
301 | /* | 313 | /* |
302 | * this is a very generous portion of the super block, giving us | 314 | * this is a very generous portion of the super block, giving us |
@@ -355,9 +367,12 @@ struct btrfs_super_block { | |||
355 | * Compat flags that we support. If any incompat flags are set other than the | 367 | * Compat flags that we support. If any incompat flags are set other than the |
356 | * ones specified below then we will fail to mount | 368 | * ones specified below then we will fail to mount |
357 | */ | 369 | */ |
358 | #define BTRFS_FEATURE_COMPAT_SUPP 0x0 | 370 | #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) |
359 | #define BTRFS_FEATURE_COMPAT_RO_SUPP 0x0 | 371 | |
360 | #define BTRFS_FEATURE_INCOMPAT_SUPP 0x0 | 372 | #define BTRFS_FEATURE_COMPAT_SUPP 0ULL |
373 | #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL | ||
374 | #define BTRFS_FEATURE_INCOMPAT_SUPP \ | ||
375 | BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | ||
361 | 376 | ||
362 | /* | 377 | /* |
363 | * A leaf is full of items. offset and size tell us where to find | 378 | * A leaf is full of items. offset and size tell us where to find |
@@ -421,23 +436,65 @@ struct btrfs_path { | |||
421 | unsigned int keep_locks:1; | 436 | unsigned int keep_locks:1; |
422 | unsigned int skip_locking:1; | 437 | unsigned int skip_locking:1; |
423 | unsigned int leave_spinning:1; | 438 | unsigned int leave_spinning:1; |
439 | unsigned int search_commit_root:1; | ||
424 | }; | 440 | }; |
425 | 441 | ||
426 | /* | 442 | /* |
427 | * items in the extent btree are used to record the objectid of the | 443 | * items in the extent btree are used to record the objectid of the |
428 | * owner of the block and the number of references | 444 | * owner of the block and the number of references |
429 | */ | 445 | */ |
446 | |||
430 | struct btrfs_extent_item { | 447 | struct btrfs_extent_item { |
448 | __le64 refs; | ||
449 | __le64 generation; | ||
450 | __le64 flags; | ||
451 | } __attribute__ ((__packed__)); | ||
452 | |||
453 | struct btrfs_extent_item_v0 { | ||
431 | __le32 refs; | 454 | __le32 refs; |
432 | } __attribute__ ((__packed__)); | 455 | } __attribute__ ((__packed__)); |
433 | 456 | ||
434 | struct btrfs_extent_ref { | 457 | #define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \ |
458 | sizeof(struct btrfs_item)) | ||
459 | |||
460 | #define BTRFS_EXTENT_FLAG_DATA (1ULL << 0) | ||
461 | #define BTRFS_EXTENT_FLAG_TREE_BLOCK (1ULL << 1) | ||
462 | |||
463 | /* following flags only apply to tree blocks */ | ||
464 | |||
465 | /* use full backrefs for extent pointers in the block */ | ||
466 | #define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8) | ||
467 | |||
468 | struct btrfs_tree_block_info { | ||
469 | struct btrfs_disk_key key; | ||
470 | u8 level; | ||
471 | } __attribute__ ((__packed__)); | ||
472 | |||
473 | struct btrfs_extent_data_ref { | ||
474 | __le64 root; | ||
475 | __le64 objectid; | ||
476 | __le64 offset; | ||
477 | __le32 count; | ||
478 | } __attribute__ ((__packed__)); | ||
479 | |||
480 | struct btrfs_shared_data_ref { | ||
481 | __le32 count; | ||
482 | } __attribute__ ((__packed__)); | ||
483 | |||
484 | struct btrfs_extent_inline_ref { | ||
485 | u8 type; | ||
486 | u64 offset; | ||
487 | } __attribute__ ((__packed__)); | ||
488 | |||
489 | /* old style backrefs item */ | ||
490 | struct btrfs_extent_ref_v0 { | ||
435 | __le64 root; | 491 | __le64 root; |
436 | __le64 generation; | 492 | __le64 generation; |
437 | __le64 objectid; | 493 | __le64 objectid; |
438 | __le32 num_refs; | 494 | __le32 count; |
439 | } __attribute__ ((__packed__)); | 495 | } __attribute__ ((__packed__)); |
440 | 496 | ||
497 | |||
441 | /* dev extents record free space on individual devices. The owner | 498 | /* dev extents record free space on individual devices. The owner |
442 | * field points back to the chunk allocation mapping tree that allocated | 499 | * field points back to the chunk allocation mapping tree that allocated |
443 | * the extent. The chunk tree uuid field is a way to double check the owner | 500 | * the extent. The chunk tree uuid field is a way to double check the owner |
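Taken together, the structures added above define the new mixed-backref extent item. A hedged sketch of how they sit inside an extent-tree leaf (my reading of the format, not text from the patch):

	/*
	 * [ btrfs_extent_item       ]  refs / generation / flags
	 * [ btrfs_tree_block_info   ]  first key + level (tree blocks only)
	 * [ btrfs_extent_inline_ref ]  repeated; a type byte plus either a
	 * [   ...                   ]  64-bit offset or an embedded
	 *                              btrfs_extent_data_ref /
	 *                              btrfs_shared_data_ref payload
	 *
	 * Once the item would exceed BTRFS_MAX_EXTENT_ITEM_SIZE(root), further
	 * backrefs are stored as separate items keyed after the extent item.
	 */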
@@ -695,12 +752,7 @@ struct btrfs_block_group_cache { | |||
695 | struct list_head cluster_list; | 752 | struct list_head cluster_list; |
696 | }; | 753 | }; |
697 | 754 | ||
698 | struct btrfs_leaf_ref_tree { | 755 | struct reloc_control; |
699 | struct rb_root root; | ||
700 | struct list_head list; | ||
701 | spinlock_t lock; | ||
702 | }; | ||
703 | |||
704 | struct btrfs_device; | 756 | struct btrfs_device; |
705 | struct btrfs_fs_devices; | 757 | struct btrfs_fs_devices; |
706 | struct btrfs_fs_info { | 758 | struct btrfs_fs_info { |
@@ -831,18 +883,11 @@ struct btrfs_fs_info { | |||
831 | struct task_struct *cleaner_kthread; | 883 | struct task_struct *cleaner_kthread; |
832 | int thread_pool_size; | 884 | int thread_pool_size; |
833 | 885 | ||
834 | /* tree relocation relocated fields */ | ||
835 | struct list_head dead_reloc_roots; | ||
836 | struct btrfs_leaf_ref_tree reloc_ref_tree; | ||
837 | struct btrfs_leaf_ref_tree shared_ref_tree; | ||
838 | |||
839 | struct kobject super_kobj; | 886 | struct kobject super_kobj; |
840 | struct completion kobj_unregister; | 887 | struct completion kobj_unregister; |
841 | int do_barriers; | 888 | int do_barriers; |
842 | int closing; | 889 | int closing; |
843 | int log_root_recovering; | 890 | int log_root_recovering; |
844 | atomic_t throttles; | ||
845 | atomic_t throttle_gen; | ||
846 | 891 | ||
847 | u64 total_pinned; | 892 | u64 total_pinned; |
848 | 893 | ||
@@ -861,6 +906,8 @@ struct btrfs_fs_info { | |||
861 | */ | 906 | */ |
862 | struct list_head space_info; | 907 | struct list_head space_info; |
863 | 908 | ||
909 | struct reloc_control *reloc_ctl; | ||
910 | |||
864 | spinlock_t delalloc_lock; | 911 | spinlock_t delalloc_lock; |
865 | spinlock_t new_trans_lock; | 912 | spinlock_t new_trans_lock; |
866 | u64 delalloc_bytes; | 913 | u64 delalloc_bytes; |
@@ -891,7 +938,6 @@ struct btrfs_fs_info { | |||
891 | * in ram representation of the tree. extent_root is used for all allocations | 938 | * in ram representation of the tree. extent_root is used for all allocations |
892 | * and for the extent tree extent_root root. | 939 | * and for the extent tree extent_root root. |
893 | */ | 940 | */ |
894 | struct btrfs_dirty_root; | ||
895 | struct btrfs_root { | 941 | struct btrfs_root { |
896 | struct extent_buffer *node; | 942 | struct extent_buffer *node; |
897 | 943 | ||
@@ -899,9 +945,6 @@ struct btrfs_root { | |||
899 | spinlock_t node_lock; | 945 | spinlock_t node_lock; |
900 | 946 | ||
901 | struct extent_buffer *commit_root; | 947 | struct extent_buffer *commit_root; |
902 | struct btrfs_leaf_ref_tree *ref_tree; | ||
903 | struct btrfs_leaf_ref_tree ref_tree_struct; | ||
904 | struct btrfs_dirty_root *dirty_root; | ||
905 | struct btrfs_root *log_root; | 948 | struct btrfs_root *log_root; |
906 | struct btrfs_root *reloc_root; | 949 | struct btrfs_root *reloc_root; |
907 | 950 | ||
@@ -952,10 +995,15 @@ struct btrfs_root { | |||
952 | /* the dirty list is only used by non-reference counted roots */ | 995 | /* the dirty list is only used by non-reference counted roots */ |
953 | struct list_head dirty_list; | 996 | struct list_head dirty_list; |
954 | 997 | ||
998 | struct list_head root_list; | ||
999 | |||
955 | spinlock_t list_lock; | 1000 | spinlock_t list_lock; |
956 | struct list_head dead_list; | ||
957 | struct list_head orphan_list; | 1001 | struct list_head orphan_list; |
958 | 1002 | ||
1003 | spinlock_t inode_lock; | ||
1004 | /* red-black tree that keeps track of in-memory inodes */ | ||
1005 | struct rb_root inode_tree; | ||
1006 | |||
959 | /* | 1007 | /* |
960 | * right now this just gets used so that a root has its own devid | 1008 | * right now this just gets used so that a root has its own devid |
961 | * for stat. It may be used for more later | 1009 | * for stat. It may be used for more later |
@@ -1017,7 +1065,16 @@ struct btrfs_root { | |||
1017 | * are used, and how many references there are to each block | 1065 | * are used, and how many references there are to each block |
1018 | */ | 1066 | */ |
1019 | #define BTRFS_EXTENT_ITEM_KEY 168 | 1067 | #define BTRFS_EXTENT_ITEM_KEY 168 |
1020 | #define BTRFS_EXTENT_REF_KEY 180 | 1068 | |
1069 | #define BTRFS_TREE_BLOCK_REF_KEY 176 | ||
1070 | |||
1071 | #define BTRFS_EXTENT_DATA_REF_KEY 178 | ||
1072 | |||
1073 | #define BTRFS_EXTENT_REF_V0_KEY 180 | ||
1074 | |||
1075 | #define BTRFS_SHARED_BLOCK_REF_KEY 182 | ||
1076 | |||
1077 | #define BTRFS_SHARED_DATA_REF_KEY 184 | ||
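These key types also fix how standalone backref items are addressed: the key objectid is always the extent's bytenr, and the key offset carries whatever distinguishes the ref. A hedged sketch of building the key for a shared tree-block backref, where the offset is assumed to be the parent block's bytenr (data refs instead fold root/objectid/offset into the key offset):

static void shared_block_ref_key(struct btrfs_key *key, u64 bytenr, u64 parent)
{
	key->objectid = bytenr;			/* the referenced extent */
	key->type = BTRFS_SHARED_BLOCK_REF_KEY;
	key->offset = parent;			/* the block that points at it */
}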
1021 | 1078 | ||
1022 | /* | 1079 | /* |
1023 | * block groups give us hints into the extent allocation trees. Which | 1080 | * block groups give us hints into the extent allocation trees. Which |
@@ -1043,6 +1100,8 @@ struct btrfs_root { | |||
1043 | #define BTRFS_MOUNT_COMPRESS (1 << 5) | 1100 | #define BTRFS_MOUNT_COMPRESS (1 << 5) |
1044 | #define BTRFS_MOUNT_NOTREELOG (1 << 6) | 1101 | #define BTRFS_MOUNT_NOTREELOG (1 << 6) |
1045 | #define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) | 1102 | #define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) |
1103 | #define BTRFS_MOUNT_SSD_SPREAD (1 << 8) | ||
1104 | #define BTRFS_MOUNT_NOSSD (1 << 9) | ||
1046 | 1105 | ||
1047 | #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) | 1106 | #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) |
1048 | #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) | 1107 | #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) |
@@ -1056,12 +1115,14 @@ struct btrfs_root { | |||
1056 | #define BTRFS_INODE_READONLY (1 << 2) | 1115 | #define BTRFS_INODE_READONLY (1 << 2) |
1057 | #define BTRFS_INODE_NOCOMPRESS (1 << 3) | 1116 | #define BTRFS_INODE_NOCOMPRESS (1 << 3) |
1058 | #define BTRFS_INODE_PREALLOC (1 << 4) | 1117 | #define BTRFS_INODE_PREALLOC (1 << 4) |
1059 | #define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \ | 1118 | #define BTRFS_INODE_SYNC (1 << 5) |
1060 | ~BTRFS_INODE_##flag) | 1119 | #define BTRFS_INODE_IMMUTABLE (1 << 6) |
1061 | #define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \ | 1120 | #define BTRFS_INODE_APPEND (1 << 7) |
1062 | BTRFS_INODE_##flag) | 1121 | #define BTRFS_INODE_NODUMP (1 << 8) |
1063 | #define btrfs_test_flag(inode, flag) (BTRFS_I(inode)->flags & \ | 1122 | #define BTRFS_INODE_NOATIME (1 << 9) |
1064 | BTRFS_INODE_##flag) | 1123 | #define BTRFS_INODE_DIRSYNC (1 << 10) |
1124 | |||
1125 | |||
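With the btrfs_set_flag()/btrfs_test_flag() wrappers gone, callers operate on BTRFS_I(inode)->flags directly; the new bits mirror the generic FS_* attribute flags. A minimal sketch in the spirit of the new btrfs_update_iflags() helper (illustrative, not the actual implementation):

static void sync_noatime_iflag(struct inode *inode)
{
	/* propagate the on-disk btrfs flag into the VFS inode flags */
	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOATIME)
		inode->i_flags |= S_NOATIME;
	else
		inode->i_flags &= ~S_NOATIME;
}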
1065 | /* some macros to generate set/get funcs for the struct fields. This | 1126 | /* some macros to generate set/get funcs for the struct fields. This |
1066 | * assumes there is a lefoo_to_cpu for every type, so lets make a simple | 1127 | * assumes there is a lefoo_to_cpu for every type, so lets make a simple |
1067 | * one for u8: | 1128 | * one for u8: |
@@ -1317,24 +1378,67 @@ static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev) | |||
1317 | return (u8 *)((unsigned long)dev + ptr); | 1378 | return (u8 *)((unsigned long)dev + ptr); |
1318 | } | 1379 | } |
1319 | 1380 | ||
1320 | /* struct btrfs_extent_ref */ | 1381 | BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); |
1321 | BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64); | 1382 | BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, |
1322 | BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64); | 1383 | generation, 64); |
1323 | BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64); | 1384 | BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); |
1324 | BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32); | ||
1325 | 1385 | ||
1326 | BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64); | 1386 | BTRFS_SETGET_FUNCS(extent_refs_v0, struct btrfs_extent_item_v0, refs, 32); |
1327 | BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref, | 1387 | |
1328 | generation, 64); | 1388 | |
1329 | BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref, | 1389 | BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); |
1330 | objectid, 64); | 1390 | |
1331 | BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref, | 1391 | static inline void btrfs_tree_block_key(struct extent_buffer *eb, |
1332 | num_refs, 32); | 1392 | struct btrfs_tree_block_info *item, |
1393 | struct btrfs_disk_key *key) | ||
1394 | { | ||
1395 | read_eb_member(eb, item, struct btrfs_tree_block_info, key, key); | ||
1396 | } | ||
1397 | |||
1398 | static inline void btrfs_set_tree_block_key(struct extent_buffer *eb, | ||
1399 | struct btrfs_tree_block_info *item, | ||
1400 | struct btrfs_disk_key *key) | ||
1401 | { | ||
1402 | write_eb_member(eb, item, struct btrfs_tree_block_info, key, key); | ||
1403 | } | ||
1404 | |||
1405 | BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref, | ||
1406 | root, 64); | ||
1407 | BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref, | ||
1408 | objectid, 64); | ||
1409 | BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref, | ||
1410 | offset, 64); | ||
1411 | BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, | ||
1412 | count, 32); | ||
1413 | |||
1414 | BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, | ||
1415 | count, 32); | ||
1333 | 1416 | ||
1334 | /* struct btrfs_extent_item */ | 1417 | BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, |
1335 | BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32); | 1418 | type, 8); |
1336 | BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item, | 1419 | BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, |
1337 | refs, 32); | 1420 | offset, 64); |
1421 | |||
1422 | static inline u32 btrfs_extent_inline_ref_size(int type) | ||
1423 | { | ||
1424 | if (type == BTRFS_TREE_BLOCK_REF_KEY || | ||
1425 | type == BTRFS_SHARED_BLOCK_REF_KEY) | ||
1426 | return sizeof(struct btrfs_extent_inline_ref); | ||
1427 | if (type == BTRFS_SHARED_DATA_REF_KEY) | ||
1428 | return sizeof(struct btrfs_shared_data_ref) + | ||
1429 | sizeof(struct btrfs_extent_inline_ref); | ||
1430 | if (type == BTRFS_EXTENT_DATA_REF_KEY) | ||
1431 | return sizeof(struct btrfs_extent_data_ref) + | ||
1432 | offsetof(struct btrfs_extent_inline_ref, offset); | ||
1433 | BUG(); | ||
1434 | return 0; | ||
1435 | } | ||
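btrfs_extent_inline_ref_size() is what makes the inline ref area walkable: each ref's length depends only on its type byte. A sketch of the iteration pattern, assuming the caller already knows whether the item describes a tree block (which decides whether a btrfs_tree_block_info sits before the refs):

static void walk_inline_refs(struct extent_buffer *leaf, int slot,
			     int is_tree_block)
{
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	unsigned long ptr, end;
	int type;

	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
	ptr = (unsigned long)(ei + 1);
	if (is_tree_block)
		ptr += sizeof(struct btrfs_tree_block_info);
	end = (unsigned long)ei + btrfs_item_size_nr(leaf, slot);

	while (ptr < end) {
		iref = (struct btrfs_extent_inline_ref *)ptr;
		type = btrfs_extent_inline_ref_type(leaf, iref);
		/* the offset's meaning depends on type: parent bytenr,
		 * root objectid, or the start of a btrfs_extent_data_ref */
		ptr += btrfs_extent_inline_ref_size(type);
	}
}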
1436 | |||
1437 | BTRFS_SETGET_FUNCS(ref_root_v0, struct btrfs_extent_ref_v0, root, 64); | ||
1438 | BTRFS_SETGET_FUNCS(ref_generation_v0, struct btrfs_extent_ref_v0, | ||
1439 | generation, 64); | ||
1440 | BTRFS_SETGET_FUNCS(ref_objectid_v0, struct btrfs_extent_ref_v0, objectid, 64); | ||
1441 | BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32); | ||
1338 | 1442 | ||
1339 | /* struct btrfs_node */ | 1443 | /* struct btrfs_node */ |
1340 | BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); | 1444 | BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); |
@@ -1558,6 +1662,21 @@ static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) | |||
1558 | return (flags & flag) == flag; | 1662 | return (flags & flag) == flag; |
1559 | } | 1663 | } |
1560 | 1664 | ||
1665 | static inline int btrfs_header_backref_rev(struct extent_buffer *eb) | ||
1666 | { | ||
1667 | u64 flags = btrfs_header_flags(eb); | ||
1668 | return flags >> BTRFS_BACKREF_REV_SHIFT; | ||
1669 | } | ||
1670 | |||
1671 | static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, | ||
1672 | int rev) | ||
1673 | { | ||
1674 | u64 flags = btrfs_header_flags(eb); | ||
1675 | flags &= ~BTRFS_BACKREF_REV_MASK; | ||
1676 | flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT; | ||
1677 | btrfs_set_header_flags(eb, flags); | ||
1678 | } | ||
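The backref revision lives in the top bits of the header flags, so old-format and new-format blocks can be told apart without any extra on-disk space. A sketch of the check the COW path can make, assuming the BTRFS_OLD_BACKREF_REV/BTRFS_MIXED_BACKREF_REV values defined elsewhere in this change:

static int block_written_with_old_backrefs(struct extent_buffer *buf)
{
	/* blocks written before the mixed-backref format carry the old rev */
	return btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV;
}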
1679 | |||
1561 | static inline u8 *btrfs_header_fsid(struct extent_buffer *eb) | 1680 | static inline u8 *btrfs_header_fsid(struct extent_buffer *eb) |
1562 | { | 1681 | { |
1563 | unsigned long ptr = offsetof(struct btrfs_header, fsid); | 1682 | unsigned long ptr = offsetof(struct btrfs_header, fsid); |
@@ -1790,39 +1909,32 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, | |||
1790 | int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, | 1909 | int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, |
1791 | struct btrfs_root *root, struct extent_buffer *leaf); | 1910 | struct btrfs_root *root, struct extent_buffer *leaf); |
1792 | int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, | 1911 | int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, |
1793 | struct btrfs_root *root, u64 objectid, u64 bytenr); | 1912 | struct btrfs_root *root, |
1913 | u64 objectid, u64 offset, u64 bytenr); | ||
1794 | int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); | 1914 | int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); |
1795 | struct btrfs_block_group_cache *btrfs_lookup_block_group( | 1915 | struct btrfs_block_group_cache *btrfs_lookup_block_group( |
1796 | struct btrfs_fs_info *info, | 1916 | struct btrfs_fs_info *info, |
1797 | u64 bytenr); | 1917 | u64 bytenr); |
1918 | void btrfs_put_block_group(struct btrfs_block_group_cache *cache); | ||
1798 | u64 btrfs_find_block_group(struct btrfs_root *root, | 1919 | u64 btrfs_find_block_group(struct btrfs_root *root, |
1799 | u64 search_start, u64 search_hint, int owner); | 1920 | u64 search_start, u64 search_hint, int owner); |
1800 | struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, | 1921 | struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, |
1801 | struct btrfs_root *root, | 1922 | struct btrfs_root *root, u32 blocksize, |
1802 | u32 blocksize, u64 parent, | 1923 | u64 parent, u64 root_objectid, |
1803 | u64 root_objectid, | 1924 | struct btrfs_disk_key *key, int level, |
1804 | u64 ref_generation, | 1925 | u64 hint, u64 empty_size); |
1805 | int level, | ||
1806 | u64 hint, | ||
1807 | u64 empty_size); | ||
1808 | struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, | 1926 | struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, |
1809 | struct btrfs_root *root, | 1927 | struct btrfs_root *root, |
1810 | u64 bytenr, u32 blocksize, | 1928 | u64 bytenr, u32 blocksize, |
1811 | int level); | 1929 | int level); |
1812 | int btrfs_alloc_extent(struct btrfs_trans_handle *trans, | 1930 | int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, |
1813 | struct btrfs_root *root, | 1931 | struct btrfs_root *root, |
1814 | u64 num_bytes, u64 parent, u64 min_bytes, | 1932 | u64 root_objectid, u64 owner, |
1815 | u64 root_objectid, u64 ref_generation, | 1933 | u64 offset, struct btrfs_key *ins); |
1816 | u64 owner, u64 empty_size, u64 hint_byte, | 1934 | int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, |
1817 | u64 search_end, struct btrfs_key *ins, u64 data); | 1935 | struct btrfs_root *root, |
1818 | int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, | 1936 | u64 root_objectid, u64 owner, u64 offset, |
1819 | struct btrfs_root *root, u64 parent, | 1937 | struct btrfs_key *ins); |
1820 | u64 root_objectid, u64 ref_generation, | ||
1821 | u64 owner, struct btrfs_key *ins); | ||
1822 | int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, | ||
1823 | struct btrfs_root *root, u64 parent, | ||
1824 | u64 root_objectid, u64 ref_generation, | ||
1825 | u64 owner, struct btrfs_key *ins); | ||
1826 | int btrfs_reserve_extent(struct btrfs_trans_handle *trans, | 1938 | int btrfs_reserve_extent(struct btrfs_trans_handle *trans, |
1827 | struct btrfs_root *root, | 1939 | struct btrfs_root *root, |
1828 | u64 num_bytes, u64 min_alloc_size, | 1940 | u64 num_bytes, u64 min_alloc_size, |
@@ -1830,18 +1942,18 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, | |||
1830 | u64 search_end, struct btrfs_key *ins, | 1942 | u64 search_end, struct btrfs_key *ins, |
1831 | u64 data); | 1943 | u64 data); |
1832 | int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, | 1944 | int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, |
1833 | struct extent_buffer *orig_buf, struct extent_buffer *buf, | 1945 | struct extent_buffer *buf, int full_backref); |
1834 | u32 *nr_extents); | 1946 | int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, |
1835 | int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, | 1947 | struct extent_buffer *buf, int full_backref); |
1836 | struct extent_buffer *buf, u32 nr_extents); | 1948 | int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, |
1837 | int btrfs_update_ref(struct btrfs_trans_handle *trans, | 1949 | struct btrfs_root *root, |
1838 | struct btrfs_root *root, struct extent_buffer *orig_buf, | 1950 | u64 bytenr, u64 num_bytes, u64 flags, |
1839 | struct extent_buffer *buf, int start_slot, int nr); | 1951 | int is_data); |
1840 | int btrfs_free_extent(struct btrfs_trans_handle *trans, | 1952 | int btrfs_free_extent(struct btrfs_trans_handle *trans, |
1841 | struct btrfs_root *root, | 1953 | struct btrfs_root *root, |
1842 | u64 bytenr, u64 num_bytes, u64 parent, | 1954 | u64 bytenr, u64 num_bytes, u64 parent, |
1843 | u64 root_objectid, u64 ref_generation, | 1955 | u64 root_objectid, u64 owner, u64 offset); |
1844 | u64 owner_objectid, int pin); | 1956 | |
1845 | int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); | 1957 | int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); |
1846 | int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, | 1958 | int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, |
1847 | struct btrfs_root *root, | 1959 | struct btrfs_root *root, |
@@ -1849,13 +1961,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, | |||
1849 | int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, | 1961 | int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, |
1850 | struct btrfs_root *root, | 1962 | struct btrfs_root *root, |
1851 | u64 bytenr, u64 num_bytes, u64 parent, | 1963 | u64 bytenr, u64 num_bytes, u64 parent, |
1852 | u64 root_objectid, u64 ref_generation, | 1964 | u64 root_objectid, u64 owner, u64 offset); |
1853 | u64 owner_objectid); | 1965 | |
1854 | int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, | ||
1855 | struct btrfs_root *root, u64 bytenr, u64 num_bytes, | ||
1856 | u64 orig_parent, u64 parent, | ||
1857 | u64 root_objectid, u64 ref_generation, | ||
1858 | u64 owner_objectid); | ||
1859 | int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, | 1966 | int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, |
1860 | struct btrfs_root *root); | 1967 | struct btrfs_root *root); |
1861 | int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); | 1968 | int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); |
@@ -1867,16 +1974,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, | |||
1867 | u64 size); | 1974 | u64 size); |
1868 | int btrfs_remove_block_group(struct btrfs_trans_handle *trans, | 1975 | int btrfs_remove_block_group(struct btrfs_trans_handle *trans, |
1869 | struct btrfs_root *root, u64 group_start); | 1976 | struct btrfs_root *root, u64 group_start); |
1870 | int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); | 1977 | int btrfs_prepare_block_group_relocation(struct btrfs_root *root, |
1871 | int btrfs_free_reloc_root(struct btrfs_trans_handle *trans, | 1978 | struct btrfs_block_group_cache *group); |
1872 | struct btrfs_root *root); | 1979 | |
1873 | int btrfs_drop_dead_reloc_roots(struct btrfs_root *root); | ||
1874 | int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans, | ||
1875 | struct btrfs_root *root, | ||
1876 | struct extent_buffer *buf, u64 orig_start); | ||
1877 | int btrfs_add_dead_reloc_root(struct btrfs_root *root); | ||
1878 | int btrfs_cleanup_reloc_trees(struct btrfs_root *root); | ||
1879 | int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); | ||
1880 | u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); | 1980 | u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); |
1881 | void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode); | 1981 | void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode); |
1882 | void btrfs_clear_space_info_full(struct btrfs_fs_info *info); | 1982 | void btrfs_clear_space_info_full(struct btrfs_fs_info *info); |
@@ -1891,13 +1991,12 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, | |||
1891 | void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, | 1991 | void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, |
1892 | u64 bytes); | 1992 | u64 bytes); |
1893 | /* ctree.c */ | 1993 | /* ctree.c */ |
1994 | int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, | ||
1995 | int level, int *slot); | ||
1996 | int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2); | ||
1894 | int btrfs_previous_item(struct btrfs_root *root, | 1997 | int btrfs_previous_item(struct btrfs_root *root, |
1895 | struct btrfs_path *path, u64 min_objectid, | 1998 | struct btrfs_path *path, u64 min_objectid, |
1896 | int type); | 1999 | int type); |
1897 | int btrfs_merge_path(struct btrfs_trans_handle *trans, | ||
1898 | struct btrfs_root *root, | ||
1899 | struct btrfs_key *node_keys, | ||
1900 | u64 *nodes, int lowest_level); | ||
1901 | int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, | 2000 | int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, |
1902 | struct btrfs_root *root, struct btrfs_path *path, | 2001 | struct btrfs_root *root, struct btrfs_path *path, |
1903 | struct btrfs_key *new_key); | 2002 | struct btrfs_key *new_key); |
@@ -1918,6 +2017,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, | |||
1918 | struct btrfs_root *root, | 2017 | struct btrfs_root *root, |
1919 | struct extent_buffer *buf, | 2018 | struct extent_buffer *buf, |
1920 | struct extent_buffer **cow_ret, u64 new_root_objectid); | 2019 | struct extent_buffer **cow_ret, u64 new_root_objectid); |
2020 | int btrfs_block_can_be_shared(struct btrfs_root *root, | ||
2021 | struct extent_buffer *buf); | ||
1921 | int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root | 2022 | int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root |
1922 | *root, struct btrfs_path *path, u32 data_size); | 2023 | *root, struct btrfs_path *path, u32 data_size); |
1923 | int btrfs_truncate_item(struct btrfs_trans_handle *trans, | 2024 | int btrfs_truncate_item(struct btrfs_trans_handle *trans, |
@@ -1944,9 +2045,6 @@ void btrfs_unlock_up_safe(struct btrfs_path *p, int level); | |||
1944 | 2045 | ||
1945 | int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, | 2046 | int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, |
1946 | struct btrfs_path *path, int slot, int nr); | 2047 | struct btrfs_path *path, int slot, int nr); |
1947 | int btrfs_del_leaf(struct btrfs_trans_handle *trans, | ||
1948 | struct btrfs_root *root, | ||
1949 | struct btrfs_path *path, u64 bytenr); | ||
1950 | static inline int btrfs_del_item(struct btrfs_trans_handle *trans, | 2048 | static inline int btrfs_del_item(struct btrfs_trans_handle *trans, |
1951 | struct btrfs_root *root, | 2049 | struct btrfs_root *root, |
1952 | struct btrfs_path *path) | 2050 | struct btrfs_path *path) |
@@ -2005,8 +2103,9 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct | |||
2005 | btrfs_root_item *item, struct btrfs_key *key); | 2103 | btrfs_root_item *item, struct btrfs_key *key); |
2006 | int btrfs_search_root(struct btrfs_root *root, u64 search_start, | 2104 | int btrfs_search_root(struct btrfs_root *root, u64 search_start, |
2007 | u64 *found_objectid); | 2105 | u64 *found_objectid); |
2008 | int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, | 2106 | int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); |
2009 | struct btrfs_root *latest_root); | 2107 | int btrfs_set_root_node(struct btrfs_root_item *item, |
2108 | struct extent_buffer *node); | ||
2010 | /* dir-item.c */ | 2109 | /* dir-item.c */ |
2011 | int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, | 2110 | int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, |
2012 | struct btrfs_root *root, const char *name, | 2111 | struct btrfs_root *root, const char *name, |
@@ -2139,7 +2238,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); | |||
2139 | int btrfs_readpage(struct file *file, struct page *page); | 2238 | int btrfs_readpage(struct file *file, struct page *page); |
2140 | void btrfs_delete_inode(struct inode *inode); | 2239 | void btrfs_delete_inode(struct inode *inode); |
2141 | void btrfs_put_inode(struct inode *inode); | 2240 | void btrfs_put_inode(struct inode *inode); |
2142 | void btrfs_read_locked_inode(struct inode *inode); | ||
2143 | int btrfs_write_inode(struct inode *inode, int wait); | 2241 | int btrfs_write_inode(struct inode *inode, int wait); |
2144 | void btrfs_dirty_inode(struct inode *inode); | 2242 | void btrfs_dirty_inode(struct inode *inode); |
2145 | struct inode *btrfs_alloc_inode(struct super_block *sb); | 2243 | struct inode *btrfs_alloc_inode(struct super_block *sb); |
@@ -2147,12 +2245,8 @@ void btrfs_destroy_inode(struct inode *inode); | |||
2147 | int btrfs_init_cachep(void); | 2245 | int btrfs_init_cachep(void); |
2148 | void btrfs_destroy_cachep(void); | 2246 | void btrfs_destroy_cachep(void); |
2149 | long btrfs_ioctl_trans_end(struct file *file); | 2247 | long btrfs_ioctl_trans_end(struct file *file); |
2150 | struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, | ||
2151 | struct btrfs_root *root, int wait); | ||
2152 | struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, | ||
2153 | struct btrfs_root *root); | ||
2154 | struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, | 2248 | struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, |
2155 | struct btrfs_root *root, int *is_new); | 2249 | struct btrfs_root *root); |
2156 | int btrfs_commit_write(struct file *file, struct page *page, | 2250 | int btrfs_commit_write(struct file *file, struct page *page, |
2157 | unsigned from, unsigned to); | 2251 | unsigned from, unsigned to); |
2158 | struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, | 2252 | struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, |
@@ -2168,6 +2262,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t size); | |||
2168 | 2262 | ||
2169 | /* ioctl.c */ | 2263 | /* ioctl.c */ |
2170 | long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); | 2264 | long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); |
2265 | void btrfs_update_iflags(struct inode *inode); | ||
2266 | void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); | ||
2171 | 2267 | ||
2172 | /* file.c */ | 2268 | /* file.c */ |
2173 | int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); | 2269 | int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); |
@@ -2205,8 +2301,20 @@ int btrfs_parse_options(struct btrfs_root *root, char *options); | |||
2205 | int btrfs_sync_fs(struct super_block *sb, int wait); | 2301 | int btrfs_sync_fs(struct super_block *sb, int wait); |
2206 | 2302 | ||
2207 | /* acl.c */ | 2303 | /* acl.c */ |
2304 | #ifdef CONFIG_FS_POSIX_ACL | ||
2208 | int btrfs_check_acl(struct inode *inode, int mask); | 2305 | int btrfs_check_acl(struct inode *inode, int mask); |
2306 | #else | ||
2307 | #define btrfs_check_acl NULL | ||
2308 | #endif | ||
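Defining btrfs_check_acl to NULL when CONFIG_FS_POSIX_ACL is off works because the symbol is only ever handed to generic_permission() as its check_acl callback, and that helper treats NULL as "no ACLs, use mode bits". A sketch of that call-site shape (hypothetical wrapper name; the real caller is btrfs_permission() in inode.c):

static int btrfs_permission_sketch(struct inode *inode, int mask)
{
	/* generic_permission() falls back to mode bits when check_acl is NULL */
	return generic_permission(inode, mask, btrfs_check_acl);
}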
2209 | int btrfs_init_acl(struct inode *inode, struct inode *dir); | 2309 | int btrfs_init_acl(struct inode *inode, struct inode *dir); |
2210 | int btrfs_acl_chmod(struct inode *inode); | 2310 | int btrfs_acl_chmod(struct inode *inode); |
2211 | 2311 | ||
2312 | /* relocation.c */ | ||
2313 | int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); | ||
2314 | int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, | ||
2315 | struct btrfs_root *root); | ||
2316 | int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, | ||
2317 | struct btrfs_root *root); | ||
2318 | int btrfs_recover_relocation(struct btrfs_root *root); | ||
2319 | int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); | ||
2212 | #endif | 2320 | #endif |
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index d6c01c096a40..84e6781413b1 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c | |||
@@ -29,27 +29,87 @@ | |||
29 | * add extents in the middle of btrfs_search_slot, and it allows | 29 | * add extents in the middle of btrfs_search_slot, and it allows |
30 | * us to buffer up frequently modified backrefs in an rb tree instead | 30 | * us to buffer up frequently modified backrefs in an rb tree instead |
31 | * of hammering updates on the extent allocation tree. | 31 | * of hammering updates on the extent allocation tree. |
32 | * | ||
33 | * Right now this code is only used for reference counted trees, but | ||
34 | * the long term goal is to get rid of the similar code for delayed | ||
35 | * extent tree modifications. | ||
36 | */ | 32 | */ |
37 | 33 | ||
38 | /* | 34 | /* |
39 | * entries in the rb tree are ordered by the byte number of the extent | 35 | * compare two delayed tree backrefs with same bytenr and type |
40 | * and by the byte number of the parent block. | 36 | */ |
37 | static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2, | ||
38 | struct btrfs_delayed_tree_ref *ref1) | ||
39 | { | ||
40 | if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) { | ||
41 | if (ref1->root < ref2->root) | ||
42 | return -1; | ||
43 | if (ref1->root > ref2->root) | ||
44 | return 1; | ||
45 | } else { | ||
46 | if (ref1->parent < ref2->parent) | ||
47 | return -1; | ||
48 | if (ref1->parent > ref2->parent) | ||
49 | return 1; | ||
50 | } | ||
51 | return 0; | ||
52 | } | ||
53 | |||
54 | /* | ||
55 | * compare two delayed data backrefs with same bytenr and type | ||
41 | */ | 56 | */ |
42 | static int comp_entry(struct btrfs_delayed_ref_node *ref, | 57 | static int comp_data_refs(struct btrfs_delayed_data_ref *ref2, |
43 | u64 bytenr, u64 parent) | 58 | struct btrfs_delayed_data_ref *ref1) |
44 | { | 59 | { |
45 | if (bytenr < ref->bytenr) | 60 | if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) { |
61 | if (ref1->root < ref2->root) | ||
62 | return -1; | ||
63 | if (ref1->root > ref2->root) | ||
64 | return 1; | ||
65 | if (ref1->objectid < ref2->objectid) | ||
66 | return -1; | ||
67 | if (ref1->objectid > ref2->objectid) | ||
68 | return 1; | ||
69 | if (ref1->offset < ref2->offset) | ||
70 | return -1; | ||
71 | if (ref1->offset > ref2->offset) | ||
72 | return 1; | ||
73 | } else { | ||
74 | if (ref1->parent < ref2->parent) | ||
75 | return -1; | ||
76 | if (ref1->parent > ref2->parent) | ||
77 | return 1; | ||
78 | } | ||
79 | return 0; | ||
80 | } | ||
81 | |||
82 | /* | ||
83 | * entries in the rb tree are ordered by the byte number of the extent, | ||
84 | * type of the delayed backrefs and content of delayed backrefs. | ||
85 | */ | ||
86 | static int comp_entry(struct btrfs_delayed_ref_node *ref2, | ||
87 | struct btrfs_delayed_ref_node *ref1) | ||
88 | { | ||
89 | if (ref1->bytenr < ref2->bytenr) | ||
46 | return -1; | 90 | return -1; |
47 | if (bytenr > ref->bytenr) | 91 | if (ref1->bytenr > ref2->bytenr) |
48 | return 1; | 92 | return 1; |
49 | if (parent < ref->parent) | 93 | if (ref1->is_head && ref2->is_head) |
94 | return 0; | ||
95 | if (ref2->is_head) | ||
50 | return -1; | 96 | return -1; |
51 | if (parent > ref->parent) | 97 | if (ref1->is_head) |
52 | return 1; | 98 | return 1; |
99 | if (ref1->type < ref2->type) | ||
100 | return -1; | ||
101 | if (ref1->type > ref2->type) | ||
102 | return 1; | ||
103 | if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || | ||
104 | ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { | ||
105 | return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), | ||
106 | btrfs_delayed_node_to_tree_ref(ref1)); | ||
107 | } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY || | ||
108 | ref1->type == BTRFS_SHARED_DATA_REF_KEY) { | ||
109 | return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2), | ||
110 | btrfs_delayed_node_to_data_ref(ref1)); | ||
111 | } | ||
112 | BUG(); | ||
53 | return 0; | 113 | return 0; |
54 | } | 114 | } |
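The comparator sorts all delayed refs for one extent next to each other, with the head node last for its bytenr; that is what lets ref processing start at a head and walk backwards without leaving the extent. A hedged sketch of that walk (hypothetical helper; the real consumers are the find_ref_head() callers and the cluster code):

static void for_each_ref_of_head(struct btrfs_delayed_ref_node *head)
{
	struct rb_node *node = rb_prev(&head->rb_node);
	struct btrfs_delayed_ref_node *ref;

	while (node) {
		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
		if (ref->bytenr != head->bytenr)
			break;		/* ran past this extent's refs */
		/* ref->type says which btrfs_delayed_*_ref wrapper applies */
		node = rb_prev(node);
	}
}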
55 | 115 | ||
@@ -59,20 +119,21 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref, | |||
59 | * inserted. | 119 | * inserted. |
60 | */ | 120 | */ |
61 | static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, | 121 | static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, |
62 | u64 bytenr, u64 parent, | ||
63 | struct rb_node *node) | 122 | struct rb_node *node) |
64 | { | 123 | { |
65 | struct rb_node **p = &root->rb_node; | 124 | struct rb_node **p = &root->rb_node; |
66 | struct rb_node *parent_node = NULL; | 125 | struct rb_node *parent_node = NULL; |
67 | struct btrfs_delayed_ref_node *entry; | 126 | struct btrfs_delayed_ref_node *entry; |
127 | struct btrfs_delayed_ref_node *ins; | ||
68 | int cmp; | 128 | int cmp; |
69 | 129 | ||
130 | ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); | ||
70 | while (*p) { | 131 | while (*p) { |
71 | parent_node = *p; | 132 | parent_node = *p; |
72 | entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, | 133 | entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, |
73 | rb_node); | 134 | rb_node); |
74 | 135 | ||
75 | cmp = comp_entry(entry, bytenr, parent); | 136 | cmp = comp_entry(entry, ins); |
76 | if (cmp < 0) | 137 | if (cmp < 0) |
77 | p = &(*p)->rb_left; | 138 | p = &(*p)->rb_left; |
78 | else if (cmp > 0) | 139 | else if (cmp > 0) |
@@ -81,18 +142,17 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, | |||
81 | return entry; | 142 | return entry; |
82 | } | 143 | } |
83 | 144 | ||
84 | entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); | ||
85 | rb_link_node(node, parent_node, p); | 145 | rb_link_node(node, parent_node, p); |
86 | rb_insert_color(node, root); | 146 | rb_insert_color(node, root); |
87 | return NULL; | 147 | return NULL; |
88 | } | 148 | } |
89 | 149 | ||
90 | /* | 150 | /* |
91 | * find an entry based on (bytenr,parent). This returns the delayed | 151 | * find a head entry based on bytenr. This returns the delayed ref |
92 | * ref if it was able to find one, or NULL if nothing was in that spot | 152 | * head if it was able to find one, or NULL if nothing was in that spot |
93 | */ | 153 | */ |
94 | static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root, | 154 | static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, |
95 | u64 bytenr, u64 parent, | 155 | u64 bytenr, |
96 | struct btrfs_delayed_ref_node **last) | 156 | struct btrfs_delayed_ref_node **last) |
97 | { | 157 | { |
98 | struct rb_node *n = root->rb_node; | 158 | struct rb_node *n = root->rb_node; |
@@ -105,7 +165,15 @@ static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root, | |||
105 | if (last) | 165 | if (last) |
106 | *last = entry; | 166 | *last = entry; |
107 | 167 | ||
108 | cmp = comp_entry(entry, bytenr, parent); | 168 | if (bytenr < entry->bytenr) |
169 | cmp = -1; | ||
170 | else if (bytenr > entry->bytenr) | ||
171 | cmp = 1; | ||
172 | else if (!btrfs_delayed_ref_is_head(entry)) | ||
173 | cmp = 1; | ||
174 | else | ||
175 | cmp = 0; | ||
176 | |||
109 | if (cmp < 0) | 177 | if (cmp < 0) |
110 | n = n->rb_left; | 178 | n = n->rb_left; |
111 | else if (cmp > 0) | 179 | else if (cmp > 0) |
@@ -154,7 +222,7 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, | |||
154 | node = rb_first(&delayed_refs->root); | 222 | node = rb_first(&delayed_refs->root); |
155 | } else { | 223 | } else { |
156 | ref = NULL; | 224 | ref = NULL; |
157 | tree_search(&delayed_refs->root, start, (u64)-1, &ref); | 225 | find_ref_head(&delayed_refs->root, start, &ref); |
158 | if (ref) { | 226 | if (ref) { |
159 | struct btrfs_delayed_ref_node *tmp; | 227 | struct btrfs_delayed_ref_node *tmp; |
160 | 228 | ||
@@ -234,7 +302,7 @@ int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr) | |||
234 | delayed_refs = &trans->transaction->delayed_refs; | 302 | delayed_refs = &trans->transaction->delayed_refs; |
235 | spin_lock(&delayed_refs->lock); | 303 | spin_lock(&delayed_refs->lock); |
236 | 304 | ||
237 | ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); | 305 | ref = find_ref_head(&delayed_refs->root, bytenr, NULL); |
238 | if (ref) { | 306 | if (ref) { |
239 | prev_node = rb_prev(&ref->rb_node); | 307 | prev_node = rb_prev(&ref->rb_node); |
240 | if (!prev_node) | 308 | if (!prev_node) |
@@ -250,25 +318,28 @@ out: | |||
250 | } | 318 | } |
251 | 319 | ||
252 | /* | 320 | /* |
253 | * helper function to lookup reference count | 321 | * helper function to lookup reference count and flags of extent. |
254 | * | 322 | * |
255 | * the head node for delayed ref is used to store the sum of all the | 323 | * the head node for delayed ref is used to store the sum of all the |
256 | * reference count modifications queued up in the rbtree. This way you | 324 | * reference count modifications queued up in the rbtree. the head |
257 | * can check to see what the reference count would be if all of the | 325 | * node may also store the extent flags to set. This way you can check |
258 | * delayed refs are processed. | 326 | * to see what the reference count and extent flags would be if all of |
327 | * the delayed refs are not processed. | ||
259 | */ | 328 | */ |
260 | int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, | 329 | int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, |
261 | struct btrfs_root *root, u64 bytenr, | 330 | struct btrfs_root *root, u64 bytenr, |
262 | u64 num_bytes, u32 *refs) | 331 | u64 num_bytes, u64 *refs, u64 *flags) |
263 | { | 332 | { |
264 | struct btrfs_delayed_ref_node *ref; | 333 | struct btrfs_delayed_ref_node *ref; |
265 | struct btrfs_delayed_ref_head *head; | 334 | struct btrfs_delayed_ref_head *head; |
266 | struct btrfs_delayed_ref_root *delayed_refs; | 335 | struct btrfs_delayed_ref_root *delayed_refs; |
267 | struct btrfs_path *path; | 336 | struct btrfs_path *path; |
268 | struct extent_buffer *leaf; | ||
269 | struct btrfs_extent_item *ei; | 337 | struct btrfs_extent_item *ei; |
338 | struct extent_buffer *leaf; | ||
270 | struct btrfs_key key; | 339 | struct btrfs_key key; |
271 | u32 num_refs; | 340 | u32 item_size; |
341 | u64 num_refs; | ||
342 | u64 extent_flags; | ||
272 | int ret; | 343 | int ret; |
273 | 344 | ||
274 | path = btrfs_alloc_path(); | 345 | path = btrfs_alloc_path(); |
@@ -287,37 +358,60 @@ again: | |||
287 | 358 | ||
288 | if (ret == 0) { | 359 | if (ret == 0) { |
289 | leaf = path->nodes[0]; | 360 | leaf = path->nodes[0]; |
290 | ei = btrfs_item_ptr(leaf, path->slots[0], | 361 | item_size = btrfs_item_size_nr(leaf, path->slots[0]); |
291 | struct btrfs_extent_item); | 362 | if (item_size >= sizeof(*ei)) { |
292 | num_refs = btrfs_extent_refs(leaf, ei); | 363 | ei = btrfs_item_ptr(leaf, path->slots[0], |
364 | struct btrfs_extent_item); | ||
365 | num_refs = btrfs_extent_refs(leaf, ei); | ||
366 | extent_flags = btrfs_extent_flags(leaf, ei); | ||
367 | } else { | ||
368 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
369 | struct btrfs_extent_item_v0 *ei0; | ||
370 | BUG_ON(item_size != sizeof(*ei0)); | ||
371 | ei0 = btrfs_item_ptr(leaf, path->slots[0], | ||
372 | struct btrfs_extent_item_v0); | ||
373 | num_refs = btrfs_extent_refs_v0(leaf, ei0); | ||
374 | /* FIXME: this isn't correct for data */ | ||
375 | extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; | ||
376 | #else | ||
377 | BUG(); | ||
378 | #endif | ||
379 | } | ||
380 | BUG_ON(num_refs == 0); | ||
293 | } else { | 381 | } else { |
294 | num_refs = 0; | 382 | num_refs = 0; |
383 | extent_flags = 0; | ||
295 | ret = 0; | 384 | ret = 0; |
296 | } | 385 | } |
297 | 386 | ||
298 | spin_lock(&delayed_refs->lock); | 387 | spin_lock(&delayed_refs->lock); |
299 | ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); | 388 | ref = find_ref_head(&delayed_refs->root, bytenr, NULL); |
300 | if (ref) { | 389 | if (ref) { |
301 | head = btrfs_delayed_node_to_head(ref); | 390 | head = btrfs_delayed_node_to_head(ref); |
302 | if (mutex_trylock(&head->mutex)) { | 391 | if (!mutex_trylock(&head->mutex)) { |
303 | num_refs += ref->ref_mod; | 392 | atomic_inc(&ref->refs); |
304 | mutex_unlock(&head->mutex); | 393 | spin_unlock(&delayed_refs->lock); |
305 | *refs = num_refs; | ||
306 | goto out; | ||
307 | } | ||
308 | 394 | ||
309 | atomic_inc(&ref->refs); | 395 | btrfs_release_path(root->fs_info->extent_root, path); |
310 | spin_unlock(&delayed_refs->lock); | ||
311 | 396 | ||
312 | btrfs_release_path(root->fs_info->extent_root, path); | 397 | mutex_lock(&head->mutex); |
398 | mutex_unlock(&head->mutex); | ||
399 | btrfs_put_delayed_ref(ref); | ||
400 | goto again; | ||
401 | } | ||
402 | if (head->extent_op && head->extent_op->update_flags) | ||
403 | extent_flags |= head->extent_op->flags_to_set; | ||
404 | else | ||
405 | BUG_ON(num_refs == 0); | ||
313 | 406 | ||
314 | mutex_lock(&head->mutex); | 407 | num_refs += ref->ref_mod; |
315 | mutex_unlock(&head->mutex); | 408 | mutex_unlock(&head->mutex); |
316 | btrfs_put_delayed_ref(ref); | ||
317 | goto again; | ||
318 | } else { | ||
319 | *refs = num_refs; | ||
320 | } | 409 | } |
410 | WARN_ON(num_refs == 0); | ||
411 | if (refs) | ||
412 | *refs = num_refs; | ||
413 | if (flags) | ||
414 | *flags = extent_flags; | ||
321 | out: | 415 | out: |
322 | spin_unlock(&delayed_refs->lock); | 416 | spin_unlock(&delayed_refs->lock); |
323 | btrfs_free_path(path); | 417 | btrfs_free_path(path); |
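In other words, the function folds the queued head->ref_mod and any pending flag update into what is on disk, so the caller sees the count and flags the extent will have once the delayed refs run. A small usage sketch, similar in shape to what the tree COW path does with the result (illustrative only):

static int block_has_full_backref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  u64 bytenr, u64 num_bytes)
{
	u64 refs = 0;
	u64 flags = 0;
	int ret;

	ret = btrfs_lookup_extent_info(trans, root, bytenr, num_bytes,
				       &refs, &flags);
	if (ret)
		return ret;
	return !!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF);
}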
@@ -338,16 +432,7 @@ update_existing_ref(struct btrfs_trans_handle *trans, | |||
338 | struct btrfs_delayed_ref_node *existing, | 432 | struct btrfs_delayed_ref_node *existing, |
339 | struct btrfs_delayed_ref_node *update) | 433 | struct btrfs_delayed_ref_node *update) |
340 | { | 434 | { |
341 | struct btrfs_delayed_ref *existing_ref; | 435 | if (update->action != existing->action) { |
342 | struct btrfs_delayed_ref *ref; | ||
343 | |||
344 | existing_ref = btrfs_delayed_node_to_ref(existing); | ||
345 | ref = btrfs_delayed_node_to_ref(update); | ||
346 | |||
347 | if (ref->pin) | ||
348 | existing_ref->pin = 1; | ||
349 | |||
350 | if (ref->action != existing_ref->action) { | ||
351 | /* | 436 | /* |
352 | * this is effectively undoing either an add or a | 437 | * this is effectively undoing either an add or a |
353 | * drop. We decrement the ref_mod, and if it goes | 438 | * drop. We decrement the ref_mod, and if it goes |
@@ -363,20 +448,13 @@ update_existing_ref(struct btrfs_trans_handle *trans, | |||
363 | delayed_refs->num_entries--; | 448 | delayed_refs->num_entries--; |
364 | if (trans->delayed_ref_updates) | 449 | if (trans->delayed_ref_updates) |
365 | trans->delayed_ref_updates--; | 450 | trans->delayed_ref_updates--; |
451 | } else { | ||
452 | WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || | ||
453 | existing->type == BTRFS_SHARED_BLOCK_REF_KEY); | ||
366 | } | 454 | } |
367 | } else { | 455 | } else { |
368 | if (existing_ref->action == BTRFS_ADD_DELAYED_REF) { | 456 | WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || |
369 | /* if we're adding refs, make sure all the | 457 | existing->type == BTRFS_SHARED_BLOCK_REF_KEY); |
370 | * details match up. The extent could | ||
371 | * have been totally freed and reallocated | ||
372 | * by a different owner before the delayed | ||
373 | * ref entries were removed. | ||
374 | */ | ||
375 | existing_ref->owner_objectid = ref->owner_objectid; | ||
376 | existing_ref->generation = ref->generation; | ||
377 | existing_ref->root = ref->root; | ||
378 | existing->num_bytes = update->num_bytes; | ||
379 | } | ||
380 | /* | 458 | /* |
381 | * the action on the existing ref matches | 459 | * the action on the existing ref matches |
382 | * the action on the ref we're trying to add. | 460 | * the action on the ref we're trying to add. |
@@ -401,6 +479,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, | |||
401 | 479 | ||
402 | existing_ref = btrfs_delayed_node_to_head(existing); | 480 | existing_ref = btrfs_delayed_node_to_head(existing); |
403 | ref = btrfs_delayed_node_to_head(update); | 481 | ref = btrfs_delayed_node_to_head(update); |
482 | BUG_ON(existing_ref->is_data != ref->is_data); | ||
404 | 483 | ||
405 | if (ref->must_insert_reserved) { | 484 | if (ref->must_insert_reserved) { |
406 | /* if the extent was freed and then | 485 | /* if the extent was freed and then |
@@ -420,6 +499,24 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, | |||
420 | 499 | ||
421 | } | 500 | } |
422 | 501 | ||
502 | if (ref->extent_op) { | ||
503 | if (!existing_ref->extent_op) { | ||
504 | existing_ref->extent_op = ref->extent_op; | ||
505 | } else { | ||
506 | if (ref->extent_op->update_key) { | ||
507 | memcpy(&existing_ref->extent_op->key, | ||
508 | &ref->extent_op->key, | ||
509 | sizeof(ref->extent_op->key)); | ||
510 | existing_ref->extent_op->update_key = 1; | ||
511 | } | ||
512 | if (ref->extent_op->update_flags) { | ||
513 | existing_ref->extent_op->flags_to_set |= | ||
514 | ref->extent_op->flags_to_set; | ||
515 | existing_ref->extent_op->update_flags = 1; | ||
516 | } | ||
517 | kfree(ref->extent_op); | ||
518 | } | ||
519 | } | ||
423 | /* | 520 | /* |
424 | * update the reference mod on the head to reflect this new operation | 521 | * update the reference mod on the head to reflect this new operation |
425 | */ | 522 | */ |
@@ -427,19 +524,16 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, | |||
427 | } | 524 | } |
428 | 525 | ||
429 | /* | 526 | /* |
430 | * helper function to actually insert a delayed ref into the rbtree. | 527 | * helper function to actually insert a head node into the rbtree. |
431 | * this does all the dirty work in terms of maintaining the correct | 528 | * this does all the dirty work in terms of maintaining the correct |
432 | * overall modification count in the head node and properly dealing | 529 | * overall modification count. |
433 | * with updating existing nodes as new modifications are queued. | ||
434 | */ | 530 | */ |
435 | static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, | 531 | static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, |
436 | struct btrfs_delayed_ref_node *ref, | 532 | struct btrfs_delayed_ref_node *ref, |
437 | u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, | 533 | u64 bytenr, u64 num_bytes, |
438 | u64 ref_generation, u64 owner_objectid, int action, | 534 | int action, int is_data) |
439 | int pin) | ||
440 | { | 535 | { |
441 | struct btrfs_delayed_ref_node *existing; | 536 | struct btrfs_delayed_ref_node *existing; |
442 | struct btrfs_delayed_ref *full_ref; | ||
443 | struct btrfs_delayed_ref_head *head_ref = NULL; | 537 | struct btrfs_delayed_ref_head *head_ref = NULL; |
444 | struct btrfs_delayed_ref_root *delayed_refs; | 538 | struct btrfs_delayed_ref_root *delayed_refs; |
445 | int count_mod = 1; | 539 | int count_mod = 1; |
@@ -449,12 +543,10 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, | |||
449 | * the head node stores the sum of all the mods, so dropping a ref | 543 | * the head node stores the sum of all the mods, so dropping a ref |
450 | * should drop the sum in the head node by one. | 544 | * should drop the sum in the head node by one. |
451 | */ | 545 | */ |
452 | if (parent == (u64)-1) { | 546 | if (action == BTRFS_UPDATE_DELAYED_HEAD) |
453 | if (action == BTRFS_DROP_DELAYED_REF) | 547 | count_mod = 0; |
454 | count_mod = -1; | 548 | else if (action == BTRFS_DROP_DELAYED_REF) |
455 | else if (action == BTRFS_UPDATE_DELAYED_HEAD) | 549 | count_mod = -1; |
456 | count_mod = 0; | ||
457 | } | ||
458 | 550 | ||
459 | /* | 551 | /* |
460 | * BTRFS_ADD_DELAYED_EXTENT means that we need to update | 552 | * BTRFS_ADD_DELAYED_EXTENT means that we need to update |
@@ -467,57 +559,148 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, | |||
467 | * Once we record must_insert_reserved, switch the action to | 559 | * Once we record must_insert_reserved, switch the action to |
468 | * BTRFS_ADD_DELAYED_REF because other special casing is not required. | 560 | * BTRFS_ADD_DELAYED_REF because other special casing is not required. |
469 | */ | 561 | */ |
470 | if (action == BTRFS_ADD_DELAYED_EXTENT) { | 562 | if (action == BTRFS_ADD_DELAYED_EXTENT) |
471 | must_insert_reserved = 1; | 563 | must_insert_reserved = 1; |
472 | action = BTRFS_ADD_DELAYED_REF; | 564 | else |
473 | } else { | ||
474 | must_insert_reserved = 0; | 565 | must_insert_reserved = 0; |
475 | } | ||
476 | |||
477 | 566 | ||
478 | delayed_refs = &trans->transaction->delayed_refs; | 567 | delayed_refs = &trans->transaction->delayed_refs; |
479 | 568 | ||
480 | /* first set the basic ref node struct up */ | 569 | /* first set the basic ref node struct up */ |
481 | atomic_set(&ref->refs, 1); | 570 | atomic_set(&ref->refs, 1); |
482 | ref->bytenr = bytenr; | 571 | ref->bytenr = bytenr; |
483 | ref->parent = parent; | 572 | ref->num_bytes = num_bytes; |
484 | ref->ref_mod = count_mod; | 573 | ref->ref_mod = count_mod; |
574 | ref->type = 0; | ||
575 | ref->action = 0; | ||
576 | ref->is_head = 1; | ||
485 | ref->in_tree = 1; | 577 | ref->in_tree = 1; |
578 | |||
579 | head_ref = btrfs_delayed_node_to_head(ref); | ||
580 | head_ref->must_insert_reserved = must_insert_reserved; | ||
581 | head_ref->is_data = is_data; | ||
582 | |||
583 | INIT_LIST_HEAD(&head_ref->cluster); | ||
584 | mutex_init(&head_ref->mutex); | ||
585 | |||
586 | existing = tree_insert(&delayed_refs->root, &ref->rb_node); | ||
587 | |||
588 | if (existing) { | ||
589 | update_existing_head_ref(existing, ref); | ||
590 | /* | ||
591 | * we've updated the existing ref, free the newly | ||
592 | * allocated ref | ||
593 | */ | ||
594 | kfree(ref); | ||
595 | } else { | ||
596 | delayed_refs->num_heads++; | ||
597 | delayed_refs->num_heads_ready++; | ||
598 | delayed_refs->num_entries++; | ||
599 | trans->delayed_ref_updates++; | ||
600 | } | ||
601 | return 0; | ||
602 | } | ||
603 | |||
604 | /* | ||
605 | * helper to insert a delayed tree ref into the rbtree. | ||
606 | */ | ||
607 | static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, | ||
608 | struct btrfs_delayed_ref_node *ref, | ||
609 | u64 bytenr, u64 num_bytes, u64 parent, | ||
610 | u64 ref_root, int level, int action) | ||
611 | { | ||
612 | struct btrfs_delayed_ref_node *existing; | ||
613 | struct btrfs_delayed_tree_ref *full_ref; | ||
614 | struct btrfs_delayed_ref_root *delayed_refs; | ||
615 | |||
616 | if (action == BTRFS_ADD_DELAYED_EXTENT) | ||
617 | action = BTRFS_ADD_DELAYED_REF; | ||
618 | |||
619 | delayed_refs = &trans->transaction->delayed_refs; | ||
620 | |||
621 | /* first set the basic ref node struct up */ | ||
622 | atomic_set(&ref->refs, 1); | ||
623 | ref->bytenr = bytenr; | ||
486 | ref->num_bytes = num_bytes; | 624 | ref->num_bytes = num_bytes; |
625 | ref->ref_mod = 1; | ||
626 | ref->action = action; | ||
627 | ref->is_head = 0; | ||
628 | ref->in_tree = 1; | ||
487 | 629 | ||
488 | if (btrfs_delayed_ref_is_head(ref)) { | 630 | full_ref = btrfs_delayed_node_to_tree_ref(ref); |
489 | head_ref = btrfs_delayed_node_to_head(ref); | 631 | if (parent) { |
490 | head_ref->must_insert_reserved = must_insert_reserved; | 632 | full_ref->parent = parent; |
491 | INIT_LIST_HEAD(&head_ref->cluster); | 633 | ref->type = BTRFS_SHARED_BLOCK_REF_KEY; |
492 | mutex_init(&head_ref->mutex); | ||
493 | } else { | 634 | } else { |
494 | full_ref = btrfs_delayed_node_to_ref(ref); | ||
495 | full_ref->root = ref_root; | 635 | full_ref->root = ref_root; |
496 | full_ref->generation = ref_generation; | 636 | ref->type = BTRFS_TREE_BLOCK_REF_KEY; |
497 | full_ref->owner_objectid = owner_objectid; | ||
498 | full_ref->pin = pin; | ||
499 | full_ref->action = action; | ||
500 | } | 637 | } |
638 | full_ref->level = level; | ||
501 | 639 | ||
502 | existing = tree_insert(&delayed_refs->root, bytenr, | 640 | existing = tree_insert(&delayed_refs->root, &ref->rb_node); |
503 | parent, &ref->rb_node); | ||
504 | 641 | ||
505 | if (existing) { | 642 | if (existing) { |
506 | if (btrfs_delayed_ref_is_head(ref)) | 643 | update_existing_ref(trans, delayed_refs, existing, ref); |
507 | update_existing_head_ref(existing, ref); | 644 | /* |
508 | else | 645 | * we've updated the existing ref, free the newly |
509 | update_existing_ref(trans, delayed_refs, existing, ref); | 646 | * allocated ref |
647 | */ | ||
648 | kfree(ref); | ||
649 | } else { | ||
650 | delayed_refs->num_entries++; | ||
651 | trans->delayed_ref_updates++; | ||
652 | } | ||
653 | return 0; | ||
654 | } | ||
655 | |||
656 | /* | ||
657 | * helper to insert a delayed data ref into the rbtree. | ||
658 | */ | ||
659 | static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, | ||
660 | struct btrfs_delayed_ref_node *ref, | ||
661 | u64 bytenr, u64 num_bytes, u64 parent, | ||
662 | u64 ref_root, u64 owner, u64 offset, | ||
663 | int action) | ||
664 | { | ||
665 | struct btrfs_delayed_ref_node *existing; | ||
666 | struct btrfs_delayed_data_ref *full_ref; | ||
667 | struct btrfs_delayed_ref_root *delayed_refs; | ||
668 | |||
669 | if (action == BTRFS_ADD_DELAYED_EXTENT) | ||
670 | action = BTRFS_ADD_DELAYED_REF; | ||
671 | |||
672 | delayed_refs = &trans->transaction->delayed_refs; | ||
673 | |||
674 | /* first set the basic ref node struct up */ | ||
675 | atomic_set(&ref->refs, 1); | ||
676 | ref->bytenr = bytenr; | ||
677 | ref->num_bytes = num_bytes; | ||
678 | ref->ref_mod = 1; | ||
679 | ref->action = action; | ||
680 | ref->is_head = 0; | ||
681 | ref->in_tree = 1; | ||
682 | |||
683 | full_ref = btrfs_delayed_node_to_data_ref(ref); | ||
684 | if (parent) { | ||
685 | full_ref->parent = parent; | ||
686 | ref->type = BTRFS_SHARED_DATA_REF_KEY; | ||
687 | } else { | ||
688 | full_ref->root = ref_root; | ||
689 | ref->type = BTRFS_EXTENT_DATA_REF_KEY; | ||
690 | } | ||
691 | full_ref->objectid = owner; | ||
692 | full_ref->offset = offset; | ||
510 | 693 | ||
694 | existing = tree_insert(&delayed_refs->root, &ref->rb_node); | ||
695 | |||
696 | if (existing) { | ||
697 | update_existing_ref(trans, delayed_refs, existing, ref); | ||
511 | /* | 698 | /* |
512 | * we've updated the existing ref, free the newly | 699 | * we've updated the existing ref, free the newly |
513 | * allocated ref | 700 | * allocated ref |
514 | */ | 701 | */ |
515 | kfree(ref); | 702 | kfree(ref); |
516 | } else { | 703 | } else { |
517 | if (btrfs_delayed_ref_is_head(ref)) { | ||
518 | delayed_refs->num_heads++; | ||
519 | delayed_refs->num_heads_ready++; | ||
520 | } | ||
521 | delayed_refs->num_entries++; | 704 | delayed_refs->num_entries++; |
522 | trans->delayed_ref_updates++; | 705 | trans->delayed_ref_updates++; |
523 | } | 706 | } |
@@ -525,37 +708,78 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, | |||
525 | } | 708 | } |
526 | 709 | ||
527 | /* | 710 | /* |
528 | * add a delayed ref to the tree. This does all of the accounting required | 711 | * add a delayed tree ref. This does all of the accounting required |
529 | * to make sure the delayed ref is eventually processed before this | 712 | * to make sure the delayed ref is eventually processed before this |
530 | * transaction commits. | 713 | * transaction commits. |
531 | */ | 714 | */ |
532 | int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, | 715 | int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, |
533 | u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, | 716 | u64 bytenr, u64 num_bytes, u64 parent, |
534 | u64 ref_generation, u64 owner_objectid, int action, | 717 | u64 ref_root, int level, int action, |
535 | int pin) | 718 | struct btrfs_delayed_extent_op *extent_op) |
536 | { | 719 | { |
537 | struct btrfs_delayed_ref *ref; | 720 | struct btrfs_delayed_tree_ref *ref; |
538 | struct btrfs_delayed_ref_head *head_ref; | 721 | struct btrfs_delayed_ref_head *head_ref; |
539 | struct btrfs_delayed_ref_root *delayed_refs; | 722 | struct btrfs_delayed_ref_root *delayed_refs; |
540 | int ret; | 723 | int ret; |
541 | 724 | ||
725 | BUG_ON(extent_op && extent_op->is_data); | ||
542 | ref = kmalloc(sizeof(*ref), GFP_NOFS); | 726 | ref = kmalloc(sizeof(*ref), GFP_NOFS); |
543 | if (!ref) | 727 | if (!ref) |
544 | return -ENOMEM; | 728 | return -ENOMEM; |
545 | 729 | ||
730 | head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); | ||
731 | if (!head_ref) { | ||
732 | kfree(ref); | ||
733 | return -ENOMEM; | ||
734 | } | ||
735 | |||
736 | head_ref->extent_op = extent_op; | ||
737 | |||
738 | delayed_refs = &trans->transaction->delayed_refs; | ||
739 | spin_lock(&delayed_refs->lock); | ||
740 | |||
546 | /* | 741 | /* |
547 | * the parent = 0 case comes from cases where we don't actually | 742 | * insert both the head node and the new ref without dropping |
548 | * know the parent yet. It will get updated later via a add/drop | 743 | * the spin lock |
549 | * pair. | ||
550 | */ | 744 | */ |
551 | if (parent == 0) | 745 | ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, |
552 | parent = bytenr; | 746 | action, 0); |
747 | BUG_ON(ret); | ||
748 | |||
749 | ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes, | ||
750 | parent, ref_root, level, action); | ||
751 | BUG_ON(ret); | ||
752 | spin_unlock(&delayed_refs->lock); | ||
753 | return 0; | ||
754 | } | ||
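For orientation, a caller that wants to bump the reference count on a COWed tree block goes through this new entry point roughly as in the sketch below. Only the btrfs_add_delayed_tree_ref() call itself comes from the patch; the wrapper name and the assumption that a dirty extent buffer "buf" and its owning root are in scope are illustrative.

static int queue_tree_block_ref(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct extent_buffer *buf, int level)
{
	/* parent == 0 requests a keyed (implicit) backref owned by root;
	 * no extent_op is attached because only the ref count changes */
	return btrfs_add_delayed_tree_ref(trans, buf->start, buf->len, 0,
					  root->root_key.objectid, level,
					  BTRFS_ADD_DELAYED_REF, NULL);
}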
755 | |||
756 | /* | ||
757 | * add a delayed data ref. It's similar to btrfs_add_delayed_tree_ref. | ||
758 | */ | ||
759 | int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, | ||
760 | u64 bytenr, u64 num_bytes, | ||
761 | u64 parent, u64 ref_root, | ||
762 | u64 owner, u64 offset, int action, | ||
763 | struct btrfs_delayed_extent_op *extent_op) | ||
764 | { | ||
765 | struct btrfs_delayed_data_ref *ref; | ||
766 | struct btrfs_delayed_ref_head *head_ref; | ||
767 | struct btrfs_delayed_ref_root *delayed_refs; | ||
768 | int ret; | ||
769 | |||
770 | BUG_ON(extent_op && !extent_op->is_data); | ||
771 | ref = kmalloc(sizeof(*ref), GFP_NOFS); | ||
772 | if (!ref) | ||
773 | return -ENOMEM; | ||
553 | 774 | ||
554 | head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); | 775 | head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); |
555 | if (!head_ref) { | 776 | if (!head_ref) { |
556 | kfree(ref); | 777 | kfree(ref); |
557 | return -ENOMEM; | 778 | return -ENOMEM; |
558 | } | 779 | } |
780 | |||
781 | head_ref->extent_op = extent_op; | ||
782 | |||
559 | delayed_refs = &trans->transaction->delayed_refs; | 783 | delayed_refs = &trans->transaction->delayed_refs; |
560 | spin_lock(&delayed_refs->lock); | 784 | spin_lock(&delayed_refs->lock); |
561 | 785 | ||
@@ -563,14 +787,39 @@ int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, | |||
563 | * insert both the head node and the new ref without dropping | 787 | * insert both the head node and the new ref without dropping |
564 | * the spin lock | 788 | * the spin lock |
565 | */ | 789 | */ |
566 | ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes, | 790 | ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, |
567 | (u64)-1, 0, 0, 0, action, pin); | 791 | action, 1); |
568 | BUG_ON(ret); | 792 | BUG_ON(ret); |
569 | 793 | ||
570 | ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes, | 794 | ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes, |
571 | parent, ref_root, ref_generation, | 795 | parent, ref_root, owner, offset, action); |
572 | owner_objectid, action, pin); | 796 | BUG_ON(ret); |
797 | spin_unlock(&delayed_refs->lock); | ||
798 | return 0; | ||
799 | } | ||
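The data variant identifies the owner with an inode objectid and file offset rather than a tree level. A hedged sketch of a caller dropping one implicit reference to a file extent follows; the wrapper name and parameters are assumptions, while the call matches the signature introduced above.

static int queue_data_extent_drop(struct btrfs_trans_handle *trans,
				  u64 bytenr, u64 num_bytes, u64 ref_root,
				  u64 inode_objectid, u64 file_offset)
{
	/* parent == 0: implicit backref keyed by (root, inode, offset);
	 * BTRFS_DROP_DELAYED_REF is applied when the delayed refs run */
	return btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 0,
					  ref_root, inode_objectid,
					  file_offset, BTRFS_DROP_DELAYED_REF,
					  NULL);
}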
800 | |||
801 | int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, | ||
802 | u64 bytenr, u64 num_bytes, | ||
803 | struct btrfs_delayed_extent_op *extent_op) | ||
804 | { | ||
805 | struct btrfs_delayed_ref_head *head_ref; | ||
806 | struct btrfs_delayed_ref_root *delayed_refs; | ||
807 | int ret; | ||
808 | |||
809 | head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); | ||
810 | if (!head_ref) | ||
811 | return -ENOMEM; | ||
812 | |||
813 | head_ref->extent_op = extent_op; | ||
814 | |||
815 | delayed_refs = &trans->transaction->delayed_refs; | ||
816 | spin_lock(&delayed_refs->lock); | ||
817 | |||
818 | ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, | ||
819 | num_bytes, BTRFS_UPDATE_DELAYED_HEAD, | ||
820 | extent_op->is_data); | ||
573 | BUG_ON(ret); | 821 | BUG_ON(ret); |
822 | |||
574 | spin_unlock(&delayed_refs->lock); | 823 | spin_unlock(&delayed_refs->lock); |
575 | return 0; | 824 | return 0; |
576 | } | 825 | } |
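btrfs_add_delayed_extent_op() only queues a head node; what actually gets done is described by the btrfs_delayed_extent_op the caller attaches. A minimal sketch of a caller requesting a flag update on a tree block's extent item is shown below; the wrapper is hypothetical, but the struct fields match the new definition in delayed-ref.h.

static int queue_extent_flag_update(struct btrfs_trans_handle *trans,
				    u64 bytenr, u64 num_bytes, u64 flag)
{
	struct btrfs_delayed_extent_op *op;

	op = kmalloc(sizeof(*op), GFP_NOFS);
	if (!op)
		return -ENOMEM;

	op->flags_to_set = flag;
	op->update_flags = 1;	/* OR flag into the extent item's flags */
	op->update_key = 0;	/* leave the recorded first key alone */
	op->is_data = 0;	/* this head describes a tree block */

	/* the head keeps the pointer and frees it once the op has run */
	return btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, op);
}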
@@ -587,7 +836,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) | |||
587 | struct btrfs_delayed_ref_root *delayed_refs; | 836 | struct btrfs_delayed_ref_root *delayed_refs; |
588 | 837 | ||
589 | delayed_refs = &trans->transaction->delayed_refs; | 838 | delayed_refs = &trans->transaction->delayed_refs; |
590 | ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); | 839 | ref = find_ref_head(&delayed_refs->root, bytenr, NULL); |
591 | if (ref) | 840 | if (ref) |
592 | return btrfs_delayed_node_to_head(ref); | 841 | return btrfs_delayed_node_to_head(ref); |
593 | return NULL; | 842 | return NULL; |
@@ -603,6 +852,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) | |||
603 | * | 852 | * |
604 | * It is the same as doing a ref add and delete in two separate calls. | 853 | * It is the same as doing a ref add and delete in two separate calls. |
605 | */ | 854 | */ |
855 | #if 0 | ||
606 | int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, | 856 | int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, |
607 | u64 bytenr, u64 num_bytes, u64 orig_parent, | 857 | u64 bytenr, u64 num_bytes, u64 orig_parent, |
608 | u64 parent, u64 orig_ref_root, u64 ref_root, | 858 | u64 parent, u64 orig_ref_root, u64 ref_root, |
@@ -666,3 +916,4 @@ int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, | |||
666 | spin_unlock(&delayed_refs->lock); | 916 | spin_unlock(&delayed_refs->lock); |
667 | return 0; | 917 | return 0; |
668 | } | 918 | } |
919 | #endif | ||
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 3bec2ff0b15c..f6fc67ddad36 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h | |||
@@ -30,9 +30,6 @@ struct btrfs_delayed_ref_node { | |||
30 | /* the starting bytenr of the extent */ | 30 | /* the starting bytenr of the extent */ |
31 | u64 bytenr; | 31 | u64 bytenr; |
32 | 32 | ||
33 | /* the parent our backref will point to */ | ||
34 | u64 parent; | ||
35 | |||
36 | /* the size of the extent */ | 33 | /* the size of the extent */ |
37 | u64 num_bytes; | 34 | u64 num_bytes; |
38 | 35 | ||
@@ -50,10 +47,21 @@ struct btrfs_delayed_ref_node { | |||
50 | */ | 47 | */ |
51 | int ref_mod; | 48 | int ref_mod; |
52 | 49 | ||
50 | unsigned int action:8; | ||
51 | unsigned int type:8; | ||
53 | /* is this node still in the rbtree? */ | 52 | /* is this node still in the rbtree? */ |
53 | unsigned int is_head:1; | ||
54 | unsigned int in_tree:1; | 54 | unsigned int in_tree:1; |
55 | }; | 55 | }; |
56 | 56 | ||
57 | struct btrfs_delayed_extent_op { | ||
58 | struct btrfs_disk_key key; | ||
59 | u64 flags_to_set; | ||
60 | unsigned int update_key:1; | ||
61 | unsigned int update_flags:1; | ||
62 | unsigned int is_data:1; | ||
63 | }; | ||
64 | |||
57 | /* | 65 | /* |
58 | * the head refs are used to hold a lock on a given extent, which allows us | 66 | * the head refs are used to hold a lock on a given extent, which allows us |
59 | * to make sure that only one process is running the delayed refs | 67 | * to make sure that only one process is running the delayed refs |
@@ -71,6 +79,7 @@ struct btrfs_delayed_ref_head { | |||
71 | 79 | ||
72 | struct list_head cluster; | 80 | struct list_head cluster; |
73 | 81 | ||
82 | struct btrfs_delayed_extent_op *extent_op; | ||
74 | /* | 83 | /* |
75 | * when a new extent is allocated, it is just reserved in memory | 84 | * when a new extent is allocated, it is just reserved in memory |
76 | * The actual extent isn't inserted into the extent allocation tree | 85 | * The actual extent isn't inserted into the extent allocation tree |
@@ -84,27 +93,26 @@ struct btrfs_delayed_ref_head { | |||
84 | * the free has happened. | 93 | * the free has happened. |
85 | */ | 94 | */ |
86 | unsigned int must_insert_reserved:1; | 95 | unsigned int must_insert_reserved:1; |
96 | unsigned int is_data:1; | ||
87 | }; | 97 | }; |
88 | 98 | ||
89 | struct btrfs_delayed_ref { | 99 | struct btrfs_delayed_tree_ref { |
90 | struct btrfs_delayed_ref_node node; | 100 | struct btrfs_delayed_ref_node node; |
101 | union { | ||
102 | u64 root; | ||
103 | u64 parent; | ||
104 | }; | ||
105 | int level; | ||
106 | }; | ||
91 | 107 | ||
92 | /* the root objectid our ref will point to */ | 108 | struct btrfs_delayed_data_ref { |
93 | u64 root; | 109 | struct btrfs_delayed_ref_node node; |
94 | 110 | union { | |
95 | /* the generation for the backref */ | 111 | u64 root; |
96 | u64 generation; | 112 | u64 parent; |
97 | 113 | }; | |
98 | /* owner_objectid of the backref */ | 114 | u64 objectid; |
99 | u64 owner_objectid; | 115 | u64 offset; |
100 | |||
101 | /* operation done by this entry in the rbtree */ | ||
102 | u8 action; | ||
103 | |||
104 | /* if pin == 1, when the extent is freed it will be pinned until | ||
105 | * transaction commit | ||
106 | */ | ||
107 | unsigned int pin:1; | ||
108 | }; | 116 | }; |
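In both of the new ref structs the anonymous union is interpreted through node->type: shared refs (BTRFS_SHARED_BLOCK_REF_KEY / BTRFS_SHARED_DATA_REF_KEY) store the parent block's bytenr, keyed refs store the owning root's objectid. A small illustrative helper, not part of the patch:

static inline u64 delayed_tree_ref_target(struct btrfs_delayed_ref_node *node)
{
	struct btrfs_delayed_tree_ref *ref =
		btrfs_delayed_node_to_tree_ref(node);

	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
		return ref->parent;	/* full backref: parent block bytenr */
	return ref->root;		/* implicit backref: owner root id */
}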
109 | 117 | ||
110 | struct btrfs_delayed_ref_root { | 118 | struct btrfs_delayed_ref_root { |
@@ -143,17 +151,25 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) | |||
143 | } | 151 | } |
144 | } | 152 | } |
145 | 153 | ||
146 | int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, | 154 | int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, |
147 | u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, | 155 | u64 bytenr, u64 num_bytes, u64 parent, |
148 | u64 ref_generation, u64 owner_objectid, int action, | 156 | u64 ref_root, int level, int action, |
149 | int pin); | 157 | struct btrfs_delayed_extent_op *extent_op); |
158 | int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, | ||
159 | u64 bytenr, u64 num_bytes, | ||
160 | u64 parent, u64 ref_root, | ||
161 | u64 owner, u64 offset, int action, | ||
162 | struct btrfs_delayed_extent_op *extent_op); | ||
163 | int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, | ||
164 | u64 bytenr, u64 num_bytes, | ||
165 | struct btrfs_delayed_extent_op *extent_op); | ||
150 | 166 | ||
151 | struct btrfs_delayed_ref_head * | 167 | struct btrfs_delayed_ref_head * |
152 | btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); | 168 | btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); |
153 | int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr); | 169 | int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr); |
154 | int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, | 170 | int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, |
155 | struct btrfs_root *root, u64 bytenr, | 171 | struct btrfs_root *root, u64 bytenr, |
156 | u64 num_bytes, u32 *refs); | 172 | u64 num_bytes, u64 *refs, u64 *flags); |
157 | int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, | 173 | int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, |
158 | u64 bytenr, u64 num_bytes, u64 orig_parent, | 174 | u64 bytenr, u64 num_bytes, u64 orig_parent, |
159 | u64 parent, u64 orig_ref_root, u64 ref_root, | 175 | u64 parent, u64 orig_ref_root, u64 ref_root, |
@@ -169,18 +185,24 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, | |||
169 | */ | 185 | */ |
170 | static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node) | 186 | static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node) |
171 | { | 187 | { |
172 | return node->parent == (u64)-1; | 188 | return node->is_head; |
173 | } | 189 | } |
174 | 190 | ||
175 | /* | 191 | /* |
176 | * helper functions to cast a node into its container | 192 | * helper functions to cast a node into its container |
177 | */ | 193 | */ |
178 | static inline struct btrfs_delayed_ref * | 194 | static inline struct btrfs_delayed_tree_ref * |
179 | btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node) | 195 | btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node) |
180 | { | 196 | { |
181 | WARN_ON(btrfs_delayed_ref_is_head(node)); | 197 | WARN_ON(btrfs_delayed_ref_is_head(node)); |
182 | return container_of(node, struct btrfs_delayed_ref, node); | 198 | return container_of(node, struct btrfs_delayed_tree_ref, node); |
199 | } | ||
183 | 200 | ||
201 | static inline struct btrfs_delayed_data_ref * | ||
202 | btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node) | ||
203 | { | ||
204 | WARN_ON(btrfs_delayed_ref_is_head(node)); | ||
205 | return container_of(node, struct btrfs_delayed_data_ref, node); | ||
184 | } | 206 | } |
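These cast helpers are what the delayed-ref processing code uses to recover the concrete ref type from the node embedded in the rbtree. A hedged sketch of that dispatch follows; the handle_* functions are hypothetical placeholders for the real work done in extent-tree.c.

static int run_one_node(struct btrfs_trans_handle *trans,
			struct btrfs_delayed_ref_node *node)
{
	if (btrfs_delayed_ref_is_head(node))
		return handle_head(trans, btrfs_delayed_node_to_head(node));

	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
		return handle_tree_ref(trans,
				       btrfs_delayed_node_to_tree_ref(node));

	/* remaining types are data refs */
	return handle_data_ref(trans, btrfs_delayed_node_to_data_ref(node));
}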
185 | 207 | ||
186 | static inline struct btrfs_delayed_ref_head * | 208 | static inline struct btrfs_delayed_ref_head * |
@@ -188,6 +210,5 @@ btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node) | |||
188 | { | 210 | { |
189 | WARN_ON(!btrfs_delayed_ref_is_head(node)); | 211 | WARN_ON(!btrfs_delayed_ref_is_head(node)); |
190 | return container_of(node, struct btrfs_delayed_ref_head, node); | 212 | return container_of(node, struct btrfs_delayed_ref_head, node); |
191 | |||
192 | } | 213 | } |
193 | #endif | 214 | #endif |
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4b0ea0b80c23..0d50d49d990a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c | |||
@@ -26,8 +26,8 @@ | |||
26 | #include <linux/workqueue.h> | 26 | #include <linux/workqueue.h> |
27 | #include <linux/kthread.h> | 27 | #include <linux/kthread.h> |
28 | #include <linux/freezer.h> | 28 | #include <linux/freezer.h> |
29 | #include <linux/crc32c.h> | ||
29 | #include "compat.h" | 30 | #include "compat.h" |
30 | #include "crc32c.h" | ||
31 | #include "ctree.h" | 31 | #include "ctree.h" |
32 | #include "disk-io.h" | 32 | #include "disk-io.h" |
33 | #include "transaction.h" | 33 | #include "transaction.h" |
@@ -36,7 +36,6 @@ | |||
36 | #include "print-tree.h" | 36 | #include "print-tree.h" |
37 | #include "async-thread.h" | 37 | #include "async-thread.h" |
38 | #include "locking.h" | 38 | #include "locking.h" |
39 | #include "ref-cache.h" | ||
40 | #include "tree-log.h" | 39 | #include "tree-log.h" |
41 | #include "free-space-cache.h" | 40 | #include "free-space-cache.h" |
42 | 41 | ||
@@ -172,7 +171,7 @@ out: | |||
172 | 171 | ||
173 | u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len) | 172 | u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len) |
174 | { | 173 | { |
175 | return btrfs_crc32c(seed, data, len); | 174 | return crc32c(seed, data, len); |
176 | } | 175 | } |
177 | 176 | ||
178 | void btrfs_csum_final(u32 crc, char *result) | 177 | void btrfs_csum_final(u32 crc, char *result) |
@@ -884,7 +883,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, | |||
884 | { | 883 | { |
885 | root->node = NULL; | 884 | root->node = NULL; |
886 | root->commit_root = NULL; | 885 | root->commit_root = NULL; |
887 | root->ref_tree = NULL; | ||
888 | root->sectorsize = sectorsize; | 886 | root->sectorsize = sectorsize; |
889 | root->nodesize = nodesize; | 887 | root->nodesize = nodesize; |
890 | root->leafsize = leafsize; | 888 | root->leafsize = leafsize; |
@@ -899,12 +897,14 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, | |||
899 | root->last_inode_alloc = 0; | 897 | root->last_inode_alloc = 0; |
900 | root->name = NULL; | 898 | root->name = NULL; |
901 | root->in_sysfs = 0; | 899 | root->in_sysfs = 0; |
900 | root->inode_tree.rb_node = NULL; | ||
902 | 901 | ||
903 | INIT_LIST_HEAD(&root->dirty_list); | 902 | INIT_LIST_HEAD(&root->dirty_list); |
904 | INIT_LIST_HEAD(&root->orphan_list); | 903 | INIT_LIST_HEAD(&root->orphan_list); |
905 | INIT_LIST_HEAD(&root->dead_list); | 904 | INIT_LIST_HEAD(&root->root_list); |
906 | spin_lock_init(&root->node_lock); | 905 | spin_lock_init(&root->node_lock); |
907 | spin_lock_init(&root->list_lock); | 906 | spin_lock_init(&root->list_lock); |
907 | spin_lock_init(&root->inode_lock); | ||
908 | mutex_init(&root->objectid_mutex); | 908 | mutex_init(&root->objectid_mutex); |
909 | mutex_init(&root->log_mutex); | 909 | mutex_init(&root->log_mutex); |
910 | init_waitqueue_head(&root->log_writer_wait); | 910 | init_waitqueue_head(&root->log_writer_wait); |
@@ -918,9 +918,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, | |||
918 | extent_io_tree_init(&root->dirty_log_pages, | 918 | extent_io_tree_init(&root->dirty_log_pages, |
919 | fs_info->btree_inode->i_mapping, GFP_NOFS); | 919 | fs_info->btree_inode->i_mapping, GFP_NOFS); |
920 | 920 | ||
921 | btrfs_leaf_ref_tree_init(&root->ref_tree_struct); | ||
922 | root->ref_tree = &root->ref_tree_struct; | ||
923 | |||
924 | memset(&root->root_key, 0, sizeof(root->root_key)); | 921 | memset(&root->root_key, 0, sizeof(root->root_key)); |
925 | memset(&root->root_item, 0, sizeof(root->root_item)); | 922 | memset(&root->root_item, 0, sizeof(root->root_item)); |
926 | memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); | 923 | memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); |
@@ -959,6 +956,7 @@ static int find_and_setup_root(struct btrfs_root *tree_root, | |||
959 | blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); | 956 | blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); |
960 | root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), | 957 | root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), |
961 | blocksize, generation); | 958 | blocksize, generation); |
959 | root->commit_root = btrfs_root_node(root); | ||
962 | BUG_ON(!root->node); | 960 | BUG_ON(!root->node); |
963 | return 0; | 961 | return 0; |
964 | } | 962 | } |
@@ -1025,20 +1023,19 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, | |||
1025 | */ | 1023 | */ |
1026 | root->ref_cows = 0; | 1024 | root->ref_cows = 0; |
1027 | 1025 | ||
1028 | leaf = btrfs_alloc_free_block(trans, root, root->leafsize, | 1026 | leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, |
1029 | 0, BTRFS_TREE_LOG_OBJECTID, | 1027 | BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); |
1030 | trans->transid, 0, 0, 0); | ||
1031 | if (IS_ERR(leaf)) { | 1028 | if (IS_ERR(leaf)) { |
1032 | kfree(root); | 1029 | kfree(root); |
1033 | return ERR_CAST(leaf); | 1030 | return ERR_CAST(leaf); |
1034 | } | 1031 | } |
1035 | 1032 | ||
1033 | memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); | ||
1034 | btrfs_set_header_bytenr(leaf, leaf->start); | ||
1035 | btrfs_set_header_generation(leaf, trans->transid); | ||
1036 | btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); | ||
1037 | btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID); | ||
1036 | root->node = leaf; | 1038 | root->node = leaf; |
1037 | btrfs_set_header_nritems(root->node, 0); | ||
1038 | btrfs_set_header_level(root->node, 0); | ||
1039 | btrfs_set_header_bytenr(root->node, root->node->start); | ||
1040 | btrfs_set_header_generation(root->node, trans->transid); | ||
1041 | btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID); | ||
1042 | 1039 | ||
1043 | write_extent_buffer(root->node, root->fs_info->fsid, | 1040 | write_extent_buffer(root->node, root->fs_info->fsid, |
1044 | (unsigned long)btrfs_header_fsid(root->node), | 1041 | (unsigned long)btrfs_header_fsid(root->node), |
@@ -1081,8 +1078,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, | |||
1081 | inode_item->nbytes = cpu_to_le64(root->leafsize); | 1078 | inode_item->nbytes = cpu_to_le64(root->leafsize); |
1082 | inode_item->mode = cpu_to_le32(S_IFDIR | 0755); | 1079 | inode_item->mode = cpu_to_le32(S_IFDIR | 0755); |
1083 | 1080 | ||
1084 | btrfs_set_root_bytenr(&log_root->root_item, log_root->node->start); | 1081 | btrfs_set_root_node(&log_root->root_item, log_root->node); |
1085 | btrfs_set_root_generation(&log_root->root_item, trans->transid); | ||
1086 | 1082 | ||
1087 | WARN_ON(root->log_root); | 1083 | WARN_ON(root->log_root); |
1088 | root->log_root = log_root; | 1084 | root->log_root = log_root; |
@@ -1144,6 +1140,7 @@ out: | |||
1144 | blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); | 1140 | blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); |
1145 | root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), | 1141 | root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), |
1146 | blocksize, generation); | 1142 | blocksize, generation); |
1143 | root->commit_root = btrfs_root_node(root); | ||
1147 | BUG_ON(!root->node); | 1144 | BUG_ON(!root->node); |
1148 | insert: | 1145 | insert: |
1149 | if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { | 1146 | if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { |
@@ -1210,7 +1207,7 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, | |||
1210 | } | 1207 | } |
1211 | if (!(fs_info->sb->s_flags & MS_RDONLY)) { | 1208 | if (!(fs_info->sb->s_flags & MS_RDONLY)) { |
1212 | ret = btrfs_find_dead_roots(fs_info->tree_root, | 1209 | ret = btrfs_find_dead_roots(fs_info->tree_root, |
1213 | root->root_key.objectid, root); | 1210 | root->root_key.objectid); |
1214 | BUG_ON(ret); | 1211 | BUG_ON(ret); |
1215 | btrfs_orphan_cleanup(root); | 1212 | btrfs_orphan_cleanup(root); |
1216 | } | 1213 | } |
@@ -1569,8 +1566,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1569 | atomic_set(&fs_info->async_delalloc_pages, 0); | 1566 | atomic_set(&fs_info->async_delalloc_pages, 0); |
1570 | atomic_set(&fs_info->async_submit_draining, 0); | 1567 | atomic_set(&fs_info->async_submit_draining, 0); |
1571 | atomic_set(&fs_info->nr_async_bios, 0); | 1568 | atomic_set(&fs_info->nr_async_bios, 0); |
1572 | atomic_set(&fs_info->throttles, 0); | ||
1573 | atomic_set(&fs_info->throttle_gen, 0); | ||
1574 | fs_info->sb = sb; | 1569 | fs_info->sb = sb; |
1575 | fs_info->max_extent = (u64)-1; | 1570 | fs_info->max_extent = (u64)-1; |
1576 | fs_info->max_inline = 8192 * 1024; | 1571 | fs_info->max_inline = 8192 * 1024; |
@@ -1598,6 +1593,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1598 | fs_info->btree_inode->i_mapping->a_ops = &btree_aops; | 1593 | fs_info->btree_inode->i_mapping->a_ops = &btree_aops; |
1599 | fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi; | 1594 | fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi; |
1600 | 1595 | ||
1596 | RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); | ||
1601 | extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, | 1597 | extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, |
1602 | fs_info->btree_inode->i_mapping, | 1598 | fs_info->btree_inode->i_mapping, |
1603 | GFP_NOFS); | 1599 | GFP_NOFS); |
@@ -1613,10 +1609,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1613 | fs_info->btree_inode->i_mapping, GFP_NOFS); | 1609 | fs_info->btree_inode->i_mapping, GFP_NOFS); |
1614 | fs_info->do_barriers = 1; | 1610 | fs_info->do_barriers = 1; |
1615 | 1611 | ||
1616 | INIT_LIST_HEAD(&fs_info->dead_reloc_roots); | ||
1617 | btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree); | ||
1618 | btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree); | ||
1619 | |||
1620 | BTRFS_I(fs_info->btree_inode)->root = tree_root; | 1612 | BTRFS_I(fs_info->btree_inode)->root = tree_root; |
1621 | memset(&BTRFS_I(fs_info->btree_inode)->location, 0, | 1613 | memset(&BTRFS_I(fs_info->btree_inode)->location, 0, |
1622 | sizeof(struct btrfs_key)); | 1614 | sizeof(struct btrfs_key)); |
@@ -1674,6 +1666,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1674 | goto fail_iput; | 1666 | goto fail_iput; |
1675 | } | 1667 | } |
1676 | 1668 | ||
1669 | features = btrfs_super_incompat_flags(disk_super); | ||
1670 | if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) { | ||
1671 | features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; | ||
1672 | btrfs_set_super_incompat_flags(disk_super, features); | ||
1673 | } | ||
1674 | |||
1677 | features = btrfs_super_compat_ro_flags(disk_super) & | 1675 | features = btrfs_super_compat_ro_flags(disk_super) & |
1678 | ~BTRFS_FEATURE_COMPAT_RO_SUPP; | 1676 | ~BTRFS_FEATURE_COMPAT_RO_SUPP; |
1679 | if (!(sb->s_flags & MS_RDONLY) && features) { | 1677 | if (!(sb->s_flags & MS_RDONLY) && features) { |
@@ -1771,7 +1769,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1771 | if (ret) { | 1769 | if (ret) { |
1772 | printk(KERN_WARNING "btrfs: failed to read the system " | 1770 | printk(KERN_WARNING "btrfs: failed to read the system " |
1773 | "array on %s\n", sb->s_id); | 1771 | "array on %s\n", sb->s_id); |
1774 | goto fail_sys_array; | 1772 | goto fail_sb_buffer; |
1775 | } | 1773 | } |
1776 | 1774 | ||
1777 | blocksize = btrfs_level_size(tree_root, | 1775 | blocksize = btrfs_level_size(tree_root, |
@@ -1785,6 +1783,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1785 | btrfs_super_chunk_root(disk_super), | 1783 | btrfs_super_chunk_root(disk_super), |
1786 | blocksize, generation); | 1784 | blocksize, generation); |
1787 | BUG_ON(!chunk_root->node); | 1785 | BUG_ON(!chunk_root->node); |
1786 | btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); | ||
1787 | chunk_root->commit_root = btrfs_root_node(chunk_root); | ||
1788 | 1788 | ||
1789 | read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, | 1789 | read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, |
1790 | (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), | 1790 | (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), |
@@ -1810,7 +1810,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1810 | blocksize, generation); | 1810 | blocksize, generation); |
1811 | if (!tree_root->node) | 1811 | if (!tree_root->node) |
1812 | goto fail_chunk_root; | 1812 | goto fail_chunk_root; |
1813 | 1813 | btrfs_set_root_node(&tree_root->root_item, tree_root->node); |
1814 | tree_root->commit_root = btrfs_root_node(tree_root); | ||
1814 | 1815 | ||
1815 | ret = find_and_setup_root(tree_root, fs_info, | 1816 | ret = find_and_setup_root(tree_root, fs_info, |
1816 | BTRFS_EXTENT_TREE_OBJECTID, extent_root); | 1817 | BTRFS_EXTENT_TREE_OBJECTID, extent_root); |
@@ -1820,14 +1821,14 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1820 | 1821 | ||
1821 | ret = find_and_setup_root(tree_root, fs_info, | 1822 | ret = find_and_setup_root(tree_root, fs_info, |
1822 | BTRFS_DEV_TREE_OBJECTID, dev_root); | 1823 | BTRFS_DEV_TREE_OBJECTID, dev_root); |
1823 | dev_root->track_dirty = 1; | ||
1824 | if (ret) | 1824 | if (ret) |
1825 | goto fail_extent_root; | 1825 | goto fail_extent_root; |
1826 | dev_root->track_dirty = 1; | ||
1826 | 1827 | ||
1827 | ret = find_and_setup_root(tree_root, fs_info, | 1828 | ret = find_and_setup_root(tree_root, fs_info, |
1828 | BTRFS_CSUM_TREE_OBJECTID, csum_root); | 1829 | BTRFS_CSUM_TREE_OBJECTID, csum_root); |
1829 | if (ret) | 1830 | if (ret) |
1830 | goto fail_extent_root; | 1831 | goto fail_dev_root; |
1831 | 1832 | ||
1832 | csum_root->track_dirty = 1; | 1833 | csum_root->track_dirty = 1; |
1833 | 1834 | ||
@@ -1849,6 +1850,14 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1849 | if (IS_ERR(fs_info->transaction_kthread)) | 1850 | if (IS_ERR(fs_info->transaction_kthread)) |
1850 | goto fail_cleaner; | 1851 | goto fail_cleaner; |
1851 | 1852 | ||
1853 | if (!btrfs_test_opt(tree_root, SSD) && | ||
1854 | !btrfs_test_opt(tree_root, NOSSD) && | ||
1855 | !fs_info->fs_devices->rotating) { | ||
1856 | printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD " | ||
1857 | "mode\n"); | ||
1858 | btrfs_set_opt(fs_info->mount_opt, SSD); | ||
1859 | } | ||
1860 | |||
1852 | if (btrfs_super_log_root(disk_super) != 0) { | 1861 | if (btrfs_super_log_root(disk_super) != 0) { |
1853 | u64 bytenr = btrfs_super_log_root(disk_super); | 1862 | u64 bytenr = btrfs_super_log_root(disk_super); |
1854 | 1863 | ||
@@ -1881,7 +1890,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1881 | } | 1890 | } |
1882 | 1891 | ||
1883 | if (!(sb->s_flags & MS_RDONLY)) { | 1892 | if (!(sb->s_flags & MS_RDONLY)) { |
1884 | ret = btrfs_cleanup_reloc_trees(tree_root); | 1893 | ret = btrfs_recover_relocation(tree_root); |
1885 | BUG_ON(ret); | 1894 | BUG_ON(ret); |
1886 | } | 1895 | } |
1887 | 1896 | ||
@@ -1892,6 +1901,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1892 | fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); | 1901 | fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); |
1893 | if (!fs_info->fs_root) | 1902 | if (!fs_info->fs_root) |
1894 | goto fail_trans_kthread; | 1903 | goto fail_trans_kthread; |
1904 | |||
1895 | return tree_root; | 1905 | return tree_root; |
1896 | 1906 | ||
1897 | fail_trans_kthread: | 1907 | fail_trans_kthread: |
@@ -1908,14 +1918,19 @@ fail_cleaner: | |||
1908 | 1918 | ||
1909 | fail_csum_root: | 1919 | fail_csum_root: |
1910 | free_extent_buffer(csum_root->node); | 1920 | free_extent_buffer(csum_root->node); |
1921 | free_extent_buffer(csum_root->commit_root); | ||
1922 | fail_dev_root: | ||
1923 | free_extent_buffer(dev_root->node); | ||
1924 | free_extent_buffer(dev_root->commit_root); | ||
1911 | fail_extent_root: | 1925 | fail_extent_root: |
1912 | free_extent_buffer(extent_root->node); | 1926 | free_extent_buffer(extent_root->node); |
1927 | free_extent_buffer(extent_root->commit_root); | ||
1913 | fail_tree_root: | 1928 | fail_tree_root: |
1914 | free_extent_buffer(tree_root->node); | 1929 | free_extent_buffer(tree_root->node); |
1930 | free_extent_buffer(tree_root->commit_root); | ||
1915 | fail_chunk_root: | 1931 | fail_chunk_root: |
1916 | free_extent_buffer(chunk_root->node); | 1932 | free_extent_buffer(chunk_root->node); |
1917 | fail_sys_array: | 1933 | free_extent_buffer(chunk_root->commit_root); |
1918 | free_extent_buffer(dev_root->node); | ||
1919 | fail_sb_buffer: | 1934 | fail_sb_buffer: |
1920 | btrfs_stop_workers(&fs_info->fixup_workers); | 1935 | btrfs_stop_workers(&fs_info->fixup_workers); |
1921 | btrfs_stop_workers(&fs_info->delalloc_workers); | 1936 | btrfs_stop_workers(&fs_info->delalloc_workers); |
@@ -2005,6 +2020,17 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) | |||
2005 | return latest; | 2020 | return latest; |
2006 | } | 2021 | } |
2007 | 2022 | ||
2023 | /* | ||
2024 | * this should be called twice, once with wait == 0 and | ||
2025 | * once with wait == 1. When wait == 0 is done, all the buffer heads | ||
2026 | * we write are pinned. | ||
2027 | * | ||
2028 | * They are released when wait == 1 is done. | ||
2029 | * max_mirrors must be the same for both runs, and it indicates how | ||
2030 | * many supers on this one device should be written. | ||
2031 | * | ||
2032 | * max_mirrors == 0 means to write them all. | ||
2033 | */ | ||
2008 | static int write_dev_supers(struct btrfs_device *device, | 2034 | static int write_dev_supers(struct btrfs_device *device, |
2009 | struct btrfs_super_block *sb, | 2035 | struct btrfs_super_block *sb, |
2010 | int do_barriers, int wait, int max_mirrors) | 2036 | int do_barriers, int wait, int max_mirrors) |
@@ -2040,12 +2066,16 @@ static int write_dev_supers(struct btrfs_device *device, | |||
2040 | bh = __find_get_block(device->bdev, bytenr / 4096, | 2066 | bh = __find_get_block(device->bdev, bytenr / 4096, |
2041 | BTRFS_SUPER_INFO_SIZE); | 2067 | BTRFS_SUPER_INFO_SIZE); |
2042 | BUG_ON(!bh); | 2068 | BUG_ON(!bh); |
2043 | brelse(bh); | ||
2044 | wait_on_buffer(bh); | 2069 | wait_on_buffer(bh); |
2045 | if (buffer_uptodate(bh)) { | 2070 | if (!buffer_uptodate(bh)) |
2046 | brelse(bh); | 2071 | errors++; |
2047 | continue; | 2072 | |
2048 | } | 2073 | /* drop our reference */ |
2074 | brelse(bh); | ||
2075 | |||
2076 | /* drop the reference from the wait == 0 run */ | ||
2077 | brelse(bh); | ||
2078 | continue; | ||
2049 | } else { | 2079 | } else { |
2050 | btrfs_set_super_bytenr(sb, bytenr); | 2080 | btrfs_set_super_bytenr(sb, bytenr); |
2051 | 2081 | ||
@@ -2056,12 +2086,18 @@ static int write_dev_supers(struct btrfs_device *device, | |||
2056 | BTRFS_CSUM_SIZE); | 2086 | BTRFS_CSUM_SIZE); |
2057 | btrfs_csum_final(crc, sb->csum); | 2087 | btrfs_csum_final(crc, sb->csum); |
2058 | 2088 | ||
2089 | /* | ||
2090 | * one reference for us, and we leave it for the | ||
2091 | * caller | ||
2092 | */ | ||
2059 | bh = __getblk(device->bdev, bytenr / 4096, | 2093 | bh = __getblk(device->bdev, bytenr / 4096, |
2060 | BTRFS_SUPER_INFO_SIZE); | 2094 | BTRFS_SUPER_INFO_SIZE); |
2061 | memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); | 2095 | memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); |
2062 | 2096 | ||
2063 | set_buffer_uptodate(bh); | 2097 | /* one reference for submit_bh */ |
2064 | get_bh(bh); | 2098 | get_bh(bh); |
2099 | |||
2100 | set_buffer_uptodate(bh); | ||
2065 | lock_buffer(bh); | 2101 | lock_buffer(bh); |
2066 | bh->b_end_io = btrfs_end_buffer_write_sync; | 2102 | bh->b_end_io = btrfs_end_buffer_write_sync; |
2067 | } | 2103 | } |
@@ -2073,6 +2109,7 @@ static int write_dev_supers(struct btrfs_device *device, | |||
2073 | device->name); | 2109 | device->name); |
2074 | set_buffer_uptodate(bh); | 2110 | set_buffer_uptodate(bh); |
2075 | device->barriers = 0; | 2111 | device->barriers = 0; |
2112 | /* one reference for submit_bh */ | ||
2076 | get_bh(bh); | 2113 | get_bh(bh); |
2077 | lock_buffer(bh); | 2114 | lock_buffer(bh); |
2078 | ret = submit_bh(WRITE_SYNC, bh); | 2115 | ret = submit_bh(WRITE_SYNC, bh); |
@@ -2081,22 +2118,15 @@ static int write_dev_supers(struct btrfs_device *device, | |||
2081 | ret = submit_bh(WRITE_SYNC, bh); | 2118 | ret = submit_bh(WRITE_SYNC, bh); |
2082 | } | 2119 | } |
2083 | 2120 | ||
2084 | if (!ret && wait) { | 2121 | if (ret) |
2085 | wait_on_buffer(bh); | ||
2086 | if (!buffer_uptodate(bh)) | ||
2087 | errors++; | ||
2088 | } else if (ret) { | ||
2089 | errors++; | 2122 | errors++; |
2090 | } | ||
2091 | if (wait) | ||
2092 | brelse(bh); | ||
2093 | } | 2123 | } |
2094 | return errors < i ? 0 : -1; | 2124 | return errors < i ? 0 : -1; |
2095 | } | 2125 | } |
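write_all_supers() below drives the two-pass protocol described in the comment above write_dev_supers(). Stripped of the per-device error accounting, the pairing looks roughly like the sketch below; note that in the patch pass 0 is issued for every device before any pass 1 wait, while this simplified helper collapses both passes onto one device.

static int write_supers_on_dev(struct btrfs_device *dev,
			       struct btrfs_super_block *sb,
			       int do_barriers, int max_mirrors)
{
	int ret;

	/* pass 0: checksum, pin and submit every super block copy */
	ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors);
	if (ret)
		return ret;

	/* pass 1, same max_mirrors: wait on each buffer head and drop
	 * the reference pass 0 left pinned */
	return write_dev_supers(dev, sb, do_barriers, 1, max_mirrors);
}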
2096 | 2126 | ||
2097 | int write_all_supers(struct btrfs_root *root, int max_mirrors) | 2127 | int write_all_supers(struct btrfs_root *root, int max_mirrors) |
2098 | { | 2128 | { |
2099 | struct list_head *head = &root->fs_info->fs_devices->devices; | 2129 | struct list_head *head; |
2100 | struct btrfs_device *dev; | 2130 | struct btrfs_device *dev; |
2101 | struct btrfs_super_block *sb; | 2131 | struct btrfs_super_block *sb; |
2102 | struct btrfs_dev_item *dev_item; | 2132 | struct btrfs_dev_item *dev_item; |
@@ -2111,6 +2141,9 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) | |||
2111 | 2141 | ||
2112 | sb = &root->fs_info->super_for_commit; | 2142 | sb = &root->fs_info->super_for_commit; |
2113 | dev_item = &sb->dev_item; | 2143 | dev_item = &sb->dev_item; |
2144 | |||
2145 | mutex_lock(&root->fs_info->fs_devices->device_list_mutex); | ||
2146 | head = &root->fs_info->fs_devices->devices; | ||
2114 | list_for_each_entry(dev, head, dev_list) { | 2147 | list_for_each_entry(dev, head, dev_list) { |
2115 | if (!dev->bdev) { | 2148 | if (!dev->bdev) { |
2116 | total_errors++; | 2149 | total_errors++; |
@@ -2154,6 +2187,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) | |||
2154 | if (ret) | 2187 | if (ret) |
2155 | total_errors++; | 2188 | total_errors++; |
2156 | } | 2189 | } |
2190 | mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); | ||
2157 | if (total_errors > max_errors) { | 2191 | if (total_errors > max_errors) { |
2158 | printk(KERN_ERR "btrfs: %d errors while writing supers\n", | 2192 | printk(KERN_ERR "btrfs: %d errors while writing supers\n", |
2159 | total_errors); | 2193 | total_errors); |
@@ -2173,6 +2207,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans, | |||
2173 | 2207 | ||
2174 | int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) | 2208 | int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) |
2175 | { | 2209 | { |
2210 | WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); | ||
2176 | radix_tree_delete(&fs_info->fs_roots_radix, | 2211 | radix_tree_delete(&fs_info->fs_roots_radix, |
2177 | (unsigned long)root->root_key.objectid); | 2212 | (unsigned long)root->root_key.objectid); |
2178 | if (root->anon_super.s_dev) { | 2213 | if (root->anon_super.s_dev) { |
@@ -2219,10 +2254,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) | |||
2219 | ARRAY_SIZE(gang)); | 2254 | ARRAY_SIZE(gang)); |
2220 | if (!ret) | 2255 | if (!ret) |
2221 | break; | 2256 | break; |
2257 | |||
2258 | root_objectid = gang[ret - 1]->root_key.objectid + 1; | ||
2222 | for (i = 0; i < ret; i++) { | 2259 | for (i = 0; i < ret; i++) { |
2223 | root_objectid = gang[i]->root_key.objectid; | 2260 | root_objectid = gang[i]->root_key.objectid; |
2224 | ret = btrfs_find_dead_roots(fs_info->tree_root, | 2261 | ret = btrfs_find_dead_roots(fs_info->tree_root, |
2225 | root_objectid, gang[i]); | 2262 | root_objectid); |
2226 | BUG_ON(ret); | 2263 | BUG_ON(ret); |
2227 | btrfs_orphan_cleanup(gang[i]); | 2264 | btrfs_orphan_cleanup(gang[i]); |
2228 | } | 2265 | } |
@@ -2278,20 +2315,16 @@ int close_ctree(struct btrfs_root *root) | |||
2278 | (unsigned long long)fs_info->total_ref_cache_size); | 2315 | (unsigned long long)fs_info->total_ref_cache_size); |
2279 | } | 2316 | } |
2280 | 2317 | ||
2281 | if (fs_info->extent_root->node) | 2318 | free_extent_buffer(fs_info->extent_root->node); |
2282 | free_extent_buffer(fs_info->extent_root->node); | 2319 | free_extent_buffer(fs_info->extent_root->commit_root); |
2283 | 2320 | free_extent_buffer(fs_info->tree_root->node); | |
2284 | if (fs_info->tree_root->node) | 2321 | free_extent_buffer(fs_info->tree_root->commit_root); |
2285 | free_extent_buffer(fs_info->tree_root->node); | 2322 | free_extent_buffer(root->fs_info->chunk_root->node); |
2286 | 2323 | free_extent_buffer(root->fs_info->chunk_root->commit_root); | |
2287 | if (root->fs_info->chunk_root->node) | 2324 | free_extent_buffer(root->fs_info->dev_root->node); |
2288 | free_extent_buffer(root->fs_info->chunk_root->node); | 2325 | free_extent_buffer(root->fs_info->dev_root->commit_root); |
2289 | 2326 | free_extent_buffer(root->fs_info->csum_root->node); | |
2290 | if (root->fs_info->dev_root->node) | 2327 | free_extent_buffer(root->fs_info->csum_root->commit_root); |
2291 | free_extent_buffer(root->fs_info->dev_root->node); | ||
2292 | |||
2293 | if (root->fs_info->csum_root->node) | ||
2294 | free_extent_buffer(root->fs_info->csum_root->node); | ||
2295 | 2328 | ||
2296 | btrfs_free_block_groups(root->fs_info); | 2329 | btrfs_free_block_groups(root->fs_info); |
2297 | 2330 | ||
@@ -2373,17 +2406,14 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) | |||
2373 | * looks as though older kernels can get into trouble with | 2406 | * looks as though older kernels can get into trouble with |
2374 | * this code, they end up stuck in balance_dirty_pages forever | 2407 | * this code, they end up stuck in balance_dirty_pages forever |
2375 | */ | 2408 | */ |
2376 | struct extent_io_tree *tree; | ||
2377 | u64 num_dirty; | 2409 | u64 num_dirty; |
2378 | u64 start = 0; | ||
2379 | unsigned long thresh = 32 * 1024 * 1024; | 2410 | unsigned long thresh = 32 * 1024 * 1024; |
2380 | tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; | ||
2381 | 2411 | ||
2382 | if (current->flags & PF_MEMALLOC) | 2412 | if (current->flags & PF_MEMALLOC) |
2383 | return; | 2413 | return; |
2384 | 2414 | ||
2385 | num_dirty = count_range_bits(tree, &start, (u64)-1, | 2415 | num_dirty = root->fs_info->dirty_metadata_bytes; |
2386 | thresh, EXTENT_DIRTY); | 2416 | |
2387 | if (num_dirty > thresh) { | 2417 | if (num_dirty > thresh) { |
2388 | balance_dirty_pages_ratelimited_nr( | 2418 | balance_dirty_pages_ratelimited_nr( |
2389 | root->fs_info->btree_inode->i_mapping, 1); | 2419 | root->fs_info->btree_inode->i_mapping, 1); |
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 85315d2c90de..9596b40caa4e 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c | |||
@@ -78,7 +78,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, | |||
78 | btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); | 78 | btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); |
79 | key.offset = 0; | 79 | key.offset = 0; |
80 | 80 | ||
81 | inode = btrfs_iget(sb, &key, root, NULL); | 81 | inode = btrfs_iget(sb, &key, root); |
82 | if (IS_ERR(inode)) | 82 | if (IS_ERR(inode)) |
83 | return (void *)inode; | 83 | return (void *)inode; |
84 | 84 | ||
@@ -192,7 +192,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child) | |||
192 | btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); | 192 | btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); |
193 | key.offset = 0; | 193 | key.offset = 0; |
194 | 194 | ||
195 | return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); | 195 | return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root)); |
196 | } | 196 | } |
197 | 197 | ||
198 | const struct export_operations btrfs_export_ops = { | 198 | const struct export_operations btrfs_export_ops = { |
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 35af93355063..edc7d208c5ce 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c | |||
@@ -23,50 +23,39 @@ | |||
23 | #include <linux/rcupdate.h> | 23 | #include <linux/rcupdate.h> |
24 | #include "compat.h" | 24 | #include "compat.h" |
25 | #include "hash.h" | 25 | #include "hash.h" |
26 | #include "crc32c.h" | ||
27 | #include "ctree.h" | 26 | #include "ctree.h" |
28 | #include "disk-io.h" | 27 | #include "disk-io.h" |
29 | #include "print-tree.h" | 28 | #include "print-tree.h" |
30 | #include "transaction.h" | 29 | #include "transaction.h" |
31 | #include "volumes.h" | 30 | #include "volumes.h" |
32 | #include "locking.h" | 31 | #include "locking.h" |
33 | #include "ref-cache.h" | ||
34 | #include "free-space-cache.h" | 32 | #include "free-space-cache.h" |
35 | 33 | ||
36 | #define PENDING_EXTENT_INSERT 0 | ||
37 | #define PENDING_EXTENT_DELETE 1 | ||
38 | #define PENDING_BACKREF_UPDATE 2 | ||
39 | |||
40 | struct pending_extent_op { | ||
41 | int type; | ||
42 | u64 bytenr; | ||
43 | u64 num_bytes; | ||
44 | u64 parent; | ||
45 | u64 orig_parent; | ||
46 | u64 generation; | ||
47 | u64 orig_generation; | ||
48 | int level; | ||
49 | struct list_head list; | ||
50 | int del; | ||
51 | }; | ||
52 | |||
53 | static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, | ||
54 | struct btrfs_root *root, u64 parent, | ||
55 | u64 root_objectid, u64 ref_generation, | ||
56 | u64 owner, struct btrfs_key *ins, | ||
57 | int ref_mod); | ||
58 | static int update_reserved_extents(struct btrfs_root *root, | 34 | static int update_reserved_extents(struct btrfs_root *root, |
59 | u64 bytenr, u64 num, int reserve); | 35 | u64 bytenr, u64 num, int reserve); |
60 | static int update_block_group(struct btrfs_trans_handle *trans, | 36 | static int update_block_group(struct btrfs_trans_handle *trans, |
61 | struct btrfs_root *root, | 37 | struct btrfs_root *root, |
62 | u64 bytenr, u64 num_bytes, int alloc, | 38 | u64 bytenr, u64 num_bytes, int alloc, |
63 | int mark_free); | 39 | int mark_free); |
64 | static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans, | 40 | static int __btrfs_free_extent(struct btrfs_trans_handle *trans, |
65 | struct btrfs_root *root, | 41 | struct btrfs_root *root, |
66 | u64 bytenr, u64 num_bytes, u64 parent, | 42 | u64 bytenr, u64 num_bytes, u64 parent, |
67 | u64 root_objectid, u64 ref_generation, | 43 | u64 root_objectid, u64 owner_objectid, |
68 | u64 owner_objectid, int pin, | 44 | u64 owner_offset, int refs_to_drop, |
69 | int ref_to_drop); | 45 | struct btrfs_delayed_extent_op *extra_op); |
46 | static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, | ||
47 | struct extent_buffer *leaf, | ||
48 | struct btrfs_extent_item *ei); | ||
49 | static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, | ||
50 | struct btrfs_root *root, | ||
51 | u64 parent, u64 root_objectid, | ||
52 | u64 flags, u64 owner, u64 offset, | ||
53 | struct btrfs_key *ins, int ref_mod); | ||
54 | static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, | ||
55 | struct btrfs_root *root, | ||
56 | u64 parent, u64 root_objectid, | ||
57 | u64 flags, struct btrfs_disk_key *key, | ||
58 | int level, struct btrfs_key *ins); | ||
70 | 59 | ||
71 | static int do_chunk_alloc(struct btrfs_trans_handle *trans, | 60 | static int do_chunk_alloc(struct btrfs_trans_handle *trans, |
72 | struct btrfs_root *extent_root, u64 alloc_bytes, | 61 | struct btrfs_root *extent_root, u64 alloc_bytes, |
@@ -453,199 +442,969 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) | |||
453 | * maintenance. This is actually the same as #2, but with a slightly | 442 | * maintenance. This is actually the same as #2, but with a slightly |
454 | * different use case. | 443 | * different use case. |
455 | * | 444 | * |
445 | * There are two kinds of back refs. The implicit back refs is optimized | ||
446 | * for pointers in non-shared tree blocks. For a given pointer in a block, | ||
447 | * back refs of this kind provide information about the block's owner tree | ||
448 | * and the pointer's key. This information allows us to find the block by | ||
449 | * b-tree searching. The full back refs is for pointers in tree blocks not | ||
450 | * referenced by their owner trees. The location of tree block is recorded | ||
451 | * in the back refs. Actually the full back refs is generic, and can be | ||
452 | * used in all cases the implicit back refs is used. The major shortcoming | ||
453 | * of the full back refs is its overhead. Every time a tree block gets | ||
454 | * COWed, we have to update back refs entry for all pointers in it. | ||
455 | * | ||
456 | * For a newly allocated tree block, we use implicit back refs for | ||
457 | * pointers in it. This means most tree related operations only involve | ||
458 | * implicit back refs. For a tree block created in old transaction, the | ||
459 | * only way to drop a reference to it is COW it. So we can detect the | ||
460 | * event that tree block loses its owner tree's reference and do the | ||
461 | * back refs conversion. | ||
462 | * | ||
463 | * When a tree block is COW'd through a tree, there are four cases: | ||
464 | * | ||
465 | * The reference count of the block is one and the tree is the block's | ||
466 | * owner tree. Nothing to do in this case. | ||
467 | * | ||
468 | * The reference count of the block is one and the tree is not the | ||
469 | * block's owner tree. In this case, full back refs is used for pointers | ||
470 | * in the block. Remove these full back refs, add implicit back refs for | ||
471 | * every pointers in the new block. | ||
472 | * | ||
473 | * The reference count of the block is greater than one and the tree is | ||
474 | * the block's owner tree. In this case, implicit back refs is used for | ||
475 | * pointers in the block. Add full back refs for every pointers in the | ||
476 | * block, increase lower level extents' reference counts. The original | ||
477 | * implicit back refs are entailed to the new block. | ||
478 | * | ||
479 | * The reference count of the block is greater than one and the tree is | ||
480 | * not the block's owner tree. Add implicit back refs for every pointer in | ||
481 | * the new block, increase lower level extents' reference count. | ||
482 | * | ||
483 | * Back Reference Key composing: | ||
484 | * | ||
485 | * The key objectid corresponds to the first byte in the extent, | ||
486 | * The key type is used to differentiate between types of back refs. | ||
487 | * There are different meanings of the key offset for different types | ||
488 | * of back refs. | ||
489 | * | ||
456 | * File extents can be referenced by: | 490 | * File extents can be referenced by: |
457 | * | 491 | * |
458 | * - multiple snapshots, subvolumes, or different generations in one subvol | 492 | * - multiple snapshots, subvolumes, or different generations in one subvol |
459 | * - different files inside a single subvolume | 493 | * - different files inside a single subvolume |
460 | * - different offsets inside a file (bookend extents in file.c) | 494 | * - different offsets inside a file (bookend extents in file.c) |
461 | * | 495 | * |
462 | * The extent ref structure has fields for: | 496 | * The extent ref structure for the implicit back refs has fields for: |
463 | * | 497 | * |
464 | * - Objectid of the subvolume root | 498 | * - Objectid of the subvolume root |
465 | * - Generation number of the tree holding the reference | ||
466 | * - objectid of the file holding the reference | 499 | * - objectid of the file holding the reference |
467 | * - number of references holding by parent node (alway 1 for tree blocks) | 500 | * - original offset in the file |
468 | * | 501 | * - how many bookend extents |
469 | * Btree leaf may hold multiple references to a file extent. In most cases, | ||
470 | * these references are from same file and the corresponding offsets inside | ||
471 | * the file are close together. | ||
472 | * | ||
473 | * When a file extent is allocated the fields are filled in: | ||
474 | * (root_key.objectid, trans->transid, inode objectid, 1) | ||
475 | * | 502 | * |
476 | * When a leaf is cow'd new references are added for every file extent found | 503 | * The key offset for the implicit back refs is hash of the first |
477 | * in the leaf. It looks similar to the create case, but trans->transid will | 504 | * three fields. |
478 | * be different when the block is cow'd. | ||
479 | * | 505 | * |
480 | * (root_key.objectid, trans->transid, inode objectid, | 506 | * The extent ref structure for the full back refs has field for: |
481 | * number of references in the leaf) | ||
482 | * | 507 | * |
483 | * When a file extent is removed either during snapshot deletion or | 508 | * - number of pointers in the tree leaf |
484 | * file truncation, we find the corresponding back reference and check | ||
485 | * the following fields: | ||
486 | * | 509 | * |
487 | * (btrfs_header_owner(leaf), btrfs_header_generation(leaf), | 510 | * The key offset for the full back refs is the first byte of |
488 | * inode objectid) | 511 | * the tree leaf |
489 | * | 512 | * |
490 | * Btree extents can be referenced by: | 513 | * When a file extent is allocated, The implicit back refs is used. |
491 | * | 514 | * the fields are filled in: |
492 | * - Different subvolumes | ||
493 | * - Different generations of the same subvolume | ||
494 | * | ||
495 | * When a tree block is created, back references are inserted: | ||
496 | * | 515 | * |
497 | * (root->root_key.objectid, trans->transid, level, 1) | 516 | * (root_key.objectid, inode objectid, offset in file, 1) |
498 | * | 517 | * |
499 | * When a tree block is cow'd, new back references are added for all the | 518 | * When a file extent is removed file truncation, we find the |
500 | * blocks it points to. If the tree block isn't in reference counted root, | 519 | * corresponding implicit back refs and check the following fields: |
501 | * the old back references are removed. These new back references are of | ||
502 | * the form (trans->transid will have increased since creation): | ||
503 | * | 520 | * |
504 | * (root->root_key.objectid, trans->transid, level, 1) | 521 | * (btrfs_header_owner(leaf), inode objectid, offset in file) |
505 | * | 522 | * |
506 | * When a backref is in deleting, the following fields are checked: | 523 | * Btree extents can be referenced by: |
507 | * | 524 | * |
508 | * if backref was for a tree root: | 525 | * - Different subvolumes |
509 | * (btrfs_header_owner(itself), btrfs_header_generation(itself), level) | ||
510 | * else | ||
511 | * (btrfs_header_owner(parent), btrfs_header_generation(parent), level) | ||
512 | * | 526 | * |
513 | * Back Reference Key composing: | 527 | * Both the implicit back refs and the full back refs for tree blocks |
528 | * only consist of key. The key offset for the implicit back refs is | ||
529 | * objectid of block's owner tree. The key offset for the full back refs | ||
530 | * is the first byte of parent block. | ||
514 | * | 531 | * |
515 | * The key objectid corresponds to the first byte in the extent, the key | 532 | * When implicit back refs is used, information about the lowest key and |
516 | * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first | 533 | * level of the tree block are required. These information are stored in |
517 | * byte of parent extent. If a extent is tree root, the key offset is set | 534 | * tree block info structure. |
518 | * to the key objectid. | ||
519 | */ | 535 | */ |
520 | 536 | ||
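Concretely, the back-reference key layout described above for data extents can be written out as follows. This restates the rules from the comment and mirrors lookup_extent_data_ref()/hash_extent_data_ref() further down; it is a sketch rather than code from the patch.

static void data_backref_key(struct btrfs_key *key, u64 bytenr, u64 parent,
			     u64 root_objectid, u64 owner, u64 offset)
{
	key->objectid = bytenr;			/* first byte of the extent */
	if (parent) {
		/* full backref: keyed by the referencing leaf's bytenr */
		key->type = BTRFS_SHARED_DATA_REF_KEY;
		key->offset = parent;
	} else {
		/* implicit backref: hash of (owner root, inode, file offset) */
		key->type = BTRFS_EXTENT_DATA_REF_KEY;
		key->offset = hash_extent_data_ref(root_objectid, owner,
						   offset);
	}
}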
521 | static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans, | 537 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 |
522 | struct btrfs_root *root, | 538 | static int convert_extent_item_v0(struct btrfs_trans_handle *trans, |
523 | struct btrfs_path *path, | 539 | struct btrfs_root *root, |
524 | u64 bytenr, u64 parent, | 540 | struct btrfs_path *path, |
525 | u64 ref_root, u64 ref_generation, | 541 | u64 owner, u32 extra_size) |
526 | u64 owner_objectid, int del) | ||
527 | { | 542 | { |
543 | struct btrfs_extent_item *item; | ||
544 | struct btrfs_extent_item_v0 *ei0; | ||
545 | struct btrfs_extent_ref_v0 *ref0; | ||
546 | struct btrfs_tree_block_info *bi; | ||
547 | struct extent_buffer *leaf; | ||
528 | struct btrfs_key key; | 548 | struct btrfs_key key; |
529 | struct btrfs_extent_ref *ref; | 549 | struct btrfs_key found_key; |
550 | u32 new_size = sizeof(*item); | ||
551 | u64 refs; | ||
552 | int ret; | ||
553 | |||
554 | leaf = path->nodes[0]; | ||
555 | BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0)); | ||
556 | |||
557 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); | ||
558 | ei0 = btrfs_item_ptr(leaf, path->slots[0], | ||
559 | struct btrfs_extent_item_v0); | ||
560 | refs = btrfs_extent_refs_v0(leaf, ei0); | ||
561 | |||
562 | if (owner == (u64)-1) { | ||
563 | while (1) { | ||
564 | if (path->slots[0] >= btrfs_header_nritems(leaf)) { | ||
565 | ret = btrfs_next_leaf(root, path); | ||
566 | if (ret < 0) | ||
567 | return ret; | ||
568 | BUG_ON(ret > 0); | ||
569 | leaf = path->nodes[0]; | ||
570 | } | ||
571 | btrfs_item_key_to_cpu(leaf, &found_key, | ||
572 | path->slots[0]); | ||
573 | BUG_ON(key.objectid != found_key.objectid); | ||
574 | if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) { | ||
575 | path->slots[0]++; | ||
576 | continue; | ||
577 | } | ||
578 | ref0 = btrfs_item_ptr(leaf, path->slots[0], | ||
579 | struct btrfs_extent_ref_v0); | ||
580 | owner = btrfs_ref_objectid_v0(leaf, ref0); | ||
581 | break; | ||
582 | } | ||
583 | } | ||
584 | btrfs_release_path(root, path); | ||
585 | |||
586 | if (owner < BTRFS_FIRST_FREE_OBJECTID) | ||
587 | new_size += sizeof(*bi); | ||
588 | |||
589 | new_size -= sizeof(*ei0); | ||
590 | ret = btrfs_search_slot(trans, root, &key, path, | ||
591 | new_size + extra_size, 1); | ||
592 | if (ret < 0) | ||
593 | return ret; | ||
594 | BUG_ON(ret); | ||
595 | |||
596 | ret = btrfs_extend_item(trans, root, path, new_size); | ||
597 | BUG_ON(ret); | ||
598 | |||
599 | leaf = path->nodes[0]; | ||
600 | item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); | ||
601 | btrfs_set_extent_refs(leaf, item, refs); | ||
602 | /* FIXME: get real generation */ | ||
603 | btrfs_set_extent_generation(leaf, item, 0); | ||
604 | if (owner < BTRFS_FIRST_FREE_OBJECTID) { | ||
605 | btrfs_set_extent_flags(leaf, item, | ||
606 | BTRFS_EXTENT_FLAG_TREE_BLOCK | | ||
607 | BTRFS_BLOCK_FLAG_FULL_BACKREF); | ||
608 | bi = (struct btrfs_tree_block_info *)(item + 1); | ||
609 | /* FIXME: get first key of the block */ | ||
610 | memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi)); | ||
611 | btrfs_set_tree_block_level(leaf, bi, (int)owner); | ||
612 | } else { | ||
613 | btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA); | ||
614 | } | ||
615 | btrfs_mark_buffer_dirty(leaf); | ||
616 | return 0; | ||
617 | } | ||
618 | #endif | ||
619 | |||
620 | static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) | ||
621 | { | ||
622 | u32 high_crc = ~(u32)0; | ||
623 | u32 low_crc = ~(u32)0; | ||
624 | __le64 lenum; | ||
625 | |||
626 | lenum = cpu_to_le64(root_objectid); | ||
627 | high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); | ||
628 | lenum = cpu_to_le64(owner); | ||
629 | low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); | ||
630 | lenum = cpu_to_le64(offset); | ||
631 | low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); | ||
632 | |||
633 | return ((u64)high_crc << 31) ^ (u64)low_crc; | ||
634 | } | ||
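The helper above is what keys the new EXTENT_DATA_REF items: root objectid, inode objectid and file offset are folded into a single 64-bit key offset using two crc32c streams, combined with a 31-bit (not 32-bit) shift so the two halves overlap by one bit. The userspace sketch below mirrors that combination; it assumes the kernel's crc32c(seed, data, len) is a plain, non-finalized Castagnoli CRC update and that the host is little-endian, and the bitwise CRC routine is only a stand-in for the kernel helper, not its implementation.

	#include <stdint.h>
	#include <stdio.h>

	/* stand-in for crc32c(): reflected CRC-32C, poly 0x82F63B78,
	 * seeded by the caller, no final inversion */
	static uint32_t crc32c_sw(uint32_t crc, const void *data, size_t len)
	{
		const uint8_t *p = data;
		while (len--) {
			crc ^= *p++;
			for (int i = 0; i < 8; i++)
				crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
		}
		return crc;
	}

	/* same shape as hash_extent_data_ref() above */
	static uint64_t data_ref_hash(uint64_t root, uint64_t owner, uint64_t offset)
	{
		uint32_t high = ~(uint32_t)0, low = ~(uint32_t)0;

		/* the kernel hashes the little-endian encoding of each field */
		high = crc32c_sw(high, &root, sizeof(root));
		low  = crc32c_sw(low, &owner, sizeof(owner));
		low  = crc32c_sw(low, &offset, sizeof(offset));
		return ((uint64_t)high << 31) ^ (uint64_t)low;
	}

	int main(void)
	{
		printf("%llx\n", (unsigned long long)data_ref_hash(5, 257, 0));
		return 0;
	}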
635 | |||
636 | static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, | ||
637 | struct btrfs_extent_data_ref *ref) | ||
638 | { | ||
639 | return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), | ||
640 | btrfs_extent_data_ref_objectid(leaf, ref), | ||
641 | btrfs_extent_data_ref_offset(leaf, ref)); | ||
642 | } | ||
643 | |||
644 | static int match_extent_data_ref(struct extent_buffer *leaf, | ||
645 | struct btrfs_extent_data_ref *ref, | ||
646 | u64 root_objectid, u64 owner, u64 offset) | ||
647 | { | ||
648 | if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || | ||
649 | btrfs_extent_data_ref_objectid(leaf, ref) != owner || | ||
650 | btrfs_extent_data_ref_offset(leaf, ref) != offset) | ||
651 | return 0; | ||
652 | return 1; | ||
653 | } | ||
654 | |||
655 | static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, | ||
656 | struct btrfs_root *root, | ||
657 | struct btrfs_path *path, | ||
658 | u64 bytenr, u64 parent, | ||
659 | u64 root_objectid, | ||
660 | u64 owner, u64 offset) | ||
661 | { | ||
662 | struct btrfs_key key; | ||
663 | struct btrfs_extent_data_ref *ref; | ||
530 | struct extent_buffer *leaf; | 664 | struct extent_buffer *leaf; |
531 | u64 ref_objectid; | 665 | u32 nritems; |
532 | int ret; | 666 | int ret; |
667 | int recow; | ||
668 | int err = -ENOENT; | ||
533 | 669 | ||
534 | key.objectid = bytenr; | 670 | key.objectid = bytenr; |
535 | key.type = BTRFS_EXTENT_REF_KEY; | 671 | if (parent) { |
536 | key.offset = parent; | 672 | key.type = BTRFS_SHARED_DATA_REF_KEY; |
673 | key.offset = parent; | ||
674 | } else { | ||
675 | key.type = BTRFS_EXTENT_DATA_REF_KEY; | ||
676 | key.offset = hash_extent_data_ref(root_objectid, | ||
677 | owner, offset); | ||
678 | } | ||
679 | again: | ||
680 | recow = 0; | ||
681 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); | ||
682 | if (ret < 0) { | ||
683 | err = ret; | ||
684 | goto fail; | ||
685 | } | ||
537 | 686 | ||
538 | ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1); | 687 | if (parent) { |
539 | if (ret < 0) | 688 | if (!ret) |
540 | goto out; | 689 | return 0; |
541 | if (ret > 0) { | 690 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 |
542 | ret = -ENOENT; | 691 | key.type = BTRFS_EXTENT_REF_V0_KEY; |
543 | goto out; | 692 | btrfs_release_path(root, path); |
693 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); | ||
694 | if (ret < 0) { | ||
695 | err = ret; | ||
696 | goto fail; | ||
697 | } | ||
698 | if (!ret) | ||
699 | return 0; | ||
700 | #endif | ||
701 | goto fail; | ||
544 | } | 702 | } |
545 | 703 | ||
546 | leaf = path->nodes[0]; | 704 | leaf = path->nodes[0]; |
547 | ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); | 705 | nritems = btrfs_header_nritems(leaf); |
548 | ref_objectid = btrfs_ref_objectid(leaf, ref); | 706 | while (1) { |
549 | if (btrfs_ref_root(leaf, ref) != ref_root || | 707 | if (path->slots[0] >= nritems) { |
550 | btrfs_ref_generation(leaf, ref) != ref_generation || | 708 | ret = btrfs_next_leaf(root, path); |
551 | (ref_objectid != owner_objectid && | 709 | if (ret < 0) |
552 | ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) { | 710 | err = ret; |
553 | ret = -EIO; | 711 | if (ret) |
554 | WARN_ON(1); | 712 | goto fail; |
555 | goto out; | 713 | |
714 | leaf = path->nodes[0]; | ||
715 | nritems = btrfs_header_nritems(leaf); | ||
716 | recow = 1; | ||
717 | } | ||
718 | |||
719 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); | ||
720 | if (key.objectid != bytenr || | ||
721 | key.type != BTRFS_EXTENT_DATA_REF_KEY) | ||
722 | goto fail; | ||
723 | |||
724 | ref = btrfs_item_ptr(leaf, path->slots[0], | ||
725 | struct btrfs_extent_data_ref); | ||
726 | |||
727 | if (match_extent_data_ref(leaf, ref, root_objectid, | ||
728 | owner, offset)) { | ||
729 | if (recow) { | ||
730 | btrfs_release_path(root, path); | ||
731 | goto again; | ||
732 | } | ||
733 | err = 0; | ||
734 | break; | ||
735 | } | ||
736 | path->slots[0]++; | ||
556 | } | 737 | } |
557 | ret = 0; | 738 | fail: |
558 | out: | 739 | return err; |
559 | return ret; | ||
560 | } | 740 | } |
561 | 741 | ||
562 | static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, | 742 | static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, |
563 | struct btrfs_root *root, | 743 | struct btrfs_root *root, |
564 | struct btrfs_path *path, | 744 | struct btrfs_path *path, |
565 | u64 bytenr, u64 parent, | 745 | u64 bytenr, u64 parent, |
566 | u64 ref_root, u64 ref_generation, | 746 | u64 root_objectid, u64 owner, |
567 | u64 owner_objectid, | 747 | u64 offset, int refs_to_add) |
568 | int refs_to_add) | ||
569 | { | 748 | { |
570 | struct btrfs_key key; | 749 | struct btrfs_key key; |
571 | struct extent_buffer *leaf; | 750 | struct extent_buffer *leaf; |
572 | struct btrfs_extent_ref *ref; | 751 | u32 size; |
573 | u32 num_refs; | 752 | u32 num_refs; |
574 | int ret; | 753 | int ret; |
575 | 754 | ||
576 | key.objectid = bytenr; | 755 | key.objectid = bytenr; |
577 | key.type = BTRFS_EXTENT_REF_KEY; | 756 | if (parent) { |
578 | key.offset = parent; | 757 | key.type = BTRFS_SHARED_DATA_REF_KEY; |
758 | key.offset = parent; | ||
759 | size = sizeof(struct btrfs_shared_data_ref); | ||
760 | } else { | ||
761 | key.type = BTRFS_EXTENT_DATA_REF_KEY; | ||
762 | key.offset = hash_extent_data_ref(root_objectid, | ||
763 | owner, offset); | ||
764 | size = sizeof(struct btrfs_extent_data_ref); | ||
765 | } | ||
579 | 766 | ||
580 | ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref)); | 767 | ret = btrfs_insert_empty_item(trans, root, path, &key, size); |
581 | if (ret == 0) { | 768 | if (ret && ret != -EEXIST) |
582 | leaf = path->nodes[0]; | 769 | goto fail; |
583 | ref = btrfs_item_ptr(leaf, path->slots[0], | 770 | |
584 | struct btrfs_extent_ref); | 771 | leaf = path->nodes[0]; |
585 | btrfs_set_ref_root(leaf, ref, ref_root); | 772 | if (parent) { |
586 | btrfs_set_ref_generation(leaf, ref, ref_generation); | 773 | struct btrfs_shared_data_ref *ref; |
587 | btrfs_set_ref_objectid(leaf, ref, owner_objectid); | ||
588 | btrfs_set_ref_num_refs(leaf, ref, refs_to_add); | ||
589 | } else if (ret == -EEXIST) { | ||
590 | u64 existing_owner; | ||
591 | |||
592 | BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID); | ||
593 | leaf = path->nodes[0]; | ||
594 | ref = btrfs_item_ptr(leaf, path->slots[0], | 774 | ref = btrfs_item_ptr(leaf, path->slots[0], |
595 | struct btrfs_extent_ref); | 775 | struct btrfs_shared_data_ref); |
596 | if (btrfs_ref_root(leaf, ref) != ref_root || | 776 | if (ret == 0) { |
597 | btrfs_ref_generation(leaf, ref) != ref_generation) { | 777 | btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); |
598 | ret = -EIO; | 778 | } else { |
599 | WARN_ON(1); | 779 | num_refs = btrfs_shared_data_ref_count(leaf, ref); |
600 | goto out; | 780 | num_refs += refs_to_add; |
781 | btrfs_set_shared_data_ref_count(leaf, ref, num_refs); | ||
601 | } | 782 | } |
783 | } else { | ||
784 | struct btrfs_extent_data_ref *ref; | ||
785 | while (ret == -EEXIST) { | ||
786 | ref = btrfs_item_ptr(leaf, path->slots[0], | ||
787 | struct btrfs_extent_data_ref); | ||
788 | if (match_extent_data_ref(leaf, ref, root_objectid, | ||
789 | owner, offset)) | ||
790 | break; | ||
791 | btrfs_release_path(root, path); | ||
792 | key.offset++; | ||
793 | ret = btrfs_insert_empty_item(trans, root, path, &key, | ||
794 | size); | ||
795 | if (ret && ret != -EEXIST) | ||
796 | goto fail; | ||
602 | 797 | ||
603 | num_refs = btrfs_ref_num_refs(leaf, ref); | 798 | leaf = path->nodes[0]; |
604 | BUG_ON(num_refs == 0); | 799 | } |
605 | btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add); | 800 | ref = btrfs_item_ptr(leaf, path->slots[0], |
606 | 801 | struct btrfs_extent_data_ref); | |
607 | existing_owner = btrfs_ref_objectid(leaf, ref); | 802 | if (ret == 0) { |
608 | if (existing_owner != owner_objectid && | 803 | btrfs_set_extent_data_ref_root(leaf, ref, |
609 | existing_owner != BTRFS_MULTIPLE_OBJECTIDS) { | 804 | root_objectid); |
610 | btrfs_set_ref_objectid(leaf, ref, | 805 | btrfs_set_extent_data_ref_objectid(leaf, ref, owner); |
611 | BTRFS_MULTIPLE_OBJECTIDS); | 806 | btrfs_set_extent_data_ref_offset(leaf, ref, offset); |
807 | btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); | ||
808 | } else { | ||
809 | num_refs = btrfs_extent_data_ref_count(leaf, ref); | ||
810 | num_refs += refs_to_add; | ||
811 | btrfs_set_extent_data_ref_count(leaf, ref, num_refs); | ||
612 | } | 812 | } |
613 | ret = 0; | ||
614 | } else { | ||
615 | goto out; | ||
616 | } | 813 | } |
617 | btrfs_unlock_up_safe(path, 1); | 814 | btrfs_mark_buffer_dirty(leaf); |
618 | btrfs_mark_buffer_dirty(path->nodes[0]); | 815 | ret = 0; |
619 | out: | 816 | fail: |
620 | btrfs_release_path(root, path); | 817 | btrfs_release_path(root, path); |
621 | return ret; | 818 | return ret; |
622 | } | 819 | } |
623 | 820 | ||
624 | static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, | 821 | static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, |
625 | struct btrfs_root *root, | 822 | struct btrfs_root *root, |
626 | struct btrfs_path *path, | 823 | struct btrfs_path *path, |
627 | int refs_to_drop) | 824 | int refs_to_drop) |
628 | { | 825 | { |
826 | struct btrfs_key key; | ||
827 | struct btrfs_extent_data_ref *ref1 = NULL; | ||
828 | struct btrfs_shared_data_ref *ref2 = NULL; | ||
629 | struct extent_buffer *leaf; | 829 | struct extent_buffer *leaf; |
630 | struct btrfs_extent_ref *ref; | 830 | u32 num_refs = 0; |
631 | u32 num_refs; | ||
632 | int ret = 0; | 831 | int ret = 0; |
633 | 832 | ||
634 | leaf = path->nodes[0]; | 833 | leaf = path->nodes[0]; |
635 | ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); | 834 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); |
636 | num_refs = btrfs_ref_num_refs(leaf, ref); | 835 | |
836 | if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { | ||
837 | ref1 = btrfs_item_ptr(leaf, path->slots[0], | ||
838 | struct btrfs_extent_data_ref); | ||
839 | num_refs = btrfs_extent_data_ref_count(leaf, ref1); | ||
840 | } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { | ||
841 | ref2 = btrfs_item_ptr(leaf, path->slots[0], | ||
842 | struct btrfs_shared_data_ref); | ||
843 | num_refs = btrfs_shared_data_ref_count(leaf, ref2); | ||
844 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
845 | } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { | ||
846 | struct btrfs_extent_ref_v0 *ref0; | ||
847 | ref0 = btrfs_item_ptr(leaf, path->slots[0], | ||
848 | struct btrfs_extent_ref_v0); | ||
849 | num_refs = btrfs_ref_count_v0(leaf, ref0); | ||
850 | #endif | ||
851 | } else { | ||
852 | BUG(); | ||
853 | } | ||
854 | |||
637 | BUG_ON(num_refs < refs_to_drop); | 855 | BUG_ON(num_refs < refs_to_drop); |
638 | num_refs -= refs_to_drop; | 856 | num_refs -= refs_to_drop; |
857 | |||
639 | if (num_refs == 0) { | 858 | if (num_refs == 0) { |
640 | ret = btrfs_del_item(trans, root, path); | 859 | ret = btrfs_del_item(trans, root, path); |
641 | } else { | 860 | } else { |
642 | btrfs_set_ref_num_refs(leaf, ref, num_refs); | 861 | if (key.type == BTRFS_EXTENT_DATA_REF_KEY) |
862 | btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); | ||
863 | else if (key.type == BTRFS_SHARED_DATA_REF_KEY) | ||
864 | btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); | ||
865 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
866 | else { | ||
867 | struct btrfs_extent_ref_v0 *ref0; | ||
868 | ref0 = btrfs_item_ptr(leaf, path->slots[0], | ||
869 | struct btrfs_extent_ref_v0); | ||
870 | btrfs_set_ref_count_v0(leaf, ref0, num_refs); | ||
871 | } | ||
872 | #endif | ||
643 | btrfs_mark_buffer_dirty(leaf); | 873 | btrfs_mark_buffer_dirty(leaf); |
644 | } | 874 | } |
875 | return ret; | ||
876 | } | ||
877 | |||
878 | static noinline u32 extent_data_ref_count(struct btrfs_root *root, | ||
879 | struct btrfs_path *path, | ||
880 | struct btrfs_extent_inline_ref *iref) | ||
881 | { | ||
882 | struct btrfs_key key; | ||
883 | struct extent_buffer *leaf; | ||
884 | struct btrfs_extent_data_ref *ref1; | ||
885 | struct btrfs_shared_data_ref *ref2; | ||
886 | u32 num_refs = 0; | ||
887 | |||
888 | leaf = path->nodes[0]; | ||
889 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); | ||
890 | if (iref) { | ||
891 | if (btrfs_extent_inline_ref_type(leaf, iref) == | ||
892 | BTRFS_EXTENT_DATA_REF_KEY) { | ||
893 | ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); | ||
894 | num_refs = btrfs_extent_data_ref_count(leaf, ref1); | ||
895 | } else { | ||
896 | ref2 = (struct btrfs_shared_data_ref *)(iref + 1); | ||
897 | num_refs = btrfs_shared_data_ref_count(leaf, ref2); | ||
898 | } | ||
899 | } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { | ||
900 | ref1 = btrfs_item_ptr(leaf, path->slots[0], | ||
901 | struct btrfs_extent_data_ref); | ||
902 | num_refs = btrfs_extent_data_ref_count(leaf, ref1); | ||
903 | } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { | ||
904 | ref2 = btrfs_item_ptr(leaf, path->slots[0], | ||
905 | struct btrfs_shared_data_ref); | ||
906 | num_refs = btrfs_shared_data_ref_count(leaf, ref2); | ||
907 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
908 | } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { | ||
909 | struct btrfs_extent_ref_v0 *ref0; | ||
910 | ref0 = btrfs_item_ptr(leaf, path->slots[0], | ||
911 | struct btrfs_extent_ref_v0); | ||
912 | num_refs = btrfs_ref_count_v0(leaf, ref0); | ||
913 | #endif | ||
914 | } else { | ||
915 | WARN_ON(1); | ||
916 | } | ||
917 | return num_refs; | ||
918 | } | ||
919 | |||
920 | static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, | ||
921 | struct btrfs_root *root, | ||
922 | struct btrfs_path *path, | ||
923 | u64 bytenr, u64 parent, | ||
924 | u64 root_objectid) | ||
925 | { | ||
926 | struct btrfs_key key; | ||
927 | int ret; | ||
928 | |||
929 | key.objectid = bytenr; | ||
930 | if (parent) { | ||
931 | key.type = BTRFS_SHARED_BLOCK_REF_KEY; | ||
932 | key.offset = parent; | ||
933 | } else { | ||
934 | key.type = BTRFS_TREE_BLOCK_REF_KEY; | ||
935 | key.offset = root_objectid; | ||
936 | } | ||
937 | |||
938 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); | ||
939 | if (ret > 0) | ||
940 | ret = -ENOENT; | ||
941 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
942 | if (ret == -ENOENT && parent) { | ||
943 | btrfs_release_path(root, path); | ||
944 | key.type = BTRFS_EXTENT_REF_V0_KEY; | ||
945 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); | ||
946 | if (ret > 0) | ||
947 | ret = -ENOENT; | ||
948 | } | ||
949 | #endif | ||
950 | return ret; | ||
951 | } | ||
952 | |||
953 | static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, | ||
954 | struct btrfs_root *root, | ||
955 | struct btrfs_path *path, | ||
956 | u64 bytenr, u64 parent, | ||
957 | u64 root_objectid) | ||
958 | { | ||
959 | struct btrfs_key key; | ||
960 | int ret; | ||
961 | |||
962 | key.objectid = bytenr; | ||
963 | if (parent) { | ||
964 | key.type = BTRFS_SHARED_BLOCK_REF_KEY; | ||
965 | key.offset = parent; | ||
966 | } else { | ||
967 | key.type = BTRFS_TREE_BLOCK_REF_KEY; | ||
968 | key.offset = root_objectid; | ||
969 | } | ||
970 | |||
971 | ret = btrfs_insert_empty_item(trans, root, path, &key, 0); | ||
645 | btrfs_release_path(root, path); | 972 | btrfs_release_path(root, path); |
646 | return ret; | 973 | return ret; |
647 | } | 974 | } |
648 | 975 | ||
976 | static inline int extent_ref_type(u64 parent, u64 owner) | ||
977 | { | ||
978 | int type; | ||
979 | if (owner < BTRFS_FIRST_FREE_OBJECTID) { | ||
980 | if (parent > 0) | ||
981 | type = BTRFS_SHARED_BLOCK_REF_KEY; | ||
982 | else | ||
983 | type = BTRFS_TREE_BLOCK_REF_KEY; | ||
984 | } else { | ||
985 | if (parent > 0) | ||
986 | type = BTRFS_SHARED_DATA_REF_KEY; | ||
987 | else | ||
988 | type = BTRFS_EXTENT_DATA_REF_KEY; | ||
989 | } | ||
990 | return type; | ||
991 | } | ||
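For orientation, extent_ref_type() above picks one of the four new backref item types from the (owner, parent) pair, and the same choice mirrors the key setup in lookup_tree_block_ref() and lookup_extent_data_ref():

	tree block (owner <  BTRFS_FIRST_FREE_OBJECTID), parent == 0:  BTRFS_TREE_BLOCK_REF_KEY,   key.offset = root objectid
	tree block,                                      parent  > 0:  BTRFS_SHARED_BLOCK_REF_KEY, key.offset = parent block bytenr
	data extent (owner >= BTRFS_FIRST_FREE_OBJECTID), parent == 0: BTRFS_EXTENT_DATA_REF_KEY,  key.offset = hash_extent_data_ref(root, owner, offset)
	data extent,                                      parent  > 0: BTRFS_SHARED_DATA_REF_KEY,  key.offset = parent block bytenr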
992 | |||
993 | static int find_next_key(struct btrfs_path *path, struct btrfs_key *key) | ||
994 | |||
995 | { | ||
996 | int level; | ||
997 | BUG_ON(!path->keep_locks); | ||
998 | for (level = 0; level < BTRFS_MAX_LEVEL; level++) { | ||
999 | if (!path->nodes[level]) | ||
1000 | break; | ||
1001 | btrfs_assert_tree_locked(path->nodes[level]); | ||
1002 | if (path->slots[level] + 1 >= | ||
1003 | btrfs_header_nritems(path->nodes[level])) | ||
1004 | continue; | ||
1005 | if (level == 0) | ||
1006 | btrfs_item_key_to_cpu(path->nodes[level], key, | ||
1007 | path->slots[level] + 1); | ||
1008 | else | ||
1009 | btrfs_node_key_to_cpu(path->nodes[level], key, | ||
1010 | path->slots[level] + 1); | ||
1011 | return 0; | ||
1012 | } | ||
1013 | return 1; | ||
1014 | } | ||
1015 | |||
1016 | /* | ||
1017 | * look for inline back ref. if back ref is found, *ref_ret is set | ||
1018 | * to the address of inline back ref, and 0 is returned. | ||
1019 | * | ||
1020 | * if back ref isn't found, *ref_ret is set to the address where it | ||
1021 | * should be inserted, and -ENOENT is returned. | ||
1022 | * | ||
1023 | * if insert is true and there are too many inline back refs, the path | ||
1024 | * points to the extent item, and -EAGAIN is returned. | ||
1025 | * | ||
1026 | * NOTE: inline back refs are ordered in the same way that back ref | ||
1027 | * items in the tree are ordered. | ||
1028 | */ | ||
1029 | static noinline_for_stack | ||
1030 | int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, | ||
1031 | struct btrfs_root *root, | ||
1032 | struct btrfs_path *path, | ||
1033 | struct btrfs_extent_inline_ref **ref_ret, | ||
1034 | u64 bytenr, u64 num_bytes, | ||
1035 | u64 parent, u64 root_objectid, | ||
1036 | u64 owner, u64 offset, int insert) | ||
1037 | { | ||
1038 | struct btrfs_key key; | ||
1039 | struct extent_buffer *leaf; | ||
1040 | struct btrfs_extent_item *ei; | ||
1041 | struct btrfs_extent_inline_ref *iref; | ||
1042 | u64 flags; | ||
1043 | u64 item_size; | ||
1044 | unsigned long ptr; | ||
1045 | unsigned long end; | ||
1046 | int extra_size; | ||
1047 | int type; | ||
1048 | int want; | ||
1049 | int ret; | ||
1050 | int err = 0; | ||
1051 | |||
1052 | key.objectid = bytenr; | ||
1053 | key.type = BTRFS_EXTENT_ITEM_KEY; | ||
1054 | key.offset = num_bytes; | ||
1055 | |||
1056 | want = extent_ref_type(parent, owner); | ||
1057 | if (insert) { | ||
1058 | extra_size = btrfs_extent_inline_ref_size(want); | ||
1059 | path->keep_locks = 1; | ||
1060 | } else | ||
1061 | extra_size = -1; | ||
1062 | ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); | ||
1063 | if (ret < 0) { | ||
1064 | err = ret; | ||
1065 | goto out; | ||
1066 | } | ||
1067 | BUG_ON(ret); | ||
1068 | |||
1069 | leaf = path->nodes[0]; | ||
1070 | item_size = btrfs_item_size_nr(leaf, path->slots[0]); | ||
1071 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
1072 | if (item_size < sizeof(*ei)) { | ||
1073 | if (!insert) { | ||
1074 | err = -ENOENT; | ||
1075 | goto out; | ||
1076 | } | ||
1077 | ret = convert_extent_item_v0(trans, root, path, owner, | ||
1078 | extra_size); | ||
1079 | if (ret < 0) { | ||
1080 | err = ret; | ||
1081 | goto out; | ||
1082 | } | ||
1083 | leaf = path->nodes[0]; | ||
1084 | item_size = btrfs_item_size_nr(leaf, path->slots[0]); | ||
1085 | } | ||
1086 | #endif | ||
1087 | BUG_ON(item_size < sizeof(*ei)); | ||
1088 | |||
1089 | ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); | ||
1090 | flags = btrfs_extent_flags(leaf, ei); | ||
1091 | |||
1092 | ptr = (unsigned long)(ei + 1); | ||
1093 | end = (unsigned long)ei + item_size; | ||
1094 | |||
1095 | if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { | ||
1096 | ptr += sizeof(struct btrfs_tree_block_info); | ||
1097 | BUG_ON(ptr > end); | ||
1098 | } else { | ||
1099 | BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); | ||
1100 | } | ||
1101 | |||
1102 | err = -ENOENT; | ||
1103 | while (1) { | ||
1104 | if (ptr >= end) { | ||
1105 | WARN_ON(ptr > end); | ||
1106 | break; | ||
1107 | } | ||
1108 | iref = (struct btrfs_extent_inline_ref *)ptr; | ||
1109 | type = btrfs_extent_inline_ref_type(leaf, iref); | ||
1110 | if (want < type) | ||
1111 | break; | ||
1112 | if (want > type) { | ||
1113 | ptr += btrfs_extent_inline_ref_size(type); | ||
1114 | continue; | ||
1115 | } | ||
1116 | |||
1117 | if (type == BTRFS_EXTENT_DATA_REF_KEY) { | ||
1118 | struct btrfs_extent_data_ref *dref; | ||
1119 | dref = (struct btrfs_extent_data_ref *)(&iref->offset); | ||
1120 | if (match_extent_data_ref(leaf, dref, root_objectid, | ||
1121 | owner, offset)) { | ||
1122 | err = 0; | ||
1123 | break; | ||
1124 | } | ||
1125 | if (hash_extent_data_ref_item(leaf, dref) < | ||
1126 | hash_extent_data_ref(root_objectid, owner, offset)) | ||
1127 | break; | ||
1128 | } else { | ||
1129 | u64 ref_offset; | ||
1130 | ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); | ||
1131 | if (parent > 0) { | ||
1132 | if (parent == ref_offset) { | ||
1133 | err = 0; | ||
1134 | break; | ||
1135 | } | ||
1136 | if (ref_offset < parent) | ||
1137 | break; | ||
1138 | } else { | ||
1139 | if (root_objectid == ref_offset) { | ||
1140 | err = 0; | ||
1141 | break; | ||
1142 | } | ||
1143 | if (ref_offset < root_objectid) | ||
1144 | break; | ||
1145 | } | ||
1146 | } | ||
1147 | ptr += btrfs_extent_inline_ref_size(type); | ||
1148 | } | ||
1149 | if (err == -ENOENT && insert) { | ||
1150 | if (item_size + extra_size >= | ||
1151 | BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { | ||
1152 | err = -EAGAIN; | ||
1153 | goto out; | ||
1154 | } | ||
1155 | /* | ||
1156 | * To add new inline back ref, we have to make sure | ||
1157 | * there is no corresponding back ref item. | ||
1158 | * For simplicity, we just do not add new inline back | ||
1159 | * ref if there is any kind of item for this block | ||
1160 | */ | ||
1161 | if (find_next_key(path, &key) == 0 && key.objectid == bytenr && | ||
1162 | key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { | ||
1163 | err = -EAGAIN; | ||
1164 | goto out; | ||
1165 | } | ||
1166 | } | ||
1167 | *ref_ret = (struct btrfs_extent_inline_ref *)ptr; | ||
1168 | out: | ||
1169 | if (insert) { | ||
1170 | path->keep_locks = 0; | ||
1171 | btrfs_unlock_up_safe(path, 1); | ||
1172 | } | ||
1173 | return err; | ||
1174 | } | ||
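The walk above is easier to follow with the new extent item layout in mind. Pieced together from convert_extent_item_v0() and the pointer arithmetic in lookup_inline_extent_backref(), the body of a BTRFS_EXTENT_ITEM_KEY item now looks roughly like this (the struct definitions live in ctree.h; field widths are omitted here):

	+--------------------------------+
	| struct btrfs_extent_item       |  refs, generation, flags
	+--------------------------------+
	| struct btrfs_tree_block_info   |  present only when FLAG_TREE_BLOCK is set:
	|                                |  first key of the block plus its level
	+--------------------------------+
	| inline ref 0 (u8 type, offset) |  TREE_BLOCK_REF / SHARED_BLOCK_REF:
	+--------------------------------+    offset = root objectid / parent bytenr
	| inline ref 1 ...               |  EXTENT_DATA_REF: a btrfs_extent_data_ref
	+--------------------------------+    overlays the offset field
	| ...                            |  SHARED_DATA_REF: offset = parent bytenr,
	+--------------------------------+    followed by a btrfs_shared_data_ref count

Inline refs are kept in the same order in which the corresponding standalone backref items would sort in the tree, which is why the loop can stop as soon as it sees a type greater than the one it wants.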
1175 | |||
1176 | /* | ||
1177 | * helper to add new inline back ref | ||
1178 | */ | ||
1179 | static noinline_for_stack | ||
1180 | int setup_inline_extent_backref(struct btrfs_trans_handle *trans, | ||
1181 | struct btrfs_root *root, | ||
1182 | struct btrfs_path *path, | ||
1183 | struct btrfs_extent_inline_ref *iref, | ||
1184 | u64 parent, u64 root_objectid, | ||
1185 | u64 owner, u64 offset, int refs_to_add, | ||
1186 | struct btrfs_delayed_extent_op *extent_op) | ||
1187 | { | ||
1188 | struct extent_buffer *leaf; | ||
1189 | struct btrfs_extent_item *ei; | ||
1190 | unsigned long ptr; | ||
1191 | unsigned long end; | ||
1192 | unsigned long item_offset; | ||
1193 | u64 refs; | ||
1194 | int size; | ||
1195 | int type; | ||
1196 | int ret; | ||
1197 | |||
1198 | leaf = path->nodes[0]; | ||
1199 | ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); | ||
1200 | item_offset = (unsigned long)iref - (unsigned long)ei; | ||
1201 | |||
1202 | type = extent_ref_type(parent, owner); | ||
1203 | size = btrfs_extent_inline_ref_size(type); | ||
1204 | |||
1205 | ret = btrfs_extend_item(trans, root, path, size); | ||
1206 | BUG_ON(ret); | ||
1207 | |||
1208 | ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); | ||
1209 | refs = btrfs_extent_refs(leaf, ei); | ||
1210 | refs += refs_to_add; | ||
1211 | btrfs_set_extent_refs(leaf, ei, refs); | ||
1212 | if (extent_op) | ||
1213 | __run_delayed_extent_op(extent_op, leaf, ei); | ||
1214 | |||
1215 | ptr = (unsigned long)ei + item_offset; | ||
1216 | end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); | ||
1217 | if (ptr < end - size) | ||
1218 | memmove_extent_buffer(leaf, ptr + size, ptr, | ||
1219 | end - size - ptr); | ||
1220 | |||
1221 | iref = (struct btrfs_extent_inline_ref *)ptr; | ||
1222 | btrfs_set_extent_inline_ref_type(leaf, iref, type); | ||
1223 | if (type == BTRFS_EXTENT_DATA_REF_KEY) { | ||
1224 | struct btrfs_extent_data_ref *dref; | ||
1225 | dref = (struct btrfs_extent_data_ref *)(&iref->offset); | ||
1226 | btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); | ||
1227 | btrfs_set_extent_data_ref_objectid(leaf, dref, owner); | ||
1228 | btrfs_set_extent_data_ref_offset(leaf, dref, offset); | ||
1229 | btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); | ||
1230 | } else if (type == BTRFS_SHARED_DATA_REF_KEY) { | ||
1231 | struct btrfs_shared_data_ref *sref; | ||
1232 | sref = (struct btrfs_shared_data_ref *)(iref + 1); | ||
1233 | btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); | ||
1234 | btrfs_set_extent_inline_ref_offset(leaf, iref, parent); | ||
1235 | } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { | ||
1236 | btrfs_set_extent_inline_ref_offset(leaf, iref, parent); | ||
1237 | } else { | ||
1238 | btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); | ||
1239 | } | ||
1240 | btrfs_mark_buffer_dirty(leaf); | ||
1241 | return 0; | ||
1242 | } | ||
1243 | |||
1244 | static int lookup_extent_backref(struct btrfs_trans_handle *trans, | ||
1245 | struct btrfs_root *root, | ||
1246 | struct btrfs_path *path, | ||
1247 | struct btrfs_extent_inline_ref **ref_ret, | ||
1248 | u64 bytenr, u64 num_bytes, u64 parent, | ||
1249 | u64 root_objectid, u64 owner, u64 offset) | ||
1250 | { | ||
1251 | int ret; | ||
1252 | |||
1253 | ret = lookup_inline_extent_backref(trans, root, path, ref_ret, | ||
1254 | bytenr, num_bytes, parent, | ||
1255 | root_objectid, owner, offset, 0); | ||
1256 | if (ret != -ENOENT) | ||
1257 | return ret; | ||
1258 | |||
1259 | btrfs_release_path(root, path); | ||
1260 | *ref_ret = NULL; | ||
1261 | |||
1262 | if (owner < BTRFS_FIRST_FREE_OBJECTID) { | ||
1263 | ret = lookup_tree_block_ref(trans, root, path, bytenr, parent, | ||
1264 | root_objectid); | ||
1265 | } else { | ||
1266 | ret = lookup_extent_data_ref(trans, root, path, bytenr, parent, | ||
1267 | root_objectid, owner, offset); | ||
1268 | } | ||
1269 | return ret; | ||
1270 | } | ||
1271 | |||
1272 | /* | ||
1273 | * helper to update/remove inline back ref | ||
1274 | */ | ||
1275 | static noinline_for_stack | ||
1276 | int update_inline_extent_backref(struct btrfs_trans_handle *trans, | ||
1277 | struct btrfs_root *root, | ||
1278 | struct btrfs_path *path, | ||
1279 | struct btrfs_extent_inline_ref *iref, | ||
1280 | int refs_to_mod, | ||
1281 | struct btrfs_delayed_extent_op *extent_op) | ||
1282 | { | ||
1283 | struct extent_buffer *leaf; | ||
1284 | struct btrfs_extent_item *ei; | ||
1285 | struct btrfs_extent_data_ref *dref = NULL; | ||
1286 | struct btrfs_shared_data_ref *sref = NULL; | ||
1287 | unsigned long ptr; | ||
1288 | unsigned long end; | ||
1289 | u32 item_size; | ||
1290 | int size; | ||
1291 | int type; | ||
1292 | int ret; | ||
1293 | u64 refs; | ||
1294 | |||
1295 | leaf = path->nodes[0]; | ||
1296 | ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); | ||
1297 | refs = btrfs_extent_refs(leaf, ei); | ||
1298 | WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); | ||
1299 | refs += refs_to_mod; | ||
1300 | btrfs_set_extent_refs(leaf, ei, refs); | ||
1301 | if (extent_op) | ||
1302 | __run_delayed_extent_op(extent_op, leaf, ei); | ||
1303 | |||
1304 | type = btrfs_extent_inline_ref_type(leaf, iref); | ||
1305 | |||
1306 | if (type == BTRFS_EXTENT_DATA_REF_KEY) { | ||
1307 | dref = (struct btrfs_extent_data_ref *)(&iref->offset); | ||
1308 | refs = btrfs_extent_data_ref_count(leaf, dref); | ||
1309 | } else if (type == BTRFS_SHARED_DATA_REF_KEY) { | ||
1310 | sref = (struct btrfs_shared_data_ref *)(iref + 1); | ||
1311 | refs = btrfs_shared_data_ref_count(leaf, sref); | ||
1312 | } else { | ||
1313 | refs = 1; | ||
1314 | BUG_ON(refs_to_mod != -1); | ||
1315 | } | ||
1316 | |||
1317 | BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); | ||
1318 | refs += refs_to_mod; | ||
1319 | |||
1320 | if (refs > 0) { | ||
1321 | if (type == BTRFS_EXTENT_DATA_REF_KEY) | ||
1322 | btrfs_set_extent_data_ref_count(leaf, dref, refs); | ||
1323 | else | ||
1324 | btrfs_set_shared_data_ref_count(leaf, sref, refs); | ||
1325 | } else { | ||
1326 | size = btrfs_extent_inline_ref_size(type); | ||
1327 | item_size = btrfs_item_size_nr(leaf, path->slots[0]); | ||
1328 | ptr = (unsigned long)iref; | ||
1329 | end = (unsigned long)ei + item_size; | ||
1330 | if (ptr + size < end) | ||
1331 | memmove_extent_buffer(leaf, ptr, ptr + size, | ||
1332 | end - ptr - size); | ||
1333 | item_size -= size; | ||
1334 | ret = btrfs_truncate_item(trans, root, path, item_size, 1); | ||
1335 | BUG_ON(ret); | ||
1336 | } | ||
1337 | btrfs_mark_buffer_dirty(leaf); | ||
1338 | return 0; | ||
1339 | } | ||
1340 | |||
1341 | static noinline_for_stack | ||
1342 | int insert_inline_extent_backref(struct btrfs_trans_handle *trans, | ||
1343 | struct btrfs_root *root, | ||
1344 | struct btrfs_path *path, | ||
1345 | u64 bytenr, u64 num_bytes, u64 parent, | ||
1346 | u64 root_objectid, u64 owner, | ||
1347 | u64 offset, int refs_to_add, | ||
1348 | struct btrfs_delayed_extent_op *extent_op) | ||
1349 | { | ||
1350 | struct btrfs_extent_inline_ref *iref; | ||
1351 | int ret; | ||
1352 | |||
1353 | ret = lookup_inline_extent_backref(trans, root, path, &iref, | ||
1354 | bytenr, num_bytes, parent, | ||
1355 | root_objectid, owner, offset, 1); | ||
1356 | if (ret == 0) { | ||
1357 | BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); | ||
1358 | ret = update_inline_extent_backref(trans, root, path, iref, | ||
1359 | refs_to_add, extent_op); | ||
1360 | } else if (ret == -ENOENT) { | ||
1361 | ret = setup_inline_extent_backref(trans, root, path, iref, | ||
1362 | parent, root_objectid, | ||
1363 | owner, offset, refs_to_add, | ||
1364 | extent_op); | ||
1365 | } | ||
1366 | return ret; | ||
1367 | } | ||
1368 | |||
1369 | static int insert_extent_backref(struct btrfs_trans_handle *trans, | ||
1370 | struct btrfs_root *root, | ||
1371 | struct btrfs_path *path, | ||
1372 | u64 bytenr, u64 parent, u64 root_objectid, | ||
1373 | u64 owner, u64 offset, int refs_to_add) | ||
1374 | { | ||
1375 | int ret; | ||
1376 | if (owner < BTRFS_FIRST_FREE_OBJECTID) { | ||
1377 | BUG_ON(refs_to_add != 1); | ||
1378 | ret = insert_tree_block_ref(trans, root, path, bytenr, | ||
1379 | parent, root_objectid); | ||
1380 | } else { | ||
1381 | ret = insert_extent_data_ref(trans, root, path, bytenr, | ||
1382 | parent, root_objectid, | ||
1383 | owner, offset, refs_to_add); | ||
1384 | } | ||
1385 | return ret; | ||
1386 | } | ||
1387 | |||
1388 | static int remove_extent_backref(struct btrfs_trans_handle *trans, | ||
1389 | struct btrfs_root *root, | ||
1390 | struct btrfs_path *path, | ||
1391 | struct btrfs_extent_inline_ref *iref, | ||
1392 | int refs_to_drop, int is_data) | ||
1393 | { | ||
1394 | int ret; | ||
1395 | |||
1396 | BUG_ON(!is_data && refs_to_drop != 1); | ||
1397 | if (iref) { | ||
1398 | ret = update_inline_extent_backref(trans, root, path, iref, | ||
1399 | -refs_to_drop, NULL); | ||
1400 | } else if (is_data) { | ||
1401 | ret = remove_extent_data_ref(trans, root, path, refs_to_drop); | ||
1402 | } else { | ||
1403 | ret = btrfs_del_item(trans, root, path); | ||
1404 | } | ||
1405 | return ret; | ||
1406 | } | ||
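Taken together, these helpers give the backref code a two-level strategy: try the inline ref inside the extent item first, and fall back to a standalone keyed item only when the inline ref does not exist or does not fit. Roughly:

	add:    __btrfs_inc_extent_ref() (further down)
	          -> insert_inline_extent_backref()       0: done, -EAGAIN: no room inline
	               -> lookup_inline_extent_backref()
	               -> update_inline_extent_backref() or setup_inline_extent_backref()
	          -> insert_extent_backref()              keyed fallback after -EAGAIN
	               -> insert_tree_block_ref() or insert_extent_data_ref()
	lookup: lookup_extent_backref()
	          -> lookup_inline_extent_backref(), else
	          -> lookup_tree_block_ref() or lookup_extent_data_ref()
	drop:   remove_extent_backref()
	          -> update_inline_extent_backref(-refs)  inline ref present
	          -> remove_extent_data_ref()             keyed data ref
	          -> btrfs_del_item()                     keyed tree block ref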
1407 | |||
649 | #ifdef BIO_RW_DISCARD | 1408 | #ifdef BIO_RW_DISCARD |
650 | static void btrfs_issue_discard(struct block_device *bdev, | 1409 | static void btrfs_issue_discard(struct block_device *bdev, |
651 | u64 start, u64 len) | 1410 | u64 start, u64 len) |
@@ -686,71 +1445,40 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, | |||
686 | #endif | 1445 | #endif |
687 | } | 1446 | } |
688 | 1447 | ||
689 | static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans, | 1448 | int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, |
690 | struct btrfs_root *root, u64 bytenr, | 1449 | struct btrfs_root *root, |
691 | u64 num_bytes, | 1450 | u64 bytenr, u64 num_bytes, u64 parent, |
692 | u64 orig_parent, u64 parent, | 1451 | u64 root_objectid, u64 owner, u64 offset) |
693 | u64 orig_root, u64 ref_root, | ||
694 | u64 orig_generation, u64 ref_generation, | ||
695 | u64 owner_objectid) | ||
696 | { | 1452 | { |
697 | int ret; | 1453 | int ret; |
698 | int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID; | 1454 | BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && |
1455 | root_objectid == BTRFS_TREE_LOG_OBJECTID); | ||
699 | 1456 | ||
700 | ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes, | 1457 | if (owner < BTRFS_FIRST_FREE_OBJECTID) { |
701 | orig_parent, parent, orig_root, | 1458 | ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, |
702 | ref_root, orig_generation, | 1459 | parent, root_objectid, (int)owner, |
703 | ref_generation, owner_objectid, pin); | 1460 | BTRFS_ADD_DELAYED_REF, NULL); |
704 | BUG_ON(ret); | 1461 | } else { |
1462 | ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, | ||
1463 | parent, root_objectid, owner, offset, | ||
1464 | BTRFS_ADD_DELAYED_REF, NULL); | ||
1465 | } | ||
705 | return ret; | 1466 | return ret; |
706 | } | 1467 | } |
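Note that btrfs_inc_extent_ref() no longer touches the extent tree directly; it only queues a delayed tree or data ref, and the actual item updates happen later in run_delayed_tree_ref()/run_delayed_data_ref() below. As a rough usage sketch (the variable names are illustrative, not taken from this patch), a caller adding one more data reference from the current root would do something like:

	/* illustrative fragment only */
	ret = btrfs_inc_extent_ref(trans, root,
				   extent_bytenr, extent_num_bytes,
				   0,                        /* parent: 0 unless a shared (full) backref */
				   root->root_key.objectid,  /* root_objectid */
				   inode_objectid,           /* owner */
				   file_extent_offset);      /* offset within the file */
	if (ret)
		return ret;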
707 | 1468 | ||
708 | int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, | ||
709 | struct btrfs_root *root, u64 bytenr, | ||
710 | u64 num_bytes, u64 orig_parent, u64 parent, | ||
711 | u64 ref_root, u64 ref_generation, | ||
712 | u64 owner_objectid) | ||
713 | { | ||
714 | int ret; | ||
715 | if (ref_root == BTRFS_TREE_LOG_OBJECTID && | ||
716 | owner_objectid < BTRFS_FIRST_FREE_OBJECTID) | ||
717 | return 0; | ||
718 | |||
719 | ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes, | ||
720 | orig_parent, parent, ref_root, | ||
721 | ref_root, ref_generation, | ||
722 | ref_generation, owner_objectid); | ||
723 | return ret; | ||
724 | } | ||
725 | static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, | 1469 | static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, |
726 | struct btrfs_root *root, u64 bytenr, | 1470 | struct btrfs_root *root, |
727 | u64 num_bytes, | 1471 | u64 bytenr, u64 num_bytes, |
728 | u64 orig_parent, u64 parent, | 1472 | u64 parent, u64 root_objectid, |
729 | u64 orig_root, u64 ref_root, | 1473 | u64 owner, u64 offset, int refs_to_add, |
730 | u64 orig_generation, u64 ref_generation, | 1474 | struct btrfs_delayed_extent_op *extent_op) |
731 | u64 owner_objectid) | ||
732 | { | ||
733 | int ret; | ||
734 | |||
735 | ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root, | ||
736 | ref_generation, owner_objectid, | ||
737 | BTRFS_ADD_DELAYED_REF, 0); | ||
738 | BUG_ON(ret); | ||
739 | return ret; | ||
740 | } | ||
741 | |||
742 | static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans, | ||
743 | struct btrfs_root *root, u64 bytenr, | ||
744 | u64 num_bytes, u64 parent, u64 ref_root, | ||
745 | u64 ref_generation, u64 owner_objectid, | ||
746 | int refs_to_add) | ||
747 | { | 1475 | { |
748 | struct btrfs_path *path; | 1476 | struct btrfs_path *path; |
749 | int ret; | 1477 | struct extent_buffer *leaf; |
750 | struct btrfs_key key; | ||
751 | struct extent_buffer *l; | ||
752 | struct btrfs_extent_item *item; | 1478 | struct btrfs_extent_item *item; |
753 | u32 refs; | 1479 | u64 refs; |
1480 | int ret; | ||
1481 | int err = 0; | ||
754 | 1482 | ||
755 | path = btrfs_alloc_path(); | 1483 | path = btrfs_alloc_path(); |
756 | if (!path) | 1484 | if (!path) |
@@ -758,43 +1486,27 @@ static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans, | |||
758 | 1486 | ||
759 | path->reada = 1; | 1487 | path->reada = 1; |
760 | path->leave_spinning = 1; | 1488 | path->leave_spinning = 1; |
761 | key.objectid = bytenr; | 1489 | /* this will setup the path even if it fails to insert the back ref */ |
762 | key.type = BTRFS_EXTENT_ITEM_KEY; | 1490 | ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, |
763 | key.offset = num_bytes; | 1491 | path, bytenr, num_bytes, parent, |
764 | 1492 | root_objectid, owner, offset, | |
765 | /* first find the extent item and update its reference count */ | 1493 | refs_to_add, extent_op); |
766 | ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, | 1494 | if (ret == 0) |
767 | path, 0, 1); | 1495 | goto out; |
768 | if (ret < 0) { | ||
769 | btrfs_set_path_blocking(path); | ||
770 | return ret; | ||
771 | } | ||
772 | |||
773 | if (ret > 0) { | ||
774 | WARN_ON(1); | ||
775 | btrfs_free_path(path); | ||
776 | return -EIO; | ||
777 | } | ||
778 | l = path->nodes[0]; | ||
779 | 1496 | ||
780 | btrfs_item_key_to_cpu(l, &key, path->slots[0]); | 1497 | if (ret != -EAGAIN) { |
781 | if (key.objectid != bytenr) { | 1498 | err = ret; |
782 | btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]); | 1499 | goto out; |
783 | printk(KERN_ERR "btrfs wanted %llu found %llu\n", | ||
784 | (unsigned long long)bytenr, | ||
785 | (unsigned long long)key.objectid); | ||
786 | BUG(); | ||
787 | } | 1500 | } |
788 | BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY); | ||
789 | |||
790 | item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); | ||
791 | |||
792 | refs = btrfs_extent_refs(l, item); | ||
793 | btrfs_set_extent_refs(l, item, refs + refs_to_add); | ||
794 | btrfs_unlock_up_safe(path, 1); | ||
795 | 1501 | ||
796 | btrfs_mark_buffer_dirty(path->nodes[0]); | 1502 | leaf = path->nodes[0]; |
1503 | item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); | ||
1504 | refs = btrfs_extent_refs(leaf, item); | ||
1505 | btrfs_set_extent_refs(leaf, item, refs + refs_to_add); | ||
1506 | if (extent_op) | ||
1507 | __run_delayed_extent_op(extent_op, leaf, item); | ||
797 | 1508 | ||
1509 | btrfs_mark_buffer_dirty(leaf); | ||
798 | btrfs_release_path(root->fs_info->extent_root, path); | 1510 | btrfs_release_path(root->fs_info->extent_root, path); |
799 | 1511 | ||
800 | path->reada = 1; | 1512 | path->reada = 1; |
@@ -802,56 +1514,197 @@ static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans, | |||
802 | 1514 | ||
803 | /* now insert the actual backref */ | 1515 | /* now insert the actual backref */ |
804 | ret = insert_extent_backref(trans, root->fs_info->extent_root, | 1516 | ret = insert_extent_backref(trans, root->fs_info->extent_root, |
805 | path, bytenr, parent, | 1517 | path, bytenr, parent, root_objectid, |
806 | ref_root, ref_generation, | 1518 | owner, offset, refs_to_add); |
807 | owner_objectid, refs_to_add); | ||
808 | BUG_ON(ret); | 1519 | BUG_ON(ret); |
1520 | out: | ||
809 | btrfs_free_path(path); | 1521 | btrfs_free_path(path); |
810 | return 0; | 1522 | return err; |
811 | } | 1523 | } |
812 | 1524 | ||
813 | int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, | 1525 | static int run_delayed_data_ref(struct btrfs_trans_handle *trans, |
814 | struct btrfs_root *root, | 1526 | struct btrfs_root *root, |
815 | u64 bytenr, u64 num_bytes, u64 parent, | 1527 | struct btrfs_delayed_ref_node *node, |
816 | u64 ref_root, u64 ref_generation, | 1528 | struct btrfs_delayed_extent_op *extent_op, |
817 | u64 owner_objectid) | 1529 | int insert_reserved) |
818 | { | 1530 | { |
819 | int ret; | 1531 | int ret = 0; |
820 | if (ref_root == BTRFS_TREE_LOG_OBJECTID && | 1532 | struct btrfs_delayed_data_ref *ref; |
821 | owner_objectid < BTRFS_FIRST_FREE_OBJECTID) | 1533 | struct btrfs_key ins; |
822 | return 0; | 1534 | u64 parent = 0; |
1535 | u64 ref_root = 0; | ||
1536 | u64 flags = 0; | ||
823 | 1537 | ||
824 | ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent, | 1538 | ins.objectid = node->bytenr; |
825 | 0, ref_root, 0, ref_generation, | 1539 | ins.offset = node->num_bytes; |
826 | owner_objectid); | 1540 | ins.type = BTRFS_EXTENT_ITEM_KEY; |
1541 | |||
1542 | ref = btrfs_delayed_node_to_data_ref(node); | ||
1543 | if (node->type == BTRFS_SHARED_DATA_REF_KEY) | ||
1544 | parent = ref->parent; | ||
1545 | else | ||
1546 | ref_root = ref->root; | ||
1547 | |||
1548 | if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { | ||
1549 | if (extent_op) { | ||
1550 | BUG_ON(extent_op->update_key); | ||
1551 | flags |= extent_op->flags_to_set; | ||
1552 | } | ||
1553 | ret = alloc_reserved_file_extent(trans, root, | ||
1554 | parent, ref_root, flags, | ||
1555 | ref->objectid, ref->offset, | ||
1556 | &ins, node->ref_mod); | ||
1557 | update_reserved_extents(root, ins.objectid, ins.offset, 0); | ||
1558 | } else if (node->action == BTRFS_ADD_DELAYED_REF) { | ||
1559 | ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, | ||
1560 | node->num_bytes, parent, | ||
1561 | ref_root, ref->objectid, | ||
1562 | ref->offset, node->ref_mod, | ||
1563 | extent_op); | ||
1564 | } else if (node->action == BTRFS_DROP_DELAYED_REF) { | ||
1565 | ret = __btrfs_free_extent(trans, root, node->bytenr, | ||
1566 | node->num_bytes, parent, | ||
1567 | ref_root, ref->objectid, | ||
1568 | ref->offset, node->ref_mod, | ||
1569 | extent_op); | ||
1570 | } else { | ||
1571 | BUG(); | ||
1572 | } | ||
827 | return ret; | 1573 | return ret; |
828 | } | 1574 | } |
829 | 1575 | ||
830 | static int drop_delayed_ref(struct btrfs_trans_handle *trans, | 1576 | static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, |
831 | struct btrfs_root *root, | 1577 | struct extent_buffer *leaf, |
832 | struct btrfs_delayed_ref_node *node) | 1578 | struct btrfs_extent_item *ei) |
1579 | { | ||
1580 | u64 flags = btrfs_extent_flags(leaf, ei); | ||
1581 | if (extent_op->update_flags) { | ||
1582 | flags |= extent_op->flags_to_set; | ||
1583 | btrfs_set_extent_flags(leaf, ei, flags); | ||
1584 | } | ||
1585 | |||
1586 | if (extent_op->update_key) { | ||
1587 | struct btrfs_tree_block_info *bi; | ||
1588 | BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); | ||
1589 | bi = (struct btrfs_tree_block_info *)(ei + 1); | ||
1590 | btrfs_set_tree_block_key(leaf, bi, &extent_op->key); | ||
1591 | } | ||
1592 | } | ||
1593 | |||
1594 | static int run_delayed_extent_op(struct btrfs_trans_handle *trans, | ||
1595 | struct btrfs_root *root, | ||
1596 | struct btrfs_delayed_ref_node *node, | ||
1597 | struct btrfs_delayed_extent_op *extent_op) | ||
1598 | { | ||
1599 | struct btrfs_key key; | ||
1600 | struct btrfs_path *path; | ||
1601 | struct btrfs_extent_item *ei; | ||
1602 | struct extent_buffer *leaf; | ||
1603 | u32 item_size; | ||
1604 | int ret; | ||
1605 | int err = 0; | ||
1606 | |||
1607 | path = btrfs_alloc_path(); | ||
1608 | if (!path) | ||
1609 | return -ENOMEM; | ||
1610 | |||
1611 | key.objectid = node->bytenr; | ||
1612 | key.type = BTRFS_EXTENT_ITEM_KEY; | ||
1613 | key.offset = node->num_bytes; | ||
1614 | |||
1615 | path->reada = 1; | ||
1616 | path->leave_spinning = 1; | ||
1617 | ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, | ||
1618 | path, 0, 1); | ||
1619 | if (ret < 0) { | ||
1620 | err = ret; | ||
1621 | goto out; | ||
1622 | } | ||
1623 | if (ret > 0) { | ||
1624 | err = -EIO; | ||
1625 | goto out; | ||
1626 | } | ||
1627 | |||
1628 | leaf = path->nodes[0]; | ||
1629 | item_size = btrfs_item_size_nr(leaf, path->slots[0]); | ||
1630 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
1631 | if (item_size < sizeof(*ei)) { | ||
1632 | ret = convert_extent_item_v0(trans, root->fs_info->extent_root, | ||
1633 | path, (u64)-1, 0); | ||
1634 | if (ret < 0) { | ||
1635 | err = ret; | ||
1636 | goto out; | ||
1637 | } | ||
1638 | leaf = path->nodes[0]; | ||
1639 | item_size = btrfs_item_size_nr(leaf, path->slots[0]); | ||
1640 | } | ||
1641 | #endif | ||
1642 | BUG_ON(item_size < sizeof(*ei)); | ||
1643 | ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); | ||
1644 | __run_delayed_extent_op(extent_op, leaf, ei); | ||
1645 | |||
1646 | btrfs_mark_buffer_dirty(leaf); | ||
1647 | out: | ||
1648 | btrfs_free_path(path); | ||
1649 | return err; | ||
1650 | } | ||
1651 | |||
1652 | static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, | ||
1653 | struct btrfs_root *root, | ||
1654 | struct btrfs_delayed_ref_node *node, | ||
1655 | struct btrfs_delayed_extent_op *extent_op, | ||
1656 | int insert_reserved) | ||
833 | { | 1657 | { |
834 | int ret = 0; | 1658 | int ret = 0; |
835 | struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node); | 1659 | struct btrfs_delayed_tree_ref *ref; |
1660 | struct btrfs_key ins; | ||
1661 | u64 parent = 0; | ||
1662 | u64 ref_root = 0; | ||
836 | 1663 | ||
837 | BUG_ON(node->ref_mod == 0); | 1664 | ins.objectid = node->bytenr; |
838 | ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes, | 1665 | ins.offset = node->num_bytes; |
839 | node->parent, ref->root, ref->generation, | 1666 | ins.type = BTRFS_EXTENT_ITEM_KEY; |
840 | ref->owner_objectid, ref->pin, node->ref_mod); | ||
841 | 1667 | ||
1668 | ref = btrfs_delayed_node_to_tree_ref(node); | ||
1669 | if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) | ||
1670 | parent = ref->parent; | ||
1671 | else | ||
1672 | ref_root = ref->root; | ||
1673 | |||
1674 | BUG_ON(node->ref_mod != 1); | ||
1675 | if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { | ||
1676 | BUG_ON(!extent_op || !extent_op->update_flags || | ||
1677 | !extent_op->update_key); | ||
1678 | ret = alloc_reserved_tree_block(trans, root, | ||
1679 | parent, ref_root, | ||
1680 | extent_op->flags_to_set, | ||
1681 | &extent_op->key, | ||
1682 | ref->level, &ins); | ||
1683 | update_reserved_extents(root, ins.objectid, ins.offset, 0); | ||
1684 | } else if (node->action == BTRFS_ADD_DELAYED_REF) { | ||
1685 | ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, | ||
1686 | node->num_bytes, parent, ref_root, | ||
1687 | ref->level, 0, 1, extent_op); | ||
1688 | } else if (node->action == BTRFS_DROP_DELAYED_REF) { | ||
1689 | ret = __btrfs_free_extent(trans, root, node->bytenr, | ||
1690 | node->num_bytes, parent, ref_root, | ||
1691 | ref->level, 0, 1, extent_op); | ||
1692 | } else { | ||
1693 | BUG(); | ||
1694 | } | ||
842 | return ret; | 1695 | return ret; |
843 | } | 1696 | } |
844 | 1697 | ||
1698 | |||
845 | /* helper function to actually process a single delayed ref entry */ | 1699 | /* helper function to actually process a single delayed ref entry */ |
846 | static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans, | 1700 | static int run_one_delayed_ref(struct btrfs_trans_handle *trans, |
847 | struct btrfs_root *root, | 1701 | struct btrfs_root *root, |
848 | struct btrfs_delayed_ref_node *node, | 1702 | struct btrfs_delayed_ref_node *node, |
849 | int insert_reserved) | 1703 | struct btrfs_delayed_extent_op *extent_op, |
1704 | int insert_reserved) | ||
850 | { | 1705 | { |
851 | int ret; | 1706 | int ret; |
852 | struct btrfs_delayed_ref *ref; | 1707 | if (btrfs_delayed_ref_is_head(node)) { |
853 | |||
854 | if (node->parent == (u64)-1) { | ||
855 | struct btrfs_delayed_ref_head *head; | 1708 | struct btrfs_delayed_ref_head *head; |
856 | /* | 1709 | /* |
857 | * we've hit the end of the chain and we were supposed | 1710 | * we've hit the end of the chain and we were supposed |
@@ -859,44 +1712,35 @@ static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans, | |||
859 | * deleted before we ever needed to insert it, so all | 1712 | * deleted before we ever needed to insert it, so all |
860 | * we have to do is clean up the accounting | 1713 | * we have to do is clean up the accounting |
861 | */ | 1714 | */ |
1715 | BUG_ON(extent_op); | ||
1716 | head = btrfs_delayed_node_to_head(node); | ||
862 | if (insert_reserved) { | 1717 | if (insert_reserved) { |
1718 | if (head->is_data) { | ||
1719 | ret = btrfs_del_csums(trans, root, | ||
1720 | node->bytenr, | ||
1721 | node->num_bytes); | ||
1722 | BUG_ON(ret); | ||
1723 | } | ||
1724 | btrfs_update_pinned_extents(root, node->bytenr, | ||
1725 | node->num_bytes, 1); | ||
863 | update_reserved_extents(root, node->bytenr, | 1726 | update_reserved_extents(root, node->bytenr, |
864 | node->num_bytes, 0); | 1727 | node->num_bytes, 0); |
865 | } | 1728 | } |
866 | head = btrfs_delayed_node_to_head(node); | ||
867 | mutex_unlock(&head->mutex); | 1729 | mutex_unlock(&head->mutex); |
868 | return 0; | 1730 | return 0; |
869 | } | 1731 | } |
870 | 1732 | ||
871 | ref = btrfs_delayed_node_to_ref(node); | 1733 | if (node->type == BTRFS_TREE_BLOCK_REF_KEY || |
872 | if (ref->action == BTRFS_ADD_DELAYED_REF) { | 1734 | node->type == BTRFS_SHARED_BLOCK_REF_KEY) |
873 | if (insert_reserved) { | 1735 | ret = run_delayed_tree_ref(trans, root, node, extent_op, |
874 | struct btrfs_key ins; | 1736 | insert_reserved); |
875 | 1737 | else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || | |
876 | ins.objectid = node->bytenr; | 1738 | node->type == BTRFS_SHARED_DATA_REF_KEY) |
877 | ins.offset = node->num_bytes; | 1739 | ret = run_delayed_data_ref(trans, root, node, extent_op, |
878 | ins.type = BTRFS_EXTENT_ITEM_KEY; | 1740 | insert_reserved); |
879 | 1741 | else | |
880 | /* record the full extent allocation */ | 1742 | BUG(); |
881 | ret = __btrfs_alloc_reserved_extent(trans, root, | 1743 | return ret; |
882 | node->parent, ref->root, | ||
883 | ref->generation, ref->owner_objectid, | ||
884 | &ins, node->ref_mod); | ||
885 | update_reserved_extents(root, node->bytenr, | ||
886 | node->num_bytes, 0); | ||
887 | } else { | ||
888 | /* just add one backref */ | ||
889 | ret = add_extent_ref(trans, root, node->bytenr, | ||
890 | node->num_bytes, | ||
891 | node->parent, ref->root, ref->generation, | ||
892 | ref->owner_objectid, node->ref_mod); | ||
893 | } | ||
894 | BUG_ON(ret); | ||
895 | } else if (ref->action == BTRFS_DROP_DELAYED_REF) { | ||
896 | WARN_ON(insert_reserved); | ||
897 | ret = drop_delayed_ref(trans, root, node); | ||
898 | } | ||
899 | return 0; | ||
900 | } | 1744 | } |
901 | 1745 | ||
902 | static noinline struct btrfs_delayed_ref_node * | 1746 | static noinline struct btrfs_delayed_ref_node * |
@@ -919,7 +1763,7 @@ again: | |||
919 | rb_node); | 1763 | rb_node); |
920 | if (ref->bytenr != head->node.bytenr) | 1764 | if (ref->bytenr != head->node.bytenr) |
921 | break; | 1765 | break; |
922 | if (btrfs_delayed_node_to_ref(ref)->action == action) | 1766 | if (ref->action == action) |
923 | return ref; | 1767 | return ref; |
924 | node = rb_prev(node); | 1768 | node = rb_prev(node); |
925 | } | 1769 | } |
@@ -937,6 +1781,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, | |||
937 | struct btrfs_delayed_ref_root *delayed_refs; | 1781 | struct btrfs_delayed_ref_root *delayed_refs; |
938 | struct btrfs_delayed_ref_node *ref; | 1782 | struct btrfs_delayed_ref_node *ref; |
939 | struct btrfs_delayed_ref_head *locked_ref = NULL; | 1783 | struct btrfs_delayed_ref_head *locked_ref = NULL; |
1784 | struct btrfs_delayed_extent_op *extent_op; | ||
940 | int ret; | 1785 | int ret; |
941 | int count = 0; | 1786 | int count = 0; |
942 | int must_insert_reserved = 0; | 1787 | int must_insert_reserved = 0; |
@@ -975,6 +1820,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, | |||
975 | must_insert_reserved = locked_ref->must_insert_reserved; | 1820 | must_insert_reserved = locked_ref->must_insert_reserved; |
976 | locked_ref->must_insert_reserved = 0; | 1821 | locked_ref->must_insert_reserved = 0; |
977 | 1822 | ||
1823 | extent_op = locked_ref->extent_op; | ||
1824 | locked_ref->extent_op = NULL; | ||
1825 | |||
978 | /* | 1826 | /* |
979 | * locked_ref is the head node, so we have to go one | 1827 | * locked_ref is the head node, so we have to go one |
980 | * node back for any delayed ref updates | 1828 | * node back for any delayed ref updates |
@@ -986,6 +1834,25 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, | |||
986 | * so that any accounting fixes can happen | 1834 | * so that any accounting fixes can happen |
987 | */ | 1835 | */ |
988 | ref = &locked_ref->node; | 1836 | ref = &locked_ref->node; |
1837 | |||
1838 | if (extent_op && must_insert_reserved) { | ||
1839 | kfree(extent_op); | ||
1840 | extent_op = NULL; | ||
1841 | } | ||
1842 | |||
1843 | if (extent_op) { | ||
1844 | spin_unlock(&delayed_refs->lock); | ||
1845 | |||
1846 | ret = run_delayed_extent_op(trans, root, | ||
1847 | ref, extent_op); | ||
1848 | BUG_ON(ret); | ||
1849 | kfree(extent_op); | ||
1850 | |||
1851 | cond_resched(); | ||
1852 | spin_lock(&delayed_refs->lock); | ||
1853 | continue; | ||
1854 | } | ||
1855 | |||
989 | list_del_init(&locked_ref->cluster); | 1856 | list_del_init(&locked_ref->cluster); |
990 | locked_ref = NULL; | 1857 | locked_ref = NULL; |
991 | } | 1858 | } |
@@ -993,14 +1860,17 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, | |||
993 | ref->in_tree = 0; | 1860 | ref->in_tree = 0; |
994 | rb_erase(&ref->rb_node, &delayed_refs->root); | 1861 | rb_erase(&ref->rb_node, &delayed_refs->root); |
995 | delayed_refs->num_entries--; | 1862 | delayed_refs->num_entries--; |
1863 | |||
996 | spin_unlock(&delayed_refs->lock); | 1864 | spin_unlock(&delayed_refs->lock); |
997 | 1865 | ||
998 | ret = run_one_delayed_ref(trans, root, ref, | 1866 | ret = run_one_delayed_ref(trans, root, ref, extent_op, |
999 | must_insert_reserved); | 1867 | must_insert_reserved); |
1000 | BUG_ON(ret); | 1868 | BUG_ON(ret); |
1001 | btrfs_put_delayed_ref(ref); | ||
1002 | 1869 | ||
1870 | btrfs_put_delayed_ref(ref); | ||
1871 | kfree(extent_op); | ||
1003 | count++; | 1872 | count++; |
1873 | |||
1004 | cond_resched(); | 1874 | cond_resched(); |
1005 | spin_lock(&delayed_refs->lock); | 1875 | spin_lock(&delayed_refs->lock); |
1006 | } | 1876 | } |
@@ -1095,25 +1965,112 @@ out: | |||
1095 | return 0; | 1965 | return 0; |
1096 | } | 1966 | } |
1097 | 1967 | ||
1098 | int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, | 1968 | int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, |
1099 | struct btrfs_root *root, u64 objectid, u64 bytenr) | 1969 | struct btrfs_root *root, |
1970 | u64 bytenr, u64 num_bytes, u64 flags, | ||
1971 | int is_data) | ||
1972 | { | ||
1973 | struct btrfs_delayed_extent_op *extent_op; | ||
1974 | int ret; | ||
1975 | |||
1976 | extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); | ||
1977 | if (!extent_op) | ||
1978 | return -ENOMEM; | ||
1979 | |||
1980 | extent_op->flags_to_set = flags; | ||
1981 | extent_op->update_flags = 1; | ||
1982 | extent_op->update_key = 0; | ||
1983 | extent_op->is_data = is_data ? 1 : 0; | ||
1984 | |||
1985 | ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); | ||
1986 | if (ret) | ||
1987 | kfree(extent_op); | ||
1988 | return ret; | ||
1989 | } | ||
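btrfs_set_disk_extent_flags() is the delayed-ref way to update just the flags word of an existing extent item: it queues a btrfs_delayed_extent_op with update_flags set, and run_delayed_extent_op() above applies it the next time delayed refs are run. A hedged sketch of the kind of call a COW path could make to switch a tree block over to full backrefs (illustrative, not quoted from this patch):

	/* illustrative: flag an existing tree block as using full backrefs */
	ret = btrfs_set_disk_extent_flags(trans, root, buf->start, buf->len,
					  BTRFS_BLOCK_FLAG_FULL_BACKREF,
					  0 /* is_data */);
	if (ret)
		return ret;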
1990 | |||
1991 | static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, | ||
1992 | struct btrfs_root *root, | ||
1993 | struct btrfs_path *path, | ||
1994 | u64 objectid, u64 offset, u64 bytenr) | ||
1995 | { | ||
1996 | struct btrfs_delayed_ref_head *head; | ||
1997 | struct btrfs_delayed_ref_node *ref; | ||
1998 | struct btrfs_delayed_data_ref *data_ref; | ||
1999 | struct btrfs_delayed_ref_root *delayed_refs; | ||
2000 | struct rb_node *node; | ||
2001 | int ret = 0; | ||
2002 | |||
2003 | ret = -ENOENT; | ||
2004 | delayed_refs = &trans->transaction->delayed_refs; | ||
2005 | spin_lock(&delayed_refs->lock); | ||
2006 | head = btrfs_find_delayed_ref_head(trans, bytenr); | ||
2007 | if (!head) | ||
2008 | goto out; | ||
2009 | |||
2010 | if (!mutex_trylock(&head->mutex)) { | ||
2011 | atomic_inc(&head->node.refs); | ||
2012 | spin_unlock(&delayed_refs->lock); | ||
2013 | |||
2014 | btrfs_release_path(root->fs_info->extent_root, path); | ||
2015 | |||
2016 | mutex_lock(&head->mutex); | ||
2017 | mutex_unlock(&head->mutex); | ||
2018 | btrfs_put_delayed_ref(&head->node); | ||
2019 | return -EAGAIN; | ||
2020 | } | ||
2021 | |||
2022 | node = rb_prev(&head->node.rb_node); | ||
2023 | if (!node) | ||
2024 | goto out_unlock; | ||
2025 | |||
2026 | ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); | ||
2027 | |||
2028 | if (ref->bytenr != bytenr) | ||
2029 | goto out_unlock; | ||
2030 | |||
2031 | ret = 1; | ||
2032 | if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) | ||
2033 | goto out_unlock; | ||
2034 | |||
2035 | data_ref = btrfs_delayed_node_to_data_ref(ref); | ||
2036 | |||
2037 | node = rb_prev(node); | ||
2038 | if (node) { | ||
2039 | ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); | ||
2040 | if (ref->bytenr == bytenr) | ||
2041 | goto out_unlock; | ||
2042 | } | ||
2043 | |||
2044 | if (data_ref->root != root->root_key.objectid || | ||
2045 | data_ref->objectid != objectid || data_ref->offset != offset) | ||
2046 | goto out_unlock; | ||
2047 | |||
2048 | ret = 0; | ||
2049 | out_unlock: | ||
2050 | mutex_unlock(&head->mutex); | ||
2051 | out: | ||
2052 | spin_unlock(&delayed_refs->lock); | ||
2053 | return ret; | ||
2054 | } | ||
2055 | |||
2056 | static noinline int check_committed_ref(struct btrfs_trans_handle *trans, | ||
2057 | struct btrfs_root *root, | ||
2058 | struct btrfs_path *path, | ||
2059 | u64 objectid, u64 offset, u64 bytenr) | ||
1100 | { | 2060 | { |
1101 | struct btrfs_root *extent_root = root->fs_info->extent_root; | 2061 | struct btrfs_root *extent_root = root->fs_info->extent_root; |
1102 | struct btrfs_path *path; | ||
1103 | struct extent_buffer *leaf; | 2062 | struct extent_buffer *leaf; |
1104 | struct btrfs_extent_ref *ref_item; | 2063 | struct btrfs_extent_data_ref *ref; |
2064 | struct btrfs_extent_inline_ref *iref; | ||
2065 | struct btrfs_extent_item *ei; | ||
1105 | struct btrfs_key key; | 2066 | struct btrfs_key key; |
1106 | struct btrfs_key found_key; | 2067 | u32 item_size; |
1107 | u64 ref_root; | ||
1108 | u64 last_snapshot; | ||
1109 | u32 nritems; | ||
1110 | int ret; | 2068 | int ret; |
1111 | 2069 | ||
1112 | key.objectid = bytenr; | 2070 | key.objectid = bytenr; |
1113 | key.offset = (u64)-1; | 2071 | key.offset = (u64)-1; |
1114 | key.type = BTRFS_EXTENT_ITEM_KEY; | 2072 | key.type = BTRFS_EXTENT_ITEM_KEY; |
1115 | 2073 | ||
1116 | path = btrfs_alloc_path(); | ||
1117 | ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); | 2074 | ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); |
1118 | if (ret < 0) | 2075 | if (ret < 0) |
1119 | goto out; | 2076 | goto out; |
@@ -1125,55 +2082,83 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, | |||
1125 | 2082 | ||
1126 | path->slots[0]--; | 2083 | path->slots[0]--; |
1127 | leaf = path->nodes[0]; | 2084 | leaf = path->nodes[0]; |
1128 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | 2085 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); |
1129 | 2086 | ||
1130 | if (found_key.objectid != bytenr || | 2087 | if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) |
1131 | found_key.type != BTRFS_EXTENT_ITEM_KEY) | ||
1132 | goto out; | 2088 | goto out; |
1133 | 2089 | ||
1134 | last_snapshot = btrfs_root_last_snapshot(&root->root_item); | 2090 | ret = 1; |
1135 | while (1) { | 2091 | item_size = btrfs_item_size_nr(leaf, path->slots[0]); |
1136 | leaf = path->nodes[0]; | 2092 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 |
1137 | nritems = btrfs_header_nritems(leaf); | 2093 | if (item_size < sizeof(*ei)) { |
1138 | if (path->slots[0] >= nritems) { | 2094 | WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); |
1139 | ret = btrfs_next_leaf(extent_root, path); | 2095 | goto out; |
1140 | if (ret < 0) | 2096 | } |
1141 | goto out; | 2097 | #endif |
1142 | if (ret == 0) | 2098 | ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); |
1143 | continue; | ||
1144 | break; | ||
1145 | } | ||
1146 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | ||
1147 | if (found_key.objectid != bytenr) | ||
1148 | break; | ||
1149 | 2099 | ||
1150 | if (found_key.type != BTRFS_EXTENT_REF_KEY) { | 2100 | if (item_size != sizeof(*ei) + |
1151 | path->slots[0]++; | 2101 | btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) |
1152 | continue; | 2102 | goto out; |
1153 | } | ||
1154 | 2103 | ||
1155 | ref_item = btrfs_item_ptr(leaf, path->slots[0], | 2104 | if (btrfs_extent_generation(leaf, ei) <= |
1156 | struct btrfs_extent_ref); | 2105 | btrfs_root_last_snapshot(&root->root_item)) |
1157 | ref_root = btrfs_ref_root(leaf, ref_item); | 2106 | goto out; |
1158 | if ((ref_root != root->root_key.objectid && | 2107 | |
1159 | ref_root != BTRFS_TREE_LOG_OBJECTID) || | 2108 | iref = (struct btrfs_extent_inline_ref *)(ei + 1); |
1160 | objectid != btrfs_ref_objectid(leaf, ref_item)) { | 2109 | if (btrfs_extent_inline_ref_type(leaf, iref) != |
1161 | ret = 1; | 2110 | BTRFS_EXTENT_DATA_REF_KEY) |
1162 | goto out; | 2111 | goto out; |
1163 | } | 2112 | |
1164 | if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) { | 2113 | ref = (struct btrfs_extent_data_ref *)(&iref->offset); |
1165 | ret = 1; | 2114 | if (btrfs_extent_refs(leaf, ei) != |
2115 | btrfs_extent_data_ref_count(leaf, ref) || | ||
2116 | btrfs_extent_data_ref_root(leaf, ref) != | ||
2117 | root->root_key.objectid || | ||
2118 | btrfs_extent_data_ref_objectid(leaf, ref) != objectid || | ||
2119 | btrfs_extent_data_ref_offset(leaf, ref) != offset) | ||
2120 | goto out; | ||
2121 | |||
2122 | ret = 0; | ||
2123 | out: | ||
2124 | return ret; | ||
2125 | } | ||
2126 | |||
2127 | int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, | ||
2128 | struct btrfs_root *root, | ||
2129 | u64 objectid, u64 offset, u64 bytenr) | ||
2130 | { | ||
2131 | struct btrfs_path *path; | ||
2132 | int ret; | ||
2133 | int ret2; | ||
2134 | |||
2135 | path = btrfs_alloc_path(); | ||
2136 | if (!path) | ||
2137 | return -ENOENT; | ||
2138 | |||
2139 | do { | ||
2140 | ret = check_committed_ref(trans, root, path, objectid, | ||
2141 | offset, bytenr); | ||
2142 | if (ret && ret != -ENOENT) | ||
1166 | goto out; | 2143 | goto out; |
1167 | } | ||
1168 | 2144 | ||
1169 | path->slots[0]++; | 2145 | ret2 = check_delayed_ref(trans, root, path, objectid, |
2146 | offset, bytenr); | ||
2147 | } while (ret2 == -EAGAIN); | ||
2148 | |||
2149 | if (ret2 && ret2 != -ENOENT) { | ||
2150 | ret = ret2; | ||
2151 | goto out; | ||
1170 | } | 2152 | } |
1171 | ret = 0; | 2153 | |
2154 | if (ret != -ENOENT || ret2 != -ENOENT) | ||
2155 | ret = 0; | ||
1172 | out: | 2156 | out: |
1173 | btrfs_free_path(path); | 2157 | btrfs_free_path(path); |
1174 | return ret; | 2158 | return ret; |
1175 | } | 2159 | } |
1176 | 2160 | ||
2161 | #if 0 | ||
1177 | int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, | 2162 | int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, |
1178 | struct extent_buffer *buf, u32 nr_extents) | 2163 | struct extent_buffer *buf, u32 nr_extents) |
1179 | { | 2164 | { |
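The reworked btrfs_cross_ref_exist() above takes the owning inode's objectid and file offset alongside the disk bytenr, checks the committed extent tree first, and retries while check_delayed_ref() reports -EAGAIN. A minimal sketch of how a caller might interpret its result follows; the helper name and the ino/file_off variables are illustrative, not part of this patch, and the -ENOENT handling is an assumption.

	/* Sketch only: classify a data extent as shared or exclusively owned.
	 * ino and file_off identify the file extent item that references
	 * disk_bytenr from this root.
	 */
	static int data_extent_is_shared(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root, u64 ino,
					 u64 file_off, u64 disk_bytenr)
	{
		int ret;

		ret = btrfs_cross_ref_exist(trans, root, ino, file_off,
					    disk_bytenr);
		if (ret > 0)
			return 1;	/* some other back reference exists */
		if (ret == 0 || ret == -ENOENT)
			return 0;	/* only our (root, ino, offset) ref, or none found */
		return ret;		/* genuine lookup error */
	}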
@@ -1291,62 +2276,44 @@ static int refsort_cmp(const void *a_void, const void *b_void) | |||
1291 | return 1; | 2276 | return 1; |
1292 | return 0; | 2277 | return 0; |
1293 | } | 2278 | } |
2279 | #endif | ||
1294 | 2280 | ||
1295 | 2281 | static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, | |
1296 | noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans, | ||
1297 | struct btrfs_root *root, | 2282 | struct btrfs_root *root, |
1298 | struct extent_buffer *orig_buf, | 2283 | struct extent_buffer *buf, |
1299 | struct extent_buffer *buf, u32 *nr_extents) | 2284 | int full_backref, int inc) |
1300 | { | 2285 | { |
1301 | u64 bytenr; | 2286 | u64 bytenr; |
2287 | u64 num_bytes; | ||
2288 | u64 parent; | ||
1302 | u64 ref_root; | 2289 | u64 ref_root; |
1303 | u64 orig_root; | ||
1304 | u64 ref_generation; | ||
1305 | u64 orig_generation; | ||
1306 | struct refsort *sorted; | ||
1307 | u32 nritems; | 2290 | u32 nritems; |
1308 | u32 nr_file_extents = 0; | ||
1309 | struct btrfs_key key; | 2291 | struct btrfs_key key; |
1310 | struct btrfs_file_extent_item *fi; | 2292 | struct btrfs_file_extent_item *fi; |
1311 | int i; | 2293 | int i; |
1312 | int level; | 2294 | int level; |
1313 | int ret = 0; | 2295 | int ret = 0; |
1314 | int faili = 0; | ||
1315 | int refi = 0; | ||
1316 | int slot; | ||
1317 | int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, | 2296 | int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, |
1318 | u64, u64, u64, u64, u64, u64, u64, u64, u64); | 2297 | u64, u64, u64, u64, u64, u64); |
1319 | 2298 | ||
1320 | ref_root = btrfs_header_owner(buf); | 2299 | ref_root = btrfs_header_owner(buf); |
1321 | ref_generation = btrfs_header_generation(buf); | ||
1322 | orig_root = btrfs_header_owner(orig_buf); | ||
1323 | orig_generation = btrfs_header_generation(orig_buf); | ||
1324 | |||
1325 | nritems = btrfs_header_nritems(buf); | 2300 | nritems = btrfs_header_nritems(buf); |
1326 | level = btrfs_header_level(buf); | 2301 | level = btrfs_header_level(buf); |
1327 | 2302 | ||
1328 | sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS); | 2303 | if (!root->ref_cows && level == 0) |
1329 | BUG_ON(!sorted); | 2304 | return 0; |
1330 | 2305 | ||
1331 | if (root->ref_cows) { | 2306 | if (inc) |
1332 | process_func = __btrfs_inc_extent_ref; | 2307 | process_func = btrfs_inc_extent_ref; |
1333 | } else { | 2308 | else |
1334 | if (level == 0 && | 2309 | process_func = btrfs_free_extent; |
1335 | root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) | 2310 | |
1336 | goto out; | 2311 | if (full_backref) |
1337 | if (level != 0 && | 2312 | parent = buf->start; |
1338 | root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) | 2313 | else |
1339 | goto out; | 2314 | parent = 0; |
1340 | process_func = __btrfs_update_extent_ref; | ||
1341 | } | ||
1342 | 2315 | ||
1343 | /* | ||
1344 | * we make two passes through the items. In the first pass we | ||
1345 | * only record the byte number and slot. Then we sort based on | ||
1346 | * byte number and do the actual work based on the sorted results | ||
1347 | */ | ||
1348 | for (i = 0; i < nritems; i++) { | 2316 | for (i = 0; i < nritems; i++) { |
1349 | cond_resched(); | ||
1350 | if (level == 0) { | 2317 | if (level == 0) { |
1351 | btrfs_item_key_to_cpu(buf, &key, i); | 2318 | btrfs_item_key_to_cpu(buf, &key, i); |
1352 | if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) | 2319 | if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) |
@@ -1360,151 +2327,38 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans, | |||
1360 | if (bytenr == 0) | 2327 | if (bytenr == 0) |
1361 | continue; | 2328 | continue; |
1362 | 2329 | ||
1363 | nr_file_extents++; | 2330 | num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); |
1364 | sorted[refi].bytenr = bytenr; | 2331 | key.offset -= btrfs_file_extent_offset(buf, fi); |
1365 | sorted[refi].slot = i; | 2332 | ret = process_func(trans, root, bytenr, num_bytes, |
1366 | refi++; | 2333 | parent, ref_root, key.objectid, |
1367 | } else { | 2334 | key.offset); |
1368 | bytenr = btrfs_node_blockptr(buf, i); | 2335 | if (ret) |
1369 | sorted[refi].bytenr = bytenr; | ||
1370 | sorted[refi].slot = i; | ||
1371 | refi++; | ||
1372 | } | ||
1373 | } | ||
1374 | /* | ||
1375 | * if refi == 0, we didn't actually put anything into the sorted | ||
1376 | * array and we're done | ||
1377 | */ | ||
1378 | if (refi == 0) | ||
1379 | goto out; | ||
1380 | |||
1381 | sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); | ||
1382 | |||
1383 | for (i = 0; i < refi; i++) { | ||
1384 | cond_resched(); | ||
1385 | slot = sorted[i].slot; | ||
1386 | bytenr = sorted[i].bytenr; | ||
1387 | |||
1388 | if (level == 0) { | ||
1389 | btrfs_item_key_to_cpu(buf, &key, slot); | ||
1390 | fi = btrfs_item_ptr(buf, slot, | ||
1391 | struct btrfs_file_extent_item); | ||
1392 | |||
1393 | bytenr = btrfs_file_extent_disk_bytenr(buf, fi); | ||
1394 | if (bytenr == 0) | ||
1395 | continue; | ||
1396 | |||
1397 | ret = process_func(trans, root, bytenr, | ||
1398 | btrfs_file_extent_disk_num_bytes(buf, fi), | ||
1399 | orig_buf->start, buf->start, | ||
1400 | orig_root, ref_root, | ||
1401 | orig_generation, ref_generation, | ||
1402 | key.objectid); | ||
1403 | |||
1404 | if (ret) { | ||
1405 | faili = slot; | ||
1406 | WARN_ON(1); | ||
1407 | goto fail; | 2336 | goto fail; |
1408 | } | ||
1409 | } else { | 2337 | } else { |
1410 | ret = process_func(trans, root, bytenr, buf->len, | 2338 | bytenr = btrfs_node_blockptr(buf, i); |
1411 | orig_buf->start, buf->start, | 2339 | num_bytes = btrfs_level_size(root, level - 1); |
1412 | orig_root, ref_root, | 2340 | ret = process_func(trans, root, bytenr, num_bytes, |
1413 | orig_generation, ref_generation, | 2341 | parent, ref_root, level - 1, 0); |
1414 | level - 1); | 2342 | if (ret) |
1415 | if (ret) { | ||
1416 | faili = slot; | ||
1417 | WARN_ON(1); | ||
1418 | goto fail; | 2343 | goto fail; |
1419 | } | ||
1420 | } | 2344 | } |
1421 | } | 2345 | } |
1422 | out: | ||
1423 | kfree(sorted); | ||
1424 | if (nr_extents) { | ||
1425 | if (level == 0) | ||
1426 | *nr_extents = nr_file_extents; | ||
1427 | else | ||
1428 | *nr_extents = nritems; | ||
1429 | } | ||
1430 | return 0; | 2346 | return 0; |
1431 | fail: | 2347 | fail: |
1432 | kfree(sorted); | 2348 | BUG(); |
1433 | WARN_ON(1); | ||
1434 | return ret; | 2349 | return ret; |
1435 | } | 2350 | } |
1436 | 2351 | ||
1437 | int btrfs_update_ref(struct btrfs_trans_handle *trans, | 2352 | int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, |
1438 | struct btrfs_root *root, struct extent_buffer *orig_buf, | 2353 | struct extent_buffer *buf, int full_backref) |
1439 | struct extent_buffer *buf, int start_slot, int nr) | ||
1440 | |||
1441 | { | 2354 | { |
1442 | u64 bytenr; | 2355 | return __btrfs_mod_ref(trans, root, buf, full_backref, 1); |
1443 | u64 ref_root; | 2356 | } |
1444 | u64 orig_root; | ||
1445 | u64 ref_generation; | ||
1446 | u64 orig_generation; | ||
1447 | struct btrfs_key key; | ||
1448 | struct btrfs_file_extent_item *fi; | ||
1449 | int i; | ||
1450 | int ret; | ||
1451 | int slot; | ||
1452 | int level; | ||
1453 | |||
1454 | BUG_ON(start_slot < 0); | ||
1455 | BUG_ON(start_slot + nr > btrfs_header_nritems(buf)); | ||
1456 | |||
1457 | ref_root = btrfs_header_owner(buf); | ||
1458 | ref_generation = btrfs_header_generation(buf); | ||
1459 | orig_root = btrfs_header_owner(orig_buf); | ||
1460 | orig_generation = btrfs_header_generation(orig_buf); | ||
1461 | level = btrfs_header_level(buf); | ||
1462 | |||
1463 | if (!root->ref_cows) { | ||
1464 | if (level == 0 && | ||
1465 | root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) | ||
1466 | return 0; | ||
1467 | if (level != 0 && | ||
1468 | root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) | ||
1469 | return 0; | ||
1470 | } | ||
1471 | 2357 | ||
1472 | for (i = 0, slot = start_slot; i < nr; i++, slot++) { | 2358 | int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, |
1473 | cond_resched(); | 2359 | struct extent_buffer *buf, int full_backref) |
1474 | if (level == 0) { | 2360 | { |
1475 | btrfs_item_key_to_cpu(buf, &key, slot); | 2361 | return __btrfs_mod_ref(trans, root, buf, full_backref, 0); |
1476 | if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) | ||
1477 | continue; | ||
1478 | fi = btrfs_item_ptr(buf, slot, | ||
1479 | struct btrfs_file_extent_item); | ||
1480 | if (btrfs_file_extent_type(buf, fi) == | ||
1481 | BTRFS_FILE_EXTENT_INLINE) | ||
1482 | continue; | ||
1483 | bytenr = btrfs_file_extent_disk_bytenr(buf, fi); | ||
1484 | if (bytenr == 0) | ||
1485 | continue; | ||
1486 | ret = __btrfs_update_extent_ref(trans, root, bytenr, | ||
1487 | btrfs_file_extent_disk_num_bytes(buf, fi), | ||
1488 | orig_buf->start, buf->start, | ||
1489 | orig_root, ref_root, orig_generation, | ||
1490 | ref_generation, key.objectid); | ||
1491 | if (ret) | ||
1492 | goto fail; | ||
1493 | } else { | ||
1494 | bytenr = btrfs_node_blockptr(buf, slot); | ||
1495 | ret = __btrfs_update_extent_ref(trans, root, bytenr, | ||
1496 | buf->len, orig_buf->start, | ||
1497 | buf->start, orig_root, ref_root, | ||
1498 | orig_generation, ref_generation, | ||
1499 | level - 1); | ||
1500 | if (ret) | ||
1501 | goto fail; | ||
1502 | } | ||
1503 | } | ||
1504 | return 0; | ||
1505 | fail: | ||
1506 | WARN_ON(1); | ||
1507 | return -1; | ||
1508 | } | 2362 | } |
1509 | 2363 | ||
1510 | static int write_one_cache_group(struct btrfs_trans_handle *trans, | 2364 | static int write_one_cache_group(struct btrfs_trans_handle *trans, |
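__btrfs_mod_ref() above walks one buffer and applies either btrfs_inc_extent_ref() or btrfs_free_extent() to every extent it points at, so the old btrfs_inc_ref()/btrfs_update_ref() pair collapses into the two thin wrappers. A simplified sketch of the intended usage during COW is below; the scenario and the helper name are illustrative, and the real COW path also has to decide which copy carries the full backrefs before choosing full_backref.

	/* Sketch only: after copying tree block "buf" into "cow", add one ref
	 * per pointer from the new copy, then drop the refs held through the
	 * old copy.
	 */
	static int mod_refs_after_cow(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct extent_buffer *buf,
				      struct extent_buffer *cow,
				      int full_backref)
	{
		int ret;

		ret = btrfs_inc_ref(trans, root, cow, full_backref);
		if (ret)
			return ret;
		return btrfs_dec_ref(trans, root, buf, full_backref);
	}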
@@ -2007,6 +2861,24 @@ static int update_block_group(struct btrfs_trans_handle *trans, | |||
2007 | u64 old_val; | 2861 | u64 old_val; |
2008 | u64 byte_in_group; | 2862 | u64 byte_in_group; |
2009 | 2863 | ||
2864 | /* block accounting for super block */ | ||
2865 | spin_lock(&info->delalloc_lock); | ||
2866 | old_val = btrfs_super_bytes_used(&info->super_copy); | ||
2867 | if (alloc) | ||
2868 | old_val += num_bytes; | ||
2869 | else | ||
2870 | old_val -= num_bytes; | ||
2871 | btrfs_set_super_bytes_used(&info->super_copy, old_val); | ||
2872 | |||
2873 | /* block accounting for root item */ | ||
2874 | old_val = btrfs_root_used(&root->root_item); | ||
2875 | if (alloc) | ||
2876 | old_val += num_bytes; | ||
2877 | else | ||
2878 | old_val -= num_bytes; | ||
2879 | btrfs_set_root_used(&root->root_item, old_val); | ||
2880 | spin_unlock(&info->delalloc_lock); | ||
2881 | |||
2010 | while (total) { | 2882 | while (total) { |
2011 | cache = btrfs_lookup_block_group(info, bytenr); | 2883 | cache = btrfs_lookup_block_group(info, bytenr); |
2012 | if (!cache) | 2884 | if (!cache) |
@@ -2216,8 +3088,6 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, | |||
2216 | u64 header_owner = btrfs_header_owner(buf); | 3088 | u64 header_owner = btrfs_header_owner(buf); |
2217 | u64 header_transid = btrfs_header_generation(buf); | 3089 | u64 header_transid = btrfs_header_generation(buf); |
2218 | if (header_owner != BTRFS_TREE_LOG_OBJECTID && | 3090 | if (header_owner != BTRFS_TREE_LOG_OBJECTID && |
2219 | header_owner != BTRFS_TREE_RELOC_OBJECTID && | ||
2220 | header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID && | ||
2221 | header_transid == trans->transid && | 3091 | header_transid == trans->transid && |
2222 | !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { | 3092 | !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { |
2223 | *must_clean = buf; | 3093 | *must_clean = buf; |
@@ -2235,63 +3105,77 @@ pinit: | |||
2235 | return 0; | 3105 | return 0; |
2236 | } | 3106 | } |
2237 | 3107 | ||
2238 | /* | 3108 | |
2239 | * remove an extent from the root, returns 0 on success | 3109 | static int __btrfs_free_extent(struct btrfs_trans_handle *trans, |
2240 | */ | 3110 | struct btrfs_root *root, |
2241 | static int __free_extent(struct btrfs_trans_handle *trans, | 3111 | u64 bytenr, u64 num_bytes, u64 parent, |
2242 | struct btrfs_root *root, | 3112 | u64 root_objectid, u64 owner_objectid, |
2243 | u64 bytenr, u64 num_bytes, u64 parent, | 3113 | u64 owner_offset, int refs_to_drop, |
2244 | u64 root_objectid, u64 ref_generation, | 3114 | struct btrfs_delayed_extent_op *extent_op) |
2245 | u64 owner_objectid, int pin, int mark_free, | ||
2246 | int refs_to_drop) | ||
2247 | { | 3115 | { |
2248 | struct btrfs_path *path; | ||
2249 | struct btrfs_key key; | 3116 | struct btrfs_key key; |
3117 | struct btrfs_path *path; | ||
2250 | struct btrfs_fs_info *info = root->fs_info; | 3118 | struct btrfs_fs_info *info = root->fs_info; |
2251 | struct btrfs_root *extent_root = info->extent_root; | 3119 | struct btrfs_root *extent_root = info->extent_root; |
2252 | struct extent_buffer *leaf; | 3120 | struct extent_buffer *leaf; |
3121 | struct btrfs_extent_item *ei; | ||
3122 | struct btrfs_extent_inline_ref *iref; | ||
2253 | int ret; | 3123 | int ret; |
3124 | int is_data; | ||
2254 | int extent_slot = 0; | 3125 | int extent_slot = 0; |
2255 | int found_extent = 0; | 3126 | int found_extent = 0; |
2256 | int num_to_del = 1; | 3127 | int num_to_del = 1; |
2257 | struct btrfs_extent_item *ei; | 3128 | u32 item_size; |
2258 | u32 refs; | 3129 | u64 refs; |
2259 | 3130 | ||
2260 | key.objectid = bytenr; | ||
2261 | btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); | ||
2262 | key.offset = num_bytes; | ||
2263 | path = btrfs_alloc_path(); | 3131 | path = btrfs_alloc_path(); |
2264 | if (!path) | 3132 | if (!path) |
2265 | return -ENOMEM; | 3133 | return -ENOMEM; |
2266 | 3134 | ||
2267 | path->reada = 1; | 3135 | path->reada = 1; |
2268 | path->leave_spinning = 1; | 3136 | path->leave_spinning = 1; |
2269 | ret = lookup_extent_backref(trans, extent_root, path, | 3137 | |
2270 | bytenr, parent, root_objectid, | 3138 | is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; |
2271 | ref_generation, owner_objectid, 1); | 3139 | BUG_ON(!is_data && refs_to_drop != 1); |
3140 | |||
3141 | ret = lookup_extent_backref(trans, extent_root, path, &iref, | ||
3142 | bytenr, num_bytes, parent, | ||
3143 | root_objectid, owner_objectid, | ||
3144 | owner_offset); | ||
2272 | if (ret == 0) { | 3145 | if (ret == 0) { |
2273 | struct btrfs_key found_key; | ||
2274 | extent_slot = path->slots[0]; | 3146 | extent_slot = path->slots[0]; |
2275 | while (extent_slot > 0) { | 3147 | while (extent_slot >= 0) { |
2276 | extent_slot--; | 3148 | btrfs_item_key_to_cpu(path->nodes[0], &key, |
2277 | btrfs_item_key_to_cpu(path->nodes[0], &found_key, | ||
2278 | extent_slot); | 3149 | extent_slot); |
2279 | if (found_key.objectid != bytenr) | 3150 | if (key.objectid != bytenr) |
2280 | break; | 3151 | break; |
2281 | if (found_key.type == BTRFS_EXTENT_ITEM_KEY && | 3152 | if (key.type == BTRFS_EXTENT_ITEM_KEY && |
2282 | found_key.offset == num_bytes) { | 3153 | key.offset == num_bytes) { |
2283 | found_extent = 1; | 3154 | found_extent = 1; |
2284 | break; | 3155 | break; |
2285 | } | 3156 | } |
2286 | if (path->slots[0] - extent_slot > 5) | 3157 | if (path->slots[0] - extent_slot > 5) |
2287 | break; | 3158 | break; |
3159 | extent_slot--; | ||
2288 | } | 3160 | } |
3161 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
3162 | item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); | ||
3163 | if (found_extent && item_size < sizeof(*ei)) | ||
3164 | found_extent = 0; | ||
3165 | #endif | ||
2289 | if (!found_extent) { | 3166 | if (!found_extent) { |
3167 | BUG_ON(iref); | ||
2290 | ret = remove_extent_backref(trans, extent_root, path, | 3168 | ret = remove_extent_backref(trans, extent_root, path, |
2291 | refs_to_drop); | 3169 | NULL, refs_to_drop, |
3170 | is_data); | ||
2292 | BUG_ON(ret); | 3171 | BUG_ON(ret); |
2293 | btrfs_release_path(extent_root, path); | 3172 | btrfs_release_path(extent_root, path); |
2294 | path->leave_spinning = 1; | 3173 | path->leave_spinning = 1; |
3174 | |||
3175 | key.objectid = bytenr; | ||
3176 | key.type = BTRFS_EXTENT_ITEM_KEY; | ||
3177 | key.offset = num_bytes; | ||
3178 | |||
2295 | ret = btrfs_search_slot(trans, extent_root, | 3179 | ret = btrfs_search_slot(trans, extent_root, |
2296 | &key, path, -1, 1); | 3180 | &key, path, -1, 1); |
2297 | if (ret) { | 3181 | if (ret) { |
@@ -2307,82 +3191,98 @@ static int __free_extent(struct btrfs_trans_handle *trans, | |||
2307 | btrfs_print_leaf(extent_root, path->nodes[0]); | 3191 | btrfs_print_leaf(extent_root, path->nodes[0]); |
2308 | WARN_ON(1); | 3192 | WARN_ON(1); |
2309 | printk(KERN_ERR "btrfs unable to find ref byte nr %llu " | 3193 | printk(KERN_ERR "btrfs unable to find ref byte nr %llu " |
2310 | "parent %llu root %llu gen %llu owner %llu\n", | 3194 | "parent %llu root %llu owner %llu offset %llu\n", |
2311 | (unsigned long long)bytenr, | 3195 | (unsigned long long)bytenr, |
2312 | (unsigned long long)parent, | 3196 | (unsigned long long)parent, |
2313 | (unsigned long long)root_objectid, | 3197 | (unsigned long long)root_objectid, |
2314 | (unsigned long long)ref_generation, | 3198 | (unsigned long long)owner_objectid, |
2315 | (unsigned long long)owner_objectid); | 3199 | (unsigned long long)owner_offset); |
2316 | } | 3200 | } |
2317 | 3201 | ||
2318 | leaf = path->nodes[0]; | 3202 | leaf = path->nodes[0]; |
3203 | item_size = btrfs_item_size_nr(leaf, extent_slot); | ||
3204 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
3205 | if (item_size < sizeof(*ei)) { | ||
3206 | BUG_ON(found_extent || extent_slot != path->slots[0]); | ||
3207 | ret = convert_extent_item_v0(trans, extent_root, path, | ||
3208 | owner_objectid, 0); | ||
3209 | BUG_ON(ret < 0); | ||
3210 | |||
3211 | btrfs_release_path(extent_root, path); | ||
3212 | path->leave_spinning = 1; | ||
3213 | |||
3214 | key.objectid = bytenr; | ||
3215 | key.type = BTRFS_EXTENT_ITEM_KEY; | ||
3216 | key.offset = num_bytes; | ||
3217 | |||
3218 | ret = btrfs_search_slot(trans, extent_root, &key, path, | ||
3219 | -1, 1); | ||
3220 | if (ret) { | ||
3221 | printk(KERN_ERR "umm, got %d back from search" | ||
3222 | ", was looking for %llu\n", ret, | ||
3223 | (unsigned long long)bytenr); | ||
3224 | btrfs_print_leaf(extent_root, path->nodes[0]); | ||
3225 | } | ||
3226 | BUG_ON(ret); | ||
3227 | extent_slot = path->slots[0]; | ||
3228 | leaf = path->nodes[0]; | ||
3229 | item_size = btrfs_item_size_nr(leaf, extent_slot); | ||
3230 | } | ||
3231 | #endif | ||
3232 | BUG_ON(item_size < sizeof(*ei)); | ||
2319 | ei = btrfs_item_ptr(leaf, extent_slot, | 3233 | ei = btrfs_item_ptr(leaf, extent_slot, |
2320 | struct btrfs_extent_item); | 3234 | struct btrfs_extent_item); |
2321 | refs = btrfs_extent_refs(leaf, ei); | 3235 | if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { |
2322 | 3236 | struct btrfs_tree_block_info *bi; | |
2323 | /* | 3237 | BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); |
2324 | * we're not allowed to delete the extent item if there | 3238 | bi = (struct btrfs_tree_block_info *)(ei + 1); |
2325 | * are other delayed ref updates pending | 3239 | WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); |
2326 | */ | 3240 | } |
2327 | 3241 | ||
3242 | refs = btrfs_extent_refs(leaf, ei); | ||
2328 | BUG_ON(refs < refs_to_drop); | 3243 | BUG_ON(refs < refs_to_drop); |
2329 | refs -= refs_to_drop; | 3244 | refs -= refs_to_drop; |
2330 | btrfs_set_extent_refs(leaf, ei, refs); | ||
2331 | btrfs_mark_buffer_dirty(leaf); | ||
2332 | 3245 | ||
2333 | if (refs == 0 && found_extent && | 3246 | if (refs > 0) { |
2334 | path->slots[0] == extent_slot + 1) { | 3247 | if (extent_op) |
2335 | struct btrfs_extent_ref *ref; | 3248 | __run_delayed_extent_op(extent_op, leaf, ei); |
2336 | ref = btrfs_item_ptr(leaf, path->slots[0], | 3249 | /* |
2337 | struct btrfs_extent_ref); | 3250 | * In the case of inline back ref, reference count will |
2338 | BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop); | 3251 | * be updated by remove_extent_backref |
2339 | /* if the back ref and the extent are next to each other | ||
2340 | * they get deleted below in one shot | ||
2341 | */ | 3252 | */ |
2342 | path->slots[0] = extent_slot; | 3253 | if (iref) { |
2343 | num_to_del = 2; | 3254 | BUG_ON(!found_extent); |
2344 | } else if (found_extent) { | 3255 | } else { |
2345 | /* otherwise delete the extent back ref */ | 3256 | btrfs_set_extent_refs(leaf, ei, refs); |
2346 | ret = remove_extent_backref(trans, extent_root, path, | 3257 | btrfs_mark_buffer_dirty(leaf); |
2347 | refs_to_drop); | 3258 | } |
2348 | BUG_ON(ret); | 3259 | if (found_extent) { |
2349 | /* if refs are 0, we need to setup the path for deletion */ | 3260 | ret = remove_extent_backref(trans, extent_root, path, |
2350 | if (refs == 0) { | 3261 | iref, refs_to_drop, |
2351 | btrfs_release_path(extent_root, path); | 3262 | is_data); |
2352 | path->leave_spinning = 1; | ||
2353 | ret = btrfs_search_slot(trans, extent_root, &key, path, | ||
2354 | -1, 1); | ||
2355 | BUG_ON(ret); | 3263 | BUG_ON(ret); |
2356 | } | 3264 | } |
2357 | } | 3265 | } else { |
2358 | 3266 | int mark_free = 0; | |
2359 | if (refs == 0) { | ||
2360 | u64 super_used; | ||
2361 | u64 root_used; | ||
2362 | struct extent_buffer *must_clean = NULL; | 3267 | struct extent_buffer *must_clean = NULL; |
2363 | 3268 | ||
2364 | if (pin) { | 3269 | if (found_extent) { |
2365 | ret = pin_down_bytes(trans, root, path, | 3270 | BUG_ON(is_data && refs_to_drop != |
2366 | bytenr, num_bytes, | 3271 | extent_data_ref_count(root, path, iref)); |
2367 | owner_objectid >= BTRFS_FIRST_FREE_OBJECTID, | 3272 | if (iref) { |
2368 | &must_clean); | 3273 | BUG_ON(path->slots[0] != extent_slot); |
2369 | if (ret > 0) | 3274 | } else { |
2370 | mark_free = 1; | 3275 | BUG_ON(path->slots[0] != extent_slot + 1); |
2371 | BUG_ON(ret < 0); | 3276 | path->slots[0] = extent_slot; |
3277 | num_to_del = 2; | ||
3278 | } | ||
2372 | } | 3279 | } |
2373 | 3280 | ||
2374 | /* block accounting for super block */ | 3281 | ret = pin_down_bytes(trans, root, path, bytenr, |
2375 | spin_lock(&info->delalloc_lock); | 3282 | num_bytes, is_data, &must_clean); |
2376 | super_used = btrfs_super_bytes_used(&info->super_copy); | 3283 | if (ret > 0) |
2377 | btrfs_set_super_bytes_used(&info->super_copy, | 3284 | mark_free = 1; |
2378 | super_used - num_bytes); | 3285 | BUG_ON(ret < 0); |
2379 | |||
2380 | /* block accounting for root item */ | ||
2381 | root_used = btrfs_root_used(&root->root_item); | ||
2382 | btrfs_set_root_used(&root->root_item, | ||
2383 | root_used - num_bytes); | ||
2384 | spin_unlock(&info->delalloc_lock); | ||
2385 | |||
2386 | /* | 3286 | /* |
2387 | * it is going to be very rare for someone to be waiting | 3287 | * it is going to be very rare for someone to be waiting |
2388 | * on the block we're freeing. del_items might need to | 3288 | * on the block we're freeing. del_items might need to |
@@ -2403,7 +3303,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, | |||
2403 | free_extent_buffer(must_clean); | 3303 | free_extent_buffer(must_clean); |
2404 | } | 3304 | } |
2405 | 3305 | ||
2406 | if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { | 3306 | if (is_data) { |
2407 | ret = btrfs_del_csums(trans, root, bytenr, num_bytes); | 3307 | ret = btrfs_del_csums(trans, root, bytenr, num_bytes); |
2408 | BUG_ON(ret); | 3308 | BUG_ON(ret); |
2409 | } else { | 3309 | } else { |
@@ -2421,34 +3321,6 @@ static int __free_extent(struct btrfs_trans_handle *trans, | |||
2421 | } | 3321 | } |
2422 | 3322 | ||
2423 | /* | 3323 | /* |
2424 | * remove an extent from the root, returns 0 on success | ||
2425 | */ | ||
2426 | static int __btrfs_free_extent(struct btrfs_trans_handle *trans, | ||
2427 | struct btrfs_root *root, | ||
2428 | u64 bytenr, u64 num_bytes, u64 parent, | ||
2429 | u64 root_objectid, u64 ref_generation, | ||
2430 | u64 owner_objectid, int pin, | ||
2431 | int refs_to_drop) | ||
2432 | { | ||
2433 | WARN_ON(num_bytes < root->sectorsize); | ||
2434 | |||
2435 | /* | ||
2436 | * if metadata always pin | ||
2437 | * if data pin when any transaction has committed this | ||
2438 | */ | ||
2439 | if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID || | ||
2440 | ref_generation != trans->transid) | ||
2441 | pin = 1; | ||
2442 | |||
2443 | if (ref_generation != trans->transid) | ||
2444 | pin = 1; | ||
2445 | |||
2446 | return __free_extent(trans, root, bytenr, num_bytes, parent, | ||
2447 | root_objectid, ref_generation, | ||
2448 | owner_objectid, pin, pin == 0, refs_to_drop); | ||
2449 | } | ||
2450 | |||
2451 | /* | ||
2452 | * when we free an extent, it is possible (and likely) that we free the last | 3324 | * when we free an extent, it is possible (and likely) that we free the last |
2453 | * delayed ref for that extent as well. This searches the delayed ref tree for | 3325 | * delayed ref for that extent as well. This searches the delayed ref tree for |
2454 | * a given extent, and if there are no other delayed refs to be processed, it | 3326 | * a given extent, and if there are no other delayed refs to be processed, it |
@@ -2479,6 +3351,13 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, | |||
2479 | if (ref->bytenr == bytenr) | 3351 | if (ref->bytenr == bytenr) |
2480 | goto out; | 3352 | goto out; |
2481 | 3353 | ||
3354 | if (head->extent_op) { | ||
3355 | if (!head->must_insert_reserved) | ||
3356 | goto out; | ||
3357 | kfree(head->extent_op); | ||
3358 | head->extent_op = NULL; | ||
3359 | } | ||
3360 | |||
2482 | /* | 3361 | /* |
2483 | * waiting for the lock here would deadlock. If someone else has it | 3362 | * waiting for the lock here would deadlock. If someone else has it |
2484 | * locked they are already in the process of dropping it anyway | 3363 | * locked they are already in the process of dropping it anyway |
@@ -2507,7 +3386,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, | |||
2507 | spin_unlock(&delayed_refs->lock); | 3386 | spin_unlock(&delayed_refs->lock); |
2508 | 3387 | ||
2509 | ret = run_one_delayed_ref(trans, root->fs_info->tree_root, | 3388 | ret = run_one_delayed_ref(trans, root->fs_info->tree_root, |
2510 | &head->node, head->must_insert_reserved); | 3389 | &head->node, head->extent_op, |
3390 | head->must_insert_reserved); | ||
2511 | BUG_ON(ret); | 3391 | BUG_ON(ret); |
2512 | btrfs_put_delayed_ref(&head->node); | 3392 | btrfs_put_delayed_ref(&head->node); |
2513 | return 0; | 3393 | return 0; |
@@ -2519,32 +3399,32 @@ out: | |||
2519 | int btrfs_free_extent(struct btrfs_trans_handle *trans, | 3399 | int btrfs_free_extent(struct btrfs_trans_handle *trans, |
2520 | struct btrfs_root *root, | 3400 | struct btrfs_root *root, |
2521 | u64 bytenr, u64 num_bytes, u64 parent, | 3401 | u64 bytenr, u64 num_bytes, u64 parent, |
2522 | u64 root_objectid, u64 ref_generation, | 3402 | u64 root_objectid, u64 owner, u64 offset) |
2523 | u64 owner_objectid, int pin) | ||
2524 | { | 3403 | { |
2525 | int ret; | 3404 | int ret; |
2526 | 3405 | ||
2527 | /* | 3406 | /* |
2528 | * tree log blocks never actually go into the extent allocation | 3407 | * tree log blocks never actually go into the extent allocation |
2529 | * tree, just update pinning info and exit early. | 3408 | * tree, just update pinning info and exit early. |
2530 | * | ||
2531 | * data extents referenced by the tree log do need to have | ||
2532 | * their reference counts bumped. | ||
2533 | */ | 3409 | */ |
2534 | if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID && | 3410 | if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { |
2535 | owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { | 3411 | WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); |
2536 | /* unlocks the pinned mutex */ | 3412 | /* unlocks the pinned mutex */ |
2537 | btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); | 3413 | btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); |
2538 | update_reserved_extents(root, bytenr, num_bytes, 0); | 3414 | update_reserved_extents(root, bytenr, num_bytes, 0); |
2539 | ret = 0; | 3415 | ret = 0; |
2540 | } else { | 3416 | } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { |
2541 | ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, | 3417 | ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, |
2542 | root_objectid, ref_generation, | 3418 | parent, root_objectid, (int)owner, |
2543 | owner_objectid, | 3419 | BTRFS_DROP_DELAYED_REF, NULL); |
2544 | BTRFS_DROP_DELAYED_REF, 1); | ||
2545 | BUG_ON(ret); | 3420 | BUG_ON(ret); |
2546 | ret = check_ref_cleanup(trans, root, bytenr); | 3421 | ret = check_ref_cleanup(trans, root, bytenr); |
2547 | BUG_ON(ret); | 3422 | BUG_ON(ret); |
3423 | } else { | ||
3424 | ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, | ||
3425 | parent, root_objectid, owner, | ||
3426 | offset, BTRFS_DROP_DELAYED_REF, NULL); | ||
3427 | BUG_ON(ret); | ||
2548 | } | 3428 | } |
2549 | return ret; | 3429 | return ret; |
2550 | } | 3430 | } |
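With ref_generation gone from the interface, btrfs_free_extent() above dispatches purely on the owner: tree-log blocks are pinned directly, owners below BTRFS_FIRST_FREE_OBJECTID (tree levels) become a delayed tree ref drop, and everything else becomes a delayed data ref drop. A short sketch of the two call shapes follows; every value in it is a placeholder chosen only to show which argument carries what.

	/* Sketch only: the two shapes of the reworked btrfs_free_extent() call. */
	static void free_extent_call_shapes(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root)
	{
		u64 tree_bytenr = 0, tree_size = 4096, parent = 0;
		u64 data_bytenr = 0, data_bytes = 8192;
		u64 ino = 256, file_off = 0;
		int level = 1;

		/* metadata: owner is the block's level, offset is unused */
		btrfs_free_extent(trans, root, tree_bytenr, tree_size, parent,
				  root->root_key.objectid, level, 0);

		/* data: owner is the inode number, offset the file offset */
		btrfs_free_extent(trans, root, data_bytenr, data_bytes, 0,
				  root->root_key.objectid, ino, file_off);
	}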
@@ -2719,7 +3599,7 @@ refill_cluster: | |||
2719 | last_ptr_loop = 0; | 3599 | last_ptr_loop = 0; |
2720 | 3600 | ||
2721 | /* allocate a cluster in this block group */ | 3601 | /* allocate a cluster in this block group */ |
2722 | ret = btrfs_find_space_cluster(trans, | 3602 | ret = btrfs_find_space_cluster(trans, root, |
2723 | block_group, last_ptr, | 3603 | block_group, last_ptr, |
2724 | offset, num_bytes, | 3604 | offset, num_bytes, |
2725 | empty_cluster + empty_size); | 3605 | empty_cluster + empty_size); |
@@ -2969,99 +3849,147 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, | |||
2969 | return ret; | 3849 | return ret; |
2970 | } | 3850 | } |
2971 | 3851 | ||
2972 | static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, | 3852 | static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, |
2973 | struct btrfs_root *root, u64 parent, | 3853 | struct btrfs_root *root, |
2974 | u64 root_objectid, u64 ref_generation, | 3854 | u64 parent, u64 root_objectid, |
2975 | u64 owner, struct btrfs_key *ins, | 3855 | u64 flags, u64 owner, u64 offset, |
2976 | int ref_mod) | 3856 | struct btrfs_key *ins, int ref_mod) |
2977 | { | 3857 | { |
2978 | int ret; | 3858 | int ret; |
2979 | u64 super_used; | 3859 | struct btrfs_fs_info *fs_info = root->fs_info; |
2980 | u64 root_used; | ||
2981 | u64 num_bytes = ins->offset; | ||
2982 | u32 sizes[2]; | ||
2983 | struct btrfs_fs_info *info = root->fs_info; | ||
2984 | struct btrfs_root *extent_root = info->extent_root; | ||
2985 | struct btrfs_extent_item *extent_item; | 3860 | struct btrfs_extent_item *extent_item; |
2986 | struct btrfs_extent_ref *ref; | 3861 | struct btrfs_extent_inline_ref *iref; |
2987 | struct btrfs_path *path; | 3862 | struct btrfs_path *path; |
2988 | struct btrfs_key keys[2]; | 3863 | struct extent_buffer *leaf; |
2989 | 3864 | int type; | |
2990 | if (parent == 0) | 3865 | u32 size; |
2991 | parent = ins->objectid; | ||
2992 | |||
2993 | /* block accounting for super block */ | ||
2994 | spin_lock(&info->delalloc_lock); | ||
2995 | super_used = btrfs_super_bytes_used(&info->super_copy); | ||
2996 | btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes); | ||
2997 | 3866 | ||
2998 | /* block accounting for root item */ | 3867 | if (parent > 0) |
2999 | root_used = btrfs_root_used(&root->root_item); | 3868 | type = BTRFS_SHARED_DATA_REF_KEY; |
3000 | btrfs_set_root_used(&root->root_item, root_used + num_bytes); | 3869 | else |
3001 | spin_unlock(&info->delalloc_lock); | 3870 | type = BTRFS_EXTENT_DATA_REF_KEY; |
3002 | 3871 | ||
3003 | memcpy(&keys[0], ins, sizeof(*ins)); | 3872 | size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); |
3004 | keys[1].objectid = ins->objectid; | ||
3005 | keys[1].type = BTRFS_EXTENT_REF_KEY; | ||
3006 | keys[1].offset = parent; | ||
3007 | sizes[0] = sizeof(*extent_item); | ||
3008 | sizes[1] = sizeof(*ref); | ||
3009 | 3873 | ||
3010 | path = btrfs_alloc_path(); | 3874 | path = btrfs_alloc_path(); |
3011 | BUG_ON(!path); | 3875 | BUG_ON(!path); |
3012 | 3876 | ||
3013 | path->leave_spinning = 1; | 3877 | path->leave_spinning = 1; |
3014 | ret = btrfs_insert_empty_items(trans, extent_root, path, keys, | 3878 | ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, |
3015 | sizes, 2); | 3879 | ins, size); |
3016 | BUG_ON(ret); | 3880 | BUG_ON(ret); |
3017 | 3881 | ||
3018 | extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0], | 3882 | leaf = path->nodes[0]; |
3883 | extent_item = btrfs_item_ptr(leaf, path->slots[0], | ||
3019 | struct btrfs_extent_item); | 3884 | struct btrfs_extent_item); |
3020 | btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod); | 3885 | btrfs_set_extent_refs(leaf, extent_item, ref_mod); |
3021 | ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, | 3886 | btrfs_set_extent_generation(leaf, extent_item, trans->transid); |
3022 | struct btrfs_extent_ref); | 3887 | btrfs_set_extent_flags(leaf, extent_item, |
3023 | 3888 | flags | BTRFS_EXTENT_FLAG_DATA); | |
3024 | btrfs_set_ref_root(path->nodes[0], ref, root_objectid); | 3889 | |
3025 | btrfs_set_ref_generation(path->nodes[0], ref, ref_generation); | 3890 | iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); |
3026 | btrfs_set_ref_objectid(path->nodes[0], ref, owner); | 3891 | btrfs_set_extent_inline_ref_type(leaf, iref, type); |
3027 | btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod); | 3892 | if (parent > 0) { |
3893 | struct btrfs_shared_data_ref *ref; | ||
3894 | ref = (struct btrfs_shared_data_ref *)(iref + 1); | ||
3895 | btrfs_set_extent_inline_ref_offset(leaf, iref, parent); | ||
3896 | btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); | ||
3897 | } else { | ||
3898 | struct btrfs_extent_data_ref *ref; | ||
3899 | ref = (struct btrfs_extent_data_ref *)(&iref->offset); | ||
3900 | btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); | ||
3901 | btrfs_set_extent_data_ref_objectid(leaf, ref, owner); | ||
3902 | btrfs_set_extent_data_ref_offset(leaf, ref, offset); | ||
3903 | btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); | ||
3904 | } | ||
3028 | 3905 | ||
3029 | btrfs_mark_buffer_dirty(path->nodes[0]); | 3906 | btrfs_mark_buffer_dirty(path->nodes[0]); |
3030 | |||
3031 | trans->alloc_exclude_start = 0; | ||
3032 | trans->alloc_exclude_nr = 0; | ||
3033 | btrfs_free_path(path); | 3907 | btrfs_free_path(path); |
3034 | 3908 | ||
3035 | if (ret) | 3909 | ret = update_block_group(trans, root, ins->objectid, ins->offset, |
3036 | goto out; | 3910 | 1, 0); |
3037 | |||
3038 | ret = update_block_group(trans, root, ins->objectid, | ||
3039 | ins->offset, 1, 0); | ||
3040 | if (ret) { | 3911 | if (ret) { |
3041 | printk(KERN_ERR "btrfs update block group failed for %llu " | 3912 | printk(KERN_ERR "btrfs update block group failed for %llu " |
3042 | "%llu\n", (unsigned long long)ins->objectid, | 3913 | "%llu\n", (unsigned long long)ins->objectid, |
3043 | (unsigned long long)ins->offset); | 3914 | (unsigned long long)ins->offset); |
3044 | BUG(); | 3915 | BUG(); |
3045 | } | 3916 | } |
3046 | out: | ||
3047 | return ret; | 3917 | return ret; |
3048 | } | 3918 | } |
3049 | 3919 | ||
3050 | int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, | 3920 | static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, |
3051 | struct btrfs_root *root, u64 parent, | 3921 | struct btrfs_root *root, |
3052 | u64 root_objectid, u64 ref_generation, | 3922 | u64 parent, u64 root_objectid, |
3053 | u64 owner, struct btrfs_key *ins) | 3923 | u64 flags, struct btrfs_disk_key *key, |
3924 | int level, struct btrfs_key *ins) | ||
3054 | { | 3925 | { |
3055 | int ret; | 3926 | int ret; |
3927 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
3928 | struct btrfs_extent_item *extent_item; | ||
3929 | struct btrfs_tree_block_info *block_info; | ||
3930 | struct btrfs_extent_inline_ref *iref; | ||
3931 | struct btrfs_path *path; | ||
3932 | struct extent_buffer *leaf; | ||
3933 | u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref); | ||
3056 | 3934 | ||
3057 | if (root_objectid == BTRFS_TREE_LOG_OBJECTID) | 3935 | path = btrfs_alloc_path(); |
3058 | return 0; | 3936 | BUG_ON(!path); |
3059 | 3937 | ||
3060 | ret = btrfs_add_delayed_ref(trans, ins->objectid, | 3938 | path->leave_spinning = 1; |
3061 | ins->offset, parent, root_objectid, | 3939 | ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, |
3062 | ref_generation, owner, | 3940 | ins, size); |
3063 | BTRFS_ADD_DELAYED_EXTENT, 0); | ||
3064 | BUG_ON(ret); | 3941 | BUG_ON(ret); |
3942 | |||
3943 | leaf = path->nodes[0]; | ||
3944 | extent_item = btrfs_item_ptr(leaf, path->slots[0], | ||
3945 | struct btrfs_extent_item); | ||
3946 | btrfs_set_extent_refs(leaf, extent_item, 1); | ||
3947 | btrfs_set_extent_generation(leaf, extent_item, trans->transid); | ||
3948 | btrfs_set_extent_flags(leaf, extent_item, | ||
3949 | flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); | ||
3950 | block_info = (struct btrfs_tree_block_info *)(extent_item + 1); | ||
3951 | |||
3952 | btrfs_set_tree_block_key(leaf, block_info, key); | ||
3953 | btrfs_set_tree_block_level(leaf, block_info, level); | ||
3954 | |||
3955 | iref = (struct btrfs_extent_inline_ref *)(block_info + 1); | ||
3956 | if (parent > 0) { | ||
3957 | BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); | ||
3958 | btrfs_set_extent_inline_ref_type(leaf, iref, | ||
3959 | BTRFS_SHARED_BLOCK_REF_KEY); | ||
3960 | btrfs_set_extent_inline_ref_offset(leaf, iref, parent); | ||
3961 | } else { | ||
3962 | btrfs_set_extent_inline_ref_type(leaf, iref, | ||
3963 | BTRFS_TREE_BLOCK_REF_KEY); | ||
3964 | btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); | ||
3965 | } | ||
3966 | |||
3967 | btrfs_mark_buffer_dirty(leaf); | ||
3968 | btrfs_free_path(path); | ||
3969 | |||
3970 | ret = update_block_group(trans, root, ins->objectid, ins->offset, | ||
3971 | 1, 0); | ||
3972 | if (ret) { | ||
3973 | printk(KERN_ERR "btrfs update block group failed for %llu " | ||
3974 | "%llu\n", (unsigned long long)ins->objectid, | ||
3975 | (unsigned long long)ins->offset); | ||
3976 | BUG(); | ||
3977 | } | ||
3978 | return ret; | ||
3979 | } | ||
3980 | |||
3981 | int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, | ||
3982 | struct btrfs_root *root, | ||
3983 | u64 root_objectid, u64 owner, | ||
3984 | u64 offset, struct btrfs_key *ins) | ||
3985 | { | ||
3986 | int ret; | ||
3987 | |||
3988 | BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); | ||
3989 | |||
3990 | ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset, | ||
3991 | 0, root_objectid, owner, offset, | ||
3992 | BTRFS_ADD_DELAYED_EXTENT, NULL); | ||
3065 | return ret; | 3993 | return ret; |
3066 | } | 3994 | } |
3067 | 3995 | ||
@@ -3070,10 +3998,10 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, | |||
3070 | * an extent has been allocated and makes sure to clear the free | 3998 | * an extent has been allocated and makes sure to clear the free |
3071 | * space cache bits as well | 3999 | * space cache bits as well |
3072 | */ | 4000 | */ |
3073 | int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, | 4001 | int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, |
3074 | struct btrfs_root *root, u64 parent, | 4002 | struct btrfs_root *root, |
3075 | u64 root_objectid, u64 ref_generation, | 4003 | u64 root_objectid, u64 owner, u64 offset, |
3076 | u64 owner, struct btrfs_key *ins) | 4004 | struct btrfs_key *ins) |
3077 | { | 4005 | { |
3078 | int ret; | 4006 | int ret; |
3079 | struct btrfs_block_group_cache *block_group; | 4007 | struct btrfs_block_group_cache *block_group; |
@@ -3087,8 +4015,8 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, | |||
3087 | ins->offset); | 4015 | ins->offset); |
3088 | BUG_ON(ret); | 4016 | BUG_ON(ret); |
3089 | btrfs_put_block_group(block_group); | 4017 | btrfs_put_block_group(block_group); |
3090 | ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, | 4018 | ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, |
3091 | ref_generation, owner, ins, 1); | 4019 | 0, owner, offset, ins, 1); |
3092 | return ret; | 4020 | return ret; |
3093 | } | 4021 | } |
3094 | 4022 | ||
@@ -3099,26 +4027,48 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, | |||
3099 | * | 4027 | * |
3100 | * returns 0 if everything worked, non-zero otherwise. | 4028 | * returns 0 if everything worked, non-zero otherwise. |
3101 | */ | 4029 | */ |
3102 | int btrfs_alloc_extent(struct btrfs_trans_handle *trans, | 4030 | static int alloc_tree_block(struct btrfs_trans_handle *trans, |
3103 | struct btrfs_root *root, | 4031 | struct btrfs_root *root, |
3104 | u64 num_bytes, u64 parent, u64 min_alloc_size, | 4032 | u64 num_bytes, u64 parent, u64 root_objectid, |
3105 | u64 root_objectid, u64 ref_generation, | 4033 | struct btrfs_disk_key *key, int level, |
3106 | u64 owner_objectid, u64 empty_size, u64 hint_byte, | 4034 | u64 empty_size, u64 hint_byte, u64 search_end, |
3107 | u64 search_end, struct btrfs_key *ins, u64 data) | 4035 | struct btrfs_key *ins) |
3108 | { | 4036 | { |
3109 | int ret; | 4037 | int ret; |
3110 | ret = __btrfs_reserve_extent(trans, root, num_bytes, | 4038 | u64 flags = 0; |
3111 | min_alloc_size, empty_size, hint_byte, | 4039 | |
3112 | search_end, ins, data); | 4040 | ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes, |
4041 | empty_size, hint_byte, search_end, | ||
4042 | ins, 0); | ||
3113 | BUG_ON(ret); | 4043 | BUG_ON(ret); |
4044 | |||
4045 | if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { | ||
4046 | if (parent == 0) | ||
4047 | parent = ins->objectid; | ||
4048 | flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; | ||
4049 | } else | ||
4050 | BUG_ON(parent > 0); | ||
4051 | |||
4052 | update_reserved_extents(root, ins->objectid, ins->offset, 1); | ||
3114 | if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { | 4053 | if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { |
3115 | ret = btrfs_add_delayed_ref(trans, ins->objectid, | 4054 | struct btrfs_delayed_extent_op *extent_op; |
3116 | ins->offset, parent, root_objectid, | 4055 | extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); |
3117 | ref_generation, owner_objectid, | 4056 | BUG_ON(!extent_op); |
3118 | BTRFS_ADD_DELAYED_EXTENT, 0); | 4057 | if (key) |
4058 | memcpy(&extent_op->key, key, sizeof(extent_op->key)); | ||
4059 | else | ||
4060 | memset(&extent_op->key, 0, sizeof(extent_op->key)); | ||
4061 | extent_op->flags_to_set = flags; | ||
4062 | extent_op->update_key = 1; | ||
4063 | extent_op->update_flags = 1; | ||
4064 | extent_op->is_data = 0; | ||
4065 | |||
4066 | ret = btrfs_add_delayed_tree_ref(trans, ins->objectid, | ||
4067 | ins->offset, parent, root_objectid, | ||
4068 | level, BTRFS_ADD_DELAYED_EXTENT, | ||
4069 | extent_op); | ||
3119 | BUG_ON(ret); | 4070 | BUG_ON(ret); |
3120 | } | 4071 | } |
3121 | update_reserved_extents(root, ins->objectid, ins->offset, 1); | ||
3122 | return ret; | 4072 | return ret; |
3123 | } | 4073 | } |
3124 | 4074 | ||
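alloc_tree_block() above introduces the btrfs_delayed_extent_op pattern: the caller fills in the key and flags it wants applied when the delayed ref is run, and ownership of the op passes to the delayed-ref code on success (it is kfreed after run_one_delayed_ref(), as in the first hunk of this file). A sketch of that convention, mirroring the field settings used here; the helper name is illustrative.

	/* Sketch only: queue a new tree block ref with a deferred key/flags
	 * update.  The op is freed by the delayed-ref machinery once it has
	 * run; the caller frees it only if queuing fails.
	 */
	static int queue_new_tree_block_ref(struct btrfs_trans_handle *trans,
					    u64 bytenr, u64 num_bytes,
					    u64 parent, u64 root_objectid,
					    int level,
					    struct btrfs_disk_key *first_key)
	{
		struct btrfs_delayed_extent_op *op;
		int ret;

		op = kmalloc(sizeof(*op), GFP_NOFS);
		if (!op)
			return -ENOMEM;

		memcpy(&op->key, first_key, sizeof(op->key));
		op->flags_to_set = 0;	/* or BTRFS_BLOCK_FLAG_FULL_BACKREF */
		op->update_key = 1;
		op->update_flags = 1;
		op->is_data = 0;

		ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
						 parent, root_objectid, level,
						 BTRFS_ADD_DELAYED_EXTENT, op);
		if (ret)
			kfree(op);	/* never queued, still ours to free */
		return ret;
	}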
@@ -3157,21 +4107,17 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, | |||
3157 | * returns the tree buffer or NULL. | 4107 | * returns the tree buffer or NULL. |
3158 | */ | 4108 | */ |
3159 | struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, | 4109 | struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, |
3160 | struct btrfs_root *root, | 4110 | struct btrfs_root *root, u32 blocksize, |
3161 | u32 blocksize, u64 parent, | 4111 | u64 parent, u64 root_objectid, |
3162 | u64 root_objectid, | 4112 | struct btrfs_disk_key *key, int level, |
3163 | u64 ref_generation, | 4113 | u64 hint, u64 empty_size) |
3164 | int level, | ||
3165 | u64 hint, | ||
3166 | u64 empty_size) | ||
3167 | { | 4114 | { |
3168 | struct btrfs_key ins; | 4115 | struct btrfs_key ins; |
3169 | int ret; | 4116 | int ret; |
3170 | struct extent_buffer *buf; | 4117 | struct extent_buffer *buf; |
3171 | 4118 | ||
3172 | ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize, | 4119 | ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid, |
3173 | root_objectid, ref_generation, level, | 4120 | key, level, empty_size, hint, (u64)-1, &ins); |
3174 | empty_size, hint, (u64)-1, &ins, 0); | ||
3175 | if (ret) { | 4121 | if (ret) { |
3176 | BUG_ON(ret > 0); | 4122 | BUG_ON(ret > 0); |
3177 | return ERR_PTR(ret); | 4123 | return ERR_PTR(ret); |
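btrfs_alloc_free_block() above now takes the first key and level of the block being created instead of ref_generation, so alloc_tree_block() can describe the new block through a delayed extent op. A hedged sketch of a caller; the wrapper name is illustrative, hint_byte is a placeholder, and passing parent == 0 reflects the check in alloc_tree_block() that only the reloc tree may supply a parent.

	/* Sketch only: allocate a child block for an ordinary (non-reloc)
	 * root with the reworked signature.
	 */
	static struct extent_buffer *
	alloc_child_block(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, u32 blocksize,
			  struct btrfs_disk_key *first_key, int level,
			  u64 hint_byte)
	{
		/* parent stays 0: only BTRFS_TREE_RELOC_OBJECTID passes a
		 * parent and gets BTRFS_BLOCK_FLAG_FULL_BACKREF set */
		return btrfs_alloc_free_block(trans, root, blocksize, 0,
					      root->root_key.objectid,
					      first_key, level, hint_byte, 0);
	}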
@@ -3185,32 +4131,19 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, | |||
3185 | int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, | 4131 | int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, |
3186 | struct btrfs_root *root, struct extent_buffer *leaf) | 4132 | struct btrfs_root *root, struct extent_buffer *leaf) |
3187 | { | 4133 | { |
3188 | u64 leaf_owner; | 4134 | u64 disk_bytenr; |
3189 | u64 leaf_generation; | 4135 | u64 num_bytes; |
3190 | struct refsort *sorted; | ||
3191 | struct btrfs_key key; | 4136 | struct btrfs_key key; |
3192 | struct btrfs_file_extent_item *fi; | 4137 | struct btrfs_file_extent_item *fi; |
4138 | u32 nritems; | ||
3193 | int i; | 4139 | int i; |
3194 | int nritems; | ||
3195 | int ret; | 4140 | int ret; |
3196 | int refi = 0; | ||
3197 | int slot; | ||
3198 | 4141 | ||
3199 | BUG_ON(!btrfs_is_leaf(leaf)); | 4142 | BUG_ON(!btrfs_is_leaf(leaf)); |
3200 | nritems = btrfs_header_nritems(leaf); | 4143 | nritems = btrfs_header_nritems(leaf); |
3201 | leaf_owner = btrfs_header_owner(leaf); | ||
3202 | leaf_generation = btrfs_header_generation(leaf); | ||
3203 | 4144 | ||
3204 | sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS); | ||
3205 | /* we do this loop twice. The first time we build a list | ||
3206 | * of the extents we have a reference on, then we sort the list | ||
3207 | * by bytenr. The second time around we actually do the | ||
3208 | * extent freeing. | ||
3209 | */ | ||
3210 | for (i = 0; i < nritems; i++) { | 4145 | for (i = 0; i < nritems; i++) { |
3211 | u64 disk_bytenr; | ||
3212 | cond_resched(); | 4146 | cond_resched(); |
3213 | |||
3214 | btrfs_item_key_to_cpu(leaf, &key, i); | 4147 | btrfs_item_key_to_cpu(leaf, &key, i); |
3215 | 4148 | ||
3216 | /* only extents have references, skip everything else */ | 4149 | /* only extents have references, skip everything else */ |
@@ -3230,45 +4163,16 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, | |||
3230 | if (disk_bytenr == 0) | 4163 | if (disk_bytenr == 0) |
3231 | continue; | 4164 | continue; |
3232 | 4165 | ||
3233 | sorted[refi].bytenr = disk_bytenr; | 4166 | num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); |
3234 | sorted[refi].slot = i; | 4167 | ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes, |
3235 | refi++; | 4168 | leaf->start, 0, key.objectid, 0); |
3236 | } | ||
3237 | |||
3238 | if (refi == 0) | ||
3239 | goto out; | ||
3240 | |||
3241 | sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); | ||
3242 | |||
3243 | for (i = 0; i < refi; i++) { | ||
3244 | u64 disk_bytenr; | ||
3245 | |||
3246 | disk_bytenr = sorted[i].bytenr; | ||
3247 | slot = sorted[i].slot; | ||
3248 | |||
3249 | cond_resched(); | ||
3250 | |||
3251 | btrfs_item_key_to_cpu(leaf, &key, slot); | ||
3252 | if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) | ||
3253 | continue; | ||
3254 | |||
3255 | fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); | ||
3256 | |||
3257 | ret = btrfs_free_extent(trans, root, disk_bytenr, | ||
3258 | btrfs_file_extent_disk_num_bytes(leaf, fi), | ||
3259 | leaf->start, leaf_owner, leaf_generation, | ||
3260 | key.objectid, 0); | ||
3261 | BUG_ON(ret); | 4169 | BUG_ON(ret); |
3262 | |||
3263 | atomic_inc(&root->fs_info->throttle_gen); | ||
3264 | wake_up(&root->fs_info->transaction_throttle); | ||
3265 | cond_resched(); | ||
3266 | } | 4170 | } |
3267 | out: | ||
3268 | kfree(sorted); | ||
3269 | return 0; | 4171 | return 0; |
3270 | } | 4172 | } |
3271 | 4173 | ||
4174 | #if 0 | ||
4175 | |||
3272 | static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, | 4176 | static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, |
3273 | struct btrfs_root *root, | 4177 | struct btrfs_root *root, |
3274 | struct btrfs_leaf_ref *ref) | 4178 | struct btrfs_leaf_ref *ref) |
@@ -3311,13 +4215,14 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, | |||
3311 | return 0; | 4215 | return 0; |
3312 | } | 4216 | } |
3313 | 4217 | ||
4218 | |||
3314 | static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, | 4219 | static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, |
3315 | struct btrfs_root *root, u64 start, | 4220 | struct btrfs_root *root, u64 start, |
3316 | u64 len, u32 *refs) | 4221 | u64 len, u32 *refs) |
3317 | { | 4222 | { |
3318 | int ret; | 4223 | int ret; |
3319 | 4224 | ||
3320 | ret = btrfs_lookup_extent_ref(trans, root, start, len, refs); | 4225 | ret = btrfs_lookup_extent_refs(trans, root, start, len, refs); |
3321 | BUG_ON(ret); | 4226 | BUG_ON(ret); |
3322 | 4227 | ||
3323 | #if 0 /* some debugging code in case we see problems here */ | 4228 | #if 0 /* some debugging code in case we see problems here */ |
@@ -3352,6 +4257,7 @@ static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, | |||
3352 | return ret; | 4257 | return ret; |
3353 | } | 4258 | } |
3354 | 4259 | ||
4260 | |||
3355 | /* | 4261 | /* |
3356 | * this is used while deleting old snapshots, and it drops the refs | 4262 | * this is used while deleting old snapshots, and it drops the refs |
3357 | * on a whole subtree starting from a level 1 node. | 4263 | * on a whole subtree starting from a level 1 node. |
@@ -3645,32 +4551,36 @@ out: | |||
3645 | cond_resched(); | 4551 | cond_resched(); |
3646 | return 0; | 4552 | return 0; |
3647 | } | 4553 | } |
4554 | #endif | ||
3648 | 4555 | ||
3649 | /* | 4556 | /* |
3650 | * helper function for drop_subtree, this function is similar to | 4557 | * helper function for drop_subtree, this function is similar to |
3651 | * walk_down_tree. The main difference is that it checks reference | 4558 | * walk_down_tree. The main difference is that it checks reference |
3652 | * counts while tree blocks are locked. | 4559 | * counts while tree blocks are locked. |
3653 | */ | 4560 | */ |
3654 | static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, | 4561 | static noinline int walk_down_tree(struct btrfs_trans_handle *trans, |
3655 | struct btrfs_root *root, | 4562 | struct btrfs_root *root, |
3656 | struct btrfs_path *path, int *level) | 4563 | struct btrfs_path *path, int *level) |
3657 | { | 4564 | { |
3658 | struct extent_buffer *next; | 4565 | struct extent_buffer *next; |
3659 | struct extent_buffer *cur; | 4566 | struct extent_buffer *cur; |
3660 | struct extent_buffer *parent; | 4567 | struct extent_buffer *parent; |
3661 | u64 bytenr; | 4568 | u64 bytenr; |
3662 | u64 ptr_gen; | 4569 | u64 ptr_gen; |
4570 | u64 refs; | ||
4571 | u64 flags; | ||
3663 | u32 blocksize; | 4572 | u32 blocksize; |
3664 | u32 refs; | ||
3665 | int ret; | 4573 | int ret; |
3666 | 4574 | ||
3667 | cur = path->nodes[*level]; | 4575 | cur = path->nodes[*level]; |
3668 | ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len, | 4576 | ret = btrfs_lookup_extent_info(trans, root, cur->start, cur->len, |
3669 | &refs); | 4577 | &refs, &flags); |
3670 | BUG_ON(ret); | 4578 | BUG_ON(ret); |
3671 | if (refs > 1) | 4579 | if (refs > 1) |
3672 | goto out; | 4580 | goto out; |
3673 | 4581 | ||
4582 | BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); | ||
4583 | |||
3674 | while (*level >= 0) { | 4584 | while (*level >= 0) { |
3675 | cur = path->nodes[*level]; | 4585 | cur = path->nodes[*level]; |
3676 | if (*level == 0) { | 4586 | if (*level == 0) { |
@@ -3692,16 +4602,15 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, | |||
3692 | btrfs_tree_lock(next); | 4602 | btrfs_tree_lock(next); |
3693 | btrfs_set_lock_blocking(next); | 4603 | btrfs_set_lock_blocking(next); |
3694 | 4604 | ||
3695 | ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, | 4605 | ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, |
3696 | &refs); | 4606 | &refs, &flags); |
3697 | BUG_ON(ret); | 4607 | BUG_ON(ret); |
3698 | if (refs > 1) { | 4608 | if (refs > 1) { |
3699 | parent = path->nodes[*level]; | 4609 | parent = path->nodes[*level]; |
3700 | ret = btrfs_free_extent(trans, root, bytenr, | 4610 | ret = btrfs_free_extent(trans, root, bytenr, |
3701 | blocksize, parent->start, | 4611 | blocksize, parent->start, |
3702 | btrfs_header_owner(parent), | 4612 | btrfs_header_owner(parent), |
3703 | btrfs_header_generation(parent), | 4613 | *level - 1, 0); |
3704 | *level - 1, 1); | ||
3705 | BUG_ON(ret); | 4614 | BUG_ON(ret); |
3706 | path->slots[*level]++; | 4615 | path->slots[*level]++; |
3707 | btrfs_tree_unlock(next); | 4616 | btrfs_tree_unlock(next); |
@@ -3709,6 +4618,8 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, | |||
3709 | continue; | 4618 | continue; |
3710 | } | 4619 | } |
3711 | 4620 | ||
4621 | BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); | ||
4622 | |||
3712 | *level = btrfs_header_level(next); | 4623 | *level = btrfs_header_level(next); |
3713 | path->nodes[*level] = next; | 4624 | path->nodes[*level] = next; |
3714 | path->slots[*level] = 0; | 4625 | path->slots[*level] = 0; |
@@ -3716,13 +4627,15 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, | |||
3716 | cond_resched(); | 4627 | cond_resched(); |
3717 | } | 4628 | } |
3718 | out: | 4629 | out: |
3719 | parent = path->nodes[*level + 1]; | 4630 | if (path->nodes[*level] == root->node) |
4631 | parent = path->nodes[*level]; | ||
4632 | else | ||
4633 | parent = path->nodes[*level + 1]; | ||
3720 | bytenr = path->nodes[*level]->start; | 4634 | bytenr = path->nodes[*level]->start; |
3721 | blocksize = path->nodes[*level]->len; | 4635 | blocksize = path->nodes[*level]->len; |
3722 | 4636 | ||
3723 | ret = btrfs_free_extent(trans, root, bytenr, blocksize, | 4637 | ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent->start, |
3724 | parent->start, btrfs_header_owner(parent), | 4638 | btrfs_header_owner(parent), *level, 0); |
3725 | btrfs_header_generation(parent), *level, 1); | ||
3726 | BUG_ON(ret); | 4639 | BUG_ON(ret); |
3727 | 4640 | ||
3728 | if (path->locks[*level]) { | 4641 | if (path->locks[*level]) { |
@@ -3746,8 +4659,6 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, | |||
3746 | struct btrfs_path *path, | 4659 | struct btrfs_path *path, |
3747 | int *level, int max_level) | 4660 | int *level, int max_level) |
3748 | { | 4661 | { |
3749 | u64 root_owner; | ||
3750 | u64 root_gen; | ||
3751 | struct btrfs_root_item *root_item = &root->root_item; | 4662 | struct btrfs_root_item *root_item = &root->root_item; |
3752 | int i; | 4663 | int i; |
3753 | int slot; | 4664 | int slot; |
@@ -3755,24 +4666,22 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, | |||
3755 | 4666 | ||
3756 | for (i = *level; i < max_level && path->nodes[i]; i++) { | 4667 | for (i = *level; i < max_level && path->nodes[i]; i++) { |
3757 | slot = path->slots[i]; | 4668 | slot = path->slots[i]; |
3758 | if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { | 4669 | if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { |
3759 | struct extent_buffer *node; | ||
3760 | struct btrfs_disk_key disk_key; | ||
3761 | |||
3762 | /* | 4670 | /* |
3763 | * there is more work to do in this level. | 4671 | * there is more work to do in this level. |
3764 | * Update the drop_progress marker to reflect | 4672 | * Update the drop_progress marker to reflect |
3765 | * the work we've done so far, and then bump | 4673 | * the work we've done so far, and then bump |
3766 | * the slot number | 4674 | * the slot number |
3767 | */ | 4675 | */ |
3768 | node = path->nodes[i]; | ||
3769 | path->slots[i]++; | 4676 | path->slots[i]++; |
3770 | *level = i; | ||
3771 | WARN_ON(*level == 0); | 4677 | WARN_ON(*level == 0); |
3772 | btrfs_node_key(node, &disk_key, path->slots[i]); | 4678 | if (max_level == BTRFS_MAX_LEVEL) { |
3773 | memcpy(&root_item->drop_progress, | 4679 | btrfs_node_key(path->nodes[i], |
3774 | &disk_key, sizeof(disk_key)); | 4680 | &root_item->drop_progress, |
3775 | root_item->drop_level = i; | 4681 | path->slots[i]); |
4682 | root_item->drop_level = i; | ||
4683 | } | ||
4684 | *level = i; | ||
3776 | return 0; | 4685 | return 0; |
3777 | } else { | 4686 | } else { |
3778 | struct extent_buffer *parent; | 4687 | struct extent_buffer *parent; |
@@ -3786,22 +4695,20 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, | |||
3786 | else | 4695 | else |
3787 | parent = path->nodes[*level + 1]; | 4696 | parent = path->nodes[*level + 1]; |
3788 | 4697 | ||
3789 | root_owner = btrfs_header_owner(parent); | 4698 | clean_tree_block(trans, root, path->nodes[i]); |
3790 | root_gen = btrfs_header_generation(parent); | ||
3791 | |||
3792 | clean_tree_block(trans, root, path->nodes[*level]); | ||
3793 | ret = btrfs_free_extent(trans, root, | 4699 | ret = btrfs_free_extent(trans, root, |
3794 | path->nodes[*level]->start, | 4700 | path->nodes[i]->start, |
3795 | path->nodes[*level]->len, | 4701 | path->nodes[i]->len, |
3796 | parent->start, root_owner, | 4702 | parent->start, |
3797 | root_gen, *level, 1); | 4703 | btrfs_header_owner(parent), |
4704 | *level, 0); | ||
3798 | BUG_ON(ret); | 4705 | BUG_ON(ret); |
3799 | if (path->locks[*level]) { | 4706 | if (path->locks[*level]) { |
3800 | btrfs_tree_unlock(path->nodes[*level]); | 4707 | btrfs_tree_unlock(path->nodes[i]); |
3801 | path->locks[*level] = 0; | 4708 | path->locks[i] = 0; |
3802 | } | 4709 | } |
3803 | free_extent_buffer(path->nodes[*level]); | 4710 | free_extent_buffer(path->nodes[i]); |
3804 | path->nodes[*level] = NULL; | 4711 | path->nodes[i] = NULL; |
3805 | *level = i + 1; | 4712 | *level = i + 1; |
3806 | } | 4713 | } |
3807 | } | 4714 | } |
@@ -3820,21 +4727,18 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root | |||
3820 | int wret; | 4727 | int wret; |
3821 | int level; | 4728 | int level; |
3822 | struct btrfs_path *path; | 4729 | struct btrfs_path *path; |
3823 | int i; | ||
3824 | int orig_level; | ||
3825 | int update_count; | 4730 | int update_count; |
3826 | struct btrfs_root_item *root_item = &root->root_item; | 4731 | struct btrfs_root_item *root_item = &root->root_item; |
3827 | 4732 | ||
3828 | WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex)); | ||
3829 | path = btrfs_alloc_path(); | 4733 | path = btrfs_alloc_path(); |
3830 | BUG_ON(!path); | 4734 | BUG_ON(!path); |
3831 | 4735 | ||
3832 | level = btrfs_header_level(root->node); | 4736 | level = btrfs_header_level(root->node); |
3833 | orig_level = level; | ||
3834 | if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { | 4737 | if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { |
3835 | path->nodes[level] = root->node; | 4738 | path->nodes[level] = btrfs_lock_root_node(root); |
3836 | extent_buffer_get(root->node); | 4739 | btrfs_set_lock_blocking(path->nodes[level]); |
3837 | path->slots[level] = 0; | 4740 | path->slots[level] = 0; |
4741 | path->locks[level] = 1; | ||
3838 | } else { | 4742 | } else { |
3839 | struct btrfs_key key; | 4743 | struct btrfs_key key; |
3840 | struct btrfs_disk_key found_key; | 4744 | struct btrfs_disk_key found_key; |
@@ -3856,12 +4760,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root | |||
3856 | * unlock our path, this is safe because only this | 4760 | * unlock our path, this is safe because only this |
3857 | * function is allowed to delete this snapshot | 4761 | * function is allowed to delete this snapshot |
3858 | */ | 4762 | */ |
3859 | for (i = 0; i < BTRFS_MAX_LEVEL; i++) { | 4763 | btrfs_unlock_up_safe(path, 0); |
3860 | if (path->nodes[i] && path->locks[i]) { | ||
3861 | path->locks[i] = 0; | ||
3862 | btrfs_tree_unlock(path->nodes[i]); | ||
3863 | } | ||
3864 | } | ||
3865 | } | 4764 | } |
3866 | while (1) { | 4765 | while (1) { |
3867 | unsigned long update; | 4766 | unsigned long update; |
@@ -3882,8 +4781,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root | |||
3882 | ret = -EAGAIN; | 4781 | ret = -EAGAIN; |
3883 | break; | 4782 | break; |
3884 | } | 4783 | } |
3885 | atomic_inc(&root->fs_info->throttle_gen); | ||
3886 | wake_up(&root->fs_info->transaction_throttle); | ||
3887 | for (update_count = 0; update_count < 16; update_count++) { | 4784 | for (update_count = 0; update_count < 16; update_count++) { |
3888 | update = trans->delayed_ref_updates; | 4785 | update = trans->delayed_ref_updates; |
3889 | trans->delayed_ref_updates = 0; | 4786 | trans->delayed_ref_updates = 0; |
@@ -3893,12 +4790,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root | |||
3893 | break; | 4790 | break; |
3894 | } | 4791 | } |
3895 | } | 4792 | } |
3896 | for (i = 0; i <= orig_level; i++) { | ||
3897 | if (path->nodes[i]) { | ||
3898 | free_extent_buffer(path->nodes[i]); | ||
3899 | path->nodes[i] = NULL; | ||
3900 | } | ||
3901 | } | ||
3902 | out: | 4793 | out: |
3903 | btrfs_free_path(path); | 4794 | btrfs_free_path(path); |
3904 | return ret; | 4795 | return ret; |
@@ -3931,7 +4822,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, | |||
3931 | path->slots[level] = 0; | 4822 | path->slots[level] = 0; |
3932 | 4823 | ||
3933 | while (1) { | 4824 | while (1) { |
3934 | wret = walk_down_subtree(trans, root, path, &level); | 4825 | wret = walk_down_tree(trans, root, path, &level); |
3935 | if (wret < 0) | 4826 | if (wret < 0) |
3936 | ret = wret; | 4827 | ret = wret; |
3937 | if (wret != 0) | 4828 | if (wret != 0) |
@@ -3948,6 +4839,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, | |||
3948 | return ret; | 4839 | return ret; |
3949 | } | 4840 | } |
3950 | 4841 | ||
4842 | #if 0 | ||
3951 | static unsigned long calc_ra(unsigned long start, unsigned long last, | 4843 | static unsigned long calc_ra(unsigned long start, unsigned long last, |
3952 | unsigned long nr) | 4844 | unsigned long nr) |
3953 | { | 4845 | { |
@@ -5429,6 +6321,7 @@ out: | |||
5429 | kfree(ref_path); | 6321 | kfree(ref_path); |
5430 | return ret; | 6322 | return ret; |
5431 | } | 6323 | } |
6324 | #endif | ||
5432 | 6325 | ||
5433 | static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) | 6326 | static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) |
5434 | { | 6327 | { |
@@ -5477,7 +6370,8 @@ static int __alloc_chunk_for_shrink(struct btrfs_root *root, | |||
5477 | u64 calc; | 6370 | u64 calc; |
5478 | 6371 | ||
5479 | spin_lock(&shrink_block_group->lock); | 6372 | spin_lock(&shrink_block_group->lock); |
5480 | if (btrfs_block_group_used(&shrink_block_group->item) > 0) { | 6373 | if (btrfs_block_group_used(&shrink_block_group->item) + |
6374 | shrink_block_group->reserved > 0) { | ||
5481 | spin_unlock(&shrink_block_group->lock); | 6375 | spin_unlock(&shrink_block_group->lock); |
5482 | 6376 | ||
5483 | trans = btrfs_start_transaction(root, 1); | 6377 | trans = btrfs_start_transaction(root, 1); |
@@ -5502,6 +6396,17 @@ static int __alloc_chunk_for_shrink(struct btrfs_root *root, | |||
5502 | return 0; | 6396 | return 0; |
5503 | } | 6397 | } |
5504 | 6398 | ||
6399 | |||
6400 | int btrfs_prepare_block_group_relocation(struct btrfs_root *root, | ||
6401 | struct btrfs_block_group_cache *group) | ||
6402 | |||
6403 | { | ||
6404 | __alloc_chunk_for_shrink(root, group, 1); | ||
6405 | set_block_group_readonly(group); | ||
6406 | return 0; | ||
6407 | } | ||
6408 | |||
6409 | #if 0 | ||
5505 | static int __insert_orphan_inode(struct btrfs_trans_handle *trans, | 6410 | static int __insert_orphan_inode(struct btrfs_trans_handle *trans, |
5506 | struct btrfs_root *root, | 6411 | struct btrfs_root *root, |
5507 | u64 objectid, u64 size) | 6412 | u64 objectid, u64 size) |
@@ -5781,6 +6686,7 @@ out: | |||
5781 | btrfs_free_path(path); | 6686 | btrfs_free_path(path); |
5782 | return ret; | 6687 | return ret; |
5783 | } | 6688 | } |
6689 | #endif | ||
5784 | 6690 | ||
5785 | static int find_first_block_group(struct btrfs_root *root, | 6691 | static int find_first_block_group(struct btrfs_root *root, |
5786 | struct btrfs_path *path, struct btrfs_key *key) | 6692 | struct btrfs_path *path, struct btrfs_key *key) |
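
Aside (not part of the patch): the walk_up_tree() hunk above rewrites the slot test from "slot < btrfs_header_nritems(...) - 1" to "slot + 1 < btrfs_header_nritems(...)". One practical difference between the two forms, shown in this small user-space sketch, is that the item count is unsigned, so the subtraction wraps when it is zero and the old test becomes spuriously true. The variable names here are illustrative only.

#include <stdio.h>

int main(void)
{
        unsigned int nritems = 0;   /* empty node */
        unsigned int slot = 0;

        printf("slot < nritems - 1  -> %d\n", slot < nritems - 1);  /* 1: wrapped */
        printf("slot + 1 < nritems  -> %d\n", slot + 1 < nritems);  /* 0: correct */
        return 0;
}
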
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index fe9eb990e443..68260180f587 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c | |||
@@ -476,6 +476,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, | |||
476 | struct extent_state *state; | 476 | struct extent_state *state; |
477 | struct extent_state *prealloc = NULL; | 477 | struct extent_state *prealloc = NULL; |
478 | struct rb_node *node; | 478 | struct rb_node *node; |
479 | u64 last_end; | ||
479 | int err; | 480 | int err; |
480 | int set = 0; | 481 | int set = 0; |
481 | 482 | ||
@@ -498,6 +499,7 @@ again: | |||
498 | if (state->start > end) | 499 | if (state->start > end) |
499 | goto out; | 500 | goto out; |
500 | WARN_ON(state->end < start); | 501 | WARN_ON(state->end < start); |
502 | last_end = state->end; | ||
501 | 503 | ||
502 | /* | 504 | /* |
503 | * | ---- desired range ---- | | 505 | * | ---- desired range ---- | |
@@ -524,9 +526,11 @@ again: | |||
524 | if (err) | 526 | if (err) |
525 | goto out; | 527 | goto out; |
526 | if (state->end <= end) { | 528 | if (state->end <= end) { |
527 | start = state->end + 1; | ||
528 | set |= clear_state_bit(tree, state, bits, | 529 | set |= clear_state_bit(tree, state, bits, |
529 | wake, delete); | 530 | wake, delete); |
531 | if (last_end == (u64)-1) | ||
532 | goto out; | ||
533 | start = last_end + 1; | ||
530 | } else { | 534 | } else { |
531 | start = state->start; | 535 | start = state->start; |
532 | } | 536 | } |
@@ -552,8 +556,10 @@ again: | |||
552 | goto out; | 556 | goto out; |
553 | } | 557 | } |
554 | 558 | ||
555 | start = state->end + 1; | ||
556 | set |= clear_state_bit(tree, state, bits, wake, delete); | 559 | set |= clear_state_bit(tree, state, bits, wake, delete); |
560 | if (last_end == (u64)-1) | ||
561 | goto out; | ||
562 | start = last_end + 1; | ||
557 | goto search_again; | 563 | goto search_again; |
558 | 564 | ||
559 | out: | 565 | out: |
@@ -707,8 +713,10 @@ again: | |||
707 | goto out; | 713 | goto out; |
708 | } | 714 | } |
709 | set_state_bits(tree, state, bits); | 715 | set_state_bits(tree, state, bits); |
710 | start = state->end + 1; | ||
711 | merge_state(tree, state); | 716 | merge_state(tree, state); |
717 | if (last_end == (u64)-1) | ||
718 | goto out; | ||
719 | start = last_end + 1; | ||
712 | goto search_again; | 720 | goto search_again; |
713 | } | 721 | } |
714 | 722 | ||
@@ -742,8 +750,10 @@ again: | |||
742 | goto out; | 750 | goto out; |
743 | if (state->end <= end) { | 751 | if (state->end <= end) { |
744 | set_state_bits(tree, state, bits); | 752 | set_state_bits(tree, state, bits); |
745 | start = state->end + 1; | ||
746 | merge_state(tree, state); | 753 | merge_state(tree, state); |
754 | if (last_end == (u64)-1) | ||
755 | goto out; | ||
756 | start = last_end + 1; | ||
747 | } else { | 757 | } else { |
748 | start = state->start; | 758 | start = state->start; |
749 | } | 759 | } |
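
Aside (not part of the patch): the extent_io.c hunks above cache last_end = state->end before calling clear_state_bit()/set_state_bits(), then continue the search from last_end + 1, bailing out when last_end is (u64)-1 so the increment cannot wrap. A minimal user-space sketch of that pattern, with illustrative names standing in for the extent-state code, is below: the end of the range is saved before handing the object to a routine that may free or merge it.

#include <stdio.h>
#include <stdlib.h>

struct state {
        unsigned long long start;
        unsigned long long end;
};

/* stands in for clear_state_bit(), which may free or merge the state */
static void clear_state(struct state *s)
{
        free(s);
}

int main(void)
{
        struct state *s = malloc(sizeof(*s));
        unsigned long long last_end;

        if (!s)
                return 1;
        s->start = 0;
        s->end = 4095;

        last_end = s->end;      /* cache the end before the state can go away */
        clear_state(s);         /* s must not be touched after this point */

        if (last_end == (unsigned long long)-1)
                return 0;       /* nothing past the end of the range space */

        printf("next search starts at %llu\n", last_end + 1);
        return 0;
}
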
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1d51dc38bb49..126477eaecf5 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c | |||
@@ -291,16 +291,12 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, | |||
291 | { | 291 | { |
292 | u64 extent_end = 0; | 292 | u64 extent_end = 0; |
293 | u64 search_start = start; | 293 | u64 search_start = start; |
294 | u64 leaf_start; | ||
295 | u64 ram_bytes = 0; | 294 | u64 ram_bytes = 0; |
296 | u64 orig_parent = 0; | ||
297 | u64 disk_bytenr = 0; | 295 | u64 disk_bytenr = 0; |
298 | u64 orig_locked_end = locked_end; | 296 | u64 orig_locked_end = locked_end; |
299 | u8 compression; | 297 | u8 compression; |
300 | u8 encryption; | 298 | u8 encryption; |
301 | u16 other_encoding = 0; | 299 | u16 other_encoding = 0; |
302 | u64 root_gen; | ||
303 | u64 root_owner; | ||
304 | struct extent_buffer *leaf; | 300 | struct extent_buffer *leaf; |
305 | struct btrfs_file_extent_item *extent; | 301 | struct btrfs_file_extent_item *extent; |
306 | struct btrfs_path *path; | 302 | struct btrfs_path *path; |
@@ -340,9 +336,6 @@ next_slot: | |||
340 | bookend = 0; | 336 | bookend = 0; |
341 | found_extent = 0; | 337 | found_extent = 0; |
342 | found_inline = 0; | 338 | found_inline = 0; |
343 | leaf_start = 0; | ||
344 | root_gen = 0; | ||
345 | root_owner = 0; | ||
346 | compression = 0; | 339 | compression = 0; |
347 | encryption = 0; | 340 | encryption = 0; |
348 | extent = NULL; | 341 | extent = NULL; |
@@ -417,9 +410,6 @@ next_slot: | |||
417 | if (found_extent) { | 410 | if (found_extent) { |
418 | read_extent_buffer(leaf, &old, (unsigned long)extent, | 411 | read_extent_buffer(leaf, &old, (unsigned long)extent, |
419 | sizeof(old)); | 412 | sizeof(old)); |
420 | root_gen = btrfs_header_generation(leaf); | ||
421 | root_owner = btrfs_header_owner(leaf); | ||
422 | leaf_start = leaf->start; | ||
423 | } | 413 | } |
424 | 414 | ||
425 | if (end < extent_end && end >= key.offset) { | 415 | if (end < extent_end && end >= key.offset) { |
@@ -443,14 +433,14 @@ next_slot: | |||
443 | } | 433 | } |
444 | locked_end = extent_end; | 434 | locked_end = extent_end; |
445 | } | 435 | } |
446 | orig_parent = path->nodes[0]->start; | ||
447 | disk_bytenr = le64_to_cpu(old.disk_bytenr); | 436 | disk_bytenr = le64_to_cpu(old.disk_bytenr); |
448 | if (disk_bytenr != 0) { | 437 | if (disk_bytenr != 0) { |
449 | ret = btrfs_inc_extent_ref(trans, root, | 438 | ret = btrfs_inc_extent_ref(trans, root, |
450 | disk_bytenr, | 439 | disk_bytenr, |
451 | le64_to_cpu(old.disk_num_bytes), | 440 | le64_to_cpu(old.disk_num_bytes), 0, |
452 | orig_parent, root->root_key.objectid, | 441 | root->root_key.objectid, |
453 | trans->transid, inode->i_ino); | 442 | key.objectid, key.offset - |
443 | le64_to_cpu(old.offset)); | ||
454 | BUG_ON(ret); | 444 | BUG_ON(ret); |
455 | } | 445 | } |
456 | } | 446 | } |
@@ -568,17 +558,6 @@ next_slot: | |||
568 | btrfs_mark_buffer_dirty(path->nodes[0]); | 558 | btrfs_mark_buffer_dirty(path->nodes[0]); |
569 | btrfs_set_lock_blocking(path->nodes[0]); | 559 | btrfs_set_lock_blocking(path->nodes[0]); |
570 | 560 | ||
571 | if (disk_bytenr != 0) { | ||
572 | ret = btrfs_update_extent_ref(trans, root, | ||
573 | disk_bytenr, | ||
574 | le64_to_cpu(old.disk_num_bytes), | ||
575 | orig_parent, | ||
576 | leaf->start, | ||
577 | root->root_key.objectid, | ||
578 | trans->transid, ins.objectid); | ||
579 | |||
580 | BUG_ON(ret); | ||
581 | } | ||
582 | path->leave_spinning = 0; | 561 | path->leave_spinning = 0; |
583 | btrfs_release_path(root, path); | 562 | btrfs_release_path(root, path); |
584 | if (disk_bytenr != 0) | 563 | if (disk_bytenr != 0) |
@@ -594,8 +573,9 @@ next_slot: | |||
594 | ret = btrfs_free_extent(trans, root, | 573 | ret = btrfs_free_extent(trans, root, |
595 | old_disk_bytenr, | 574 | old_disk_bytenr, |
596 | le64_to_cpu(old.disk_num_bytes), | 575 | le64_to_cpu(old.disk_num_bytes), |
597 | leaf_start, root_owner, | 576 | 0, root->root_key.objectid, |
598 | root_gen, key.objectid, 0); | 577 | key.objectid, key.offset - |
578 | le64_to_cpu(old.offset)); | ||
599 | BUG_ON(ret); | 579 | BUG_ON(ret); |
600 | *hint_byte = old_disk_bytenr; | 580 | *hint_byte = old_disk_bytenr; |
601 | } | 581 | } |
@@ -664,12 +644,11 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, | |||
664 | u64 bytenr; | 644 | u64 bytenr; |
665 | u64 num_bytes; | 645 | u64 num_bytes; |
666 | u64 extent_end; | 646 | u64 extent_end; |
667 | u64 extent_offset; | 647 | u64 orig_offset; |
668 | u64 other_start; | 648 | u64 other_start; |
669 | u64 other_end; | 649 | u64 other_end; |
670 | u64 split = start; | 650 | u64 split = start; |
671 | u64 locked_end = end; | 651 | u64 locked_end = end; |
672 | u64 orig_parent; | ||
673 | int extent_type; | 652 | int extent_type; |
674 | int split_end = 1; | 653 | int split_end = 1; |
675 | int ret; | 654 | int ret; |
@@ -703,7 +682,7 @@ again: | |||
703 | 682 | ||
704 | bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); | 683 | bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); |
705 | num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); | 684 | num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); |
706 | extent_offset = btrfs_file_extent_offset(leaf, fi); | 685 | orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi); |
707 | 686 | ||
708 | if (key.offset == start) | 687 | if (key.offset == start) |
709 | split = end; | 688 | split = end; |
@@ -711,8 +690,6 @@ again: | |||
711 | if (key.offset == start && extent_end == end) { | 690 | if (key.offset == start && extent_end == end) { |
712 | int del_nr = 0; | 691 | int del_nr = 0; |
713 | int del_slot = 0; | 692 | int del_slot = 0; |
714 | u64 leaf_owner = btrfs_header_owner(leaf); | ||
715 | u64 leaf_gen = btrfs_header_generation(leaf); | ||
716 | other_start = end; | 693 | other_start = end; |
717 | other_end = 0; | 694 | other_end = 0; |
718 | if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, | 695 | if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, |
@@ -721,8 +698,8 @@ again: | |||
721 | del_slot = path->slots[0] + 1; | 698 | del_slot = path->slots[0] + 1; |
722 | del_nr++; | 699 | del_nr++; |
723 | ret = btrfs_free_extent(trans, root, bytenr, num_bytes, | 700 | ret = btrfs_free_extent(trans, root, bytenr, num_bytes, |
724 | leaf->start, leaf_owner, | 701 | 0, root->root_key.objectid, |
725 | leaf_gen, inode->i_ino, 0); | 702 | inode->i_ino, orig_offset); |
726 | BUG_ON(ret); | 703 | BUG_ON(ret); |
727 | } | 704 | } |
728 | other_start = 0; | 705 | other_start = 0; |
@@ -733,8 +710,8 @@ again: | |||
733 | del_slot = path->slots[0]; | 710 | del_slot = path->slots[0]; |
734 | del_nr++; | 711 | del_nr++; |
735 | ret = btrfs_free_extent(trans, root, bytenr, num_bytes, | 712 | ret = btrfs_free_extent(trans, root, bytenr, num_bytes, |
736 | leaf->start, leaf_owner, | 713 | 0, root->root_key.objectid, |
737 | leaf_gen, inode->i_ino, 0); | 714 | inode->i_ino, orig_offset); |
738 | BUG_ON(ret); | 715 | BUG_ON(ret); |
739 | } | 716 | } |
740 | split_end = 0; | 717 | split_end = 0; |
@@ -768,13 +745,12 @@ again: | |||
768 | locked_end = extent_end; | 745 | locked_end = extent_end; |
769 | } | 746 | } |
770 | btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset); | 747 | btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset); |
771 | extent_offset += split - key.offset; | ||
772 | } else { | 748 | } else { |
773 | BUG_ON(key.offset != start); | 749 | BUG_ON(key.offset != start); |
774 | btrfs_set_file_extent_offset(leaf, fi, extent_offset + | ||
775 | split - key.offset); | ||
776 | btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); | ||
777 | key.offset = split; | 750 | key.offset = split; |
751 | btrfs_set_file_extent_offset(leaf, fi, key.offset - | ||
752 | orig_offset); | ||
753 | btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); | ||
778 | btrfs_set_item_key_safe(trans, root, path, &key); | 754 | btrfs_set_item_key_safe(trans, root, path, &key); |
779 | extent_end = split; | 755 | extent_end = split; |
780 | } | 756 | } |
@@ -793,7 +769,8 @@ again: | |||
793 | struct btrfs_file_extent_item); | 769 | struct btrfs_file_extent_item); |
794 | key.offset = split; | 770 | key.offset = split; |
795 | btrfs_set_item_key_safe(trans, root, path, &key); | 771 | btrfs_set_item_key_safe(trans, root, path, &key); |
796 | btrfs_set_file_extent_offset(leaf, fi, extent_offset); | 772 | btrfs_set_file_extent_offset(leaf, fi, key.offset - |
773 | orig_offset); | ||
797 | btrfs_set_file_extent_num_bytes(leaf, fi, | 774 | btrfs_set_file_extent_num_bytes(leaf, fi, |
798 | other_end - split); | 775 | other_end - split); |
799 | goto done; | 776 | goto done; |
@@ -815,10 +792,9 @@ again: | |||
815 | 792 | ||
816 | btrfs_mark_buffer_dirty(leaf); | 793 | btrfs_mark_buffer_dirty(leaf); |
817 | 794 | ||
818 | orig_parent = leaf->start; | 795 | ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, |
819 | ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, | 796 | root->root_key.objectid, |
820 | orig_parent, root->root_key.objectid, | 797 | inode->i_ino, orig_offset); |
821 | trans->transid, inode->i_ino); | ||
822 | BUG_ON(ret); | 798 | BUG_ON(ret); |
823 | btrfs_release_path(root, path); | 799 | btrfs_release_path(root, path); |
824 | 800 | ||
@@ -833,20 +809,12 @@ again: | |||
833 | btrfs_set_file_extent_type(leaf, fi, extent_type); | 809 | btrfs_set_file_extent_type(leaf, fi, extent_type); |
834 | btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr); | 810 | btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr); |
835 | btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes); | 811 | btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes); |
836 | btrfs_set_file_extent_offset(leaf, fi, extent_offset); | 812 | btrfs_set_file_extent_offset(leaf, fi, key.offset - orig_offset); |
837 | btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); | 813 | btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); |
838 | btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); | 814 | btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); |
839 | btrfs_set_file_extent_compression(leaf, fi, 0); | 815 | btrfs_set_file_extent_compression(leaf, fi, 0); |
840 | btrfs_set_file_extent_encryption(leaf, fi, 0); | 816 | btrfs_set_file_extent_encryption(leaf, fi, 0); |
841 | btrfs_set_file_extent_other_encoding(leaf, fi, 0); | 817 | btrfs_set_file_extent_other_encoding(leaf, fi, 0); |
842 | |||
843 | if (orig_parent != leaf->start) { | ||
844 | ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes, | ||
845 | orig_parent, leaf->start, | ||
846 | root->root_key.objectid, | ||
847 | trans->transid, inode->i_ino); | ||
848 | BUG_ON(ret); | ||
849 | } | ||
850 | done: | 818 | done: |
851 | btrfs_mark_buffer_dirty(leaf); | 819 | btrfs_mark_buffer_dirty(leaf); |
852 | 820 | ||
@@ -1189,6 +1157,8 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) | |||
1189 | btrfs_wait_ordered_range(inode, 0, (u64)-1); | 1157 | btrfs_wait_ordered_range(inode, 0, (u64)-1); |
1190 | root->log_batch++; | 1158 | root->log_batch++; |
1191 | 1159 | ||
1160 | if (datasync && !(inode->i_state & I_DIRTY_PAGES)) | ||
1161 | goto out; | ||
1192 | /* | 1162 | /* |
1193 | * ok we haven't committed the transaction yet, lets do a commit | 1163 | * ok we haven't committed the transaction yet, lets do a commit |
1194 | */ | 1164 | */ |
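
Aside (not part of the patch): the file.c hunks above replace the old parent/generation reference arguments with an orig_offset value, computed once as key.offset - btrfs_file_extent_offset(leaf, fi); each split item then re-derives its data offset relative to that original start. A tiny arithmetic sketch, with purely illustrative numbers, of how a split item's extent offset falls out of orig_offset:

#include <stdio.h>

int main(void)
{
        unsigned long long key_offset = 8192;      /* file offset of the item */
        unsigned long long extent_offset = 4096;   /* offset into the on-disk extent */
        unsigned long long orig_offset = key_offset - extent_offset;
        unsigned long long split = 12288;          /* file offset of the new item */

        /* the split item shares orig_offset, so its extent offset follows: */
        printf("new extent offset = %llu\n", split - orig_offset);   /* 8192 */
        return 0;
}
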
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 0bc93657b460..4538e48581a5 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c | |||
@@ -579,6 +579,7 @@ out: | |||
579 | * it returns -enospc | 579 | * it returns -enospc |
580 | */ | 580 | */ |
581 | int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, | 581 | int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, |
582 | struct btrfs_root *root, | ||
582 | struct btrfs_block_group_cache *block_group, | 583 | struct btrfs_block_group_cache *block_group, |
583 | struct btrfs_free_cluster *cluster, | 584 | struct btrfs_free_cluster *cluster, |
584 | u64 offset, u64 bytes, u64 empty_size) | 585 | u64 offset, u64 bytes, u64 empty_size) |
@@ -595,7 +596,9 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, | |||
595 | int ret; | 596 | int ret; |
596 | 597 | ||
597 | /* for metadata, allow allocates with more holes */ | 598 | /* for metadata, allow allocates with more holes */ |
598 | if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { | 599 | if (btrfs_test_opt(root, SSD_SPREAD)) { |
600 | min_bytes = bytes + empty_size; | ||
601 | } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { | ||
599 | /* | 602 | /* |
600 | * we want to do larger allocations when we are | 603 | * we want to do larger allocations when we are |
601 | * flushing out the delayed refs, it helps prevent | 604 | * flushing out the delayed refs, it helps prevent |
@@ -645,14 +648,15 @@ again: | |||
645 | * we haven't filled the empty size and the window is | 648 | * we haven't filled the empty size and the window is |
646 | * very large. reset and try again | 649 | * very large. reset and try again |
647 | */ | 650 | */ |
648 | if (next->offset - window_start > (bytes + empty_size) * 2) { | 651 | if (next->offset - (last->offset + last->bytes) > 128 * 1024 || |
652 | next->offset - window_start > (bytes + empty_size) * 2) { | ||
649 | entry = next; | 653 | entry = next; |
650 | window_start = entry->offset; | 654 | window_start = entry->offset; |
651 | window_free = entry->bytes; | 655 | window_free = entry->bytes; |
652 | last = entry; | 656 | last = entry; |
653 | max_extent = 0; | 657 | max_extent = 0; |
654 | total_retries++; | 658 | total_retries++; |
655 | if (total_retries % 256 == 0) { | 659 | if (total_retries % 64 == 0) { |
656 | if (min_bytes >= (bytes + empty_size)) { | 660 | if (min_bytes >= (bytes + empty_size)) { |
657 | ret = -ENOSPC; | 661 | ret = -ENOSPC; |
658 | goto out; | 662 | goto out; |
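
Aside (not part of the patch): the free-space-cache.c hunk above tightens the cluster search by also restarting the window when the gap between the previous entry's end and the next entry exceeds 128K. A user-space sketch of that sliding-window idea over sorted free extents is below; the entry values and thresholds are illustrative, not taken from a real filesystem.

#include <stdio.h>

struct free_entry {
        unsigned long long offset;
        unsigned long long bytes;
};

int main(void)
{
        struct free_entry e[] = {
                { 0,              64 * 1024 },
                { 64 * 1024,      64 * 1024 },
                { 1024 * 1024,    64 * 1024 },   /* big gap: window restarts here */
                { 1088 * 1024,   128 * 1024 },
        };
        int nr = sizeof(e) / sizeof(e[0]);
        unsigned long long needed = 192 * 1024;
        unsigned long long max_gap = 128 * 1024;
        unsigned long long window_start = e[0].offset;
        unsigned long long window_free = e[0].bytes;
        unsigned long long last_end = e[0].offset + e[0].bytes;

        for (int i = 1; i < nr; i++) {
                if (e[i].offset - last_end > max_gap ||
                    e[i].offset - window_start > needed * 2) {
                        window_start = e[i].offset;   /* reset the window */
                        window_free = e[i].bytes;
                } else {
                        window_free += e[i].bytes;
                }
                last_end = e[i].offset + e[i].bytes;

                if (window_free >= needed) {
                        printf("cluster at %llu with %llu bytes\n",
                               window_start, window_free);
                        return 0;
                }
        }
        printf("no cluster found\n");
        return 1;
}
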
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index ab0bdc0a63ce..266fb8764054 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h | |||
@@ -31,6 +31,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, | |||
31 | u64 bytes); | 31 | u64 bytes); |
32 | u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); | 32 | u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); |
33 | int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, | 33 | int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, |
34 | struct btrfs_root *root, | ||
34 | struct btrfs_block_group_cache *block_group, | 35 | struct btrfs_block_group_cache *block_group, |
35 | struct btrfs_free_cluster *cluster, | 36 | struct btrfs_free_cluster *cluster, |
36 | u64 offset, u64 bytes, u64 empty_size); | 37 | u64 offset, u64 bytes, u64 empty_size); |
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h index 2a020b276768..db2ff9773b99 100644 --- a/fs/btrfs/hash.h +++ b/fs/btrfs/hash.h | |||
@@ -19,9 +19,9 @@ | |||
19 | #ifndef __HASH__ | 19 | #ifndef __HASH__ |
20 | #define __HASH__ | 20 | #define __HASH__ |
21 | 21 | ||
22 | #include "crc32c.h" | 22 | #include <linux/crc32c.h> |
23 | static inline u64 btrfs_name_hash(const char *name, int len) | 23 | static inline u64 btrfs_name_hash(const char *name, int len) |
24 | { | 24 | { |
25 | return btrfs_crc32c((u32)~1, name, len); | 25 | return crc32c((u32)~1, name, len); |
26 | } | 26 | } |
27 | #endif | 27 | #endif |
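
Aside (not part of the patch): the hash.h hunk above switches btrfs_name_hash() from the private btrfs_crc32c() helper to the kernel's linux/crc32c.h crc32c(), seeded with (u32)~1. Below is a user-space sketch of a bitwise CRC32C update over the reflected Castagnoli polynomial, seeded the same way; the in-kernel helper is typically hardware accelerated and its exact pre/post-conditioning is not reproduced here, so treat this purely as an illustration of the hashing idea.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* bitwise CRC32C update, reflected polynomial 0x82F63B78 */
static uint32_t crc32c_sw(uint32_t crc, const void *data, size_t len)
{
        const uint8_t *p = data;

        while (len--) {
                crc ^= *p++;
                for (int i = 0; i < 8; i++)
                        crc = (crc >> 1) ^ (0x82F63B78U & -(crc & 1));
        }
        return crc;
}

int main(void)
{
        const char *name = "example";
        /* btrfs_name_hash() seeds the CRC with (u32)~1 */
        uint64_t hash = crc32c_sw(~1U, name, strlen(name));

        printf("name hash: 0x%llx\n", (unsigned long long)hash);
        return 0;
}
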
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1c8b0190d031..8612b3a09811 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
@@ -48,7 +48,6 @@ | |||
48 | #include "ordered-data.h" | 48 | #include "ordered-data.h" |
49 | #include "xattr.h" | 49 | #include "xattr.h" |
50 | #include "tree-log.h" | 50 | #include "tree-log.h" |
51 | #include "ref-cache.h" | ||
52 | #include "compression.h" | 51 | #include "compression.h" |
53 | #include "locking.h" | 52 | #include "locking.h" |
54 | 53 | ||
@@ -369,7 +368,7 @@ again: | |||
369 | * inode has not been flagged as nocompress. This flag can | 368 | * inode has not been flagged as nocompress. This flag can |
370 | * change at any time if we discover bad compression ratios. | 369 | * change at any time if we discover bad compression ratios. |
371 | */ | 370 | */ |
372 | if (!btrfs_test_flag(inode, NOCOMPRESS) && | 371 | if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && |
373 | btrfs_test_opt(root, COMPRESS)) { | 372 | btrfs_test_opt(root, COMPRESS)) { |
374 | WARN_ON(pages); | 373 | WARN_ON(pages); |
375 | pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); | 374 | pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); |
@@ -470,7 +469,7 @@ again: | |||
470 | nr_pages_ret = 0; | 469 | nr_pages_ret = 0; |
471 | 470 | ||
472 | /* flag the file so we don't compress in the future */ | 471 | /* flag the file so we don't compress in the future */ |
473 | btrfs_set_flag(inode, NOCOMPRESS); | 472 | BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; |
474 | } | 473 | } |
475 | if (will_compress) { | 474 | if (will_compress) { |
476 | *num_added += 1; | 475 | *num_added += 1; |
@@ -863,7 +862,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, | |||
863 | async_cow->locked_page = locked_page; | 862 | async_cow->locked_page = locked_page; |
864 | async_cow->start = start; | 863 | async_cow->start = start; |
865 | 864 | ||
866 | if (btrfs_test_flag(inode, NOCOMPRESS)) | 865 | if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) |
867 | cur_end = end; | 866 | cur_end = end; |
868 | else | 867 | else |
869 | cur_end = min(end, start + 512 * 1024 - 1); | 868 | cur_end = min(end, start + 512 * 1024 - 1); |
@@ -944,6 +943,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, | |||
944 | u64 cow_start; | 943 | u64 cow_start; |
945 | u64 cur_offset; | 944 | u64 cur_offset; |
946 | u64 extent_end; | 945 | u64 extent_end; |
946 | u64 extent_offset; | ||
947 | u64 disk_bytenr; | 947 | u64 disk_bytenr; |
948 | u64 num_bytes; | 948 | u64 num_bytes; |
949 | int extent_type; | 949 | int extent_type; |
@@ -1005,6 +1005,7 @@ next_slot: | |||
1005 | if (extent_type == BTRFS_FILE_EXTENT_REG || | 1005 | if (extent_type == BTRFS_FILE_EXTENT_REG || |
1006 | extent_type == BTRFS_FILE_EXTENT_PREALLOC) { | 1006 | extent_type == BTRFS_FILE_EXTENT_PREALLOC) { |
1007 | disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); | 1007 | disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); |
1008 | extent_offset = btrfs_file_extent_offset(leaf, fi); | ||
1008 | extent_end = found_key.offset + | 1009 | extent_end = found_key.offset + |
1009 | btrfs_file_extent_num_bytes(leaf, fi); | 1010 | btrfs_file_extent_num_bytes(leaf, fi); |
1010 | if (extent_end <= start) { | 1011 | if (extent_end <= start) { |
@@ -1022,9 +1023,10 @@ next_slot: | |||
1022 | if (btrfs_extent_readonly(root, disk_bytenr)) | 1023 | if (btrfs_extent_readonly(root, disk_bytenr)) |
1023 | goto out_check; | 1024 | goto out_check; |
1024 | if (btrfs_cross_ref_exist(trans, root, inode->i_ino, | 1025 | if (btrfs_cross_ref_exist(trans, root, inode->i_ino, |
1025 | disk_bytenr)) | 1026 | found_key.offset - |
1027 | extent_offset, disk_bytenr)) | ||
1026 | goto out_check; | 1028 | goto out_check; |
1027 | disk_bytenr += btrfs_file_extent_offset(leaf, fi); | 1029 | disk_bytenr += extent_offset; |
1028 | disk_bytenr += cur_offset - found_key.offset; | 1030 | disk_bytenr += cur_offset - found_key.offset; |
1029 | num_bytes = min(end + 1, extent_end) - cur_offset; | 1031 | num_bytes = min(end + 1, extent_end) - cur_offset; |
1030 | /* | 1032 | /* |
@@ -1131,10 +1133,10 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, | |||
1131 | int ret; | 1133 | int ret; |
1132 | struct btrfs_root *root = BTRFS_I(inode)->root; | 1134 | struct btrfs_root *root = BTRFS_I(inode)->root; |
1133 | 1135 | ||
1134 | if (btrfs_test_flag(inode, NODATACOW)) | 1136 | if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) |
1135 | ret = run_delalloc_nocow(inode, locked_page, start, end, | 1137 | ret = run_delalloc_nocow(inode, locked_page, start, end, |
1136 | page_started, 1, nr_written); | 1138 | page_started, 1, nr_written); |
1137 | else if (btrfs_test_flag(inode, PREALLOC)) | 1139 | else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) |
1138 | ret = run_delalloc_nocow(inode, locked_page, start, end, | 1140 | ret = run_delalloc_nocow(inode, locked_page, start, end, |
1139 | page_started, 0, nr_written); | 1141 | page_started, 0, nr_written); |
1140 | else if (!btrfs_test_opt(root, COMPRESS)) | 1142 | else if (!btrfs_test_opt(root, COMPRESS)) |
@@ -1288,7 +1290,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, | |||
1288 | int ret = 0; | 1290 | int ret = 0; |
1289 | int skip_sum; | 1291 | int skip_sum; |
1290 | 1292 | ||
1291 | skip_sum = btrfs_test_flag(inode, NODATASUM); | 1293 | skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; |
1292 | 1294 | ||
1293 | ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); | 1295 | ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); |
1294 | BUG_ON(ret); | 1296 | BUG_ON(ret); |
@@ -1489,9 +1491,9 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, | |||
1489 | ins.objectid = disk_bytenr; | 1491 | ins.objectid = disk_bytenr; |
1490 | ins.offset = disk_num_bytes; | 1492 | ins.offset = disk_num_bytes; |
1491 | ins.type = BTRFS_EXTENT_ITEM_KEY; | 1493 | ins.type = BTRFS_EXTENT_ITEM_KEY; |
1492 | ret = btrfs_alloc_reserved_extent(trans, root, leaf->start, | 1494 | ret = btrfs_alloc_reserved_file_extent(trans, root, |
1493 | root->root_key.objectid, | 1495 | root->root_key.objectid, |
1494 | trans->transid, inode->i_ino, &ins); | 1496 | inode->i_ino, file_pos, &ins); |
1495 | BUG_ON(ret); | 1497 | BUG_ON(ret); |
1496 | btrfs_free_path(path); | 1498 | btrfs_free_path(path); |
1497 | 1499 | ||
@@ -1788,7 +1790,8 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, | |||
1788 | ClearPageChecked(page); | 1790 | ClearPageChecked(page); |
1789 | goto good; | 1791 | goto good; |
1790 | } | 1792 | } |
1791 | if (btrfs_test_flag(inode, NODATASUM)) | 1793 | |
1794 | if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) | ||
1792 | return 0; | 1795 | return 0; |
1793 | 1796 | ||
1794 | if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && | 1797 | if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && |
@@ -1956,23 +1959,13 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) | |||
1956 | * crossing root thing. we store the inode number in the | 1959 | * crossing root thing. we store the inode number in the |
1957 | * offset of the orphan item. | 1960 | * offset of the orphan item. |
1958 | */ | 1961 | */ |
1959 | inode = btrfs_iget_locked(root->fs_info->sb, | 1962 | found_key.objectid = found_key.offset; |
1960 | found_key.offset, root); | 1963 | found_key.type = BTRFS_INODE_ITEM_KEY; |
1961 | if (!inode) | 1964 | found_key.offset = 0; |
1965 | inode = btrfs_iget(root->fs_info->sb, &found_key, root); | ||
1966 | if (IS_ERR(inode)) | ||
1962 | break; | 1967 | break; |
1963 | 1968 | ||
1964 | if (inode->i_state & I_NEW) { | ||
1965 | BTRFS_I(inode)->root = root; | ||
1966 | |||
1967 | /* have to set the location manually */ | ||
1968 | BTRFS_I(inode)->location.objectid = inode->i_ino; | ||
1969 | BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; | ||
1970 | BTRFS_I(inode)->location.offset = 0; | ||
1971 | |||
1972 | btrfs_read_locked_inode(inode); | ||
1973 | unlock_new_inode(inode); | ||
1974 | } | ||
1975 | |||
1976 | /* | 1969 | /* |
1977 | * add this inode to the orphan list so btrfs_orphan_del does | 1970 | * add this inode to the orphan list so btrfs_orphan_del does |
1978 | * the proper thing when we hit it | 1971 | * the proper thing when we hit it |
@@ -2069,7 +2062,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, | |||
2069 | /* | 2062 | /* |
2070 | * read an inode from the btree into the in-memory inode | 2063 | * read an inode from the btree into the in-memory inode |
2071 | */ | 2064 | */ |
2072 | void btrfs_read_locked_inode(struct inode *inode) | 2065 | static void btrfs_read_locked_inode(struct inode *inode) |
2073 | { | 2066 | { |
2074 | struct btrfs_path *path; | 2067 | struct btrfs_path *path; |
2075 | struct extent_buffer *leaf; | 2068 | struct extent_buffer *leaf; |
@@ -2164,6 +2157,8 @@ void btrfs_read_locked_inode(struct inode *inode) | |||
2164 | init_special_inode(inode, inode->i_mode, rdev); | 2157 | init_special_inode(inode, inode->i_mode, rdev); |
2165 | break; | 2158 | break; |
2166 | } | 2159 | } |
2160 | |||
2161 | btrfs_update_iflags(inode); | ||
2167 | return; | 2162 | return; |
2168 | 2163 | ||
2169 | make_bad: | 2164 | make_bad: |
@@ -2327,7 +2322,6 @@ err: | |||
2327 | btrfs_update_inode(trans, root, dir); | 2322 | btrfs_update_inode(trans, root, dir); |
2328 | btrfs_drop_nlink(inode); | 2323 | btrfs_drop_nlink(inode); |
2329 | ret = btrfs_update_inode(trans, root, inode); | 2324 | ret = btrfs_update_inode(trans, root, inode); |
2330 | dir->i_sb->s_dirt = 1; | ||
2331 | out: | 2325 | out: |
2332 | return ret; | 2326 | return ret; |
2333 | } | 2327 | } |
@@ -2599,9 +2593,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, | |||
2599 | struct btrfs_file_extent_item *fi; | 2593 | struct btrfs_file_extent_item *fi; |
2600 | u64 extent_start = 0; | 2594 | u64 extent_start = 0; |
2601 | u64 extent_num_bytes = 0; | 2595 | u64 extent_num_bytes = 0; |
2596 | u64 extent_offset = 0; | ||
2602 | u64 item_end = 0; | 2597 | u64 item_end = 0; |
2603 | u64 root_gen = 0; | ||
2604 | u64 root_owner = 0; | ||
2605 | int found_extent; | 2598 | int found_extent; |
2606 | int del_item; | 2599 | int del_item; |
2607 | int pending_del_nr = 0; | 2600 | int pending_del_nr = 0; |
@@ -2716,6 +2709,9 @@ search_again: | |||
2716 | extent_num_bytes = | 2709 | extent_num_bytes = |
2717 | btrfs_file_extent_disk_num_bytes(leaf, | 2710 | btrfs_file_extent_disk_num_bytes(leaf, |
2718 | fi); | 2711 | fi); |
2712 | extent_offset = found_key.offset - | ||
2713 | btrfs_file_extent_offset(leaf, fi); | ||
2714 | |||
2719 | /* FIXME blocksize != 4096 */ | 2715 | /* FIXME blocksize != 4096 */ |
2720 | num_dec = btrfs_file_extent_num_bytes(leaf, fi); | 2716 | num_dec = btrfs_file_extent_num_bytes(leaf, fi); |
2721 | if (extent_start != 0) { | 2717 | if (extent_start != 0) { |
@@ -2723,8 +2719,6 @@ search_again: | |||
2723 | if (root->ref_cows) | 2719 | if (root->ref_cows) |
2724 | inode_sub_bytes(inode, num_dec); | 2720 | inode_sub_bytes(inode, num_dec); |
2725 | } | 2721 | } |
2726 | root_gen = btrfs_header_generation(leaf); | ||
2727 | root_owner = btrfs_header_owner(leaf); | ||
2728 | } | 2722 | } |
2729 | } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { | 2723 | } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { |
2730 | /* | 2724 | /* |
@@ -2768,12 +2762,12 @@ delete: | |||
2768 | } else { | 2762 | } else { |
2769 | break; | 2763 | break; |
2770 | } | 2764 | } |
2771 | if (found_extent) { | 2765 | if (found_extent && root->ref_cows) { |
2772 | btrfs_set_path_blocking(path); | 2766 | btrfs_set_path_blocking(path); |
2773 | ret = btrfs_free_extent(trans, root, extent_start, | 2767 | ret = btrfs_free_extent(trans, root, extent_start, |
2774 | extent_num_bytes, | 2768 | extent_num_bytes, 0, |
2775 | leaf->start, root_owner, | 2769 | btrfs_header_owner(leaf), |
2776 | root_gen, inode->i_ino, 0); | 2770 | inode->i_ino, extent_offset); |
2777 | BUG_ON(ret); | 2771 | BUG_ON(ret); |
2778 | } | 2772 | } |
2779 | next: | 2773 | next: |
@@ -2811,7 +2805,6 @@ error: | |||
2811 | pending_del_nr); | 2805 | pending_del_nr); |
2812 | } | 2806 | } |
2813 | btrfs_free_path(path); | 2807 | btrfs_free_path(path); |
2814 | inode->i_sb->s_dirt = 1; | ||
2815 | return ret; | 2808 | return ret; |
2816 | } | 2809 | } |
2817 | 2810 | ||
@@ -3105,6 +3098,45 @@ static int fixup_tree_root_location(struct btrfs_root *root, | |||
3105 | return 0; | 3098 | return 0; |
3106 | } | 3099 | } |
3107 | 3100 | ||
3101 | static void inode_tree_add(struct inode *inode) | ||
3102 | { | ||
3103 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
3104 | struct btrfs_inode *entry; | ||
3105 | struct rb_node **p = &root->inode_tree.rb_node; | ||
3106 | struct rb_node *parent = NULL; | ||
3107 | |||
3108 | spin_lock(&root->inode_lock); | ||
3109 | while (*p) { | ||
3110 | parent = *p; | ||
3111 | entry = rb_entry(parent, struct btrfs_inode, rb_node); | ||
3112 | |||
3113 | if (inode->i_ino < entry->vfs_inode.i_ino) | ||
3114 | p = &(*p)->rb_left; | ||
3115 | else if (inode->i_ino > entry->vfs_inode.i_ino) | ||
3116 | p = &(*p)->rb_right; | ||
3117 | else { | ||
3118 | WARN_ON(!(entry->vfs_inode.i_state & | ||
3119 | (I_WILL_FREE | I_FREEING | I_CLEAR))); | ||
3120 | break; | ||
3121 | } | ||
3122 | } | ||
3123 | rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); | ||
3124 | rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree); | ||
3125 | spin_unlock(&root->inode_lock); | ||
3126 | } | ||
3127 | |||
3128 | static void inode_tree_del(struct inode *inode) | ||
3129 | { | ||
3130 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
3131 | |||
3132 | if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { | ||
3133 | spin_lock(&root->inode_lock); | ||
3134 | rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); | ||
3135 | spin_unlock(&root->inode_lock); | ||
3136 | RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); | ||
3137 | } | ||
3138 | } | ||
3139 | |||
3108 | static noinline void init_btrfs_i(struct inode *inode) | 3140 | static noinline void init_btrfs_i(struct inode *inode) |
3109 | { | 3141 | { |
3110 | struct btrfs_inode *bi = BTRFS_I(inode); | 3142 | struct btrfs_inode *bi = BTRFS_I(inode); |
@@ -3130,6 +3162,7 @@ static noinline void init_btrfs_i(struct inode *inode) | |||
3130 | inode->i_mapping, GFP_NOFS); | 3162 | inode->i_mapping, GFP_NOFS); |
3131 | INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); | 3163 | INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); |
3132 | INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); | 3164 | INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); |
3165 | RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); | ||
3133 | btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); | 3166 | btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); |
3134 | mutex_init(&BTRFS_I(inode)->extent_mutex); | 3167 | mutex_init(&BTRFS_I(inode)->extent_mutex); |
3135 | mutex_init(&BTRFS_I(inode)->log_mutex); | 3168 | mutex_init(&BTRFS_I(inode)->log_mutex); |
@@ -3152,26 +3185,9 @@ static int btrfs_find_actor(struct inode *inode, void *opaque) | |||
3152 | args->root == BTRFS_I(inode)->root; | 3185 | args->root == BTRFS_I(inode)->root; |
3153 | } | 3186 | } |
3154 | 3187 | ||
3155 | struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, | 3188 | static struct inode *btrfs_iget_locked(struct super_block *s, |
3156 | struct btrfs_root *root, int wait) | 3189 | u64 objectid, |
3157 | { | 3190 | struct btrfs_root *root) |
3158 | struct inode *inode; | ||
3159 | struct btrfs_iget_args args; | ||
3160 | args.ino = objectid; | ||
3161 | args.root = root; | ||
3162 | |||
3163 | if (wait) { | ||
3164 | inode = ilookup5(s, objectid, btrfs_find_actor, | ||
3165 | (void *)&args); | ||
3166 | } else { | ||
3167 | inode = ilookup5_nowait(s, objectid, btrfs_find_actor, | ||
3168 | (void *)&args); | ||
3169 | } | ||
3170 | return inode; | ||
3171 | } | ||
3172 | |||
3173 | struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, | ||
3174 | struct btrfs_root *root) | ||
3175 | { | 3191 | { |
3176 | struct inode *inode; | 3192 | struct inode *inode; |
3177 | struct btrfs_iget_args args; | 3193 | struct btrfs_iget_args args; |
@@ -3188,24 +3204,21 @@ struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, | |||
3188 | * Returns in *is_new if the inode was read from disk | 3204 | * Returns in *is_new if the inode was read from disk |
3189 | */ | 3205 | */ |
3190 | struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, | 3206 | struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, |
3191 | struct btrfs_root *root, int *is_new) | 3207 | struct btrfs_root *root) |
3192 | { | 3208 | { |
3193 | struct inode *inode; | 3209 | struct inode *inode; |
3194 | 3210 | ||
3195 | inode = btrfs_iget_locked(s, location->objectid, root); | 3211 | inode = btrfs_iget_locked(s, location->objectid, root); |
3196 | if (!inode) | 3212 | if (!inode) |
3197 | return ERR_PTR(-EACCES); | 3213 | return ERR_PTR(-ENOMEM); |
3198 | 3214 | ||
3199 | if (inode->i_state & I_NEW) { | 3215 | if (inode->i_state & I_NEW) { |
3200 | BTRFS_I(inode)->root = root; | 3216 | BTRFS_I(inode)->root = root; |
3201 | memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); | 3217 | memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); |
3202 | btrfs_read_locked_inode(inode); | 3218 | btrfs_read_locked_inode(inode); |
3219 | |||
3220 | inode_tree_add(inode); | ||
3203 | unlock_new_inode(inode); | 3221 | unlock_new_inode(inode); |
3204 | if (is_new) | ||
3205 | *is_new = 1; | ||
3206 | } else { | ||
3207 | if (is_new) | ||
3208 | *is_new = 0; | ||
3209 | } | 3222 | } |
3210 | 3223 | ||
3211 | return inode; | 3224 | return inode; |
@@ -3218,7 +3231,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) | |||
3218 | struct btrfs_root *root = bi->root; | 3231 | struct btrfs_root *root = bi->root; |
3219 | struct btrfs_root *sub_root = root; | 3232 | struct btrfs_root *sub_root = root; |
3220 | struct btrfs_key location; | 3233 | struct btrfs_key location; |
3221 | int ret, new; | 3234 | int ret; |
3222 | 3235 | ||
3223 | if (dentry->d_name.len > BTRFS_NAME_LEN) | 3236 | if (dentry->d_name.len > BTRFS_NAME_LEN) |
3224 | return ERR_PTR(-ENAMETOOLONG); | 3237 | return ERR_PTR(-ENAMETOOLONG); |
@@ -3236,7 +3249,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) | |||
3236 | return ERR_PTR(ret); | 3249 | return ERR_PTR(ret); |
3237 | if (ret > 0) | 3250 | if (ret > 0) |
3238 | return ERR_PTR(-ENOENT); | 3251 | return ERR_PTR(-ENOENT); |
3239 | inode = btrfs_iget(dir->i_sb, &location, sub_root, &new); | 3252 | inode = btrfs_iget(dir->i_sb, &location, sub_root); |
3240 | if (IS_ERR(inode)) | 3253 | if (IS_ERR(inode)) |
3241 | return ERR_CAST(inode); | 3254 | return ERR_CAST(inode); |
3242 | } | 3255 | } |
@@ -3574,9 +3587,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, | |||
3574 | btrfs_find_block_group(root, 0, alloc_hint, owner); | 3587 | btrfs_find_block_group(root, 0, alloc_hint, owner); |
3575 | if ((mode & S_IFREG)) { | 3588 | if ((mode & S_IFREG)) { |
3576 | if (btrfs_test_opt(root, NODATASUM)) | 3589 | if (btrfs_test_opt(root, NODATASUM)) |
3577 | btrfs_set_flag(inode, NODATASUM); | 3590 | BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; |
3578 | if (btrfs_test_opt(root, NODATACOW)) | 3591 | if (btrfs_test_opt(root, NODATACOW)) |
3579 | btrfs_set_flag(inode, NODATACOW); | 3592 | BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; |
3580 | } | 3593 | } |
3581 | 3594 | ||
3582 | key[0].objectid = objectid; | 3595 | key[0].objectid = objectid; |
@@ -3630,7 +3643,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, | |||
3630 | location->offset = 0; | 3643 | location->offset = 0; |
3631 | btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); | 3644 | btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); |
3632 | 3645 | ||
3646 | btrfs_inherit_iflags(inode, dir); | ||
3647 | |||
3633 | insert_inode_hash(inode); | 3648 | insert_inode_hash(inode); |
3649 | inode_tree_add(inode); | ||
3634 | return inode; | 3650 | return inode; |
3635 | fail: | 3651 | fail: |
3636 | if (dir) | 3652 | if (dir) |
@@ -3750,7 +3766,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, | |||
3750 | init_special_inode(inode, inode->i_mode, rdev); | 3766 | init_special_inode(inode, inode->i_mode, rdev); |
3751 | btrfs_update_inode(trans, root, inode); | 3767 | btrfs_update_inode(trans, root, inode); |
3752 | } | 3768 | } |
3753 | dir->i_sb->s_dirt = 1; | ||
3754 | btrfs_update_inode_block_group(trans, inode); | 3769 | btrfs_update_inode_block_group(trans, inode); |
3755 | btrfs_update_inode_block_group(trans, dir); | 3770 | btrfs_update_inode_block_group(trans, dir); |
3756 | out_unlock: | 3771 | out_unlock: |
@@ -3815,7 +3830,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, | |||
3815 | inode->i_op = &btrfs_file_inode_operations; | 3830 | inode->i_op = &btrfs_file_inode_operations; |
3816 | BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; | 3831 | BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; |
3817 | } | 3832 | } |
3818 | dir->i_sb->s_dirt = 1; | ||
3819 | btrfs_update_inode_block_group(trans, inode); | 3833 | btrfs_update_inode_block_group(trans, inode); |
3820 | btrfs_update_inode_block_group(trans, dir); | 3834 | btrfs_update_inode_block_group(trans, dir); |
3821 | out_unlock: | 3835 | out_unlock: |
@@ -3862,7 +3876,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, | |||
3862 | if (err) | 3876 | if (err) |
3863 | drop_inode = 1; | 3877 | drop_inode = 1; |
3864 | 3878 | ||
3865 | dir->i_sb->s_dirt = 1; | ||
3866 | btrfs_update_inode_block_group(trans, dir); | 3879 | btrfs_update_inode_block_group(trans, dir); |
3867 | err = btrfs_update_inode(trans, root, inode); | 3880 | err = btrfs_update_inode(trans, root, inode); |
3868 | 3881 | ||
@@ -3944,7 +3957,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) | |||
3944 | 3957 | ||
3945 | d_instantiate(dentry, inode); | 3958 | d_instantiate(dentry, inode); |
3946 | drop_on_err = 0; | 3959 | drop_on_err = 0; |
3947 | dir->i_sb->s_dirt = 1; | ||
3948 | btrfs_update_inode_block_group(trans, inode); | 3960 | btrfs_update_inode_block_group(trans, inode); |
3949 | btrfs_update_inode_block_group(trans, dir); | 3961 | btrfs_update_inode_block_group(trans, dir); |
3950 | 3962 | ||
@@ -4683,6 +4695,7 @@ void btrfs_destroy_inode(struct inode *inode) | |||
4683 | btrfs_put_ordered_extent(ordered); | 4695 | btrfs_put_ordered_extent(ordered); |
4684 | } | 4696 | } |
4685 | } | 4697 | } |
4698 | inode_tree_del(inode); | ||
4686 | btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); | 4699 | btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); |
4687 | kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); | 4700 | kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); |
4688 | } | 4701 | } |
@@ -4972,7 +4985,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, | |||
4972 | inode->i_op = &btrfs_file_inode_operations; | 4985 | inode->i_op = &btrfs_file_inode_operations; |
4973 | BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; | 4986 | BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; |
4974 | } | 4987 | } |
4975 | dir->i_sb->s_dirt = 1; | ||
4976 | btrfs_update_inode_block_group(trans, inode); | 4988 | btrfs_update_inode_block_group(trans, inode); |
4977 | btrfs_update_inode_block_group(trans, dir); | 4989 | btrfs_update_inode_block_group(trans, dir); |
4978 | if (drop_inode) | 4990 | if (drop_inode) |
@@ -5061,7 +5073,7 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans, | |||
5061 | out: | 5073 | out: |
5062 | if (cur_offset > start) { | 5074 | if (cur_offset > start) { |
5063 | inode->i_ctime = CURRENT_TIME; | 5075 | inode->i_ctime = CURRENT_TIME; |
5064 | btrfs_set_flag(inode, PREALLOC); | 5076 | BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; |
5065 | if (!(mode & FALLOC_FL_KEEP_SIZE) && | 5077 | if (!(mode & FALLOC_FL_KEEP_SIZE) && |
5066 | cur_offset > i_size_read(inode)) | 5078 | cur_offset > i_size_read(inode)) |
5067 | btrfs_i_size_write(inode, cur_offset); | 5079 | btrfs_i_size_write(inode, cur_offset); |
@@ -5182,7 +5194,7 @@ static int btrfs_set_page_dirty(struct page *page) | |||
5182 | 5194 | ||
5183 | static int btrfs_permission(struct inode *inode, int mask) | 5195 | static int btrfs_permission(struct inode *inode, int mask) |
5184 | { | 5196 | { |
5185 | if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE)) | 5197 | if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) |
5186 | return -EACCES; | 5198 | return -EACCES; |
5187 | return generic_permission(inode, mask, btrfs_check_acl); | 5199 | return generic_permission(inode, mask, btrfs_check_acl); |
5188 | } | 5200 | } |
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2624b53ea783..eff18f5b5362 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c | |||
@@ -50,7 +50,177 @@ | |||
50 | #include "volumes.h" | 50 | #include "volumes.h" |
51 | #include "locking.h" | 51 | #include "locking.h" |
52 | 52 | ||
53 | /* Mask out flags that are inappropriate for the given type of inode. */ | ||
54 | static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) | ||
55 | { | ||
56 | if (S_ISDIR(mode)) | ||
57 | return flags; | ||
58 | else if (S_ISREG(mode)) | ||
59 | return flags & ~FS_DIRSYNC_FL; | ||
60 | else | ||
61 | return flags & (FS_NODUMP_FL | FS_NOATIME_FL); | ||
62 | } | ||
63 | |||
64 | /* | ||
65 | * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl. | ||
66 | */ | ||
67 | static unsigned int btrfs_flags_to_ioctl(unsigned int flags) | ||
68 | { | ||
69 | unsigned int iflags = 0; | ||
70 | |||
71 | if (flags & BTRFS_INODE_SYNC) | ||
72 | iflags |= FS_SYNC_FL; | ||
73 | if (flags & BTRFS_INODE_IMMUTABLE) | ||
74 | iflags |= FS_IMMUTABLE_FL; | ||
75 | if (flags & BTRFS_INODE_APPEND) | ||
76 | iflags |= FS_APPEND_FL; | ||
77 | if (flags & BTRFS_INODE_NODUMP) | ||
78 | iflags |= FS_NODUMP_FL; | ||
79 | if (flags & BTRFS_INODE_NOATIME) | ||
80 | iflags |= FS_NOATIME_FL; | ||
81 | if (flags & BTRFS_INODE_DIRSYNC) | ||
82 | iflags |= FS_DIRSYNC_FL; | ||
83 | |||
84 | return iflags; | ||
85 | } | ||
86 | |||
87 | /* | ||
88 | * Update inode->i_flags based on the btrfs internal flags. | ||
89 | */ | ||
90 | void btrfs_update_iflags(struct inode *inode) | ||
91 | { | ||
92 | struct btrfs_inode *ip = BTRFS_I(inode); | ||
93 | |||
94 | inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); | ||
95 | |||
96 | if (ip->flags & BTRFS_INODE_SYNC) | ||
97 | inode->i_flags |= S_SYNC; | ||
98 | if (ip->flags & BTRFS_INODE_IMMUTABLE) | ||
99 | inode->i_flags |= S_IMMUTABLE; | ||
100 | if (ip->flags & BTRFS_INODE_APPEND) | ||
101 | inode->i_flags |= S_APPEND; | ||
102 | if (ip->flags & BTRFS_INODE_NOATIME) | ||
103 | inode->i_flags |= S_NOATIME; | ||
104 | if (ip->flags & BTRFS_INODE_DIRSYNC) | ||
105 | inode->i_flags |= S_DIRSYNC; | ||
106 | } | ||
107 | |||
108 | /* | ||
109 | * Inherit flags from the parent inode. | ||
110 | * | ||
111 | * Unlike extN we don't have any flags we don't want to inherit currently. | ||
112 | */ | ||
113 | void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) | ||
114 | { | ||
115 | unsigned int flags; | ||
116 | |||
117 | if (!dir) | ||
118 | return; | ||
119 | |||
120 | flags = BTRFS_I(dir)->flags; | ||
121 | |||
122 | if (S_ISREG(inode->i_mode)) | ||
123 | flags &= ~BTRFS_INODE_DIRSYNC; | ||
124 | else if (!S_ISDIR(inode->i_mode)) | ||
125 | flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); | ||
126 | |||
127 | BTRFS_I(inode)->flags = flags; | ||
128 | btrfs_update_iflags(inode); | ||
129 | } | ||
130 | |||
131 | static int btrfs_ioctl_getflags(struct file *file, void __user *arg) | ||
132 | { | ||
133 | struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode); | ||
134 | unsigned int flags = btrfs_flags_to_ioctl(ip->flags); | ||
135 | |||
136 | if (copy_to_user(arg, &flags, sizeof(flags))) | ||
137 | return -EFAULT; | ||
138 | return 0; | ||
139 | } | ||
140 | |||
141 | static int btrfs_ioctl_setflags(struct file *file, void __user *arg) | ||
142 | { | ||
143 | struct inode *inode = file->f_path.dentry->d_inode; | ||
144 | struct btrfs_inode *ip = BTRFS_I(inode); | ||
145 | struct btrfs_root *root = ip->root; | ||
146 | struct btrfs_trans_handle *trans; | ||
147 | unsigned int flags, oldflags; | ||
148 | int ret; | ||
149 | |||
150 | if (copy_from_user(&flags, arg, sizeof(flags))) | ||
151 | return -EFAULT; | ||
152 | |||
153 | if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ | ||
154 | FS_NOATIME_FL | FS_NODUMP_FL | \ | ||
155 | FS_SYNC_FL | FS_DIRSYNC_FL)) | ||
156 | return -EOPNOTSUPP; | ||
53 | 157 | ||
158 | if (!is_owner_or_cap(inode)) | ||
159 | return -EACCES; | ||
160 | |||
161 | mutex_lock(&inode->i_mutex); | ||
162 | |||
163 | flags = btrfs_mask_flags(inode->i_mode, flags); | ||
164 | oldflags = btrfs_flags_to_ioctl(ip->flags); | ||
165 | if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { | ||
166 | if (!capable(CAP_LINUX_IMMUTABLE)) { | ||
167 | ret = -EPERM; | ||
168 | goto out_unlock; | ||
169 | } | ||
170 | } | ||
171 | |||
172 | ret = mnt_want_write(file->f_path.mnt); | ||
173 | if (ret) | ||
174 | goto out_unlock; | ||
175 | |||
176 | if (flags & FS_SYNC_FL) | ||
177 | ip->flags |= BTRFS_INODE_SYNC; | ||
178 | else | ||
179 | ip->flags &= ~BTRFS_INODE_SYNC; | ||
180 | if (flags & FS_IMMUTABLE_FL) | ||
181 | ip->flags |= BTRFS_INODE_IMMUTABLE; | ||
182 | else | ||
183 | ip->flags &= ~BTRFS_INODE_IMMUTABLE; | ||
184 | if (flags & FS_APPEND_FL) | ||
185 | ip->flags |= BTRFS_INODE_APPEND; | ||
186 | else | ||
187 | ip->flags &= ~BTRFS_INODE_APPEND; | ||
188 | if (flags & FS_NODUMP_FL) | ||
189 | ip->flags |= BTRFS_INODE_NODUMP; | ||
190 | else | ||
191 | ip->flags &= ~BTRFS_INODE_NODUMP; | ||
192 | if (flags & FS_NOATIME_FL) | ||
193 | ip->flags |= BTRFS_INODE_NOATIME; | ||
194 | else | ||
195 | ip->flags &= ~BTRFS_INODE_NOATIME; | ||
196 | if (flags & FS_DIRSYNC_FL) | ||
197 | ip->flags |= BTRFS_INODE_DIRSYNC; | ||
198 | else | ||
199 | ip->flags &= ~BTRFS_INODE_DIRSYNC; | ||
200 | |||
201 | |||
202 | trans = btrfs_join_transaction(root, 1); | ||
203 | BUG_ON(!trans); | ||
204 | |||
205 | ret = btrfs_update_inode(trans, root, inode); | ||
206 | BUG_ON(ret); | ||
207 | |||
208 | btrfs_update_iflags(inode); | ||
209 | inode->i_ctime = CURRENT_TIME; | ||
210 | btrfs_end_transaction(trans, root); | ||
211 | |||
212 | mnt_drop_write(file->f_path.mnt); | ||
213 | out_unlock: | ||
214 | mutex_unlock(&inode->i_mutex); | ||
215 | 	return ret; | ||
216 | } | ||
217 | |||
218 | static int btrfs_ioctl_getversion(struct file *file, int __user *arg) | ||
219 | { | ||
220 | struct inode *inode = file->f_path.dentry->d_inode; | ||
221 | |||
222 | return put_user(inode->i_generation, arg); | ||
223 | } | ||
54 | 224 | ||
55 | static noinline int create_subvol(struct btrfs_root *root, | 225 | static noinline int create_subvol(struct btrfs_root *root, |
56 | struct dentry *dentry, | 226 | struct dentry *dentry, |
@@ -82,22 +252,25 @@ static noinline int create_subvol(struct btrfs_root *root, | |||
82 | if (ret) | 252 | if (ret) |
83 | goto fail; | 253 | goto fail; |
84 | 254 | ||
85 | leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, | 255 | leaf = btrfs_alloc_free_block(trans, root, root->leafsize, |
86 | objectid, trans->transid, 0, 0, 0); | 256 | 0, objectid, NULL, 0, 0, 0); |
87 | if (IS_ERR(leaf)) { | 257 | if (IS_ERR(leaf)) { |
88 | ret = PTR_ERR(leaf); | 258 | ret = PTR_ERR(leaf); |
89 | goto fail; | 259 | goto fail; |
90 | } | 260 | } |
91 | 261 | ||
92 | btrfs_set_header_nritems(leaf, 0); | 262 | memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); |
93 | btrfs_set_header_level(leaf, 0); | ||
94 | btrfs_set_header_bytenr(leaf, leaf->start); | 263 | btrfs_set_header_bytenr(leaf, leaf->start); |
95 | btrfs_set_header_generation(leaf, trans->transid); | 264 | btrfs_set_header_generation(leaf, trans->transid); |
265 | btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); | ||
96 | btrfs_set_header_owner(leaf, objectid); | 266 | btrfs_set_header_owner(leaf, objectid); |
97 | 267 | ||
98 | write_extent_buffer(leaf, root->fs_info->fsid, | 268 | write_extent_buffer(leaf, root->fs_info->fsid, |
99 | (unsigned long)btrfs_header_fsid(leaf), | 269 | (unsigned long)btrfs_header_fsid(leaf), |
100 | BTRFS_FSID_SIZE); | 270 | BTRFS_FSID_SIZE); |
271 | write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, | ||
272 | (unsigned long)btrfs_header_chunk_tree_uuid(leaf), | ||
273 | BTRFS_UUID_SIZE); | ||
101 | btrfs_mark_buffer_dirty(leaf); | 274 | btrfs_mark_buffer_dirty(leaf); |
102 | 275 | ||
103 | inode_item = &root_item.inode; | 276 | inode_item = &root_item.inode; |
@@ -125,7 +298,7 @@ static noinline int create_subvol(struct btrfs_root *root, | |||
125 | btrfs_set_root_dirid(&root_item, new_dirid); | 298 | btrfs_set_root_dirid(&root_item, new_dirid); |
126 | 299 | ||
127 | key.objectid = objectid; | 300 | key.objectid = objectid; |
128 | key.offset = 1; | 301 | key.offset = 0; |
129 | btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); | 302 | btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); |
130 | ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, | 303 | ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, |
131 | &root_item); | 304 | &root_item); |
@@ -911,10 +1084,10 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, | |||
911 | if (disko) { | 1084 | if (disko) { |
912 | inode_add_bytes(inode, datal); | 1085 | inode_add_bytes(inode, datal); |
913 | ret = btrfs_inc_extent_ref(trans, root, | 1086 | ret = btrfs_inc_extent_ref(trans, root, |
914 | disko, diskl, leaf->start, | 1087 | disko, diskl, 0, |
915 | root->root_key.objectid, | 1088 | root->root_key.objectid, |
916 | trans->transid, | 1089 | inode->i_ino, |
917 | inode->i_ino); | 1090 | new_key.offset - datao); |
918 | BUG_ON(ret); | 1091 | BUG_ON(ret); |
919 | } | 1092 | } |
920 | } else if (type == BTRFS_FILE_EXTENT_INLINE) { | 1093 | } else if (type == BTRFS_FILE_EXTENT_INLINE) { |
@@ -1074,6 +1247,12 @@ long btrfs_ioctl(struct file *file, unsigned int | |||
1074 | void __user *argp = (void __user *)arg; | 1247 | void __user *argp = (void __user *)arg; |
1075 | 1248 | ||
1076 | switch (cmd) { | 1249 | switch (cmd) { |
1250 | case FS_IOC_GETFLAGS: | ||
1251 | return btrfs_ioctl_getflags(file, argp); | ||
1252 | case FS_IOC_SETFLAGS: | ||
1253 | return btrfs_ioctl_setflags(file, argp); | ||
1254 | case FS_IOC_GETVERSION: | ||
1255 | return btrfs_ioctl_getversion(file, argp); | ||
1077 | case BTRFS_IOC_SNAP_CREATE: | 1256 | case BTRFS_IOC_SNAP_CREATE: |
1078 | return btrfs_ioctl_snap_create(file, argp, 0); | 1257 | return btrfs_ioctl_snap_create(file, argp, 0); |
1079 | case BTRFS_IOC_SUBVOL_CREATE: | 1258 | case BTRFS_IOC_SUBVOL_CREATE: |
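
The new FS_IOC_GETFLAGS/SETFLAGS/GETVERSION cases above plug btrfs into the generic attribute-flag ioctls (the same interface lsattr/chattr use). A minimal userspace sketch of exercising that path follows; the file path is hypothetical, and the flag variable is kept as an unsigned int to match the copy_to_user()/copy_from_user() sizes used in btrfs_ioctl_getflags()/btrfs_ioctl_setflags() above.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>

int main(void)
{
	unsigned int flags;
	int fd = open("/mnt/btrfs/file", O_RDONLY);	/* hypothetical path */

	if (fd < 0)
		return 1;

	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) == 0)
		printf("flags: 0x%x\n", flags);

	/* FS_NOATIME_FL is mapped to BTRFS_INODE_NOATIME by setflags */
	flags |= FS_NOATIME_FL;
	if (ioctl(fd, FS_IOC_SETFLAGS, &flags) != 0)
		perror("FS_IOC_SETFLAGS");

	close(fd);
	return 0;
}
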
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 5f8f218c1005..6d6523da0a30 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c | |||
@@ -45,22 +45,132 @@ static void print_dev_item(struct extent_buffer *eb, | |||
45 | (unsigned long long)btrfs_device_total_bytes(eb, dev_item), | 45 | (unsigned long long)btrfs_device_total_bytes(eb, dev_item), |
46 | (unsigned long long)btrfs_device_bytes_used(eb, dev_item)); | 46 | (unsigned long long)btrfs_device_bytes_used(eb, dev_item)); |
47 | } | 47 | } |
48 | static void print_extent_data_ref(struct extent_buffer *eb, | ||
49 | struct btrfs_extent_data_ref *ref) | ||
50 | { | ||
51 | printk(KERN_INFO "\t\textent data backref root %llu " | ||
52 | "objectid %llu offset %llu count %u\n", | ||
53 | (unsigned long long)btrfs_extent_data_ref_root(eb, ref), | ||
54 | (unsigned long long)btrfs_extent_data_ref_objectid(eb, ref), | ||
55 | (unsigned long long)btrfs_extent_data_ref_offset(eb, ref), | ||
56 | btrfs_extent_data_ref_count(eb, ref)); | ||
57 | } | ||
58 | |||
59 | static void print_extent_item(struct extent_buffer *eb, int slot) | ||
60 | { | ||
61 | struct btrfs_extent_item *ei; | ||
62 | struct btrfs_extent_inline_ref *iref; | ||
63 | struct btrfs_extent_data_ref *dref; | ||
64 | struct btrfs_shared_data_ref *sref; | ||
65 | struct btrfs_disk_key key; | ||
66 | unsigned long end; | ||
67 | unsigned long ptr; | ||
68 | int type; | ||
69 | u32 item_size = btrfs_item_size_nr(eb, slot); | ||
70 | u64 flags; | ||
71 | u64 offset; | ||
72 | |||
73 | if (item_size < sizeof(*ei)) { | ||
74 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
75 | struct btrfs_extent_item_v0 *ei0; | ||
76 | BUG_ON(item_size != sizeof(*ei0)); | ||
77 | ei0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_item_v0); | ||
78 | printk(KERN_INFO "\t\textent refs %u\n", | ||
79 | btrfs_extent_refs_v0(eb, ei0)); | ||
80 | return; | ||
81 | #else | ||
82 | BUG(); | ||
83 | #endif | ||
84 | } | ||
85 | |||
86 | ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item); | ||
87 | flags = btrfs_extent_flags(eb, ei); | ||
88 | |||
89 | printk(KERN_INFO "\t\textent refs %llu gen %llu flags %llu\n", | ||
90 | (unsigned long long)btrfs_extent_refs(eb, ei), | ||
91 | (unsigned long long)btrfs_extent_generation(eb, ei), | ||
92 | (unsigned long long)flags); | ||
93 | |||
94 | if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { | ||
95 | struct btrfs_tree_block_info *info; | ||
96 | info = (struct btrfs_tree_block_info *)(ei + 1); | ||
97 | btrfs_tree_block_key(eb, info, &key); | ||
98 | printk(KERN_INFO "\t\ttree block key (%llu %x %llu) " | ||
99 | "level %d\n", | ||
100 | (unsigned long long)btrfs_disk_key_objectid(&key), | ||
101 | key.type, | ||
102 | (unsigned long long)btrfs_disk_key_offset(&key), | ||
103 | btrfs_tree_block_level(eb, info)); | ||
104 | iref = (struct btrfs_extent_inline_ref *)(info + 1); | ||
105 | } else { | ||
106 | iref = (struct btrfs_extent_inline_ref *)(ei + 1); | ||
107 | } | ||
108 | |||
109 | ptr = (unsigned long)iref; | ||
110 | end = (unsigned long)ei + item_size; | ||
111 | while (ptr < end) { | ||
112 | iref = (struct btrfs_extent_inline_ref *)ptr; | ||
113 | type = btrfs_extent_inline_ref_type(eb, iref); | ||
114 | offset = btrfs_extent_inline_ref_offset(eb, iref); | ||
115 | switch (type) { | ||
116 | case BTRFS_TREE_BLOCK_REF_KEY: | ||
117 | printk(KERN_INFO "\t\ttree block backref " | ||
118 | "root %llu\n", (unsigned long long)offset); | ||
119 | break; | ||
120 | case BTRFS_SHARED_BLOCK_REF_KEY: | ||
121 | printk(KERN_INFO "\t\tshared block backref " | ||
122 | "parent %llu\n", (unsigned long long)offset); | ||
123 | break; | ||
124 | case BTRFS_EXTENT_DATA_REF_KEY: | ||
125 | dref = (struct btrfs_extent_data_ref *)(&iref->offset); | ||
126 | print_extent_data_ref(eb, dref); | ||
127 | break; | ||
128 | case BTRFS_SHARED_DATA_REF_KEY: | ||
129 | sref = (struct btrfs_shared_data_ref *)(iref + 1); | ||
130 | printk(KERN_INFO "\t\tshared data backref " | ||
131 | "parent %llu count %u\n", | ||
132 | (unsigned long long)offset, | ||
133 | btrfs_shared_data_ref_count(eb, sref)); | ||
134 | break; | ||
135 | default: | ||
136 | BUG(); | ||
137 | } | ||
138 | ptr += btrfs_extent_inline_ref_size(type); | ||
139 | } | ||
140 | WARN_ON(ptr > end); | ||
141 | } | ||
142 | |||
143 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
144 | static void print_extent_ref_v0(struct extent_buffer *eb, int slot) | ||
145 | { | ||
146 | struct btrfs_extent_ref_v0 *ref0; | ||
147 | |||
148 | ref0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_ref_v0); | ||
149 | printk("\t\textent back ref root %llu gen %llu " | ||
150 | "owner %llu num_refs %lu\n", | ||
151 | (unsigned long long)btrfs_ref_root_v0(eb, ref0), | ||
152 | (unsigned long long)btrfs_ref_generation_v0(eb, ref0), | ||
153 | (unsigned long long)btrfs_ref_objectid_v0(eb, ref0), | ||
154 | (unsigned long)btrfs_ref_count_v0(eb, ref0)); | ||
155 | } | ||
156 | #endif | ||
157 | |||
48 | void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) | 158 | void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) |
49 | { | 159 | { |
50 | int i; | 160 | int i; |
161 | u32 type; | ||
51 | u32 nr = btrfs_header_nritems(l); | 162 | u32 nr = btrfs_header_nritems(l); |
52 | struct btrfs_item *item; | 163 | struct btrfs_item *item; |
53 | struct btrfs_extent_item *ei; | ||
54 | struct btrfs_root_item *ri; | 164 | struct btrfs_root_item *ri; |
55 | struct btrfs_dir_item *di; | 165 | struct btrfs_dir_item *di; |
56 | struct btrfs_inode_item *ii; | 166 | struct btrfs_inode_item *ii; |
57 | struct btrfs_block_group_item *bi; | 167 | struct btrfs_block_group_item *bi; |
58 | struct btrfs_file_extent_item *fi; | 168 | struct btrfs_file_extent_item *fi; |
169 | struct btrfs_extent_data_ref *dref; | ||
170 | struct btrfs_shared_data_ref *sref; | ||
171 | struct btrfs_dev_extent *dev_extent; | ||
59 | struct btrfs_key key; | 172 | struct btrfs_key key; |
60 | struct btrfs_key found_key; | 173 | struct btrfs_key found_key; |
61 | struct btrfs_extent_ref *ref; | ||
62 | struct btrfs_dev_extent *dev_extent; | ||
63 | u32 type; | ||
64 | 174 | ||
65 | printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", | 175 | printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", |
66 | (unsigned long long)btrfs_header_bytenr(l), nr, | 176 | (unsigned long long)btrfs_header_bytenr(l), nr, |
@@ -100,20 +210,25 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) | |||
100 | btrfs_disk_root_refs(l, ri)); | 210 | btrfs_disk_root_refs(l, ri)); |
101 | break; | 211 | break; |
102 | case BTRFS_EXTENT_ITEM_KEY: | 212 | case BTRFS_EXTENT_ITEM_KEY: |
103 | ei = btrfs_item_ptr(l, i, struct btrfs_extent_item); | 213 | print_extent_item(l, i); |
104 | printk(KERN_INFO "\t\textent data refs %u\n", | 214 | break; |
105 | btrfs_extent_refs(l, ei)); | 215 | case BTRFS_TREE_BLOCK_REF_KEY: |
106 | break; | 216 | printk(KERN_INFO "\t\ttree block backref\n"); |
107 | case BTRFS_EXTENT_REF_KEY: | 217 | break; |
108 | ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref); | 218 | case BTRFS_SHARED_BLOCK_REF_KEY: |
109 | printk(KERN_INFO "\t\textent back ref root %llu " | 219 | printk(KERN_INFO "\t\tshared block backref\n"); |
110 | "gen %llu owner %llu num_refs %lu\n", | 220 | break; |
111 | (unsigned long long)btrfs_ref_root(l, ref), | 221 | case BTRFS_EXTENT_DATA_REF_KEY: |
112 | (unsigned long long)btrfs_ref_generation(l, ref), | 222 | dref = btrfs_item_ptr(l, i, |
113 | (unsigned long long)btrfs_ref_objectid(l, ref), | 223 | struct btrfs_extent_data_ref); |
114 | (unsigned long)btrfs_ref_num_refs(l, ref)); | 224 | print_extent_data_ref(l, dref); |
225 | break; | ||
226 | case BTRFS_SHARED_DATA_REF_KEY: | ||
227 | sref = btrfs_item_ptr(l, i, | ||
228 | struct btrfs_shared_data_ref); | ||
229 | printk(KERN_INFO "\t\tshared data backref count %u\n", | ||
230 | btrfs_shared_data_ref_count(l, sref)); | ||
115 | break; | 231 | break; |
116 | |||
117 | case BTRFS_EXTENT_DATA_KEY: | 232 | case BTRFS_EXTENT_DATA_KEY: |
118 | fi = btrfs_item_ptr(l, i, | 233 | fi = btrfs_item_ptr(l, i, |
119 | struct btrfs_file_extent_item); | 234 | struct btrfs_file_extent_item); |
@@ -139,6 +254,12 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) | |||
139 | (unsigned long long) | 254 | (unsigned long long) |
140 | btrfs_file_extent_ram_bytes(l, fi)); | 255 | btrfs_file_extent_ram_bytes(l, fi)); |
141 | break; | 256 | break; |
257 | case BTRFS_EXTENT_REF_V0_KEY: | ||
258 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
259 | print_extent_ref_v0(l, i); | ||
260 | #else | ||
261 | BUG(); | ||
262 | #endif | ||
142 | case BTRFS_BLOCK_GROUP_ITEM_KEY: | 263 | case BTRFS_BLOCK_GROUP_ITEM_KEY: |
143 | bi = btrfs_item_ptr(l, i, | 264 | bi = btrfs_item_ptr(l, i, |
144 | struct btrfs_block_group_item); | 265 | struct btrfs_block_group_item); |
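
The inline-ref walk in print_extent_item() above advances ptr by btrfs_extent_inline_ref_size(type). Below is a hedged sketch of what that sizing has to look like, inferred only from the casts in the function (the data ref is overlaid on &iref->offset, the shared data ref sits right after iref); the real helper lives in ctree.h and may differ in detail.

static u32 inline_ref_size_sketch(int type)
{
	switch (type) {
	case BTRFS_TREE_BLOCK_REF_KEY:
	case BTRFS_SHARED_BLOCK_REF_KEY:
		/* only the ref header itself: type byte + 64bit offset */
		return sizeof(struct btrfs_extent_inline_ref);
	case BTRFS_EXTENT_DATA_REF_KEY:
		/* the data ref starts at &iref->offset, replacing that field */
		return offsetof(struct btrfs_extent_inline_ref, offset) +
		       sizeof(struct btrfs_extent_data_ref);
	case BTRFS_SHARED_DATA_REF_KEY:
		/* the shared data ref follows the full header (iref + 1) */
		return sizeof(struct btrfs_extent_inline_ref) +
		       sizeof(struct btrfs_shared_data_ref);
	default:
		return 0;
	}
}
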
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c new file mode 100644 index 000000000000..b23dc209ae10 --- /dev/null +++ b/fs/btrfs/relocation.c | |||
@@ -0,0 +1,3711 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2009 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #include <linux/sched.h> | ||
20 | #include <linux/pagemap.h> | ||
21 | #include <linux/writeback.h> | ||
22 | #include <linux/blkdev.h> | ||
23 | #include <linux/rbtree.h> | ||
24 | #include "ctree.h" | ||
25 | #include "disk-io.h" | ||
26 | #include "transaction.h" | ||
27 | #include "volumes.h" | ||
28 | #include "locking.h" | ||
29 | #include "btrfs_inode.h" | ||
30 | #include "async-thread.h" | ||
31 | |||
32 | /* | ||
33 | * backref_node, mapping_node and tree_block start with this | ||
34 | */ | ||
35 | struct tree_entry { | ||
36 | struct rb_node rb_node; | ||
37 | u64 bytenr; | ||
38 | }; | ||
39 | |||
40 | /* | ||
41 | * represents a tree block in the backref cache | ||
42 | */ | ||
43 | struct backref_node { | ||
44 | struct rb_node rb_node; | ||
45 | u64 bytenr; | ||
46 | /* objectid tree block owner */ | ||
47 | u64 owner; | ||
48 | /* list of upper level blocks reference this block */ | ||
49 | struct list_head upper; | ||
50 | /* list of child blocks in the cache */ | ||
51 | struct list_head lower; | ||
52 | /* NULL if this node is not tree root */ | ||
53 | struct btrfs_root *root; | ||
54 | /* extent buffer got by COW the block */ | ||
55 | struct extent_buffer *eb; | ||
56 | /* level of tree block */ | ||
57 | unsigned int level:8; | ||
58 | /* 1 if the block is root of old snapshot */ | ||
59 | unsigned int old_root:1; | ||
60 | /* 1 if no child blocks in the cache */ | ||
61 | unsigned int lowest:1; | ||
62 | /* is the extent buffer locked */ | ||
63 | unsigned int locked:1; | ||
64 | /* has the block been processed */ | ||
65 | unsigned int processed:1; | ||
66 | /* have backrefs of this block been checked */ | ||
67 | unsigned int checked:1; | ||
68 | }; | ||
69 | |||
70 | /* | ||
71 | * represents a block pointer in the backref cache | ||
72 | */ | ||
73 | struct backref_edge { | ||
74 | struct list_head list[2]; | ||
75 | struct backref_node *node[2]; | ||
76 | u64 blockptr; | ||
77 | }; | ||
78 | |||
79 | #define LOWER 0 | ||
80 | #define UPPER 1 | ||
81 | |||
82 | struct backref_cache { | ||
83 | /* red black tree of all backref nodes in the cache */ | ||
84 | struct rb_root rb_root; | ||
85 | /* list of backref nodes with no child block in the cache */ | ||
86 | struct list_head pending[BTRFS_MAX_LEVEL]; | ||
87 | spinlock_t lock; | ||
88 | }; | ||
89 | |||
90 | /* | ||
91 | * map address of tree root to tree | ||
92 | */ | ||
93 | struct mapping_node { | ||
94 | struct rb_node rb_node; | ||
95 | u64 bytenr; | ||
96 | void *data; | ||
97 | }; | ||
98 | |||
99 | struct mapping_tree { | ||
100 | struct rb_root rb_root; | ||
101 | spinlock_t lock; | ||
102 | }; | ||
103 | |||
104 | /* | ||
105 | * represents a tree block to process | ||
106 | */ | ||
107 | struct tree_block { | ||
108 | struct rb_node rb_node; | ||
109 | u64 bytenr; | ||
110 | struct btrfs_key key; | ||
111 | unsigned int level:8; | ||
112 | unsigned int key_ready:1; | ||
113 | }; | ||
114 | |||
115 | /* inode vector */ | ||
116 | #define INODEVEC_SIZE 16 | ||
117 | |||
118 | struct inodevec { | ||
119 | struct list_head list; | ||
120 | struct inode *inode[INODEVEC_SIZE]; | ||
121 | int nr; | ||
122 | }; | ||
123 | |||
124 | struct reloc_control { | ||
125 | /* block group to relocate */ | ||
126 | struct btrfs_block_group_cache *block_group; | ||
127 | /* extent tree */ | ||
128 | struct btrfs_root *extent_root; | ||
129 | /* inode for moving data */ | ||
130 | struct inode *data_inode; | ||
131 | struct btrfs_workers workers; | ||
132 | /* tree blocks have been processed */ | ||
133 | struct extent_io_tree processed_blocks; | ||
134 | /* map start of tree root to corresponding reloc tree */ | ||
135 | struct mapping_tree reloc_root_tree; | ||
136 | /* list of reloc trees */ | ||
137 | struct list_head reloc_roots; | ||
138 | u64 search_start; | ||
139 | u64 extents_found; | ||
140 | u64 extents_skipped; | ||
141 | int stage; | ||
142 | int create_reloc_root; | ||
143 | unsigned int found_file_extent:1; | ||
144 | unsigned int found_old_snapshot:1; | ||
145 | }; | ||
146 | |||
147 | /* stages of data relocation */ | ||
148 | #define MOVE_DATA_EXTENTS 0 | ||
149 | #define UPDATE_DATA_PTRS 1 | ||
150 | |||
151 | /* | ||
152 | * merge reloc tree to corresponding fs tree in worker threads | ||
153 | */ | ||
154 | struct async_merge { | ||
155 | struct btrfs_work work; | ||
156 | struct reloc_control *rc; | ||
157 | struct btrfs_root *root; | ||
158 | struct completion *done; | ||
159 | atomic_t *num_pending; | ||
160 | }; | ||
161 | |||
162 | static void mapping_tree_init(struct mapping_tree *tree) | ||
163 | { | ||
164 | tree->rb_root.rb_node = NULL; | ||
165 | spin_lock_init(&tree->lock); | ||
166 | } | ||
167 | |||
168 | static void backref_cache_init(struct backref_cache *cache) | ||
169 | { | ||
170 | int i; | ||
171 | cache->rb_root.rb_node = NULL; | ||
172 | for (i = 0; i < BTRFS_MAX_LEVEL; i++) | ||
173 | INIT_LIST_HEAD(&cache->pending[i]); | ||
174 | spin_lock_init(&cache->lock); | ||
175 | } | ||
176 | |||
177 | static void backref_node_init(struct backref_node *node) | ||
178 | { | ||
179 | memset(node, 0, sizeof(*node)); | ||
180 | INIT_LIST_HEAD(&node->upper); | ||
181 | INIT_LIST_HEAD(&node->lower); | ||
182 | RB_CLEAR_NODE(&node->rb_node); | ||
183 | } | ||
184 | |||
185 | static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, | ||
186 | struct rb_node *node) | ||
187 | { | ||
188 | struct rb_node **p = &root->rb_node; | ||
189 | struct rb_node *parent = NULL; | ||
190 | struct tree_entry *entry; | ||
191 | |||
192 | while (*p) { | ||
193 | parent = *p; | ||
194 | entry = rb_entry(parent, struct tree_entry, rb_node); | ||
195 | |||
196 | if (bytenr < entry->bytenr) | ||
197 | p = &(*p)->rb_left; | ||
198 | else if (bytenr > entry->bytenr) | ||
199 | p = &(*p)->rb_right; | ||
200 | else | ||
201 | return parent; | ||
202 | } | ||
203 | |||
204 | rb_link_node(node, parent, p); | ||
205 | rb_insert_color(node, root); | ||
206 | return NULL; | ||
207 | } | ||
208 | |||
209 | static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) | ||
210 | { | ||
211 | struct rb_node *n = root->rb_node; | ||
212 | struct tree_entry *entry; | ||
213 | |||
214 | while (n) { | ||
215 | entry = rb_entry(n, struct tree_entry, rb_node); | ||
216 | |||
217 | if (bytenr < entry->bytenr) | ||
218 | n = n->rb_left; | ||
219 | else if (bytenr > entry->bytenr) | ||
220 | n = n->rb_right; | ||
221 | else | ||
222 | return n; | ||
223 | } | ||
224 | return NULL; | ||
225 | } | ||
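
tree_insert() and tree_search() above are shared by every rb-tree in this file that is keyed by bytenr; any structure that starts with the tree_entry layout (rb_node, then bytenr) can use them. A hedged usage sketch follows (hypothetical helper name, error handling trimmed).

static struct backref_node *cache_lookup_or_add(struct backref_cache *cache,
						u64 bytenr)
{
	struct backref_node *node;
	struct rb_node *rb_node;

	rb_node = tree_search(&cache->rb_root, bytenr);
	if (rb_node)
		return rb_entry(rb_node, struct backref_node, rb_node);

	node = kmalloc(sizeof(*node), GFP_NOFS);
	if (!node)
		return NULL;
	backref_node_init(node);
	node->bytenr = bytenr;

	/* tree_insert() returns the colliding entry, or NULL on success */
	rb_node = tree_insert(&cache->rb_root, bytenr, &node->rb_node);
	BUG_ON(rb_node);
	return node;
}
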
226 | |||
227 | /* | ||
228 | * walk up backref nodes until we reach a node that represents a tree root | ||
229 | */ | ||
230 | static struct backref_node *walk_up_backref(struct backref_node *node, | ||
231 | struct backref_edge *edges[], | ||
232 | int *index) | ||
233 | { | ||
234 | struct backref_edge *edge; | ||
235 | int idx = *index; | ||
236 | |||
237 | while (!list_empty(&node->upper)) { | ||
238 | edge = list_entry(node->upper.next, | ||
239 | struct backref_edge, list[LOWER]); | ||
240 | edges[idx++] = edge; | ||
241 | node = edge->node[UPPER]; | ||
242 | } | ||
243 | *index = idx; | ||
244 | return node; | ||
245 | } | ||
246 | |||
247 | /* | ||
248 | * walk down backref nodes to find start of next reference path | ||
249 | */ | ||
250 | static struct backref_node *walk_down_backref(struct backref_edge *edges[], | ||
251 | int *index) | ||
252 | { | ||
253 | struct backref_edge *edge; | ||
254 | struct backref_node *lower; | ||
255 | int idx = *index; | ||
256 | |||
257 | while (idx > 0) { | ||
258 | edge = edges[idx - 1]; | ||
259 | lower = edge->node[LOWER]; | ||
260 | if (list_is_last(&edge->list[LOWER], &lower->upper)) { | ||
261 | idx--; | ||
262 | continue; | ||
263 | } | ||
264 | edge = list_entry(edge->list[LOWER].next, | ||
265 | struct backref_edge, list[LOWER]); | ||
266 | edges[idx - 1] = edge; | ||
267 | *index = idx; | ||
268 | return edge->node[UPPER]; | ||
269 | } | ||
270 | *index = 0; | ||
271 | return NULL; | ||
272 | } | ||
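
Taken together, walk_up_backref() and walk_down_backref() perform a depth-first enumeration of every reference path from a block up to a tree root, with the caller-supplied edges[] array acting as the traversal stack. A hedged sketch of the calling pattern (hypothetical helper; the real callers later in this file do per-root work inside the loop):

static void for_each_referencing_root(struct backref_node *node)
{
	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
	struct backref_node *next = node;
	int index = 0;

	while (1) {
		/* climb to the root of the current reference path */
		next = walk_up_backref(next, edges, &index);
		if (next->root)
			printk(KERN_DEBUG "block %llu referenced by root %llu\n",
			       (unsigned long long)node->bytenr,
			       (unsigned long long)
			       next->root->root_key.objectid);

		/* backtrack to the start of the next unvisited path */
		next = walk_down_backref(edges, &index);
		if (!next || next->level <= node->level)
			break;
	}
}
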
273 | |||
274 | static void drop_node_buffer(struct backref_node *node) | ||
275 | { | ||
276 | if (node->eb) { | ||
277 | if (node->locked) { | ||
278 | btrfs_tree_unlock(node->eb); | ||
279 | node->locked = 0; | ||
280 | } | ||
281 | free_extent_buffer(node->eb); | ||
282 | node->eb = NULL; | ||
283 | } | ||
284 | } | ||
285 | |||
286 | static void drop_backref_node(struct backref_cache *tree, | ||
287 | struct backref_node *node) | ||
288 | { | ||
289 | BUG_ON(!node->lowest); | ||
290 | BUG_ON(!list_empty(&node->upper)); | ||
291 | |||
292 | drop_node_buffer(node); | ||
293 | list_del(&node->lower); | ||
294 | |||
295 | rb_erase(&node->rb_node, &tree->rb_root); | ||
296 | kfree(node); | ||
297 | } | ||
298 | |||
299 | /* | ||
300 | * remove a backref node from the backref cache | ||
301 | */ | ||
302 | static void remove_backref_node(struct backref_cache *cache, | ||
303 | struct backref_node *node) | ||
304 | { | ||
305 | struct backref_node *upper; | ||
306 | struct backref_edge *edge; | ||
307 | |||
308 | if (!node) | ||
309 | return; | ||
310 | |||
311 | BUG_ON(!node->lowest); | ||
312 | while (!list_empty(&node->upper)) { | ||
313 | edge = list_entry(node->upper.next, struct backref_edge, | ||
314 | list[LOWER]); | ||
315 | upper = edge->node[UPPER]; | ||
316 | list_del(&edge->list[LOWER]); | ||
317 | list_del(&edge->list[UPPER]); | ||
318 | kfree(edge); | ||
319 | /* | ||
320 | * add the node to pending list if no other | ||
321 | * child block is cached. | ||
322 | */ | ||
323 | if (list_empty(&upper->lower)) { | ||
324 | list_add_tail(&upper->lower, | ||
325 | &cache->pending[upper->level]); | ||
326 | upper->lowest = 1; | ||
327 | } | ||
328 | } | ||
329 | drop_backref_node(cache, node); | ||
330 | } | ||
331 | |||
332 | /* | ||
333 | * find reloc tree by address of tree root | ||
334 | */ | ||
335 | static struct btrfs_root *find_reloc_root(struct reloc_control *rc, | ||
336 | u64 bytenr) | ||
337 | { | ||
338 | struct rb_node *rb_node; | ||
339 | struct mapping_node *node; | ||
340 | struct btrfs_root *root = NULL; | ||
341 | |||
342 | spin_lock(&rc->reloc_root_tree.lock); | ||
343 | rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr); | ||
344 | if (rb_node) { | ||
345 | node = rb_entry(rb_node, struct mapping_node, rb_node); | ||
346 | root = (struct btrfs_root *)node->data; | ||
347 | } | ||
348 | spin_unlock(&rc->reloc_root_tree.lock); | ||
349 | return root; | ||
350 | } | ||
351 | |||
352 | static int is_cowonly_root(u64 root_objectid) | ||
353 | { | ||
354 | if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || | ||
355 | root_objectid == BTRFS_EXTENT_TREE_OBJECTID || | ||
356 | root_objectid == BTRFS_CHUNK_TREE_OBJECTID || | ||
357 | root_objectid == BTRFS_DEV_TREE_OBJECTID || | ||
358 | root_objectid == BTRFS_TREE_LOG_OBJECTID || | ||
359 | root_objectid == BTRFS_CSUM_TREE_OBJECTID) | ||
360 | return 1; | ||
361 | return 0; | ||
362 | } | ||
363 | |||
364 | static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, | ||
365 | u64 root_objectid) | ||
366 | { | ||
367 | struct btrfs_key key; | ||
368 | |||
369 | key.objectid = root_objectid; | ||
370 | key.type = BTRFS_ROOT_ITEM_KEY; | ||
371 | if (is_cowonly_root(root_objectid)) | ||
372 | key.offset = 0; | ||
373 | else | ||
374 | key.offset = (u64)-1; | ||
375 | |||
376 | return btrfs_read_fs_root_no_name(fs_info, &key); | ||
377 | } | ||
378 | |||
379 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
380 | static noinline_for_stack | ||
381 | struct btrfs_root *find_tree_root(struct reloc_control *rc, | ||
382 | struct extent_buffer *leaf, | ||
383 | struct btrfs_extent_ref_v0 *ref0) | ||
384 | { | ||
385 | struct btrfs_root *root; | ||
386 | u64 root_objectid = btrfs_ref_root_v0(leaf, ref0); | ||
387 | u64 generation = btrfs_ref_generation_v0(leaf, ref0); | ||
388 | |||
389 | BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID); | ||
390 | |||
391 | root = read_fs_root(rc->extent_root->fs_info, root_objectid); | ||
392 | BUG_ON(IS_ERR(root)); | ||
393 | |||
394 | if (root->ref_cows && | ||
395 | generation != btrfs_root_generation(&root->root_item)) | ||
396 | return NULL; | ||
397 | |||
398 | return root; | ||
399 | } | ||
400 | #endif | ||
401 | |||
402 | static noinline_for_stack | ||
403 | int find_inline_backref(struct extent_buffer *leaf, int slot, | ||
404 | unsigned long *ptr, unsigned long *end) | ||
405 | { | ||
406 | struct btrfs_extent_item *ei; | ||
407 | struct btrfs_tree_block_info *bi; | ||
408 | u32 item_size; | ||
409 | |||
410 | item_size = btrfs_item_size_nr(leaf, slot); | ||
411 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
412 | if (item_size < sizeof(*ei)) { | ||
413 | WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); | ||
414 | return 1; | ||
415 | } | ||
416 | #endif | ||
417 | ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); | ||
418 | WARN_ON(!(btrfs_extent_flags(leaf, ei) & | ||
419 | BTRFS_EXTENT_FLAG_TREE_BLOCK)); | ||
420 | |||
421 | if (item_size <= sizeof(*ei) + sizeof(*bi)) { | ||
422 | WARN_ON(item_size < sizeof(*ei) + sizeof(*bi)); | ||
423 | return 1; | ||
424 | } | ||
425 | |||
426 | bi = (struct btrfs_tree_block_info *)(ei + 1); | ||
427 | *ptr = (unsigned long)(bi + 1); | ||
428 | *end = (unsigned long)ei + item_size; | ||
429 | return 0; | ||
430 | } | ||
431 | |||
432 | /* | ||
433 | * build backref tree for a given tree block. root of the backref tree | ||
434 | * corresponds to the tree block, leaves of the backref tree correspond | ||
435 | * to roots of b-trees that reference the tree block. | ||
436 | * | ||
437 | * the basic idea of this function is to check backrefs of a given block | ||
438 | * to find upper level blocks that reference the block, and then check | ||
439 | * backrefs of these upper level blocks recursively. the recursion stops | ||
440 | * when tree root is reached or backrefs for the block are cached. | ||
441 | * | ||
442 | * NOTE: if we find backrefs for a block are cached, we know backrefs | ||
443 | * for all upper level blocks that directly/indirectly reference the | ||
444 | * block are also cached. | ||
445 | */ | ||
446 | static struct backref_node *build_backref_tree(struct reloc_control *rc, | ||
447 | struct backref_cache *cache, | ||
448 | struct btrfs_key *node_key, | ||
449 | int level, u64 bytenr) | ||
450 | { | ||
451 | struct btrfs_path *path1; | ||
452 | struct btrfs_path *path2; | ||
453 | struct extent_buffer *eb; | ||
454 | struct btrfs_root *root; | ||
455 | struct backref_node *cur; | ||
456 | struct backref_node *upper; | ||
457 | struct backref_node *lower; | ||
458 | struct backref_node *node = NULL; | ||
459 | struct backref_node *exist = NULL; | ||
460 | struct backref_edge *edge; | ||
461 | struct rb_node *rb_node; | ||
462 | struct btrfs_key key; | ||
463 | unsigned long end; | ||
464 | unsigned long ptr; | ||
465 | LIST_HEAD(list); | ||
466 | int ret; | ||
467 | int err = 0; | ||
468 | |||
469 | path1 = btrfs_alloc_path(); | ||
470 | path2 = btrfs_alloc_path(); | ||
471 | if (!path1 || !path2) { | ||
472 | err = -ENOMEM; | ||
473 | goto out; | ||
474 | } | ||
475 | |||
476 | node = kmalloc(sizeof(*node), GFP_NOFS); | ||
477 | if (!node) { | ||
478 | err = -ENOMEM; | ||
479 | goto out; | ||
480 | } | ||
481 | |||
482 | backref_node_init(node); | ||
483 | node->bytenr = bytenr; | ||
484 | node->owner = 0; | ||
485 | node->level = level; | ||
486 | node->lowest = 1; | ||
487 | cur = node; | ||
488 | again: | ||
489 | end = 0; | ||
490 | ptr = 0; | ||
491 | key.objectid = cur->bytenr; | ||
492 | key.type = BTRFS_EXTENT_ITEM_KEY; | ||
493 | key.offset = (u64)-1; | ||
494 | |||
495 | path1->search_commit_root = 1; | ||
496 | path1->skip_locking = 1; | ||
497 | ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1, | ||
498 | 0, 0); | ||
499 | if (ret < 0) { | ||
500 | err = ret; | ||
501 | goto out; | ||
502 | } | ||
503 | BUG_ON(!ret || !path1->slots[0]); | ||
504 | |||
505 | path1->slots[0]--; | ||
506 | |||
507 | WARN_ON(cur->checked); | ||
508 | if (!list_empty(&cur->upper)) { | ||
509 | /* | ||
510 | * the backref was added previously when processing | ||
511 | * backref of type BTRFS_TREE_BLOCK_REF_KEY | ||
512 | */ | ||
513 | BUG_ON(!list_is_singular(&cur->upper)); | ||
514 | edge = list_entry(cur->upper.next, struct backref_edge, | ||
515 | list[LOWER]); | ||
516 | BUG_ON(!list_empty(&edge->list[UPPER])); | ||
517 | exist = edge->node[UPPER]; | ||
518 | /* | ||
519 | * add the upper level block to pending list if we need to | ||
520 | * check its backrefs | ||
521 | */ | ||
522 | if (!exist->checked) | ||
523 | list_add_tail(&edge->list[UPPER], &list); | ||
524 | } else { | ||
525 | exist = NULL; | ||
526 | } | ||
527 | |||
528 | while (1) { | ||
529 | cond_resched(); | ||
530 | eb = path1->nodes[0]; | ||
531 | |||
532 | if (ptr >= end) { | ||
533 | if (path1->slots[0] >= btrfs_header_nritems(eb)) { | ||
534 | ret = btrfs_next_leaf(rc->extent_root, path1); | ||
535 | if (ret < 0) { | ||
536 | err = ret; | ||
537 | goto out; | ||
538 | } | ||
539 | if (ret > 0) | ||
540 | break; | ||
541 | eb = path1->nodes[0]; | ||
542 | } | ||
543 | |||
544 | btrfs_item_key_to_cpu(eb, &key, path1->slots[0]); | ||
545 | if (key.objectid != cur->bytenr) { | ||
546 | WARN_ON(exist); | ||
547 | break; | ||
548 | } | ||
549 | |||
550 | if (key.type == BTRFS_EXTENT_ITEM_KEY) { | ||
551 | ret = find_inline_backref(eb, path1->slots[0], | ||
552 | &ptr, &end); | ||
553 | if (ret) | ||
554 | goto next; | ||
555 | } | ||
556 | } | ||
557 | |||
558 | if (ptr < end) { | ||
559 | /* update key for inline back ref */ | ||
560 | struct btrfs_extent_inline_ref *iref; | ||
561 | iref = (struct btrfs_extent_inline_ref *)ptr; | ||
562 | key.type = btrfs_extent_inline_ref_type(eb, iref); | ||
563 | key.offset = btrfs_extent_inline_ref_offset(eb, iref); | ||
564 | WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY && | ||
565 | key.type != BTRFS_SHARED_BLOCK_REF_KEY); | ||
566 | } | ||
567 | |||
568 | if (exist && | ||
569 | ((key.type == BTRFS_TREE_BLOCK_REF_KEY && | ||
570 | exist->owner == key.offset) || | ||
571 | (key.type == BTRFS_SHARED_BLOCK_REF_KEY && | ||
572 | exist->bytenr == key.offset))) { | ||
573 | exist = NULL; | ||
574 | goto next; | ||
575 | } | ||
576 | |||
577 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
578 | if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || | ||
579 | key.type == BTRFS_EXTENT_REF_V0_KEY) { | ||
580 | if (key.objectid == key.offset && | ||
581 | key.type == BTRFS_EXTENT_REF_V0_KEY) { | ||
582 | struct btrfs_extent_ref_v0 *ref0; | ||
583 | ref0 = btrfs_item_ptr(eb, path1->slots[0], | ||
584 | struct btrfs_extent_ref_v0); | ||
585 | root = find_tree_root(rc, eb, ref0); | ||
586 | if (root) | ||
587 | cur->root = root; | ||
588 | else | ||
589 | cur->old_root = 1; | ||
590 | break; | ||
591 | } | ||
592 | #else | ||
593 | BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); | ||
594 | if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) { | ||
595 | #endif | ||
596 | if (key.objectid == key.offset) { | ||
597 | /* | ||
598 | * only root blocks of reloc trees use | ||
599 | * backref of this type. | ||
600 | */ | ||
601 | root = find_reloc_root(rc, cur->bytenr); | ||
602 | BUG_ON(!root); | ||
603 | cur->root = root; | ||
604 | break; | ||
605 | } | ||
606 | |||
607 | edge = kzalloc(sizeof(*edge), GFP_NOFS); | ||
608 | if (!edge) { | ||
609 | err = -ENOMEM; | ||
610 | goto out; | ||
611 | } | ||
612 | rb_node = tree_search(&cache->rb_root, key.offset); | ||
613 | if (!rb_node) { | ||
614 | upper = kmalloc(sizeof(*upper), GFP_NOFS); | ||
615 | if (!upper) { | ||
616 | kfree(edge); | ||
617 | err = -ENOMEM; | ||
618 | goto out; | ||
619 | } | ||
620 | backref_node_init(upper); | ||
621 | upper->bytenr = key.offset; | ||
622 | upper->owner = 0; | ||
623 | upper->level = cur->level + 1; | ||
624 | /* | ||
624 | * backrefs for the upper level block aren't | ||
626 | * cached, add the block to pending list | ||
627 | */ | ||
628 | list_add_tail(&edge->list[UPPER], &list); | ||
629 | } else { | ||
630 | upper = rb_entry(rb_node, struct backref_node, | ||
631 | rb_node); | ||
632 | INIT_LIST_HEAD(&edge->list[UPPER]); | ||
633 | } | ||
634 | list_add(&edge->list[LOWER], &cur->upper); | ||
635 | edge->node[UPPER] = upper; | ||
636 | edge->node[LOWER] = cur; | ||
637 | |||
638 | goto next; | ||
639 | } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { | ||
640 | goto next; | ||
641 | } | ||
642 | |||
643 | /* key.type == BTRFS_TREE_BLOCK_REF_KEY */ | ||
644 | root = read_fs_root(rc->extent_root->fs_info, key.offset); | ||
645 | if (IS_ERR(root)) { | ||
646 | err = PTR_ERR(root); | ||
647 | goto out; | ||
648 | } | ||
649 | |||
650 | if (btrfs_root_level(&root->root_item) == cur->level) { | ||
651 | /* tree root */ | ||
652 | BUG_ON(btrfs_root_bytenr(&root->root_item) != | ||
653 | cur->bytenr); | ||
654 | cur->root = root; | ||
655 | break; | ||
656 | } | ||
657 | |||
658 | level = cur->level + 1; | ||
659 | |||
660 | /* | ||
661 | * searching the tree to find upper level blocks that | ||
662 | * reference the block. | ||
663 | */ | ||
664 | path2->search_commit_root = 1; | ||
665 | path2->skip_locking = 1; | ||
666 | path2->lowest_level = level; | ||
667 | ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0); | ||
668 | path2->lowest_level = 0; | ||
669 | if (ret < 0) { | ||
670 | err = ret; | ||
671 | goto out; | ||
672 | } | ||
673 | |||
674 | eb = path2->nodes[level]; | ||
675 | WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) != | ||
676 | cur->bytenr); | ||
677 | |||
678 | lower = cur; | ||
679 | for (; level < BTRFS_MAX_LEVEL; level++) { | ||
680 | if (!path2->nodes[level]) { | ||
681 | BUG_ON(btrfs_root_bytenr(&root->root_item) != | ||
682 | lower->bytenr); | ||
683 | lower->root = root; | ||
684 | break; | ||
685 | } | ||
686 | |||
687 | edge = kzalloc(sizeof(*edge), GFP_NOFS); | ||
688 | if (!edge) { | ||
689 | err = -ENOMEM; | ||
690 | goto out; | ||
691 | } | ||
692 | |||
693 | eb = path2->nodes[level]; | ||
694 | rb_node = tree_search(&cache->rb_root, eb->start); | ||
695 | if (!rb_node) { | ||
696 | upper = kmalloc(sizeof(*upper), GFP_NOFS); | ||
697 | if (!upper) { | ||
698 | kfree(edge); | ||
699 | err = -ENOMEM; | ||
700 | goto out; | ||
701 | } | ||
702 | backref_node_init(upper); | ||
703 | upper->bytenr = eb->start; | ||
704 | upper->owner = btrfs_header_owner(eb); | ||
705 | upper->level = lower->level + 1; | ||
706 | |||
707 | /* | ||
708 | * if we know the block isn't shared | ||
709 | * we can avoid checking its backrefs. | ||
710 | */ | ||
711 | if (btrfs_block_can_be_shared(root, eb)) | ||
712 | upper->checked = 0; | ||
713 | else | ||
714 | upper->checked = 1; | ||
715 | |||
716 | /* | ||
717 | * add the block to pending list if we | ||
718 | * need to check its backrefs. only block | ||
719 | * at 'cur->level + 1' is added to the | ||
720 | * tail of pending list. this guarantees | ||
721 | * we check backrefs from lower level | ||
722 | * blocks to upper level blocks. | ||
723 | */ | ||
724 | if (!upper->checked && | ||
725 | level == cur->level + 1) { | ||
726 | list_add_tail(&edge->list[UPPER], | ||
727 | &list); | ||
728 | } else | ||
729 | INIT_LIST_HEAD(&edge->list[UPPER]); | ||
730 | } else { | ||
731 | upper = rb_entry(rb_node, struct backref_node, | ||
732 | rb_node); | ||
733 | BUG_ON(!upper->checked); | ||
734 | INIT_LIST_HEAD(&edge->list[UPPER]); | ||
735 | } | ||
736 | list_add_tail(&edge->list[LOWER], &lower->upper); | ||
737 | edge->node[UPPER] = upper; | ||
738 | edge->node[LOWER] = lower; | ||
739 | |||
740 | if (rb_node) | ||
741 | break; | ||
742 | lower = upper; | ||
743 | upper = NULL; | ||
744 | } | ||
745 | btrfs_release_path(root, path2); | ||
746 | next: | ||
747 | if (ptr < end) { | ||
748 | ptr += btrfs_extent_inline_ref_size(key.type); | ||
749 | if (ptr >= end) { | ||
750 | WARN_ON(ptr > end); | ||
751 | ptr = 0; | ||
752 | end = 0; | ||
753 | } | ||
754 | } | ||
755 | if (ptr >= end) | ||
756 | path1->slots[0]++; | ||
757 | } | ||
758 | btrfs_release_path(rc->extent_root, path1); | ||
759 | |||
760 | cur->checked = 1; | ||
761 | WARN_ON(exist); | ||
762 | |||
763 | /* the pending list isn't empty, take the first block to process */ | ||
764 | if (!list_empty(&list)) { | ||
765 | edge = list_entry(list.next, struct backref_edge, list[UPPER]); | ||
766 | list_del_init(&edge->list[UPPER]); | ||
767 | cur = edge->node[UPPER]; | ||
768 | goto again; | ||
769 | } | ||
770 | |||
771 | /* | ||
772 | * everything goes well, connect backref nodes and insert backref nodes | ||
773 | * into the cache. | ||
774 | */ | ||
775 | BUG_ON(!node->checked); | ||
776 | rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); | ||
777 | BUG_ON(rb_node); | ||
778 | |||
779 | list_for_each_entry(edge, &node->upper, list[LOWER]) | ||
780 | list_add_tail(&edge->list[UPPER], &list); | ||
781 | |||
782 | while (!list_empty(&list)) { | ||
783 | edge = list_entry(list.next, struct backref_edge, list[UPPER]); | ||
784 | list_del_init(&edge->list[UPPER]); | ||
785 | upper = edge->node[UPPER]; | ||
786 | |||
787 | if (!RB_EMPTY_NODE(&upper->rb_node)) { | ||
788 | if (upper->lowest) { | ||
789 | list_del_init(&upper->lower); | ||
790 | upper->lowest = 0; | ||
791 | } | ||
792 | |||
793 | list_add_tail(&edge->list[UPPER], &upper->lower); | ||
794 | continue; | ||
795 | } | ||
796 | |||
797 | BUG_ON(!upper->checked); | ||
798 | rb_node = tree_insert(&cache->rb_root, upper->bytenr, | ||
799 | &upper->rb_node); | ||
800 | BUG_ON(rb_node); | ||
801 | |||
802 | list_add_tail(&edge->list[UPPER], &upper->lower); | ||
803 | |||
804 | list_for_each_entry(edge, &upper->upper, list[LOWER]) | ||
805 | list_add_tail(&edge->list[UPPER], &list); | ||
806 | } | ||
807 | out: | ||
808 | btrfs_free_path(path1); | ||
809 | btrfs_free_path(path2); | ||
810 | if (err) { | ||
811 | INIT_LIST_HEAD(&list); | ||
812 | upper = node; | ||
813 | while (upper) { | ||
814 | if (RB_EMPTY_NODE(&upper->rb_node)) { | ||
815 | list_splice_tail(&upper->upper, &list); | ||
816 | kfree(upper); | ||
817 | } | ||
818 | |||
819 | if (list_empty(&list)) | ||
820 | break; | ||
821 | |||
822 | edge = list_entry(list.next, struct backref_edge, | ||
823 | list[LOWER]); | ||
824 | upper = edge->node[UPPER]; | ||
825 | kfree(edge); | ||
826 | } | ||
827 | return ERR_PTR(err); | ||
828 | } | ||
829 | return node; | ||
830 | } | ||
831 | |||
832 | /* | ||
833 | * helper to add 'address of tree root -> reloc tree' mapping | ||
834 | */ | ||
835 | static int __add_reloc_root(struct btrfs_root *root) | ||
836 | { | ||
837 | struct rb_node *rb_node; | ||
838 | struct mapping_node *node; | ||
839 | struct reloc_control *rc = root->fs_info->reloc_ctl; | ||
840 | |||
841 | node = kmalloc(sizeof(*node), GFP_NOFS); | ||
842 | BUG_ON(!node); | ||
843 | |||
844 | node->bytenr = root->node->start; | ||
845 | node->data = root; | ||
846 | |||
847 | spin_lock(&rc->reloc_root_tree.lock); | ||
848 | rb_node = tree_insert(&rc->reloc_root_tree.rb_root, | ||
849 | node->bytenr, &node->rb_node); | ||
850 | spin_unlock(&rc->reloc_root_tree.lock); | ||
851 | BUG_ON(rb_node); | ||
852 | |||
853 | list_add_tail(&root->root_list, &rc->reloc_roots); | ||
854 | return 0; | ||
855 | } | ||
856 | |||
857 | /* | ||
858 | * helper to update/delete the 'address of tree root -> reloc tree' | ||
859 | * mapping | ||
860 | */ | ||
861 | static int __update_reloc_root(struct btrfs_root *root, int del) | ||
862 | { | ||
863 | struct rb_node *rb_node; | ||
864 | struct mapping_node *node = NULL; | ||
865 | struct reloc_control *rc = root->fs_info->reloc_ctl; | ||
866 | |||
867 | spin_lock(&rc->reloc_root_tree.lock); | ||
868 | rb_node = tree_search(&rc->reloc_root_tree.rb_root, | ||
869 | root->commit_root->start); | ||
870 | if (rb_node) { | ||
871 | node = rb_entry(rb_node, struct mapping_node, rb_node); | ||
872 | rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); | ||
873 | } | ||
874 | spin_unlock(&rc->reloc_root_tree.lock); | ||
875 | |||
876 | BUG_ON((struct btrfs_root *)node->data != root); | ||
877 | |||
878 | if (!del) { | ||
879 | spin_lock(&rc->reloc_root_tree.lock); | ||
880 | node->bytenr = root->node->start; | ||
881 | rb_node = tree_insert(&rc->reloc_root_tree.rb_root, | ||
882 | node->bytenr, &node->rb_node); | ||
883 | spin_unlock(&rc->reloc_root_tree.lock); | ||
884 | BUG_ON(rb_node); | ||
885 | } else { | ||
886 | list_del_init(&root->root_list); | ||
887 | kfree(node); | ||
888 | } | ||
889 | return 0; | ||
890 | } | ||
891 | |||
892 | /* | ||
893 | * create reloc tree for a given fs tree. reloc tree is just a | ||
894 | * snapshot of the fs tree with special root objectid. | ||
895 | */ | ||
896 | int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, | ||
897 | struct btrfs_root *root) | ||
898 | { | ||
899 | struct btrfs_root *reloc_root; | ||
900 | struct extent_buffer *eb; | ||
901 | struct btrfs_root_item *root_item; | ||
902 | struct btrfs_key root_key; | ||
903 | int ret; | ||
904 | |||
905 | if (root->reloc_root) { | ||
906 | reloc_root = root->reloc_root; | ||
907 | reloc_root->last_trans = trans->transid; | ||
908 | return 0; | ||
909 | } | ||
910 | |||
911 | if (!root->fs_info->reloc_ctl || | ||
912 | !root->fs_info->reloc_ctl->create_reloc_root || | ||
913 | root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) | ||
914 | return 0; | ||
915 | |||
916 | root_item = kmalloc(sizeof(*root_item), GFP_NOFS); | ||
917 | BUG_ON(!root_item); | ||
918 | |||
919 | root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; | ||
920 | root_key.type = BTRFS_ROOT_ITEM_KEY; | ||
921 | root_key.offset = root->root_key.objectid; | ||
922 | |||
923 | ret = btrfs_copy_root(trans, root, root->commit_root, &eb, | ||
924 | BTRFS_TREE_RELOC_OBJECTID); | ||
925 | BUG_ON(ret); | ||
926 | |||
927 | btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1); | ||
928 | memcpy(root_item, &root->root_item, sizeof(*root_item)); | ||
929 | btrfs_set_root_refs(root_item, 1); | ||
930 | btrfs_set_root_bytenr(root_item, eb->start); | ||
931 | btrfs_set_root_level(root_item, btrfs_header_level(eb)); | ||
932 | btrfs_set_root_generation(root_item, trans->transid); | ||
933 | memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); | ||
934 | root_item->drop_level = 0; | ||
935 | |||
936 | btrfs_tree_unlock(eb); | ||
937 | free_extent_buffer(eb); | ||
938 | |||
939 | ret = btrfs_insert_root(trans, root->fs_info->tree_root, | ||
940 | &root_key, root_item); | ||
941 | BUG_ON(ret); | ||
942 | kfree(root_item); | ||
943 | |||
944 | reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, | ||
945 | &root_key); | ||
946 | BUG_ON(IS_ERR(reloc_root)); | ||
947 | reloc_root->last_trans = trans->transid; | ||
948 | |||
949 | __add_reloc_root(reloc_root); | ||
950 | root->reloc_root = reloc_root; | ||
951 | return 0; | ||
952 | } | ||
953 | |||
954 | /* | ||
955 | * update root item of reloc tree | ||
956 | */ | ||
957 | int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, | ||
958 | struct btrfs_root *root) | ||
959 | { | ||
960 | struct btrfs_root *reloc_root; | ||
961 | struct btrfs_root_item *root_item; | ||
962 | int del = 0; | ||
963 | int ret; | ||
964 | |||
965 | if (!root->reloc_root) | ||
966 | return 0; | ||
967 | |||
968 | reloc_root = root->reloc_root; | ||
969 | root_item = &reloc_root->root_item; | ||
970 | |||
971 | if (btrfs_root_refs(root_item) == 0) { | ||
972 | root->reloc_root = NULL; | ||
973 | del = 1; | ||
974 | } | ||
975 | |||
976 | __update_reloc_root(reloc_root, del); | ||
977 | |||
978 | if (reloc_root->commit_root != reloc_root->node) { | ||
979 | btrfs_set_root_node(root_item, reloc_root->node); | ||
980 | free_extent_buffer(reloc_root->commit_root); | ||
981 | reloc_root->commit_root = btrfs_root_node(reloc_root); | ||
982 | } | ||
983 | |||
984 | ret = btrfs_update_root(trans, root->fs_info->tree_root, | ||
985 | &reloc_root->root_key, root_item); | ||
986 | BUG_ON(ret); | ||
987 | return 0; | ||
988 | } | ||
989 | |||
990 | /* | ||
991 | * helper to find first cached inode with inode number >= objectid | ||
992 | * in a subvolume | ||
993 | */ | ||
994 | static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid) | ||
995 | { | ||
996 | struct rb_node *node; | ||
997 | struct rb_node *prev; | ||
998 | struct btrfs_inode *entry; | ||
999 | struct inode *inode; | ||
1000 | |||
1001 | spin_lock(&root->inode_lock); | ||
1002 | again: | ||
1003 | node = root->inode_tree.rb_node; | ||
1004 | prev = NULL; | ||
1005 | while (node) { | ||
1006 | prev = node; | ||
1007 | entry = rb_entry(node, struct btrfs_inode, rb_node); | ||
1008 | |||
1009 | if (objectid < entry->vfs_inode.i_ino) | ||
1010 | node = node->rb_left; | ||
1011 | else if (objectid > entry->vfs_inode.i_ino) | ||
1012 | node = node->rb_right; | ||
1013 | else | ||
1014 | break; | ||
1015 | } | ||
1016 | if (!node) { | ||
1017 | while (prev) { | ||
1018 | entry = rb_entry(prev, struct btrfs_inode, rb_node); | ||
1019 | if (objectid <= entry->vfs_inode.i_ino) { | ||
1020 | node = prev; | ||
1021 | break; | ||
1022 | } | ||
1023 | prev = rb_next(prev); | ||
1024 | } | ||
1025 | } | ||
1026 | while (node) { | ||
1027 | entry = rb_entry(node, struct btrfs_inode, rb_node); | ||
1028 | inode = igrab(&entry->vfs_inode); | ||
1029 | if (inode) { | ||
1030 | spin_unlock(&root->inode_lock); | ||
1031 | return inode; | ||
1032 | } | ||
1033 | |||
1034 | objectid = entry->vfs_inode.i_ino + 1; | ||
1035 | if (cond_resched_lock(&root->inode_lock)) | ||
1036 | goto again; | ||
1037 | |||
1038 | node = rb_next(node); | ||
1039 | } | ||
1040 | spin_unlock(&root->inode_lock); | ||
1041 | return NULL; | ||
1042 | } | ||
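
find_next_inode() only looks at inodes already present in the subvolume's in-memory inode tree and returns them with an igrab() reference held, so callers can sweep the cached inodes in ascending inode-number order. A hedged iteration sketch (hypothetical helper; the reference must be dropped with iput()):

static void visit_cached_inodes(struct btrfs_root *root)
{
	struct inode *inode;
	u64 objectid = 0;

	while ((inode = find_next_inode(root, objectid))) {
		/* resume the search just past this inode number */
		objectid = inode->i_ino + 1;

		/* ... per-inode work, e.g. dropping the extent cache ... */

		iput(inode);
	}
}
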
1043 | |||
1044 | static int in_block_group(u64 bytenr, | ||
1045 | struct btrfs_block_group_cache *block_group) | ||
1046 | { | ||
1047 | if (bytenr >= block_group->key.objectid && | ||
1048 | bytenr < block_group->key.objectid + block_group->key.offset) | ||
1049 | return 1; | ||
1050 | return 0; | ||
1051 | } | ||
1052 | |||
1053 | /* | ||
1054 | * get new location of data | ||
1055 | */ | ||
1056 | static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, | ||
1057 | u64 bytenr, u64 num_bytes) | ||
1058 | { | ||
1059 | struct btrfs_root *root = BTRFS_I(reloc_inode)->root; | ||
1060 | struct btrfs_path *path; | ||
1061 | struct btrfs_file_extent_item *fi; | ||
1062 | struct extent_buffer *leaf; | ||
1063 | int ret; | ||
1064 | |||
1065 | path = btrfs_alloc_path(); | ||
1066 | if (!path) | ||
1067 | return -ENOMEM; | ||
1068 | |||
1069 | bytenr -= BTRFS_I(reloc_inode)->index_cnt; | ||
1070 | ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino, | ||
1071 | bytenr, 0); | ||
1072 | if (ret < 0) | ||
1073 | goto out; | ||
1074 | if (ret > 0) { | ||
1075 | ret = -ENOENT; | ||
1076 | goto out; | ||
1077 | } | ||
1078 | |||
1079 | leaf = path->nodes[0]; | ||
1080 | fi = btrfs_item_ptr(leaf, path->slots[0], | ||
1081 | struct btrfs_file_extent_item); | ||
1082 | |||
1083 | BUG_ON(btrfs_file_extent_offset(leaf, fi) || | ||
1084 | btrfs_file_extent_compression(leaf, fi) || | ||
1085 | btrfs_file_extent_encryption(leaf, fi) || | ||
1086 | btrfs_file_extent_other_encoding(leaf, fi)); | ||
1087 | |||
1088 | if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) { | ||
1089 | ret = 1; | ||
1090 | goto out; | ||
1091 | } | ||
1092 | |||
1093 | if (new_bytenr) | ||
1094 | *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); | ||
1095 | ret = 0; | ||
1096 | out: | ||
1097 | btrfs_free_path(path); | ||
1098 | return ret; | ||
1099 | } | ||
1100 | |||
1101 | /* | ||
1102 | * update file extent items in the tree leaf to point to | ||
1103 | * the new locations. | ||
1104 | */ | ||
1105 | static int replace_file_extents(struct btrfs_trans_handle *trans, | ||
1106 | struct reloc_control *rc, | ||
1107 | struct btrfs_root *root, | ||
1108 | struct extent_buffer *leaf, | ||
1109 | struct list_head *inode_list) | ||
1110 | { | ||
1111 | struct btrfs_key key; | ||
1112 | struct btrfs_file_extent_item *fi; | ||
1113 | struct inode *inode = NULL; | ||
1114 | struct inodevec *ivec = NULL; | ||
1115 | u64 parent; | ||
1116 | u64 bytenr; | ||
1117 | u64 new_bytenr; | ||
1118 | u64 num_bytes; | ||
1119 | u64 end; | ||
1120 | u32 nritems; | ||
1121 | u32 i; | ||
1122 | int ret; | ||
1123 | int first = 1; | ||
1124 | int dirty = 0; | ||
1125 | |||
1126 | if (rc->stage != UPDATE_DATA_PTRS) | ||
1127 | return 0; | ||
1128 | |||
1129 | /* reloc trees always use full backref */ | ||
1130 | if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) | ||
1131 | parent = leaf->start; | ||
1132 | else | ||
1133 | parent = 0; | ||
1134 | |||
1135 | nritems = btrfs_header_nritems(leaf); | ||
1136 | for (i = 0; i < nritems; i++) { | ||
1137 | cond_resched(); | ||
1138 | btrfs_item_key_to_cpu(leaf, &key, i); | ||
1139 | if (key.type != BTRFS_EXTENT_DATA_KEY) | ||
1140 | continue; | ||
1141 | fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); | ||
1142 | if (btrfs_file_extent_type(leaf, fi) == | ||
1143 | BTRFS_FILE_EXTENT_INLINE) | ||
1144 | continue; | ||
1145 | bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); | ||
1146 | num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); | ||
1147 | if (bytenr == 0) | ||
1148 | continue; | ||
1149 | if (!in_block_group(bytenr, rc->block_group)) | ||
1150 | continue; | ||
1151 | |||
1152 | /* | ||
1153 | * if we are modifying block in fs tree, wait for readpage | ||
1154 | * to complete and drop the extent cache | ||
1155 | */ | ||
1156 | if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { | ||
1157 | if (!ivec || ivec->nr == INODEVEC_SIZE) { | ||
1158 | ivec = kmalloc(sizeof(*ivec), GFP_NOFS); | ||
1159 | BUG_ON(!ivec); | ||
1160 | ivec->nr = 0; | ||
1161 | list_add_tail(&ivec->list, inode_list); | ||
1162 | } | ||
1163 | if (first) { | ||
1164 | inode = find_next_inode(root, key.objectid); | ||
1165 | if (inode) | ||
1166 | ivec->inode[ivec->nr++] = inode; | ||
1167 | first = 0; | ||
1168 | } else if (inode && inode->i_ino < key.objectid) { | ||
1169 | inode = find_next_inode(root, key.objectid); | ||
1170 | if (inode) | ||
1171 | ivec->inode[ivec->nr++] = inode; | ||
1172 | } | ||
1173 | if (inode && inode->i_ino == key.objectid) { | ||
1174 | end = key.offset + | ||
1175 | btrfs_file_extent_num_bytes(leaf, fi); | ||
1176 | WARN_ON(!IS_ALIGNED(key.offset, | ||
1177 | root->sectorsize)); | ||
1178 | WARN_ON(!IS_ALIGNED(end, root->sectorsize)); | ||
1179 | end--; | ||
1180 | ret = try_lock_extent(&BTRFS_I(inode)->io_tree, | ||
1181 | key.offset, end, | ||
1182 | GFP_NOFS); | ||
1183 | if (!ret) | ||
1184 | continue; | ||
1185 | |||
1186 | btrfs_drop_extent_cache(inode, key.offset, end, | ||
1187 | 1); | ||
1188 | unlock_extent(&BTRFS_I(inode)->io_tree, | ||
1189 | key.offset, end, GFP_NOFS); | ||
1190 | } | ||
1191 | } | ||
1192 | |||
1193 | ret = get_new_location(rc->data_inode, &new_bytenr, | ||
1194 | bytenr, num_bytes); | ||
1195 | if (ret > 0) | ||
1196 | continue; | ||
1197 | BUG_ON(ret < 0); | ||
1198 | |||
1199 | btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); | ||
1200 | dirty = 1; | ||
1201 | |||
1202 | key.offset -= btrfs_file_extent_offset(leaf, fi); | ||
1203 | ret = btrfs_inc_extent_ref(trans, root, new_bytenr, | ||
1204 | num_bytes, parent, | ||
1205 | btrfs_header_owner(leaf), | ||
1206 | key.objectid, key.offset); | ||
1207 | BUG_ON(ret); | ||
1208 | |||
1209 | ret = btrfs_free_extent(trans, root, bytenr, num_bytes, | ||
1210 | parent, btrfs_header_owner(leaf), | ||
1211 | key.objectid, key.offset); | ||
1212 | BUG_ON(ret); | ||
1213 | } | ||
1214 | if (dirty) | ||
1215 | btrfs_mark_buffer_dirty(leaf); | ||
1216 | return 0; | ||
1217 | } | ||
1218 | |||
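| /* | ||
|  * compare the node key in 'eb' at 'slot' with the key at the current | ||
|  * slot of 'path' at 'level'.  returns 0 if the two keys are identical. | ||
|  */ | ||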
1219 | static noinline_for_stack | ||
1220 | int memcmp_node_keys(struct extent_buffer *eb, int slot, | ||
1221 | struct btrfs_path *path, int level) | ||
1222 | { | ||
1223 | struct btrfs_disk_key key1; | ||
1224 | struct btrfs_disk_key key2; | ||
1225 | btrfs_node_key(eb, &key1, slot); | ||
1226 | btrfs_node_key(path->nodes[level], &key2, path->slots[level]); | ||
1227 | return memcmp(&key1, &key2, sizeof(key1)); | ||
1228 | } | ||
1229 | |||
1230 | /* | ||
1231 | * try to replace tree blocks in fs tree with the new blocks | ||
1232 | * in reloc tree. tree blocks that haven't been modified since | ||
1233 | * the reloc tree was created can be replaced. | ||
1234 | * | ||
1235 | * if a block was replaced, level of the block + 1 is returned. | ||
1236 | * if no block got replaced, 0 is returned. if there are other | ||
1237 | * errors, a negative error number is returned. | ||
1238 | */ | ||
1239 | static int replace_path(struct btrfs_trans_handle *trans, | ||
1240 | struct btrfs_root *dest, struct btrfs_root *src, | ||
1241 | struct btrfs_path *path, struct btrfs_key *next_key, | ||
1242 | struct extent_buffer **leaf, | ||
1243 | int lowest_level, int max_level) | ||
1244 | { | ||
1245 | struct extent_buffer *eb; | ||
1246 | struct extent_buffer *parent; | ||
1247 | struct btrfs_key key; | ||
1248 | u64 old_bytenr; | ||
1249 | u64 new_bytenr; | ||
1250 | u64 old_ptr_gen; | ||
1251 | u64 new_ptr_gen; | ||
1252 | u64 last_snapshot; | ||
1253 | u32 blocksize; | ||
1254 | int level; | ||
1255 | int ret; | ||
1256 | int slot; | ||
1257 | |||
1258 | BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); | ||
1259 | BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); | ||
1260 | BUG_ON(lowest_level > 1 && leaf); | ||
1261 | |||
1262 | last_snapshot = btrfs_root_last_snapshot(&src->root_item); | ||
1263 | |||
1264 | slot = path->slots[lowest_level]; | ||
1265 | btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); | ||
1266 | |||
1267 | eb = btrfs_lock_root_node(dest); | ||
1268 | btrfs_set_lock_blocking(eb); | ||
1269 | level = btrfs_header_level(eb); | ||
1270 | |||
1271 | if (level < lowest_level) { | ||
1272 | btrfs_tree_unlock(eb); | ||
1273 | free_extent_buffer(eb); | ||
1274 | return 0; | ||
1275 | } | ||
1276 | |||
1277 | ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); | ||
1278 | BUG_ON(ret); | ||
1279 | btrfs_set_lock_blocking(eb); | ||
1280 | |||
1281 | if (next_key) { | ||
1282 | next_key->objectid = (u64)-1; | ||
1283 | next_key->type = (u8)-1; | ||
1284 | next_key->offset = (u64)-1; | ||
1285 | } | ||
1286 | |||
1287 | parent = eb; | ||
1288 | while (1) { | ||
1289 | level = btrfs_header_level(parent); | ||
1290 | BUG_ON(level < lowest_level); | ||
1291 | |||
1292 | ret = btrfs_bin_search(parent, &key, level, &slot); | ||
1293 | if (ret && slot > 0) | ||
1294 | slot--; | ||
1295 | |||
1296 | if (next_key && slot + 1 < btrfs_header_nritems(parent)) | ||
1297 | btrfs_node_key_to_cpu(parent, next_key, slot + 1); | ||
1298 | |||
1299 | old_bytenr = btrfs_node_blockptr(parent, slot); | ||
1300 | blocksize = btrfs_level_size(dest, level - 1); | ||
1301 | old_ptr_gen = btrfs_node_ptr_generation(parent, slot); | ||
1302 | |||
1303 | if (level <= max_level) { | ||
1304 | eb = path->nodes[level]; | ||
1305 | new_bytenr = btrfs_node_blockptr(eb, | ||
1306 | path->slots[level]); | ||
1307 | new_ptr_gen = btrfs_node_ptr_generation(eb, | ||
1308 | path->slots[level]); | ||
1309 | } else { | ||
1310 | new_bytenr = 0; | ||
1311 | new_ptr_gen = 0; | ||
1312 | } | ||
1313 | |||
1314 | if (new_bytenr > 0 && new_bytenr == old_bytenr) { | ||
1315 | WARN_ON(1); | ||
1316 | ret = level; | ||
1317 | break; | ||
1318 | } | ||
1319 | |||
1320 | if (new_bytenr == 0 || old_ptr_gen > last_snapshot || | ||
1321 | memcmp_node_keys(parent, slot, path, level)) { | ||
1322 | if (level <= lowest_level && !leaf) { | ||
1323 | ret = 0; | ||
1324 | break; | ||
1325 | } | ||
1326 | |||
1327 | eb = read_tree_block(dest, old_bytenr, blocksize, | ||
1328 | old_ptr_gen); | ||
1329 | btrfs_tree_lock(eb); | ||
1330 | ret = btrfs_cow_block(trans, dest, eb, parent, | ||
1331 | slot, &eb); | ||
1332 | BUG_ON(ret); | ||
1333 | btrfs_set_lock_blocking(eb); | ||
1334 | |||
1335 | if (level <= lowest_level) { | ||
1336 | *leaf = eb; | ||
1337 | ret = 0; | ||
1338 | break; | ||
1339 | } | ||
1340 | |||
1341 | btrfs_tree_unlock(parent); | ||
1342 | free_extent_buffer(parent); | ||
1343 | |||
1344 | parent = eb; | ||
1345 | continue; | ||
1346 | } | ||
1347 | |||
1348 | btrfs_node_key_to_cpu(path->nodes[level], &key, | ||
1349 | path->slots[level]); | ||
1350 | btrfs_release_path(src, path); | ||
1351 | |||
1352 | path->lowest_level = level; | ||
1353 | ret = btrfs_search_slot(trans, src, &key, path, 0, 1); | ||
1354 | path->lowest_level = 0; | ||
1355 | BUG_ON(ret); | ||
1356 | |||
1357 | /* | ||
1358 | * swap blocks in fs tree and reloc tree. | ||
1359 | */ | ||
1360 | btrfs_set_node_blockptr(parent, slot, new_bytenr); | ||
1361 | btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); | ||
1362 | btrfs_mark_buffer_dirty(parent); | ||
1363 | |||
1364 | btrfs_set_node_blockptr(path->nodes[level], | ||
1365 | path->slots[level], old_bytenr); | ||
1366 | btrfs_set_node_ptr_generation(path->nodes[level], | ||
1367 | path->slots[level], old_ptr_gen); | ||
1368 | btrfs_mark_buffer_dirty(path->nodes[level]); | ||
1369 | |||
1370 | ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, | ||
1371 | path->nodes[level]->start, | ||
1372 | src->root_key.objectid, level - 1, 0); | ||
1373 | BUG_ON(ret); | ||
1374 | ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, | ||
1375 | 0, dest->root_key.objectid, level - 1, | ||
1376 | 0); | ||
1377 | BUG_ON(ret); | ||
1378 | |||
1379 | ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, | ||
1380 | path->nodes[level]->start, | ||
1381 | src->root_key.objectid, level - 1, 0); | ||
1382 | BUG_ON(ret); | ||
1383 | |||
1384 | ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, | ||
1385 | 0, dest->root_key.objectid, level - 1, | ||
1386 | 0); | ||
1387 | BUG_ON(ret); | ||
1388 | |||
1389 | btrfs_unlock_up_safe(path, 0); | ||
1390 | |||
1391 | ret = level; | ||
1392 | break; | ||
1393 | } | ||
1394 | btrfs_tree_unlock(parent); | ||
1395 | free_extent_buffer(parent); | ||
1396 | return ret; | ||
1397 | } | ||
1398 | |||
1399 | /* | ||
1400 | * helper to find next relocated block in reloc tree | ||
1401 | */ | ||
1402 | static noinline_for_stack | ||
1403 | int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, | ||
1404 | int *level) | ||
1405 | { | ||
1406 | struct extent_buffer *eb; | ||
1407 | int i; | ||
1408 | u64 last_snapshot; | ||
1409 | u32 nritems; | ||
1410 | |||
1411 | last_snapshot = btrfs_root_last_snapshot(&root->root_item); | ||
1412 | |||
1413 | for (i = 0; i < *level; i++) { | ||
1414 | free_extent_buffer(path->nodes[i]); | ||
1415 | path->nodes[i] = NULL; | ||
1416 | } | ||
1417 | |||
1418 | for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { | ||
1419 | eb = path->nodes[i]; | ||
1420 | nritems = btrfs_header_nritems(eb); | ||
1421 | while (path->slots[i] + 1 < nritems) { | ||
1422 | path->slots[i]++; | ||
1423 | if (btrfs_node_ptr_generation(eb, path->slots[i]) <= | ||
1424 | last_snapshot) | ||
1425 | continue; | ||
1426 | |||
1427 | *level = i; | ||
1428 | return 0; | ||
1429 | } | ||
1430 | free_extent_buffer(path->nodes[i]); | ||
1431 | path->nodes[i] = NULL; | ||
1432 | } | ||
1433 | return 1; | ||
1434 | } | ||
1435 | |||
1436 | /* | ||
1437 | * walk down reloc tree to find relocated block of lowest level | ||
1438 | */ | ||
1439 | static noinline_for_stack | ||
1440 | int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, | ||
1441 | int *level) | ||
1442 | { | ||
1443 | struct extent_buffer *eb = NULL; | ||
1444 | int i; | ||
1445 | u64 bytenr; | ||
1446 | u64 ptr_gen = 0; | ||
1447 | u64 last_snapshot; | ||
1448 | u32 blocksize; | ||
1449 | u32 nritems; | ||
1450 | |||
1451 | last_snapshot = btrfs_root_last_snapshot(&root->root_item); | ||
1452 | |||
1453 | for (i = *level; i > 0; i--) { | ||
1454 | eb = path->nodes[i]; | ||
1455 | nritems = btrfs_header_nritems(eb); | ||
1456 | while (path->slots[i] < nritems) { | ||
1457 | ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]); | ||
1458 | if (ptr_gen > last_snapshot) | ||
1459 | break; | ||
1460 | path->slots[i]++; | ||
1461 | } | ||
1462 | if (path->slots[i] >= nritems) { | ||
1463 | if (i == *level) | ||
1464 | break; | ||
1465 | *level = i + 1; | ||
1466 | return 0; | ||
1467 | } | ||
1468 | if (i == 1) { | ||
1469 | *level = i; | ||
1470 | return 0; | ||
1471 | } | ||
1472 | |||
1473 | bytenr = btrfs_node_blockptr(eb, path->slots[i]); | ||
1474 | blocksize = btrfs_level_size(root, i - 1); | ||
1475 | eb = read_tree_block(root, bytenr, blocksize, ptr_gen); | ||
1476 | BUG_ON(btrfs_header_level(eb) != i - 1); | ||
1477 | path->nodes[i - 1] = eb; | ||
1478 | path->slots[i - 1] = 0; | ||
1479 | } | ||
1480 | return 1; | ||
1481 | } | ||
1482 | |||
1483 | /* | ||
1484 | * invalidate extent cache for file extents whose key is in the range | ||
1485 | * [min_key, max_key) | ||
1486 | */ | ||
1487 | static int invalidate_extent_cache(struct btrfs_root *root, | ||
1488 | struct btrfs_key *min_key, | ||
1489 | struct btrfs_key *max_key) | ||
1490 | { | ||
1491 | struct inode *inode = NULL; | ||
1492 | u64 objectid; | ||
1493 | u64 start, end; | ||
1494 | |||
1495 | objectid = min_key->objectid; | ||
1496 | while (1) { | ||
1497 | cond_resched(); | ||
1498 | iput(inode); | ||
1499 | |||
1500 | if (objectid > max_key->objectid) | ||
1501 | break; | ||
1502 | |||
1503 | inode = find_next_inode(root, objectid); | ||
1504 | if (!inode) | ||
1505 | break; | ||
1506 | |||
1507 | if (inode->i_ino > max_key->objectid) { | ||
1508 | iput(inode); | ||
1509 | break; | ||
1510 | } | ||
1511 | |||
1512 | objectid = inode->i_ino + 1; | ||
1513 | if (!S_ISREG(inode->i_mode)) | ||
1514 | continue; | ||
1515 | |||
1516 | if (unlikely(min_key->objectid == inode->i_ino)) { | ||
1517 | if (min_key->type > BTRFS_EXTENT_DATA_KEY) | ||
1518 | continue; | ||
1519 | if (min_key->type < BTRFS_EXTENT_DATA_KEY) | ||
1520 | start = 0; | ||
1521 | else { | ||
1522 | start = min_key->offset; | ||
1523 | WARN_ON(!IS_ALIGNED(start, root->sectorsize)); | ||
1524 | } | ||
1525 | } else { | ||
1526 | start = 0; | ||
1527 | } | ||
1528 | |||
1529 | if (unlikely(max_key->objectid == inode->i_ino)) { | ||
1530 | if (max_key->type < BTRFS_EXTENT_DATA_KEY) | ||
1531 | continue; | ||
1532 | if (max_key->type > BTRFS_EXTENT_DATA_KEY) { | ||
1533 | end = (u64)-1; | ||
1534 | } else { | ||
1535 | if (max_key->offset == 0) | ||
1536 | continue; | ||
1537 | end = max_key->offset; | ||
1538 | WARN_ON(!IS_ALIGNED(end, root->sectorsize)); | ||
1539 | end--; | ||
1540 | } | ||
1541 | } else { | ||
1542 | end = (u64)-1; | ||
1543 | } | ||
1544 | |||
1545 | /* the lock_extent waits for readpage to complete */ | ||
1546 | lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); | ||
1547 | btrfs_drop_extent_cache(inode, start, end, 1); | ||
1548 | unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); | ||
1549 | } | ||
1550 | return 0; | ||
1551 | } | ||
1552 | |||
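| /* | ||
|  * walk up the path starting at 'level' looking for the next slot. | ||
|  * returns 0 and fills in 'key' if a next key exists, 1 otherwise. | ||
|  */ | ||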
1553 | static int find_next_key(struct btrfs_path *path, int level, | ||
1554 | struct btrfs_key *key) | ||
1555 | |||
1556 | { | ||
1557 | while (level < BTRFS_MAX_LEVEL) { | ||
1558 | if (!path->nodes[level]) | ||
1559 | break; | ||
1560 | if (path->slots[level] + 1 < | ||
1561 | btrfs_header_nritems(path->nodes[level])) { | ||
1562 | btrfs_node_key_to_cpu(path->nodes[level], key, | ||
1563 | path->slots[level] + 1); | ||
1564 | return 0; | ||
1565 | } | ||
1566 | level++; | ||
1567 | } | ||
1568 | return 1; | ||
1569 | } | ||
1570 | |||
1571 | /* | ||
1572 | * merge the relocated tree blocks in reloc tree with corresponding | ||
1573 | * fs tree. | ||
1574 | */ | ||
1575 | static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, | ||
1576 | struct btrfs_root *root) | ||
1577 | { | ||
1578 | LIST_HEAD(inode_list); | ||
1579 | struct btrfs_key key; | ||
1580 | struct btrfs_key next_key; | ||
1581 | struct btrfs_trans_handle *trans; | ||
1582 | struct btrfs_root *reloc_root; | ||
1583 | struct btrfs_root_item *root_item; | ||
1584 | struct btrfs_path *path; | ||
1585 | struct extent_buffer *leaf = NULL; | ||
1586 | unsigned long nr; | ||
1587 | int level; | ||
1588 | int max_level; | ||
1589 | int replaced = 0; | ||
1590 | int ret; | ||
1591 | int err = 0; | ||
1592 | |||
1593 | path = btrfs_alloc_path(); | ||
1594 | if (!path) | ||
1595 | return -ENOMEM; | ||
1596 | |||
1597 | reloc_root = root->reloc_root; | ||
1598 | root_item = &reloc_root->root_item; | ||
1599 | |||
1600 | if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { | ||
1601 | level = btrfs_root_level(root_item); | ||
1602 | extent_buffer_get(reloc_root->node); | ||
1603 | path->nodes[level] = reloc_root->node; | ||
1604 | path->slots[level] = 0; | ||
1605 | } else { | ||
1606 | btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); | ||
1607 | |||
1608 | level = root_item->drop_level; | ||
1609 | BUG_ON(level == 0); | ||
1610 | path->lowest_level = level; | ||
1611 | ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); | ||
1612 | if (ret < 0) { | ||
1613 | btrfs_free_path(path); | ||
1614 | return ret; | ||
1615 | } | ||
1616 | |||
1617 | btrfs_node_key_to_cpu(path->nodes[level], &next_key, | ||
1618 | path->slots[level]); | ||
1619 | WARN_ON(memcmp(&key, &next_key, sizeof(key))); | ||
1620 | |||
1621 | btrfs_unlock_up_safe(path, 0); | ||
1622 | } | ||
1623 | |||
1624 | if (level == 0 && rc->stage == UPDATE_DATA_PTRS) { | ||
1625 | trans = btrfs_start_transaction(root, 1); | ||
1626 | |||
1627 | leaf = path->nodes[0]; | ||
1628 | btrfs_item_key_to_cpu(leaf, &key, 0); | ||
1629 | btrfs_release_path(reloc_root, path); | ||
1630 | |||
1631 | ret = btrfs_search_slot(trans, root, &key, path, 0, 1); | ||
1632 | if (ret < 0) { | ||
1633 | err = ret; | ||
1634 | goto out; | ||
1635 | } | ||
1636 | |||
1637 | leaf = path->nodes[0]; | ||
1638 | btrfs_unlock_up_safe(path, 1); | ||
1639 | ret = replace_file_extents(trans, rc, root, leaf, | ||
1640 | &inode_list); | ||
1641 | if (ret < 0) | ||
1642 | err = ret; | ||
1643 | goto out; | ||
1644 | } | ||
1645 | |||
1646 | memset(&next_key, 0, sizeof(next_key)); | ||
1647 | |||
1648 | while (1) { | ||
1649 | leaf = NULL; | ||
1650 | replaced = 0; | ||
1651 | trans = btrfs_start_transaction(root, 1); | ||
1652 | max_level = level; | ||
1653 | |||
1654 | ret = walk_down_reloc_tree(reloc_root, path, &level); | ||
1655 | if (ret < 0) { | ||
1656 | err = ret; | ||
1657 | goto out; | ||
1658 | } | ||
1659 | if (ret > 0) | ||
1660 | break; | ||
1661 | |||
1662 | if (!find_next_key(path, level, &key) && | ||
1663 | btrfs_comp_cpu_keys(&next_key, &key) >= 0) { | ||
1664 | ret = 0; | ||
1665 | } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) { | ||
1666 | ret = replace_path(trans, root, reloc_root, | ||
1667 | path, &next_key, &leaf, | ||
1668 | level, max_level); | ||
1669 | } else { | ||
1670 | ret = replace_path(trans, root, reloc_root, | ||
1671 | path, &next_key, NULL, | ||
1672 | level, max_level); | ||
1673 | } | ||
1674 | if (ret < 0) { | ||
1675 | err = ret; | ||
1676 | goto out; | ||
1677 | } | ||
1678 | |||
1679 | if (ret > 0) { | ||
1680 | level = ret; | ||
1681 | btrfs_node_key_to_cpu(path->nodes[level], &key, | ||
1682 | path->slots[level]); | ||
1683 | replaced = 1; | ||
1684 | } else if (leaf) { | ||
1685 | /* | ||
1686 | * no block got replaced, try replacing file extents | ||
1687 | */ | ||
1688 | btrfs_item_key_to_cpu(leaf, &key, 0); | ||
1689 | ret = replace_file_extents(trans, rc, root, leaf, | ||
1690 | &inode_list); | ||
1691 | btrfs_tree_unlock(leaf); | ||
1692 | free_extent_buffer(leaf); | ||
1693 | BUG_ON(ret < 0); | ||
1694 | } | ||
1695 | |||
1696 | ret = walk_up_reloc_tree(reloc_root, path, &level); | ||
1697 | if (ret > 0) | ||
1698 | break; | ||
1699 | |||
1700 | BUG_ON(level == 0); | ||
1701 | /* | ||
1702 | * save the merging progress in the drop_progress. | ||
1703 | * this is OK since root refs == 1 in this case. | ||
1704 | */ | ||
1705 | btrfs_node_key(path->nodes[level], &root_item->drop_progress, | ||
1706 | path->slots[level]); | ||
1707 | root_item->drop_level = level; | ||
1708 | |||
1709 | nr = trans->blocks_used; | ||
1710 | btrfs_end_transaction(trans, root); | ||
1711 | |||
1712 | btrfs_btree_balance_dirty(root, nr); | ||
1713 | |||
1714 | if (replaced && rc->stage == UPDATE_DATA_PTRS) | ||
1715 | invalidate_extent_cache(root, &key, &next_key); | ||
1716 | } | ||
1717 | |||
1718 | /* | ||
1719 | * handle the case where only one block in the fs tree needs to be | ||
1720 | * relocated and the block is the tree root. | ||
1721 | */ | ||
1722 | leaf = btrfs_lock_root_node(root); | ||
1723 | ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf); | ||
1724 | btrfs_tree_unlock(leaf); | ||
1725 | free_extent_buffer(leaf); | ||
1726 | if (ret < 0) | ||
1727 | err = ret; | ||
1728 | out: | ||
1729 | btrfs_free_path(path); | ||
1730 | |||
1731 | if (err == 0) { | ||
1732 | memset(&root_item->drop_progress, 0, | ||
1733 | sizeof(root_item->drop_progress)); | ||
1734 | root_item->drop_level = 0; | ||
1735 | btrfs_set_root_refs(root_item, 0); | ||
1736 | } | ||
1737 | |||
1738 | nr = trans->blocks_used; | ||
1739 | btrfs_end_transaction(trans, root); | ||
1740 | |||
1741 | btrfs_btree_balance_dirty(root, nr); | ||
1742 | |||
1743 | /* | ||
1744 | * put inodes while we aren't holding the tree locks | ||
1745 | */ | ||
1746 | while (!list_empty(&inode_list)) { | ||
1747 | struct inodevec *ivec; | ||
1748 | ivec = list_entry(inode_list.next, struct inodevec, list); | ||
1749 | list_del(&ivec->list); | ||
1750 | while (ivec->nr > 0) { | ||
1751 | ivec->nr--; | ||
1752 | iput(ivec->inode[ivec->nr]); | ||
1753 | } | ||
1754 | kfree(ivec); | ||
1755 | } | ||
1756 | |||
1757 | if (replaced && rc->stage == UPDATE_DATA_PTRS) | ||
1758 | invalidate_extent_cache(root, &key, &next_key); | ||
1759 | |||
1760 | return err; | ||
1761 | } | ||
1762 | |||
1763 | /* | ||
1764 | * callback for the work threads. | ||
1765 | * this function merges reloc tree with corresponding fs tree, | ||
1766 | * and then drops the reloc tree. | ||
1767 | */ | ||
1768 | static void merge_func(struct btrfs_work *work) | ||
1769 | { | ||
1770 | struct btrfs_trans_handle *trans; | ||
1771 | struct btrfs_root *root; | ||
1772 | struct btrfs_root *reloc_root; | ||
1773 | struct async_merge *async; | ||
1774 | |||
1775 | async = container_of(work, struct async_merge, work); | ||
1776 | reloc_root = async->root; | ||
1777 | |||
1778 | if (btrfs_root_refs(&reloc_root->root_item) > 0) { | ||
1779 | root = read_fs_root(reloc_root->fs_info, | ||
1780 | reloc_root->root_key.offset); | ||
1781 | BUG_ON(IS_ERR(root)); | ||
1782 | BUG_ON(root->reloc_root != reloc_root); | ||
1783 | |||
1784 | merge_reloc_root(async->rc, root); | ||
1785 | |||
1786 | trans = btrfs_start_transaction(root, 1); | ||
1787 | btrfs_update_reloc_root(trans, root); | ||
1788 | btrfs_end_transaction(trans, root); | ||
1789 | } | ||
1790 | |||
1791 | btrfs_drop_dead_root(reloc_root); | ||
1792 | |||
1793 | if (atomic_dec_and_test(async->num_pending)) | ||
1794 | complete(async->done); | ||
1795 | |||
1796 | kfree(async); | ||
1797 | } | ||
1798 | |||
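| /* | ||
|  * queue one merge_func work item per reloc root and wait for all of | ||
|  * them to finish.  num_pending starts at one so the completion cannot | ||
|  * fire until every root has been queued. | ||
|  */ | ||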
1799 | static int merge_reloc_roots(struct reloc_control *rc) | ||
1800 | { | ||
1801 | struct async_merge *async; | ||
1802 | struct btrfs_root *root; | ||
1803 | struct completion done; | ||
1804 | atomic_t num_pending; | ||
1805 | |||
1806 | init_completion(&done); | ||
1807 | atomic_set(&num_pending, 1); | ||
1808 | |||
1809 | while (!list_empty(&rc->reloc_roots)) { | ||
1810 | root = list_entry(rc->reloc_roots.next, | ||
1811 | struct btrfs_root, root_list); | ||
1812 | list_del_init(&root->root_list); | ||
1813 | |||
1814 | async = kmalloc(sizeof(*async), GFP_NOFS); | ||
1815 | BUG_ON(!async); | ||
1816 | async->work.func = merge_func; | ||
1817 | async->work.flags = 0; | ||
1818 | async->rc = rc; | ||
1819 | async->root = root; | ||
1820 | async->done = &done; | ||
1821 | async->num_pending = &num_pending; | ||
1822 | atomic_inc(&num_pending); | ||
1823 | btrfs_queue_worker(&rc->workers, &async->work); | ||
1824 | } | ||
1825 | |||
1826 | if (!atomic_dec_and_test(&num_pending)) | ||
1827 | wait_for_completion(&done); | ||
1828 | |||
1829 | BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); | ||
1830 | return 0; | ||
1831 | } | ||
1832 | |||
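| /* remove and free every tree_block entry in the rb tree */ | ||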
1833 | static void free_block_list(struct rb_root *blocks) | ||
1834 | { | ||
1835 | struct tree_block *block; | ||
1836 | struct rb_node *rb_node; | ||
1837 | while ((rb_node = rb_first(blocks))) { | ||
1838 | block = rb_entry(rb_node, struct tree_block, rb_node); | ||
1839 | rb_erase(rb_node, blocks); | ||
1840 | kfree(block); | ||
1841 | } | ||
1842 | } | ||
1843 | |||
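| /* | ||
|  * make sure the fs root that owns 'reloc_root' is recorded in the | ||
|  * current transaction.  nothing to do if the reloc root was already | ||
|  * updated in this transaction. | ||
|  */ | ||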
1844 | static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, | ||
1845 | struct btrfs_root *reloc_root) | ||
1846 | { | ||
1847 | struct btrfs_root *root; | ||
1848 | |||
1849 | if (reloc_root->last_trans == trans->transid) | ||
1850 | return 0; | ||
1851 | |||
1852 | root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset); | ||
1853 | BUG_ON(IS_ERR(root)); | ||
1854 | BUG_ON(root->reloc_root != reloc_root); | ||
1855 | |||
1856 | return btrfs_record_root_in_trans(trans, root); | ||
1857 | } | ||
1858 | |||
1859 | /* | ||
1860 | * select one tree from the trees that reference the block. | ||
1861 | * for blocks in reference counted trees, we prefer the reloc tree. | ||
1862 | * if no reloc tree is found and reloc_only is true, NULL is returned. | ||
1863 | */ | ||
1864 | static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans, | ||
1865 | struct backref_node *node, | ||
1866 | struct backref_edge *edges[], | ||
1867 | int *nr, int reloc_only) | ||
1868 | { | ||
1869 | struct backref_node *next; | ||
1870 | struct btrfs_root *root; | ||
1871 | int index; | ||
1872 | int loop = 0; | ||
1873 | again: | ||
1874 | index = 0; | ||
1875 | next = node; | ||
1876 | while (1) { | ||
1877 | cond_resched(); | ||
1878 | next = walk_up_backref(next, edges, &index); | ||
1879 | root = next->root; | ||
1880 | if (!root) { | ||
1881 | BUG_ON(!node->old_root); | ||
1882 | goto skip; | ||
1883 | } | ||
1884 | |||
1885 | /* no other choice for non-reference counted tree */ | ||
1886 | if (!root->ref_cows) { | ||
1887 | BUG_ON(reloc_only); | ||
1888 | break; | ||
1889 | } | ||
1890 | |||
1891 | if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { | ||
1892 | record_reloc_root_in_trans(trans, root); | ||
1893 | break; | ||
1894 | } | ||
1895 | |||
1896 | if (loop) { | ||
1897 | btrfs_record_root_in_trans(trans, root); | ||
1898 | break; | ||
1899 | } | ||
1900 | |||
1901 | if (reloc_only || next != node) { | ||
1902 | if (!root->reloc_root) | ||
1903 | btrfs_record_root_in_trans(trans, root); | ||
1904 | root = root->reloc_root; | ||
1905 | /* | ||
1906 | * if the reloc tree was created in the current | ||
1907 | * transaction, there is no node in the backref tree | ||
1908 | * corresponding to the root of the reloc tree. | ||
1909 | */ | ||
1910 | if (btrfs_root_last_snapshot(&root->root_item) == | ||
1911 | trans->transid - 1) | ||
1912 | break; | ||
1913 | } | ||
1914 | skip: | ||
1915 | root = NULL; | ||
1916 | next = walk_down_backref(edges, &index); | ||
1917 | if (!next || next->level <= node->level) | ||
1918 | break; | ||
1919 | } | ||
1920 | |||
1921 | if (!root && !loop && !reloc_only) { | ||
1922 | loop = 1; | ||
1923 | goto again; | ||
1924 | } | ||
1925 | |||
1926 | if (root) | ||
1927 | *nr = index; | ||
1928 | else | ||
1929 | *nr = 0; | ||
1930 | |||
1931 | return root; | ||
1932 | } | ||
1933 | |||
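| /* select any tree that references the block, preferring a reloc tree */ | ||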
1934 | static noinline_for_stack | ||
1935 | struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, | ||
1936 | struct backref_node *node) | ||
1937 | { | ||
1938 | struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; | ||
1939 | int nr; | ||
1940 | return __select_one_root(trans, node, edges, &nr, 0); | ||
1941 | } | ||
1942 | |||
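| /* select a reloc tree that references the block, or NULL if none */ | ||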
1943 | static noinline_for_stack | ||
1944 | struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, | ||
1945 | struct backref_node *node, | ||
1946 | struct backref_edge *edges[], int *nr) | ||
1947 | { | ||
1948 | return __select_one_root(trans, node, edges, nr, 1); | ||
1949 | } | ||
1950 | |||
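| /* | ||
|  * transfer the extent buffers and locks held in 'path' to the backref | ||
|  * nodes along the path described by 'edges', walking up 'nr' levels, | ||
|  * and record each node's buffer start on the edge above it. | ||
|  */ | ||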
1951 | static void grab_path_buffers(struct btrfs_path *path, | ||
1952 | struct backref_node *node, | ||
1953 | struct backref_edge *edges[], int nr) | ||
1954 | { | ||
1955 | int i = 0; | ||
1956 | while (1) { | ||
1957 | drop_node_buffer(node); | ||
1958 | node->eb = path->nodes[node->level]; | ||
1959 | BUG_ON(!node->eb); | ||
1960 | if (path->locks[node->level]) | ||
1961 | node->locked = 1; | ||
1962 | path->nodes[node->level] = NULL; | ||
1963 | path->locks[node->level] = 0; | ||
1964 | |||
1965 | if (i >= nr) | ||
1966 | break; | ||
1967 | |||
1968 | edges[i]->blockptr = node->eb->start; | ||
1969 | node = edges[i]->node[UPPER]; | ||
1970 | i++; | ||
1971 | } | ||
1972 | } | ||
1973 | |||
1974 | /* | ||
1975 | * relocate a tree block, and then update pointers in upper level | ||
1976 | * blocks that reference the block to point to the new location. | ||
1977 | * | ||
1978 | * if called by link_to_upper, the block has already been relocated. | ||
1979 | * in that case this function just updates pointers. | ||
1980 | */ | ||
1981 | static int do_relocation(struct btrfs_trans_handle *trans, | ||
1982 | struct backref_node *node, | ||
1983 | struct btrfs_key *key, | ||
1984 | struct btrfs_path *path, int lowest) | ||
1985 | { | ||
1986 | struct backref_node *upper; | ||
1987 | struct backref_edge *edge; | ||
1988 | struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; | ||
1989 | struct btrfs_root *root; | ||
1990 | struct extent_buffer *eb; | ||
1991 | u32 blocksize; | ||
1992 | u64 bytenr; | ||
1993 | u64 generation; | ||
1994 | int nr; | ||
1995 | int slot; | ||
1996 | int ret; | ||
1997 | int err = 0; | ||
1998 | |||
1999 | BUG_ON(lowest && node->eb); | ||
2000 | |||
2001 | path->lowest_level = node->level + 1; | ||
2002 | list_for_each_entry(edge, &node->upper, list[LOWER]) { | ||
2003 | cond_resched(); | ||
2004 | if (node->eb && node->eb->start == edge->blockptr) | ||
2005 | continue; | ||
2006 | |||
2007 | upper = edge->node[UPPER]; | ||
2008 | root = select_reloc_root(trans, upper, edges, &nr); | ||
2009 | if (!root) | ||
2010 | continue; | ||
2011 | |||
2012 | if (upper->eb && !upper->locked) | ||
2013 | drop_node_buffer(upper); | ||
2014 | |||
2015 | if (!upper->eb) { | ||
2016 | ret = btrfs_search_slot(trans, root, key, path, 0, 1); | ||
2017 | if (ret < 0) { | ||
2018 | err = ret; | ||
2019 | break; | ||
2020 | } | ||
2021 | BUG_ON(ret > 0); | ||
2022 | |||
2023 | slot = path->slots[upper->level]; | ||
2024 | |||
2025 | btrfs_unlock_up_safe(path, upper->level + 1); | ||
2026 | grab_path_buffers(path, upper, edges, nr); | ||
2027 | |||
2028 | btrfs_release_path(NULL, path); | ||
2029 | } else { | ||
2030 | ret = btrfs_bin_search(upper->eb, key, upper->level, | ||
2031 | &slot); | ||
2032 | BUG_ON(ret); | ||
2033 | } | ||
2034 | |||
2035 | bytenr = btrfs_node_blockptr(upper->eb, slot); | ||
2036 | if (!lowest) { | ||
2037 | if (node->eb->start == bytenr) { | ||
2038 | btrfs_tree_unlock(upper->eb); | ||
2039 | upper->locked = 0; | ||
2040 | continue; | ||
2041 | } | ||
2042 | } else { | ||
2043 | BUG_ON(node->bytenr != bytenr); | ||
2044 | } | ||
2045 | |||
2046 | blocksize = btrfs_level_size(root, node->level); | ||
2047 | generation = btrfs_node_ptr_generation(upper->eb, slot); | ||
2048 | eb = read_tree_block(root, bytenr, blocksize, generation); | ||
2049 | btrfs_tree_lock(eb); | ||
2050 | btrfs_set_lock_blocking(eb); | ||
2051 | |||
2052 | if (!node->eb) { | ||
2053 | ret = btrfs_cow_block(trans, root, eb, upper->eb, | ||
2054 | slot, &eb); | ||
2055 | if (ret < 0) { | ||
2056 | err = ret; | ||
2057 | break; | ||
2058 | } | ||
2059 | btrfs_set_lock_blocking(eb); | ||
2060 | node->eb = eb; | ||
2061 | node->locked = 1; | ||
2062 | } else { | ||
2063 | btrfs_set_node_blockptr(upper->eb, slot, | ||
2064 | node->eb->start); | ||
2065 | btrfs_set_node_ptr_generation(upper->eb, slot, | ||
2066 | trans->transid); | ||
2067 | btrfs_mark_buffer_dirty(upper->eb); | ||
2068 | |||
2069 | ret = btrfs_inc_extent_ref(trans, root, | ||
2070 | node->eb->start, blocksize, | ||
2071 | upper->eb->start, | ||
2072 | btrfs_header_owner(upper->eb), | ||
2073 | node->level, 0); | ||
2074 | BUG_ON(ret); | ||
2075 | |||
2076 | ret = btrfs_drop_subtree(trans, root, eb, upper->eb); | ||
2077 | BUG_ON(ret); | ||
2078 | |||
2079 | btrfs_tree_unlock(eb); | ||
2080 | free_extent_buffer(eb); | ||
2081 | } | ||
2082 | if (!lowest) { | ||
2083 | btrfs_tree_unlock(upper->eb); | ||
2084 | upper->locked = 0; | ||
2085 | } | ||
2086 | } | ||
2087 | path->lowest_level = 0; | ||
2088 | return err; | ||
2089 | } | ||
2090 | |||
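| /* | ||
|  * update pointers in the blocks that reference 'node' so they point at | ||
|  * its new location.  no-op if the block hasn't been relocated or has | ||
|  * no upper level references. | ||
|  */ | ||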
2091 | static int link_to_upper(struct btrfs_trans_handle *trans, | ||
2092 | struct backref_node *node, | ||
2093 | struct btrfs_path *path) | ||
2094 | { | ||
2095 | struct btrfs_key key; | ||
2096 | if (!node->eb || list_empty(&node->upper)) | ||
2097 | return 0; | ||
2098 | |||
2099 | btrfs_node_key_to_cpu(node->eb, &key, 0); | ||
2100 | return do_relocation(trans, node, &key, path, 0); | ||
2101 | } | ||
2102 | |||
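| /* | ||
|  * link every node left on the per-level pending lists to its upper | ||
|  * level blocks, lowest level first, then check the backref cache is | ||
|  * empty. | ||
|  */ | ||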
2103 | static int finish_pending_nodes(struct btrfs_trans_handle *trans, | ||
2104 | struct backref_cache *cache, | ||
2105 | struct btrfs_path *path) | ||
2106 | { | ||
2107 | struct backref_node *node; | ||
2108 | int level; | ||
2109 | int ret; | ||
2110 | int err = 0; | ||
2111 | |||
2112 | for (level = 0; level < BTRFS_MAX_LEVEL; level++) { | ||
2113 | while (!list_empty(&cache->pending[level])) { | ||
2114 | node = list_entry(cache->pending[level].next, | ||
2115 | struct backref_node, lower); | ||
2116 | BUG_ON(node->level != level); | ||
2117 | |||
2118 | ret = link_to_upper(trans, node, path); | ||
2119 | if (ret < 0) | ||
2120 | err = ret; | ||
2121 | /* | ||
2122 | * this removes the node from the pending list and | ||
2123 | * may add some other nodes to the level + 1 | ||
2124 | * pending list | ||
2125 | */ | ||
2126 | remove_backref_node(cache, node); | ||
2127 | } | ||
2128 | } | ||
2129 | BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root)); | ||
2130 | return err; | ||
2131 | } | ||
2132 | |||
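| /* | ||
|  * mark a single backref node as processed.  leaves and blocks inside | ||
|  * the block group being relocated are also recorded in the | ||
|  * processed_blocks extent io tree. | ||
|  */ | ||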
2133 | static void mark_block_processed(struct reloc_control *rc, | ||
2134 | struct backref_node *node) | ||
2135 | { | ||
2136 | u32 blocksize; | ||
2137 | if (node->level == 0 || | ||
2138 | in_block_group(node->bytenr, rc->block_group)) { | ||
2139 | blocksize = btrfs_level_size(rc->extent_root, node->level); | ||
2140 | set_extent_bits(&rc->processed_blocks, node->bytenr, | ||
2141 | node->bytenr + blocksize - 1, EXTENT_DIRTY, | ||
2142 | GFP_NOFS); | ||
2143 | } | ||
2144 | node->processed = 1; | ||
2145 | } | ||
2146 | |||
2147 | /* | ||
2148 | * mark a block and all blocks that directly/indirectly reference the | ||
2149 | * block as processed. | ||
2150 | */ | ||
2151 | static void update_processed_blocks(struct reloc_control *rc, | ||
2152 | struct backref_node *node) | ||
2153 | { | ||
2154 | struct backref_node *next = node; | ||
2155 | struct backref_edge *edge; | ||
2156 | struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; | ||
2157 | int index = 0; | ||
2158 | |||
2159 | while (next) { | ||
2160 | cond_resched(); | ||
2161 | while (1) { | ||
2162 | if (next->processed) | ||
2163 | break; | ||
2164 | |||
2165 | mark_block_processed(rc, next); | ||
2166 | |||
2167 | if (list_empty(&next->upper)) | ||
2168 | break; | ||
2169 | |||
2170 | edge = list_entry(next->upper.next, | ||
2171 | struct backref_edge, list[LOWER]); | ||
2172 | edges[index++] = edge; | ||
2173 | next = edge->node[UPPER]; | ||
2174 | } | ||
2175 | next = walk_down_backref(edges, &index); | ||
2176 | } | ||
2177 | } | ||
2178 | |||
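| /* return 1 if the block at 'bytenr' has already been processed */ | ||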
2179 | static int tree_block_processed(u64 bytenr, u32 blocksize, | ||
2180 | struct reloc_control *rc) | ||
2181 | { | ||
2182 | if (test_range_bit(&rc->processed_blocks, bytenr, | ||
2183 | bytenr + blocksize - 1, EXTENT_DIRTY, 1)) | ||
2184 | return 1; | ||
2185 | return 0; | ||
2186 | } | ||
2187 | |||
2188 | /* | ||
2189 | * check if any file extent pointers in the leaf point to | ||
2190 | * data that requires processing | ||
2191 | */ | ||
2192 | static int check_file_extents(struct reloc_control *rc, | ||
2193 | u64 bytenr, u32 blocksize, u64 ptr_gen) | ||
2194 | { | ||
2195 | struct btrfs_key found_key; | ||
2196 | struct btrfs_file_extent_item *fi; | ||
2197 | struct extent_buffer *leaf; | ||
2198 | u32 nritems; | ||
2199 | int i; | ||
2200 | int ret = 0; | ||
2201 | |||
2202 | leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen); | ||
2203 | |||
2204 | nritems = btrfs_header_nritems(leaf); | ||
2205 | for (i = 0; i < nritems; i++) { | ||
2206 | cond_resched(); | ||
2207 | btrfs_item_key_to_cpu(leaf, &found_key, i); | ||
2208 | if (found_key.type != BTRFS_EXTENT_DATA_KEY) | ||
2209 | continue; | ||
2210 | fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); | ||
2211 | if (btrfs_file_extent_type(leaf, fi) == | ||
2212 | BTRFS_FILE_EXTENT_INLINE) | ||
2213 | continue; | ||
2214 | bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); | ||
2215 | if (bytenr == 0) | ||
2216 | continue; | ||
2217 | if (in_block_group(bytenr, rc->block_group)) { | ||
2218 | ret = 1; | ||
2219 | break; | ||
2220 | } | ||
2221 | } | ||
2222 | free_extent_buffer(leaf); | ||
2223 | return ret; | ||
2224 | } | ||
2225 | |||
2226 | /* | ||
2227 | * scan child blocks of a given block to find blocks that require processing | ||
2228 | */ | ||
2229 | static int add_child_blocks(struct btrfs_trans_handle *trans, | ||
2230 | struct reloc_control *rc, | ||
2231 | struct backref_node *node, | ||
2232 | struct rb_root *blocks) | ||
2233 | { | ||
2234 | struct tree_block *block; | ||
2235 | struct rb_node *rb_node; | ||
2236 | u64 bytenr; | ||
2237 | u64 ptr_gen; | ||
2238 | u32 blocksize; | ||
2239 | u32 nritems; | ||
2240 | int i; | ||
2241 | int err = 0; | ||
2242 | |||
2243 | nritems = btrfs_header_nritems(node->eb); | ||
2244 | blocksize = btrfs_level_size(rc->extent_root, node->level - 1); | ||
2245 | for (i = 0; i < nritems; i++) { | ||
2246 | cond_resched(); | ||
2247 | bytenr = btrfs_node_blockptr(node->eb, i); | ||
2248 | ptr_gen = btrfs_node_ptr_generation(node->eb, i); | ||
2249 | if (ptr_gen == trans->transid) | ||
2250 | continue; | ||
2251 | if (!in_block_group(bytenr, rc->block_group) && | ||
2252 | (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS)) | ||
2253 | continue; | ||
2254 | if (tree_block_processed(bytenr, blocksize, rc)) | ||
2255 | continue; | ||
2256 | |||
2257 | readahead_tree_block(rc->extent_root, | ||
2258 | bytenr, blocksize, ptr_gen); | ||
2259 | } | ||
2260 | |||
2261 | for (i = 0; i < nritems; i++) { | ||
2262 | cond_resched(); | ||
2263 | bytenr = btrfs_node_blockptr(node->eb, i); | ||
2264 | ptr_gen = btrfs_node_ptr_generation(node->eb, i); | ||
2265 | if (ptr_gen == trans->transid) | ||
2266 | continue; | ||
2267 | if (!in_block_group(bytenr, rc->block_group) && | ||
2268 | (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS)) | ||
2269 | continue; | ||
2270 | if (tree_block_processed(bytenr, blocksize, rc)) | ||
2271 | continue; | ||
2272 | if (!in_block_group(bytenr, rc->block_group) && | ||
2273 | !check_file_extents(rc, bytenr, blocksize, ptr_gen)) | ||
2274 | continue; | ||
2275 | |||
2276 | block = kmalloc(sizeof(*block), GFP_NOFS); | ||
2277 | if (!block) { | ||
2278 | err = -ENOMEM; | ||
2279 | break; | ||
2280 | } | ||
2281 | block->bytenr = bytenr; | ||
2282 | btrfs_node_key_to_cpu(node->eb, &block->key, i); | ||
2283 | block->level = node->level - 1; | ||
2284 | block->key_ready = 1; | ||
2285 | rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); | ||
2286 | BUG_ON(rb_node); | ||
2287 | } | ||
2288 | if (err) | ||
2289 | free_block_list(blocks); | ||
2290 | return err; | ||
2291 | } | ||
2292 | |||
2293 | /* | ||
2294 | * find adjacent blocks that require processing | ||
2295 | */ | ||
2296 | static noinline_for_stack | ||
2297 | int add_adjacent_blocks(struct btrfs_trans_handle *trans, | ||
2298 | struct reloc_control *rc, | ||
2299 | struct backref_cache *cache, | ||
2300 | struct rb_root *blocks, int level, | ||
2301 | struct backref_node **upper) | ||
2302 | { | ||
2303 | struct backref_node *node; | ||
2304 | int ret = 0; | ||
2305 | |||
2306 | WARN_ON(!list_empty(&cache->pending[level])); | ||
2307 | |||
2308 | if (list_empty(&cache->pending[level + 1])) | ||
2309 | return 1; | ||
2310 | |||
2311 | node = list_entry(cache->pending[level + 1].next, | ||
2312 | struct backref_node, lower); | ||
2313 | if (node->eb) | ||
2314 | ret = add_child_blocks(trans, rc, node, blocks); | ||
2315 | |||
2316 | *upper = node; | ||
2317 | return ret; | ||
2318 | } | ||
2319 | |||
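| /* | ||
|  * read the tree block and cache its first key in 'block'.  until this | ||
|  * is done, block->key temporarily holds the block size in .objectid | ||
|  * and the generation in .offset (see add_tree_block). | ||
|  */ | ||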
2320 | static int get_tree_block_key(struct reloc_control *rc, | ||
2321 | struct tree_block *block) | ||
2322 | { | ||
2323 | struct extent_buffer *eb; | ||
2324 | |||
2325 | BUG_ON(block->key_ready); | ||
2326 | eb = read_tree_block(rc->extent_root, block->bytenr, | ||
2327 | block->key.objectid, block->key.offset); | ||
2328 | WARN_ON(btrfs_header_level(eb) != block->level); | ||
2329 | if (block->level == 0) | ||
2330 | btrfs_item_key_to_cpu(eb, &block->key, 0); | ||
2331 | else | ||
2332 | btrfs_node_key_to_cpu(eb, &block->key, 0); | ||
2333 | free_extent_buffer(eb); | ||
2334 | block->key_ready = 1; | ||
2335 | return 0; | ||
2336 | } | ||
2337 | |||
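| /* start readahead for a tree block whose key hasn't been read yet */ | ||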
2338 | static int reada_tree_block(struct reloc_control *rc, | ||
2339 | struct tree_block *block) | ||
2340 | { | ||
2341 | BUG_ON(block->key_ready); | ||
2342 | readahead_tree_block(rc->extent_root, block->bytenr, | ||
2343 | block->key.objectid, block->key.offset); | ||
2344 | return 0; | ||
2345 | } | ||
2346 | |||
2347 | /* | ||
2348 | * helper function to relocate a tree block | ||
2349 | */ | ||
2350 | static int relocate_tree_block(struct btrfs_trans_handle *trans, | ||
2351 | struct reloc_control *rc, | ||
2352 | struct backref_node *node, | ||
2353 | struct btrfs_key *key, | ||
2354 | struct btrfs_path *path) | ||
2355 | { | ||
2356 | struct btrfs_root *root; | ||
2357 | int ret; | ||
2358 | |||
2359 | root = select_one_root(trans, node); | ||
2360 | if (unlikely(!root)) { | ||
2361 | rc->found_old_snapshot = 1; | ||
2362 | update_processed_blocks(rc, node); | ||
2363 | return 0; | ||
2364 | } | ||
2365 | |||
2366 | if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { | ||
2367 | ret = do_relocation(trans, node, key, path, 1); | ||
2368 | if (ret < 0) | ||
2369 | goto out; | ||
2370 | if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) { | ||
2371 | ret = replace_file_extents(trans, rc, root, | ||
2372 | node->eb, NULL); | ||
2373 | if (ret < 0) | ||
2374 | goto out; | ||
2375 | } | ||
2376 | drop_node_buffer(node); | ||
2377 | } else if (!root->ref_cows) { | ||
2378 | path->lowest_level = node->level; | ||
2379 | ret = btrfs_search_slot(trans, root, key, path, 0, 1); | ||
2380 | btrfs_release_path(root, path); | ||
2381 | if (ret < 0) | ||
2382 | goto out; | ||
2383 | } else if (root != node->root) { | ||
2384 | WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS); | ||
2385 | } | ||
2386 | |||
2387 | update_processed_blocks(rc, node); | ||
2388 | ret = 0; | ||
2389 | out: | ||
2390 | drop_node_buffer(node); | ||
2391 | return ret; | ||
2392 | } | ||
2393 | |||
2394 | /* | ||
2395 | * relocate a list of blocks | ||
2396 | */ | ||
2397 | static noinline_for_stack | ||
2398 | int relocate_tree_blocks(struct btrfs_trans_handle *trans, | ||
2399 | struct reloc_control *rc, struct rb_root *blocks) | ||
2400 | { | ||
2401 | struct backref_cache *cache; | ||
2402 | struct backref_node *node; | ||
2403 | struct btrfs_path *path; | ||
2404 | struct tree_block *block; | ||
2405 | struct rb_node *rb_node; | ||
2406 | int level = -1; | ||
2407 | int ret; | ||
2408 | int err = 0; | ||
2409 | |||
2410 | path = btrfs_alloc_path(); | ||
2411 | if (!path) | ||
2412 | return -ENOMEM; | ||
2413 | |||
2414 | cache = kmalloc(sizeof(*cache), GFP_NOFS); | ||
2415 | if (!cache) { | ||
2416 | btrfs_free_path(path); | ||
2417 | return -ENOMEM; | ||
2418 | } | ||
2419 | |||
2420 | backref_cache_init(cache); | ||
2421 | |||
2422 | rb_node = rb_first(blocks); | ||
2423 | while (rb_node) { | ||
2424 | block = rb_entry(rb_node, struct tree_block, rb_node); | ||
2425 | if (level == -1) | ||
2426 | level = block->level; | ||
2427 | else | ||
2428 | BUG_ON(level != block->level); | ||
2429 | if (!block->key_ready) | ||
2430 | reada_tree_block(rc, block); | ||
2431 | rb_node = rb_next(rb_node); | ||
2432 | } | ||
2433 | |||
2434 | rb_node = rb_first(blocks); | ||
2435 | while (rb_node) { | ||
2436 | block = rb_entry(rb_node, struct tree_block, rb_node); | ||
2437 | if (!block->key_ready) | ||
2438 | get_tree_block_key(rc, block); | ||
2439 | rb_node = rb_next(rb_node); | ||
2440 | } | ||
2441 | |||
2442 | rb_node = rb_first(blocks); | ||
2443 | while (rb_node) { | ||
2444 | block = rb_entry(rb_node, struct tree_block, rb_node); | ||
2445 | |||
2446 | node = build_backref_tree(rc, cache, &block->key, | ||
2447 | block->level, block->bytenr); | ||
2448 | if (IS_ERR(node)) { | ||
2449 | err = PTR_ERR(node); | ||
2450 | goto out; | ||
2451 | } | ||
2452 | |||
2453 | ret = relocate_tree_block(trans, rc, node, &block->key, | ||
2454 | path); | ||
2455 | if (ret < 0) { | ||
2456 | err = ret; | ||
2457 | goto out; | ||
2458 | } | ||
2459 | remove_backref_node(cache, node); | ||
2460 | rb_node = rb_next(rb_node); | ||
2461 | } | ||
2462 | |||
2463 | if (level > 0) | ||
2464 | goto out; | ||
2465 | |||
2466 | free_block_list(blocks); | ||
2467 | |||
2468 | /* | ||
2469 | * now backrefs of some upper level tree blocks have been cached, | ||
2470 | * try relocating blocks referenced by these upper level blocks. | ||
2471 | */ | ||
2472 | while (1) { | ||
2473 | struct backref_node *upper = NULL; | ||
2474 | if (trans->transaction->in_commit || | ||
2475 | trans->transaction->delayed_refs.flushing) | ||
2476 | break; | ||
2477 | |||
2478 | ret = add_adjacent_blocks(trans, rc, cache, blocks, level, | ||
2479 | &upper); | ||
2480 | if (ret < 0) | ||
2481 | err = ret; | ||
2482 | if (ret != 0) | ||
2483 | break; | ||
2484 | |||
2485 | rb_node = rb_first(blocks); | ||
2486 | while (rb_node) { | ||
2487 | block = rb_entry(rb_node, struct tree_block, rb_node); | ||
2488 | if (trans->transaction->in_commit || | ||
2489 | trans->transaction->delayed_refs.flushing) | ||
2490 | goto out; | ||
2491 | BUG_ON(!block->key_ready); | ||
2492 | node = build_backref_tree(rc, cache, &block->key, | ||
2493 | level, block->bytenr); | ||
2494 | if (IS_ERR(node)) { | ||
2495 | err = PTR_ERR(node); | ||
2496 | goto out; | ||
2497 | } | ||
2498 | |||
2499 | ret = relocate_tree_block(trans, rc, node, | ||
2500 | &block->key, path); | ||
2501 | if (ret < 0) { | ||
2502 | err = ret; | ||
2503 | goto out; | ||
2504 | } | ||
2505 | remove_backref_node(cache, node); | ||
2506 | rb_node = rb_next(rb_node); | ||
2507 | } | ||
2508 | free_block_list(blocks); | ||
2509 | |||
2510 | if (upper) { | ||
2511 | ret = link_to_upper(trans, upper, path); | ||
2512 | if (ret < 0) { | ||
2513 | err = ret; | ||
2514 | break; | ||
2515 | } | ||
2516 | remove_backref_node(cache, upper); | ||
2517 | } | ||
2518 | } | ||
2519 | out: | ||
2520 | free_block_list(blocks); | ||
2521 | |||
2522 | ret = finish_pending_nodes(trans, cache, path); | ||
2523 | if (ret < 0) | ||
2524 | err = ret; | ||
2525 | |||
2526 | kfree(cache); | ||
2527 | btrfs_free_path(path); | ||
2528 | return err; | ||
2529 | } | ||
2530 | |||
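| /* | ||
|  * read in the pages of the relocation inode that cover | ||
|  * [start, start + len) and mark them dirty as delalloc, so that | ||
|  * writeback later writes the data out at its new location. | ||
|  */ | ||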
2531 | static noinline_for_stack | ||
2532 | int relocate_inode_pages(struct inode *inode, u64 start, u64 len) | ||
2533 | { | ||
2534 | u64 page_start; | ||
2535 | u64 page_end; | ||
2536 | unsigned long i; | ||
2537 | unsigned long first_index; | ||
2538 | unsigned long last_index; | ||
2539 | unsigned int total_read = 0; | ||
2540 | unsigned int total_dirty = 0; | ||
2541 | struct page *page; | ||
2542 | struct file_ra_state *ra; | ||
2543 | struct btrfs_ordered_extent *ordered; | ||
2544 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; | ||
2545 | int ret = 0; | ||
2546 | |||
2547 | ra = kzalloc(sizeof(*ra), GFP_NOFS); | ||
2548 | if (!ra) | ||
2549 | return -ENOMEM; | ||
2550 | |||
2551 | mutex_lock(&inode->i_mutex); | ||
2552 | first_index = start >> PAGE_CACHE_SHIFT; | ||
2553 | last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; | ||
2554 | |||
2555 | /* make sure the dirty trick played by the caller works */ | ||
2556 | ret = invalidate_inode_pages2_range(inode->i_mapping, | ||
2557 | first_index, last_index); | ||
2558 | if (ret) | ||
2559 | goto out_unlock; | ||
2560 | |||
2561 | file_ra_state_init(ra, inode->i_mapping); | ||
2562 | |||
2563 | for (i = first_index ; i <= last_index; i++) { | ||
2564 | if (total_read % ra->ra_pages == 0) { | ||
2565 | btrfs_force_ra(inode->i_mapping, ra, NULL, i, | ||
2566 | min(last_index, ra->ra_pages + i - 1)); | ||
2567 | } | ||
2568 | total_read++; | ||
2569 | again: | ||
2570 | if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode)) | ||
2571 | BUG_ON(1); | ||
2572 | page = grab_cache_page(inode->i_mapping, i); | ||
2573 | if (!page) { | ||
2574 | ret = -ENOMEM; | ||
2575 | goto out_unlock; | ||
2576 | } | ||
2577 | if (!PageUptodate(page)) { | ||
2578 | btrfs_readpage(NULL, page); | ||
2579 | lock_page(page); | ||
2580 | if (!PageUptodate(page)) { | ||
2581 | unlock_page(page); | ||
2582 | page_cache_release(page); | ||
2583 | ret = -EIO; | ||
2584 | goto out_unlock; | ||
2585 | } | ||
2586 | } | ||
2587 | wait_on_page_writeback(page); | ||
2588 | |||
2589 | page_start = (u64)page->index << PAGE_CACHE_SHIFT; | ||
2590 | page_end = page_start + PAGE_CACHE_SIZE - 1; | ||
2591 | lock_extent(io_tree, page_start, page_end, GFP_NOFS); | ||
2592 | |||
2593 | ordered = btrfs_lookup_ordered_extent(inode, page_start); | ||
2594 | if (ordered) { | ||
2595 | unlock_extent(io_tree, page_start, page_end, GFP_NOFS); | ||
2596 | unlock_page(page); | ||
2597 | page_cache_release(page); | ||
2598 | btrfs_start_ordered_extent(inode, ordered, 1); | ||
2599 | btrfs_put_ordered_extent(ordered); | ||
2600 | goto again; | ||
2601 | } | ||
2602 | set_page_extent_mapped(page); | ||
2603 | |||
2604 | if (i == first_index) | ||
2605 | set_extent_bits(io_tree, page_start, page_end, | ||
2606 | EXTENT_BOUNDARY, GFP_NOFS); | ||
2607 | btrfs_set_extent_delalloc(inode, page_start, page_end); | ||
2608 | |||
2609 | set_page_dirty(page); | ||
2610 | total_dirty++; | ||
2611 | |||
2612 | unlock_extent(io_tree, page_start, page_end, GFP_NOFS); | ||
2613 | unlock_page(page); | ||
2614 | page_cache_release(page); | ||
2615 | } | ||
2616 | out_unlock: | ||
2617 | mutex_unlock(&inode->i_mutex); | ||
2618 | kfree(ra); | ||
2619 | balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty); | ||
2620 | return ret; | ||
2621 | } | ||
2622 | |||
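| /* | ||
|  * relocate one data extent: insert a pinned extent map pointing at the | ||
|  * old extent so btrfs_readpage reads the old data into the relocation | ||
|  * inode, then dirty the corresponding pages. | ||
|  */ | ||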
2623 | static noinline_for_stack | ||
2624 | int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key) | ||
2625 | { | ||
2626 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
2627 | struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; | ||
2628 | struct extent_map *em; | ||
2629 | u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt; | ||
2630 | u64 end = start + extent_key->offset - 1; | ||
2631 | |||
2632 | em = alloc_extent_map(GFP_NOFS); | ||
2633 | em->start = start; | ||
2634 | em->len = extent_key->offset; | ||
2635 | em->block_len = extent_key->offset; | ||
2636 | em->block_start = extent_key->objectid; | ||
2637 | em->bdev = root->fs_info->fs_devices->latest_bdev; | ||
2638 | set_bit(EXTENT_FLAG_PINNED, &em->flags); | ||
2639 | |||
2640 | /* setup extent map to cheat btrfs_readpage */ | ||
2641 | lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); | ||
2642 | while (1) { | ||
2643 | int ret; | ||
2644 | spin_lock(&em_tree->lock); | ||
2645 | ret = add_extent_mapping(em_tree, em); | ||
2646 | spin_unlock(&em_tree->lock); | ||
2647 | if (ret != -EEXIST) { | ||
2648 | free_extent_map(em); | ||
2649 | break; | ||
2650 | } | ||
2651 | btrfs_drop_extent_cache(inode, start, end, 0); | ||
2652 | } | ||
2653 | unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); | ||
2654 | |||
2655 | return relocate_inode_pages(inode, start, extent_key->offset); | ||
2656 | } | ||
2657 | |||
2658 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
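| /* | ||
|  * old (v0) extent ref format: scan forward from the current extent | ||
|  * item and return the objectid stored in the first v0 ref found. | ||
|  */ | ||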
2659 | static int get_ref_objectid_v0(struct reloc_control *rc, | ||
2660 | struct btrfs_path *path, | ||
2661 | struct btrfs_key *extent_key, | ||
2662 | u64 *ref_objectid, int *path_change) | ||
2663 | { | ||
2664 | struct btrfs_key key; | ||
2665 | struct extent_buffer *leaf; | ||
2666 | struct btrfs_extent_ref_v0 *ref0; | ||
2667 | int ret; | ||
2668 | int slot; | ||
2669 | |||
2670 | leaf = path->nodes[0]; | ||
2671 | slot = path->slots[0]; | ||
2672 | while (1) { | ||
2673 | if (slot >= btrfs_header_nritems(leaf)) { | ||
2674 | ret = btrfs_next_leaf(rc->extent_root, path); | ||
2675 | if (ret < 0) | ||
2676 | return ret; | ||
2677 | BUG_ON(ret > 0); | ||
2678 | leaf = path->nodes[0]; | ||
2679 | slot = path->slots[0]; | ||
2680 | if (path_change) | ||
2681 | *path_change = 1; | ||
2682 | } | ||
2683 | btrfs_item_key_to_cpu(leaf, &key, slot); | ||
2684 | if (key.objectid != extent_key->objectid) | ||
2685 | return -ENOENT; | ||
2686 | |||
2687 | if (key.type != BTRFS_EXTENT_REF_V0_KEY) { | ||
2688 | slot++; | ||
2689 | continue; | ||
2690 | } | ||
2691 | ref0 = btrfs_item_ptr(leaf, slot, | ||
2692 | struct btrfs_extent_ref_v0); | ||
2693 | *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0); | ||
2694 | break; | ||
2695 | } | ||
2696 | return 0; | ||
2697 | } | ||
2698 | #endif | ||
2699 | |||
2700 | /* | ||
2701 | * helper to add a tree block to the list. | ||
2702 | * the major work is getting the generation and level of the block | ||
2703 | */ | ||
2704 | static int add_tree_block(struct reloc_control *rc, | ||
2705 | struct btrfs_key *extent_key, | ||
2706 | struct btrfs_path *path, | ||
2707 | struct rb_root *blocks) | ||
2708 | { | ||
2709 | struct extent_buffer *eb; | ||
2710 | struct btrfs_extent_item *ei; | ||
2711 | struct btrfs_tree_block_info *bi; | ||
2712 | struct tree_block *block; | ||
2713 | struct rb_node *rb_node; | ||
2714 | u32 item_size; | ||
2715 | int level = -1; | ||
2716 | int generation; | ||
2717 | |||
2718 | eb = path->nodes[0]; | ||
2719 | item_size = btrfs_item_size_nr(eb, path->slots[0]); | ||
2720 | |||
2721 | if (item_size >= sizeof(*ei) + sizeof(*bi)) { | ||
2722 | ei = btrfs_item_ptr(eb, path->slots[0], | ||
2723 | struct btrfs_extent_item); | ||
2724 | bi = (struct btrfs_tree_block_info *)(ei + 1); | ||
2725 | generation = btrfs_extent_generation(eb, ei); | ||
2726 | level = btrfs_tree_block_level(eb, bi); | ||
2727 | } else { | ||
2728 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
2729 | u64 ref_owner; | ||
2730 | int ret; | ||
2731 | |||
2732 | BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0)); | ||
2733 | ret = get_ref_objectid_v0(rc, path, extent_key, | ||
2734 | &ref_owner, NULL); | ||
2735 | BUG_ON(ref_owner >= BTRFS_MAX_LEVEL); | ||
2736 | level = (int)ref_owner; | ||
2737 | /* FIXME: get real generation */ | ||
2738 | generation = 0; | ||
2739 | #else | ||
2740 | BUG(); | ||
2741 | #endif | ||
2742 | } | ||
2743 | |||
2744 | btrfs_release_path(rc->extent_root, path); | ||
2745 | |||
2746 | BUG_ON(level == -1); | ||
2747 | |||
2748 | block = kmalloc(sizeof(*block), GFP_NOFS); | ||
2749 | if (!block) | ||
2750 | return -ENOMEM; | ||
2751 | |||
2752 | block->bytenr = extent_key->objectid; | ||
2753 | block->key.objectid = extent_key->offset; | ||
2754 | block->key.offset = generation; | ||
2755 | block->level = level; | ||
2756 | block->key_ready = 0; | ||
2757 | |||
2758 | rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); | ||
2759 | BUG_ON(rb_node); | ||
2760 | |||
2761 | return 0; | ||
2762 | } | ||
2763 | |||
2764 | /* | ||
2765 | * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY | ||
2766 | */ | ||
2767 | static int __add_tree_block(struct reloc_control *rc, | ||
2768 | u64 bytenr, u32 blocksize, | ||
2769 | struct rb_root *blocks) | ||
2770 | { | ||
2771 | struct btrfs_path *path; | ||
2772 | struct btrfs_key key; | ||
2773 | int ret; | ||
2774 | |||
2775 | if (tree_block_processed(bytenr, blocksize, rc)) | ||
2776 | return 0; | ||
2777 | |||
2778 | if (tree_search(blocks, bytenr)) | ||
2779 | return 0; | ||
2780 | |||
2781 | path = btrfs_alloc_path(); | ||
2782 | if (!path) | ||
2783 | return -ENOMEM; | ||
2784 | |||
2785 | key.objectid = bytenr; | ||
2786 | key.type = BTRFS_EXTENT_ITEM_KEY; | ||
2787 | key.offset = blocksize; | ||
2788 | |||
2789 | path->search_commit_root = 1; | ||
2790 | path->skip_locking = 1; | ||
2791 | ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); | ||
2792 | if (ret < 0) | ||
2793 | goto out; | ||
2794 | BUG_ON(ret); | ||
2795 | |||
2796 | btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); | ||
2797 | ret = add_tree_block(rc, &key, path, blocks); | ||
2798 | out: | ||
2799 | btrfs_free_path(path); | ||
2800 | return ret; | ||
2801 | } | ||
2802 | |||
2803 | /* | ||
2804 | * helper to check if the block uses full backrefs for pointers in it | ||
2805 | */ | ||
2806 | static int block_use_full_backref(struct reloc_control *rc, | ||
2807 | struct extent_buffer *eb) | ||
2808 | { | ||
2809 | struct btrfs_path *path; | ||
2810 | struct btrfs_extent_item *ei; | ||
2811 | struct btrfs_key key; | ||
2812 | u64 flags; | ||
2813 | int ret; | ||
2814 | |||
2815 | if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) || | ||
2816 | btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) | ||
2817 | return 1; | ||
2818 | |||
2819 | path = btrfs_alloc_path(); | ||
2820 | BUG_ON(!path); | ||
2821 | |||
2822 | key.objectid = eb->start; | ||
2823 | key.type = BTRFS_EXTENT_ITEM_KEY; | ||
2824 | key.offset = eb->len; | ||
2825 | |||
2826 | path->search_commit_root = 1; | ||
2827 | path->skip_locking = 1; | ||
2828 | ret = btrfs_search_slot(NULL, rc->extent_root, | ||
2829 | &key, path, 0, 0); | ||
2830 | BUG_ON(ret); | ||
2831 | |||
2832 | ei = btrfs_item_ptr(path->nodes[0], path->slots[0], | ||
2833 | struct btrfs_extent_item); | ||
2834 | flags = btrfs_extent_flags(path->nodes[0], ei); | ||
2835 | BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); | ||
2836 | if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) | ||
2837 | ret = 1; | ||
2838 | else | ||
2839 | ret = 0; | ||
2840 | btrfs_free_path(path); | ||
2841 | return ret; | ||
2842 | } | ||
2843 | |||
2844 | /* | ||
2845 | * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY | ||
2846 | * this function scans the fs tree to find blocks that reference the data extent | ||
2847 | */ | ||
2848 | static int find_data_references(struct reloc_control *rc, | ||
2849 | struct btrfs_key *extent_key, | ||
2850 | struct extent_buffer *leaf, | ||
2851 | struct btrfs_extent_data_ref *ref, | ||
2852 | struct rb_root *blocks) | ||
2853 | { | ||
2854 | struct btrfs_path *path; | ||
2855 | struct tree_block *block; | ||
2856 | struct btrfs_root *root; | ||
2857 | struct btrfs_file_extent_item *fi; | ||
2858 | struct rb_node *rb_node; | ||
2859 | struct btrfs_key key; | ||
2860 | u64 ref_root; | ||
2861 | u64 ref_objectid; | ||
2862 | u64 ref_offset; | ||
2863 | u32 ref_count; | ||
2864 | u32 nritems; | ||
2865 | int err = 0; | ||
2866 | int added = 0; | ||
2867 | int counted; | ||
2868 | int ret; | ||
2869 | |||
2870 | path = btrfs_alloc_path(); | ||
2871 | if (!path) | ||
2872 | return -ENOMEM; | ||
2873 | |||
2874 | ref_root = btrfs_extent_data_ref_root(leaf, ref); | ||
2875 | ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref); | ||
2876 | ref_offset = btrfs_extent_data_ref_offset(leaf, ref); | ||
2877 | ref_count = btrfs_extent_data_ref_count(leaf, ref); | ||
2878 | |||
2879 | root = read_fs_root(rc->extent_root->fs_info, ref_root); | ||
2880 | if (IS_ERR(root)) { | ||
2881 | err = PTR_ERR(root); | ||
2882 | goto out; | ||
2883 | } | ||
2884 | |||
2885 | key.objectid = ref_objectid; | ||
2886 | key.offset = ref_offset; | ||
2887 | key.type = BTRFS_EXTENT_DATA_KEY; | ||
2888 | |||
2889 | path->search_commit_root = 1; | ||
2890 | path->skip_locking = 1; | ||
2891 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
2892 | if (ret < 0) { | ||
2893 | err = ret; | ||
2894 | goto out; | ||
2895 | } | ||
2896 | |||
2897 | leaf = path->nodes[0]; | ||
2898 | nritems = btrfs_header_nritems(leaf); | ||
2899 | /* | ||
2900 | * the references in tree blocks that use full backrefs | ||
2901 | * are not counted in | ||
2902 | */ | ||
2903 | if (block_use_full_backref(rc, leaf)) | ||
2904 | counted = 0; | ||
2905 | else | ||
2906 | counted = 1; | ||
2907 | rb_node = tree_search(blocks, leaf->start); | ||
2908 | if (rb_node) { | ||
2909 | if (counted) | ||
2910 | added = 1; | ||
2911 | else | ||
2912 | path->slots[0] = nritems; | ||
2913 | } | ||
2914 | |||
2915 | while (ref_count > 0) { | ||
2916 | while (path->slots[0] >= nritems) { | ||
2917 | ret = btrfs_next_leaf(root, path); | ||
2918 | if (ret < 0) { | ||
2919 | err = ret; | ||
2920 | goto out; | ||
2921 | } | ||
2922 | if (ret > 0) { | ||
2923 | WARN_ON(1); | ||
2924 | goto out; | ||
2925 | } | ||
2926 | |||
2927 | leaf = path->nodes[0]; | ||
2928 | nritems = btrfs_header_nritems(leaf); | ||
2929 | added = 0; | ||
2930 | |||
2931 | if (block_use_full_backref(rc, leaf)) | ||
2932 | counted = 0; | ||
2933 | else | ||
2934 | counted = 1; | ||
2935 | rb_node = tree_search(blocks, leaf->start); | ||
2936 | if (rb_node) { | ||
2937 | if (counted) | ||
2938 | added = 1; | ||
2939 | else | ||
2940 | path->slots[0] = nritems; | ||
2941 | } | ||
2942 | } | ||
2943 | |||
2944 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); | ||
2945 | if (key.objectid != ref_objectid || | ||
2946 | key.type != BTRFS_EXTENT_DATA_KEY) { | ||
2947 | WARN_ON(1); | ||
2948 | break; | ||
2949 | } | ||
2950 | |||
2951 | fi = btrfs_item_ptr(leaf, path->slots[0], | ||
2952 | struct btrfs_file_extent_item); | ||
2953 | |||
2954 | if (btrfs_file_extent_type(leaf, fi) == | ||
2955 | BTRFS_FILE_EXTENT_INLINE) | ||
2956 | goto next; | ||
2957 | |||
2958 | if (btrfs_file_extent_disk_bytenr(leaf, fi) != | ||
2959 | extent_key->objectid) | ||
2960 | goto next; | ||
2961 | |||
2962 | key.offset -= btrfs_file_extent_offset(leaf, fi); | ||
2963 | if (key.offset != ref_offset) | ||
2964 | goto next; | ||
2965 | |||
2966 | if (counted) | ||
2967 | ref_count--; | ||
2968 | if (added) | ||
2969 | goto next; | ||
2970 | |||
2971 | if (!tree_block_processed(leaf->start, leaf->len, rc)) { | ||
2972 | block = kmalloc(sizeof(*block), GFP_NOFS); | ||
2973 | if (!block) { | ||
2974 | err = -ENOMEM; | ||
2975 | break; | ||
2976 | } | ||
2977 | block->bytenr = leaf->start; | ||
2978 | btrfs_item_key_to_cpu(leaf, &block->key, 0); | ||
2979 | block->level = 0; | ||
2980 | block->key_ready = 1; | ||
2981 | rb_node = tree_insert(blocks, block->bytenr, | ||
2982 | &block->rb_node); | ||
2983 | BUG_ON(rb_node); | ||
2984 | } | ||
2985 | if (counted) | ||
2986 | added = 1; | ||
2987 | else | ||
2988 | path->slots[0] = nritems; | ||
2989 | next: | ||
2990 | path->slots[0]++; | ||
2991 | |||
2992 | } | ||
2993 | out: | ||
2994 | btrfs_free_path(path); | ||
2995 | return err; | ||
2996 | } | ||
2997 | |||
2998 | /* | ||
2999 | * helper to find all tree blocks that reference a given data extent | ||
3000 | */ | ||
3001 | static noinline_for_stack | ||
3002 | int add_data_references(struct reloc_control *rc, | ||
3003 | struct btrfs_key *extent_key, | ||
3004 | struct btrfs_path *path, | ||
3005 | struct rb_root *blocks) | ||
3006 | { | ||
3007 | struct btrfs_key key; | ||
3008 | struct extent_buffer *eb; | ||
3009 | struct btrfs_extent_data_ref *dref; | ||
3010 | struct btrfs_extent_inline_ref *iref; | ||
3011 | unsigned long ptr; | ||
3012 | unsigned long end; | ||
3013 | u32 blocksize; | ||
3014 | int ret; | ||
3015 | int err = 0; | ||
3016 | |||
3017 | ret = get_new_location(rc->data_inode, NULL, extent_key->objectid, | ||
3018 | extent_key->offset); | ||
3019 | BUG_ON(ret < 0); | ||
3020 | if (ret > 0) { | ||
3021 | /* the relocated data is fragmented */ | ||
3022 | rc->extents_skipped++; | ||
3023 | btrfs_release_path(rc->extent_root, path); | ||
3024 | return 0; | ||
3025 | } | ||
3026 | |||
3027 | blocksize = btrfs_level_size(rc->extent_root, 0); | ||
3028 | |||
3029 | eb = path->nodes[0]; | ||
3030 | ptr = btrfs_item_ptr_offset(eb, path->slots[0]); | ||
3031 | end = ptr + btrfs_item_size_nr(eb, path->slots[0]); | ||
3032 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
3033 | if (ptr + sizeof(struct btrfs_extent_item_v0) == end) | ||
3034 | ptr = end; | ||
3035 | else | ||
3036 | #endif | ||
3037 | ptr += sizeof(struct btrfs_extent_item); | ||
3038 | |||
3039 | while (ptr < end) { | ||
3040 | iref = (struct btrfs_extent_inline_ref *)ptr; | ||
3041 | key.type = btrfs_extent_inline_ref_type(eb, iref); | ||
3042 | if (key.type == BTRFS_SHARED_DATA_REF_KEY) { | ||
3043 | key.offset = btrfs_extent_inline_ref_offset(eb, iref); | ||
3044 | ret = __add_tree_block(rc, key.offset, blocksize, | ||
3045 | blocks); | ||
3046 | } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { | ||
3047 | dref = (struct btrfs_extent_data_ref *)(&iref->offset); | ||
3048 | ret = find_data_references(rc, extent_key, | ||
3049 | eb, dref, blocks); | ||
3050 | } else { | ||
3051 | BUG(); | ||
3052 | } | ||
3053 | ptr += btrfs_extent_inline_ref_size(key.type); | ||
3054 | } | ||
3055 | WARN_ON(ptr > end); | ||
3056 | |||
3057 | while (1) { | ||
3058 | cond_resched(); | ||
3059 | eb = path->nodes[0]; | ||
3060 | if (path->slots[0] >= btrfs_header_nritems(eb)) { | ||
3061 | ret = btrfs_next_leaf(rc->extent_root, path); | ||
3062 | if (ret < 0) { | ||
3063 | err = ret; | ||
3064 | break; | ||
3065 | } | ||
3066 | if (ret > 0) | ||
3067 | break; | ||
3068 | eb = path->nodes[0]; | ||
3069 | } | ||
3070 | |||
3071 | btrfs_item_key_to_cpu(eb, &key, path->slots[0]); | ||
3072 | if (key.objectid != extent_key->objectid) | ||
3073 | break; | ||
3074 | |||
3075 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
3076 | if (key.type == BTRFS_SHARED_DATA_REF_KEY || | ||
3077 | key.type == BTRFS_EXTENT_REF_V0_KEY) { | ||
3078 | #else | ||
3079 | BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); | ||
3080 | if (key.type == BTRFS_SHARED_DATA_REF_KEY) { | ||
3081 | #endif | ||
3082 | ret = __add_tree_block(rc, key.offset, blocksize, | ||
3083 | blocks); | ||
3084 | } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { | ||
3085 | dref = btrfs_item_ptr(eb, path->slots[0], | ||
3086 | struct btrfs_extent_data_ref); | ||
3087 | ret = find_data_references(rc, extent_key, | ||
3088 | eb, dref, blocks); | ||
3089 | } else { | ||
3090 | ret = 0; | ||
3091 | } | ||
3092 | if (ret) { | ||
3093 | err = ret; | ||
3094 | break; | ||
3095 | } | ||
3096 | path->slots[0]++; | ||
3097 | } | ||
3098 | btrfs_release_path(rc->extent_root, path); | ||
3099 | if (err) | ||
3100 | free_block_list(blocks); | ||
3101 | return err; | ||
3102 | } | ||
3103 | |||
3104 | /* | ||
3105 | * helper to find next unprocessed extent | ||
3106 | */ | ||
3107 | static noinline_for_stack | ||
3108 | int find_next_extent(struct btrfs_trans_handle *trans, | ||
3109 | struct reloc_control *rc, struct btrfs_path *path) | ||
3110 | { | ||
3111 | struct btrfs_key key; | ||
3112 | struct extent_buffer *leaf; | ||
3113 | u64 start, end, last; | ||
3114 | int ret; | ||
3115 | |||
3116 | last = rc->block_group->key.objectid + rc->block_group->key.offset; | ||
3117 | while (1) { | ||
3118 | cond_resched(); | ||
3119 | if (rc->search_start >= last) { | ||
3120 | ret = 1; | ||
3121 | break; | ||
3122 | } | ||
3123 | |||
3124 | key.objectid = rc->search_start; | ||
3125 | key.type = BTRFS_EXTENT_ITEM_KEY; | ||
3126 | key.offset = 0; | ||
3127 | |||
3128 | path->search_commit_root = 1; | ||
3129 | path->skip_locking = 1; | ||
3130 | ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, | ||
3131 | 0, 0); | ||
3132 | if (ret < 0) | ||
3133 | break; | ||
3134 | next: | ||
3135 | leaf = path->nodes[0]; | ||
3136 | if (path->slots[0] >= btrfs_header_nritems(leaf)) { | ||
3137 | ret = btrfs_next_leaf(rc->extent_root, path); | ||
3138 | if (ret != 0) | ||
3139 | break; | ||
3140 | leaf = path->nodes[0]; | ||
3141 | } | ||
3142 | |||
3143 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); | ||
3144 | if (key.objectid >= last) { | ||
3145 | ret = 1; | ||
3146 | break; | ||
3147 | } | ||
3148 | |||
3149 | if (key.type != BTRFS_EXTENT_ITEM_KEY || | ||
3150 | key.objectid + key.offset <= rc->search_start) { | ||
3151 | path->slots[0]++; | ||
3152 | goto next; | ||
3153 | } | ||
3154 | |||
3155 | ret = find_first_extent_bit(&rc->processed_blocks, | ||
3156 | key.objectid, &start, &end, | ||
3157 | EXTENT_DIRTY); | ||
3158 | |||
3159 | if (ret == 0 && start <= key.objectid) { | ||
3160 | btrfs_release_path(rc->extent_root, path); | ||
3161 | rc->search_start = end + 1; | ||
3162 | } else { | ||
3163 | rc->search_start = key.objectid + key.offset; | ||
3164 | return 0; | ||
3165 | } | ||
3166 | } | ||
3167 | btrfs_release_path(rc->extent_root, path); | ||
3168 | return ret; | ||
3169 | } | ||
3170 | |||
3171 | static void set_reloc_control(struct reloc_control *rc) | ||
3172 | { | ||
3173 | struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; | ||
3174 | mutex_lock(&fs_info->trans_mutex); | ||
3175 | fs_info->reloc_ctl = rc; | ||
3176 | mutex_unlock(&fs_info->trans_mutex); | ||
3177 | } | ||
3178 | |||
3179 | static void unset_reloc_control(struct reloc_control *rc) | ||
3180 | { | ||
3181 | struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; | ||
3182 | mutex_lock(&fs_info->trans_mutex); | ||
3183 | fs_info->reloc_ctl = NULL; | ||
3184 | mutex_unlock(&fs_info->trans_mutex); | ||
3185 | } | ||
3186 | |||
3187 | static int check_extent_flags(u64 flags) | ||
3188 | { | ||
3189 | if ((flags & BTRFS_EXTENT_FLAG_DATA) && | ||
3190 | (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) | ||
3191 | return 1; | ||
3192 | if (!(flags & BTRFS_EXTENT_FLAG_DATA) && | ||
3193 | !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) | ||
3194 | return 1; | ||
3195 | if ((flags & BTRFS_EXTENT_FLAG_DATA) && | ||
3196 | (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) | ||
3197 | return 1; | ||
3198 | return 0; | ||
3199 | } | ||
3200 | |||
3201 | static noinline_for_stack int relocate_block_group(struct reloc_control *rc) | ||
3202 | { | ||
3203 | struct rb_root blocks = RB_ROOT; | ||
3204 | struct btrfs_key key; | ||
3205 | struct btrfs_trans_handle *trans = NULL; | ||
3206 | struct btrfs_path *path; | ||
3207 | struct btrfs_extent_item *ei; | ||
3208 | unsigned long nr; | ||
3209 | u64 flags; | ||
3210 | u32 item_size; | ||
3211 | int ret; | ||
3212 | int err = 0; | ||
3213 | |||
3214 | path = btrfs_alloc_path(); | ||
3215 | if (!path) | ||
3216 | return -ENOMEM; | ||
3217 | |||
3218 | rc->search_start = rc->block_group->key.objectid; | ||
3219 | clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, | ||
3220 | GFP_NOFS); | ||
3221 | |||
3222 | rc->create_reloc_root = 1; | ||
3223 | set_reloc_control(rc); | ||
3224 | |||
3225 | trans = btrfs_start_transaction(rc->extent_root, 1); | ||
3226 | btrfs_commit_transaction(trans, rc->extent_root); | ||
3227 | |||
3228 | while (1) { | ||
3229 | trans = btrfs_start_transaction(rc->extent_root, 1); | ||
3230 | |||
3231 | ret = find_next_extent(trans, rc, path); | ||
3232 | if (ret < 0) | ||
3233 | err = ret; | ||
3234 | if (ret != 0) | ||
3235 | break; | ||
3236 | |||
3237 | rc->extents_found++; | ||
3238 | |||
3239 | ei = btrfs_item_ptr(path->nodes[0], path->slots[0], | ||
3240 | struct btrfs_extent_item); | ||
3241 | btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); | ||
3242 | item_size = btrfs_item_size_nr(path->nodes[0], | ||
3243 | path->slots[0]); | ||
3244 | if (item_size >= sizeof(*ei)) { | ||
3245 | flags = btrfs_extent_flags(path->nodes[0], ei); | ||
3246 | ret = check_extent_flags(flags); | ||
3247 | BUG_ON(ret); | ||
3248 | |||
3249 | } else { | ||
3250 | #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 | ||
3251 | u64 ref_owner; | ||
3252 | int path_change = 0; | ||
3253 | |||
3254 | BUG_ON(item_size != | ||
3255 | sizeof(struct btrfs_extent_item_v0)); | ||
3256 | ret = get_ref_objectid_v0(rc, path, &key, &ref_owner, | ||
3257 | &path_change); | ||
3258 | if (ref_owner < BTRFS_FIRST_FREE_OBJECTID) | ||
3259 | flags = BTRFS_EXTENT_FLAG_TREE_BLOCK; | ||
3260 | else | ||
3261 | flags = BTRFS_EXTENT_FLAG_DATA; | ||
3262 | |||
3263 | if (path_change) { | ||
3264 | btrfs_release_path(rc->extent_root, path); | ||
3265 | |||
3266 | path->search_commit_root = 1; | ||
3267 | path->skip_locking = 1; | ||
3268 | ret = btrfs_search_slot(NULL, rc->extent_root, | ||
3269 | &key, path, 0, 0); | ||
3270 | if (ret < 0) { | ||
3271 | err = ret; | ||
3272 | break; | ||
3273 | } | ||
3274 | BUG_ON(ret > 0); | ||
3275 | } | ||
3276 | #else | ||
3277 | BUG(); | ||
3278 | #endif | ||
3279 | } | ||
3280 | |||
3281 | if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { | ||
3282 | ret = add_tree_block(rc, &key, path, &blocks); | ||
3283 | } else if (rc->stage == UPDATE_DATA_PTRS && | ||
3284 | (flags & BTRFS_EXTENT_FLAG_DATA)) { | ||
3285 | ret = add_data_references(rc, &key, path, &blocks); | ||
3286 | } else { | ||
3287 | btrfs_release_path(rc->extent_root, path); | ||
3288 | ret = 0; | ||
3289 | } | ||
3290 | if (ret < 0) { | ||
3291 | err = ret; | ||
3292 | break; | ||
3293 | } | ||
3294 | |||
3295 | if (!RB_EMPTY_ROOT(&blocks)) { | ||
3296 | ret = relocate_tree_blocks(trans, rc, &blocks); | ||
3297 | if (ret < 0) { | ||
3298 | err = ret; | ||
3299 | break; | ||
3300 | } | ||
3301 | } | ||
3302 | |||
3303 | nr = trans->blocks_used; | ||
3304 | btrfs_end_transaction_throttle(trans, rc->extent_root); | ||
3305 | trans = NULL; | ||
3306 | btrfs_btree_balance_dirty(rc->extent_root, nr); | ||
3307 | |||
3308 | if (rc->stage == MOVE_DATA_EXTENTS && | ||
3309 | (flags & BTRFS_EXTENT_FLAG_DATA)) { | ||
3310 | rc->found_file_extent = 1; | ||
3311 | ret = relocate_data_extent(rc->data_inode, &key); | ||
3312 | if (ret < 0) { | ||
3313 | err = ret; | ||
3314 | break; | ||
3315 | } | ||
3316 | } | ||
3317 | } | ||
3318 | btrfs_free_path(path); | ||
3319 | |||
3320 | if (trans) { | ||
3321 | nr = trans->blocks_used; | ||
3322 | btrfs_end_transaction(trans, rc->extent_root); | ||
3323 | btrfs_btree_balance_dirty(rc->extent_root, nr); | ||
3324 | } | ||
3325 | |||
3326 | rc->create_reloc_root = 0; | ||
3327 | smp_mb(); | ||
3328 | |||
3329 | if (rc->extents_found > 0) { | ||
3330 | trans = btrfs_start_transaction(rc->extent_root, 1); | ||
3331 | btrfs_commit_transaction(trans, rc->extent_root); | ||
3332 | } | ||
3333 | |||
3334 | merge_reloc_roots(rc); | ||
3335 | |||
3336 | unset_reloc_control(rc); | ||
3337 | |||
3338 | /* get rid of pinned extents */ | ||
3339 | trans = btrfs_start_transaction(rc->extent_root, 1); | ||
3340 | btrfs_commit_transaction(trans, rc->extent_root); | ||
3341 | |||
3342 | return err; | ||
3343 | } | ||
3344 | |||
3345 | static int __insert_orphan_inode(struct btrfs_trans_handle *trans, | ||
3346 | struct btrfs_root *root, | ||
3347 | u64 objectid, u64 size) | ||
3348 | { | ||
3349 | struct btrfs_path *path; | ||
3350 | struct btrfs_inode_item *item; | ||
3351 | struct extent_buffer *leaf; | ||
3352 | int ret; | ||
3353 | |||
3354 | path = btrfs_alloc_path(); | ||
3355 | if (!path) | ||
3356 | return -ENOMEM; | ||
3357 | |||
3358 | ret = btrfs_insert_empty_inode(trans, root, path, objectid); | ||
3359 | if (ret) | ||
3360 | goto out; | ||
3361 | |||
3362 | leaf = path->nodes[0]; | ||
3363 | item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); | ||
3364 | memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); | ||
3365 | btrfs_set_inode_generation(leaf, item, 1); | ||
3366 | btrfs_set_inode_size(leaf, item, size); | ||
3367 | btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); | ||
3368 | btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); | ||
3369 | btrfs_mark_buffer_dirty(leaf); | ||
3370 | btrfs_release_path(root, path); | ||
3371 | out: | ||
3372 | btrfs_free_path(path); | ||
3373 | return ret; | ||
3374 | } | ||
3375 | |||
3376 | /* | ||
3377 | * helper to create an inode for data relocation. | ||
3378 | * the inode is in the data relocation tree and its link count is 0 | ||
3379 | */ | ||
3380 | static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, | ||
3381 | struct btrfs_block_group_cache *group) | ||
3382 | { | ||
3383 | struct inode *inode = NULL; | ||
3384 | struct btrfs_trans_handle *trans; | ||
3385 | struct btrfs_root *root; | ||
3386 | struct btrfs_key key; | ||
3387 | unsigned long nr; | ||
3388 | u64 objectid = BTRFS_FIRST_FREE_OBJECTID; | ||
3389 | int err = 0; | ||
3390 | |||
3391 | root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); | ||
3392 | if (IS_ERR(root)) | ||
3393 | return ERR_CAST(root); | ||
3394 | |||
3395 | trans = btrfs_start_transaction(root, 1); | ||
3396 | BUG_ON(!trans); | ||
3397 | |||
3398 | err = btrfs_find_free_objectid(trans, root, objectid, &objectid); | ||
3399 | if (err) | ||
3400 | goto out; | ||
3401 | |||
3402 | err = __insert_orphan_inode(trans, root, objectid, group->key.offset); | ||
3403 | BUG_ON(err); | ||
3404 | |||
3405 | err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, | ||
3406 | group->key.offset, 0, group->key.offset, | ||
3407 | 0, 0, 0); | ||
3408 | BUG_ON(err); | ||
3409 | |||
3410 | key.objectid = objectid; | ||
3411 | key.type = BTRFS_INODE_ITEM_KEY; | ||
3412 | key.offset = 0; | ||
3413 | inode = btrfs_iget(root->fs_info->sb, &key, root); | ||
3414 | BUG_ON(IS_ERR(inode) || is_bad_inode(inode)); | ||
3415 | BTRFS_I(inode)->index_cnt = group->key.objectid; | ||
3416 | |||
3417 | err = btrfs_orphan_add(trans, inode); | ||
3418 | out: | ||
3419 | nr = trans->blocks_used; | ||
3420 | btrfs_end_transaction(trans, root); | ||
3421 | |||
3422 | btrfs_btree_balance_dirty(root, nr); | ||
3423 | if (err) { | ||
3424 | if (inode) | ||
3425 | iput(inode); | ||
3426 | inode = ERR_PTR(err); | ||
3427 | } | ||
3428 | return inode; | ||
3429 | } | ||
3430 | |||
3431 | /* | ||
3432 | * function to relocate all extents in a block group. | ||
3433 | */ | ||
3434 | int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) | ||
3435 | { | ||
3436 | struct btrfs_fs_info *fs_info = extent_root->fs_info; | ||
3437 | struct reloc_control *rc; | ||
3438 | int ret; | ||
3439 | int err = 0; | ||
3440 | |||
3441 | rc = kzalloc(sizeof(*rc), GFP_NOFS); | ||
3442 | if (!rc) | ||
3443 | return -ENOMEM; | ||
3444 | |||
3445 | mapping_tree_init(&rc->reloc_root_tree); | ||
3446 | extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS); | ||
3447 | INIT_LIST_HEAD(&rc->reloc_roots); | ||
3448 | |||
3449 | rc->block_group = btrfs_lookup_block_group(fs_info, group_start); | ||
3450 | BUG_ON(!rc->block_group); | ||
3451 | |||
3452 | btrfs_init_workers(&rc->workers, "relocate", | ||
3453 | fs_info->thread_pool_size); | ||
3454 | |||
3455 | rc->extent_root = extent_root; | ||
3456 | btrfs_prepare_block_group_relocation(extent_root, rc->block_group); | ||
3457 | |||
3458 | rc->data_inode = create_reloc_inode(fs_info, rc->block_group); | ||
3459 | if (IS_ERR(rc->data_inode)) { | ||
3460 | err = PTR_ERR(rc->data_inode); | ||
3461 | rc->data_inode = NULL; | ||
3462 | goto out; | ||
3463 | } | ||
3464 | |||
3465 | printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n", | ||
3466 | (unsigned long long)rc->block_group->key.objectid, | ||
3467 | (unsigned long long)rc->block_group->flags); | ||
3468 | |||
3469 | btrfs_start_delalloc_inodes(fs_info->tree_root); | ||
3470 | btrfs_wait_ordered_extents(fs_info->tree_root, 0); | ||
3471 | |||
3472 | while (1) { | ||
3473 | mutex_lock(&fs_info->cleaner_mutex); | ||
3474 | btrfs_clean_old_snapshots(fs_info->tree_root); | ||
3475 | mutex_unlock(&fs_info->cleaner_mutex); | ||
3476 | |||
3477 | rc->extents_found = 0; | ||
3478 | rc->extents_skipped = 0; | ||
3479 | |||
3480 | ret = relocate_block_group(rc); | ||
3481 | if (ret < 0) { | ||
3482 | err = ret; | ||
3483 | break; | ||
3484 | } | ||
3485 | |||
3486 | if (rc->extents_found == 0) | ||
3487 | break; | ||
3488 | |||
3489 | printk(KERN_INFO "btrfs: found %llu extents\n", | ||
3490 | (unsigned long long)rc->extents_found); | ||
3491 | |||
3492 | if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { | ||
3493 | btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1); | ||
3494 | invalidate_mapping_pages(rc->data_inode->i_mapping, | ||
3495 | 0, -1); | ||
3496 | rc->stage = UPDATE_DATA_PTRS; | ||
3497 | } else if (rc->stage == UPDATE_DATA_PTRS && | ||
3498 | rc->extents_skipped >= rc->extents_found) { | ||
3499 | iput(rc->data_inode); | ||
3500 | rc->data_inode = create_reloc_inode(fs_info, | ||
3501 | rc->block_group); | ||
3502 | if (IS_ERR(rc->data_inode)) { | ||
3503 | err = PTR_ERR(rc->data_inode); | ||
3504 | rc->data_inode = NULL; | ||
3505 | break; | ||
3506 | } | ||
3507 | rc->stage = MOVE_DATA_EXTENTS; | ||
3508 | rc->found_file_extent = 0; | ||
3509 | } | ||
3510 | } | ||
3511 | |||
3512 | filemap_fdatawrite_range(fs_info->btree_inode->i_mapping, | ||
3513 | rc->block_group->key.objectid, | ||
3514 | rc->block_group->key.objectid + | ||
3515 | rc->block_group->key.offset - 1); | ||
3516 | |||
3517 | WARN_ON(rc->block_group->pinned > 0); | ||
3518 | WARN_ON(rc->block_group->reserved > 0); | ||
3519 | WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); | ||
3520 | out: | ||
3521 | iput(rc->data_inode); | ||
3522 | btrfs_stop_workers(&rc->workers); | ||
3523 | btrfs_put_block_group(rc->block_group); | ||
3524 | kfree(rc); | ||
3525 | return err; | ||
3526 | } | ||
3527 | |||
3528 | /* | ||
3529 | * recover relocation interrupted by system crash. | ||
3530 | * | ||
3531 | * this function resumes merging reloc trees with corresponding fs trees. | ||
3532 | * this is important for keeping the sharing of tree blocks | ||
3533 | */ | ||
3534 | int btrfs_recover_relocation(struct btrfs_root *root) | ||
3535 | { | ||
3536 | LIST_HEAD(reloc_roots); | ||
3537 | struct btrfs_key key; | ||
3538 | struct btrfs_root *fs_root; | ||
3539 | struct btrfs_root *reloc_root; | ||
3540 | struct btrfs_path *path; | ||
3541 | struct extent_buffer *leaf; | ||
3542 | struct reloc_control *rc = NULL; | ||
3543 | struct btrfs_trans_handle *trans; | ||
3544 | int ret; | ||
3545 | int err = 0; | ||
3546 | |||
3547 | path = btrfs_alloc_path(); | ||
3548 | if (!path) | ||
3549 | return -ENOMEM; | ||
3550 | |||
3551 | key.objectid = BTRFS_TREE_RELOC_OBJECTID; | ||
3552 | key.type = BTRFS_ROOT_ITEM_KEY; | ||
3553 | key.offset = (u64)-1; | ||
3554 | |||
3555 | while (1) { | ||
3556 | ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, | ||
3557 | path, 0, 0); | ||
3558 | if (ret < 0) { | ||
3559 | err = ret; | ||
3560 | goto out; | ||
3561 | } | ||
3562 | if (ret > 0) { | ||
3563 | if (path->slots[0] == 0) | ||
3564 | break; | ||
3565 | path->slots[0]--; | ||
3566 | } | ||
3567 | leaf = path->nodes[0]; | ||
3568 | btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); | ||
3569 | btrfs_release_path(root->fs_info->tree_root, path); | ||
3570 | |||
3571 | if (key.objectid != BTRFS_TREE_RELOC_OBJECTID || | ||
3572 | key.type != BTRFS_ROOT_ITEM_KEY) | ||
3573 | break; | ||
3574 | |||
3575 | reloc_root = btrfs_read_fs_root_no_radix(root, &key); | ||
3576 | if (IS_ERR(reloc_root)) { | ||
3577 | err = PTR_ERR(reloc_root); | ||
3578 | goto out; | ||
3579 | } | ||
3580 | |||
3581 | list_add(&reloc_root->root_list, &reloc_roots); | ||
3582 | |||
3583 | if (btrfs_root_refs(&reloc_root->root_item) > 0) { | ||
3584 | fs_root = read_fs_root(root->fs_info, | ||
3585 | reloc_root->root_key.offset); | ||
3586 | if (IS_ERR(fs_root)) { | ||
3587 | err = PTR_ERR(fs_root); | ||
3588 | goto out; | ||
3589 | } | ||
3590 | } | ||
3591 | |||
3592 | if (key.offset == 0) | ||
3593 | break; | ||
3594 | |||
3595 | key.offset--; | ||
3596 | } | ||
3597 | btrfs_release_path(root->fs_info->tree_root, path); | ||
3598 | |||
3599 | if (list_empty(&reloc_roots)) | ||
3600 | goto out; | ||
3601 | |||
3602 | rc = kzalloc(sizeof(*rc), GFP_NOFS); | ||
3603 | if (!rc) { | ||
3604 | err = -ENOMEM; | ||
3605 | goto out; | ||
3606 | } | ||
3607 | |||
3608 | mapping_tree_init(&rc->reloc_root_tree); | ||
3609 | INIT_LIST_HEAD(&rc->reloc_roots); | ||
3610 | btrfs_init_workers(&rc->workers, "relocate", | ||
3611 | root->fs_info->thread_pool_size); | ||
3612 | rc->extent_root = root->fs_info->extent_root; | ||
3613 | |||
3614 | set_reloc_control(rc); | ||
3615 | |||
3616 | while (!list_empty(&reloc_roots)) { | ||
3617 | reloc_root = list_entry(reloc_roots.next, | ||
3618 | struct btrfs_root, root_list); | ||
3619 | list_del(&reloc_root->root_list); | ||
3620 | |||
3621 | if (btrfs_root_refs(&reloc_root->root_item) == 0) { | ||
3622 | list_add_tail(&reloc_root->root_list, | ||
3623 | &rc->reloc_roots); | ||
3624 | continue; | ||
3625 | } | ||
3626 | |||
3627 | fs_root = read_fs_root(root->fs_info, | ||
3628 | reloc_root->root_key.offset); | ||
3629 | BUG_ON(IS_ERR(fs_root)); | ||
3630 | |||
3631 | __add_reloc_root(reloc_root); | ||
3632 | fs_root->reloc_root = reloc_root; | ||
3633 | } | ||
3634 | |||
3635 | trans = btrfs_start_transaction(rc->extent_root, 1); | ||
3636 | btrfs_commit_transaction(trans, rc->extent_root); | ||
3637 | |||
3638 | merge_reloc_roots(rc); | ||
3639 | |||
3640 | unset_reloc_control(rc); | ||
3641 | |||
3642 | trans = btrfs_start_transaction(rc->extent_root, 1); | ||
3643 | btrfs_commit_transaction(trans, rc->extent_root); | ||
3644 | out: | ||
3645 | if (rc) { | ||
3646 | btrfs_stop_workers(&rc->workers); | ||
3647 | kfree(rc); | ||
3648 | } | ||
3649 | while (!list_empty(&reloc_roots)) { | ||
3650 | reloc_root = list_entry(reloc_roots.next, | ||
3651 | struct btrfs_root, root_list); | ||
3652 | list_del(&reloc_root->root_list); | ||
3653 | free_extent_buffer(reloc_root->node); | ||
3654 | free_extent_buffer(reloc_root->commit_root); | ||
3655 | kfree(reloc_root); | ||
3656 | } | ||
3657 | btrfs_free_path(path); | ||
3658 | |||
3659 | if (err == 0) { | ||
3660 | /* cleanup orphan inode in data relocation tree */ | ||
3661 | fs_root = read_fs_root(root->fs_info, | ||
3662 | BTRFS_DATA_RELOC_TREE_OBJECTID); | ||
3663 | if (IS_ERR(fs_root)) | ||
3664 | err = PTR_ERR(fs_root); | ||
3665 | } | ||
3666 | return err; | ||
3667 | } | ||
3668 | |||
3669 | /* | ||
3670 | * helper to add ordered checksum for data relocation. | ||
3671 | * | ||
3672 | * cloning checksum properly handles the nodatasum extents. | ||
3673 | * it also saves the CPU time of re-calculating the checksum. | ||
3674 | */ | ||
3675 | int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) | ||
3676 | { | ||
3677 | struct btrfs_ordered_sum *sums; | ||
3678 | struct btrfs_sector_sum *sector_sum; | ||
3679 | struct btrfs_ordered_extent *ordered; | ||
3680 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
3681 | size_t offset; | ||
3682 | int ret; | ||
3683 | u64 disk_bytenr; | ||
3684 | LIST_HEAD(list); | ||
3685 | |||
3686 | ordered = btrfs_lookup_ordered_extent(inode, file_pos); | ||
3687 | BUG_ON(ordered->file_offset != file_pos || ordered->len != len); | ||
3688 | |||
3689 | disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; | ||
3690 | ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, | ||
3691 | disk_bytenr + len - 1, &list); | ||
3692 | |||
3693 | while (!list_empty(&list)) { | ||
3694 | sums = list_entry(list.next, struct btrfs_ordered_sum, list); | ||
3695 | list_del_init(&sums->list); | ||
3696 | |||
3697 | sector_sum = sums->sums; | ||
3698 | sums->bytenr = ordered->start; | ||
3699 | |||
3700 | offset = 0; | ||
3701 | while (offset < sums->len) { | ||
3702 | sector_sum->bytenr += ordered->start - disk_bytenr; | ||
3703 | sector_sum++; | ||
3704 | offset += root->sectorsize; | ||
3705 | } | ||
3706 | |||
3707 | btrfs_add_ordered_sum(inode, ordered, sums); | ||
3708 | } | ||
3709 | btrfs_put_ordered_extent(ordered); | ||
3710 | return 0; | ||
3711 | } | ||
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index b48650de4472..0ddc6d61c55a 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c | |||
@@ -111,6 +111,15 @@ out: | |||
111 | return ret; | 111 | return ret; |
112 | } | 112 | } |
113 | 113 | ||
114 | int btrfs_set_root_node(struct btrfs_root_item *item, | ||
115 | struct extent_buffer *node) | ||
116 | { | ||
117 | btrfs_set_root_bytenr(item, node->start); | ||
118 | btrfs_set_root_level(item, btrfs_header_level(node)); | ||
119 | btrfs_set_root_generation(item, btrfs_header_generation(node)); | ||
120 | return 0; | ||
121 | } | ||
122 | |||
114 | /* | 123 | /* |
115 | * copy the data in 'item' into the btree | 124 | * copy the data in 'item' into the btree |
116 | */ | 125 | */ |
@@ -164,8 +173,7 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root | |||
164 | * offset lower than the latest root. They need to be queued for deletion to | 173 | * offset lower than the latest root. They need to be queued for deletion to |
165 | * finish what was happening when we crashed. | 174 | * finish what was happening when we crashed. |
166 | */ | 175 | */ |
167 | int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, | 176 | int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid) |
168 | struct btrfs_root *latest) | ||
169 | { | 177 | { |
170 | struct btrfs_root *dead_root; | 178 | struct btrfs_root *dead_root; |
171 | struct btrfs_item *item; | 179 | struct btrfs_item *item; |
@@ -227,10 +235,7 @@ again: | |||
227 | goto err; | 235 | goto err; |
228 | } | 236 | } |
229 | 237 | ||
230 | if (objectid == BTRFS_TREE_RELOC_OBJECTID) | 238 | ret = btrfs_add_dead_root(dead_root); |
231 | ret = btrfs_add_dead_reloc_root(dead_root); | ||
232 | else | ||
233 | ret = btrfs_add_dead_root(dead_root, latest); | ||
234 | if (ret) | 239 | if (ret) |
235 | goto err; | 240 | goto err; |
236 | goto again; | 241 | goto again; |
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 2ff7cd2db25f..9f179d4832d5 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c | |||
@@ -52,7 +52,6 @@ | |||
52 | #include "export.h" | 52 | #include "export.h" |
53 | #include "compression.h" | 53 | #include "compression.h" |
54 | 54 | ||
55 | |||
56 | static struct super_operations btrfs_super_ops; | 55 | static struct super_operations btrfs_super_ops; |
57 | 56 | ||
58 | static void btrfs_put_super(struct super_block *sb) | 57 | static void btrfs_put_super(struct super_block *sb) |
@@ -67,8 +66,8 @@ static void btrfs_put_super(struct super_block *sb) | |||
67 | enum { | 66 | enum { |
68 | Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, | 67 | Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, |
69 | Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, | 68 | Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, |
70 | Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog, | 69 | Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, |
71 | Opt_ratio, Opt_flushoncommit, Opt_err, | 70 | Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err, |
72 | }; | 71 | }; |
73 | 72 | ||
74 | static match_table_t tokens = { | 73 | static match_table_t tokens = { |
@@ -84,6 +83,8 @@ static match_table_t tokens = { | |||
84 | {Opt_thread_pool, "thread_pool=%d"}, | 83 | {Opt_thread_pool, "thread_pool=%d"}, |
85 | {Opt_compress, "compress"}, | 84 | {Opt_compress, "compress"}, |
86 | {Opt_ssd, "ssd"}, | 85 | {Opt_ssd, "ssd"}, |
86 | {Opt_ssd_spread, "ssd_spread"}, | ||
87 | {Opt_nossd, "nossd"}, | ||
87 | {Opt_noacl, "noacl"}, | 88 | {Opt_noacl, "noacl"}, |
88 | {Opt_notreelog, "notreelog"}, | 89 | {Opt_notreelog, "notreelog"}, |
89 | {Opt_flushoncommit, "flushoncommit"}, | 90 | {Opt_flushoncommit, "flushoncommit"}, |
@@ -158,7 +159,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) | |||
158 | */ | 159 | */ |
159 | break; | 160 | break; |
160 | case Opt_nodatasum: | 161 | case Opt_nodatasum: |
161 | printk(KERN_INFO "btrfs: setting nodatacsum\n"); | 162 | printk(KERN_INFO "btrfs: setting nodatasum\n"); |
162 | btrfs_set_opt(info->mount_opt, NODATASUM); | 163 | btrfs_set_opt(info->mount_opt, NODATASUM); |
163 | break; | 164 | break; |
164 | case Opt_nodatacow: | 165 | case Opt_nodatacow: |
@@ -174,6 +175,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) | |||
174 | printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); | 175 | printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); |
175 | btrfs_set_opt(info->mount_opt, SSD); | 176 | btrfs_set_opt(info->mount_opt, SSD); |
176 | break; | 177 | break; |
178 | case Opt_ssd_spread: | ||
179 | printk(KERN_INFO "btrfs: use spread ssd " | ||
180 | "allocation scheme\n"); | ||
181 | btrfs_set_opt(info->mount_opt, SSD); | ||
182 | btrfs_set_opt(info->mount_opt, SSD_SPREAD); | ||
183 | break; | ||
184 | case Opt_nossd: | ||
185 | printk(KERN_INFO "btrfs: not using ssd allocation " | ||
186 | "scheme\n"); | ||
187 | btrfs_set_opt(info->mount_opt, NOSSD); | ||
188 | btrfs_clear_opt(info->mount_opt, SSD); | ||
189 | btrfs_clear_opt(info->mount_opt, SSD_SPREAD); | ||
190 | break; | ||
177 | case Opt_nobarrier: | 191 | case Opt_nobarrier: |
178 | printk(KERN_INFO "btrfs: turning off barriers\n"); | 192 | printk(KERN_INFO "btrfs: turning off barriers\n"); |
179 | btrfs_set_opt(info->mount_opt, NOBARRIER); | 193 | btrfs_set_opt(info->mount_opt, NOBARRIER); |
@@ -322,7 +336,7 @@ static int btrfs_fill_super(struct super_block *sb, | |||
322 | struct dentry *root_dentry; | 336 | struct dentry *root_dentry; |
323 | struct btrfs_super_block *disk_super; | 337 | struct btrfs_super_block *disk_super; |
324 | struct btrfs_root *tree_root; | 338 | struct btrfs_root *tree_root; |
325 | struct btrfs_inode *bi; | 339 | struct btrfs_key key; |
326 | int err; | 340 | int err; |
327 | 341 | ||
328 | sb->s_maxbytes = MAX_LFS_FILESIZE; | 342 | sb->s_maxbytes = MAX_LFS_FILESIZE; |
@@ -341,23 +355,15 @@ static int btrfs_fill_super(struct super_block *sb, | |||
341 | } | 355 | } |
342 | sb->s_fs_info = tree_root; | 356 | sb->s_fs_info = tree_root; |
343 | disk_super = &tree_root->fs_info->super_copy; | 357 | disk_super = &tree_root->fs_info->super_copy; |
344 | inode = btrfs_iget_locked(sb, BTRFS_FIRST_FREE_OBJECTID, | ||
345 | tree_root->fs_info->fs_root); | ||
346 | bi = BTRFS_I(inode); | ||
347 | bi->location.objectid = inode->i_ino; | ||
348 | bi->location.offset = 0; | ||
349 | bi->root = tree_root->fs_info->fs_root; | ||
350 | |||
351 | btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY); | ||
352 | 358 | ||
353 | if (!inode) { | 359 | key.objectid = BTRFS_FIRST_FREE_OBJECTID; |
354 | err = -ENOMEM; | 360 | key.type = BTRFS_INODE_ITEM_KEY; |
361 | key.offset = 0; | ||
362 | inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root); | ||
363 | if (IS_ERR(inode)) { | ||
364 | err = PTR_ERR(inode); | ||
355 | goto fail_close; | 365 | goto fail_close; |
356 | } | 366 | } |
357 | if (inode->i_state & I_NEW) { | ||
358 | btrfs_read_locked_inode(inode); | ||
359 | unlock_new_inode(inode); | ||
360 | } | ||
361 | 367 | ||
362 | root_dentry = d_alloc_root(inode); | 368 | root_dentry = d_alloc_root(inode); |
363 | if (!root_dentry) { | 369 | if (!root_dentry) { |
@@ -388,10 +394,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait) | |||
388 | struct btrfs_root *root = btrfs_sb(sb); | 394 | struct btrfs_root *root = btrfs_sb(sb); |
389 | int ret; | 395 | int ret; |
390 | 396 | ||
391 | if (sb->s_flags & MS_RDONLY) | ||
392 | return 0; | ||
393 | |||
394 | sb->s_dirt = 0; | ||
395 | if (!wait) { | 397 | if (!wait) { |
396 | filemap_flush(root->fs_info->btree_inode->i_mapping); | 398 | filemap_flush(root->fs_info->btree_inode->i_mapping); |
397 | return 0; | 399 | return 0; |
@@ -402,7 +404,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait) | |||
402 | 404 | ||
403 | trans = btrfs_start_transaction(root, 1); | 405 | trans = btrfs_start_transaction(root, 1); |
404 | ret = btrfs_commit_transaction(trans, root); | 406 | ret = btrfs_commit_transaction(trans, root); |
405 | sb->s_dirt = 0; | ||
406 | return ret; | 407 | return ret; |
407 | } | 408 | } |
408 | 409 | ||
@@ -433,7 +434,11 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
433 | seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); | 434 | seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); |
434 | if (btrfs_test_opt(root, COMPRESS)) | 435 | if (btrfs_test_opt(root, COMPRESS)) |
435 | seq_puts(seq, ",compress"); | 436 | seq_puts(seq, ",compress"); |
436 | if (btrfs_test_opt(root, SSD)) | 437 | if (btrfs_test_opt(root, NOSSD)) |
438 | seq_puts(seq, ",nossd"); | ||
439 | if (btrfs_test_opt(root, SSD_SPREAD)) | ||
440 | seq_puts(seq, ",ssd_spread"); | ||
441 | else if (btrfs_test_opt(root, SSD)) | ||
437 | seq_puts(seq, ",ssd"); | 442 | seq_puts(seq, ",ssd"); |
438 | if (btrfs_test_opt(root, NOTREELOG)) | 443 | if (btrfs_test_opt(root, NOTREELOG)) |
439 | seq_puts(seq, ",notreelog"); | 444 | seq_puts(seq, ",notreelog"); |
@@ -444,11 +449,6 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
444 | return 0; | 449 | return 0; |
445 | } | 450 | } |
446 | 451 | ||
447 | static void btrfs_write_super(struct super_block *sb) | ||
448 | { | ||
449 | sb->s_dirt = 0; | ||
450 | } | ||
451 | |||
452 | static int btrfs_test_super(struct super_block *s, void *data) | 452 | static int btrfs_test_super(struct super_block *s, void *data) |
453 | { | 453 | { |
454 | struct btrfs_fs_devices *test_fs_devices = data; | 454 | struct btrfs_fs_devices *test_fs_devices = data; |
@@ -584,7 +584,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) | |||
584 | if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) | 584 | if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) |
585 | return -EINVAL; | 585 | return -EINVAL; |
586 | 586 | ||
587 | ret = btrfs_cleanup_reloc_trees(root); | 587 | /* recover relocation */ |
588 | ret = btrfs_recover_relocation(root); | ||
588 | WARN_ON(ret); | 589 | WARN_ON(ret); |
589 | 590 | ||
590 | ret = btrfs_cleanup_fs_roots(root->fs_info); | 591 | ret = btrfs_cleanup_fs_roots(root->fs_info); |
@@ -678,7 +679,6 @@ static int btrfs_unfreeze(struct super_block *sb) | |||
678 | static struct super_operations btrfs_super_ops = { | 679 | static struct super_operations btrfs_super_ops = { |
679 | .delete_inode = btrfs_delete_inode, | 680 | .delete_inode = btrfs_delete_inode, |
680 | .put_super = btrfs_put_super, | 681 | .put_super = btrfs_put_super, |
681 | .write_super = btrfs_write_super, | ||
682 | .sync_fs = btrfs_sync_fs, | 682 | .sync_fs = btrfs_sync_fs, |
683 | .show_options = btrfs_show_options, | 683 | .show_options = btrfs_show_options, |
684 | .write_inode = btrfs_write_inode, | 684 | .write_inode = btrfs_write_inode, |
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 01b143605ec1..2e177d7f4bb9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c | |||
@@ -25,7 +25,6 @@ | |||
25 | #include "disk-io.h" | 25 | #include "disk-io.h" |
26 | #include "transaction.h" | 26 | #include "transaction.h" |
27 | #include "locking.h" | 27 | #include "locking.h" |
28 | #include "ref-cache.h" | ||
29 | #include "tree-log.h" | 28 | #include "tree-log.h" |
30 | 29 | ||
31 | #define BTRFS_ROOT_TRANS_TAG 0 | 30 | #define BTRFS_ROOT_TRANS_TAG 0 |
@@ -94,45 +93,37 @@ static noinline int join_transaction(struct btrfs_root *root) | |||
94 | * to make sure the old root from before we joined the transaction is deleted | 93 | * to make sure the old root from before we joined the transaction is deleted |
95 | * when the transaction commits | 94 | * when the transaction commits |
96 | */ | 95 | */ |
97 | noinline int btrfs_record_root_in_trans(struct btrfs_root *root) | 96 | static noinline int record_root_in_trans(struct btrfs_trans_handle *trans, |
97 | struct btrfs_root *root) | ||
98 | { | 98 | { |
99 | struct btrfs_dirty_root *dirty; | 99 | if (root->ref_cows && root->last_trans < trans->transid) { |
100 | u64 running_trans_id = root->fs_info->running_transaction->transid; | ||
101 | if (root->ref_cows && root->last_trans < running_trans_id) { | ||
102 | WARN_ON(root == root->fs_info->extent_root); | 100 | WARN_ON(root == root->fs_info->extent_root); |
103 | if (root->root_item.refs != 0) { | 101 | WARN_ON(root->root_item.refs == 0); |
104 | radix_tree_tag_set(&root->fs_info->fs_roots_radix, | 102 | WARN_ON(root->commit_root != root->node); |
105 | (unsigned long)root->root_key.objectid, | 103 | |
106 | BTRFS_ROOT_TRANS_TAG); | 104 | radix_tree_tag_set(&root->fs_info->fs_roots_radix, |
107 | 105 | (unsigned long)root->root_key.objectid, | |
108 | dirty = kmalloc(sizeof(*dirty), GFP_NOFS); | 106 | BTRFS_ROOT_TRANS_TAG); |
109 | BUG_ON(!dirty); | 107 | root->last_trans = trans->transid; |
110 | dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS); | 108 | btrfs_init_reloc_root(trans, root); |
111 | BUG_ON(!dirty->root); | 109 | } |
112 | dirty->latest_root = root; | 110 | return 0; |
113 | INIT_LIST_HEAD(&dirty->list); | 111 | } |
114 | |||
115 | root->commit_root = btrfs_root_node(root); | ||
116 | |||
117 | memcpy(dirty->root, root, sizeof(*root)); | ||
118 | spin_lock_init(&dirty->root->node_lock); | ||
119 | spin_lock_init(&dirty->root->list_lock); | ||
120 | mutex_init(&dirty->root->objectid_mutex); | ||
121 | mutex_init(&dirty->root->log_mutex); | ||
122 | INIT_LIST_HEAD(&dirty->root->dead_list); | ||
123 | dirty->root->node = root->commit_root; | ||
124 | dirty->root->commit_root = NULL; | ||
125 | 112 | ||
126 | spin_lock(&root->list_lock); | 113 | int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, |
127 | list_add(&dirty->root->dead_list, &root->dead_list); | 114 | struct btrfs_root *root) |
128 | spin_unlock(&root->list_lock); | 115 | { |
116 | if (!root->ref_cows) | ||
117 | return 0; | ||
129 | 118 | ||
130 | root->dirty_root = dirty; | 119 | mutex_lock(&root->fs_info->trans_mutex); |
131 | } else { | 120 | if (root->last_trans == trans->transid) { |
132 | WARN_ON(1); | 121 | mutex_unlock(&root->fs_info->trans_mutex); |
133 | } | 122 | return 0; |
134 | root->last_trans = running_trans_id; | ||
135 | } | 123 | } |
124 | |||
125 | record_root_in_trans(trans, root); | ||
126 | mutex_unlock(&root->fs_info->trans_mutex); | ||
136 | return 0; | 127 | return 0; |
137 | } | 128 | } |
138 | 129 | ||
@@ -181,7 +172,6 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, | |||
181 | ret = join_transaction(root); | 172 | ret = join_transaction(root); |
182 | BUG_ON(ret); | 173 | BUG_ON(ret); |
183 | 174 | ||
184 | btrfs_record_root_in_trans(root); | ||
185 | h->transid = root->fs_info->running_transaction->transid; | 175 | h->transid = root->fs_info->running_transaction->transid; |
186 | h->transaction = root->fs_info->running_transaction; | 176 | h->transaction = root->fs_info->running_transaction; |
187 | h->blocks_reserved = num_blocks; | 177 | h->blocks_reserved = num_blocks; |
@@ -192,6 +182,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, | |||
192 | h->delayed_ref_updates = 0; | 182 | h->delayed_ref_updates = 0; |
193 | 183 | ||
194 | root->fs_info->running_transaction->use_count++; | 184 | root->fs_info->running_transaction->use_count++; |
185 | record_root_in_trans(h, root); | ||
195 | mutex_unlock(&root->fs_info->trans_mutex); | 186 | mutex_unlock(&root->fs_info->trans_mutex); |
196 | return h; | 187 | return h; |
197 | } | 188 | } |
@@ -233,6 +224,7 @@ static noinline int wait_for_commit(struct btrfs_root *root, | |||
233 | return 0; | 224 | return 0; |
234 | } | 225 | } |
235 | 226 | ||
227 | #if 0 | ||
236 | /* | 228 | /* |
237 | * rate limit against the drop_snapshot code. This helps to slow down new | 229 | * rate limit against the drop_snapshot code. This helps to slow down new |
238 | * operations if the drop_snapshot code isn't able to keep up. | 230 | * operations if the drop_snapshot code isn't able to keep up. |
@@ -273,6 +265,7 @@ harder: | |||
273 | goto harder; | 265 | goto harder; |
274 | } | 266 | } |
275 | } | 267 | } |
268 | #endif | ||
276 | 269 | ||
277 | void btrfs_throttle(struct btrfs_root *root) | 270 | void btrfs_throttle(struct btrfs_root *root) |
278 | { | 271 | { |
@@ -280,7 +273,6 @@ void btrfs_throttle(struct btrfs_root *root) | |||
280 | if (!root->fs_info->open_ioctl_trans) | 273 | if (!root->fs_info->open_ioctl_trans) |
281 | wait_current_trans(root); | 274 | wait_current_trans(root); |
282 | mutex_unlock(&root->fs_info->trans_mutex); | 275 | mutex_unlock(&root->fs_info->trans_mutex); |
283 | throttle_on_drops(root); | ||
284 | } | 276 | } |
285 | 277 | ||
286 | static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, | 278 | static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, |
@@ -323,9 +315,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, | |||
323 | memset(trans, 0, sizeof(*trans)); | 315 | memset(trans, 0, sizeof(*trans)); |
324 | kmem_cache_free(btrfs_trans_handle_cachep, trans); | 316 | kmem_cache_free(btrfs_trans_handle_cachep, trans); |
325 | 317 | ||
326 | if (throttle) | ||
327 | throttle_on_drops(root); | ||
328 | |||
329 | return 0; | 318 | return 0; |
330 | } | 319 | } |
331 | 320 | ||
@@ -462,12 +451,8 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, | |||
462 | old_root_bytenr = btrfs_root_bytenr(&root->root_item); | 451 | old_root_bytenr = btrfs_root_bytenr(&root->root_item); |
463 | if (old_root_bytenr == root->node->start) | 452 | if (old_root_bytenr == root->node->start) |
464 | break; | 453 | break; |
465 | btrfs_set_root_bytenr(&root->root_item, | ||
466 | root->node->start); | ||
467 | btrfs_set_root_level(&root->root_item, | ||
468 | btrfs_header_level(root->node)); | ||
469 | btrfs_set_root_generation(&root->root_item, trans->transid); | ||
470 | 454 | ||
455 | btrfs_set_root_node(&root->root_item, root->node); | ||
471 | ret = btrfs_update_root(trans, tree_root, | 456 | ret = btrfs_update_root(trans, tree_root, |
472 | &root->root_key, | 457 | &root->root_key, |
473 | &root->root_item); | 458 | &root->root_item); |
@@ -477,14 +462,16 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, | |||
477 | ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); | 462 | ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); |
478 | BUG_ON(ret); | 463 | BUG_ON(ret); |
479 | } | 464 | } |
465 | free_extent_buffer(root->commit_root); | ||
466 | root->commit_root = btrfs_root_node(root); | ||
480 | return 0; | 467 | return 0; |
481 | } | 468 | } |
482 | 469 | ||
483 | /* | 470 | /* |
484 | * update all the cowonly tree roots on disk | 471 | * update all the cowonly tree roots on disk |
485 | */ | 472 | */ |
486 | int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, | 473 | static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, |
487 | struct btrfs_root *root) | 474 | struct btrfs_root *root) |
488 | { | 475 | { |
489 | struct btrfs_fs_info *fs_info = root->fs_info; | 476 | struct btrfs_fs_info *fs_info = root->fs_info; |
490 | struct list_head *next; | 477 | struct list_head *next; |
@@ -520,118 +507,54 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, | |||
520 | * a dirty root struct and adds it into the list of dead roots that need to | 507 | * a dirty root struct and adds it into the list of dead roots that need to |
521 | * be deleted | 508 | * be deleted |
522 | */ | 509 | */ |
523 | int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest) | 510 | int btrfs_add_dead_root(struct btrfs_root *root) |
524 | { | 511 | { |
525 | struct btrfs_dirty_root *dirty; | ||
526 | |||
527 | dirty = kmalloc(sizeof(*dirty), GFP_NOFS); | ||
528 | if (!dirty) | ||
529 | return -ENOMEM; | ||
530 | dirty->root = root; | ||
531 | dirty->latest_root = latest; | ||
532 | |||
533 | mutex_lock(&root->fs_info->trans_mutex); | 512 | mutex_lock(&root->fs_info->trans_mutex); |
534 | list_add(&dirty->list, &latest->fs_info->dead_roots); | 513 | list_add(&root->root_list, &root->fs_info->dead_roots); |
535 | mutex_unlock(&root->fs_info->trans_mutex); | 514 | mutex_unlock(&root->fs_info->trans_mutex); |
536 | return 0; | 515 | return 0; |
537 | } | 516 | } |
538 | 517 | ||
539 | /* | 518 | /* |
540 | * at transaction commit time we need to schedule the old roots for | 519 | * update all the cowonly tree roots on disk |
541 | * deletion via btrfs_drop_snapshot. This runs through all the | ||
542 | * reference counted roots that were modified in the current | ||
543 | * transaction and puts them into the drop list | ||
544 | */ | 520 | */ |
545 | static noinline int add_dirty_roots(struct btrfs_trans_handle *trans, | 521 | static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, |
546 | struct radix_tree_root *radix, | 522 | struct btrfs_root *root) |
547 | struct list_head *list) | ||
548 | { | 523 | { |
549 | struct btrfs_dirty_root *dirty; | ||
550 | struct btrfs_root *gang[8]; | 524 | struct btrfs_root *gang[8]; |
551 | struct btrfs_root *root; | 525 | struct btrfs_fs_info *fs_info = root->fs_info; |
552 | int i; | 526 | int i; |
553 | int ret; | 527 | int ret; |
554 | int err = 0; | 528 | int err = 0; |
555 | u32 refs; | ||
556 | 529 | ||
557 | while (1) { | 530 | while (1) { |
558 | ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0, | 531 | ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, |
532 | (void **)gang, 0, | ||
559 | ARRAY_SIZE(gang), | 533 | ARRAY_SIZE(gang), |
560 | BTRFS_ROOT_TRANS_TAG); | 534 | BTRFS_ROOT_TRANS_TAG); |
561 | if (ret == 0) | 535 | if (ret == 0) |
562 | break; | 536 | break; |
563 | for (i = 0; i < ret; i++) { | 537 | for (i = 0; i < ret; i++) { |
564 | root = gang[i]; | 538 | root = gang[i]; |
565 | radix_tree_tag_clear(radix, | 539 | radix_tree_tag_clear(&fs_info->fs_roots_radix, |
566 | (unsigned long)root->root_key.objectid, | 540 | (unsigned long)root->root_key.objectid, |
567 | BTRFS_ROOT_TRANS_TAG); | 541 | BTRFS_ROOT_TRANS_TAG); |
568 | |||
569 | BUG_ON(!root->ref_tree); | ||
570 | dirty = root->dirty_root; | ||
571 | 542 | ||
572 | btrfs_free_log(trans, root); | 543 | btrfs_free_log(trans, root); |
573 | btrfs_free_reloc_root(trans, root); | 544 | btrfs_update_reloc_root(trans, root); |
574 | |||
575 | if (root->commit_root == root->node) { | ||
576 | WARN_ON(root->node->start != | ||
577 | btrfs_root_bytenr(&root->root_item)); | ||
578 | |||
579 | free_extent_buffer(root->commit_root); | ||
580 | root->commit_root = NULL; | ||
581 | root->dirty_root = NULL; | ||
582 | |||
583 | spin_lock(&root->list_lock); | ||
584 | list_del_init(&dirty->root->dead_list); | ||
585 | spin_unlock(&root->list_lock); | ||
586 | 545 | ||
587 | kfree(dirty->root); | 546 | if (root->commit_root == root->node) |
588 | kfree(dirty); | ||
589 | |||
590 | /* make sure to update the root on disk | ||
591 | * so we get any updates to the block used | ||
592 | * counts | ||
593 | */ | ||
594 | err = btrfs_update_root(trans, | ||
595 | root->fs_info->tree_root, | ||
596 | &root->root_key, | ||
597 | &root->root_item); | ||
598 | continue; | 547 | continue; |
599 | } | ||
600 | 548 | ||
601 | memset(&root->root_item.drop_progress, 0, | 549 | free_extent_buffer(root->commit_root); |
602 | sizeof(struct btrfs_disk_key)); | 550 | root->commit_root = btrfs_root_node(root); |
603 | root->root_item.drop_level = 0; | 551 | |
604 | root->commit_root = NULL; | 552 | btrfs_set_root_node(&root->root_item, root->node); |
605 | root->dirty_root = NULL; | 553 | err = btrfs_update_root(trans, fs_info->tree_root, |
606 | root->root_key.offset = root->fs_info->generation; | ||
607 | btrfs_set_root_bytenr(&root->root_item, | ||
608 | root->node->start); | ||
609 | btrfs_set_root_level(&root->root_item, | ||
610 | btrfs_header_level(root->node)); | ||
611 | btrfs_set_root_generation(&root->root_item, | ||
612 | root->root_key.offset); | ||
613 | |||
614 | err = btrfs_insert_root(trans, root->fs_info->tree_root, | ||
615 | &root->root_key, | 554 | &root->root_key, |
616 | &root->root_item); | 555 | &root->root_item); |
617 | if (err) | 556 | if (err) |
618 | break; | 557 | break; |
619 | |||
620 | refs = btrfs_root_refs(&dirty->root->root_item); | ||
621 | btrfs_set_root_refs(&dirty->root->root_item, refs - 1); | ||
622 | err = btrfs_update_root(trans, root->fs_info->tree_root, | ||
623 | &dirty->root->root_key, | ||
624 | &dirty->root->root_item); | ||
625 | |||
626 | BUG_ON(err); | ||
627 | if (refs == 1) { | ||
628 | list_add(&dirty->list, list); | ||
629 | } else { | ||
630 | WARN_ON(1); | ||
631 | free_extent_buffer(dirty->root->node); | ||
632 | kfree(dirty->root); | ||
633 | kfree(dirty); | ||
634 | } | ||
635 | } | 558 | } |
636 | } | 559 | } |
637 | return err; | 560 | return err; |
@@ -688,12 +611,8 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info) | |||
688 | TASK_UNINTERRUPTIBLE); | 611 | TASK_UNINTERRUPTIBLE); |
689 | mutex_unlock(&info->trans_mutex); | 612 | mutex_unlock(&info->trans_mutex); |
690 | 613 | ||
691 | atomic_dec(&info->throttles); | ||
692 | wake_up(&info->transaction_throttle); | ||
693 | |||
694 | schedule(); | 614 | schedule(); |
695 | 615 | ||
696 | atomic_inc(&info->throttles); | ||
697 | mutex_lock(&info->trans_mutex); | 616 | mutex_lock(&info->trans_mutex); |
698 | finish_wait(&info->transaction_wait, &wait); | 617 | finish_wait(&info->transaction_wait, &wait); |
699 | } | 618 | } |
@@ -705,111 +624,61 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info) | |||
705 | * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on | 624 | * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on |
706 | * all of them | 625 | * all of them |
707 | */ | 626 | */ |
708 | static noinline int drop_dirty_roots(struct btrfs_root *tree_root, | 627 | int btrfs_drop_dead_root(struct btrfs_root *root) |
709 | struct list_head *list) | ||
710 | { | 628 | { |
711 | struct btrfs_dirty_root *dirty; | ||
712 | struct btrfs_trans_handle *trans; | 629 | struct btrfs_trans_handle *trans; |
630 | struct btrfs_root *tree_root = root->fs_info->tree_root; | ||
713 | unsigned long nr; | 631 | unsigned long nr; |
714 | u64 num_bytes; | 632 | int ret; |
715 | u64 bytes_used; | ||
716 | u64 max_useless; | ||
717 | int ret = 0; | ||
718 | int err; | ||
719 | |||
720 | while (!list_empty(list)) { | ||
721 | struct btrfs_root *root; | ||
722 | |||
723 | dirty = list_entry(list->prev, struct btrfs_dirty_root, list); | ||
724 | list_del_init(&dirty->list); | ||
725 | |||
726 | num_bytes = btrfs_root_used(&dirty->root->root_item); | ||
727 | root = dirty->latest_root; | ||
728 | atomic_inc(&root->fs_info->throttles); | ||
729 | |||
730 | while (1) { | ||
731 | /* | ||
732 | * we don't want to jump in and create a bunch of | ||
733 | * delayed refs if the transaction is starting to close | ||
734 | */ | ||
735 | wait_transaction_pre_flush(tree_root->fs_info); | ||
736 | trans = btrfs_start_transaction(tree_root, 1); | ||
737 | |||
738 | /* | ||
739 | * we've joined a transaction, make sure it isn't | ||
740 | * closing right now | ||
741 | */ | ||
742 | if (trans->transaction->delayed_refs.flushing) { | ||
743 | btrfs_end_transaction(trans, tree_root); | ||
744 | continue; | ||
745 | } | ||
746 | |||
747 | mutex_lock(&root->fs_info->drop_mutex); | ||
748 | ret = btrfs_drop_snapshot(trans, dirty->root); | ||
749 | if (ret != -EAGAIN) | ||
750 | break; | ||
751 | mutex_unlock(&root->fs_info->drop_mutex); | ||
752 | 633 | ||
753 | err = btrfs_update_root(trans, | 634 | while (1) { |
754 | tree_root, | 635 | /* |
755 | &dirty->root->root_key, | 636 | * we don't want to jump in and create a bunch of |
756 | &dirty->root->root_item); | 637 | * delayed refs if the transaction is starting to close |
757 | if (err) | 638 | */ |
758 | ret = err; | 639 | wait_transaction_pre_flush(tree_root->fs_info); |
759 | nr = trans->blocks_used; | 640 | trans = btrfs_start_transaction(tree_root, 1); |
760 | ret = btrfs_end_transaction(trans, tree_root); | ||
761 | BUG_ON(ret); | ||
762 | 641 | ||
763 | btrfs_btree_balance_dirty(tree_root, nr); | 642 | /* |
764 | cond_resched(); | 643 | * we've joined a transaction, make sure it isn't |
644 | * closing right now | ||
645 | */ | ||
646 | if (trans->transaction->delayed_refs.flushing) { | ||
647 | btrfs_end_transaction(trans, tree_root); | ||
648 | continue; | ||
765 | } | 649 | } |
766 | BUG_ON(ret); | ||
767 | atomic_dec(&root->fs_info->throttles); | ||
768 | wake_up(&root->fs_info->transaction_throttle); | ||
769 | 650 | ||
770 | num_bytes -= btrfs_root_used(&dirty->root->root_item); | 651 | ret = btrfs_drop_snapshot(trans, root); |
771 | bytes_used = btrfs_root_used(&root->root_item); | 652 | if (ret != -EAGAIN) |
772 | if (num_bytes) { | 653 | break; |
773 | mutex_lock(&root->fs_info->trans_mutex); | ||
774 | btrfs_record_root_in_trans(root); | ||
775 | mutex_unlock(&root->fs_info->trans_mutex); | ||
776 | btrfs_set_root_used(&root->root_item, | ||
777 | bytes_used - num_bytes); | ||
778 | } | ||
779 | 654 | ||
780 | ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key); | 655 | ret = btrfs_update_root(trans, tree_root, |
781 | if (ret) { | 656 | &root->root_key, |
782 | BUG(); | 657 | &root->root_item); |
658 | if (ret) | ||
783 | break; | 659 | break; |
784 | } | ||
785 | mutex_unlock(&root->fs_info->drop_mutex); | ||
786 | |||
787 | spin_lock(&root->list_lock); | ||
788 | list_del_init(&dirty->root->dead_list); | ||
789 | if (!list_empty(&root->dead_list)) { | ||
790 | struct btrfs_root *oldest; | ||
791 | oldest = list_entry(root->dead_list.prev, | ||
792 | struct btrfs_root, dead_list); | ||
793 | max_useless = oldest->root_key.offset - 1; | ||
794 | } else { | ||
795 | max_useless = root->root_key.offset - 1; | ||
796 | } | ||
797 | spin_unlock(&root->list_lock); | ||
798 | 660 | ||
799 | nr = trans->blocks_used; | 661 | nr = trans->blocks_used; |
800 | ret = btrfs_end_transaction(trans, tree_root); | 662 | ret = btrfs_end_transaction(trans, tree_root); |
801 | BUG_ON(ret); | 663 | BUG_ON(ret); |
802 | 664 | ||
803 | ret = btrfs_remove_leaf_refs(root, max_useless, 0); | ||
804 | BUG_ON(ret); | ||
805 | |||
806 | free_extent_buffer(dirty->root->node); | ||
807 | kfree(dirty->root); | ||
808 | kfree(dirty); | ||
809 | |||
810 | btrfs_btree_balance_dirty(tree_root, nr); | 665 | btrfs_btree_balance_dirty(tree_root, nr); |
811 | cond_resched(); | 666 | cond_resched(); |
812 | } | 667 | } |
668 | BUG_ON(ret); | ||
669 | |||
670 | ret = btrfs_del_root(trans, tree_root, &root->root_key); | ||
671 | BUG_ON(ret); | ||
672 | |||
673 | nr = trans->blocks_used; | ||
674 | ret = btrfs_end_transaction(trans, tree_root); | ||
675 | BUG_ON(ret); | ||
676 | |||
677 | free_extent_buffer(root->node); | ||
678 | free_extent_buffer(root->commit_root); | ||
679 | kfree(root); | ||
680 | |||
681 | btrfs_btree_balance_dirty(tree_root, nr); | ||
813 | return ret; | 682 | return ret; |
814 | } | 683 | } |
815 | 684 | ||
@@ -839,24 +708,23 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, | |||
839 | if (ret) | 708 | if (ret) |
840 | goto fail; | 709 | goto fail; |
841 | 710 | ||
842 | btrfs_record_root_in_trans(root); | 711 | record_root_in_trans(trans, root); |
843 | btrfs_set_root_last_snapshot(&root->root_item, trans->transid); | 712 | btrfs_set_root_last_snapshot(&root->root_item, trans->transid); |
844 | memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); | 713 | memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); |
845 | 714 | ||
846 | key.objectid = objectid; | 715 | key.objectid = objectid; |
847 | key.offset = trans->transid; | 716 | key.offset = 0; |
848 | btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); | 717 | btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); |
849 | 718 | ||
850 | old = btrfs_lock_root_node(root); | 719 | old = btrfs_lock_root_node(root); |
851 | btrfs_cow_block(trans, root, old, NULL, 0, &old); | 720 | btrfs_cow_block(trans, root, old, NULL, 0, &old); |
721 | btrfs_set_lock_blocking(old); | ||
852 | 722 | ||
853 | btrfs_copy_root(trans, root, old, &tmp, objectid); | 723 | btrfs_copy_root(trans, root, old, &tmp, objectid); |
854 | btrfs_tree_unlock(old); | 724 | btrfs_tree_unlock(old); |
855 | free_extent_buffer(old); | 725 | free_extent_buffer(old); |
856 | 726 | ||
857 | btrfs_set_root_bytenr(new_root_item, tmp->start); | 727 | btrfs_set_root_node(new_root_item, tmp); |
858 | btrfs_set_root_level(new_root_item, btrfs_header_level(tmp)); | ||
859 | btrfs_set_root_generation(new_root_item, trans->transid); | ||
860 | ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, | 728 | ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, |
861 | new_root_item); | 729 | new_root_item); |
862 | btrfs_tree_unlock(tmp); | 730 | btrfs_tree_unlock(tmp); |
@@ -964,6 +832,24 @@ static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans, | |||
964 | return 0; | 832 | return 0; |
965 | } | 833 | } |
966 | 834 | ||
835 | static void update_super_roots(struct btrfs_root *root) | ||
836 | { | ||
837 | struct btrfs_root_item *root_item; | ||
838 | struct btrfs_super_block *super; | ||
839 | |||
840 | super = &root->fs_info->super_copy; | ||
841 | |||
842 | root_item = &root->fs_info->chunk_root->root_item; | ||
843 | super->chunk_root = root_item->bytenr; | ||
844 | super->chunk_root_generation = root_item->generation; | ||
845 | super->chunk_root_level = root_item->level; | ||
846 | |||
847 | root_item = &root->fs_info->tree_root->root_item; | ||
848 | super->root = root_item->bytenr; | ||
849 | super->generation = root_item->generation; | ||
850 | super->root_level = root_item->level; | ||
851 | } | ||
852 | |||
967 | int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | 853 | int btrfs_commit_transaction(struct btrfs_trans_handle *trans, |
968 | struct btrfs_root *root) | 854 | struct btrfs_root *root) |
969 | { | 855 | { |
@@ -971,8 +857,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
971 | unsigned long timeout = 1; | 857 | unsigned long timeout = 1; |
972 | struct btrfs_transaction *cur_trans; | 858 | struct btrfs_transaction *cur_trans; |
973 | struct btrfs_transaction *prev_trans = NULL; | 859 | struct btrfs_transaction *prev_trans = NULL; |
974 | struct btrfs_root *chunk_root = root->fs_info->chunk_root; | ||
975 | struct list_head dirty_fs_roots; | ||
976 | struct extent_io_tree *pinned_copy; | 860 | struct extent_io_tree *pinned_copy; |
977 | DEFINE_WAIT(wait); | 861 | DEFINE_WAIT(wait); |
978 | int ret; | 862 | int ret; |
@@ -999,7 +883,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
999 | BUG_ON(ret); | 883 | BUG_ON(ret); |
1000 | 884 | ||
1001 | mutex_lock(&root->fs_info->trans_mutex); | 885 | mutex_lock(&root->fs_info->trans_mutex); |
1002 | INIT_LIST_HEAD(&dirty_fs_roots); | ||
1003 | if (cur_trans->in_commit) { | 886 | if (cur_trans->in_commit) { |
1004 | cur_trans->use_count++; | 887 | cur_trans->use_count++; |
1005 | mutex_unlock(&root->fs_info->trans_mutex); | 888 | mutex_unlock(&root->fs_info->trans_mutex); |
@@ -1105,41 +988,36 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
1105 | * with the tree-log code. | 988 | * with the tree-log code. |
1106 | */ | 989 | */ |
1107 | mutex_lock(&root->fs_info->tree_log_mutex); | 990 | mutex_lock(&root->fs_info->tree_log_mutex); |
1108 | /* | ||
1109 | * keep tree reloc code from adding new reloc trees | ||
1110 | */ | ||
1111 | mutex_lock(&root->fs_info->tree_reloc_mutex); | ||
1112 | |||
1113 | 991 | ||
1114 | ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix, | 992 | ret = commit_fs_roots(trans, root); |
1115 | &dirty_fs_roots); | ||
1116 | BUG_ON(ret); | 993 | BUG_ON(ret); |
1117 | 994 | ||
1118 | /* add_dirty_roots gets rid of all the tree log roots, it is now | 995 | /* commit_fs_roots gets rid of all the tree log roots, it is now |
1119 | * safe to free the root of tree log roots | 996 | * safe to free the root of tree log roots |
1120 | */ | 997 | */ |
1121 | btrfs_free_log_root_tree(trans, root->fs_info); | 998 | btrfs_free_log_root_tree(trans, root->fs_info); |
1122 | 999 | ||
1123 | ret = btrfs_commit_tree_roots(trans, root); | 1000 | ret = commit_cowonly_roots(trans, root); |
1124 | BUG_ON(ret); | 1001 | BUG_ON(ret); |
1125 | 1002 | ||
1126 | cur_trans = root->fs_info->running_transaction; | 1003 | cur_trans = root->fs_info->running_transaction; |
1127 | spin_lock(&root->fs_info->new_trans_lock); | 1004 | spin_lock(&root->fs_info->new_trans_lock); |
1128 | root->fs_info->running_transaction = NULL; | 1005 | root->fs_info->running_transaction = NULL; |
1129 | spin_unlock(&root->fs_info->new_trans_lock); | 1006 | spin_unlock(&root->fs_info->new_trans_lock); |
1130 | btrfs_set_super_generation(&root->fs_info->super_copy, | 1007 | |
1131 | cur_trans->transid); | 1008 | btrfs_set_root_node(&root->fs_info->tree_root->root_item, |
1132 | btrfs_set_super_root(&root->fs_info->super_copy, | 1009 | root->fs_info->tree_root->node); |
1133 | root->fs_info->tree_root->node->start); | 1010 | free_extent_buffer(root->fs_info->tree_root->commit_root); |
1134 | btrfs_set_super_root_level(&root->fs_info->super_copy, | 1011 | root->fs_info->tree_root->commit_root = |
1135 | btrfs_header_level(root->fs_info->tree_root->node)); | 1012 | btrfs_root_node(root->fs_info->tree_root); |
1136 | 1013 | ||
1137 | btrfs_set_super_chunk_root(&root->fs_info->super_copy, | 1014 | btrfs_set_root_node(&root->fs_info->chunk_root->root_item, |
1138 | chunk_root->node->start); | 1015 | root->fs_info->chunk_root->node); |
1139 | btrfs_set_super_chunk_root_level(&root->fs_info->super_copy, | 1016 | free_extent_buffer(root->fs_info->chunk_root->commit_root); |
1140 | btrfs_header_level(chunk_root->node)); | 1017 | root->fs_info->chunk_root->commit_root = |
1141 | btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy, | 1018 | btrfs_root_node(root->fs_info->chunk_root); |
1142 | btrfs_header_generation(chunk_root->node)); | 1019 | |
1020 | update_super_roots(root); | ||
1143 | 1021 | ||
1144 | if (!root->fs_info->log_root_recovering) { | 1022 | if (!root->fs_info->log_root_recovering) { |
1145 | btrfs_set_super_log_root(&root->fs_info->super_copy, 0); | 1023 | btrfs_set_super_log_root(&root->fs_info->super_copy, 0); |
@@ -1153,7 +1031,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
1153 | 1031 | ||
1154 | trans->transaction->blocked = 0; | 1032 | trans->transaction->blocked = 0; |
1155 | 1033 | ||
1156 | wake_up(&root->fs_info->transaction_throttle); | ||
1157 | wake_up(&root->fs_info->transaction_wait); | 1034 | wake_up(&root->fs_info->transaction_wait); |
1158 | 1035 | ||
1159 | mutex_unlock(&root->fs_info->trans_mutex); | 1036 | mutex_unlock(&root->fs_info->trans_mutex); |
@@ -1170,9 +1047,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
1170 | btrfs_finish_extent_commit(trans, root, pinned_copy); | 1047 | btrfs_finish_extent_commit(trans, root, pinned_copy); |
1171 | kfree(pinned_copy); | 1048 | kfree(pinned_copy); |
1172 | 1049 | ||
1173 | btrfs_drop_dead_reloc_roots(root); | ||
1174 | mutex_unlock(&root->fs_info->tree_reloc_mutex); | ||
1175 | |||
1176 | /* do the directory inserts of any pending snapshot creations */ | 1050 | /* do the directory inserts of any pending snapshot creations */ |
1177 | finish_pending_snapshots(trans, root->fs_info); | 1051 | finish_pending_snapshots(trans, root->fs_info); |
1178 | 1052 | ||
@@ -1186,16 +1060,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
1186 | put_transaction(cur_trans); | 1060 | put_transaction(cur_trans); |
1187 | put_transaction(cur_trans); | 1061 | put_transaction(cur_trans); |
1188 | 1062 | ||
1189 | list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots); | ||
1190 | if (root->fs_info->closing) | ||
1191 | list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots); | ||
1192 | |||
1193 | mutex_unlock(&root->fs_info->trans_mutex); | 1063 | mutex_unlock(&root->fs_info->trans_mutex); |
1194 | 1064 | ||
1195 | kmem_cache_free(btrfs_trans_handle_cachep, trans); | 1065 | kmem_cache_free(btrfs_trans_handle_cachep, trans); |
1196 | |||
1197 | if (root->fs_info->closing) | ||
1198 | drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots); | ||
1199 | return ret; | 1066 | return ret; |
1200 | } | 1067 | } |
1201 | 1068 | ||
@@ -1204,16 +1071,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
1204 | */ | 1071 | */ |
1205 | int btrfs_clean_old_snapshots(struct btrfs_root *root) | 1072 | int btrfs_clean_old_snapshots(struct btrfs_root *root) |
1206 | { | 1073 | { |
1207 | struct list_head dirty_roots; | 1074 | LIST_HEAD(list); |
1208 | INIT_LIST_HEAD(&dirty_roots); | 1075 | struct btrfs_fs_info *fs_info = root->fs_info; |
1209 | again: | 1076 | |
1210 | mutex_lock(&root->fs_info->trans_mutex); | 1077 | mutex_lock(&fs_info->trans_mutex); |
1211 | list_splice_init(&root->fs_info->dead_roots, &dirty_roots); | 1078 | list_splice_init(&fs_info->dead_roots, &list); |
1212 | mutex_unlock(&root->fs_info->trans_mutex); | 1079 | mutex_unlock(&fs_info->trans_mutex); |
1213 | 1080 | ||
1214 | if (!list_empty(&dirty_roots)) { | 1081 | while (!list_empty(&list)) { |
1215 | drop_dirty_roots(root, &dirty_roots); | 1082 | root = list_entry(list.next, struct btrfs_root, root_list); |
1216 | goto again; | 1083 | list_del_init(&root->root_list); |
1084 | btrfs_drop_dead_root(root); | ||
1217 | } | 1085 | } |
1218 | return 0; | 1086 | return 0; |
1219 | } | 1087 | } |
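
Note: btrfs_clean_old_snapshots() above is now a plain consumer of fs_info->dead_roots, walking root->root_list entries spliced off under trans_mutex. The producer side is only visible as the changed prototype in transaction.h below, so the following is a hedged sketch of how btrfs_add_dead_root() presumably queues a root for the loop above; the body is inferred from the consumer, not copied from this patch:

        int btrfs_add_dead_root(struct btrfs_root *root)
        {
                /* queue the root so btrfs_clean_old_snapshots() reaps it later */
                mutex_lock(&root->fs_info->trans_mutex);
                list_add(&root->root_list, &root->fs_info->dead_roots);
                mutex_unlock(&root->fs_info->trans_mutex);
                return 0;
        }
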
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 94f5bde2b58d..961c3ee5a2e1 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h | |||
@@ -62,12 +62,6 @@ struct btrfs_pending_snapshot { | |||
62 | struct list_head list; | 62 | struct list_head list; |
63 | }; | 63 | }; |
64 | 64 | ||
65 | struct btrfs_dirty_root { | ||
66 | struct list_head list; | ||
67 | struct btrfs_root *root; | ||
68 | struct btrfs_root *latest_root; | ||
69 | }; | ||
70 | |||
71 | static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans, | 65 | static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans, |
72 | struct inode *inode) | 66 | struct inode *inode) |
73 | { | 67 | { |
@@ -100,7 +94,8 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, | |||
100 | int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, | 94 | int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, |
101 | struct btrfs_root *root); | 95 | struct btrfs_root *root); |
102 | 96 | ||
103 | int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest); | 97 | int btrfs_add_dead_root(struct btrfs_root *root); |
98 | int btrfs_drop_dead_root(struct btrfs_root *root); | ||
104 | int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); | 99 | int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); |
105 | int btrfs_clean_old_snapshots(struct btrfs_root *root); | 100 | int btrfs_clean_old_snapshots(struct btrfs_root *root); |
106 | int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | 101 | int btrfs_commit_transaction(struct btrfs_trans_handle *trans, |
@@ -108,7 +103,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
108 | int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, | 103 | int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, |
109 | struct btrfs_root *root); | 104 | struct btrfs_root *root); |
110 | void btrfs_throttle(struct btrfs_root *root); | 105 | void btrfs_throttle(struct btrfs_root *root); |
111 | int btrfs_record_root_in_trans(struct btrfs_root *root); | 106 | int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, |
107 | struct btrfs_root *root); | ||
112 | int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, | 108 | int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, |
113 | struct extent_io_tree *dirty_pages); | 109 | struct extent_io_tree *dirty_pages); |
114 | #endif | 110 | #endif |
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index db5e212e8445..c13922206d1b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c | |||
@@ -430,18 +430,16 @@ no_copy: | |||
430 | static noinline struct inode *read_one_inode(struct btrfs_root *root, | 430 | static noinline struct inode *read_one_inode(struct btrfs_root *root, |
431 | u64 objectid) | 431 | u64 objectid) |
432 | { | 432 | { |
433 | struct btrfs_key key; | ||
433 | struct inode *inode; | 434 | struct inode *inode; |
434 | inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); | ||
435 | if (inode->i_state & I_NEW) { | ||
436 | BTRFS_I(inode)->root = root; | ||
437 | BTRFS_I(inode)->location.objectid = objectid; | ||
438 | BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; | ||
439 | BTRFS_I(inode)->location.offset = 0; | ||
440 | btrfs_read_locked_inode(inode); | ||
441 | unlock_new_inode(inode); | ||
442 | 435 | ||
443 | } | 436 | key.objectid = objectid; |
444 | if (is_bad_inode(inode)) { | 437 | key.type = BTRFS_INODE_ITEM_KEY; |
438 | key.offset = 0; | ||
439 | inode = btrfs_iget(root->fs_info->sb, &key, root); | ||
440 | if (IS_ERR(inode)) { | ||
441 | inode = NULL; | ||
442 | } else if (is_bad_inode(inode)) { | ||
445 | iput(inode); | 443 | iput(inode); |
446 | inode = NULL; | 444 | inode = NULL; |
447 | } | 445 | } |
@@ -541,6 +539,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, | |||
541 | 539 | ||
542 | if (found_type == BTRFS_FILE_EXTENT_REG || | 540 | if (found_type == BTRFS_FILE_EXTENT_REG || |
543 | found_type == BTRFS_FILE_EXTENT_PREALLOC) { | 541 | found_type == BTRFS_FILE_EXTENT_PREALLOC) { |
542 | u64 offset; | ||
544 | unsigned long dest_offset; | 543 | unsigned long dest_offset; |
545 | struct btrfs_key ins; | 544 | struct btrfs_key ins; |
546 | 545 | ||
@@ -555,6 +554,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, | |||
555 | ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); | 554 | ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); |
556 | ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); | 555 | ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); |
557 | ins.type = BTRFS_EXTENT_ITEM_KEY; | 556 | ins.type = BTRFS_EXTENT_ITEM_KEY; |
557 | offset = key->offset - btrfs_file_extent_offset(eb, item); | ||
558 | 558 | ||
559 | if (ins.objectid > 0) { | 559 | if (ins.objectid > 0) { |
560 | u64 csum_start; | 560 | u64 csum_start; |
@@ -569,19 +569,16 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, | |||
569 | if (ret == 0) { | 569 | if (ret == 0) { |
570 | ret = btrfs_inc_extent_ref(trans, root, | 570 | ret = btrfs_inc_extent_ref(trans, root, |
571 | ins.objectid, ins.offset, | 571 | ins.objectid, ins.offset, |
572 | path->nodes[0]->start, | 572 | 0, root->root_key.objectid, |
573 | root->root_key.objectid, | 573 | key->objectid, offset); |
574 | trans->transid, key->objectid); | ||
575 | } else { | 574 | } else { |
576 | /* | 575 | /* |
577 | * insert the extent pointer in the extent | 576 | * insert the extent pointer in the extent |
578 | * allocation tree | 577 | * allocation tree |
579 | */ | 578 | */ |
580 | ret = btrfs_alloc_logged_extent(trans, root, | 579 | ret = btrfs_alloc_logged_file_extent(trans, |
581 | path->nodes[0]->start, | 580 | root, root->root_key.objectid, |
582 | root->root_key.objectid, | 581 | key->objectid, offset, &ins); |
583 | trans->transid, key->objectid, | ||
584 | &ins); | ||
585 | BUG_ON(ret); | 582 | BUG_ON(ret); |
586 | } | 583 | } |
587 | btrfs_release_path(root, path); | 584 | btrfs_release_path(root, path); |
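
Note: the offset handed to the new backref-style calls is computed once per replayed file extent as key->offset minus the extent item's internal offset. A quick worked example with made-up values; reading the result as "the file position byte zero of the on-disk extent maps to" is an inference from the subtraction, not something the patch states:

        u64 key_offset    = 1048576;  /* file extent item sits at file offset 1 MiB */
        u64 extent_offset = 262144;   /* item skips the first 256 KiB of the extent */
        u64 offset = key_offset - extent_offset;  /* 786432, i.e. 768 KiB */
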
@@ -1706,9 +1703,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, | |||
1706 | btrfs_wait_tree_block_writeback(next); | 1703 | btrfs_wait_tree_block_writeback(next); |
1707 | btrfs_tree_unlock(next); | 1704 | btrfs_tree_unlock(next); |
1708 | 1705 | ||
1709 | ret = btrfs_drop_leaf_ref(trans, root, next); | ||
1710 | BUG_ON(ret); | ||
1711 | |||
1712 | WARN_ON(root_owner != | 1706 | WARN_ON(root_owner != |
1713 | BTRFS_TREE_LOG_OBJECTID); | 1707 | BTRFS_TREE_LOG_OBJECTID); |
1714 | ret = btrfs_free_reserved_extent(root, | 1708 | ret = btrfs_free_reserved_extent(root, |
@@ -1753,10 +1747,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, | |||
1753 | btrfs_wait_tree_block_writeback(next); | 1747 | btrfs_wait_tree_block_writeback(next); |
1754 | btrfs_tree_unlock(next); | 1748 | btrfs_tree_unlock(next); |
1755 | 1749 | ||
1756 | if (*level == 0) { | ||
1757 | ret = btrfs_drop_leaf_ref(trans, root, next); | ||
1758 | BUG_ON(ret); | ||
1759 | } | ||
1760 | WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); | 1750 | WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); |
1761 | ret = btrfs_free_reserved_extent(root, bytenr, blocksize); | 1751 | ret = btrfs_free_reserved_extent(root, bytenr, blocksize); |
1762 | BUG_ON(ret); | 1752 | BUG_ON(ret); |
@@ -1811,12 +1801,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, | |||
1811 | btrfs_wait_tree_block_writeback(next); | 1801 | btrfs_wait_tree_block_writeback(next); |
1812 | btrfs_tree_unlock(next); | 1802 | btrfs_tree_unlock(next); |
1813 | 1803 | ||
1814 | if (*level == 0) { | ||
1815 | ret = btrfs_drop_leaf_ref(trans, root, | ||
1816 | next); | ||
1817 | BUG_ON(ret); | ||
1818 | } | ||
1819 | |||
1820 | WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); | 1804 | WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); |
1821 | ret = btrfs_free_reserved_extent(root, | 1805 | ret = btrfs_free_reserved_extent(root, |
1822 | path->nodes[*level]->start, | 1806 | path->nodes[*level]->start, |
@@ -1884,11 +1868,6 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, | |||
1884 | btrfs_wait_tree_block_writeback(next); | 1868 | btrfs_wait_tree_block_writeback(next); |
1885 | btrfs_tree_unlock(next); | 1869 | btrfs_tree_unlock(next); |
1886 | 1870 | ||
1887 | if (orig_level == 0) { | ||
1888 | ret = btrfs_drop_leaf_ref(trans, log, | ||
1889 | next); | ||
1890 | BUG_ON(ret); | ||
1891 | } | ||
1892 | WARN_ON(log->root_key.objectid != | 1871 | WARN_ON(log->root_key.objectid != |
1893 | BTRFS_TREE_LOG_OBJECTID); | 1872 | BTRFS_TREE_LOG_OBJECTID); |
1894 | ret = btrfs_free_reserved_extent(log, next->start, | 1873 | ret = btrfs_free_reserved_extent(log, next->start, |
@@ -2027,9 +2006,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, | |||
2027 | ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); | 2006 | ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); |
2028 | BUG_ON(ret); | 2007 | BUG_ON(ret); |
2029 | 2008 | ||
2030 | btrfs_set_root_bytenr(&log->root_item, log->node->start); | 2009 | btrfs_set_root_node(&log->root_item, log->node); |
2031 | btrfs_set_root_generation(&log->root_item, trans->transid); | ||
2032 | btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node)); | ||
2033 | 2010 | ||
2034 | root->log_batch = 0; | 2011 | root->log_batch = 0; |
2035 | root->log_transid++; | 2012 | root->log_transid++; |
@@ -2581,7 +2558,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, | |||
2581 | ins_keys, ins_sizes, nr); | 2558 | ins_keys, ins_sizes, nr); |
2582 | BUG_ON(ret); | 2559 | BUG_ON(ret); |
2583 | 2560 | ||
2584 | for (i = 0; i < nr; i++) { | 2561 | for (i = 0; i < nr; i++, dst_path->slots[0]++) { |
2585 | dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], | 2562 | dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], |
2586 | dst_path->slots[0]); | 2563 | dst_path->slots[0]); |
2587 | 2564 | ||
@@ -2617,36 +2594,31 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, | |||
2617 | found_type = btrfs_file_extent_type(src, extent); | 2594 | found_type = btrfs_file_extent_type(src, extent); |
2618 | if (found_type == BTRFS_FILE_EXTENT_REG || | 2595 | if (found_type == BTRFS_FILE_EXTENT_REG || |
2619 | found_type == BTRFS_FILE_EXTENT_PREALLOC) { | 2596 | found_type == BTRFS_FILE_EXTENT_PREALLOC) { |
2620 | u64 ds = btrfs_file_extent_disk_bytenr(src, | 2597 | u64 ds, dl, cs, cl; |
2621 | extent); | 2598 | ds = btrfs_file_extent_disk_bytenr(src, |
2622 | u64 dl = btrfs_file_extent_disk_num_bytes(src, | 2599 | extent); |
2623 | extent); | 2600 | /* ds == 0 is a hole */ |
2624 | u64 cs = btrfs_file_extent_offset(src, extent); | 2601 | if (ds == 0) |
2625 | u64 cl = btrfs_file_extent_num_bytes(src, | 2602 | continue; |
2626 | extent);; | 2603 | |
2604 | dl = btrfs_file_extent_disk_num_bytes(src, | ||
2605 | extent); | ||
2606 | cs = btrfs_file_extent_offset(src, extent); | ||
2607 | cl = btrfs_file_extent_num_bytes(src, | ||
2608 | extent);; | ||
2627 | if (btrfs_file_extent_compression(src, | 2609 | if (btrfs_file_extent_compression(src, |
2628 | extent)) { | 2610 | extent)) { |
2629 | cs = 0; | 2611 | cs = 0; |
2630 | cl = dl; | 2612 | cl = dl; |
2631 | } | 2613 | } |
2632 | /* ds == 0 is a hole */ | 2614 | |
2633 | if (ds != 0) { | 2615 | ret = btrfs_lookup_csums_range( |
2634 | ret = btrfs_inc_extent_ref(trans, log, | 2616 | log->fs_info->csum_root, |
2635 | ds, dl, | 2617 | ds + cs, ds + cs + cl - 1, |
2636 | dst_path->nodes[0]->start, | 2618 | &ordered_sums); |
2637 | BTRFS_TREE_LOG_OBJECTID, | 2619 | BUG_ON(ret); |
2638 | trans->transid, | ||
2639 | ins_keys[i].objectid); | ||
2640 | BUG_ON(ret); | ||
2641 | ret = btrfs_lookup_csums_range( | ||
2642 | log->fs_info->csum_root, | ||
2643 | ds + cs, ds + cs + cl - 1, | ||
2644 | &ordered_sums); | ||
2645 | BUG_ON(ret); | ||
2646 | } | ||
2647 | } | 2620 | } |
2648 | } | 2621 | } |
2649 | dst_path->slots[0]++; | ||
2650 | } | 2622 | } |
2651 | 2623 | ||
2652 | btrfs_mark_buffer_dirty(dst_path->nodes[0]); | 2624 | btrfs_mark_buffer_dirty(dst_path->nodes[0]); |
@@ -3029,9 +3001,7 @@ again: | |||
3029 | BUG_ON(!wc.replay_dest); | 3001 | BUG_ON(!wc.replay_dest); |
3030 | 3002 | ||
3031 | wc.replay_dest->log_root = log; | 3003 | wc.replay_dest->log_root = log; |
3032 | mutex_lock(&fs_info->trans_mutex); | 3004 | btrfs_record_root_in_trans(trans, wc.replay_dest); |
3033 | btrfs_record_root_in_trans(wc.replay_dest); | ||
3034 | mutex_unlock(&fs_info->trans_mutex); | ||
3035 | ret = walk_log_tree(trans, log, &wc); | 3005 | ret = walk_log_tree(trans, log, &wc); |
3036 | BUG_ON(ret); | 3006 | BUG_ON(ret); |
3037 | 3007 | ||
@@ -3049,6 +3019,7 @@ again: | |||
3049 | key.offset = found_key.offset - 1; | 3019 | key.offset = found_key.offset - 1; |
3050 | wc.replay_dest->log_root = NULL; | 3020 | wc.replay_dest->log_root = NULL; |
3051 | free_extent_buffer(log->node); | 3021 | free_extent_buffer(log->node); |
3022 | free_extent_buffer(log->commit_root); | ||
3052 | kfree(log); | 3023 | kfree(log); |
3053 | 3024 | ||
3054 | if (found_key.offset == 0) | 3025 | if (found_key.offset == 0) |
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a6d35b0054ca..3ab80e9cd767 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c | |||
@@ -161,8 +161,10 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) | |||
161 | int again = 0; | 161 | int again = 0; |
162 | unsigned long num_run; | 162 | unsigned long num_run; |
163 | unsigned long num_sync_run; | 163 | unsigned long num_sync_run; |
164 | unsigned long batch_run = 0; | ||
164 | unsigned long limit; | 165 | unsigned long limit; |
165 | unsigned long last_waited = 0; | 166 | unsigned long last_waited = 0; |
167 | int force_reg = 0; | ||
166 | 168 | ||
167 | bdi = blk_get_backing_dev_info(device->bdev); | 169 | bdi = blk_get_backing_dev_info(device->bdev); |
168 | fs_info = device->dev_root->fs_info; | 170 | fs_info = device->dev_root->fs_info; |
@@ -176,19 +178,22 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) | |||
176 | 178 | ||
177 | loop: | 179 | loop: |
178 | spin_lock(&device->io_lock); | 180 | spin_lock(&device->io_lock); |
179 | num_run = 0; | ||
180 | 181 | ||
181 | loop_lock: | 182 | loop_lock: |
183 | num_run = 0; | ||
182 | 184 | ||
183 | /* take all the bios off the list at once and process them | 185 | /* take all the bios off the list at once and process them |
184 | * later on (without the lock held). But, remember the | 186 | * later on (without the lock held). But, remember the |
185 | * tail and other pointers so the bios can be properly reinserted | 187 | * tail and other pointers so the bios can be properly reinserted |
186 | * into the list if we hit congestion | 188 | * into the list if we hit congestion |
187 | */ | 189 | */ |
188 | if (device->pending_sync_bios.head) | 190 | if (!force_reg && device->pending_sync_bios.head) { |
189 | pending_bios = &device->pending_sync_bios; | 191 | pending_bios = &device->pending_sync_bios; |
190 | else | 192 | force_reg = 1; |
193 | } else { | ||
191 | pending_bios = &device->pending_bios; | 194 | pending_bios = &device->pending_bios; |
195 | force_reg = 0; | ||
196 | } | ||
192 | 197 | ||
193 | pending = pending_bios->head; | 198 | pending = pending_bios->head; |
194 | tail = pending_bios->tail; | 199 | tail = pending_bios->tail; |
@@ -228,10 +233,14 @@ loop_lock: | |||
228 | while (pending) { | 233 | while (pending) { |
229 | 234 | ||
230 | rmb(); | 235 | rmb(); |
231 | if (pending_bios != &device->pending_sync_bios && | 236 | /* we want to work on both lists, but do more bios on the |
232 | device->pending_sync_bios.head && | 237 | * sync list than the regular list |
233 | num_run > 16) { | 238 | */ |
234 | cond_resched(); | 239 | if ((num_run > 32 && |
240 | pending_bios != &device->pending_sync_bios && | ||
241 | device->pending_sync_bios.head) || | ||
242 | (num_run > 64 && pending_bios == &device->pending_sync_bios && | ||
243 | device->pending_bios.head)) { | ||
235 | spin_lock(&device->io_lock); | 244 | spin_lock(&device->io_lock); |
236 | requeue_list(pending_bios, pending, tail); | 245 | requeue_list(pending_bios, pending, tail); |
237 | goto loop_lock; | 246 | goto loop_lock; |
@@ -249,6 +258,8 @@ loop_lock: | |||
249 | BUG_ON(atomic_read(&cur->bi_cnt) == 0); | 258 | BUG_ON(atomic_read(&cur->bi_cnt) == 0); |
250 | submit_bio(cur->bi_rw, cur); | 259 | submit_bio(cur->bi_rw, cur); |
251 | num_run++; | 260 | num_run++; |
261 | batch_run++; | ||
262 | |||
252 | if (bio_sync(cur)) | 263 | if (bio_sync(cur)) |
253 | num_sync_run++; | 264 | num_sync_run++; |
254 | 265 | ||
@@ -265,7 +276,7 @@ loop_lock: | |||
265 | * is now congested. Back off and let other work structs | 276 | * is now congested. Back off and let other work structs |
266 | * run instead | 277 | * run instead |
267 | */ | 278 | */ |
268 | if (pending && bdi_write_congested(bdi) && num_run > 16 && | 279 | if (pending && bdi_write_congested(bdi) && batch_run > 32 && |
269 | fs_info->fs_devices->open_devices > 1) { | 280 | fs_info->fs_devices->open_devices > 1) { |
270 | struct io_context *ioc; | 281 | struct io_context *ioc; |
271 | 282 | ||
@@ -366,6 +377,7 @@ static noinline int device_list_add(const char *path, | |||
366 | memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); | 377 | memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); |
367 | fs_devices->latest_devid = devid; | 378 | fs_devices->latest_devid = devid; |
368 | fs_devices->latest_trans = found_transid; | 379 | fs_devices->latest_trans = found_transid; |
380 | mutex_init(&fs_devices->device_list_mutex); | ||
369 | device = NULL; | 381 | device = NULL; |
370 | } else { | 382 | } else { |
371 | device = __find_device(&fs_devices->devices, devid, | 383 | device = __find_device(&fs_devices->devices, devid, |
@@ -392,7 +404,11 @@ static noinline int device_list_add(const char *path, | |||
392 | return -ENOMEM; | 404 | return -ENOMEM; |
393 | } | 405 | } |
394 | INIT_LIST_HEAD(&device->dev_alloc_list); | 406 | INIT_LIST_HEAD(&device->dev_alloc_list); |
407 | |||
408 | mutex_lock(&fs_devices->device_list_mutex); | ||
395 | list_add(&device->dev_list, &fs_devices->devices); | 409 | list_add(&device->dev_list, &fs_devices->devices); |
410 | mutex_unlock(&fs_devices->device_list_mutex); | ||
411 | |||
396 | device->fs_devices = fs_devices; | 412 | device->fs_devices = fs_devices; |
397 | fs_devices->num_devices++; | 413 | fs_devices->num_devices++; |
398 | } | 414 | } |
@@ -418,10 +434,12 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) | |||
418 | INIT_LIST_HEAD(&fs_devices->devices); | 434 | INIT_LIST_HEAD(&fs_devices->devices); |
419 | INIT_LIST_HEAD(&fs_devices->alloc_list); | 435 | INIT_LIST_HEAD(&fs_devices->alloc_list); |
420 | INIT_LIST_HEAD(&fs_devices->list); | 436 | INIT_LIST_HEAD(&fs_devices->list); |
437 | mutex_init(&fs_devices->device_list_mutex); | ||
421 | fs_devices->latest_devid = orig->latest_devid; | 438 | fs_devices->latest_devid = orig->latest_devid; |
422 | fs_devices->latest_trans = orig->latest_trans; | 439 | fs_devices->latest_trans = orig->latest_trans; |
423 | memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); | 440 | memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); |
424 | 441 | ||
442 | mutex_lock(&orig->device_list_mutex); | ||
425 | list_for_each_entry(orig_dev, &orig->devices, dev_list) { | 443 | list_for_each_entry(orig_dev, &orig->devices, dev_list) { |
426 | device = kzalloc(sizeof(*device), GFP_NOFS); | 444 | device = kzalloc(sizeof(*device), GFP_NOFS); |
427 | if (!device) | 445 | if (!device) |
@@ -443,8 +461,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) | |||
443 | device->fs_devices = fs_devices; | 461 | device->fs_devices = fs_devices; |
444 | fs_devices->num_devices++; | 462 | fs_devices->num_devices++; |
445 | } | 463 | } |
464 | mutex_unlock(&orig->device_list_mutex); | ||
446 | return fs_devices; | 465 | return fs_devices; |
447 | error: | 466 | error: |
467 | mutex_unlock(&orig->device_list_mutex); | ||
448 | free_fs_devices(fs_devices); | 468 | free_fs_devices(fs_devices); |
449 | return ERR_PTR(-ENOMEM); | 469 | return ERR_PTR(-ENOMEM); |
450 | } | 470 | } |
@@ -455,6 +475,7 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) | |||
455 | 475 | ||
456 | mutex_lock(&uuid_mutex); | 476 | mutex_lock(&uuid_mutex); |
457 | again: | 477 | again: |
478 | mutex_lock(&fs_devices->device_list_mutex); | ||
458 | list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { | 479 | list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { |
459 | if (device->in_fs_metadata) | 480 | if (device->in_fs_metadata) |
460 | continue; | 481 | continue; |
@@ -474,6 +495,7 @@ again: | |||
474 | kfree(device->name); | 495 | kfree(device->name); |
475 | kfree(device); | 496 | kfree(device); |
476 | } | 497 | } |
498 | mutex_unlock(&fs_devices->device_list_mutex); | ||
477 | 499 | ||
478 | if (fs_devices->seed) { | 500 | if (fs_devices->seed) { |
479 | fs_devices = fs_devices->seed; | 501 | fs_devices = fs_devices->seed; |
@@ -594,6 +616,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, | |||
594 | device->in_fs_metadata = 0; | 616 | device->in_fs_metadata = 0; |
595 | device->mode = flags; | 617 | device->mode = flags; |
596 | 618 | ||
619 | if (!blk_queue_nonrot(bdev_get_queue(bdev))) | ||
620 | fs_devices->rotating = 1; | ||
621 | |||
597 | fs_devices->open_devices++; | 622 | fs_devices->open_devices++; |
598 | if (device->writeable) { | 623 | if (device->writeable) { |
599 | fs_devices->rw_devices++; | 624 | fs_devices->rw_devices++; |
@@ -1121,12 +1146,14 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
1121 | 1146 | ||
1122 | device = NULL; | 1147 | device = NULL; |
1123 | devices = &root->fs_info->fs_devices->devices; | 1148 | devices = &root->fs_info->fs_devices->devices; |
1149 | mutex_lock(&root->fs_info->fs_devices->device_list_mutex); | ||
1124 | list_for_each_entry(tmp, devices, dev_list) { | 1150 | list_for_each_entry(tmp, devices, dev_list) { |
1125 | if (tmp->in_fs_metadata && !tmp->bdev) { | 1151 | if (tmp->in_fs_metadata && !tmp->bdev) { |
1126 | device = tmp; | 1152 | device = tmp; |
1127 | break; | 1153 | break; |
1128 | } | 1154 | } |
1129 | } | 1155 | } |
1156 | mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); | ||
1130 | bdev = NULL; | 1157 | bdev = NULL; |
1131 | bh = NULL; | 1158 | bh = NULL; |
1132 | disk_super = NULL; | 1159 | disk_super = NULL; |
@@ -1181,7 +1208,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
1181 | goto error_brelse; | 1208 | goto error_brelse; |
1182 | 1209 | ||
1183 | device->in_fs_metadata = 0; | 1210 | device->in_fs_metadata = 0; |
1211 | |||
1212 | /* | ||
1213 | * the device list mutex makes sure that we don't change | ||
1214 | * the device list while someone else is writing out all | ||
1215 | * the device supers. | ||
1216 | */ | ||
1217 | mutex_lock(&root->fs_info->fs_devices->device_list_mutex); | ||
1184 | list_del_init(&device->dev_list); | 1218 | list_del_init(&device->dev_list); |
1219 | mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); | ||
1220 | |||
1185 | device->fs_devices->num_devices--; | 1221 | device->fs_devices->num_devices--; |
1186 | 1222 | ||
1187 | next_device = list_entry(root->fs_info->fs_devices->devices.next, | 1223 | next_device = list_entry(root->fs_info->fs_devices->devices.next, |
@@ -1275,6 +1311,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, | |||
1275 | seed_devices->opened = 1; | 1311 | seed_devices->opened = 1; |
1276 | INIT_LIST_HEAD(&seed_devices->devices); | 1312 | INIT_LIST_HEAD(&seed_devices->devices); |
1277 | INIT_LIST_HEAD(&seed_devices->alloc_list); | 1313 | INIT_LIST_HEAD(&seed_devices->alloc_list); |
1314 | mutex_init(&seed_devices->device_list_mutex); | ||
1278 | list_splice_init(&fs_devices->devices, &seed_devices->devices); | 1315 | list_splice_init(&fs_devices->devices, &seed_devices->devices); |
1279 | list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); | 1316 | list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); |
1280 | list_for_each_entry(device, &seed_devices->devices, dev_list) { | 1317 | list_for_each_entry(device, &seed_devices->devices, dev_list) { |
@@ -1400,6 +1437,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) | |||
1400 | mutex_lock(&root->fs_info->volume_mutex); | 1437 | mutex_lock(&root->fs_info->volume_mutex); |
1401 | 1438 | ||
1402 | devices = &root->fs_info->fs_devices->devices; | 1439 | devices = &root->fs_info->fs_devices->devices; |
1440 | /* | ||
1441 | * we have the volume lock, so we don't need the extra | ||
1442 | * device list mutex while reading the list here. | ||
1443 | */ | ||
1403 | list_for_each_entry(device, devices, dev_list) { | 1444 | list_for_each_entry(device, devices, dev_list) { |
1404 | if (device->bdev == bdev) { | 1445 | if (device->bdev == bdev) { |
1405 | ret = -EEXIST; | 1446 | ret = -EEXIST; |
@@ -1454,6 +1495,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) | |||
1454 | } | 1495 | } |
1455 | 1496 | ||
1456 | device->fs_devices = root->fs_info->fs_devices; | 1497 | device->fs_devices = root->fs_info->fs_devices; |
1498 | |||
1499 | /* | ||
1500 | * we don't want write_supers to jump in here with our device | ||
1501 | * half setup | ||
1502 | */ | ||
1503 | mutex_lock(&root->fs_info->fs_devices->device_list_mutex); | ||
1457 | list_add(&device->dev_list, &root->fs_info->fs_devices->devices); | 1504 | list_add(&device->dev_list, &root->fs_info->fs_devices->devices); |
1458 | list_add(&device->dev_alloc_list, | 1505 | list_add(&device->dev_alloc_list, |
1459 | &root->fs_info->fs_devices->alloc_list); | 1506 | &root->fs_info->fs_devices->alloc_list); |
@@ -1462,6 +1509,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) | |||
1462 | root->fs_info->fs_devices->rw_devices++; | 1509 | root->fs_info->fs_devices->rw_devices++; |
1463 | root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; | 1510 | root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; |
1464 | 1511 | ||
1512 | if (!blk_queue_nonrot(bdev_get_queue(bdev))) | ||
1513 | root->fs_info->fs_devices->rotating = 1; | ||
1514 | |||
1465 | total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); | 1515 | total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); |
1466 | btrfs_set_super_total_bytes(&root->fs_info->super_copy, | 1516 | btrfs_set_super_total_bytes(&root->fs_info->super_copy, |
1467 | total_bytes + device->total_bytes); | 1517 | total_bytes + device->total_bytes); |
@@ -1469,6 +1519,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) | |||
1469 | total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); | 1519 | total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); |
1470 | btrfs_set_super_num_devices(&root->fs_info->super_copy, | 1520 | btrfs_set_super_num_devices(&root->fs_info->super_copy, |
1471 | total_bytes + 1); | 1521 | total_bytes + 1); |
1522 | mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); | ||
1472 | 1523 | ||
1473 | if (seeding_dev) { | 1524 | if (seeding_dev) { |
1474 | ret = init_first_rw_device(trans, root, device); | 1525 | ret = init_first_rw_device(trans, root, device); |
@@ -1671,8 +1722,6 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, | |||
1671 | int ret; | 1722 | int ret; |
1672 | int i; | 1723 | int i; |
1673 | 1724 | ||
1674 | printk(KERN_INFO "btrfs relocating chunk %llu\n", | ||
1675 | (unsigned long long)chunk_offset); | ||
1676 | root = root->fs_info->chunk_root; | 1725 | root = root->fs_info->chunk_root; |
1677 | extent_root = root->fs_info->extent_root; | 1726 | extent_root = root->fs_info->extent_root; |
1678 | em_tree = &root->fs_info->mapping_tree.map_tree; | 1727 | em_tree = &root->fs_info->mapping_tree.map_tree; |
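
Note: the comments added above say device_list_mutex exists so a super-block writer can walk fs_devices->devices without racing against add/remove. That writer is not part of this file's hunks, so here is a hedged sketch of the consumer pattern the new lock enables; write_dev_supers() is an assumed helper name, only the locking and the fields tested come from the patch:

        static int write_all_supers_sketch(struct btrfs_fs_devices *fs_devices)
        {
                struct btrfs_device *dev;
                int ret = 0;

                mutex_lock(&fs_devices->device_list_mutex);
                list_for_each_entry(dev, &fs_devices->devices, dev_list) {
                        /* skip missing, stale or read-only members */
                        if (!dev->bdev || !dev->in_fs_metadata || !dev->writeable)
                                continue;
                        ret = write_dev_supers(dev);  /* assumed helper, not from this diff */
                        if (ret)
                                break;
                }
                mutex_unlock(&fs_devices->device_list_mutex);
                return ret;
        }
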
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 5c3ff6d02fd7..5139a833f721 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h | |||
@@ -96,7 +96,12 @@ struct btrfs_fs_devices { | |||
96 | u64 rw_devices; | 96 | u64 rw_devices; |
97 | u64 total_rw_bytes; | 97 | u64 total_rw_bytes; |
98 | struct block_device *latest_bdev; | 98 | struct block_device *latest_bdev; |
99 | /* all of the devices in the FS */ | 99 | |
100 | /* all of the devices in the FS, protected by a mutex | ||
101 | * so we can safely walk it to write out the supers without | ||
102 | * worrying about add/remove by the multi-device code | ||
103 | */ | ||
104 | struct mutex device_list_mutex; | ||
100 | struct list_head devices; | 105 | struct list_head devices; |
101 | 106 | ||
102 | /* devices not currently being allocated */ | 107 | /* devices not currently being allocated */ |
@@ -107,6 +112,11 @@ struct btrfs_fs_devices { | |||
107 | int seeding; | 112 | int seeding; |
108 | 113 | ||
109 | int opened; | 114 | int opened; |
115 | |||
116 | /* set when we find or add a device that doesn't have the | ||
117 | * nonrot flag set | ||
118 | */ | ||
119 | int rotating; | ||
110 | }; | 120 | }; |
111 | 121 | ||
112 | struct btrfs_bio_stripe { | 122 | struct btrfs_bio_stripe { |
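
Note: the new rotating flag is only set in this series (via the blk_queue_nonrot() checks in volumes.c above); no consumer appears in these hunks. A hedged guess at the intended use is gating SSD-friendly behaviour on the whole array being non-rotational; the option helper and flag name below are assumptions for illustration only:

        static void maybe_enable_ssd_heuristics(struct btrfs_fs_info *fs_info)
        {
                /* only safe when no open device reported itself as rotational */
                if (!fs_info->fs_devices->rotating)
                        btrfs_set_opt(fs_info->mount_opt, SSD);  /* assumed option helper */
        }
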
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 1e962348d111..431accd475a7 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c | |||
@@ -354,7 +354,9 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache) | |||
354 | /* make sure all pages pinned by operations on behalf of the netfs are | 354 | /* make sure all pages pinned by operations on behalf of the netfs are |
355 | * written to disc */ | 355 | * written to disc */ |
356 | cachefiles_begin_secure(cache, &saved_cred); | 356 | cachefiles_begin_secure(cache, &saved_cred); |
357 | ret = fsync_super(cache->mnt->mnt_sb); | 357 | down_read(&cache->mnt->mnt_sb->s_umount); |
358 | ret = sync_filesystem(cache->mnt->mnt_sb); | ||
359 | up_read(&cache->mnt->mnt_sb->s_umount); | ||
358 | cachefiles_end_secure(cache, saved_cred); | 360 | cachefiles_end_secure(cache, saved_cred); |
359 | 361 | ||
360 | if (ret == -EIO) | 362 | if (ret == -EIO) |
diff --git a/fs/char_dev.c b/fs/char_dev.c index 38f71222a552..b7c9d5187a75 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c | |||
@@ -375,7 +375,6 @@ static int chrdev_open(struct inode *inode, struct file *filp) | |||
375 | p = inode->i_cdev; | 375 | p = inode->i_cdev; |
376 | if (!p) { | 376 | if (!p) { |
377 | inode->i_cdev = p = new; | 377 | inode->i_cdev = p = new; |
378 | inode->i_cindex = idx; | ||
379 | list_add(&inode->i_devices, &p->list); | 378 | list_add(&inode->i_devices, &p->list); |
380 | new = NULL; | 379 | new = NULL; |
381 | } else if (!cdev_get(p)) | 380 | } else if (!cdev_get(p)) |
@@ -405,6 +404,18 @@ static int chrdev_open(struct inode *inode, struct file *filp) | |||
405 | return ret; | 404 | return ret; |
406 | } | 405 | } |
407 | 406 | ||
407 | int cdev_index(struct inode *inode) | ||
408 | { | ||
409 | int idx; | ||
410 | struct kobject *kobj; | ||
411 | |||
412 | kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx); | ||
413 | if (!kobj) | ||
414 | return -1; | ||
415 | kobject_put(kobj); | ||
416 | return idx; | ||
417 | } | ||
418 | |||
408 | void cd_forget(struct inode *inode) | 419 | void cd_forget(struct inode *inode) |
409 | { | 420 | { |
410 | spin_lock(&cdev_lock); | 421 | spin_lock(&cdev_lock); |
@@ -557,6 +568,7 @@ EXPORT_SYMBOL(cdev_init); | |||
557 | EXPORT_SYMBOL(cdev_alloc); | 568 | EXPORT_SYMBOL(cdev_alloc); |
558 | EXPORT_SYMBOL(cdev_del); | 569 | EXPORT_SYMBOL(cdev_del); |
559 | EXPORT_SYMBOL(cdev_add); | 570 | EXPORT_SYMBOL(cdev_add); |
571 | EXPORT_SYMBOL(cdev_index); | ||
560 | EXPORT_SYMBOL(register_chrdev); | 572 | EXPORT_SYMBOL(register_chrdev); |
561 | EXPORT_SYMBOL(unregister_chrdev); | 573 | EXPORT_SYMBOL(unregister_chrdev); |
562 | EXPORT_SYMBOL(directly_mappable_cdev_bdi); | 574 | EXPORT_SYMBOL(directly_mappable_cdev_bdi); |
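
Note: with inode->i_cindex gone, drivers that need the index a cdev was registered with can recover it from the inode via the newly exported cdev_index(). A small illustrative caller, assuming the usual -1-on-failure return shown in the added code:

        static int example_lookup_minor_index(struct inode *inode)
        {
                int idx = cdev_index(inode);  /* walks the cdev map instead of i_cindex */

                if (idx < 0)
                        return -ENODEV;       /* no character device bound to this inode */
                return idx;
        }
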
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 83d62759c7c7..3bb11be8b6a8 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c | |||
@@ -275,7 +275,7 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd, | |||
275 | case -EBUSY: | 275 | case -EBUSY: |
276 | /* someone else made a mount here whilst we were busy */ | 276 | /* someone else made a mount here whilst we were busy */ |
277 | while (d_mountpoint(nd->path.dentry) && | 277 | while (d_mountpoint(nd->path.dentry) && |
278 | follow_down(&nd->path.mnt, &nd->path.dentry)) | 278 | follow_down(&nd->path)) |
279 | ; | 279 | ; |
280 | err = 0; | 280 | err = 0; |
281 | default: | 281 | default: |
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 0a10a59b6392..0d92114195ab 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c | |||
@@ -204,6 +204,9 @@ cifs_put_super(struct super_block *sb) | |||
204 | cFYI(1, ("Empty cifs superblock info passed to unmount")); | 204 | cFYI(1, ("Empty cifs superblock info passed to unmount")); |
205 | return; | 205 | return; |
206 | } | 206 | } |
207 | |||
208 | lock_kernel(); | ||
209 | |||
207 | rc = cifs_umount(sb, cifs_sb); | 210 | rc = cifs_umount(sb, cifs_sb); |
208 | if (rc) | 211 | if (rc) |
209 | cERROR(1, ("cifs_umount failed with return code %d", rc)); | 212 | cERROR(1, ("cifs_umount failed with return code %d", rc)); |
@@ -216,7 +219,8 @@ cifs_put_super(struct super_block *sb) | |||
216 | 219 | ||
217 | unload_nls(cifs_sb->local_nls); | 220 | unload_nls(cifs_sb->local_nls); |
218 | kfree(cifs_sb); | 221 | kfree(cifs_sb); |
219 | return; | 222 | |
223 | unlock_kernel(); | ||
220 | } | 224 | } |
221 | 225 | ||
222 | static int | 226 | static int |
diff --git a/fs/compat.c b/fs/compat.c index bb2a9b2e8173..6aefb776dfeb 100644 --- a/fs/compat.c +++ b/fs/compat.c | |||
@@ -812,10 +812,8 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name, | |||
812 | } | 812 | } |
813 | } | 813 | } |
814 | 814 | ||
815 | lock_kernel(); | ||
816 | retval = do_mount((char*)dev_page, dir_page, (char*)type_page, | 815 | retval = do_mount((char*)dev_page, dir_page, (char*)type_page, |
817 | flags, (void*)data_page); | 816 | flags, (void*)data_page); |
818 | unlock_kernel(); | ||
819 | 817 | ||
820 | out4: | 818 | out4: |
821 | free_page(data_page); | 819 | free_page(data_page); |
diff --git a/fs/dcache.c b/fs/dcache.c index 75659a6fd1f8..9e5cd3c3a6ba 100644 --- a/fs/dcache.c +++ b/fs/dcache.c | |||
@@ -1910,7 +1910,7 @@ char *__d_path(const struct path *path, struct path *root, | |||
1910 | 1910 | ||
1911 | spin_lock(&vfsmount_lock); | 1911 | spin_lock(&vfsmount_lock); |
1912 | prepend(&end, &buflen, "\0", 1); | 1912 | prepend(&end, &buflen, "\0", 1); |
1913 | if (!IS_ROOT(dentry) && d_unhashed(dentry) && | 1913 | if (d_unlinked(dentry) && |
1914 | (prepend(&end, &buflen, " (deleted)", 10) != 0)) | 1914 | (prepend(&end, &buflen, " (deleted)", 10) != 0)) |
1915 | goto Elong; | 1915 | goto Elong; |
1916 | 1916 | ||
@@ -2035,7 +2035,7 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen) | |||
2035 | 2035 | ||
2036 | spin_lock(&dcache_lock); | 2036 | spin_lock(&dcache_lock); |
2037 | prepend(&end, &buflen, "\0", 1); | 2037 | prepend(&end, &buflen, "\0", 1); |
2038 | if (!IS_ROOT(dentry) && d_unhashed(dentry) && | 2038 | if (d_unlinked(dentry) && |
2039 | (prepend(&end, &buflen, "//deleted", 9) != 0)) | 2039 | (prepend(&end, &buflen, "//deleted", 9) != 0)) |
2040 | goto Elong; | 2040 | goto Elong; |
2041 | if (buflen < 1) | 2041 | if (buflen < 1) |
@@ -2097,9 +2097,8 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) | |||
2097 | read_unlock(¤t->fs->lock); | 2097 | read_unlock(¤t->fs->lock); |
2098 | 2098 | ||
2099 | error = -ENOENT; | 2099 | error = -ENOENT; |
2100 | /* Has the current directory has been unlinked? */ | ||
2101 | spin_lock(&dcache_lock); | 2100 | spin_lock(&dcache_lock); |
2102 | if (IS_ROOT(pwd.dentry) || !d_unhashed(pwd.dentry)) { | 2101 | if (!d_unlinked(pwd.dentry)) { |
2103 | unsigned long len; | 2102 | unsigned long len; |
2104 | struct path tmp = root; | 2103 | struct path tmp = root; |
2105 | char * cwd; | 2104 | char * cwd; |
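
Note: both call sites above replace an open-coded "unhashed but not a root" test with d_unlinked(). A sketch of what that helper presumably wraps, inferred directly from the conditions it replaces; its exact placement (likely dcache.h) is an assumption:

        static inline int d_unlinked(struct dentry *dentry)
        {
                /* unhashed, and not the root of its tree */
                return d_unhashed(dentry) && !IS_ROOT(dentry);
        }
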
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index fa4c7e7d15d9..12d649602d3a 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/mount.h> | 27 | #include <linux/mount.h> |
28 | #include <linux/key.h> | 28 | #include <linux/key.h> |
29 | #include <linux/seq_file.h> | 29 | #include <linux/seq_file.h> |
30 | #include <linux/smp_lock.h> | ||
30 | #include <linux/file.h> | 31 | #include <linux/file.h> |
31 | #include <linux/crypto.h> | 32 | #include <linux/crypto.h> |
32 | #include "ecryptfs_kernel.h" | 33 | #include "ecryptfs_kernel.h" |
@@ -120,9 +121,13 @@ static void ecryptfs_put_super(struct super_block *sb) | |||
120 | { | 121 | { |
121 | struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb); | 122 | struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb); |
122 | 123 | ||
124 | lock_kernel(); | ||
125 | |||
123 | ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); | 126 | ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); |
124 | kmem_cache_free(ecryptfs_sb_info_cache, sb_info); | 127 | kmem_cache_free(ecryptfs_sb_info_cache, sb_info); |
125 | ecryptfs_set_superblock_private(sb, NULL); | 128 | ecryptfs_set_superblock_private(sb, NULL); |
129 | |||
130 | unlock_kernel(); | ||
126 | } | 131 | } |
127 | 132 | ||
128 | /** | 133 | /** |
diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 9f1985e857e2..8216c5b77b53 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c | |||
@@ -200,20 +200,21 @@ static const struct export_operations exofs_export_ops; | |||
200 | /* | 200 | /* |
201 | * Write the superblock to the OSD | 201 | * Write the superblock to the OSD |
202 | */ | 202 | */ |
203 | static void exofs_write_super(struct super_block *sb) | 203 | static int exofs_sync_fs(struct super_block *sb, int wait) |
204 | { | 204 | { |
205 | struct exofs_sb_info *sbi; | 205 | struct exofs_sb_info *sbi; |
206 | struct exofs_fscb *fscb; | 206 | struct exofs_fscb *fscb; |
207 | struct osd_request *or; | 207 | struct osd_request *or; |
208 | struct osd_obj_id obj; | 208 | struct osd_obj_id obj; |
209 | int ret; | 209 | int ret = -ENOMEM; |
210 | 210 | ||
211 | fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL); | 211 | fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL); |
212 | if (!fscb) { | 212 | if (!fscb) { |
213 | EXOFS_ERR("exofs_write_super: memory allocation failed.\n"); | 213 | EXOFS_ERR("exofs_write_super: memory allocation failed.\n"); |
214 | return; | 214 | return -ENOMEM; |
215 | } | 215 | } |
216 | 216 | ||
217 | lock_super(sb); | ||
217 | lock_kernel(); | 218 | lock_kernel(); |
218 | sbi = sb->s_fs_info; | 219 | sbi = sb->s_fs_info; |
219 | fscb->s_nextid = cpu_to_le64(sbi->s_nextid); | 220 | fscb->s_nextid = cpu_to_le64(sbi->s_nextid); |
@@ -246,7 +247,17 @@ out: | |||
246 | if (or) | 247 | if (or) |
247 | osd_end_request(or); | 248 | osd_end_request(or); |
248 | unlock_kernel(); | 249 | unlock_kernel(); |
250 | unlock_super(sb); | ||
249 | kfree(fscb); | 251 | kfree(fscb); |
252 | return ret; | ||
253 | } | ||
254 | |||
255 | static void exofs_write_super(struct super_block *sb) | ||
256 | { | ||
257 | if (!(sb->s_flags & MS_RDONLY)) | ||
258 | exofs_sync_fs(sb, 1); | ||
259 | else | ||
260 | sb->s_dirt = 0; | ||
250 | } | 261 | } |
251 | 262 | ||
252 | /* | 263 | /* |
@@ -258,6 +269,11 @@ static void exofs_put_super(struct super_block *sb) | |||
258 | int num_pend; | 269 | int num_pend; |
259 | struct exofs_sb_info *sbi = sb->s_fs_info; | 270 | struct exofs_sb_info *sbi = sb->s_fs_info; |
260 | 271 | ||
272 | lock_kernel(); | ||
273 | |||
274 | if (sb->s_dirt) | ||
275 | exofs_write_super(sb); | ||
276 | |||
261 | /* make sure there are no pending commands */ | 277 | /* make sure there are no pending commands */ |
262 | for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0; | 278 | for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0; |
263 | num_pend = atomic_read(&sbi->s_curr_pending)) { | 279 | num_pend = atomic_read(&sbi->s_curr_pending)) { |
@@ -271,6 +287,8 @@ static void exofs_put_super(struct super_block *sb) | |||
271 | osduld_put_device(sbi->s_dev); | 287 | osduld_put_device(sbi->s_dev); |
272 | kfree(sb->s_fs_info); | 288 | kfree(sb->s_fs_info); |
273 | sb->s_fs_info = NULL; | 289 | sb->s_fs_info = NULL; |
290 | |||
291 | unlock_kernel(); | ||
274 | } | 292 | } |
275 | 293 | ||
276 | /* | 294 | /* |
@@ -484,6 +502,7 @@ static const struct super_operations exofs_sops = { | |||
484 | .delete_inode = exofs_delete_inode, | 502 | .delete_inode = exofs_delete_inode, |
485 | .put_super = exofs_put_super, | 503 | .put_super = exofs_put_super, |
486 | .write_super = exofs_write_super, | 504 | .write_super = exofs_write_super, |
505 | .sync_fs = exofs_sync_fs, | ||
487 | .statfs = exofs_statfs, | 506 | .statfs = exofs_statfs, |
488 | }; | 507 | }; |
489 | 508 | ||
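The exofs hunks above are the template for a conversion repeated throughout this series: the real superblock write moves into a ->sync_fs method that can return an error, and ->write_super shrinks to a wrapper that only runs on read-write mounts. A minimal sketch of the resulting shape, where myfs_* and myfs_commit_super() are illustrative placeholders rather than anything from the patch:

/*
 * Sketch only: myfs_* names and myfs_commit_super() are placeholders.
 * It mirrors the exofs/ext2/fat conversions in this series.
 */
static int myfs_sync_fs(struct super_block *sb, int wait)
{
	int err;

	lock_super(sb);			/* exofs/ext4 take this; others use the BKL */
	err = myfs_commit_super(sb);	/* hypothetical: write the on-disk superblock */
	sb->s_dirt = 0;
	unlock_super(sb);
	return err;
}

static void myfs_write_super(struct super_block *sb)
{
	/* ->write_super cannot return an error, so it just delegates. */
	if (!(sb->s_flags & MS_RDONLY))
		myfs_sync_fs(sb, 1);
	else
		sb->s_dirt = 0;
}

The wait argument is part of the ->sync_fs prototype; the exofs, ext2 and fat conversions below do not consult it, while ext3's existing sync_fs uses it to decide whether to wait for the journal commit.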
diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile index e0b2b43c1fdb..f42af45cfd88 100644 --- a/fs/ext2/Makefile +++ b/fs/ext2/Makefile | |||
@@ -4,7 +4,7 @@ | |||
4 | 4 | ||
5 | obj-$(CONFIG_EXT2_FS) += ext2.o | 5 | obj-$(CONFIG_EXT2_FS) += ext2.o |
6 | 6 | ||
7 | ext2-y := balloc.o dir.o file.o fsync.o ialloc.o inode.o \ | 7 | ext2-y := balloc.o dir.o file.o ialloc.o inode.o \ |
8 | ioctl.o namei.o super.o symlink.o | 8 | ioctl.o namei.o super.o symlink.o |
9 | 9 | ||
10 | ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o | 10 | ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o |
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 2999d72153b7..003500498c22 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c | |||
@@ -720,5 +720,5 @@ const struct file_operations ext2_dir_operations = { | |||
720 | #ifdef CONFIG_COMPAT | 720 | #ifdef CONFIG_COMPAT |
721 | .compat_ioctl = ext2_compat_ioctl, | 721 | .compat_ioctl = ext2_compat_ioctl, |
722 | #endif | 722 | #endif |
723 | .fsync = ext2_sync_file, | 723 | .fsync = simple_fsync, |
724 | }; | 724 | }; |
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 3203042b36ef..b2bbf45039e0 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h | |||
@@ -113,9 +113,6 @@ extern int ext2_empty_dir (struct inode *); | |||
113 | extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **); | 113 | extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **); |
114 | extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *); | 114 | extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *); |
115 | 115 | ||
116 | /* fsync.c */ | ||
117 | extern int ext2_sync_file (struct file *, struct dentry *, int); | ||
118 | |||
119 | /* ialloc.c */ | 116 | /* ialloc.c */ |
120 | extern struct inode * ext2_new_inode (struct inode *, int); | 117 | extern struct inode * ext2_new_inode (struct inode *, int); |
121 | extern void ext2_free_inode (struct inode *); | 118 | extern void ext2_free_inode (struct inode *); |
diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 45ed07122182..2b9e47dc9222 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c | |||
@@ -55,7 +55,7 @@ const struct file_operations ext2_file_operations = { | |||
55 | .mmap = generic_file_mmap, | 55 | .mmap = generic_file_mmap, |
56 | .open = generic_file_open, | 56 | .open = generic_file_open, |
57 | .release = ext2_release_file, | 57 | .release = ext2_release_file, |
58 | .fsync = ext2_sync_file, | 58 | .fsync = simple_fsync, |
59 | .splice_read = generic_file_splice_read, | 59 | .splice_read = generic_file_splice_read, |
60 | .splice_write = generic_file_splice_write, | 60 | .splice_write = generic_file_splice_write, |
61 | }; | 61 | }; |
@@ -72,7 +72,7 @@ const struct file_operations ext2_xip_file_operations = { | |||
72 | .mmap = xip_file_mmap, | 72 | .mmap = xip_file_mmap, |
73 | .open = generic_file_open, | 73 | .open = generic_file_open, |
74 | .release = ext2_release_file, | 74 | .release = ext2_release_file, |
75 | .fsync = ext2_sync_file, | 75 | .fsync = simple_fsync, |
76 | }; | 76 | }; |
77 | #endif | 77 | #endif |
78 | 78 | ||
diff --git a/fs/ext2/fsync.c b/fs/ext2/fsync.c deleted file mode 100644 index fc66c93fcb5c..000000000000 --- a/fs/ext2/fsync.c +++ /dev/null | |||
@@ -1,50 +0,0 @@ | |||
1 | /* | ||
2 | * linux/fs/ext2/fsync.c | ||
3 | * | ||
4 | * Copyright (C) 1993 Stephen Tweedie (sct@dcs.ed.ac.uk) | ||
5 | * from | ||
6 | * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) | ||
7 | * Laboratoire MASI - Institut Blaise Pascal | ||
8 | * Universite Pierre et Marie Curie (Paris VI) | ||
9 | * from | ||
10 | * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds | ||
11 | * | ||
12 | * ext2fs fsync primitive | ||
13 | * | ||
14 | * Big-endian to little-endian byte-swapping/bitmaps by | ||
15 | * David S. Miller (davem@caip.rutgers.edu), 1995 | ||
16 | * | ||
17 | * Removed unnecessary code duplication for little endian machines | ||
18 | * and excessive __inline__s. | ||
19 | * Andi Kleen, 1997 | ||
20 | * | ||
21 | * Major simplications and cleanup - we only need to do the metadata, because | ||
22 | * we can depend on generic_block_fdatasync() to sync the data blocks. | ||
23 | */ | ||
24 | |||
25 | #include "ext2.h" | ||
26 | #include <linux/buffer_head.h> /* for sync_mapping_buffers() */ | ||
27 | |||
28 | |||
29 | /* | ||
30 | * File may be NULL when we are called. Perhaps we shouldn't | ||
31 | * even pass file to fsync ? | ||
32 | */ | ||
33 | |||
34 | int ext2_sync_file(struct file *file, struct dentry *dentry, int datasync) | ||
35 | { | ||
36 | struct inode *inode = dentry->d_inode; | ||
37 | int err; | ||
38 | int ret; | ||
39 | |||
40 | ret = sync_mapping_buffers(inode->i_mapping); | ||
41 | if (!(inode->i_state & I_DIRTY)) | ||
42 | return ret; | ||
43 | if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) | ||
44 | return ret; | ||
45 | |||
46 | err = ext2_sync_inode(inode); | ||
47 | if (ret == 0) | ||
48 | ret = err; | ||
49 | return ret; | ||
50 | } | ||
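ext2's private fsync helper is deleted because the generic simple_fsync(), wired up in the dir.c and file.c hunks above, does the same metadata-only work. Its body is not part of this diff; assuming it follows the helper removed here, a plausible reconstruction looks like:

/*
 * Hypothetical reconstruction of the generic helper; simple_fsync() itself
 * is not shown in this diff, so treat this as a sketch only.
 */
int simple_fsync(struct file *file, struct dentry *dentry, int datasync)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = 0,	/* metadata only; data is written by the caller */
	};
	struct inode *inode = dentry->d_inode;
	int err, ret;

	ret = sync_mapping_buffers(inode->i_mapping);
	if (!(inode->i_state & I_DIRTY))
		return ret;
	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
		return ret;

	err = sync_inode(inode, &wbc);
	if (ret == 0)
		ret = err;
	return ret;
}

The main difference from the deleted ext2 code would be going through a generic sync_inode() call instead of ext2_sync_inode(), which stays behind in inode.c for other callers.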
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index acf678831103..29ed682061f6 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c | |||
@@ -41,8 +41,6 @@ MODULE_AUTHOR("Remy Card and others"); | |||
41 | MODULE_DESCRIPTION("Second Extended Filesystem"); | 41 | MODULE_DESCRIPTION("Second Extended Filesystem"); |
42 | MODULE_LICENSE("GPL"); | 42 | MODULE_LICENSE("GPL"); |
43 | 43 | ||
44 | static int ext2_update_inode(struct inode * inode, int do_sync); | ||
45 | |||
46 | /* | 44 | /* |
47 | * Test whether an inode is a fast symlink. | 45 | * Test whether an inode is a fast symlink. |
48 | */ | 46 | */ |
@@ -66,7 +64,7 @@ void ext2_delete_inode (struct inode * inode) | |||
66 | goto no_delete; | 64 | goto no_delete; |
67 | EXT2_I(inode)->i_dtime = get_seconds(); | 65 | EXT2_I(inode)->i_dtime = get_seconds(); |
68 | mark_inode_dirty(inode); | 66 | mark_inode_dirty(inode); |
69 | ext2_update_inode(inode, inode_needs_sync(inode)); | 67 | ext2_write_inode(inode, inode_needs_sync(inode)); |
70 | 68 | ||
71 | inode->i_size = 0; | 69 | inode->i_size = 0; |
72 | if (inode->i_blocks) | 70 | if (inode->i_blocks) |
@@ -1337,7 +1335,7 @@ bad_inode: | |||
1337 | return ERR_PTR(ret); | 1335 | return ERR_PTR(ret); |
1338 | } | 1336 | } |
1339 | 1337 | ||
1340 | static int ext2_update_inode(struct inode * inode, int do_sync) | 1338 | int ext2_write_inode(struct inode *inode, int do_sync) |
1341 | { | 1339 | { |
1342 | struct ext2_inode_info *ei = EXT2_I(inode); | 1340 | struct ext2_inode_info *ei = EXT2_I(inode); |
1343 | struct super_block *sb = inode->i_sb; | 1341 | struct super_block *sb = inode->i_sb; |
@@ -1442,11 +1440,6 @@ static int ext2_update_inode(struct inode * inode, int do_sync) | |||
1442 | return err; | 1440 | return err; |
1443 | } | 1441 | } |
1444 | 1442 | ||
1445 | int ext2_write_inode(struct inode *inode, int wait) | ||
1446 | { | ||
1447 | return ext2_update_inode(inode, wait); | ||
1448 | } | ||
1449 | |||
1450 | int ext2_sync_inode(struct inode *inode) | 1443 | int ext2_sync_inode(struct inode *inode) |
1451 | { | 1444 | { |
1452 | struct writeback_control wbc = { | 1445 | struct writeback_control wbc = { |
diff --git a/fs/ext2/super.c b/fs/ext2/super.c index e3c748faf2db..458999638c3d 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c | |||
@@ -42,6 +42,7 @@ static void ext2_sync_super(struct super_block *sb, | |||
42 | struct ext2_super_block *es); | 42 | struct ext2_super_block *es); |
43 | static int ext2_remount (struct super_block * sb, int * flags, char * data); | 43 | static int ext2_remount (struct super_block * sb, int * flags, char * data); |
44 | static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); | 44 | static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); |
45 | static int ext2_sync_fs(struct super_block *sb, int wait); | ||
45 | 46 | ||
46 | void ext2_error (struct super_block * sb, const char * function, | 47 | void ext2_error (struct super_block * sb, const char * function, |
47 | const char * fmt, ...) | 48 | const char * fmt, ...) |
@@ -114,6 +115,11 @@ static void ext2_put_super (struct super_block * sb) | |||
114 | int i; | 115 | int i; |
115 | struct ext2_sb_info *sbi = EXT2_SB(sb); | 116 | struct ext2_sb_info *sbi = EXT2_SB(sb); |
116 | 117 | ||
118 | lock_kernel(); | ||
119 | |||
120 | if (sb->s_dirt) | ||
121 | ext2_write_super(sb); | ||
122 | |||
117 | ext2_xattr_put_super(sb); | 123 | ext2_xattr_put_super(sb); |
118 | if (!(sb->s_flags & MS_RDONLY)) { | 124 | if (!(sb->s_flags & MS_RDONLY)) { |
119 | struct ext2_super_block *es = sbi->s_es; | 125 | struct ext2_super_block *es = sbi->s_es; |
@@ -135,7 +141,7 @@ static void ext2_put_super (struct super_block * sb) | |||
135 | kfree(sbi->s_blockgroup_lock); | 141 | kfree(sbi->s_blockgroup_lock); |
136 | kfree(sbi); | 142 | kfree(sbi); |
137 | 143 | ||
138 | return; | 144 | unlock_kernel(); |
139 | } | 145 | } |
140 | 146 | ||
141 | static struct kmem_cache * ext2_inode_cachep; | 147 | static struct kmem_cache * ext2_inode_cachep; |
@@ -304,6 +310,7 @@ static const struct super_operations ext2_sops = { | |||
304 | .delete_inode = ext2_delete_inode, | 310 | .delete_inode = ext2_delete_inode, |
305 | .put_super = ext2_put_super, | 311 | .put_super = ext2_put_super, |
306 | .write_super = ext2_write_super, | 312 | .write_super = ext2_write_super, |
313 | .sync_fs = ext2_sync_fs, | ||
307 | .statfs = ext2_statfs, | 314 | .statfs = ext2_statfs, |
308 | .remount_fs = ext2_remount, | 315 | .remount_fs = ext2_remount, |
309 | .clear_inode = ext2_clear_inode, | 316 | .clear_inode = ext2_clear_inode, |
@@ -1127,25 +1134,36 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es) | |||
1127 | * set s_state to EXT2_VALID_FS after some corrections. | 1134 | * set s_state to EXT2_VALID_FS after some corrections. |
1128 | */ | 1135 | */ |
1129 | 1136 | ||
1130 | void ext2_write_super (struct super_block * sb) | 1137 | static int ext2_sync_fs(struct super_block *sb, int wait) |
1131 | { | 1138 | { |
1132 | struct ext2_super_block * es; | 1139 | struct ext2_super_block *es = EXT2_SB(sb)->s_es; |
1140 | |||
1133 | lock_kernel(); | 1141 | lock_kernel(); |
1134 | if (!(sb->s_flags & MS_RDONLY)) { | 1142 | if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) { |
1135 | es = EXT2_SB(sb)->s_es; | 1143 | ext2_debug("setting valid to 0\n"); |
1136 | 1144 | es->s_state &= cpu_to_le16(~EXT2_VALID_FS); | |
1137 | if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) { | 1145 | es->s_free_blocks_count = |
1138 | ext2_debug ("setting valid to 0\n"); | 1146 | cpu_to_le32(ext2_count_free_blocks(sb)); |
1139 | es->s_state &= cpu_to_le16(~EXT2_VALID_FS); | 1147 | es->s_free_inodes_count = |
1140 | es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb)); | 1148 | cpu_to_le32(ext2_count_free_inodes(sb)); |
1141 | es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb)); | 1149 | es->s_mtime = cpu_to_le32(get_seconds()); |
1142 | es->s_mtime = cpu_to_le32(get_seconds()); | 1150 | ext2_sync_super(sb, es); |
1143 | ext2_sync_super(sb, es); | 1151 | } else { |
1144 | } else | 1152 | ext2_commit_super(sb, es); |
1145 | ext2_commit_super (sb, es); | ||
1146 | } | 1153 | } |
1147 | sb->s_dirt = 0; | 1154 | sb->s_dirt = 0; |
1148 | unlock_kernel(); | 1155 | unlock_kernel(); |
1156 | |||
1157 | return 0; | ||
1158 | } | ||
1159 | |||
1160 | |||
1161 | void ext2_write_super(struct super_block *sb) | ||
1162 | { | ||
1163 | if (!(sb->s_flags & MS_RDONLY)) | ||
1164 | ext2_sync_fs(sb, 1); | ||
1165 | else | ||
1166 | sb->s_dirt = 0; | ||
1149 | } | 1167 | } |
1150 | 1168 | ||
1151 | static int ext2_remount (struct super_block * sb, int * flags, char * data) | 1169 | static int ext2_remount (struct super_block * sb, int * flags, char * data) |
@@ -1157,6 +1175,8 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) | |||
1157 | unsigned long old_sb_flags; | 1175 | unsigned long old_sb_flags; |
1158 | int err; | 1176 | int err; |
1159 | 1177 | ||
1178 | lock_kernel(); | ||
1179 | |||
1160 | /* Store the old options */ | 1180 | /* Store the old options */ |
1161 | old_sb_flags = sb->s_flags; | 1181 | old_sb_flags = sb->s_flags; |
1162 | old_opts.s_mount_opt = sbi->s_mount_opt; | 1182 | old_opts.s_mount_opt = sbi->s_mount_opt; |
@@ -1192,12 +1212,16 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) | |||
1192 | sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; | 1212 | sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; |
1193 | sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP; | 1213 | sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP; |
1194 | } | 1214 | } |
1195 | if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) | 1215 | if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { |
1216 | unlock_kernel(); | ||
1196 | return 0; | 1217 | return 0; |
1218 | } | ||
1197 | if (*flags & MS_RDONLY) { | 1219 | if (*flags & MS_RDONLY) { |
1198 | if (le16_to_cpu(es->s_state) & EXT2_VALID_FS || | 1220 | if (le16_to_cpu(es->s_state) & EXT2_VALID_FS || |
1199 | !(sbi->s_mount_state & EXT2_VALID_FS)) | 1221 | !(sbi->s_mount_state & EXT2_VALID_FS)) { |
1222 | unlock_kernel(); | ||
1200 | return 0; | 1223 | return 0; |
1224 | } | ||
1201 | /* | 1225 | /* |
1202 | * OK, we are remounting a valid rw partition rdonly, so set | 1226 | * OK, we are remounting a valid rw partition rdonly, so set |
1203 | * the rdonly flag and then mark the partition as valid again. | 1227 | * the rdonly flag and then mark the partition as valid again. |
@@ -1224,12 +1248,14 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) | |||
1224 | sb->s_flags &= ~MS_RDONLY; | 1248 | sb->s_flags &= ~MS_RDONLY; |
1225 | } | 1249 | } |
1226 | ext2_sync_super(sb, es); | 1250 | ext2_sync_super(sb, es); |
1251 | unlock_kernel(); | ||
1227 | return 0; | 1252 | return 0; |
1228 | restore_opts: | 1253 | restore_opts: |
1229 | sbi->s_mount_opt = old_opts.s_mount_opt; | 1254 | sbi->s_mount_opt = old_opts.s_mount_opt; |
1230 | sbi->s_resuid = old_opts.s_resuid; | 1255 | sbi->s_resuid = old_opts.s_resuid; |
1231 | sbi->s_resgid = old_opts.s_resgid; | 1256 | sbi->s_resgid = old_opts.s_resgid; |
1232 | sb->s_flags = old_sb_flags; | 1257 | sb->s_flags = old_sb_flags; |
1258 | unlock_kernel(); | ||
1233 | return err; | 1259 | return err; |
1234 | } | 1260 | } |
1235 | 1261 | ||
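The ext2 super.c hunks also show the second theme of the series: lock_kernel()/unlock_kernel() pairs are pushed down into ->put_super and ->remount_fs, and a still-dirty superblock is flushed from put_super, so each filesystem takes the BKL itself rather than relying on its caller. Every early return added to ext2_remount has to drop the lock. A sketch of the put_super side, with myfs_* as placeholders:

/*
 * Sketch of the pattern added above; myfs_* names are placeholders.
 * The method now takes the BKL itself instead of assuming the caller holds it.
 */
static void myfs_put_super(struct super_block *sb)
{
	lock_kernel();

	if (sb->s_dirt)			/* flush a superblock left dirty */
		myfs_write_super(sb);

	/* ... release filesystem-private state, free sb->s_fs_info ... */

	unlock_kernel();
}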
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 225202db8974..27967f92e820 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c | |||
@@ -649,7 +649,7 @@ do_more: | |||
649 | count = overflow; | 649 | count = overflow; |
650 | goto do_more; | 650 | goto do_more; |
651 | } | 651 | } |
652 | sb->s_dirt = 1; | 652 | |
653 | error_return: | 653 | error_return: |
654 | brelse(bitmap_bh); | 654 | brelse(bitmap_bh); |
655 | ext3_std_error(sb, err); | 655 | ext3_std_error(sb, err); |
@@ -1708,7 +1708,6 @@ allocated: | |||
1708 | if (!fatal) | 1708 | if (!fatal) |
1709 | fatal = err; | 1709 | fatal = err; |
1710 | 1710 | ||
1711 | sb->s_dirt = 1; | ||
1712 | if (fatal) | 1711 | if (fatal) |
1713 | goto out; | 1712 | goto out; |
1714 | 1713 | ||
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index dd13d60d524b..b39991285136 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c | |||
@@ -181,7 +181,7 @@ void ext3_free_inode (handle_t *handle, struct inode * inode) | |||
181 | err = ext3_journal_dirty_metadata(handle, bitmap_bh); | 181 | err = ext3_journal_dirty_metadata(handle, bitmap_bh); |
182 | if (!fatal) | 182 | if (!fatal) |
183 | fatal = err; | 183 | fatal = err; |
184 | sb->s_dirt = 1; | 184 | |
185 | error_return: | 185 | error_return: |
186 | brelse(bitmap_bh); | 186 | brelse(bitmap_bh); |
187 | ext3_std_error(sb, fatal); | 187 | ext3_std_error(sb, fatal); |
@@ -537,7 +537,6 @@ got: | |||
537 | percpu_counter_dec(&sbi->s_freeinodes_counter); | 537 | percpu_counter_dec(&sbi->s_freeinodes_counter); |
538 | if (S_ISDIR(mode)) | 538 | if (S_ISDIR(mode)) |
539 | percpu_counter_inc(&sbi->s_dirs_counter); | 539 | percpu_counter_inc(&sbi->s_dirs_counter); |
540 | sb->s_dirt = 1; | ||
541 | 540 | ||
542 | inode->i_uid = current_fsuid(); | 541 | inode->i_uid = current_fsuid(); |
543 | if (test_opt (sb, GRPID)) | 542 | if (test_opt (sb, GRPID)) |
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index fcfa24361856..b0248c6d5d4c 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c | |||
@@ -2960,7 +2960,6 @@ static int ext3_do_update_inode(handle_t *handle, | |||
2960 | ext3_update_dynamic_rev(sb); | 2960 | ext3_update_dynamic_rev(sb); |
2961 | EXT3_SET_RO_COMPAT_FEATURE(sb, | 2961 | EXT3_SET_RO_COMPAT_FEATURE(sb, |
2962 | EXT3_FEATURE_RO_COMPAT_LARGE_FILE); | 2962 | EXT3_FEATURE_RO_COMPAT_LARGE_FILE); |
2963 | sb->s_dirt = 1; | ||
2964 | handle->h_sync = 1; | 2963 | handle->h_sync = 1; |
2965 | err = ext3_journal_dirty_metadata(handle, | 2964 | err = ext3_journal_dirty_metadata(handle, |
2966 | EXT3_SB(sb)->s_sbh); | 2965 | EXT3_SB(sb)->s_sbh); |
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 78fdf3836370..8a0b26340b54 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c | |||
@@ -934,7 +934,6 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) | |||
934 | EXT3_INODES_PER_GROUP(sb)); | 934 | EXT3_INODES_PER_GROUP(sb)); |
935 | 935 | ||
936 | ext3_journal_dirty_metadata(handle, sbi->s_sbh); | 936 | ext3_journal_dirty_metadata(handle, sbi->s_sbh); |
937 | sb->s_dirt = 1; | ||
938 | 937 | ||
939 | exit_journal: | 938 | exit_journal: |
940 | unlock_super(sb); | 939 | unlock_super(sb); |
@@ -1066,7 +1065,6 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, | |||
1066 | } | 1065 | } |
1067 | es->s_blocks_count = cpu_to_le32(o_blocks_count + add); | 1066 | es->s_blocks_count = cpu_to_le32(o_blocks_count + add); |
1068 | ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); | 1067 | ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); |
1069 | sb->s_dirt = 1; | ||
1070 | unlock_super(sb); | 1068 | unlock_super(sb); |
1071 | ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count, | 1069 | ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count, |
1072 | o_blocks_count + add); | 1070 | o_blocks_count + add); |
diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 3c70d52afb10..26aa64dee6aa 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c | |||
@@ -67,7 +67,6 @@ static const char *ext3_decode_error(struct super_block * sb, int errno, | |||
67 | static int ext3_remount (struct super_block * sb, int * flags, char * data); | 67 | static int ext3_remount (struct super_block * sb, int * flags, char * data); |
68 | static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf); | 68 | static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf); |
69 | static int ext3_unfreeze(struct super_block *sb); | 69 | static int ext3_unfreeze(struct super_block *sb); |
70 | static void ext3_write_super (struct super_block * sb); | ||
71 | static int ext3_freeze(struct super_block *sb); | 70 | static int ext3_freeze(struct super_block *sb); |
72 | 71 | ||
73 | /* | 72 | /* |
@@ -399,6 +398,8 @@ static void ext3_put_super (struct super_block * sb) | |||
399 | struct ext3_super_block *es = sbi->s_es; | 398 | struct ext3_super_block *es = sbi->s_es; |
400 | int i, err; | 399 | int i, err; |
401 | 400 | ||
401 | lock_kernel(); | ||
402 | |||
402 | ext3_xattr_put_super(sb); | 403 | ext3_xattr_put_super(sb); |
403 | err = journal_destroy(sbi->s_journal); | 404 | err = journal_destroy(sbi->s_journal); |
404 | sbi->s_journal = NULL; | 405 | sbi->s_journal = NULL; |
@@ -447,7 +448,8 @@ static void ext3_put_super (struct super_block * sb) | |||
447 | sb->s_fs_info = NULL; | 448 | sb->s_fs_info = NULL; |
448 | kfree(sbi->s_blockgroup_lock); | 449 | kfree(sbi->s_blockgroup_lock); |
449 | kfree(sbi); | 450 | kfree(sbi); |
450 | return; | 451 | |
452 | unlock_kernel(); | ||
451 | } | 453 | } |
452 | 454 | ||
453 | static struct kmem_cache *ext3_inode_cachep; | 455 | static struct kmem_cache *ext3_inode_cachep; |
@@ -761,7 +763,6 @@ static const struct super_operations ext3_sops = { | |||
761 | .dirty_inode = ext3_dirty_inode, | 763 | .dirty_inode = ext3_dirty_inode, |
762 | .delete_inode = ext3_delete_inode, | 764 | .delete_inode = ext3_delete_inode, |
763 | .put_super = ext3_put_super, | 765 | .put_super = ext3_put_super, |
764 | .write_super = ext3_write_super, | ||
765 | .sync_fs = ext3_sync_fs, | 766 | .sync_fs = ext3_sync_fs, |
766 | .freeze_fs = ext3_freeze, | 767 | .freeze_fs = ext3_freeze, |
767 | .unfreeze_fs = ext3_unfreeze, | 768 | .unfreeze_fs = ext3_unfreeze, |
@@ -1785,7 +1786,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) | |||
1785 | #else | 1786 | #else |
1786 | es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); | 1787 | es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); |
1787 | #endif | 1788 | #endif |
1788 | sb->s_dirt = 1; | ||
1789 | } | 1789 | } |
1790 | 1790 | ||
1791 | if (sbi->s_blocks_per_group > blocksize * 8) { | 1791 | if (sbi->s_blocks_per_group > blocksize * 8) { |
@@ -2265,7 +2265,6 @@ static int ext3_load_journal(struct super_block *sb, | |||
2265 | if (journal_devnum && | 2265 | if (journal_devnum && |
2266 | journal_devnum != le32_to_cpu(es->s_journal_dev)) { | 2266 | journal_devnum != le32_to_cpu(es->s_journal_dev)) { |
2267 | es->s_journal_dev = cpu_to_le32(journal_devnum); | 2267 | es->s_journal_dev = cpu_to_le32(journal_devnum); |
2268 | sb->s_dirt = 1; | ||
2269 | 2268 | ||
2270 | /* Make sure we flush the recovery flag to disk. */ | 2269 | /* Make sure we flush the recovery flag to disk. */ |
2271 | ext3_commit_super(sb, es, 1); | 2270 | ext3_commit_super(sb, es, 1); |
@@ -2308,7 +2307,6 @@ static int ext3_create_journal(struct super_block * sb, | |||
2308 | EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL); | 2307 | EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL); |
2309 | 2308 | ||
2310 | es->s_journal_inum = cpu_to_le32(journal_inum); | 2309 | es->s_journal_inum = cpu_to_le32(journal_inum); |
2311 | sb->s_dirt = 1; | ||
2312 | 2310 | ||
2313 | /* Make sure we flush the recovery flag to disk. */ | 2311 | /* Make sure we flush the recovery flag to disk. */ |
2314 | ext3_commit_super(sb, es, 1); | 2312 | ext3_commit_super(sb, es, 1); |
@@ -2354,7 +2352,6 @@ static void ext3_mark_recovery_complete(struct super_block * sb, | |||
2354 | if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) && | 2352 | if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) && |
2355 | sb->s_flags & MS_RDONLY) { | 2353 | sb->s_flags & MS_RDONLY) { |
2356 | EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); | 2354 | EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); |
2357 | sb->s_dirt = 0; | ||
2358 | ext3_commit_super(sb, es, 1); | 2355 | ext3_commit_super(sb, es, 1); |
2359 | } | 2356 | } |
2360 | unlock_super(sb); | 2357 | unlock_super(sb); |
@@ -2413,29 +2410,14 @@ int ext3_force_commit(struct super_block *sb) | |||
2413 | return 0; | 2410 | return 0; |
2414 | 2411 | ||
2415 | journal = EXT3_SB(sb)->s_journal; | 2412 | journal = EXT3_SB(sb)->s_journal; |
2416 | sb->s_dirt = 0; | ||
2417 | ret = ext3_journal_force_commit(journal); | 2413 | ret = ext3_journal_force_commit(journal); |
2418 | return ret; | 2414 | return ret; |
2419 | } | 2415 | } |
2420 | 2416 | ||
2421 | /* | ||
2422 | * Ext3 always journals updates to the superblock itself, so we don't | ||
2423 | * have to propagate any other updates to the superblock on disk at this | ||
2424 | * point. (We can probably nuke this function altogether, and remove | ||
2425 | * any mention to sb->s_dirt in all of fs/ext3; eventual cleanup...) | ||
2426 | */ | ||
2427 | static void ext3_write_super (struct super_block * sb) | ||
2428 | { | ||
2429 | if (mutex_trylock(&sb->s_lock) != 0) | ||
2430 | BUG(); | ||
2431 | sb->s_dirt = 0; | ||
2432 | } | ||
2433 | |||
2434 | static int ext3_sync_fs(struct super_block *sb, int wait) | 2417 | static int ext3_sync_fs(struct super_block *sb, int wait) |
2435 | { | 2418 | { |
2436 | tid_t target; | 2419 | tid_t target; |
2437 | 2420 | ||
2438 | sb->s_dirt = 0; | ||
2439 | if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) { | 2421 | if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) { |
2440 | if (wait) | 2422 | if (wait) |
2441 | log_wait_commit(EXT3_SB(sb)->s_journal, target); | 2423 | log_wait_commit(EXT3_SB(sb)->s_journal, target); |
@@ -2451,7 +2433,6 @@ static int ext3_freeze(struct super_block *sb) | |||
2451 | { | 2433 | { |
2452 | int error = 0; | 2434 | int error = 0; |
2453 | journal_t *journal; | 2435 | journal_t *journal; |
2454 | sb->s_dirt = 0; | ||
2455 | 2436 | ||
2456 | if (!(sb->s_flags & MS_RDONLY)) { | 2437 | if (!(sb->s_flags & MS_RDONLY)) { |
2457 | journal = EXT3_SB(sb)->s_journal; | 2438 | journal = EXT3_SB(sb)->s_journal; |
@@ -2509,7 +2490,10 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) | |||
2509 | int i; | 2490 | int i; |
2510 | #endif | 2491 | #endif |
2511 | 2492 | ||
2493 | lock_kernel(); | ||
2494 | |||
2512 | /* Store the original options */ | 2495 | /* Store the original options */ |
2496 | lock_super(sb); | ||
2513 | old_sb_flags = sb->s_flags; | 2497 | old_sb_flags = sb->s_flags; |
2514 | old_opts.s_mount_opt = sbi->s_mount_opt; | 2498 | old_opts.s_mount_opt = sbi->s_mount_opt; |
2515 | old_opts.s_resuid = sbi->s_resuid; | 2499 | old_opts.s_resuid = sbi->s_resuid; |
@@ -2617,6 +2601,8 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) | |||
2617 | old_opts.s_qf_names[i] != sbi->s_qf_names[i]) | 2601 | old_opts.s_qf_names[i] != sbi->s_qf_names[i]) |
2618 | kfree(old_opts.s_qf_names[i]); | 2602 | kfree(old_opts.s_qf_names[i]); |
2619 | #endif | 2603 | #endif |
2604 | unlock_super(sb); | ||
2605 | unlock_kernel(); | ||
2620 | return 0; | 2606 | return 0; |
2621 | restore_opts: | 2607 | restore_opts: |
2622 | sb->s_flags = old_sb_flags; | 2608 | sb->s_flags = old_sb_flags; |
@@ -2633,6 +2619,8 @@ restore_opts: | |||
2633 | sbi->s_qf_names[i] = old_opts.s_qf_names[i]; | 2619 | sbi->s_qf_names[i] = old_opts.s_qf_names[i]; |
2634 | } | 2620 | } |
2635 | #endif | 2621 | #endif |
2622 | unlock_super(sb); | ||
2623 | unlock_kernel(); | ||
2636 | return err; | 2624 | return err; |
2637 | } | 2625 | } |
2638 | 2626 | ||
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index 83b7be849bd5..545e37c4b91e 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c | |||
@@ -463,7 +463,6 @@ static void ext3_xattr_update_super_block(handle_t *handle, | |||
463 | 463 | ||
464 | if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) { | 464 | if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) { |
465 | EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR); | 465 | EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR); |
466 | sb->s_dirt = 1; | ||
467 | ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); | 466 | ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); |
468 | } | 467 | } |
469 | } | 468 | } |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f016707597a7..012c4251397e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -576,6 +576,11 @@ static void ext4_put_super(struct super_block *sb) | |||
576 | struct ext4_super_block *es = sbi->s_es; | 576 | struct ext4_super_block *es = sbi->s_es; |
577 | int i, err; | 577 | int i, err; |
578 | 578 | ||
579 | lock_super(sb); | ||
580 | lock_kernel(); | ||
581 | if (sb->s_dirt) | ||
582 | ext4_commit_super(sb, 1); | ||
583 | |||
579 | ext4_release_system_zone(sb); | 584 | ext4_release_system_zone(sb); |
580 | ext4_mb_release(sb); | 585 | ext4_mb_release(sb); |
581 | ext4_ext_release(sb); | 586 | ext4_ext_release(sb); |
@@ -642,8 +647,6 @@ static void ext4_put_super(struct super_block *sb) | |||
642 | unlock_super(sb); | 647 | unlock_super(sb); |
643 | kobject_put(&sbi->s_kobj); | 648 | kobject_put(&sbi->s_kobj); |
644 | wait_for_completion(&sbi->s_kobj_unregister); | 649 | wait_for_completion(&sbi->s_kobj_unregister); |
645 | lock_super(sb); | ||
646 | lock_kernel(); | ||
647 | kfree(sbi->s_blockgroup_lock); | 650 | kfree(sbi->s_blockgroup_lock); |
648 | kfree(sbi); | 651 | kfree(sbi); |
649 | } | 652 | } |
@@ -3333,7 +3336,9 @@ int ext4_force_commit(struct super_block *sb) | |||
3333 | 3336 | ||
3334 | static void ext4_write_super(struct super_block *sb) | 3337 | static void ext4_write_super(struct super_block *sb) |
3335 | { | 3338 | { |
3339 | lock_super(sb); | ||
3336 | ext4_commit_super(sb, 1); | 3340 | ext4_commit_super(sb, 1); |
3341 | unlock_super(sb); | ||
3337 | } | 3342 | } |
3338 | 3343 | ||
3339 | static int ext4_sync_fs(struct super_block *sb, int wait) | 3344 | static int ext4_sync_fs(struct super_block *sb, int wait) |
@@ -3417,7 +3422,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) | |||
3417 | int i; | 3422 | int i; |
3418 | #endif | 3423 | #endif |
3419 | 3424 | ||
3425 | lock_kernel(); | ||
3426 | |||
3420 | /* Store the original options */ | 3427 | /* Store the original options */ |
3428 | lock_super(sb); | ||
3421 | old_sb_flags = sb->s_flags; | 3429 | old_sb_flags = sb->s_flags; |
3422 | old_opts.s_mount_opt = sbi->s_mount_opt; | 3430 | old_opts.s_mount_opt = sbi->s_mount_opt; |
3423 | old_opts.s_resuid = sbi->s_resuid; | 3431 | old_opts.s_resuid = sbi->s_resuid; |
@@ -3551,6 +3559,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) | |||
3551 | old_opts.s_qf_names[i] != sbi->s_qf_names[i]) | 3559 | old_opts.s_qf_names[i] != sbi->s_qf_names[i]) |
3552 | kfree(old_opts.s_qf_names[i]); | 3560 | kfree(old_opts.s_qf_names[i]); |
3553 | #endif | 3561 | #endif |
3562 | unlock_super(sb); | ||
3563 | unlock_kernel(); | ||
3554 | return 0; | 3564 | return 0; |
3555 | 3565 | ||
3556 | restore_opts: | 3566 | restore_opts: |
@@ -3570,6 +3580,8 @@ restore_opts: | |||
3570 | sbi->s_qf_names[i] = old_opts.s_qf_names[i]; | 3580 | sbi->s_qf_names[i] = old_opts.s_qf_names[i]; |
3571 | } | 3581 | } |
3572 | #endif | 3582 | #endif |
3583 | unlock_super(sb); | ||
3584 | unlock_kernel(); | ||
3573 | return err; | 3585 | return err; |
3574 | } | 3586 | } |
3575 | 3587 | ||
diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 3a7f603b6982..f3500294eec5 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c | |||
@@ -840,7 +840,7 @@ const struct file_operations fat_dir_operations = { | |||
840 | #ifdef CONFIG_COMPAT | 840 | #ifdef CONFIG_COMPAT |
841 | .compat_ioctl = fat_compat_dir_ioctl, | 841 | .compat_ioctl = fat_compat_dir_ioctl, |
842 | #endif | 842 | #endif |
843 | .fsync = file_fsync, | 843 | .fsync = fat_file_fsync, |
844 | }; | 844 | }; |
845 | 845 | ||
846 | static int fat_get_short_entry(struct inode *dir, loff_t *pos, | 846 | static int fat_get_short_entry(struct inode *dir, loff_t *pos, |
@@ -967,7 +967,7 @@ static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots) | |||
967 | de++; | 967 | de++; |
968 | nr_slots--; | 968 | nr_slots--; |
969 | } | 969 | } |
970 | mark_buffer_dirty(bh); | 970 | mark_buffer_dirty_inode(bh, dir); |
971 | if (IS_DIRSYNC(dir)) | 971 | if (IS_DIRSYNC(dir)) |
972 | err = sync_dirty_buffer(bh); | 972 | err = sync_dirty_buffer(bh); |
973 | brelse(bh); | 973 | brelse(bh); |
@@ -1001,7 +1001,7 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo) | |||
1001 | de--; | 1001 | de--; |
1002 | nr_slots--; | 1002 | nr_slots--; |
1003 | } | 1003 | } |
1004 | mark_buffer_dirty(bh); | 1004 | mark_buffer_dirty_inode(bh, dir); |
1005 | if (IS_DIRSYNC(dir)) | 1005 | if (IS_DIRSYNC(dir)) |
1006 | err = sync_dirty_buffer(bh); | 1006 | err = sync_dirty_buffer(bh); |
1007 | brelse(bh); | 1007 | brelse(bh); |
@@ -1051,7 +1051,7 @@ static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used, | |||
1051 | } | 1051 | } |
1052 | memset(bhs[n]->b_data, 0, sb->s_blocksize); | 1052 | memset(bhs[n]->b_data, 0, sb->s_blocksize); |
1053 | set_buffer_uptodate(bhs[n]); | 1053 | set_buffer_uptodate(bhs[n]); |
1054 | mark_buffer_dirty(bhs[n]); | 1054 | mark_buffer_dirty_inode(bhs[n], dir); |
1055 | 1055 | ||
1056 | n++; | 1056 | n++; |
1057 | blknr++; | 1057 | blknr++; |
@@ -1131,7 +1131,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts) | |||
1131 | de[0].size = de[1].size = 0; | 1131 | de[0].size = de[1].size = 0; |
1132 | memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de)); | 1132 | memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de)); |
1133 | set_buffer_uptodate(bhs[0]); | 1133 | set_buffer_uptodate(bhs[0]); |
1134 | mark_buffer_dirty(bhs[0]); | 1134 | mark_buffer_dirty_inode(bhs[0], dir); |
1135 | 1135 | ||
1136 | err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE); | 1136 | err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE); |
1137 | if (err) | 1137 | if (err) |
@@ -1193,7 +1193,7 @@ static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots, | |||
1193 | slots += copy; | 1193 | slots += copy; |
1194 | size -= copy; | 1194 | size -= copy; |
1195 | set_buffer_uptodate(bhs[n]); | 1195 | set_buffer_uptodate(bhs[n]); |
1196 | mark_buffer_dirty(bhs[n]); | 1196 | mark_buffer_dirty_inode(bhs[n], dir); |
1197 | if (!size) | 1197 | if (!size) |
1198 | break; | 1198 | break; |
1199 | n++; | 1199 | n++; |
@@ -1293,7 +1293,7 @@ found: | |||
1293 | for (i = 0; i < long_bhs; i++) { | 1293 | for (i = 0; i < long_bhs; i++) { |
1294 | int copy = min_t(int, sb->s_blocksize - offset, size); | 1294 | int copy = min_t(int, sb->s_blocksize - offset, size); |
1295 | memcpy(bhs[i]->b_data + offset, slots, copy); | 1295 | memcpy(bhs[i]->b_data + offset, slots, copy); |
1296 | mark_buffer_dirty(bhs[i]); | 1296 | mark_buffer_dirty_inode(bhs[i], dir); |
1297 | offset = 0; | 1297 | offset = 0; |
1298 | slots += copy; | 1298 | slots += copy; |
1299 | size -= copy; | 1299 | size -= copy; |
@@ -1304,7 +1304,7 @@ found: | |||
1304 | /* Fill the short name slot. */ | 1304 | /* Fill the short name slot. */ |
1305 | int copy = min_t(int, sb->s_blocksize - offset, size); | 1305 | int copy = min_t(int, sb->s_blocksize - offset, size); |
1306 | memcpy(bhs[i]->b_data + offset, slots, copy); | 1306 | memcpy(bhs[i]->b_data + offset, slots, copy); |
1307 | mark_buffer_dirty(bhs[i]); | 1307 | mark_buffer_dirty_inode(bhs[i], dir); |
1308 | if (IS_DIRSYNC(dir)) | 1308 | if (IS_DIRSYNC(dir)) |
1309 | err = sync_dirty_buffer(bhs[i]); | 1309 | err = sync_dirty_buffer(bhs[i]); |
1310 | } | 1310 | } |
diff --git a/fs/fat/fat.h b/fs/fat/fat.h index ea440d65819c..e4d88527b5dd 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h | |||
@@ -74,6 +74,7 @@ struct msdos_sb_info { | |||
74 | 74 | ||
75 | int fatent_shift; | 75 | int fatent_shift; |
76 | struct fatent_operations *fatent_ops; | 76 | struct fatent_operations *fatent_ops; |
77 | struct inode *fat_inode; | ||
77 | 78 | ||
78 | spinlock_t inode_hash_lock; | 79 | spinlock_t inode_hash_lock; |
79 | struct hlist_head inode_hashtable[FAT_HASH_SIZE]; | 80 | struct hlist_head inode_hashtable[FAT_HASH_SIZE]; |
@@ -251,6 +252,7 @@ struct fat_entry { | |||
251 | } u; | 252 | } u; |
252 | int nr_bhs; | 253 | int nr_bhs; |
253 | struct buffer_head *bhs[2]; | 254 | struct buffer_head *bhs[2]; |
255 | struct inode *fat_inode; | ||
254 | }; | 256 | }; |
255 | 257 | ||
256 | static inline void fatent_init(struct fat_entry *fatent) | 258 | static inline void fatent_init(struct fat_entry *fatent) |
@@ -259,6 +261,7 @@ static inline void fatent_init(struct fat_entry *fatent) | |||
259 | fatent->entry = 0; | 261 | fatent->entry = 0; |
260 | fatent->u.ent32_p = NULL; | 262 | fatent->u.ent32_p = NULL; |
261 | fatent->bhs[0] = fatent->bhs[1] = NULL; | 263 | fatent->bhs[0] = fatent->bhs[1] = NULL; |
264 | fatent->fat_inode = NULL; | ||
262 | } | 265 | } |
263 | 266 | ||
264 | static inline void fatent_set_entry(struct fat_entry *fatent, int entry) | 267 | static inline void fatent_set_entry(struct fat_entry *fatent, int entry) |
@@ -275,6 +278,7 @@ static inline void fatent_brelse(struct fat_entry *fatent) | |||
275 | brelse(fatent->bhs[i]); | 278 | brelse(fatent->bhs[i]); |
276 | fatent->nr_bhs = 0; | 279 | fatent->nr_bhs = 0; |
277 | fatent->bhs[0] = fatent->bhs[1] = NULL; | 280 | fatent->bhs[0] = fatent->bhs[1] = NULL; |
281 | fatent->fat_inode = NULL; | ||
278 | } | 282 | } |
279 | 283 | ||
280 | extern void fat_ent_access_init(struct super_block *sb); | 284 | extern void fat_ent_access_init(struct super_block *sb); |
@@ -296,6 +300,8 @@ extern int fat_setattr(struct dentry * dentry, struct iattr * attr); | |||
296 | extern void fat_truncate(struct inode *inode); | 300 | extern void fat_truncate(struct inode *inode); |
297 | extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, | 301 | extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, |
298 | struct kstat *stat); | 302 | struct kstat *stat); |
303 | extern int fat_file_fsync(struct file *file, struct dentry *dentry, | ||
304 | int datasync); | ||
299 | 305 | ||
300 | /* fat/inode.c */ | 306 | /* fat/inode.c */ |
301 | extern void fat_attach(struct inode *inode, loff_t i_pos); | 307 | extern void fat_attach(struct inode *inode, loff_t i_pos); |
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index da6eea47872f..618f5305c2e4 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c | |||
@@ -73,6 +73,8 @@ static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent, | |||
73 | struct buffer_head **bhs = fatent->bhs; | 73 | struct buffer_head **bhs = fatent->bhs; |
74 | 74 | ||
75 | WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); | 75 | WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); |
76 | fatent->fat_inode = MSDOS_SB(sb)->fat_inode; | ||
77 | |||
76 | bhs[0] = sb_bread(sb, blocknr); | 78 | bhs[0] = sb_bread(sb, blocknr); |
77 | if (!bhs[0]) | 79 | if (!bhs[0]) |
78 | goto err; | 80 | goto err; |
@@ -103,6 +105,7 @@ static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent, | |||
103 | struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; | 105 | struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; |
104 | 106 | ||
105 | WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); | 107 | WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); |
108 | fatent->fat_inode = MSDOS_SB(sb)->fat_inode; | ||
106 | fatent->bhs[0] = sb_bread(sb, blocknr); | 109 | fatent->bhs[0] = sb_bread(sb, blocknr); |
107 | if (!fatent->bhs[0]) { | 110 | if (!fatent->bhs[0]) { |
108 | printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n", | 111 | printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n", |
@@ -167,9 +170,9 @@ static void fat12_ent_put(struct fat_entry *fatent, int new) | |||
167 | } | 170 | } |
168 | spin_unlock(&fat12_entry_lock); | 171 | spin_unlock(&fat12_entry_lock); |
169 | 172 | ||
170 | mark_buffer_dirty(fatent->bhs[0]); | 173 | mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); |
171 | if (fatent->nr_bhs == 2) | 174 | if (fatent->nr_bhs == 2) |
172 | mark_buffer_dirty(fatent->bhs[1]); | 175 | mark_buffer_dirty_inode(fatent->bhs[1], fatent->fat_inode); |
173 | } | 176 | } |
174 | 177 | ||
175 | static void fat16_ent_put(struct fat_entry *fatent, int new) | 178 | static void fat16_ent_put(struct fat_entry *fatent, int new) |
@@ -178,7 +181,7 @@ static void fat16_ent_put(struct fat_entry *fatent, int new) | |||
178 | new = EOF_FAT16; | 181 | new = EOF_FAT16; |
179 | 182 | ||
180 | *fatent->u.ent16_p = cpu_to_le16(new); | 183 | *fatent->u.ent16_p = cpu_to_le16(new); |
181 | mark_buffer_dirty(fatent->bhs[0]); | 184 | mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); |
182 | } | 185 | } |
183 | 186 | ||
184 | static void fat32_ent_put(struct fat_entry *fatent, int new) | 187 | static void fat32_ent_put(struct fat_entry *fatent, int new) |
@@ -189,7 +192,7 @@ static void fat32_ent_put(struct fat_entry *fatent, int new) | |||
189 | WARN_ON(new & 0xf0000000); | 192 | WARN_ON(new & 0xf0000000); |
190 | new |= le32_to_cpu(*fatent->u.ent32_p) & ~0x0fffffff; | 193 | new |= le32_to_cpu(*fatent->u.ent32_p) & ~0x0fffffff; |
191 | *fatent->u.ent32_p = cpu_to_le32(new); | 194 | *fatent->u.ent32_p = cpu_to_le32(new); |
192 | mark_buffer_dirty(fatent->bhs[0]); | 195 | mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); |
193 | } | 196 | } |
194 | 197 | ||
195 | static int fat12_ent_next(struct fat_entry *fatent) | 198 | static int fat12_ent_next(struct fat_entry *fatent) |
@@ -381,7 +384,7 @@ static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs, | |||
381 | } | 384 | } |
382 | memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize); | 385 | memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize); |
383 | set_buffer_uptodate(c_bh); | 386 | set_buffer_uptodate(c_bh); |
384 | mark_buffer_dirty(c_bh); | 387 | mark_buffer_dirty_inode(c_bh, sbi->fat_inode); |
385 | if (sb->s_flags & MS_SYNCHRONOUS) | 388 | if (sb->s_flags & MS_SYNCHRONOUS) |
386 | err = sync_dirty_buffer(c_bh); | 389 | err = sync_dirty_buffer(c_bh); |
387 | brelse(c_bh); | 390 | brelse(c_bh); |
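The fatent.c and dir.c changes switch from mark_buffer_dirty() to mark_buffer_dirty_inode(), attaching FAT and directory buffers to a specific inode's mapping — for the FAT itself, the new dummy sbi->fat_inode — so an fsync can later find and write them with sync_mapping_buffers(). A sketch of the pattern, where blocknr, offset and val are illustrative only:

/*
 * Sketch of the buffer-dirtying pattern introduced above: FAT-table buffers
 * are dirtied against the dummy fat_inode so fat_file_fsync() can later write
 * them via sync_mapping_buffers().
 */
static int fat_sketch_update_entry(struct super_block *sb, sector_t blocknr,
				   unsigned int offset, __le16 val)
{
	struct msdos_sb_info *sbi = MSDOS_SB(sb);
	struct buffer_head *bh = sb_bread(sb, blocknr);
	int err = 0;

	if (!bh)
		return -EIO;
	*(__le16 *)(bh->b_data + offset) = val;		/* update the FAT entry */
	mark_buffer_dirty_inode(bh, sbi->fat_inode);	/* attach to fat_inode's mapping */
	if (sb->s_flags & MS_SYNCHRONOUS)
		err = sync_dirty_buffer(bh);		/* honor "sync" mounts */
	brelse(bh);
	return err;
}

fat_file_fsync() in the next hunk is the consumer: after simple_fsync() handles the file's own metadata, it syncs fat_inode's mapping to push out the FAT entries dirtied here.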
diff --git a/fs/fat/file.c b/fs/fat/file.c index 0a7f4a9918b3..e955a56b4e5e 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c | |||
@@ -133,6 +133,18 @@ static int fat_file_release(struct inode *inode, struct file *filp) | |||
133 | return 0; | 133 | return 0; |
134 | } | 134 | } |
135 | 135 | ||
136 | int fat_file_fsync(struct file *filp, struct dentry *dentry, int datasync) | ||
137 | { | ||
138 | struct inode *inode = dentry->d_inode; | ||
139 | int res, err; | ||
140 | |||
141 | res = simple_fsync(filp, dentry, datasync); | ||
142 | err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping); | ||
143 | |||
144 | return res ? res : err; | ||
145 | } | ||
146 | |||
147 | |||
136 | const struct file_operations fat_file_operations = { | 148 | const struct file_operations fat_file_operations = { |
137 | .llseek = generic_file_llseek, | 149 | .llseek = generic_file_llseek, |
138 | .read = do_sync_read, | 150 | .read = do_sync_read, |
@@ -142,7 +154,7 @@ const struct file_operations fat_file_operations = { | |||
142 | .mmap = generic_file_mmap, | 154 | .mmap = generic_file_mmap, |
143 | .release = fat_file_release, | 155 | .release = fat_file_release, |
144 | .ioctl = fat_generic_ioctl, | 156 | .ioctl = fat_generic_ioctl, |
145 | .fsync = file_fsync, | 157 | .fsync = fat_file_fsync, |
146 | .splice_read = generic_file_splice_read, | 158 | .splice_read = generic_file_splice_read, |
147 | }; | 159 | }; |
148 | 160 | ||
diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 296785a0dec8..51a5ecf9000a 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c | |||
@@ -441,16 +441,35 @@ static void fat_clear_inode(struct inode *inode) | |||
441 | 441 | ||
442 | static void fat_write_super(struct super_block *sb) | 442 | static void fat_write_super(struct super_block *sb) |
443 | { | 443 | { |
444 | lock_super(sb); | ||
444 | sb->s_dirt = 0; | 445 | sb->s_dirt = 0; |
445 | 446 | ||
446 | if (!(sb->s_flags & MS_RDONLY)) | 447 | if (!(sb->s_flags & MS_RDONLY)) |
447 | fat_clusters_flush(sb); | 448 | fat_clusters_flush(sb); |
449 | unlock_super(sb); | ||
450 | } | ||
451 | |||
452 | static int fat_sync_fs(struct super_block *sb, int wait) | ||
453 | { | ||
454 | lock_super(sb); | ||
455 | fat_clusters_flush(sb); | ||
456 | sb->s_dirt = 0; | ||
457 | unlock_super(sb); | ||
458 | |||
459 | return 0; | ||
448 | } | 460 | } |
449 | 461 | ||
450 | static void fat_put_super(struct super_block *sb) | 462 | static void fat_put_super(struct super_block *sb) |
451 | { | 463 | { |
452 | struct msdos_sb_info *sbi = MSDOS_SB(sb); | 464 | struct msdos_sb_info *sbi = MSDOS_SB(sb); |
453 | 465 | ||
466 | lock_kernel(); | ||
467 | |||
468 | if (sb->s_dirt) | ||
469 | fat_write_super(sb); | ||
470 | |||
471 | iput(sbi->fat_inode); | ||
472 | |||
454 | if (sbi->nls_disk) { | 473 | if (sbi->nls_disk) { |
455 | unload_nls(sbi->nls_disk); | 474 | unload_nls(sbi->nls_disk); |
456 | sbi->nls_disk = NULL; | 475 | sbi->nls_disk = NULL; |
@@ -467,6 +486,8 @@ static void fat_put_super(struct super_block *sb) | |||
467 | 486 | ||
468 | sb->s_fs_info = NULL; | 487 | sb->s_fs_info = NULL; |
469 | kfree(sbi); | 488 | kfree(sbi); |
489 | |||
490 | unlock_kernel(); | ||
470 | } | 491 | } |
471 | 492 | ||
472 | static struct kmem_cache *fat_inode_cachep; | 493 | static struct kmem_cache *fat_inode_cachep; |
@@ -632,6 +653,7 @@ static const struct super_operations fat_sops = { | |||
632 | .delete_inode = fat_delete_inode, | 653 | .delete_inode = fat_delete_inode, |
633 | .put_super = fat_put_super, | 654 | .put_super = fat_put_super, |
634 | .write_super = fat_write_super, | 655 | .write_super = fat_write_super, |
656 | .sync_fs = fat_sync_fs, | ||
635 | .statfs = fat_statfs, | 657 | .statfs = fat_statfs, |
636 | .clear_inode = fat_clear_inode, | 658 | .clear_inode = fat_clear_inode, |
637 | .remount_fs = fat_remount, | 659 | .remount_fs = fat_remount, |
@@ -1174,7 +1196,7 @@ static int fat_read_root(struct inode *inode) | |||
1174 | int fat_fill_super(struct super_block *sb, void *data, int silent, | 1196 | int fat_fill_super(struct super_block *sb, void *data, int silent, |
1175 | const struct inode_operations *fs_dir_inode_ops, int isvfat) | 1197 | const struct inode_operations *fs_dir_inode_ops, int isvfat) |
1176 | { | 1198 | { |
1177 | struct inode *root_inode = NULL; | 1199 | struct inode *root_inode = NULL, *fat_inode = NULL; |
1178 | struct buffer_head *bh; | 1200 | struct buffer_head *bh; |
1179 | struct fat_boot_sector *b; | 1201 | struct fat_boot_sector *b; |
1180 | struct msdos_sb_info *sbi; | 1202 | struct msdos_sb_info *sbi; |
@@ -1414,6 +1436,11 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, | |||
1414 | } | 1436 | } |
1415 | 1437 | ||
1416 | error = -ENOMEM; | 1438 | error = -ENOMEM; |
1439 | fat_inode = new_inode(sb); | ||
1440 | if (!fat_inode) | ||
1441 | goto out_fail; | ||
1442 | MSDOS_I(fat_inode)->i_pos = 0; | ||
1443 | sbi->fat_inode = fat_inode; | ||
1417 | root_inode = new_inode(sb); | 1444 | root_inode = new_inode(sb); |
1418 | if (!root_inode) | 1445 | if (!root_inode) |
1419 | goto out_fail; | 1446 | goto out_fail; |
@@ -1439,6 +1466,8 @@ out_invalid: | |||
1439 | " on dev %s.\n", sb->s_id); | 1466 | " on dev %s.\n", sb->s_id); |
1440 | 1467 | ||
1441 | out_fail: | 1468 | out_fail: |
1469 | if (fat_inode) | ||
1470 | iput(fat_inode); | ||
1442 | if (root_inode) | 1471 | if (root_inode) |
1443 | iput(root_inode); | 1472 | iput(root_inode); |
1444 | if (sbi->nls_io) | 1473 | if (sbi->nls_io) |
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index da3f361a37dd..20f522861355 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c | |||
@@ -544,7 +544,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, | |||
544 | int start = MSDOS_I(new_dir)->i_logstart; | 544 | int start = MSDOS_I(new_dir)->i_logstart; |
545 | dotdot_de->start = cpu_to_le16(start); | 545 | dotdot_de->start = cpu_to_le16(start); |
546 | dotdot_de->starthi = cpu_to_le16(start >> 16); | 546 | dotdot_de->starthi = cpu_to_le16(start >> 16); |
547 | mark_buffer_dirty(dotdot_bh); | 547 | mark_buffer_dirty_inode(dotdot_bh, old_inode); |
548 | if (IS_DIRSYNC(new_dir)) { | 548 | if (IS_DIRSYNC(new_dir)) { |
549 | err = sync_dirty_buffer(dotdot_bh); | 549 | err = sync_dirty_buffer(dotdot_bh); |
550 | if (err) | 550 | if (err) |
@@ -586,7 +586,7 @@ error_dotdot: | |||
586 | int start = MSDOS_I(old_dir)->i_logstart; | 586 | int start = MSDOS_I(old_dir)->i_logstart; |
587 | dotdot_de->start = cpu_to_le16(start); | 587 | dotdot_de->start = cpu_to_le16(start); |
588 | dotdot_de->starthi = cpu_to_le16(start >> 16); | 588 | dotdot_de->starthi = cpu_to_le16(start >> 16); |
589 | mark_buffer_dirty(dotdot_bh); | 589 | mark_buffer_dirty_inode(dotdot_bh, old_inode); |
590 | corrupt |= sync_dirty_buffer(dotdot_bh); | 590 | corrupt |= sync_dirty_buffer(dotdot_bh); |
591 | } | 591 | } |
592 | error_inode: | 592 | error_inode: |
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index a0e00e3a46e9..b50ecbe97f83 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c | |||
@@ -965,7 +965,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
965 | int start = MSDOS_I(new_dir)->i_logstart; | 965 | int start = MSDOS_I(new_dir)->i_logstart; |
966 | dotdot_de->start = cpu_to_le16(start); | 966 | dotdot_de->start = cpu_to_le16(start); |
967 | dotdot_de->starthi = cpu_to_le16(start >> 16); | 967 | dotdot_de->starthi = cpu_to_le16(start >> 16); |
968 | mark_buffer_dirty(dotdot_bh); | 968 | mark_buffer_dirty_inode(dotdot_bh, old_inode); |
969 | if (IS_DIRSYNC(new_dir)) { | 969 | if (IS_DIRSYNC(new_dir)) { |
970 | err = sync_dirty_buffer(dotdot_bh); | 970 | err = sync_dirty_buffer(dotdot_bh); |
971 | if (err) | 971 | if (err) |
@@ -1009,7 +1009,7 @@ error_dotdot: | |||
1009 | int start = MSDOS_I(old_dir)->i_logstart; | 1009 | int start = MSDOS_I(old_dir)->i_logstart; |
1010 | dotdot_de->start = cpu_to_le16(start); | 1010 | dotdot_de->start = cpu_to_le16(start); |
1011 | dotdot_de->starthi = cpu_to_le16(start >> 16); | 1011 | dotdot_de->starthi = cpu_to_le16(start >> 16); |
1012 | mark_buffer_dirty(dotdot_bh); | 1012 | mark_buffer_dirty_inode(dotdot_bh, old_inode); |
1013 | corrupt |= sync_dirty_buffer(dotdot_bh); | 1013 | corrupt |= sync_dirty_buffer(dotdot_bh); |
1014 | } | 1014 | } |
1015 | error_inode: | 1015 | error_inode: |
diff --git a/fs/file_table.c b/fs/file_table.c index 54018fe48840..334ce39881f8 100644 --- a/fs/file_table.c +++ b/fs/file_table.c | |||
@@ -214,7 +214,7 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry, | |||
214 | */ | 214 | */ |
215 | if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) { | 215 | if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) { |
216 | file_take_write(file); | 216 | file_take_write(file); |
217 | error = mnt_want_write(mnt); | 217 | error = mnt_clone_write(mnt); |
218 | WARN_ON(error); | 218 | WARN_ON(error); |
219 | } | 219 | } |
220 | return error; | 220 | return error; |
@@ -399,6 +399,44 @@ too_bad: | |||
399 | return 0; | 399 | return 0; |
400 | } | 400 | } |
401 | 401 | ||
402 | /** | ||
403 | * mark_files_ro - mark all files read-only | ||
404 | * @sb: superblock in question | ||
405 | * | ||
406 | * All files are marked read-only. We don't care about pending | ||
407 | * delete files so this should be used in 'force' mode only. | ||
408 | */ | ||
409 | void mark_files_ro(struct super_block *sb) | ||
410 | { | ||
411 | struct file *f; | ||
412 | |||
413 | retry: | ||
414 | file_list_lock(); | ||
415 | list_for_each_entry(f, &sb->s_files, f_u.fu_list) { | ||
416 | struct vfsmount *mnt; | ||
417 | if (!S_ISREG(f->f_path.dentry->d_inode->i_mode)) | ||
418 | continue; | ||
419 | if (!file_count(f)) | ||
420 | continue; | ||
421 | if (!(f->f_mode & FMODE_WRITE)) | ||
422 | continue; | ||
423 | f->f_mode &= ~FMODE_WRITE; | ||
424 | if (file_check_writeable(f) != 0) | ||
425 | continue; | ||
426 | file_release_write(f); | ||
427 | mnt = mntget(f->f_path.mnt); | ||
428 | file_list_unlock(); | ||
429 | /* | ||
430 | * This can sleep, so we can't hold | ||
431 | * the file_list_lock() spinlock. | ||
432 | */ | ||
433 | mnt_drop_write(mnt); | ||
434 | mntput(mnt); | ||
435 | goto retry; | ||
436 | } | ||
437 | file_list_unlock(); | ||
438 | } | ||
439 | |||
402 | void __init files_init(unsigned long mempages) | 440 | void __init files_init(unsigned long mempages) |
403 | { | 441 | { |
404 | int n; | 442 | int n; |
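mark_files_ro() walks the superblock's open-file list, clears FMODE_WRITE and drops the writer count on the mount; because mnt_drop_write() can sleep, it releases file_list_lock() and restarts the scan each time it revokes a file. A hypothetical caller, not shown in this diff, would be a forced read-only remount:

/*
 * Hypothetical caller sketch, not part of this diff: a forced read-only
 * remount would strip write access from files already open for writing
 * before flipping MS_RDONLY on the superblock.
 */
static void sketch_remount_force_ro(struct super_block *sb)
{
	mark_files_ro(sb);		/* drops FMODE_WRITE and the mnt write count */
	sb->s_flags |= MS_RDONLY;
}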
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index 1dacda831577..cdbd1654e4cd 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c | |||
@@ -80,12 +80,16 @@ vxfs_put_super(struct super_block *sbp) | |||
80 | { | 80 | { |
81 | struct vxfs_sb_info *infp = VXFS_SBI(sbp); | 81 | struct vxfs_sb_info *infp = VXFS_SBI(sbp); |
82 | 82 | ||
83 | lock_kernel(); | ||
84 | |||
83 | vxfs_put_fake_inode(infp->vsi_fship); | 85 | vxfs_put_fake_inode(infp->vsi_fship); |
84 | vxfs_put_fake_inode(infp->vsi_ilist); | 86 | vxfs_put_fake_inode(infp->vsi_ilist); |
85 | vxfs_put_fake_inode(infp->vsi_stilist); | 87 | vxfs_put_fake_inode(infp->vsi_stilist); |
86 | 88 | ||
87 | brelse(infp->vsi_bp); | 89 | brelse(infp->vsi_bp); |
88 | kfree(infp); | 90 | kfree(infp); |
91 | |||
92 | unlock_kernel(); | ||
89 | } | 93 | } |
90 | 94 | ||
91 | /** | 95 | /** |
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 91013ff7dd53..40308e98c6a4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c | |||
@@ -64,6 +64,28 @@ static void writeback_release(struct backing_dev_info *bdi) | |||
64 | clear_bit(BDI_pdflush, &bdi->state); | 64 | clear_bit(BDI_pdflush, &bdi->state); |
65 | } | 65 | } |
66 | 66 | ||
67 | static noinline void block_dump___mark_inode_dirty(struct inode *inode) | ||
68 | { | ||
69 | if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { | ||
70 | struct dentry *dentry; | ||
71 | const char *name = "?"; | ||
72 | |||
73 | dentry = d_find_alias(inode); | ||
74 | if (dentry) { | ||
75 | spin_lock(&dentry->d_lock); | ||
76 | name = (const char *) dentry->d_name.name; | ||
77 | } | ||
78 | printk(KERN_DEBUG | ||
79 | "%s(%d): dirtied inode %lu (%s) on %s\n", | ||
80 | current->comm, task_pid_nr(current), inode->i_ino, | ||
81 | name, inode->i_sb->s_id); | ||
82 | if (dentry) { | ||
83 | spin_unlock(&dentry->d_lock); | ||
84 | dput(dentry); | ||
85 | } | ||
86 | } | ||
87 | } | ||
88 | |||
67 | /** | 89 | /** |
68 | * __mark_inode_dirty - internal function | 90 | * __mark_inode_dirty - internal function |
69 | * @inode: inode to mark | 91 | * @inode: inode to mark |
@@ -114,23 +136,8 @@ void __mark_inode_dirty(struct inode *inode, int flags) | |||
114 | if ((inode->i_state & flags) == flags) | 136 | if ((inode->i_state & flags) == flags) |
115 | return; | 137 | return; |
116 | 138 | ||
117 | if (unlikely(block_dump)) { | 139 | if (unlikely(block_dump)) |
118 | struct dentry *dentry = NULL; | 140 | block_dump___mark_inode_dirty(inode); |
119 | const char *name = "?"; | ||
120 | |||
121 | if (!list_empty(&inode->i_dentry)) { | ||
122 | dentry = list_entry(inode->i_dentry.next, | ||
123 | struct dentry, d_alias); | ||
124 | if (dentry && dentry->d_name.name) | ||
125 | name = (const char *) dentry->d_name.name; | ||
126 | } | ||
127 | |||
128 | if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) | ||
129 | printk(KERN_DEBUG | ||
130 | "%s(%d): dirtied inode %lu (%s) on %s\n", | ||
131 | current->comm, task_pid_nr(current), inode->i_ino, | ||
132 | name, inode->i_sb->s_id); | ||
133 | } | ||
134 | 141 | ||
135 | spin_lock(&inode_lock); | 142 | spin_lock(&inode_lock); |
136 | if ((inode->i_state & flags) != flags) { | 143 | if ((inode->i_state & flags) != flags) { |
@@ -289,7 +296,6 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc) | |||
289 | int ret; | 296 | int ret; |
290 | 297 | ||
291 | BUG_ON(inode->i_state & I_SYNC); | 298 | BUG_ON(inode->i_state & I_SYNC); |
292 | WARN_ON(inode->i_state & I_NEW); | ||
293 | 299 | ||
294 | /* Set I_SYNC, reset I_DIRTY */ | 300 | /* Set I_SYNC, reset I_DIRTY */ |
295 | dirty = inode->i_state & I_DIRTY; | 301 | dirty = inode->i_state & I_DIRTY; |
@@ -314,7 +320,6 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc) | |||
314 | } | 320 | } |
315 | 321 | ||
316 | spin_lock(&inode_lock); | 322 | spin_lock(&inode_lock); |
317 | WARN_ON(inode->i_state & I_NEW); | ||
318 | inode->i_state &= ~I_SYNC; | 323 | inode->i_state &= ~I_SYNC; |
319 | if (!(inode->i_state & I_FREEING)) { | 324 | if (!(inode->i_state & I_FREEING)) { |
320 | if (!(inode->i_state & I_DIRTY) && | 325 | if (!(inode->i_state & I_DIRTY) && |
@@ -679,55 +684,6 @@ void sync_inodes_sb(struct super_block *sb, int wait) | |||
679 | } | 684 | } |
680 | 685 | ||
681 | /** | 686 | /** |
682 | * sync_inodes - writes all inodes to disk | ||
683 | * @wait: wait for completion | ||
684 | * | ||
685 | * sync_inodes() goes through each super block's dirty inode list, writes the | ||
686 | * inodes out, waits on the writeout and puts the inodes back on the normal | ||
687 | * list. | ||
688 | * | ||
689 | * This is for sys_sync(). fsync_dev() uses the same algorithm. The subtle | ||
690 | * part of the sync functions is that the blockdev "superblock" is processed | ||
691 | * last. This is because the write_inode() function of a typical fs will | ||
692 | * perform no I/O, but will mark buffers in the blockdev mapping as dirty. | ||
693 | * What we want to do is to perform all that dirtying first, and then write | ||
694 | * back all those inode blocks via the blockdev mapping in one sweep. So the | ||
695 | * additional (somewhat redundant) sync_blockdev() calls here are to make | ||
696 | * sure that really happens. Because if we call sync_inodes_sb(wait=1) with | ||
697 | * outstanding dirty inodes, the writeback goes block-at-a-time within the | ||
698 | * filesystem's write_inode(). This is extremely slow. | ||
699 | */ | ||
700 | static void __sync_inodes(int wait) | ||
701 | { | ||
702 | struct super_block *sb; | ||
703 | |||
704 | spin_lock(&sb_lock); | ||
705 | restart: | ||
706 | list_for_each_entry(sb, &super_blocks, s_list) { | ||
707 | sb->s_count++; | ||
708 | spin_unlock(&sb_lock); | ||
709 | down_read(&sb->s_umount); | ||
710 | if (sb->s_root) { | ||
711 | sync_inodes_sb(sb, wait); | ||
712 | sync_blockdev(sb->s_bdev); | ||
713 | } | ||
714 | up_read(&sb->s_umount); | ||
715 | spin_lock(&sb_lock); | ||
716 | if (__put_super_and_need_restart(sb)) | ||
717 | goto restart; | ||
718 | } | ||
719 | spin_unlock(&sb_lock); | ||
720 | } | ||
721 | |||
722 | void sync_inodes(int wait) | ||
723 | { | ||
724 | __sync_inodes(0); | ||
725 | |||
726 | if (wait) | ||
727 | __sync_inodes(1); | ||
728 | } | ||
729 | |||
730 | /** | ||
731 | * write_inode_now - write an inode to disk | 687 | * write_inode_now - write an inode to disk |
732 | * @inode: inode to write to disk | 688 | * @inode: inode to write to disk |
733 | * @sync: whether the write should be synchronous or not | 689 | * @sync: whether the write should be synchronous or not |
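Note on the fs-writeback.c hunk above: the block_dump debugging printk is factored out of __mark_inode_dirty() into a helper, and only the helper's tail is visible at the top of this listing. A rough sketch of how the whole helper plausibly reads, assembled from the removed lines; the d_find_alias()/d_lock handling is inferred from the visible spin_unlock()/dput() tail and is an assumption, not quoted from the patch:

	static void block_dump___mark_inode_dirty(struct inode *inode)
	{
		if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
			struct dentry *dentry;
			const char *name = "?";

			/* assumption: take a reference and d_lock here, to match
			 * the spin_unlock()/dput() in the helper's tail */
			dentry = d_find_alias(inode);
			if (dentry) {
				spin_lock(&dentry->d_lock);
				name = (const char *) dentry->d_name.name;
			}
			printk(KERN_DEBUG
			       "%s(%d): dirtied inode %lu (%s) on %s\n",
			       current->comm, task_pid_nr(current), inode->i_ino,
			       name, inode->i_sb->s_id);
			if (dentry) {
				spin_unlock(&dentry->d_lock);
				dput(dentry);
			}
		}
	}

The same file also loses sync_inodes()/__sync_inodes(); the hunk itself only shows the removal, so where global sync now happens is not visible here.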
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index aa62cf5976e8..f2e449c595b4 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c | |||
@@ -764,7 +764,6 @@ void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) | |||
764 | } | 764 | } |
765 | gfs2_log_unlock(sdp); | 765 | gfs2_log_unlock(sdp); |
766 | 766 | ||
767 | sdp->sd_vfs->s_dirt = 0; | ||
768 | up_write(&sdp->sd_log_flush_lock); | 767 | up_write(&sdp->sd_log_flush_lock); |
769 | 768 | ||
770 | kfree(ai); | 769 | kfree(ai); |
@@ -823,7 +822,6 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) | |||
823 | log_refund(sdp, tr); | 822 | log_refund(sdp, tr); |
824 | buf_lo_incore_commit(sdp, tr); | 823 | buf_lo_incore_commit(sdp, tr); |
825 | 824 | ||
826 | sdp->sd_vfs->s_dirt = 1; | ||
827 | up_read(&sdp->sd_log_flush_lock); | 825 | up_read(&sdp->sd_log_flush_lock); |
828 | 826 | ||
829 | gfs2_log_lock(sdp); | 827 | gfs2_log_lock(sdp); |
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 40bcc37e5a70..c8930b31cdf0 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c | |||
@@ -719,6 +719,8 @@ static void gfs2_put_super(struct super_block *sb) | |||
719 | int error; | 719 | int error; |
720 | struct gfs2_jdesc *jd; | 720 | struct gfs2_jdesc *jd; |
721 | 721 | ||
722 | lock_kernel(); | ||
723 | |||
722 | /* Unfreeze the filesystem, if we need to */ | 724 | /* Unfreeze the filesystem, if we need to */ |
723 | 725 | ||
724 | mutex_lock(&sdp->sd_freeze_lock); | 726 | mutex_lock(&sdp->sd_freeze_lock); |
@@ -785,17 +787,8 @@ restart: | |||
785 | 787 | ||
786 | /* At this point, we're through participating in the lockspace */ | 788 | /* At this point, we're through participating in the lockspace */ |
787 | gfs2_sys_fs_del(sdp); | 789 | gfs2_sys_fs_del(sdp); |
788 | } | ||
789 | |||
790 | /** | ||
791 | * gfs2_write_super | ||
792 | * @sb: the superblock | ||
793 | * | ||
794 | */ | ||
795 | 790 | ||
796 | static void gfs2_write_super(struct super_block *sb) | 791 | unlock_kernel(); |
797 | { | ||
798 | sb->s_dirt = 0; | ||
799 | } | 792 | } |
800 | 793 | ||
801 | /** | 794 | /** |
@@ -807,7 +800,6 @@ static void gfs2_write_super(struct super_block *sb) | |||
807 | 800 | ||
808 | static int gfs2_sync_fs(struct super_block *sb, int wait) | 801 | static int gfs2_sync_fs(struct super_block *sb, int wait) |
809 | { | 802 | { |
810 | sb->s_dirt = 0; | ||
811 | if (wait && sb->s_fs_info) | 803 | if (wait && sb->s_fs_info) |
812 | gfs2_log_flush(sb->s_fs_info, NULL); | 804 | gfs2_log_flush(sb->s_fs_info, NULL); |
813 | return 0; | 805 | return 0; |
@@ -1324,7 +1316,6 @@ const struct super_operations gfs2_super_ops = { | |||
1324 | .write_inode = gfs2_write_inode, | 1316 | .write_inode = gfs2_write_inode, |
1325 | .delete_inode = gfs2_delete_inode, | 1317 | .delete_inode = gfs2_delete_inode, |
1326 | .put_super = gfs2_put_super, | 1318 | .put_super = gfs2_put_super, |
1327 | .write_super = gfs2_write_super, | ||
1328 | .sync_fs = gfs2_sync_fs, | 1319 | .sync_fs = gfs2_sync_fs, |
1329 | .freeze_fs = gfs2_freeze, | 1320 | .freeze_fs = gfs2_freeze, |
1330 | .unfreeze_fs = gfs2_unfreeze, | 1321 | .unfreeze_fs = gfs2_unfreeze, |
diff --git a/fs/hfs/super.c b/fs/hfs/super.c index a36bb749926d..6f833dc8e910 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c | |||
@@ -49,11 +49,23 @@ MODULE_LICENSE("GPL"); | |||
49 | */ | 49 | */ |
50 | static void hfs_write_super(struct super_block *sb) | 50 | static void hfs_write_super(struct super_block *sb) |
51 | { | 51 | { |
52 | lock_super(sb); | ||
52 | sb->s_dirt = 0; | 53 | sb->s_dirt = 0; |
53 | if (sb->s_flags & MS_RDONLY) | 54 | |
54 | return; | ||
55 | /* sync everything to the buffers */ | 55 | /* sync everything to the buffers */ |
56 | if (!(sb->s_flags & MS_RDONLY)) | ||
57 | hfs_mdb_commit(sb); | ||
58 | unlock_super(sb); | ||
59 | } | ||
60 | |||
61 | static int hfs_sync_fs(struct super_block *sb, int wait) | ||
62 | { | ||
63 | lock_super(sb); | ||
56 | hfs_mdb_commit(sb); | 64 | hfs_mdb_commit(sb); |
65 | sb->s_dirt = 0; | ||
66 | unlock_super(sb); | ||
67 | |||
68 | return 0; | ||
57 | } | 69 | } |
58 | 70 | ||
59 | /* | 71 | /* |
@@ -65,9 +77,15 @@ static void hfs_write_super(struct super_block *sb) | |||
65 | */ | 77 | */ |
66 | static void hfs_put_super(struct super_block *sb) | 78 | static void hfs_put_super(struct super_block *sb) |
67 | { | 79 | { |
80 | lock_kernel(); | ||
81 | |||
82 | if (sb->s_dirt) | ||
83 | hfs_write_super(sb); | ||
68 | hfs_mdb_close(sb); | 84 | hfs_mdb_close(sb); |
69 | /* release the MDB's resources */ | 85 | /* release the MDB's resources */ |
70 | hfs_mdb_put(sb); | 86 | hfs_mdb_put(sb); |
87 | |||
88 | unlock_kernel(); | ||
71 | } | 89 | } |
72 | 90 | ||
73 | /* | 91 | /* |
@@ -164,6 +182,7 @@ static const struct super_operations hfs_super_operations = { | |||
164 | .clear_inode = hfs_clear_inode, | 182 | .clear_inode = hfs_clear_inode, |
165 | .put_super = hfs_put_super, | 183 | .put_super = hfs_put_super, |
166 | .write_super = hfs_write_super, | 184 | .write_super = hfs_write_super, |
185 | .sync_fs = hfs_sync_fs, | ||
167 | .statfs = hfs_statfs, | 186 | .statfs = hfs_statfs, |
168 | .remount_fs = hfs_remount, | 187 | .remount_fs = hfs_remount, |
169 | .show_options = hfs_show_options, | 188 | .show_options = hfs_show_options, |
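The HFS hunks above (and the hfsplus ones below) follow one pattern: the real commit work moves into ->sync_fs(), ->write_super() becomes a thin wrapper for the lazy s_dirt path, and ->put_super() flushes anything still dirty before teardown. A minimal illustrative sketch of that convention for a hypothetical "examplefs"; examplefs_commit_metadata() is a made-up stand-in for the filesystem's own commit routine:

	#include <linux/fs.h>
	#include <linux/smp_lock.h>

	static void examplefs_commit_metadata(struct super_block *sb)
	{
		/* hypothetical: write out superblock metadata to the buffers */
	}

	static int examplefs_sync_fs(struct super_block *sb, int wait)
	{
		lock_super(sb);
		examplefs_commit_metadata(sb);
		sb->s_dirt = 0;
		unlock_super(sb);
		return 0;
	}

	static void examplefs_write_super(struct super_block *sb)
	{
		/* lazy path: only commit if the filesystem is writable */
		if (!(sb->s_flags & MS_RDONLY))
			examplefs_sync_fs(sb, 1);
		else
			sb->s_dirt = 0;
	}

	static void examplefs_put_super(struct super_block *sb)
	{
		lock_kernel();
		if (sb->s_dirt)
			examplefs_write_super(sb);
		/* ... release filesystem-private state ... */
		unlock_kernel();
	}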
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index f2a64020f42e..9fc3af0c0dab 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c | |||
@@ -152,15 +152,14 @@ static void hfsplus_clear_inode(struct inode *inode) | |||
152 | } | 152 | } |
153 | } | 153 | } |
154 | 154 | ||
155 | static void hfsplus_write_super(struct super_block *sb) | 155 | static int hfsplus_sync_fs(struct super_block *sb, int wait) |
156 | { | 156 | { |
157 | struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; | 157 | struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; |
158 | 158 | ||
159 | dprint(DBG_SUPER, "hfsplus_write_super\n"); | 159 | dprint(DBG_SUPER, "hfsplus_write_super\n"); |
160 | |||
161 | lock_super(sb); | ||
160 | sb->s_dirt = 0; | 162 | sb->s_dirt = 0; |
161 | if (sb->s_flags & MS_RDONLY) | ||
162 | /* warn? */ | ||
163 | return; | ||
164 | 163 | ||
165 | vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks); | 164 | vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks); |
166 | vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc); | 165 | vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc); |
@@ -192,6 +191,16 @@ static void hfsplus_write_super(struct super_block *sb) | |||
192 | } | 191 | } |
193 | HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP; | 192 | HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP; |
194 | } | 193 | } |
194 | unlock_super(sb); | ||
195 | return 0; | ||
196 | } | ||
197 | |||
198 | static void hfsplus_write_super(struct super_block *sb) | ||
199 | { | ||
200 | if (!(sb->s_flags & MS_RDONLY)) | ||
201 | hfsplus_sync_fs(sb, 1); | ||
202 | else | ||
203 | sb->s_dirt = 0; | ||
195 | } | 204 | } |
196 | 205 | ||
197 | static void hfsplus_put_super(struct super_block *sb) | 206 | static void hfsplus_put_super(struct super_block *sb) |
@@ -199,6 +208,11 @@ static void hfsplus_put_super(struct super_block *sb) | |||
199 | dprint(DBG_SUPER, "hfsplus_put_super\n"); | 208 | dprint(DBG_SUPER, "hfsplus_put_super\n"); |
200 | if (!sb->s_fs_info) | 209 | if (!sb->s_fs_info) |
201 | return; | 210 | return; |
211 | |||
212 | lock_kernel(); | ||
213 | |||
214 | if (sb->s_dirt) | ||
215 | hfsplus_write_super(sb); | ||
202 | if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) { | 216 | if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) { |
203 | struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; | 217 | struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; |
204 | 218 | ||
@@ -218,6 +232,8 @@ static void hfsplus_put_super(struct super_block *sb) | |||
218 | unload_nls(HFSPLUS_SB(sb).nls); | 232 | unload_nls(HFSPLUS_SB(sb).nls); |
219 | kfree(sb->s_fs_info); | 233 | kfree(sb->s_fs_info); |
220 | sb->s_fs_info = NULL; | 234 | sb->s_fs_info = NULL; |
235 | |||
236 | unlock_kernel(); | ||
221 | } | 237 | } |
222 | 238 | ||
223 | static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) | 239 | static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) |
@@ -279,6 +295,7 @@ static const struct super_operations hfsplus_sops = { | |||
279 | .clear_inode = hfsplus_clear_inode, | 295 | .clear_inode = hfsplus_clear_inode, |
280 | .put_super = hfsplus_put_super, | 296 | .put_super = hfsplus_put_super, |
281 | .write_super = hfsplus_write_super, | 297 | .write_super = hfsplus_write_super, |
298 | .sync_fs = hfsplus_sync_fs, | ||
282 | .statfs = hfsplus_statfs, | 299 | .statfs = hfsplus_statfs, |
283 | .remount_fs = hfsplus_remount, | 300 | .remount_fs = hfsplus_remount, |
284 | .show_options = hfsplus_show_options, | 301 | .show_options = hfsplus_show_options, |
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index fc77965be841..f2feaa06bf26 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/statfs.h> | 13 | #include <linux/statfs.h> |
14 | #include <linux/magic.h> | 14 | #include <linux/magic.h> |
15 | #include <linux/sched.h> | 15 | #include <linux/sched.h> |
16 | #include <linux/smp_lock.h> | ||
16 | 17 | ||
17 | /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */ | 18 | /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */ |
18 | 19 | ||
@@ -99,11 +100,16 @@ int hpfs_stop_cycles(struct super_block *s, int key, int *c1, int *c2, | |||
99 | static void hpfs_put_super(struct super_block *s) | 100 | static void hpfs_put_super(struct super_block *s) |
100 | { | 101 | { |
101 | struct hpfs_sb_info *sbi = hpfs_sb(s); | 102 | struct hpfs_sb_info *sbi = hpfs_sb(s); |
103 | |||
104 | lock_kernel(); | ||
105 | |||
102 | kfree(sbi->sb_cp_table); | 106 | kfree(sbi->sb_cp_table); |
103 | kfree(sbi->sb_bmp_dir); | 107 | kfree(sbi->sb_bmp_dir); |
104 | unmark_dirty(s); | 108 | unmark_dirty(s); |
105 | s->s_fs_info = NULL; | 109 | s->s_fs_info = NULL; |
106 | kfree(sbi); | 110 | kfree(sbi); |
111 | |||
112 | unlock_kernel(); | ||
107 | } | 113 | } |
108 | 114 | ||
109 | unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) | 115 | unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) |
@@ -393,6 +399,8 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) | |||
393 | 399 | ||
394 | *flags |= MS_NOATIME; | 400 | *flags |= MS_NOATIME; |
395 | 401 | ||
402 | lock_kernel(); | ||
403 | lock_super(s); | ||
396 | uid = sbi->sb_uid; gid = sbi->sb_gid; | 404 | uid = sbi->sb_uid; gid = sbi->sb_gid; |
397 | umask = 0777 & ~sbi->sb_mode; | 405 | umask = 0777 & ~sbi->sb_mode; |
398 | lowercase = sbi->sb_lowercase; conv = sbi->sb_conv; | 406 | lowercase = sbi->sb_lowercase; conv = sbi->sb_conv; |
@@ -425,9 +433,13 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) | |||
425 | 433 | ||
426 | replace_mount_options(s, new_opts); | 434 | replace_mount_options(s, new_opts); |
427 | 435 | ||
436 | unlock_super(s); | ||
437 | unlock_kernel(); | ||
428 | return 0; | 438 | return 0; |
429 | 439 | ||
430 | out_err: | 440 | out_err: |
441 | unlock_super(s); | ||
442 | unlock_kernel(); | ||
431 | kfree(new_opts); | 443 | kfree(new_opts); |
432 | return -EINVAL; | 444 | return -EINVAL; |
433 | } | 445 | } |
diff --git a/fs/inode.c b/fs/inode.c index bca0c618fdb3..a88baebf77cf 100644 --- a/fs/inode.c +++ b/fs/inode.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/cdev.h> | 22 | #include <linux/cdev.h> |
23 | #include <linux/bootmem.h> | 23 | #include <linux/bootmem.h> |
24 | #include <linux/inotify.h> | 24 | #include <linux/inotify.h> |
25 | #include <linux/fsnotify.h> | ||
25 | #include <linux/mount.h> | 26 | #include <linux/mount.h> |
26 | #include <linux/async.h> | 27 | #include <linux/async.h> |
27 | 28 | ||
@@ -189,6 +190,10 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) | |||
189 | inode->i_private = NULL; | 190 | inode->i_private = NULL; |
190 | inode->i_mapping = mapping; | 191 | inode->i_mapping = mapping; |
191 | 192 | ||
193 | #ifdef CONFIG_FSNOTIFY | ||
194 | inode->i_fsnotify_mask = 0; | ||
195 | #endif | ||
196 | |||
192 | return inode; | 197 | return inode; |
193 | 198 | ||
194 | out_free_security: | 199 | out_free_security: |
@@ -221,6 +226,7 @@ void destroy_inode(struct inode *inode) | |||
221 | BUG_ON(inode_has_buffers(inode)); | 226 | BUG_ON(inode_has_buffers(inode)); |
222 | ima_inode_free(inode); | 227 | ima_inode_free(inode); |
223 | security_inode_free(inode); | 228 | security_inode_free(inode); |
229 | fsnotify_inode_delete(inode); | ||
224 | if (inode->i_sb->s_op->destroy_inode) | 230 | if (inode->i_sb->s_op->destroy_inode) |
225 | inode->i_sb->s_op->destroy_inode(inode); | 231 | inode->i_sb->s_op->destroy_inode(inode); |
226 | else | 232 | else |
@@ -252,6 +258,9 @@ void inode_init_once(struct inode *inode) | |||
252 | INIT_LIST_HEAD(&inode->inotify_watches); | 258 | INIT_LIST_HEAD(&inode->inotify_watches); |
253 | mutex_init(&inode->inotify_mutex); | 259 | mutex_init(&inode->inotify_mutex); |
254 | #endif | 260 | #endif |
261 | #ifdef CONFIG_FSNOTIFY | ||
262 | INIT_HLIST_HEAD(&inode->i_fsnotify_mark_entries); | ||
263 | #endif | ||
255 | } | 264 | } |
256 | EXPORT_SYMBOL(inode_init_once); | 265 | EXPORT_SYMBOL(inode_init_once); |
257 | 266 | ||
@@ -398,6 +407,7 @@ int invalidate_inodes(struct super_block *sb) | |||
398 | mutex_lock(&iprune_mutex); | 407 | mutex_lock(&iprune_mutex); |
399 | spin_lock(&inode_lock); | 408 | spin_lock(&inode_lock); |
400 | inotify_unmount_inodes(&sb->s_inodes); | 409 | inotify_unmount_inodes(&sb->s_inodes); |
410 | fsnotify_unmount_inodes(&sb->s_inodes); | ||
401 | busy = invalidate_list(&sb->s_inodes, &throw_away); | 411 | busy = invalidate_list(&sb->s_inodes, &throw_away); |
402 | spin_unlock(&inode_lock); | 412 | spin_unlock(&inode_lock); |
403 | 413 | ||
@@ -1412,7 +1422,7 @@ void file_update_time(struct file *file) | |||
1412 | if (IS_NOCMTIME(inode)) | 1422 | if (IS_NOCMTIME(inode)) |
1413 | return; | 1423 | return; |
1414 | 1424 | ||
1415 | err = mnt_want_write(file->f_path.mnt); | 1425 | err = mnt_want_write_file(file); |
1416 | if (err) | 1426 | if (err) |
1417 | return; | 1427 | return; |
1418 | 1428 | ||
diff --git a/fs/internal.h b/fs/internal.h index b4dac4fb6b61..d55ef562f0bb 100644 --- a/fs/internal.h +++ b/fs/internal.h | |||
@@ -25,6 +25,8 @@ static inline int sb_is_blkdev_sb(struct super_block *sb) | |||
25 | return sb == blockdev_superblock; | 25 | return sb == blockdev_superblock; |
26 | } | 26 | } |
27 | 27 | ||
28 | extern int __sync_blockdev(struct block_device *bdev, int wait); | ||
29 | |||
28 | #else | 30 | #else |
29 | static inline void bdev_cache_init(void) | 31 | static inline void bdev_cache_init(void) |
30 | { | 32 | { |
@@ -34,6 +36,11 @@ static inline int sb_is_blkdev_sb(struct super_block *sb) | |||
34 | { | 36 | { |
35 | return 0; | 37 | return 0; |
36 | } | 38 | } |
39 | |||
40 | static inline int __sync_blockdev(struct block_device *bdev, int wait) | ||
41 | { | ||
42 | return 0; | ||
43 | } | ||
37 | #endif | 44 | #endif |
38 | 45 | ||
39 | /* | 46 | /* |
@@ -66,3 +73,13 @@ extern void __init mnt_init(void); | |||
66 | * fs_struct.c | 73 | * fs_struct.c |
67 | */ | 74 | */ |
68 | extern void chroot_fs_refs(struct path *, struct path *); | 75 | extern void chroot_fs_refs(struct path *, struct path *); |
76 | |||
77 | /* | ||
78 | * file_table.c | ||
79 | */ | ||
80 | extern void mark_files_ro(struct super_block *); | ||
81 | |||
82 | /* | ||
83 | * super.c | ||
84 | */ | ||
85 | extern int do_remount_sb(struct super_block *, int, void *, int); | ||
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index b4cbe9603c7d..068b34b5a107 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c | |||
@@ -42,11 +42,16 @@ static int isofs_dentry_cmp_ms(struct dentry *dentry, struct qstr *a, struct qst | |||
42 | static void isofs_put_super(struct super_block *sb) | 42 | static void isofs_put_super(struct super_block *sb) |
43 | { | 43 | { |
44 | struct isofs_sb_info *sbi = ISOFS_SB(sb); | 44 | struct isofs_sb_info *sbi = ISOFS_SB(sb); |
45 | |||
45 | #ifdef CONFIG_JOLIET | 46 | #ifdef CONFIG_JOLIET |
47 | lock_kernel(); | ||
48 | |||
46 | if (sbi->s_nls_iocharset) { | 49 | if (sbi->s_nls_iocharset) { |
47 | unload_nls(sbi->s_nls_iocharset); | 50 | unload_nls(sbi->s_nls_iocharset); |
48 | sbi->s_nls_iocharset = NULL; | 51 | sbi->s_nls_iocharset = NULL; |
49 | } | 52 | } |
53 | |||
54 | unlock_kernel(); | ||
50 | #endif | 55 | #endif |
51 | 56 | ||
52 | kfree(sbi); | 57 | kfree(sbi); |
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 249305d65d5b..3451a81b2142 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/vmalloc.h> | 20 | #include <linux/vmalloc.h> |
21 | #include <linux/vfs.h> | 21 | #include <linux/vfs.h> |
22 | #include <linux/crc32.h> | 22 | #include <linux/crc32.h> |
23 | #include <linux/smp_lock.h> | ||
23 | #include "nodelist.h" | 24 | #include "nodelist.h" |
24 | 25 | ||
25 | static int jffs2_flash_setup(struct jffs2_sb_info *c); | 26 | static int jffs2_flash_setup(struct jffs2_sb_info *c); |
@@ -387,6 +388,7 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data) | |||
387 | This also catches the case where it was stopped and this | 388 | This also catches the case where it was stopped and this |
388 | is just a remount to restart it. | 389 | is just a remount to restart it. |
389 | Flush the writebuffer, if necessary, else we lose it */ | 390 | Flush the writebuffer, if necessary, else we lose it */ |
391 | lock_kernel(); | ||
390 | if (!(sb->s_flags & MS_RDONLY)) { | 392 | if (!(sb->s_flags & MS_RDONLY)) { |
391 | jffs2_stop_garbage_collect_thread(c); | 393 | jffs2_stop_garbage_collect_thread(c); |
392 | mutex_lock(&c->alloc_sem); | 394 | mutex_lock(&c->alloc_sem); |
@@ -399,24 +401,10 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data) | |||
399 | 401 | ||
400 | *flags |= MS_NOATIME; | 402 | *flags |= MS_NOATIME; |
401 | 403 | ||
404 | unlock_kernel(); | ||
402 | return 0; | 405 | return 0; |
403 | } | 406 | } |
404 | 407 | ||
405 | void jffs2_write_super (struct super_block *sb) | ||
406 | { | ||
407 | struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); | ||
408 | sb->s_dirt = 0; | ||
409 | |||
410 | if (sb->s_flags & MS_RDONLY) | ||
411 | return; | ||
412 | |||
413 | D1(printk(KERN_DEBUG "jffs2_write_super()\n")); | ||
414 | jffs2_garbage_collect_trigger(c); | ||
415 | jffs2_erase_pending_blocks(c, 0); | ||
416 | jffs2_flush_wbuf_gc(c, 0); | ||
417 | } | ||
418 | |||
419 | |||
420 | /* jffs2_new_inode: allocate a new inode and inocache, add it to the hash, | 408 | /* jffs2_new_inode: allocate a new inode and inocache, add it to the hash, |
421 | fill in the raw_inode while you're at it. */ | 409 | fill in the raw_inode while you're at it. */ |
422 | struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_inode *ri) | 410 | struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_inode *ri) |
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 5e194a5c8e29..2228380c47b9 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h | |||
@@ -181,7 +181,6 @@ void jffs2_dirty_inode(struct inode *inode); | |||
181 | struct inode *jffs2_new_inode (struct inode *dir_i, int mode, | 181 | struct inode *jffs2_new_inode (struct inode *dir_i, int mode, |
182 | struct jffs2_raw_inode *ri); | 182 | struct jffs2_raw_inode *ri); |
183 | int jffs2_statfs (struct dentry *, struct kstatfs *); | 183 | int jffs2_statfs (struct dentry *, struct kstatfs *); |
184 | void jffs2_write_super (struct super_block *); | ||
185 | int jffs2_remount_fs (struct super_block *, int *, char *); | 184 | int jffs2_remount_fs (struct super_block *, int *, char *); |
186 | int jffs2_do_fill_super(struct super_block *sb, void *data, int silent); | 185 | int jffs2_do_fill_super(struct super_block *sb, void *data, int silent); |
187 | void jffs2_gc_release_inode(struct jffs2_sb_info *c, | 186 | void jffs2_gc_release_inode(struct jffs2_sb_info *c, |
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 4c4e18c54a51..07a22caf2687 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c | |||
@@ -53,10 +53,29 @@ static void jffs2_i_init_once(void *foo) | |||
53 | inode_init_once(&f->vfs_inode); | 53 | inode_init_once(&f->vfs_inode); |
54 | } | 54 | } |
55 | 55 | ||
56 | static void jffs2_write_super(struct super_block *sb) | ||
57 | { | ||
58 | struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); | ||
59 | |||
60 | lock_super(sb); | ||
61 | sb->s_dirt = 0; | ||
62 | |||
63 | if (!(sb->s_flags & MS_RDONLY)) { | ||
64 | D1(printk(KERN_DEBUG "jffs2_write_super()\n")); | ||
65 | jffs2_garbage_collect_trigger(c); | ||
66 | jffs2_erase_pending_blocks(c, 0); | ||
67 | jffs2_flush_wbuf_gc(c, 0); | ||
68 | } | ||
69 | |||
70 | unlock_super(sb); | ||
71 | } | ||
72 | |||
56 | static int jffs2_sync_fs(struct super_block *sb, int wait) | 73 | static int jffs2_sync_fs(struct super_block *sb, int wait) |
57 | { | 74 | { |
58 | struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); | 75 | struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); |
59 | 76 | ||
77 | jffs2_write_super(sb); | ||
78 | |||
60 | mutex_lock(&c->alloc_sem); | 79 | mutex_lock(&c->alloc_sem); |
61 | jffs2_flush_wbuf_pad(c); | 80 | jffs2_flush_wbuf_pad(c); |
62 | mutex_unlock(&c->alloc_sem); | 81 | mutex_unlock(&c->alloc_sem); |
@@ -174,6 +193,11 @@ static void jffs2_put_super (struct super_block *sb) | |||
174 | 193 | ||
175 | D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n")); | 194 | D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n")); |
176 | 195 | ||
196 | lock_kernel(); | ||
197 | |||
198 | if (sb->s_dirt) | ||
199 | jffs2_write_super(sb); | ||
200 | |||
177 | mutex_lock(&c->alloc_sem); | 201 | mutex_lock(&c->alloc_sem); |
178 | jffs2_flush_wbuf_pad(c); | 202 | jffs2_flush_wbuf_pad(c); |
179 | mutex_unlock(&c->alloc_sem); | 203 | mutex_unlock(&c->alloc_sem); |
@@ -192,6 +216,8 @@ static void jffs2_put_super (struct super_block *sb) | |||
192 | if (c->mtd->sync) | 216 | if (c->mtd->sync) |
193 | c->mtd->sync(c->mtd); | 217 | c->mtd->sync(c->mtd); |
194 | 218 | ||
219 | unlock_kernel(); | ||
220 | |||
195 | D1(printk(KERN_DEBUG "jffs2_put_super returning\n")); | 221 | D1(printk(KERN_DEBUG "jffs2_put_super returning\n")); |
196 | } | 222 | } |
197 | 223 | ||
diff --git a/fs/jfs/super.c b/fs/jfs/super.c index d9b0e92b3602..09b1b6ee2186 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c | |||
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/crc32.h> | 32 | #include <linux/crc32.h> |
33 | #include <asm/uaccess.h> | 33 | #include <asm/uaccess.h> |
34 | #include <linux/seq_file.h> | 34 | #include <linux/seq_file.h> |
35 | #include <linux/smp_lock.h> | ||
35 | 36 | ||
36 | #include "jfs_incore.h" | 37 | #include "jfs_incore.h" |
37 | #include "jfs_filsys.h" | 38 | #include "jfs_filsys.h" |
@@ -183,6 +184,9 @@ static void jfs_put_super(struct super_block *sb) | |||
183 | int rc; | 184 | int rc; |
184 | 185 | ||
185 | jfs_info("In jfs_put_super"); | 186 | jfs_info("In jfs_put_super"); |
187 | |||
188 | lock_kernel(); | ||
189 | |||
186 | rc = jfs_umount(sb); | 190 | rc = jfs_umount(sb); |
187 | if (rc) | 191 | if (rc) |
188 | jfs_err("jfs_umount failed with return code %d", rc); | 192 | jfs_err("jfs_umount failed with return code %d", rc); |
@@ -195,6 +199,8 @@ static void jfs_put_super(struct super_block *sb) | |||
195 | sbi->direct_inode = NULL; | 199 | sbi->direct_inode = NULL; |
196 | 200 | ||
197 | kfree(sbi); | 201 | kfree(sbi); |
202 | |||
203 | unlock_kernel(); | ||
198 | } | 204 | } |
199 | 205 | ||
200 | enum { | 206 | enum { |
@@ -370,19 +376,24 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) | |||
370 | s64 newLVSize = 0; | 376 | s64 newLVSize = 0; |
371 | int rc = 0; | 377 | int rc = 0; |
372 | int flag = JFS_SBI(sb)->flag; | 378 | int flag = JFS_SBI(sb)->flag; |
379 | int ret; | ||
373 | 380 | ||
374 | if (!parse_options(data, sb, &newLVSize, &flag)) { | 381 | if (!parse_options(data, sb, &newLVSize, &flag)) { |
375 | return -EINVAL; | 382 | return -EINVAL; |
376 | } | 383 | } |
384 | lock_kernel(); | ||
377 | if (newLVSize) { | 385 | if (newLVSize) { |
378 | if (sb->s_flags & MS_RDONLY) { | 386 | if (sb->s_flags & MS_RDONLY) { |
379 | printk(KERN_ERR | 387 | printk(KERN_ERR |
380 | "JFS: resize requires volume to be mounted read-write\n"); | 388 | "JFS: resize requires volume to be mounted read-write\n"); |
389 | unlock_kernel(); | ||
381 | return -EROFS; | 390 | return -EROFS; |
382 | } | 391 | } |
383 | rc = jfs_extendfs(sb, newLVSize, 0); | 392 | rc = jfs_extendfs(sb, newLVSize, 0); |
384 | if (rc) | 393 | if (rc) { |
394 | unlock_kernel(); | ||
385 | return rc; | 395 | return rc; |
396 | } | ||
386 | } | 397 | } |
387 | 398 | ||
388 | if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { | 399 | if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { |
@@ -393,23 +404,31 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) | |||
393 | truncate_inode_pages(JFS_SBI(sb)->direct_inode->i_mapping, 0); | 404 | truncate_inode_pages(JFS_SBI(sb)->direct_inode->i_mapping, 0); |
394 | 405 | ||
395 | JFS_SBI(sb)->flag = flag; | 406 | JFS_SBI(sb)->flag = flag; |
396 | return jfs_mount_rw(sb, 1); | 407 | ret = jfs_mount_rw(sb, 1); |
408 | unlock_kernel(); | ||
409 | return ret; | ||
397 | } | 410 | } |
398 | if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) { | 411 | if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) { |
399 | rc = jfs_umount_rw(sb); | 412 | rc = jfs_umount_rw(sb); |
400 | JFS_SBI(sb)->flag = flag; | 413 | JFS_SBI(sb)->flag = flag; |
414 | unlock_kernel(); | ||
401 | return rc; | 415 | return rc; |
402 | } | 416 | } |
403 | if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY)) | 417 | if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY)) |
404 | if (!(sb->s_flags & MS_RDONLY)) { | 418 | if (!(sb->s_flags & MS_RDONLY)) { |
405 | rc = jfs_umount_rw(sb); | 419 | rc = jfs_umount_rw(sb); |
406 | if (rc) | 420 | if (rc) { |
421 | unlock_kernel(); | ||
407 | return rc; | 422 | return rc; |
423 | } | ||
408 | JFS_SBI(sb)->flag = flag; | 424 | JFS_SBI(sb)->flag = flag; |
409 | return jfs_mount_rw(sb, 1); | 425 | ret = jfs_mount_rw(sb, 1); |
426 | unlock_kernel(); | ||
427 | return ret; | ||
410 | } | 428 | } |
411 | JFS_SBI(sb)->flag = flag; | 429 | JFS_SBI(sb)->flag = flag; |
412 | 430 | ||
431 | unlock_kernel(); | ||
413 | return 0; | 432 | return 0; |
414 | } | 433 | } |
415 | 434 | ||
diff --git a/fs/libfs.c b/fs/libfs.c index 80046ddf5063..ddfa89948c3f 100644 --- a/fs/libfs.c +++ b/fs/libfs.c | |||
@@ -9,6 +9,8 @@ | |||
9 | #include <linux/vfs.h> | 9 | #include <linux/vfs.h> |
10 | #include <linux/mutex.h> | 10 | #include <linux/mutex.h> |
11 | #include <linux/exportfs.h> | 11 | #include <linux/exportfs.h> |
12 | #include <linux/writeback.h> | ||
13 | #include <linux/buffer_head.h> | ||
12 | 14 | ||
13 | #include <asm/uaccess.h> | 15 | #include <asm/uaccess.h> |
14 | 16 | ||
@@ -807,6 +809,29 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid, | |||
807 | } | 809 | } |
808 | EXPORT_SYMBOL_GPL(generic_fh_to_parent); | 810 | EXPORT_SYMBOL_GPL(generic_fh_to_parent); |
809 | 811 | ||
812 | int simple_fsync(struct file *file, struct dentry *dentry, int datasync) | ||
813 | { | ||
814 | struct writeback_control wbc = { | ||
815 | .sync_mode = WB_SYNC_ALL, | ||
816 | .nr_to_write = 0, /* metadata-only; caller takes care of data */ | ||
817 | }; | ||
818 | struct inode *inode = dentry->d_inode; | ||
819 | int err; | ||
820 | int ret; | ||
821 | |||
822 | ret = sync_mapping_buffers(inode->i_mapping); | ||
823 | if (!(inode->i_state & I_DIRTY)) | ||
824 | return ret; | ||
825 | if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) | ||
826 | return ret; | ||
827 | |||
828 | err = sync_inode(inode, &wbc); | ||
829 | if (ret == 0) | ||
830 | ret = err; | ||
831 | return ret; | ||
832 | } | ||
833 | EXPORT_SYMBOL(simple_fsync); | ||
834 | |||
810 | EXPORT_SYMBOL(dcache_dir_close); | 835 | EXPORT_SYMBOL(dcache_dir_close); |
811 | EXPORT_SYMBOL(dcache_dir_lseek); | 836 | EXPORT_SYMBOL(dcache_dir_lseek); |
812 | EXPORT_SYMBOL(dcache_dir_open); | 837 | EXPORT_SYMBOL(dcache_dir_open); |
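The simple_fsync() helper added above gives buffer-head based filesystems a shared ->fsync implementation: flush the inode's dirty metadata buffers, then write the inode itself if it is still dirty (or, for datasync, dirty in a way datasync cares about). Wiring it up is a one-line change per operations table, as the adfs and minix hunks in this series show; a hypothetical user (examplefs is made up) would look like:

	#include <linux/fs.h>

	const struct file_operations examplefs_file_operations = {
		.llseek		= generic_file_llseek,
		.read		= do_sync_read,
		.aio_read	= generic_file_aio_read,
		.write		= do_sync_write,
		.aio_write	= generic_file_aio_write,
		.mmap		= generic_file_mmap,
		.fsync		= simple_fsync,	/* replaces a private per-fs copy */
	};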
diff --git a/fs/minix/dir.c b/fs/minix/dir.c index d4946c4c90e2..e5f206467e40 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c | |||
@@ -22,7 +22,7 @@ static int minix_readdir(struct file *, void *, filldir_t); | |||
22 | const struct file_operations minix_dir_operations = { | 22 | const struct file_operations minix_dir_operations = { |
23 | .read = generic_read_dir, | 23 | .read = generic_read_dir, |
24 | .readdir = minix_readdir, | 24 | .readdir = minix_readdir, |
25 | .fsync = minix_sync_file, | 25 | .fsync = simple_fsync, |
26 | }; | 26 | }; |
27 | 27 | ||
28 | static inline void dir_put_page(struct page *page) | 28 | static inline void dir_put_page(struct page *page) |
diff --git a/fs/minix/file.c b/fs/minix/file.c index 17765f697e50..3eec3e607a87 100644 --- a/fs/minix/file.c +++ b/fs/minix/file.c | |||
@@ -6,15 +6,12 @@ | |||
6 | * minix regular file handling primitives | 6 | * minix regular file handling primitives |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #include <linux/buffer_head.h> /* for fsync_inode_buffers() */ | ||
10 | #include "minix.h" | 9 | #include "minix.h" |
11 | 10 | ||
12 | /* | 11 | /* |
13 | * We have mostly NULLs here: the current defaults are OK for | 12 | * We have mostly NULLs here: the current defaults are OK for |
14 | * the minix filesystem. | 13 | * the minix filesystem. |
15 | */ | 14 | */ |
16 | int minix_sync_file(struct file *, struct dentry *, int); | ||
17 | |||
18 | const struct file_operations minix_file_operations = { | 15 | const struct file_operations minix_file_operations = { |
19 | .llseek = generic_file_llseek, | 16 | .llseek = generic_file_llseek, |
20 | .read = do_sync_read, | 17 | .read = do_sync_read, |
@@ -22,7 +19,7 @@ const struct file_operations minix_file_operations = { | |||
22 | .write = do_sync_write, | 19 | .write = do_sync_write, |
23 | .aio_write = generic_file_aio_write, | 20 | .aio_write = generic_file_aio_write, |
24 | .mmap = generic_file_mmap, | 21 | .mmap = generic_file_mmap, |
25 | .fsync = minix_sync_file, | 22 | .fsync = simple_fsync, |
26 | .splice_read = generic_file_splice_read, | 23 | .splice_read = generic_file_splice_read, |
27 | }; | 24 | }; |
28 | 25 | ||
@@ -30,18 +27,3 @@ const struct inode_operations minix_file_inode_operations = { | |||
30 | .truncate = minix_truncate, | 27 | .truncate = minix_truncate, |
31 | .getattr = minix_getattr, | 28 | .getattr = minix_getattr, |
32 | }; | 29 | }; |
33 | |||
34 | int minix_sync_file(struct file * file, struct dentry *dentry, int datasync) | ||
35 | { | ||
36 | struct inode *inode = dentry->d_inode; | ||
37 | int err; | ||
38 | |||
39 | err = sync_mapping_buffers(inode->i_mapping); | ||
40 | if (!(inode->i_state & I_DIRTY)) | ||
41 | return err; | ||
42 | if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) | ||
43 | return err; | ||
44 | |||
45 | err |= minix_sync_inode(inode); | ||
46 | return err ? -EIO : 0; | ||
47 | } | ||
diff --git a/fs/minix/inode.c b/fs/minix/inode.c index daad3c2740db..f91a23693597 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c | |||
@@ -35,6 +35,8 @@ static void minix_put_super(struct super_block *sb) | |||
35 | int i; | 35 | int i; |
36 | struct minix_sb_info *sbi = minix_sb(sb); | 36 | struct minix_sb_info *sbi = minix_sb(sb); |
37 | 37 | ||
38 | lock_kernel(); | ||
39 | |||
38 | if (!(sb->s_flags & MS_RDONLY)) { | 40 | if (!(sb->s_flags & MS_RDONLY)) { |
39 | if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */ | 41 | if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */ |
40 | sbi->s_ms->s_state = sbi->s_mount_state; | 42 | sbi->s_ms->s_state = sbi->s_mount_state; |
@@ -49,7 +51,7 @@ static void minix_put_super(struct super_block *sb) | |||
49 | sb->s_fs_info = NULL; | 51 | sb->s_fs_info = NULL; |
50 | kfree(sbi); | 52 | kfree(sbi); |
51 | 53 | ||
52 | return; | 54 | unlock_kernel(); |
53 | } | 55 | } |
54 | 56 | ||
55 | static struct kmem_cache * minix_inode_cachep; | 57 | static struct kmem_cache * minix_inode_cachep; |
@@ -554,38 +556,25 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode) | |||
554 | return bh; | 556 | return bh; |
555 | } | 557 | } |
556 | 558 | ||
557 | static struct buffer_head *minix_update_inode(struct inode *inode) | 559 | static int minix_write_inode(struct inode *inode, int wait) |
558 | { | ||
559 | if (INODE_VERSION(inode) == MINIX_V1) | ||
560 | return V1_minix_update_inode(inode); | ||
561 | else | ||
562 | return V2_minix_update_inode(inode); | ||
563 | } | ||
564 | |||
565 | static int minix_write_inode(struct inode * inode, int wait) | ||
566 | { | ||
567 | brelse(minix_update_inode(inode)); | ||
568 | return 0; | ||
569 | } | ||
570 | |||
571 | int minix_sync_inode(struct inode * inode) | ||
572 | { | 560 | { |
573 | int err = 0; | 561 | int err = 0; |
574 | struct buffer_head *bh; | 562 | struct buffer_head *bh; |
575 | 563 | ||
576 | bh = minix_update_inode(inode); | 564 | if (INODE_VERSION(inode) == MINIX_V1) |
577 | if (bh && buffer_dirty(bh)) | 565 | bh = V1_minix_update_inode(inode); |
578 | { | 566 | else |
567 | bh = V2_minix_update_inode(inode); | ||
568 | if (!bh) | ||
569 | return -EIO; | ||
570 | if (wait && buffer_dirty(bh)) { | ||
579 | sync_dirty_buffer(bh); | 571 | sync_dirty_buffer(bh); |
580 | if (buffer_req(bh) && !buffer_uptodate(bh)) | 572 | if (buffer_req(bh) && !buffer_uptodate(bh)) { |
581 | { | ||
582 | printk("IO error syncing minix inode [%s:%08lx]\n", | 573 | printk("IO error syncing minix inode [%s:%08lx]\n", |
583 | inode->i_sb->s_id, inode->i_ino); | 574 | inode->i_sb->s_id, inode->i_ino); |
584 | err = -1; | 575 | err = -EIO; |
585 | } | 576 | } |
586 | } | 577 | } |
587 | else if (!bh) | ||
588 | err = -1; | ||
589 | brelse (bh); | 578 | brelse (bh); |
590 | return err; | 579 | return err; |
591 | } | 580 | } |
diff --git a/fs/minix/minix.h b/fs/minix/minix.h index e6a0b193bea4..cb7fdd11f9a5 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h | |||
@@ -57,7 +57,6 @@ extern int __minix_write_begin(struct file *file, struct address_space *mapping, | |||
57 | extern void V1_minix_truncate(struct inode *); | 57 | extern void V1_minix_truncate(struct inode *); |
58 | extern void V2_minix_truncate(struct inode *); | 58 | extern void V2_minix_truncate(struct inode *); |
59 | extern void minix_truncate(struct inode *); | 59 | extern void minix_truncate(struct inode *); |
60 | extern int minix_sync_inode(struct inode *); | ||
61 | extern void minix_set_inode(struct inode *, dev_t); | 60 | extern void minix_set_inode(struct inode *, dev_t); |
62 | extern int V1_minix_get_block(struct inode *, long, struct buffer_head *, int); | 61 | extern int V1_minix_get_block(struct inode *, long, struct buffer_head *, int); |
63 | extern int V2_minix_get_block(struct inode *, long, struct buffer_head *, int); | 62 | extern int V2_minix_get_block(struct inode *, long, struct buffer_head *, int); |
@@ -72,7 +71,6 @@ extern int minix_empty_dir(struct inode*); | |||
72 | extern void minix_set_link(struct minix_dir_entry*, struct page*, struct inode*); | 71 | extern void minix_set_link(struct minix_dir_entry*, struct page*, struct inode*); |
73 | extern struct minix_dir_entry *minix_dotdot(struct inode*, struct page**); | 72 | extern struct minix_dir_entry *minix_dotdot(struct inode*, struct page**); |
74 | extern ino_t minix_inode_by_name(struct dentry*); | 73 | extern ino_t minix_inode_by_name(struct dentry*); |
75 | extern int minix_sync_file(struct file *, struct dentry *, int); | ||
76 | 74 | ||
77 | extern const struct inode_operations minix_file_inode_operations; | 75 | extern const struct inode_operations minix_file_inode_operations; |
78 | extern const struct inode_operations minix_dir_inode_operations; | 76 | extern const struct inode_operations minix_dir_inode_operations; |
diff --git a/fs/namei.c b/fs/namei.c index c82805d088e1..527119afb6a5 100644 --- a/fs/namei.c +++ b/fs/namei.c | |||
@@ -552,6 +552,17 @@ static __always_inline int link_path_walk(const char *name, struct nameidata *nd | |||
552 | return result; | 552 | return result; |
553 | } | 553 | } |
554 | 554 | ||
555 | static __always_inline void set_root(struct nameidata *nd) | ||
556 | { | ||
557 | if (!nd->root.mnt) { | ||
558 | struct fs_struct *fs = current->fs; | ||
559 | read_lock(&fs->lock); | ||
560 | nd->root = fs->root; | ||
561 | path_get(&nd->root); | ||
562 | read_unlock(&fs->lock); | ||
563 | } | ||
564 | } | ||
565 | |||
555 | static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) | 566 | static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) |
556 | { | 567 | { |
557 | int res = 0; | 568 | int res = 0; |
@@ -560,14 +571,10 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l | |||
560 | goto fail; | 571 | goto fail; |
561 | 572 | ||
562 | if (*link == '/') { | 573 | if (*link == '/') { |
563 | struct fs_struct *fs = current->fs; | 574 | set_root(nd); |
564 | |||
565 | path_put(&nd->path); | 575 | path_put(&nd->path); |
566 | 576 | nd->path = nd->root; | |
567 | read_lock(&fs->lock); | 577 | path_get(&nd->root); |
568 | nd->path = fs->root; | ||
569 | path_get(&fs->root); | ||
570 | read_unlock(&fs->lock); | ||
571 | } | 578 | } |
572 | 579 | ||
573 | res = link_path_walk(link, nd); | 580 | res = link_path_walk(link, nd); |
@@ -668,23 +675,23 @@ loop: | |||
668 | return err; | 675 | return err; |
669 | } | 676 | } |
670 | 677 | ||
671 | int follow_up(struct vfsmount **mnt, struct dentry **dentry) | 678 | int follow_up(struct path *path) |
672 | { | 679 | { |
673 | struct vfsmount *parent; | 680 | struct vfsmount *parent; |
674 | struct dentry *mountpoint; | 681 | struct dentry *mountpoint; |
675 | spin_lock(&vfsmount_lock); | 682 | spin_lock(&vfsmount_lock); |
676 | parent=(*mnt)->mnt_parent; | 683 | parent = path->mnt->mnt_parent; |
677 | if (parent == *mnt) { | 684 | if (parent == path->mnt) { |
678 | spin_unlock(&vfsmount_lock); | 685 | spin_unlock(&vfsmount_lock); |
679 | return 0; | 686 | return 0; |
680 | } | 687 | } |
681 | mntget(parent); | 688 | mntget(parent); |
682 | mountpoint=dget((*mnt)->mnt_mountpoint); | 689 | mountpoint = dget(path->mnt->mnt_mountpoint); |
683 | spin_unlock(&vfsmount_lock); | 690 | spin_unlock(&vfsmount_lock); |
684 | dput(*dentry); | 691 | dput(path->dentry); |
685 | *dentry = mountpoint; | 692 | path->dentry = mountpoint; |
686 | mntput(*mnt); | 693 | mntput(path->mnt); |
687 | *mnt = parent; | 694 | path->mnt = parent; |
688 | return 1; | 695 | return 1; |
689 | } | 696 | } |
690 | 697 | ||
@@ -695,7 +702,7 @@ static int __follow_mount(struct path *path) | |||
695 | { | 702 | { |
696 | int res = 0; | 703 | int res = 0; |
697 | while (d_mountpoint(path->dentry)) { | 704 | while (d_mountpoint(path->dentry)) { |
698 | struct vfsmount *mounted = lookup_mnt(path->mnt, path->dentry); | 705 | struct vfsmount *mounted = lookup_mnt(path); |
699 | if (!mounted) | 706 | if (!mounted) |
700 | break; | 707 | break; |
701 | dput(path->dentry); | 708 | dput(path->dentry); |
@@ -708,32 +715,32 @@ static int __follow_mount(struct path *path) | |||
708 | return res; | 715 | return res; |
709 | } | 716 | } |
710 | 717 | ||
711 | static void follow_mount(struct vfsmount **mnt, struct dentry **dentry) | 718 | static void follow_mount(struct path *path) |
712 | { | 719 | { |
713 | while (d_mountpoint(*dentry)) { | 720 | while (d_mountpoint(path->dentry)) { |
714 | struct vfsmount *mounted = lookup_mnt(*mnt, *dentry); | 721 | struct vfsmount *mounted = lookup_mnt(path); |
715 | if (!mounted) | 722 | if (!mounted) |
716 | break; | 723 | break; |
717 | dput(*dentry); | 724 | dput(path->dentry); |
718 | mntput(*mnt); | 725 | mntput(path->mnt); |
719 | *mnt = mounted; | 726 | path->mnt = mounted; |
720 | *dentry = dget(mounted->mnt_root); | 727 | path->dentry = dget(mounted->mnt_root); |
721 | } | 728 | } |
722 | } | 729 | } |
723 | 730 | ||
724 | /* no need for dcache_lock, as serialization is taken care in | 731 | /* no need for dcache_lock, as serialization is taken care in |
725 | * namespace.c | 732 | * namespace.c |
726 | */ | 733 | */ |
727 | int follow_down(struct vfsmount **mnt, struct dentry **dentry) | 734 | int follow_down(struct path *path) |
728 | { | 735 | { |
729 | struct vfsmount *mounted; | 736 | struct vfsmount *mounted; |
730 | 737 | ||
731 | mounted = lookup_mnt(*mnt, *dentry); | 738 | mounted = lookup_mnt(path); |
732 | if (mounted) { | 739 | if (mounted) { |
733 | dput(*dentry); | 740 | dput(path->dentry); |
734 | mntput(*mnt); | 741 | mntput(path->mnt); |
735 | *mnt = mounted; | 742 | path->mnt = mounted; |
736 | *dentry = dget(mounted->mnt_root); | 743 | path->dentry = dget(mounted->mnt_root); |
737 | return 1; | 744 | return 1; |
738 | } | 745 | } |
739 | return 0; | 746 | return 0; |
@@ -741,19 +748,16 @@ int follow_down(struct vfsmount **mnt, struct dentry **dentry) | |||
741 | 748 | ||
742 | static __always_inline void follow_dotdot(struct nameidata *nd) | 749 | static __always_inline void follow_dotdot(struct nameidata *nd) |
743 | { | 750 | { |
744 | struct fs_struct *fs = current->fs; | 751 | set_root(nd); |
745 | 752 | ||
746 | while(1) { | 753 | while(1) { |
747 | struct vfsmount *parent; | 754 | struct vfsmount *parent; |
748 | struct dentry *old = nd->path.dentry; | 755 | struct dentry *old = nd->path.dentry; |
749 | 756 | ||
750 | read_lock(&fs->lock); | 757 | if (nd->path.dentry == nd->root.dentry && |
751 | if (nd->path.dentry == fs->root.dentry && | 758 | nd->path.mnt == nd->root.mnt) { |
752 | nd->path.mnt == fs->root.mnt) { | ||
753 | read_unlock(&fs->lock); | ||
754 | break; | 759 | break; |
755 | } | 760 | } |
756 | read_unlock(&fs->lock); | ||
757 | spin_lock(&dcache_lock); | 761 | spin_lock(&dcache_lock); |
758 | if (nd->path.dentry != nd->path.mnt->mnt_root) { | 762 | if (nd->path.dentry != nd->path.mnt->mnt_root) { |
759 | nd->path.dentry = dget(nd->path.dentry->d_parent); | 763 | nd->path.dentry = dget(nd->path.dentry->d_parent); |
@@ -775,7 +779,7 @@ static __always_inline void follow_dotdot(struct nameidata *nd) | |||
775 | mntput(nd->path.mnt); | 779 | mntput(nd->path.mnt); |
776 | nd->path.mnt = parent; | 780 | nd->path.mnt = parent; |
777 | } | 781 | } |
778 | follow_mount(&nd->path.mnt, &nd->path.dentry); | 782 | follow_mount(&nd->path); |
779 | } | 783 | } |
780 | 784 | ||
781 | /* | 785 | /* |
@@ -1017,25 +1021,23 @@ static int path_walk(const char *name, struct nameidata *nd) | |||
1017 | return link_path_walk(name, nd); | 1021 | return link_path_walk(name, nd); |
1018 | } | 1022 | } |
1019 | 1023 | ||
1020 | /* Returns 0 and nd will be valid on success; returns error otherwise. */ | 1024 | static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) |
1021 | static int do_path_lookup(int dfd, const char *name, | ||
1022 | unsigned int flags, struct nameidata *nd) | ||
1023 | { | 1025 | { |
1024 | int retval = 0; | 1026 | int retval = 0; |
1025 | int fput_needed; | 1027 | int fput_needed; |
1026 | struct file *file; | 1028 | struct file *file; |
1027 | struct fs_struct *fs = current->fs; | ||
1028 | 1029 | ||
1029 | nd->last_type = LAST_ROOT; /* if there are only slashes... */ | 1030 | nd->last_type = LAST_ROOT; /* if there are only slashes... */ |
1030 | nd->flags = flags; | 1031 | nd->flags = flags; |
1031 | nd->depth = 0; | 1032 | nd->depth = 0; |
1033 | nd->root.mnt = NULL; | ||
1032 | 1034 | ||
1033 | if (*name=='/') { | 1035 | if (*name=='/') { |
1034 | read_lock(&fs->lock); | 1036 | set_root(nd); |
1035 | nd->path = fs->root; | 1037 | nd->path = nd->root; |
1036 | path_get(&fs->root); | 1038 | path_get(&nd->root); |
1037 | read_unlock(&fs->lock); | ||
1038 | } else if (dfd == AT_FDCWD) { | 1039 | } else if (dfd == AT_FDCWD) { |
1040 | struct fs_struct *fs = current->fs; | ||
1039 | read_lock(&fs->lock); | 1041 | read_lock(&fs->lock); |
1040 | nd->path = fs->pwd; | 1042 | nd->path = fs->pwd; |
1041 | path_get(&fs->pwd); | 1043 | path_get(&fs->pwd); |
@@ -1063,17 +1065,29 @@ static int do_path_lookup(int dfd, const char *name, | |||
1063 | 1065 | ||
1064 | fput_light(file, fput_needed); | 1066 | fput_light(file, fput_needed); |
1065 | } | 1067 | } |
1068 | return 0; | ||
1066 | 1069 | ||
1067 | retval = path_walk(name, nd); | 1070 | fput_fail: |
1071 | fput_light(file, fput_needed); | ||
1072 | out_fail: | ||
1073 | return retval; | ||
1074 | } | ||
1075 | |||
1076 | /* Returns 0 and nd will be valid on success; returns error otherwise. */ | ||
1077 | static int do_path_lookup(int dfd, const char *name, | ||
1078 | unsigned int flags, struct nameidata *nd) | ||
1079 | { | ||
1080 | int retval = path_init(dfd, name, flags, nd); | ||
1081 | if (!retval) | ||
1082 | retval = path_walk(name, nd); | ||
1068 | if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && | 1083 | if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && |
1069 | nd->path.dentry->d_inode)) | 1084 | nd->path.dentry->d_inode)) |
1070 | audit_inode(name, nd->path.dentry); | 1085 | audit_inode(name, nd->path.dentry); |
1071 | out_fail: | 1086 | if (nd->root.mnt) { |
1087 | path_put(&nd->root); | ||
1088 | nd->root.mnt = NULL; | ||
1089 | } | ||
1072 | return retval; | 1090 | return retval; |
1073 | |||
1074 | fput_fail: | ||
1075 | fput_light(file, fput_needed); | ||
1076 | goto out_fail; | ||
1077 | } | 1091 | } |
1078 | 1092 | ||
1079 | int path_lookup(const char *name, unsigned int flags, | 1093 | int path_lookup(const char *name, unsigned int flags, |
@@ -1113,14 +1127,18 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, | |||
1113 | nd->path.dentry = dentry; | 1127 | nd->path.dentry = dentry; |
1114 | nd->path.mnt = mnt; | 1128 | nd->path.mnt = mnt; |
1115 | path_get(&nd->path); | 1129 | path_get(&nd->path); |
1130 | nd->root = nd->path; | ||
1131 | path_get(&nd->root); | ||
1116 | 1132 | ||
1117 | retval = path_walk(name, nd); | 1133 | retval = path_walk(name, nd); |
1118 | if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && | 1134 | if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && |
1119 | nd->path.dentry->d_inode)) | 1135 | nd->path.dentry->d_inode)) |
1120 | audit_inode(name, nd->path.dentry); | 1136 | audit_inode(name, nd->path.dentry); |
1121 | 1137 | ||
1122 | return retval; | 1138 | path_put(&nd->root); |
1139 | nd->root.mnt = NULL; | ||
1123 | 1140 | ||
1141 | return retval; | ||
1124 | } | 1142 | } |
1125 | 1143 | ||
1126 | /** | 1144 | /** |
@@ -1676,9 +1694,14 @@ struct file *do_filp_open(int dfd, const char *pathname, | |||
1676 | /* | 1694 | /* |
1677 | * Create - we need to know the parent. | 1695 | * Create - we need to know the parent. |
1678 | */ | 1696 | */ |
1679 | error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd); | 1697 | error = path_init(dfd, pathname, LOOKUP_PARENT, &nd); |
1680 | if (error) | 1698 | if (error) |
1681 | return ERR_PTR(error); | 1699 | return ERR_PTR(error); |
1700 | error = path_walk(pathname, &nd); | ||
1701 | if (error) | ||
1702 | return ERR_PTR(error); | ||
1703 | if (unlikely(!audit_dummy_context())) | ||
1704 | audit_inode(pathname, nd.path.dentry); | ||
1682 | 1705 | ||
1683 | /* | 1706 | /* |
1684 | * We have the parent and last component. First of all, check | 1707 | * We have the parent and last component. First of all, check |
@@ -1806,6 +1829,8 @@ exit: | |||
1806 | if (!IS_ERR(nd.intent.open.file)) | 1829 | if (!IS_ERR(nd.intent.open.file)) |
1807 | release_open_intent(&nd); | 1830 | release_open_intent(&nd); |
1808 | exit_parent: | 1831 | exit_parent: |
1832 | if (nd.root.mnt) | ||
1833 | path_put(&nd.root); | ||
1809 | path_put(&nd.path); | 1834 | path_put(&nd.path); |
1810 | return ERR_PTR(error); | 1835 | return ERR_PTR(error); |
1811 | 1836 | ||
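The namei.c hunks convert follow_up()/follow_down() (and the internal follow_mount helpers) from taking separate vfsmount/dentry pointers to taking a single struct path that is updated in place. A hedged sketch of the caller-side conversion; the caller itself is hypothetical, only the calling convention is taken from the patch:

	#include <linux/dcache.h>
	#include <linux/namei.h>
	#include <linux/path.h>

	/* old form: while (d_mountpoint(dentry) && follow_down(&mnt, &dentry)) ; */
	static void examplefs_cross_mounts(struct path *path)
	{
		/* new form: one struct path; mnt/dentry references are swapped inside */
		while (d_mountpoint(path->dentry) && follow_down(path))
			;
	}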
diff --git a/fs/namespace.c b/fs/namespace.c index 134d494158d9..2dd333b0fe7f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c | |||
@@ -131,10 +131,20 @@ struct vfsmount *alloc_vfsmnt(const char *name) | |||
131 | INIT_LIST_HEAD(&mnt->mnt_share); | 131 | INIT_LIST_HEAD(&mnt->mnt_share); |
132 | INIT_LIST_HEAD(&mnt->mnt_slave_list); | 132 | INIT_LIST_HEAD(&mnt->mnt_slave_list); |
133 | INIT_LIST_HEAD(&mnt->mnt_slave); | 133 | INIT_LIST_HEAD(&mnt->mnt_slave); |
134 | atomic_set(&mnt->__mnt_writers, 0); | 134 | #ifdef CONFIG_SMP |
135 | mnt->mnt_writers = alloc_percpu(int); | ||
136 | if (!mnt->mnt_writers) | ||
137 | goto out_free_devname; | ||
138 | #else | ||
139 | mnt->mnt_writers = 0; | ||
140 | #endif | ||
135 | } | 141 | } |
136 | return mnt; | 142 | return mnt; |
137 | 143 | ||
144 | #ifdef CONFIG_SMP | ||
145 | out_free_devname: | ||
146 | kfree(mnt->mnt_devname); | ||
147 | #endif | ||
138 | out_free_id: | 148 | out_free_id: |
139 | mnt_free_id(mnt); | 149 | mnt_free_id(mnt); |
140 | out_free_cache: | 150 | out_free_cache: |
@@ -171,65 +181,38 @@ int __mnt_is_readonly(struct vfsmount *mnt) | |||
171 | } | 181 | } |
172 | EXPORT_SYMBOL_GPL(__mnt_is_readonly); | 182 | EXPORT_SYMBOL_GPL(__mnt_is_readonly); |
173 | 183 | ||
174 | struct mnt_writer { | 184 | static inline void inc_mnt_writers(struct vfsmount *mnt) |
175 | /* | 185 | { |
176 | * If holding multiple instances of this lock, they | 186 | #ifdef CONFIG_SMP |
177 | * must be ordered by cpu number. | 187 | (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++; |
178 | */ | 188 | #else |
179 | spinlock_t lock; | 189 | mnt->mnt_writers++; |
180 | struct lock_class_key lock_class; /* compiles out with !lockdep */ | 190 | #endif |
181 | unsigned long count; | 191 | } |
182 | struct vfsmount *mnt; | ||
183 | } ____cacheline_aligned_in_smp; | ||
184 | static DEFINE_PER_CPU(struct mnt_writer, mnt_writers); | ||
185 | 192 | ||
186 | static int __init init_mnt_writers(void) | 193 | static inline void dec_mnt_writers(struct vfsmount *mnt) |
187 | { | 194 | { |
188 | int cpu; | 195 | #ifdef CONFIG_SMP |
189 | for_each_possible_cpu(cpu) { | 196 | (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--; |
190 | struct mnt_writer *writer = &per_cpu(mnt_writers, cpu); | 197 | #else |
191 | spin_lock_init(&writer->lock); | 198 | mnt->mnt_writers--; |
192 | lockdep_set_class(&writer->lock, &writer->lock_class); | 199 | #endif |
193 | writer->count = 0; | ||
194 | } | ||
195 | return 0; | ||
196 | } | 200 | } |
197 | fs_initcall(init_mnt_writers); | ||
198 | 201 | ||
199 | static void unlock_mnt_writers(void) | 202 | static unsigned int count_mnt_writers(struct vfsmount *mnt) |
200 | { | 203 | { |
204 | #ifdef CONFIG_SMP | ||
205 | unsigned int count = 0; | ||
201 | int cpu; | 206 | int cpu; |
202 | struct mnt_writer *cpu_writer; | ||
203 | 207 | ||
204 | for_each_possible_cpu(cpu) { | 208 | for_each_possible_cpu(cpu) { |
205 | cpu_writer = &per_cpu(mnt_writers, cpu); | 209 | count += *per_cpu_ptr(mnt->mnt_writers, cpu); |
206 | spin_unlock(&cpu_writer->lock); | ||
207 | } | 210 | } |
208 | } | ||
209 | 211 | ||
210 | static inline void __clear_mnt_count(struct mnt_writer *cpu_writer) | 212 | return count; |
211 | { | 213 | #else |
212 | if (!cpu_writer->mnt) | 214 | return mnt->mnt_writers; |
213 | return; | 215 | #endif |
214 | /* | ||
215 | * This is in case anyone ever leaves an invalid, | ||
216 | * old ->mnt and a count of 0. | ||
217 | */ | ||
218 | if (!cpu_writer->count) | ||
219 | return; | ||
220 | atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers); | ||
221 | cpu_writer->count = 0; | ||
222 | } | ||
223 | /* | ||
224 | * must hold cpu_writer->lock | ||
225 | */ | ||
226 | static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer, | ||
227 | struct vfsmount *mnt) | ||
228 | { | ||
229 | if (cpu_writer->mnt == mnt) | ||
230 | return; | ||
231 | __clear_mnt_count(cpu_writer); | ||
232 | cpu_writer->mnt = mnt; | ||
233 | } | 216 | } |
234 | 217 | ||
235 | /* | 218 | /* |
@@ -253,74 +236,73 @@ static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer, | |||
253 | int mnt_want_write(struct vfsmount *mnt) | 236 | int mnt_want_write(struct vfsmount *mnt) |
254 | { | 237 | { |
255 | int ret = 0; | 238 | int ret = 0; |
256 | struct mnt_writer *cpu_writer; | ||
257 | 239 | ||
258 | cpu_writer = &get_cpu_var(mnt_writers); | 240 | preempt_disable(); |
259 | spin_lock(&cpu_writer->lock); | 241 | inc_mnt_writers(mnt); |
242 | /* | ||
243 | * The store to inc_mnt_writers must be visible before we pass | ||
244 | * MNT_WRITE_HOLD loop below, so that the slowpath can see our | ||
245 | * incremented count after it has set MNT_WRITE_HOLD. | ||
246 | */ | ||
247 | smp_mb(); | ||
248 | while (mnt->mnt_flags & MNT_WRITE_HOLD) | ||
249 | cpu_relax(); | ||
250 | /* | ||
251 | * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will | ||
252 | * be set to match its requirements. So we must not load that until | ||
253 | * MNT_WRITE_HOLD is cleared. | ||
254 | */ | ||
255 | smp_rmb(); | ||
260 | if (__mnt_is_readonly(mnt)) { | 256 | if (__mnt_is_readonly(mnt)) { |
257 | dec_mnt_writers(mnt); | ||
261 | ret = -EROFS; | 258 | ret = -EROFS; |
262 | goto out; | 259 | goto out; |
263 | } | 260 | } |
264 | use_cpu_writer_for_mount(cpu_writer, mnt); | ||
265 | cpu_writer->count++; | ||
266 | out: | 261 | out: |
267 | spin_unlock(&cpu_writer->lock); | 262 | preempt_enable(); |
268 | put_cpu_var(mnt_writers); | ||
269 | return ret; | 263 | return ret; |
270 | } | 264 | } |
271 | EXPORT_SYMBOL_GPL(mnt_want_write); | 265 | EXPORT_SYMBOL_GPL(mnt_want_write); |
272 | 266 | ||
273 | static void lock_mnt_writers(void) | 267 | /** |
274 | { | 268 | * mnt_clone_write - get write access to a mount |
275 | int cpu; | 269 | * @mnt: the mount on which to take a write |
276 | struct mnt_writer *cpu_writer; | 270 | * |
277 | 271 | * This is effectively like mnt_want_write, except | |
278 | for_each_possible_cpu(cpu) { | 272 | * it must only be used to take an extra write reference |
279 | cpu_writer = &per_cpu(mnt_writers, cpu); | 273 | * on a mountpoint that we already know has a write reference |
280 | spin_lock(&cpu_writer->lock); | 274 | * on it. This allows some optimisation. |
281 | __clear_mnt_count(cpu_writer); | 275 | * |
282 | cpu_writer->mnt = NULL; | 276 | * After finished, mnt_drop_write must be called as usual to |
283 | } | 277 | * drop the reference. |
278 | */ | ||
279 | int mnt_clone_write(struct vfsmount *mnt) | ||
280 | { | ||
281 | /* superblock may be r/o */ | ||
282 | if (__mnt_is_readonly(mnt)) | ||
283 | return -EROFS; | ||
284 | preempt_disable(); | ||
285 | inc_mnt_writers(mnt); | ||
286 | preempt_enable(); | ||
287 | return 0; | ||
284 | } | 288 | } |
289 | EXPORT_SYMBOL_GPL(mnt_clone_write); | ||
285 | 290 | ||
286 | /* | 291 | /** |
287 | * These per-cpu write counts are not guaranteed to have | 292 | * mnt_want_write_file - get write access to a file's mount |
288 | * matched increments and decrements on any given cpu. | 293 | * @file: the file whose mount on which to take a write |
289 | * A file open()ed for write on one cpu and close()d on | 294 | * |
290 | * another cpu will imbalance this count. Make sure it | 295 | * This is like mnt_want_write, but it takes a file and can |
291 | * does not get too far out of whack. | 296 | * do some optimisations if the file is open for write already |
292 | */ | 297 | */ |
293 | static void handle_write_count_underflow(struct vfsmount *mnt) | 298 | int mnt_want_write_file(struct file *file) |
294 | { | 299 | { |
295 | if (atomic_read(&mnt->__mnt_writers) >= | 300 | if (!(file->f_mode & FMODE_WRITE)) |
296 | MNT_WRITER_UNDERFLOW_LIMIT) | 301 | return mnt_want_write(file->f_path.mnt); |
297 | return; | 302 | else |
298 | /* | 303 | return mnt_clone_write(file->f_path.mnt); |
299 | * It isn't necessary to hold all of the locks | ||
300 | * at the same time, but doing it this way makes | ||
301 | * us share a lot more code. | ||
302 | */ | ||
303 | lock_mnt_writers(); | ||
304 | /* | ||
305 | * vfsmount_lock is for mnt_flags. | ||
306 | */ | ||
307 | spin_lock(&vfsmount_lock); | ||
308 | /* | ||
309 | * If coalescing the per-cpu writer counts did not | ||
310 | * get us back to a positive writer count, we have | ||
311 | * a bug. | ||
312 | */ | ||
313 | if ((atomic_read(&mnt->__mnt_writers) < 0) && | ||
314 | !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) { | ||
315 | WARN(1, KERN_DEBUG "leak detected on mount(%p) writers " | ||
316 | "count: %d\n", | ||
317 | mnt, atomic_read(&mnt->__mnt_writers)); | ||
318 | /* use the flag to keep the dmesg spam down */ | ||
319 | mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT; | ||
320 | } | ||
321 | spin_unlock(&vfsmount_lock); | ||
322 | unlock_mnt_writers(); | ||
323 | } | 304 | } |
305 | EXPORT_SYMBOL_GPL(mnt_want_write_file); | ||
324 | 306 | ||
325 | /** | 307 | /** |
326 | * mnt_drop_write - give up write access to a mount | 308 | * mnt_drop_write - give up write access to a mount |
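
The rewritten fast path above replaces the old per-CPU mnt_writer structures and their spinlocks with a bare counter guarded only by ordering: increment first, full barrier, spin while MNT_WRITE_HOLD is set, then read the read-only state. The same protocol can be sketched with portable C11 atomics; everything below (want_write(), the writers/flags variables, the flag bit values) is illustrative and not kernel API, and a single atomic counter stands in for the kernel's per-CPU counters with preemption disabled.

#include <errno.h>
#include <stdatomic.h>

static atomic_int writers;          /* stand-in for mnt->mnt_writers */
static atomic_int flags;            /* stand-in for mnt->mnt_flags   */
#define WRITE_HOLD 0x1              /* analogue of MNT_WRITE_HOLD    */
#define READONLY   0x2              /* analogue of MNT_READONLY      */

static int want_write(void)
{
        atomic_fetch_add_explicit(&writers, 1, memory_order_relaxed);
        /* Our increment must be visible before we sample WRITE_HOLD,
         * mirroring the smp_mb() in mnt_want_write(). */
        atomic_thread_fence(memory_order_seq_cst);
        while (atomic_load_explicit(&flags, memory_order_relaxed) & WRITE_HOLD)
                ;                   /* cpu_relax() in the kernel */
        /* Only read READONLY once WRITE_HOLD is clear (smp_rmb() equivalent). */
        atomic_thread_fence(memory_order_acquire);
        if (atomic_load_explicit(&flags, memory_order_relaxed) & READONLY) {
                atomic_fetch_sub_explicit(&writers, 1, memory_order_relaxed);
                return -EROFS;
        }
        return 0;
}

In this model mnt_drop_write() reduces to the matching relaxed decrement.
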
@@ -332,37 +314,9 @@ static void handle_write_count_underflow(struct vfsmount *mnt) | |||
332 | */ | 314 | */ |
333 | void mnt_drop_write(struct vfsmount *mnt) | 315 | void mnt_drop_write(struct vfsmount *mnt) |
334 | { | 316 | { |
335 | int must_check_underflow = 0; | 317 | preempt_disable(); |
336 | struct mnt_writer *cpu_writer; | 318 | dec_mnt_writers(mnt); |
337 | 319 | preempt_enable(); | |
338 | cpu_writer = &get_cpu_var(mnt_writers); | ||
339 | spin_lock(&cpu_writer->lock); | ||
340 | |||
341 | use_cpu_writer_for_mount(cpu_writer, mnt); | ||
342 | if (cpu_writer->count > 0) { | ||
343 | cpu_writer->count--; | ||
344 | } else { | ||
345 | must_check_underflow = 1; | ||
346 | atomic_dec(&mnt->__mnt_writers); | ||
347 | } | ||
348 | |||
349 | spin_unlock(&cpu_writer->lock); | ||
350 | /* | ||
351 | * Logically, we could call this each time, | ||
352 | * but the __mnt_writers cacheline tends to | ||
353 | * be cold, and makes this expensive. | ||
354 | */ | ||
355 | if (must_check_underflow) | ||
356 | handle_write_count_underflow(mnt); | ||
357 | /* | ||
358 | * This could be done right after the spinlock | ||
359 | * is taken because the spinlock keeps us on | ||
360 | * the cpu, and disables preemption. However, | ||
361 | * putting it here bounds the amount that | ||
362 | * __mnt_writers can underflow. Without it, | ||
363 | * we could theoretically wrap __mnt_writers. | ||
364 | */ | ||
365 | put_cpu_var(mnt_writers); | ||
366 | } | 320 | } |
367 | EXPORT_SYMBOL_GPL(mnt_drop_write); | 321 | EXPORT_SYMBOL_GPL(mnt_drop_write); |
368 | 322 | ||
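
As before, every successful mnt_want_write(), mnt_clone_write() or mnt_want_write_file() must be balanced by mnt_drop_write() on the same mount. A typical caller shape, as a sketch (example_setflags() and do_modify_inode() are made-up names for illustration, not existing kernel functions):

static long example_setflags(struct file *file, unsigned int arg)
{
        int err;

        err = mnt_want_write_file(file);   /* cheap clone path if the file
                                              is already open for write */
        if (err)
                return err;

        err = do_modify_inode(file->f_path.dentry->d_inode, arg);

        mnt_drop_write(file->f_path.mnt);  /* always pair the drop */
        return err;
}
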
@@ -370,24 +324,41 @@ static int mnt_make_readonly(struct vfsmount *mnt) | |||
370 | { | 324 | { |
371 | int ret = 0; | 325 | int ret = 0; |
372 | 326 | ||
373 | lock_mnt_writers(); | 327 | spin_lock(&vfsmount_lock); |
328 | mnt->mnt_flags |= MNT_WRITE_HOLD; | ||
374 | /* | 329 | /* |
375 | * With all the locks held, this value is stable | 330 | * After storing MNT_WRITE_HOLD, we'll read the counters. This store |
331 | * should be visible before we do. | ||
376 | */ | 332 | */ |
377 | if (atomic_read(&mnt->__mnt_writers) > 0) { | 333 | smp_mb(); |
378 | ret = -EBUSY; | 334 | |
379 | goto out; | ||
380 | } | ||
381 | /* | 335 | /* |
382 | * nobody can do a successful mnt_want_write() with all | 336 | * With writers on hold, if this value is zero, then there are |
383 | * of the counts in MNT_DENIED_WRITE and the locks held. | 337 | * definitely no active writers (although held writers may subsequently |
338 | * increment the count, they'll have to wait, and decrement it after | ||
339 | * seeing MNT_READONLY). | ||
340 | * | ||
341 | * It is OK to have counter incremented on one CPU and decremented on | ||
342 | * another: the sum will add up correctly. The danger would be when we | ||
343 | * sum up each counter, if we read a counter before it is incremented, | ||
344 | * but then read another CPU's count which it has been subsequently | ||
345 | * decremented from -- we would see more decrements than we should. | ||
346 | * MNT_WRITE_HOLD protects against this scenario, because | ||
347 | * mnt_want_write first increments count, then smp_mb, then spins on | ||
348 | * MNT_WRITE_HOLD, so it can't be decremented by another CPU while | ||
349 | * we're counting up here. | ||
384 | */ | 350 | */ |
385 | spin_lock(&vfsmount_lock); | 351 | if (count_mnt_writers(mnt) > 0) |
386 | if (!ret) | 352 | ret = -EBUSY; |
353 | else | ||
387 | mnt->mnt_flags |= MNT_READONLY; | 354 | mnt->mnt_flags |= MNT_READONLY; |
355 | /* | ||
356 | * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers | ||
357 | * that become unheld will see MNT_READONLY. | ||
358 | */ | ||
359 | smp_wmb(); | ||
360 | mnt->mnt_flags &= ~MNT_WRITE_HOLD; | ||
388 | spin_unlock(&vfsmount_lock); | 361 | spin_unlock(&vfsmount_lock); |
389 | out: | ||
390 | unlock_mnt_writers(); | ||
391 | return ret; | 362 | return ret; |
392 | } | 363 | } |
393 | 364 | ||
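
Continuing the illustrative C11 model from the mnt_want_write() sketch above, the read-only transition is the other half of the protocol: raise the hold bit, force it visible, sum the writer count while any new writers are parked in their spin loop, then publish READONLY before releasing the hold bit. In the kernel count_mnt_writers() sums per-CPU counters; a plain atomic read stands in for it here.

static int make_readonly(void)
{
        int ret = 0;

        atomic_fetch_or_explicit(&flags, WRITE_HOLD, memory_order_relaxed);
        /* The hold bit must be visible before we read the counter (smp_mb()). */
        atomic_thread_fence(memory_order_seq_cst);

        if (atomic_load_explicit(&writers, memory_order_relaxed) > 0)
                ret = -EBUSY;              /* active writers: refuse */
        else
                atomic_fetch_or_explicit(&flags, READONLY, memory_order_relaxed);

        /* READONLY must be visible before the hold bit drops (smp_wmb()). */
        atomic_thread_fence(memory_order_release);
        atomic_fetch_and_explicit(&flags, ~WRITE_HOLD, memory_order_relaxed);
        return ret;
}
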
@@ -410,6 +381,9 @@ void free_vfsmnt(struct vfsmount *mnt) | |||
410 | { | 381 | { |
411 | kfree(mnt->mnt_devname); | 382 | kfree(mnt->mnt_devname); |
412 | mnt_free_id(mnt); | 383 | mnt_free_id(mnt); |
384 | #ifdef CONFIG_SMP | ||
385 | free_percpu(mnt->mnt_writers); | ||
386 | #endif | ||
413 | kmem_cache_free(mnt_cache, mnt); | 387 | kmem_cache_free(mnt_cache, mnt); |
414 | } | 388 | } |
415 | 389 | ||
@@ -442,11 +416,11 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry, | |||
442 | * lookup_mnt increments the ref count before returning | 416 | * lookup_mnt increments the ref count before returning |
443 | * the vfsmount struct. | 417 | * the vfsmount struct. |
444 | */ | 418 | */ |
445 | struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) | 419 | struct vfsmount *lookup_mnt(struct path *path) |
446 | { | 420 | { |
447 | struct vfsmount *child_mnt; | 421 | struct vfsmount *child_mnt; |
448 | spin_lock(&vfsmount_lock); | 422 | spin_lock(&vfsmount_lock); |
449 | if ((child_mnt = __lookup_mnt(mnt, dentry, 1))) | 423 | if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1))) |
450 | mntget(child_mnt); | 424 | mntget(child_mnt); |
451 | spin_unlock(&vfsmount_lock); | 425 | spin_unlock(&vfsmount_lock); |
452 | return child_mnt; | 426 | return child_mnt; |
@@ -604,38 +578,18 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, | |||
604 | 578 | ||
605 | static inline void __mntput(struct vfsmount *mnt) | 579 | static inline void __mntput(struct vfsmount *mnt) |
606 | { | 580 | { |
607 | int cpu; | ||
608 | struct super_block *sb = mnt->mnt_sb; | 581 | struct super_block *sb = mnt->mnt_sb; |
609 | /* | 582 | /* |
610 | * We don't have to hold all of the locks at the | ||
611 | * same time here because we know that we're the | ||
612 | * last reference to mnt and that no new writers | ||
613 | * can come in. | ||
614 | */ | ||
615 | for_each_possible_cpu(cpu) { | ||
616 | struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu); | ||
617 | spin_lock(&cpu_writer->lock); | ||
618 | if (cpu_writer->mnt != mnt) { | ||
619 | spin_unlock(&cpu_writer->lock); | ||
620 | continue; | ||
621 | } | ||
622 | atomic_add(cpu_writer->count, &mnt->__mnt_writers); | ||
623 | cpu_writer->count = 0; | ||
624 | /* | ||
625 | * Might as well do this so that no one | ||
626 | * ever sees the pointer and expects | ||
627 | * it to be valid. | ||
628 | */ | ||
629 | cpu_writer->mnt = NULL; | ||
630 | spin_unlock(&cpu_writer->lock); | ||
631 | } | ||
632 | /* | ||
633 | * This probably indicates that somebody messed | 583 | * This probably indicates that somebody messed |
634 | * up a mnt_want/drop_write() pair. If this | 584 | * up a mnt_want/drop_write() pair. If this |
635 | * happens, the filesystem was probably unable | 585 | * happens, the filesystem was probably unable |
636 | * to make r/w->r/o transitions. | 586 | * to make r/w->r/o transitions. |
637 | */ | 587 | */ |
638 | WARN_ON(atomic_read(&mnt->__mnt_writers)); | 588 | /* |
589 | * atomic_dec_and_lock() used to deal with ->mnt_count decrements | ||
590 | * provides barriers, so count_mnt_writers() below is safe. AV | ||
591 | */ | ||
592 | WARN_ON(count_mnt_writers(mnt)); | ||
639 | dput(mnt->mnt_root); | 593 | dput(mnt->mnt_root); |
640 | free_vfsmnt(mnt); | 594 | free_vfsmnt(mnt); |
641 | deactivate_super(sb); | 595 | deactivate_super(sb); |
@@ -1106,11 +1060,8 @@ static int do_umount(struct vfsmount *mnt, int flags) | |||
1106 | * we just try to remount it readonly. | 1060 | * we just try to remount it readonly. |
1107 | */ | 1061 | */ |
1108 | down_write(&sb->s_umount); | 1062 | down_write(&sb->s_umount); |
1109 | if (!(sb->s_flags & MS_RDONLY)) { | 1063 | if (!(sb->s_flags & MS_RDONLY)) |
1110 | lock_kernel(); | ||
1111 | retval = do_remount_sb(sb, MS_RDONLY, NULL, 0); | 1064 | retval = do_remount_sb(sb, MS_RDONLY, NULL, 0); |
1112 | unlock_kernel(); | ||
1113 | } | ||
1114 | up_write(&sb->s_umount); | 1065 | up_write(&sb->s_umount); |
1115 | return retval; | 1066 | return retval; |
1116 | } | 1067 | } |
@@ -1253,11 +1204,11 @@ Enomem: | |||
1253 | return NULL; | 1204 | return NULL; |
1254 | } | 1205 | } |
1255 | 1206 | ||
1256 | struct vfsmount *collect_mounts(struct vfsmount *mnt, struct dentry *dentry) | 1207 | struct vfsmount *collect_mounts(struct path *path) |
1257 | { | 1208 | { |
1258 | struct vfsmount *tree; | 1209 | struct vfsmount *tree; |
1259 | down_write(&namespace_sem); | 1210 | down_write(&namespace_sem); |
1260 | tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE); | 1211 | tree = copy_tree(path->mnt, path->dentry, CL_COPY_ALL | CL_PRIVATE); |
1261 | up_write(&namespace_sem); | 1212 | up_write(&namespace_sem); |
1262 | return tree; | 1213 | return tree; |
1263 | } | 1214 | } |
@@ -1430,7 +1381,7 @@ static int graft_tree(struct vfsmount *mnt, struct path *path) | |||
1430 | goto out_unlock; | 1381 | goto out_unlock; |
1431 | 1382 | ||
1432 | err = -ENOENT; | 1383 | err = -ENOENT; |
1433 | if (IS_ROOT(path->dentry) || !d_unhashed(path->dentry)) | 1384 | if (!d_unlinked(path->dentry)) |
1434 | err = attach_recursive_mnt(mnt, path, NULL); | 1385 | err = attach_recursive_mnt(mnt, path, NULL); |
1435 | out_unlock: | 1386 | out_unlock: |
1436 | mutex_unlock(&path->dentry->d_inode->i_mutex); | 1387 | mutex_unlock(&path->dentry->d_inode->i_mutex); |
@@ -1601,7 +1552,7 @@ static int do_move_mount(struct path *path, char *old_name) | |||
1601 | 1552 | ||
1602 | down_write(&namespace_sem); | 1553 | down_write(&namespace_sem); |
1603 | while (d_mountpoint(path->dentry) && | 1554 | while (d_mountpoint(path->dentry) && |
1604 | follow_down(&path->mnt, &path->dentry)) | 1555 | follow_down(path)) |
1605 | ; | 1556 | ; |
1606 | err = -EINVAL; | 1557 | err = -EINVAL; |
1607 | if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) | 1558 | if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) |
@@ -1612,7 +1563,7 @@ static int do_move_mount(struct path *path, char *old_name) | |||
1612 | if (IS_DEADDIR(path->dentry->d_inode)) | 1563 | if (IS_DEADDIR(path->dentry->d_inode)) |
1613 | goto out1; | 1564 | goto out1; |
1614 | 1565 | ||
1615 | if (!IS_ROOT(path->dentry) && d_unhashed(path->dentry)) | 1566 | if (d_unlinked(path->dentry)) |
1616 | goto out1; | 1567 | goto out1; |
1617 | 1568 | ||
1618 | err = -EINVAL; | 1569 | err = -EINVAL; |
@@ -1676,7 +1627,9 @@ static int do_new_mount(struct path *path, char *type, int flags, | |||
1676 | if (!capable(CAP_SYS_ADMIN)) | 1627 | if (!capable(CAP_SYS_ADMIN)) |
1677 | return -EPERM; | 1628 | return -EPERM; |
1678 | 1629 | ||
1630 | lock_kernel(); | ||
1679 | mnt = do_kern_mount(type, flags, name, data); | 1631 | mnt = do_kern_mount(type, flags, name, data); |
1632 | unlock_kernel(); | ||
1680 | if (IS_ERR(mnt)) | 1633 | if (IS_ERR(mnt)) |
1681 | return PTR_ERR(mnt); | 1634 | return PTR_ERR(mnt); |
1682 | 1635 | ||
@@ -1695,10 +1648,10 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path, | |||
1695 | down_write(&namespace_sem); | 1648 | down_write(&namespace_sem); |
1696 | /* Something was mounted here while we slept */ | 1649 | /* Something was mounted here while we slept */ |
1697 | while (d_mountpoint(path->dentry) && | 1650 | while (d_mountpoint(path->dentry) && |
1698 | follow_down(&path->mnt, &path->dentry)) | 1651 | follow_down(path)) |
1699 | ; | 1652 | ; |
1700 | err = -EINVAL; | 1653 | err = -EINVAL; |
1701 | if (!check_mnt(path->mnt)) | 1654 | if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt)) |
1702 | goto unlock; | 1655 | goto unlock; |
1703 | 1656 | ||
1704 | /* Refuse the same filesystem on the same mount point */ | 1657 | /* Refuse the same filesystem on the same mount point */ |
@@ -2092,10 +2045,8 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, | |||
2092 | if (retval < 0) | 2045 | if (retval < 0) |
2093 | goto out3; | 2046 | goto out3; |
2094 | 2047 | ||
2095 | lock_kernel(); | ||
2096 | retval = do_mount((char *)dev_page, dir_page, (char *)type_page, | 2048 | retval = do_mount((char *)dev_page, dir_page, (char *)type_page, |
2097 | flags, (void *)data_page); | 2049 | flags, (void *)data_page); |
2098 | unlock_kernel(); | ||
2099 | free_page(data_page); | 2050 | free_page(data_page); |
2100 | 2051 | ||
2101 | out3: | 2052 | out3: |
@@ -2175,9 +2126,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, | |||
2175 | error = -ENOENT; | 2126 | error = -ENOENT; |
2176 | if (IS_DEADDIR(new.dentry->d_inode)) | 2127 | if (IS_DEADDIR(new.dentry->d_inode)) |
2177 | goto out2; | 2128 | goto out2; |
2178 | if (d_unhashed(new.dentry) && !IS_ROOT(new.dentry)) | 2129 | if (d_unlinked(new.dentry)) |
2179 | goto out2; | 2130 | goto out2; |
2180 | if (d_unhashed(old.dentry) && !IS_ROOT(old.dentry)) | 2131 | if (d_unlinked(old.dentry)) |
2181 | goto out2; | 2132 | goto out2; |
2182 | error = -EBUSY; | 2133 | error = -EBUSY; |
2183 | if (new.mnt == root.mnt || | 2134 | if (new.mnt == root.mnt || |
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index d642f0e5b365..b99ce205b1bd 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c | |||
@@ -736,6 +736,8 @@ static void ncp_put_super(struct super_block *sb) | |||
736 | { | 736 | { |
737 | struct ncp_server *server = NCP_SBP(sb); | 737 | struct ncp_server *server = NCP_SBP(sb); |
738 | 738 | ||
739 | lock_kernel(); | ||
740 | |||
739 | ncp_lock_server(server); | 741 | ncp_lock_server(server); |
740 | ncp_disconnect(server); | 742 | ncp_disconnect(server); |
741 | ncp_unlock_server(server); | 743 | ncp_unlock_server(server); |
@@ -769,6 +771,8 @@ static void ncp_put_super(struct super_block *sb) | |||
769 | vfree(server->packet); | 771 | vfree(server->packet); |
770 | sb->s_fs_info = NULL; | 772 | sb->s_fs_info = NULL; |
771 | kfree(server); | 773 | kfree(server); |
774 | |||
775 | unlock_kernel(); | ||
772 | } | 776 | } |
773 | 777 | ||
774 | static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf) | 778 | static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf) |
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 64a288ee046d..f01caec84463 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c | |||
@@ -154,7 +154,7 @@ out_err: | |||
154 | goto out; | 154 | goto out; |
155 | out_follow: | 155 | out_follow: |
156 | while (d_mountpoint(nd->path.dentry) && | 156 | while (d_mountpoint(nd->path.dentry) && |
157 | follow_down(&nd->path.mnt, &nd->path.dentry)) | 157 | follow_down(&nd->path)) |
158 | ; | 158 | ; |
159 | err = 0; | 159 | err = 0; |
160 | goto out; | 160 | goto out; |
diff --git a/fs/nfs/super.c b/fs/nfs/super.c index d2d67781c579..26127b69a275 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c | |||
@@ -1813,6 +1813,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) | |||
1813 | if (data == NULL) | 1813 | if (data == NULL) |
1814 | return -ENOMEM; | 1814 | return -ENOMEM; |
1815 | 1815 | ||
1816 | lock_kernel(); | ||
1816 | /* fill out struct with values from existing mount */ | 1817 | /* fill out struct with values from existing mount */ |
1817 | data->flags = nfss->flags; | 1818 | data->flags = nfss->flags; |
1818 | data->rsize = nfss->rsize; | 1819 | data->rsize = nfss->rsize; |
@@ -1837,6 +1838,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) | |||
1837 | error = nfs_compare_remount_data(nfss, data); | 1838 | error = nfs_compare_remount_data(nfss, data); |
1838 | out: | 1839 | out: |
1839 | kfree(data); | 1840 | kfree(data); |
1841 | unlock_kernel(); | ||
1840 | return error; | 1842 | return error; |
1841 | } | 1843 | } |
1842 | 1844 | ||
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 5839b229cd0e..8b1f8efb4690 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c | |||
@@ -847,9 +847,8 @@ exp_get_fsid_key(svc_client *clp, int fsid) | |||
847 | return exp_find_key(clp, FSID_NUM, fsidv, NULL); | 847 | return exp_find_key(clp, FSID_NUM, fsidv, NULL); |
848 | } | 848 | } |
849 | 849 | ||
850 | static svc_export *exp_get_by_name(svc_client *clp, struct vfsmount *mnt, | 850 | static svc_export *exp_get_by_name(svc_client *clp, const struct path *path, |
851 | struct dentry *dentry, | 851 | struct cache_req *reqp) |
852 | struct cache_req *reqp) | ||
853 | { | 852 | { |
854 | struct svc_export *exp, key; | 853 | struct svc_export *exp, key; |
855 | int err; | 854 | int err; |
@@ -858,8 +857,7 @@ static svc_export *exp_get_by_name(svc_client *clp, struct vfsmount *mnt, | |||
858 | return ERR_PTR(-ENOENT); | 857 | return ERR_PTR(-ENOENT); |
859 | 858 | ||
860 | key.ex_client = clp; | 859 | key.ex_client = clp; |
861 | key.ex_path.mnt = mnt; | 860 | key.ex_path = *path; |
862 | key.ex_path.dentry = dentry; | ||
863 | 861 | ||
864 | exp = svc_export_lookup(&key); | 862 | exp = svc_export_lookup(&key); |
865 | if (exp == NULL) | 863 | if (exp == NULL) |
@@ -873,24 +871,19 @@ static svc_export *exp_get_by_name(svc_client *clp, struct vfsmount *mnt, | |||
873 | /* | 871 | /* |
874 | * Find the export entry for a given dentry. | 872 | * Find the export entry for a given dentry. |
875 | */ | 873 | */ |
876 | static struct svc_export *exp_parent(svc_client *clp, struct vfsmount *mnt, | 874 | static struct svc_export *exp_parent(svc_client *clp, struct path *path) |
877 | struct dentry *dentry, | ||
878 | struct cache_req *reqp) | ||
879 | { | 875 | { |
880 | svc_export *exp; | 876 | struct dentry *saved = dget(path->dentry); |
881 | 877 | svc_export *exp = exp_get_by_name(clp, path, NULL); | |
882 | dget(dentry); | 878 | |
883 | exp = exp_get_by_name(clp, mnt, dentry, reqp); | 879 | while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) { |
884 | 880 | struct dentry *parent = dget_parent(path->dentry); | |
885 | while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) { | 881 | dput(path->dentry); |
886 | struct dentry *parent; | 882 | path->dentry = parent; |
887 | 883 | exp = exp_get_by_name(clp, path, NULL); | |
888 | parent = dget_parent(dentry); | ||
889 | dput(dentry); | ||
890 | dentry = parent; | ||
891 | exp = exp_get_by_name(clp, mnt, dentry, reqp); | ||
892 | } | 884 | } |
893 | dput(dentry); | 885 | dput(path->dentry); |
886 | path->dentry = saved; | ||
894 | return exp; | 887 | return exp; |
895 | } | 888 | } |
896 | 889 | ||
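
The rewritten exp_parent() (and rqst_exp_parent() further down) walks path->dentry towards the root until an export matches, then restores the dentry it was handed. The reference counting is the subtle part; here is the same loop reduced to a hedged generic sketch, with lookup_export() as an illustrative stand-in for exp_get_by_name():

static struct svc_export *walk_up_for_export(struct svc_rqst *rqstp,
                                             struct path *path)
{
        /* Pin the caller's dentry separately so it can be handed back. */
        struct dentry *saved = dget(path->dentry);
        struct svc_export *exp = lookup_export(rqstp, path);   /* illustrative */

        while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) {
                /* Take a ref on the parent before dropping the child, so
                 * path->dentry always carries exactly one "walk" reference. */
                struct dentry *parent = dget_parent(path->dentry);
                dput(path->dentry);
                path->dentry = parent;
                exp = lookup_export(rqstp, path);
        }
        dput(path->dentry);     /* drop the walk reference ...              */
        path->dentry = saved;   /* ... and return the original dentry, still
                                   pinned by the dget() taken at the top    */
        return exp;
}
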
@@ -1018,7 +1011,7 @@ exp_export(struct nfsctl_export *nxp) | |||
1018 | goto out_put_clp; | 1011 | goto out_put_clp; |
1019 | err = -EINVAL; | 1012 | err = -EINVAL; |
1020 | 1013 | ||
1021 | exp = exp_get_by_name(clp, path.mnt, path.dentry, NULL); | 1014 | exp = exp_get_by_name(clp, &path, NULL); |
1022 | 1015 | ||
1023 | memset(&new, 0, sizeof(new)); | 1016 | memset(&new, 0, sizeof(new)); |
1024 | 1017 | ||
@@ -1135,7 +1128,7 @@ exp_unexport(struct nfsctl_export *nxp) | |||
1135 | goto out_domain; | 1128 | goto out_domain; |
1136 | 1129 | ||
1137 | err = -EINVAL; | 1130 | err = -EINVAL; |
1138 | exp = exp_get_by_name(dom, path.mnt, path.dentry, NULL); | 1131 | exp = exp_get_by_name(dom, &path, NULL); |
1139 | path_put(&path); | 1132 | path_put(&path); |
1140 | if (IS_ERR(exp)) | 1133 | if (IS_ERR(exp)) |
1141 | goto out_domain; | 1134 | goto out_domain; |
@@ -1177,7 +1170,7 @@ exp_rootfh(svc_client *clp, char *name, struct knfsd_fh *f, int maxsize) | |||
1177 | dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n", | 1170 | dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n", |
1178 | name, path.dentry, clp->name, | 1171 | name, path.dentry, clp->name, |
1179 | inode->i_sb->s_id, inode->i_ino); | 1172 | inode->i_sb->s_id, inode->i_ino); |
1180 | exp = exp_parent(clp, path.mnt, path.dentry, NULL); | 1173 | exp = exp_parent(clp, &path); |
1181 | if (IS_ERR(exp)) { | 1174 | if (IS_ERR(exp)) { |
1182 | err = PTR_ERR(exp); | 1175 | err = PTR_ERR(exp); |
1183 | goto out; | 1176 | goto out; |
@@ -1207,7 +1200,7 @@ static struct svc_export *exp_find(struct auth_domain *clp, int fsid_type, | |||
1207 | if (IS_ERR(ek)) | 1200 | if (IS_ERR(ek)) |
1208 | return ERR_CAST(ek); | 1201 | return ERR_CAST(ek); |
1209 | 1202 | ||
1210 | exp = exp_get_by_name(clp, ek->ek_path.mnt, ek->ek_path.dentry, reqp); | 1203 | exp = exp_get_by_name(clp, &ek->ek_path, reqp); |
1211 | cache_put(&ek->h, &svc_expkey_cache); | 1204 | cache_put(&ek->h, &svc_expkey_cache); |
1212 | 1205 | ||
1213 | if (IS_ERR(exp)) | 1206 | if (IS_ERR(exp)) |
@@ -1247,8 +1240,7 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) | |||
1247 | * use exp_get_by_name() or exp_find(). | 1240 | * use exp_get_by_name() or exp_find(). |
1248 | */ | 1241 | */ |
1249 | struct svc_export * | 1242 | struct svc_export * |
1250 | rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt, | 1243 | rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path) |
1251 | struct dentry *dentry) | ||
1252 | { | 1244 | { |
1253 | struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); | 1245 | struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); |
1254 | 1246 | ||
@@ -1256,8 +1248,7 @@ rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt, | |||
1256 | goto gss; | 1248 | goto gss; |
1257 | 1249 | ||
1258 | /* First try the auth_unix client: */ | 1250 | /* First try the auth_unix client: */ |
1259 | exp = exp_get_by_name(rqstp->rq_client, mnt, dentry, | 1251 | exp = exp_get_by_name(rqstp->rq_client, path, &rqstp->rq_chandle); |
1260 | &rqstp->rq_chandle); | ||
1261 | if (PTR_ERR(exp) == -ENOENT) | 1252 | if (PTR_ERR(exp) == -ENOENT) |
1262 | goto gss; | 1253 | goto gss; |
1263 | if (IS_ERR(exp)) | 1254 | if (IS_ERR(exp)) |
@@ -1269,8 +1260,7 @@ gss: | |||
1269 | /* Otherwise, try falling back on gss client */ | 1260 | /* Otherwise, try falling back on gss client */ |
1270 | if (rqstp->rq_gssclient == NULL) | 1261 | if (rqstp->rq_gssclient == NULL) |
1271 | return exp; | 1262 | return exp; |
1272 | gssexp = exp_get_by_name(rqstp->rq_gssclient, mnt, dentry, | 1263 | gssexp = exp_get_by_name(rqstp->rq_gssclient, path, &rqstp->rq_chandle); |
1273 | &rqstp->rq_chandle); | ||
1274 | if (PTR_ERR(gssexp) == -ENOENT) | 1264 | if (PTR_ERR(gssexp) == -ENOENT) |
1275 | return exp; | 1265 | return exp; |
1276 | if (!IS_ERR(exp)) | 1266 | if (!IS_ERR(exp)) |
@@ -1309,23 +1299,19 @@ gss: | |||
1309 | } | 1299 | } |
1310 | 1300 | ||
1311 | struct svc_export * | 1301 | struct svc_export * |
1312 | rqst_exp_parent(struct svc_rqst *rqstp, struct vfsmount *mnt, | 1302 | rqst_exp_parent(struct svc_rqst *rqstp, struct path *path) |
1313 | struct dentry *dentry) | ||
1314 | { | 1303 | { |
1315 | struct svc_export *exp; | 1304 | struct dentry *saved = dget(path->dentry); |
1316 | 1305 | struct svc_export *exp = rqst_exp_get_by_name(rqstp, path); | |
1317 | dget(dentry); | 1306 | |
1318 | exp = rqst_exp_get_by_name(rqstp, mnt, dentry); | 1307 | while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) { |
1319 | 1308 | struct dentry *parent = dget_parent(path->dentry); | |
1320 | while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) { | 1309 | dput(path->dentry); |
1321 | struct dentry *parent; | 1310 | path->dentry = parent; |
1322 | 1311 | exp = rqst_exp_get_by_name(rqstp, path); | |
1323 | parent = dget_parent(dentry); | ||
1324 | dput(dentry); | ||
1325 | dentry = parent; | ||
1326 | exp = rqst_exp_get_by_name(rqstp, mnt, dentry); | ||
1327 | } | 1312 | } |
1328 | dput(dentry); | 1313 | dput(path->dentry); |
1314 | path->dentry = saved; | ||
1329 | return exp; | 1315 | return exp; |
1330 | } | 1316 | } |
1331 | 1317 | ||
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index bd584bcf1d9f..99f835753596 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c | |||
@@ -101,36 +101,35 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, | |||
101 | { | 101 | { |
102 | struct svc_export *exp = *expp, *exp2 = NULL; | 102 | struct svc_export *exp = *expp, *exp2 = NULL; |
103 | struct dentry *dentry = *dpp; | 103 | struct dentry *dentry = *dpp; |
104 | struct vfsmount *mnt = mntget(exp->ex_path.mnt); | 104 | struct path path = {.mnt = mntget(exp->ex_path.mnt), |
105 | struct dentry *mounts = dget(dentry); | 105 | .dentry = dget(dentry)}; |
106 | int err = 0; | 106 | int err = 0; |
107 | 107 | ||
108 | while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts)); | 108 | while (d_mountpoint(path.dentry) && follow_down(&path)) |
109 | ; | ||
109 | 110 | ||
110 | exp2 = rqst_exp_get_by_name(rqstp, mnt, mounts); | 111 | exp2 = rqst_exp_get_by_name(rqstp, &path); |
111 | if (IS_ERR(exp2)) { | 112 | if (IS_ERR(exp2)) { |
112 | if (PTR_ERR(exp2) != -ENOENT) | 113 | if (PTR_ERR(exp2) != -ENOENT) |
113 | err = PTR_ERR(exp2); | 114 | err = PTR_ERR(exp2); |
114 | dput(mounts); | 115 | path_put(&path); |
115 | mntput(mnt); | ||
116 | goto out; | 116 | goto out; |
117 | } | 117 | } |
118 | if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { | 118 | if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { |
119 | /* successfully crossed mount point */ | 119 | /* successfully crossed mount point */ |
120 | /* | 120 | /* |
121 | * This is subtle: dentry is *not* under mnt at this point. | 121 | * This is subtle: path.dentry is *not* on path.mnt |
122 | * The only reason we are safe is that original mnt is pinned | 122 | * at this point. The only reason we are safe is that |
123 | * down by exp, so we should dput before putting exp. | 123 | * original mnt is pinned down by exp, so we should |
124 | * put path *before* putting exp | ||
124 | */ | 125 | */ |
125 | dput(dentry); | 126 | *dpp = path.dentry; |
126 | *dpp = mounts; | 127 | path.dentry = dentry; |
127 | exp_put(exp); | ||
128 | *expp = exp2; | 128 | *expp = exp2; |
129 | } else { | 129 | exp2 = exp; |
130 | exp_put(exp2); | ||
131 | dput(mounts); | ||
132 | } | 130 | } |
133 | mntput(mnt); | 131 | path_put(&path); |
132 | exp_put(exp2); | ||
134 | out: | 133 | out: |
135 | return err; | 134 | return err; |
136 | } | 135 | } |
@@ -169,28 +168,29 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, | |||
169 | /* checking mountpoint crossing is very different when stepping up */ | 168 | /* checking mountpoint crossing is very different when stepping up */ |
170 | struct svc_export *exp2 = NULL; | 169 | struct svc_export *exp2 = NULL; |
171 | struct dentry *dp; | 170 | struct dentry *dp; |
172 | struct vfsmount *mnt = mntget(exp->ex_path.mnt); | 171 | struct path path = {.mnt = mntget(exp->ex_path.mnt), |
173 | dentry = dget(dparent); | 172 | .dentry = dget(dparent)}; |
174 | while(dentry == mnt->mnt_root && follow_up(&mnt, &dentry)) | 173 | |
174 | while (path.dentry == path.mnt->mnt_root && | ||
175 | follow_up(&path)) | ||
175 | ; | 176 | ; |
176 | dp = dget_parent(dentry); | 177 | dp = dget_parent(path.dentry); |
177 | dput(dentry); | 178 | dput(path.dentry); |
178 | dentry = dp; | 179 | path.dentry = dp; |
179 | 180 | ||
180 | exp2 = rqst_exp_parent(rqstp, mnt, dentry); | 181 | exp2 = rqst_exp_parent(rqstp, &path); |
181 | if (PTR_ERR(exp2) == -ENOENT) { | 182 | if (PTR_ERR(exp2) == -ENOENT) { |
182 | dput(dentry); | ||
183 | dentry = dget(dparent); | 183 | dentry = dget(dparent); |
184 | } else if (IS_ERR(exp2)) { | 184 | } else if (IS_ERR(exp2)) { |
185 | host_err = PTR_ERR(exp2); | 185 | host_err = PTR_ERR(exp2); |
186 | dput(dentry); | 186 | path_put(&path); |
187 | mntput(mnt); | ||
188 | goto out_nfserr; | 187 | goto out_nfserr; |
189 | } else { | 188 | } else { |
189 | dentry = dget(path.dentry); | ||
190 | exp_put(exp); | 190 | exp_put(exp); |
191 | exp = exp2; | 191 | exp = exp2; |
192 | } | 192 | } |
193 | mntput(mnt); | 193 | path_put(&path); |
194 | } | 194 | } |
195 | } else { | 195 | } else { |
196 | fh_lock(fhp); | 196 | fh_lock(fhp); |
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 300f1cdfa862..cadd36b14d07 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c | |||
@@ -864,11 +864,11 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) | |||
864 | case NILFS_CHECKPOINT: | 864 | case NILFS_CHECKPOINT: |
865 | /* | 865 | /* |
866 | * Check for protecting existing snapshot mounts: | 866 | * Check for protecting existing snapshot mounts: |
867 | * bd_mount_sem is used to make this operation atomic and | 867 | * ns_mount_mutex is used to make this operation atomic and |
868 | * exclusive with a new mount job. Though it doesn't cover | 868 | * exclusive with a new mount job. Though it doesn't cover |
869 | * umount, it's enough for the purpose. | 869 | * umount, it's enough for the purpose. |
870 | */ | 870 | */ |
871 | down(&nilfs->ns_bdev->bd_mount_sem); | 871 | mutex_lock(&nilfs->ns_mount_mutex); |
872 | if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) { | 872 | if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) { |
873 | /* Current implementation does not have to protect | 873 | /* Current implementation does not have to protect |
874 | plain read-only mounts since they are exclusive | 874 | plain read-only mounts since they are exclusive |
@@ -877,7 +877,7 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) | |||
877 | ret = -EBUSY; | 877 | ret = -EBUSY; |
878 | } else | 878 | } else |
879 | ret = nilfs_cpfile_clear_snapshot(cpfile, cno); | 879 | ret = nilfs_cpfile_clear_snapshot(cpfile, cno); |
880 | up(&nilfs->ns_bdev->bd_mount_sem); | 880 | mutex_unlock(&nilfs->ns_mount_mutex); |
881 | return ret; | 881 | return ret; |
882 | case NILFS_SNAPSHOT: | 882 | case NILFS_SNAPSHOT: |
883 | return nilfs_cpfile_set_snapshot(cpfile, cno); | 883 | return nilfs_cpfile_set_snapshot(cpfile, cno); |
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h index adccd4fc654e..0776ccc2504a 100644 --- a/fs/nilfs2/sb.h +++ b/fs/nilfs2/sb.h | |||
@@ -60,6 +60,7 @@ struct nilfs_sb_info { | |||
60 | struct super_block *s_super; /* reverse pointer to super_block */ | 60 | struct super_block *s_super; /* reverse pointer to super_block */ |
61 | struct the_nilfs *s_nilfs; | 61 | struct the_nilfs *s_nilfs; |
62 | struct list_head s_list; /* list head for nilfs->ns_supers */ | 62 | struct list_head s_list; /* list head for nilfs->ns_supers */ |
63 | atomic_t s_count; /* reference count */ | ||
63 | 64 | ||
64 | /* Segment constructor */ | 65 | /* Segment constructor */ |
65 | struct list_head s_dirty_files; /* dirty files list */ | 66 | struct list_head s_dirty_files; /* dirty files list */ |
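
The new s_count lets a nilfs_sb_info be referenced (via nilfs_find_sbinfo() in the_nilfs.c below) even while its super_block may be going away, which is why super.c now calls nilfs_put_sbinfo() where it used to kfree() the structure. The put helper itself is not in this excerpt; it presumably amounts to something like:

/* Hedged guess at the counterpart of the s_count field added above;
 * the real helper lives in the nilfs headers, outside this diff. */
static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
{
        if (atomic_dec_and_test(&sbi->s_count))
                kfree(sbi);
}
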
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 6989b03e97ab..1777a3467bd2 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c | |||
@@ -65,9 +65,8 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem " | |||
65 | "(NILFS)"); | 65 | "(NILFS)"); |
66 | MODULE_LICENSE("GPL"); | 66 | MODULE_LICENSE("GPL"); |
67 | 67 | ||
68 | static void nilfs_write_super(struct super_block *sb); | ||
68 | static int nilfs_remount(struct super_block *sb, int *flags, char *data); | 69 | static int nilfs_remount(struct super_block *sb, int *flags, char *data); |
69 | static int test_exclusive_mount(struct file_system_type *fs_type, | ||
70 | struct block_device *bdev, int flags); | ||
71 | 70 | ||
72 | /** | 71 | /** |
73 | * nilfs_error() - report failure condition on a filesystem | 72 | * nilfs_error() - report failure condition on a filesystem |
@@ -315,6 +314,11 @@ static void nilfs_put_super(struct super_block *sb) | |||
315 | struct nilfs_sb_info *sbi = NILFS_SB(sb); | 314 | struct nilfs_sb_info *sbi = NILFS_SB(sb); |
316 | struct the_nilfs *nilfs = sbi->s_nilfs; | 315 | struct the_nilfs *nilfs = sbi->s_nilfs; |
317 | 316 | ||
317 | lock_kernel(); | ||
318 | |||
319 | if (sb->s_dirt) | ||
320 | nilfs_write_super(sb); | ||
321 | |||
318 | nilfs_detach_segment_constructor(sbi); | 322 | nilfs_detach_segment_constructor(sbi); |
319 | 323 | ||
320 | if (!(sb->s_flags & MS_RDONLY)) { | 324 | if (!(sb->s_flags & MS_RDONLY)) { |
@@ -323,12 +327,18 @@ static void nilfs_put_super(struct super_block *sb) | |||
323 | nilfs_commit_super(sbi, 1); | 327 | nilfs_commit_super(sbi, 1); |
324 | up_write(&nilfs->ns_sem); | 328 | up_write(&nilfs->ns_sem); |
325 | } | 329 | } |
330 | down_write(&nilfs->ns_super_sem); | ||
331 | if (nilfs->ns_current == sbi) | ||
332 | nilfs->ns_current = NULL; | ||
333 | up_write(&nilfs->ns_super_sem); | ||
326 | 334 | ||
327 | nilfs_detach_checkpoint(sbi); | 335 | nilfs_detach_checkpoint(sbi); |
328 | put_nilfs(sbi->s_nilfs); | 336 | put_nilfs(sbi->s_nilfs); |
329 | sbi->s_super = NULL; | 337 | sbi->s_super = NULL; |
330 | sb->s_fs_info = NULL; | 338 | sb->s_fs_info = NULL; |
331 | kfree(sbi); | 339 | nilfs_put_sbinfo(sbi); |
340 | |||
341 | unlock_kernel(); | ||
332 | } | 342 | } |
333 | 343 | ||
334 | /** | 344 | /** |
@@ -383,6 +393,8 @@ static int nilfs_sync_fs(struct super_block *sb, int wait) | |||
383 | { | 393 | { |
384 | int err = 0; | 394 | int err = 0; |
385 | 395 | ||
396 | nilfs_write_super(sb); | ||
397 | |||
386 | /* This function is called when super block should be written back */ | 398 | /* This function is called when super block should be written back */ |
387 | if (wait) | 399 | if (wait) |
388 | err = nilfs_construct_segment(sb); | 400 | err = nilfs_construct_segment(sb); |
@@ -396,9 +408,9 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) | |||
396 | struct buffer_head *bh_cp; | 408 | struct buffer_head *bh_cp; |
397 | int err; | 409 | int err; |
398 | 410 | ||
399 | down_write(&nilfs->ns_sem); | 411 | down_write(&nilfs->ns_super_sem); |
400 | list_add(&sbi->s_list, &nilfs->ns_supers); | 412 | list_add(&sbi->s_list, &nilfs->ns_supers); |
401 | up_write(&nilfs->ns_sem); | 413 | up_write(&nilfs->ns_super_sem); |
402 | 414 | ||
403 | sbi->s_ifile = nilfs_mdt_new( | 415 | sbi->s_ifile = nilfs_mdt_new( |
404 | nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP); | 416 | nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP); |
@@ -436,9 +448,9 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) | |||
436 | nilfs_mdt_destroy(sbi->s_ifile); | 448 | nilfs_mdt_destroy(sbi->s_ifile); |
437 | sbi->s_ifile = NULL; | 449 | sbi->s_ifile = NULL; |
438 | 450 | ||
439 | down_write(&nilfs->ns_sem); | 451 | down_write(&nilfs->ns_super_sem); |
440 | list_del_init(&sbi->s_list); | 452 | list_del_init(&sbi->s_list); |
441 | up_write(&nilfs->ns_sem); | 453 | up_write(&nilfs->ns_super_sem); |
442 | 454 | ||
443 | return err; | 455 | return err; |
444 | } | 456 | } |
@@ -450,9 +462,9 @@ void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi) | |||
450 | nilfs_mdt_clear(sbi->s_ifile); | 462 | nilfs_mdt_clear(sbi->s_ifile); |
451 | nilfs_mdt_destroy(sbi->s_ifile); | 463 | nilfs_mdt_destroy(sbi->s_ifile); |
452 | sbi->s_ifile = NULL; | 464 | sbi->s_ifile = NULL; |
453 | down_write(&nilfs->ns_sem); | 465 | down_write(&nilfs->ns_super_sem); |
454 | list_del_init(&sbi->s_list); | 466 | list_del_init(&sbi->s_list); |
455 | up_write(&nilfs->ns_sem); | 467 | up_write(&nilfs->ns_super_sem); |
456 | } | 468 | } |
457 | 469 | ||
458 | static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi) | 470 | static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi) |
@@ -752,7 +764,7 @@ int nilfs_store_magic_and_option(struct super_block *sb, | |||
752 | * @silent: silent mode flag | 764 | * @silent: silent mode flag |
753 | * @nilfs: the_nilfs struct | 765 | * @nilfs: the_nilfs struct |
754 | * | 766 | * |
755 | * This function is called exclusively by bd_mount_mutex. | 767 | * This function is called exclusively by nilfs->ns_mount_mutex. |
756 | * So, the recovery process is protected from other simultaneous mounts. | 768 | * So, the recovery process is protected from other simultaneous mounts. |
757 | */ | 769 | */ |
758 | static int | 770 | static int |
@@ -773,6 +785,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, | |||
773 | get_nilfs(nilfs); | 785 | get_nilfs(nilfs); |
774 | sbi->s_nilfs = nilfs; | 786 | sbi->s_nilfs = nilfs; |
775 | sbi->s_super = sb; | 787 | sbi->s_super = sb; |
788 | atomic_set(&sbi->s_count, 1); | ||
776 | 789 | ||
777 | err = init_nilfs(nilfs, sbi, (char *)data); | 790 | err = init_nilfs(nilfs, sbi, (char *)data); |
778 | if (err) | 791 | if (err) |
@@ -870,6 +883,11 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, | |||
870 | goto failed_root; | 883 | goto failed_root; |
871 | } | 884 | } |
872 | 885 | ||
886 | down_write(&nilfs->ns_super_sem); | ||
887 | if (!nilfs_test_opt(sbi, SNAPSHOT)) | ||
888 | nilfs->ns_current = sbi; | ||
889 | up_write(&nilfs->ns_super_sem); | ||
890 | |||
873 | return 0; | 891 | return 0; |
874 | 892 | ||
875 | failed_root: | 893 | failed_root: |
@@ -885,7 +903,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, | |||
885 | failed_sbi: | 903 | failed_sbi: |
886 | put_nilfs(nilfs); | 904 | put_nilfs(nilfs); |
887 | sb->s_fs_info = NULL; | 905 | sb->s_fs_info = NULL; |
888 | kfree(sbi); | 906 | nilfs_put_sbinfo(sbi); |
889 | return err; | 907 | return err; |
890 | } | 908 | } |
891 | 909 | ||
@@ -898,6 +916,9 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) | |||
898 | struct nilfs_mount_options old_opts; | 916 | struct nilfs_mount_options old_opts; |
899 | int err; | 917 | int err; |
900 | 918 | ||
919 | lock_kernel(); | ||
920 | |||
921 | down_write(&nilfs->ns_super_sem); | ||
901 | old_sb_flags = sb->s_flags; | 922 | old_sb_flags = sb->s_flags; |
902 | old_opts.mount_opt = sbi->s_mount_opt; | 923 | old_opts.mount_opt = sbi->s_mount_opt; |
903 | old_opts.snapshot_cno = sbi->s_snapshot_cno; | 924 | old_opts.snapshot_cno = sbi->s_snapshot_cno; |
@@ -945,14 +966,12 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) | |||
945 | * store the current valid flag. (It may have been changed | 966 | * store the current valid flag. (It may have been changed |
946 | * by fsck since we originally mounted the partition.) | 967 | * by fsck since we originally mounted the partition.) |
947 | */ | 968 | */ |
948 | down(&sb->s_bdev->bd_mount_sem); | 969 | if (nilfs->ns_current && nilfs->ns_current != sbi) { |
949 | /* Check existing RW-mount */ | ||
950 | if (test_exclusive_mount(sb->s_type, sb->s_bdev, 0)) { | ||
951 | printk(KERN_WARNING "NILFS (device %s): couldn't " | 970 | printk(KERN_WARNING "NILFS (device %s): couldn't " |
952 | "remount because a RW-mount exists.\n", | 971 | "remount because an RW-mount exists.\n", |
953 | sb->s_id); | 972 | sb->s_id); |
954 | err = -EBUSY; | 973 | err = -EBUSY; |
955 | goto rw_remount_failed; | 974 | goto restore_opts; |
956 | } | 975 | } |
957 | if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) { | 976 | if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) { |
958 | printk(KERN_WARNING "NILFS (device %s): couldn't " | 977 | printk(KERN_WARNING "NILFS (device %s): couldn't " |
@@ -960,7 +979,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) | |||
960 | "the latest one.\n", | 979 | "the latest one.\n", |
961 | sb->s_id); | 980 | sb->s_id); |
962 | err = -EINVAL; | 981 | err = -EINVAL; |
963 | goto rw_remount_failed; | 982 | goto restore_opts; |
964 | } | 983 | } |
965 | sb->s_flags &= ~MS_RDONLY; | 984 | sb->s_flags &= ~MS_RDONLY; |
966 | nilfs_clear_opt(sbi, SNAPSHOT); | 985 | nilfs_clear_opt(sbi, SNAPSHOT); |
@@ -968,28 +987,31 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) | |||
968 | 987 | ||
969 | err = nilfs_attach_segment_constructor(sbi); | 988 | err = nilfs_attach_segment_constructor(sbi); |
970 | if (err) | 989 | if (err) |
971 | goto rw_remount_failed; | 990 | goto restore_opts; |
972 | 991 | ||
973 | down_write(&nilfs->ns_sem); | 992 | down_write(&nilfs->ns_sem); |
974 | nilfs_setup_super(sbi); | 993 | nilfs_setup_super(sbi); |
975 | up_write(&nilfs->ns_sem); | 994 | up_write(&nilfs->ns_sem); |
976 | 995 | ||
977 | up(&sb->s_bdev->bd_mount_sem); | 996 | nilfs->ns_current = sbi; |
978 | } | 997 | } |
979 | out: | 998 | out: |
999 | up_write(&nilfs->ns_super_sem); | ||
1000 | unlock_kernel(); | ||
980 | return 0; | 1001 | return 0; |
981 | 1002 | ||
982 | rw_remount_failed: | ||
983 | up(&sb->s_bdev->bd_mount_sem); | ||
984 | restore_opts: | 1003 | restore_opts: |
985 | sb->s_flags = old_sb_flags; | 1004 | sb->s_flags = old_sb_flags; |
986 | sbi->s_mount_opt = old_opts.mount_opt; | 1005 | sbi->s_mount_opt = old_opts.mount_opt; |
987 | sbi->s_snapshot_cno = old_opts.snapshot_cno; | 1006 | sbi->s_snapshot_cno = old_opts.snapshot_cno; |
1007 | up_write(&nilfs->ns_super_sem); | ||
1008 | unlock_kernel(); | ||
988 | return err; | 1009 | return err; |
989 | } | 1010 | } |
990 | 1011 | ||
991 | struct nilfs_super_data { | 1012 | struct nilfs_super_data { |
992 | struct block_device *bdev; | 1013 | struct block_device *bdev; |
1014 | struct nilfs_sb_info *sbi; | ||
993 | __u64 cno; | 1015 | __u64 cno; |
994 | int flags; | 1016 | int flags; |
995 | }; | 1017 | }; |
@@ -1048,33 +1070,7 @@ static int nilfs_test_bdev_super(struct super_block *s, void *data) | |||
1048 | { | 1070 | { |
1049 | struct nilfs_super_data *sd = data; | 1071 | struct nilfs_super_data *sd = data; |
1050 | 1072 | ||
1051 | return s->s_bdev == sd->bdev; | 1073 | return sd->sbi && s->s_fs_info == (void *)sd->sbi; |
1052 | } | ||
1053 | |||
1054 | static int nilfs_test_bdev_super2(struct super_block *s, void *data) | ||
1055 | { | ||
1056 | struct nilfs_super_data *sd = data; | ||
1057 | int ret; | ||
1058 | |||
1059 | if (s->s_bdev != sd->bdev) | ||
1060 | return 0; | ||
1061 | |||
1062 | if (!((s->s_flags | sd->flags) & MS_RDONLY)) | ||
1063 | return 1; /* Reuse an old R/W-mode super_block */ | ||
1064 | |||
1065 | if (s->s_flags & sd->flags & MS_RDONLY) { | ||
1066 | if (down_read_trylock(&s->s_umount)) { | ||
1067 | ret = s->s_root && | ||
1068 | (sd->cno == NILFS_SB(s)->s_snapshot_cno); | ||
1069 | up_read(&s->s_umount); | ||
1070 | /* | ||
1071 | * This path is locked with sb_lock by sget(). | ||
1072 | * So, drop_super() causes deadlock. | ||
1073 | */ | ||
1074 | return ret; | ||
1075 | } | ||
1076 | } | ||
1077 | return 0; | ||
1078 | } | 1074 | } |
1079 | 1075 | ||
1080 | static int | 1076 | static int |
@@ -1082,8 +1078,8 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, | |||
1082 | const char *dev_name, void *data, struct vfsmount *mnt) | 1078 | const char *dev_name, void *data, struct vfsmount *mnt) |
1083 | { | 1079 | { |
1084 | struct nilfs_super_data sd; | 1080 | struct nilfs_super_data sd; |
1085 | struct super_block *s, *s2; | 1081 | struct super_block *s; |
1086 | struct the_nilfs *nilfs = NULL; | 1082 | struct the_nilfs *nilfs; |
1087 | int err, need_to_close = 1; | 1083 | int err, need_to_close = 1; |
1088 | 1084 | ||
1089 | sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type); | 1085 | sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type); |
@@ -1095,7 +1091,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, | |||
1095 | * much more information than normal filesystems to identify mount | 1091 | * much more information than normal filesystems to identify mount |
1096 | * instance. For snapshot mounts, not only a mount type (ro-mount | 1092 | * instance. For snapshot mounts, not only a mount type (ro-mount |
1097 | * or rw-mount) but also a checkpoint number is required. | 1093 | * or rw-mount) but also a checkpoint number is required. |
1098 | * The results are passed in sget() using nilfs_super_data. | ||
1099 | */ | 1094 | */ |
1100 | sd.cno = 0; | 1095 | sd.cno = 0; |
1101 | sd.flags = flags; | 1096 | sd.flags = flags; |
@@ -1104,64 +1099,59 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, | |||
1104 | goto failed; | 1099 | goto failed; |
1105 | } | 1100 | } |
1106 | 1101 | ||
1107 | /* | 1102 | nilfs = find_or_create_nilfs(sd.bdev); |
1108 | * once the super is inserted into the list by sget, s_umount | 1103 | if (!nilfs) { |
1109 | * will protect the lockfs code from trying to start a snapshot | 1104 | err = -ENOMEM; |
1110 | * while we are mounting | 1105 | goto failed; |
1111 | */ | ||
1112 | down(&sd.bdev->bd_mount_sem); | ||
1113 | if (!sd.cno && | ||
1114 | (err = test_exclusive_mount(fs_type, sd.bdev, flags ^ MS_RDONLY))) { | ||
1115 | err = (err < 0) ? : -EBUSY; | ||
1116 | goto failed_unlock; | ||
1117 | } | 1106 | } |
1118 | 1107 | ||
1119 | /* | 1108 | mutex_lock(&nilfs->ns_mount_mutex); |
1120 | * Phase-1: search any existent instance and get the_nilfs | ||
1121 | */ | ||
1122 | s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd); | ||
1123 | if (IS_ERR(s)) | ||
1124 | goto error_s; | ||
1125 | |||
1126 | if (!s->s_root) { | ||
1127 | err = -ENOMEM; | ||
1128 | nilfs = alloc_nilfs(sd.bdev); | ||
1129 | if (!nilfs) | ||
1130 | goto cancel_new; | ||
1131 | } else { | ||
1132 | struct nilfs_sb_info *sbi = NILFS_SB(s); | ||
1133 | 1109 | ||
1110 | if (!sd.cno) { | ||
1134 | /* | 1111 | /* |
1135 | * s_umount protects super_block from unmount process; | 1112 | * Check if an exclusive mount exists or not. |
1136 | * It covers pointers of nilfs_sb_info and the_nilfs. | 1113 | * Snapshot mounts coexist with a current mount |
1114 | * (i.e. rw-mount or ro-mount), whereas rw-mount and | ||
1115 | * ro-mount are mutually exclusive. | ||
1137 | */ | 1116 | */ |
1138 | nilfs = sbi->s_nilfs; | 1117 | down_read(&nilfs->ns_super_sem); |
1139 | get_nilfs(nilfs); | 1118 | if (nilfs->ns_current && |
1140 | up_write(&s->s_umount); | 1119 | ((nilfs->ns_current->s_super->s_flags ^ flags) |
1120 | & MS_RDONLY)) { | ||
1121 | up_read(&nilfs->ns_super_sem); | ||
1122 | err = -EBUSY; | ||
1123 | goto failed_unlock; | ||
1124 | } | ||
1125 | up_read(&nilfs->ns_super_sem); | ||
1126 | } | ||
1141 | 1127 | ||
1142 | /* | 1128 | /* |
1143 | * Phase-2: search specified snapshot or R/W mode super_block | 1129 | * Find existing nilfs_sb_info struct |
1144 | */ | 1130 | */ |
1145 | if (!sd.cno) | 1131 | sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno); |
1146 | /* trying to get the latest checkpoint. */ | ||
1147 | sd.cno = nilfs_last_cno(nilfs); | ||
1148 | 1132 | ||
1149 | s2 = sget(fs_type, nilfs_test_bdev_super2, | 1133 | if (!sd.cno) |
1150 | nilfs_set_bdev_super, &sd); | 1134 | /* trying to get the latest checkpoint. */ |
1151 | deactivate_super(s); | 1135 | sd.cno = nilfs_last_cno(nilfs); |
1152 | /* | 1136 | |
1153 | * Although deactivate_super() invokes close_bdev_exclusive() at | 1137 | /* |
1154 | * kill_block_super(). Here, s is an existent mount; we need | 1138 | * Get super block instance holding the nilfs_sb_info struct. |
1155 | * one more close_bdev_exclusive() call. | 1139 | * A new instance is allocated if no existing mount is present or |
1156 | */ | 1140 | * existing instance has been unmounted. |
1157 | s = s2; | 1141 | */ |
1158 | if (IS_ERR(s)) | 1142 | s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd); |
1159 | goto error_s; | 1143 | if (sd.sbi) |
1144 | nilfs_put_sbinfo(sd.sbi); | ||
1145 | |||
1146 | if (IS_ERR(s)) { | ||
1147 | err = PTR_ERR(s); | ||
1148 | goto failed_unlock; | ||
1160 | } | 1149 | } |
1161 | 1150 | ||
1162 | if (!s->s_root) { | 1151 | if (!s->s_root) { |
1163 | char b[BDEVNAME_SIZE]; | 1152 | char b[BDEVNAME_SIZE]; |
1164 | 1153 | ||
1154 | /* New superblock instance created */ | ||
1165 | s->s_flags = flags; | 1155 | s->s_flags = flags; |
1166 | strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); | 1156 | strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); |
1167 | sb_set_blocksize(s, block_size(sd.bdev)); | 1157 | sb_set_blocksize(s, block_size(sd.bdev)); |
@@ -1172,26 +1162,18 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, | |||
1172 | 1162 | ||
1173 | s->s_flags |= MS_ACTIVE; | 1163 | s->s_flags |= MS_ACTIVE; |
1174 | need_to_close = 0; | 1164 | need_to_close = 0; |
1175 | } else if (!(s->s_flags & MS_RDONLY)) { | ||
1176 | err = -EBUSY; | ||
1177 | } | 1165 | } |
1178 | 1166 | ||
1179 | up(&sd.bdev->bd_mount_sem); | 1167 | mutex_unlock(&nilfs->ns_mount_mutex); |
1180 | put_nilfs(nilfs); | 1168 | put_nilfs(nilfs); |
1181 | if (need_to_close) | 1169 | if (need_to_close) |
1182 | close_bdev_exclusive(sd.bdev, flags); | 1170 | close_bdev_exclusive(sd.bdev, flags); |
1183 | simple_set_mnt(mnt, s); | 1171 | simple_set_mnt(mnt, s); |
1184 | return 0; | 1172 | return 0; |
1185 | 1173 | ||
1186 | error_s: | ||
1187 | up(&sd.bdev->bd_mount_sem); | ||
1188 | if (nilfs) | ||
1189 | put_nilfs(nilfs); | ||
1190 | close_bdev_exclusive(sd.bdev, flags); | ||
1191 | return PTR_ERR(s); | ||
1192 | |||
1193 | failed_unlock: | 1174 | failed_unlock: |
1194 | up(&sd.bdev->bd_mount_sem); | 1175 | mutex_unlock(&nilfs->ns_mount_mutex); |
1176 | put_nilfs(nilfs); | ||
1195 | failed: | 1177 | failed: |
1196 | close_bdev_exclusive(sd.bdev, flags); | 1178 | close_bdev_exclusive(sd.bdev, flags); |
1197 | 1179 | ||
@@ -1199,70 +1181,18 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, | |||
1199 | 1181 | ||
1200 | cancel_new: | 1182 | cancel_new: |
1201 | /* Abandoning the newly allocated superblock */ | 1183 | /* Abandoning the newly allocated superblock */ |
1202 | up(&sd.bdev->bd_mount_sem); | 1184 | mutex_unlock(&nilfs->ns_mount_mutex); |
1203 | if (nilfs) | 1185 | put_nilfs(nilfs); |
1204 | put_nilfs(nilfs); | ||
1205 | up_write(&s->s_umount); | 1186 | up_write(&s->s_umount); |
1206 | deactivate_super(s); | 1187 | deactivate_super(s); |
1207 | /* | 1188 | /* |
1208 | * deactivate_super() invokes close_bdev_exclusive(). | 1189 | * deactivate_super() invokes close_bdev_exclusive(). |
1209 | * We must finish all post-cleaning before this call; | 1190 | * We must finish all post-cleaning before this call; |
1210 | * put_nilfs() and unlocking bd_mount_sem need the block device. | 1191 | * put_nilfs() needs the block device. |
1211 | */ | 1192 | */ |
1212 | return err; | 1193 | return err; |
1213 | } | 1194 | } |
1214 | 1195 | ||
1215 | static int nilfs_test_bdev_super3(struct super_block *s, void *data) | ||
1216 | { | ||
1217 | struct nilfs_super_data *sd = data; | ||
1218 | int ret; | ||
1219 | |||
1220 | if (s->s_bdev != sd->bdev) | ||
1221 | return 0; | ||
1222 | if (down_read_trylock(&s->s_umount)) { | ||
1223 | ret = (s->s_flags & MS_RDONLY) && s->s_root && | ||
1224 | nilfs_test_opt(NILFS_SB(s), SNAPSHOT); | ||
1225 | up_read(&s->s_umount); | ||
1226 | if (ret) | ||
1227 | return 0; /* ignore snapshot mounts */ | ||
1228 | } | ||
1229 | return !((sd->flags ^ s->s_flags) & MS_RDONLY); | ||
1230 | } | ||
1231 | |||
1232 | static int __false_bdev_super(struct super_block *s, void *data) | ||
1233 | { | ||
1234 | #if 0 /* XXX: workaround for lock debug. This is not good idea */ | ||
1235 | up_write(&s->s_umount); | ||
1236 | #endif | ||
1237 | return -EFAULT; | ||
1238 | } | ||
1239 | |||
1240 | /** | ||
1241 | * test_exclusive_mount - check whether an exclusive RW/RO mount exists or not. | ||
1242 | * fs_type: filesystem type | ||
1243 | * bdev: block device | ||
1244 | * flag: 0 (check rw-mount) or MS_RDONLY (check ro-mount) | ||
1245 | * res: pointer to an integer to store result | ||
1246 | * | ||
1247 | * This function must be called within a section protected by bd_mount_mutex. | ||
1248 | */ | ||
1249 | static int test_exclusive_mount(struct file_system_type *fs_type, | ||
1250 | struct block_device *bdev, int flags) | ||
1251 | { | ||
1252 | struct super_block *s; | ||
1253 | struct nilfs_super_data sd = { .flags = flags, .bdev = bdev }; | ||
1254 | |||
1255 | s = sget(fs_type, nilfs_test_bdev_super3, __false_bdev_super, &sd); | ||
1256 | if (IS_ERR(s)) { | ||
1257 | if (PTR_ERR(s) != -EFAULT) | ||
1258 | return PTR_ERR(s); | ||
1259 | return 0; /* Not found */ | ||
1260 | } | ||
1261 | up_write(&s->s_umount); | ||
1262 | deactivate_super(s); | ||
1263 | return 1; /* Found */ | ||
1264 | } | ||
1265 | |||
1266 | struct file_system_type nilfs_fs_type = { | 1196 | struct file_system_type nilfs_fs_type = { |
1267 | .owner = THIS_MODULE, | 1197 | .owner = THIS_MODULE, |
1268 | .name = "nilfs2", | 1198 | .name = "nilfs2", |
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index a91f15b8673c..e4e5c78bcc93 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c | |||
@@ -35,6 +35,10 @@ | |||
35 | #include "seglist.h" | 35 | #include "seglist.h" |
36 | #include "segbuf.h" | 36 | #include "segbuf.h" |
37 | 37 | ||
38 | |||
39 | static LIST_HEAD(nilfs_objects); | ||
40 | static DEFINE_SPINLOCK(nilfs_lock); | ||
41 | |||
38 | void nilfs_set_last_segment(struct the_nilfs *nilfs, | 42 | void nilfs_set_last_segment(struct the_nilfs *nilfs, |
39 | sector_t start_blocknr, u64 seq, __u64 cno) | 43 | sector_t start_blocknr, u64 seq, __u64 cno) |
40 | { | 44 | { |
@@ -55,7 +59,7 @@ void nilfs_set_last_segment(struct the_nilfs *nilfs, | |||
55 | * Return Value: On success, pointer to the_nilfs is returned. | 59 | * Return Value: On success, pointer to the_nilfs is returned. |
56 | * On error, NULL is returned. | 60 | * On error, NULL is returned. |
57 | */ | 61 | */ |
58 | struct the_nilfs *alloc_nilfs(struct block_device *bdev) | 62 | static struct the_nilfs *alloc_nilfs(struct block_device *bdev) |
59 | { | 63 | { |
60 | struct the_nilfs *nilfs; | 64 | struct the_nilfs *nilfs; |
61 | 65 | ||
@@ -68,7 +72,10 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev) | |||
68 | atomic_set(&nilfs->ns_writer_refcount, -1); | 72 | atomic_set(&nilfs->ns_writer_refcount, -1); |
69 | atomic_set(&nilfs->ns_ndirtyblks, 0); | 73 | atomic_set(&nilfs->ns_ndirtyblks, 0); |
70 | init_rwsem(&nilfs->ns_sem); | 74 | init_rwsem(&nilfs->ns_sem); |
75 | init_rwsem(&nilfs->ns_super_sem); | ||
76 | mutex_init(&nilfs->ns_mount_mutex); | ||
71 | mutex_init(&nilfs->ns_writer_mutex); | 77 | mutex_init(&nilfs->ns_writer_mutex); |
78 | INIT_LIST_HEAD(&nilfs->ns_list); | ||
72 | INIT_LIST_HEAD(&nilfs->ns_supers); | 79 | INIT_LIST_HEAD(&nilfs->ns_supers); |
73 | spin_lock_init(&nilfs->ns_last_segment_lock); | 80 | spin_lock_init(&nilfs->ns_last_segment_lock); |
74 | nilfs->ns_gc_inodes_h = NULL; | 81 | nilfs->ns_gc_inodes_h = NULL; |
@@ -78,6 +85,45 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev) | |||
78 | } | 85 | } |
79 | 86 | ||
80 | /** | 87 | /** |
88 | * find_or_create_nilfs - find or create nilfs object | ||
89 | * @bdev: block device to which the_nilfs is related | ||
90 | * | ||
91 | * find_or_create_nilfs() looks up an existing nilfs object created on the | ||
92 | * device and takes a reference to the object. If no nilfs object | ||
93 | * is found on the device, a new nilfs object is allocated. | ||
94 | * | ||
95 | * Return Value: On success, pointer to the nilfs object is returned. | ||
96 | * On error, NULL is returned. | ||
97 | */ | ||
98 | struct the_nilfs *find_or_create_nilfs(struct block_device *bdev) | ||
99 | { | ||
100 | struct the_nilfs *nilfs, *new = NULL; | ||
101 | |||
102 | retry: | ||
103 | spin_lock(&nilfs_lock); | ||
104 | list_for_each_entry(nilfs, &nilfs_objects, ns_list) { | ||
105 | if (nilfs->ns_bdev == bdev) { | ||
106 | get_nilfs(nilfs); | ||
107 | spin_unlock(&nilfs_lock); | ||
108 | if (new) | ||
109 | put_nilfs(new); | ||
110 | return nilfs; /* existing object */ | ||
111 | } | ||
112 | } | ||
113 | if (new) { | ||
114 | list_add_tail(&new->ns_list, &nilfs_objects); | ||
115 | spin_unlock(&nilfs_lock); | ||
116 | return new; /* new object */ | ||
117 | } | ||
118 | spin_unlock(&nilfs_lock); | ||
119 | |||
120 | new = alloc_nilfs(bdev); | ||
121 | if (new) | ||
122 | goto retry; | ||
123 | return NULL; /* insufficient memory */ | ||
124 | } | ||
125 | |||
126 | /** | ||
81 | * put_nilfs - release a reference to the_nilfs | 127 | * put_nilfs - release a reference to the_nilfs |
82 | * @nilfs: the_nilfs structure to be released | 128 | * @nilfs: the_nilfs structure to be released |
83 | * | 129 | * |
@@ -86,13 +132,20 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev) | |||
86 | */ | 132 | */ |
87 | void put_nilfs(struct the_nilfs *nilfs) | 133 | void put_nilfs(struct the_nilfs *nilfs) |
88 | { | 134 | { |
89 | if (!atomic_dec_and_test(&nilfs->ns_count)) | 135 | spin_lock(&nilfs_lock); |
136 | if (!atomic_dec_and_test(&nilfs->ns_count)) { | ||
137 | spin_unlock(&nilfs_lock); | ||
90 | return; | 138 | return; |
139 | } | ||
140 | list_del_init(&nilfs->ns_list); | ||
141 | spin_unlock(&nilfs_lock); | ||
142 | |||
91 | /* | 143 | /* |
92 | * Increment of ns_count never occur below because the caller | 144 | * Increment of ns_count never occurs below because the caller |
93 | * of get_nilfs() holds at least one reference to the_nilfs. | 145 | * of get_nilfs() holds at least one reference to the_nilfs. |
94 | * Thus its exclusion control is not required here. | 146 | * Thus its exclusion control is not required here. |
95 | */ | 147 | */ |
148 | |||
96 | might_sleep(); | 149 | might_sleep(); |
97 | if (nilfs_loaded(nilfs)) { | 150 | if (nilfs_loaded(nilfs)) { |
98 | nilfs_mdt_clear(nilfs->ns_sufile); | 151 | nilfs_mdt_clear(nilfs->ns_sufile); |
@@ -613,13 +666,63 @@ int nilfs_near_disk_full(struct the_nilfs *nilfs) | |||
613 | return ret; | 666 | return ret; |
614 | } | 667 | } |
615 | 668 | ||
669 | /** | ||
670 | * nilfs_find_sbinfo - find existing nilfs_sb_info structure | ||
671 | * @nilfs: nilfs object | ||
672 | * @rw_mount: mount type (non-zero value for read/write mount) | ||
673 | * @cno: checkpoint number (zero for read-only mount) | ||
674 | * | ||
675 | * nilfs_find_sbinfo() returns the nilfs_sb_info structure which | ||
676 | * @rw_mount and @cno (in case of snapshots) matched. If no instance | ||
677 | * was found, NULL is returned. Although the super block instance can | ||
678 | * be unmounted after this function returns, the nilfs_sb_info struct | ||
679 | * is kept on memory until nilfs_put_sbinfo() is called. | ||
680 | */ | ||
681 | struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *nilfs, | ||
682 | int rw_mount, __u64 cno) | ||
683 | { | ||
684 | struct nilfs_sb_info *sbi; | ||
685 | |||
686 | down_read(&nilfs->ns_super_sem); | ||
687 | /* | ||
688 | * The SNAPSHOT flag and sb->s_flags are supposed to be | ||
689 | * protected with nilfs->ns_super_sem. | ||
690 | */ | ||
691 | sbi = nilfs->ns_current; | ||
692 | if (rw_mount) { | ||
693 | if (sbi && !(sbi->s_super->s_flags & MS_RDONLY)) | ||
694 | goto found; /* read/write mount */ | ||
695 | else | ||
696 | goto out; | ||
697 | } else if (cno == 0) { | ||
698 | if (sbi && (sbi->s_super->s_flags & MS_RDONLY)) | ||
699 | goto found; /* read-only mount */ | ||
700 | else | ||
701 | goto out; | ||
702 | } | ||
703 | |||
704 | list_for_each_entry(sbi, &nilfs->ns_supers, s_list) { | ||
705 | if (nilfs_test_opt(sbi, SNAPSHOT) && | ||
706 | sbi->s_snapshot_cno == cno) | ||
707 | goto found; /* snapshot mount */ | ||
708 | } | ||
709 | out: | ||
710 | up_read(&nilfs->ns_super_sem); | ||
711 | return NULL; | ||
712 | |||
713 | found: | ||
714 | atomic_inc(&sbi->s_count); | ||
715 | up_read(&nilfs->ns_super_sem); | ||
716 | return sbi; | ||
717 | } | ||
718 | |||
616 | int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno, | 719 | int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno, |
617 | int snapshot_mount) | 720 | int snapshot_mount) |
618 | { | 721 | { |
619 | struct nilfs_sb_info *sbi; | 722 | struct nilfs_sb_info *sbi; |
620 | int ret = 0; | 723 | int ret = 0; |
621 | 724 | ||
622 | down_read(&nilfs->ns_sem); | 725 | down_read(&nilfs->ns_super_sem); |
623 | if (cno == 0 || cno > nilfs->ns_cno) | 726 | if (cno == 0 || cno > nilfs->ns_cno) |
624 | goto out_unlock; | 727 | goto out_unlock; |
625 | 728 | ||
@@ -636,6 +739,6 @@ int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno, | |||
636 | ret++; | 739 | ret++; |
637 | 740 | ||
638 | out_unlock: | 741 | out_unlock: |
639 | up_read(&nilfs->ns_sem); | 742 | up_read(&nilfs->ns_super_sem); |
640 | return ret; | 743 | return ret; |
641 | } | 744 | } |
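For orientation, a minimal sketch of how a mount-time caller might pair the new find_or_create_nilfs() with put_nilfs(). Everything except find_or_create_nilfs() and put_nilfs() themselves (the helper name, the sbi->s_nilfs back pointer, the error handling) is an assumption for illustration, not part of this patch:

	/* hypothetical caller, e.g. early in a fill_super/get_sb path */
	static int example_attach_nilfs(struct nilfs_sb_info *sbi,
					struct block_device *bdev)
	{
		struct the_nilfs *nilfs;

		nilfs = find_or_create_nilfs(bdev);	/* returns with a reference held */
		if (!nilfs)
			return -ENOMEM;

		sbi->s_nilfs = nilfs;	/* assumed back pointer; kept for the mount's lifetime */
		return 0;
	}

	/* ...and on unmount, or on a later error path, the reference is dropped: */
	/*	put_nilfs(sbi->s_nilfs);	*/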
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 30fe58778d05..e8adbffc626f 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h | |||
@@ -43,12 +43,16 @@ enum { | |||
43 | * struct the_nilfs - struct to supervise multiple nilfs mount points | 43 | * struct the_nilfs - struct to supervise multiple nilfs mount points |
44 | * @ns_flags: flags | 44 | * @ns_flags: flags |
45 | * @ns_count: reference count | 45 | * @ns_count: reference count |
46 | * @ns_list: list head for nilfs_list | ||
46 | * @ns_bdev: block device | 47 | * @ns_bdev: block device |
47 | * @ns_bdi: backing dev info | 48 | * @ns_bdi: backing dev info |
48 | * @ns_writer: back pointer to writable nilfs_sb_info | 49 | * @ns_writer: back pointer to writable nilfs_sb_info |
49 | * @ns_sem: semaphore for shared states | 50 | * @ns_sem: semaphore for shared states |
51 | * @ns_super_sem: semaphore for global operations across super block instances | ||
52 | * @ns_mount_mutex: mutex protecting mount process of nilfs | ||
50 | * @ns_writer_mutex: mutex protecting ns_writer attach/detach | 53 | * @ns_writer_mutex: mutex protecting ns_writer attach/detach |
51 | * @ns_writer_refcount: number of referrers on ns_writer | 54 | * @ns_writer_refcount: number of referrers on ns_writer |
55 | * @ns_current: back pointer to current mount | ||
52 | * @ns_sbh: buffer heads of on-disk super blocks | 56 | * @ns_sbh: buffer heads of on-disk super blocks |
53 | * @ns_sbp: pointers to super block data | 57 | * @ns_sbp: pointers to super block data |
54 | * @ns_sbwtime: previous write time of super blocks | 58 | * @ns_sbwtime: previous write time of super blocks |
@@ -88,15 +92,24 @@ enum { | |||
88 | struct the_nilfs { | 92 | struct the_nilfs { |
89 | unsigned long ns_flags; | 93 | unsigned long ns_flags; |
90 | atomic_t ns_count; | 94 | atomic_t ns_count; |
95 | struct list_head ns_list; | ||
91 | 96 | ||
92 | struct block_device *ns_bdev; | 97 | struct block_device *ns_bdev; |
93 | struct backing_dev_info *ns_bdi; | 98 | struct backing_dev_info *ns_bdi; |
94 | struct nilfs_sb_info *ns_writer; | 99 | struct nilfs_sb_info *ns_writer; |
95 | struct rw_semaphore ns_sem; | 100 | struct rw_semaphore ns_sem; |
101 | struct rw_semaphore ns_super_sem; | ||
102 | struct mutex ns_mount_mutex; | ||
96 | struct mutex ns_writer_mutex; | 103 | struct mutex ns_writer_mutex; |
97 | atomic_t ns_writer_refcount; | 104 | atomic_t ns_writer_refcount; |
98 | 105 | ||
99 | /* | 106 | /* |
107 | * components protected by ns_super_sem | ||
108 | */ | ||
109 | struct nilfs_sb_info *ns_current; | ||
110 | struct list_head ns_supers; | ||
111 | |||
112 | /* | ||
100 | * used for | 113 | * used for |
101 | * - loading the latest checkpoint exclusively. | 114 | * - loading the latest checkpoint exclusively. |
102 | * - allocating a new full segment. | 115 | * - allocating a new full segment. |
@@ -108,7 +121,6 @@ struct the_nilfs { | |||
108 | time_t ns_sbwtime[2]; | 121 | time_t ns_sbwtime[2]; |
109 | unsigned ns_sbsize; | 122 | unsigned ns_sbsize; |
110 | unsigned ns_mount_state; | 123 | unsigned ns_mount_state; |
111 | struct list_head ns_supers; | ||
112 | 124 | ||
113 | /* | 125 | /* |
114 | * Following fields are dedicated to a writable FS-instance. | 126 | * Following fields are dedicated to a writable FS-instance. |
@@ -191,11 +203,12 @@ THE_NILFS_FNS(DISCONTINUED, discontinued) | |||
191 | #define NILFS_ALTSB_FREQ 60 /* spare superblock */ | 203 | #define NILFS_ALTSB_FREQ 60 /* spare superblock */ |
192 | 204 | ||
193 | void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); | 205 | void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); |
194 | struct the_nilfs *alloc_nilfs(struct block_device *); | 206 | struct the_nilfs *find_or_create_nilfs(struct block_device *); |
195 | void put_nilfs(struct the_nilfs *); | 207 | void put_nilfs(struct the_nilfs *); |
196 | int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); | 208 | int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); |
197 | int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); | 209 | int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); |
198 | int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); | 210 | int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); |
211 | struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64); | ||
199 | int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int); | 212 | int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int); |
200 | int nilfs_near_disk_full(struct the_nilfs *); | 213 | int nilfs_near_disk_full(struct the_nilfs *); |
201 | void nilfs_fall_back_super_block(struct the_nilfs *); | 214 | void nilfs_fall_back_super_block(struct the_nilfs *); |
@@ -238,6 +251,12 @@ nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) | |||
238 | mutex_unlock(&nilfs->ns_writer_mutex); | 251 | mutex_unlock(&nilfs->ns_writer_mutex); |
239 | } | 252 | } |
240 | 253 | ||
254 | static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi) | ||
255 | { | ||
256 | if (atomic_dec_and_test(&sbi->s_count)) | ||
257 | kfree(sbi); | ||
258 | } | ||
259 | |||
241 | static inline void | 260 | static inline void |
242 | nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum, | 261 | nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum, |
243 | sector_t *seg_start, sector_t *seg_end) | 262 | sector_t *seg_start, sector_t *seg_end) |
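A hedged usage note on the new sbinfo lookup: nilfs_find_sbinfo() returns with sbi->s_count already incremented, so every successful lookup must be balanced by nilfs_put_sbinfo(). A minimal sketch, with a made-up helper name:

	/* hypothetical helper: is checkpoint @cno currently mounted as a snapshot? */
	static int example_cno_is_mounted(struct the_nilfs *nilfs, __u64 cno)
	{
		struct nilfs_sb_info *sbi;

		sbi = nilfs_find_sbinfo(nilfs, 0 /* not a read/write mount */, cno);
		if (!sbi)
			return 0;		/* no matching mount */

		/* ... sbi->s_super may be inspected here ... */

		nilfs_put_sbinfo(sbi);		/* balances the s_count taken by the lookup */
		return 1;
	}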
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig index 50914d7303c6..31dac7e3b0f1 100644 --- a/fs/notify/Kconfig +++ b/fs/notify/Kconfig | |||
@@ -1,2 +1,15 @@ | |||
1 | config FSNOTIFY | ||
2 | bool "Filesystem notification backend" | ||
3 | default y | ||
4 | ---help--- | ||
5 | fsnotify is a backend for filesystem notification. fsnotify does | ||
6 | not provide any userspace interface but does provide the basis | ||
7 | needed for other notification schemes such as dnotify, inotify, | ||
8 | and fanotify. | ||
9 | |||
10 | Say Y here to enable fsnotify support. | ||
11 | |||
12 | If unsure, say Y. | ||
13 | |||
1 | source "fs/notify/dnotify/Kconfig" | 14 | source "fs/notify/dnotify/Kconfig" |
2 | source "fs/notify/inotify/Kconfig" | 15 | source "fs/notify/inotify/Kconfig" |
diff --git a/fs/notify/Makefile b/fs/notify/Makefile index 5a95b6010ce7..0922cc826c46 100644 --- a/fs/notify/Makefile +++ b/fs/notify/Makefile | |||
@@ -1,2 +1,4 @@ | |||
1 | obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o | ||
2 | |||
1 | obj-y += dnotify/ | 3 | obj-y += dnotify/ |
2 | obj-y += inotify/ | 4 | obj-y += inotify/ |
diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig index 26adf5dfa646..904ff8d5405a 100644 --- a/fs/notify/dnotify/Kconfig +++ b/fs/notify/dnotify/Kconfig | |||
@@ -1,5 +1,6 @@ | |||
1 | config DNOTIFY | 1 | config DNOTIFY |
2 | bool "Dnotify support" | 2 | bool "Dnotify support" |
3 | depends on FSNOTIFY | ||
3 | default y | 4 | default y |
4 | help | 5 | help |
5 | Dnotify is a directory-based per-fd file change notification system | 6 | Dnotify is a directory-based per-fd file change notification system |
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index b0aa2cde80bd..828a889be909 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c | |||
@@ -3,6 +3,9 @@ | |||
3 | * | 3 | * |
4 | * Copyright (C) 2000,2001,2002 Stephen Rothwell | 4 | * Copyright (C) 2000,2001,2002 Stephen Rothwell |
5 | * | 5 | * |
6 | * Copyright (C) 2009 Eric Paris <Red Hat Inc> | ||
7 | * dnotify was largely rewritten to use the new fsnotify infrastructure | ||
8 | * | ||
6 | * This program is free software; you can redistribute it and/or modify it | 9 | * This program is free software; you can redistribute it and/or modify it |
7 | * under the terms of the GNU General Public License as published by the | 10 | * under the terms of the GNU General Public License as published by the |
8 | * Free Software Foundation; either version 2, or (at your option) any | 11 | * Free Software Foundation; either version 2, or (at your option) any |
@@ -21,24 +24,173 @@ | |||
21 | #include <linux/spinlock.h> | 24 | #include <linux/spinlock.h> |
22 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
23 | #include <linux/fdtable.h> | 26 | #include <linux/fdtable.h> |
27 | #include <linux/fsnotify_backend.h> | ||
24 | 28 | ||
25 | int dir_notify_enable __read_mostly = 1; | 29 | int dir_notify_enable __read_mostly = 1; |
26 | 30 | ||
27 | static struct kmem_cache *dn_cache __read_mostly; | 31 | static struct kmem_cache *dnotify_struct_cache __read_mostly; |
32 | static struct kmem_cache *dnotify_mark_entry_cache __read_mostly; | ||
33 | static struct fsnotify_group *dnotify_group __read_mostly; | ||
34 | static DEFINE_MUTEX(dnotify_mark_mutex); | ||
35 | |||
36 | /* | ||
37 | * dnotify will attach one of these to each inode (i_fsnotify_mark_entries) which | ||
38 | * is being watched by dnotify. If multiple userspace applications are watching | ||
39 | * the same directory with dnotify their information is chained in dn | ||
40 | */ | ||
41 | struct dnotify_mark_entry { | ||
42 | struct fsnotify_mark_entry fsn_entry; | ||
43 | struct dnotify_struct *dn; | ||
44 | }; | ||
28 | 45 | ||
29 | static void redo_inode_mask(struct inode *inode) | 46 | /* |
47 | * When a process starts or stops watching an inode the set of events which | ||
48 | * dnotify cares about for that inode may change. This function runs the | ||
49 | * list of everything receiving dnotify events about this directory and calculates | ||
50 | * the set of all those events. After it updates what dnotify is interested in, | ||
51 | * it calls the fsnotify function so it can update the set of all events relevant | ||
52 | * to this inode. | ||
53 | */ | ||
54 | static void dnotify_recalc_inode_mask(struct fsnotify_mark_entry *entry) | ||
30 | { | 55 | { |
31 | unsigned long new_mask; | 56 | __u32 new_mask, old_mask; |
32 | struct dnotify_struct *dn; | 57 | struct dnotify_struct *dn; |
58 | struct dnotify_mark_entry *dnentry = container_of(entry, | ||
59 | struct dnotify_mark_entry, | ||
60 | fsn_entry); | ||
61 | |||
62 | assert_spin_locked(&entry->lock); | ||
33 | 63 | ||
64 | old_mask = entry->mask; | ||
34 | new_mask = 0; | 65 | new_mask = 0; |
35 | for (dn = inode->i_dnotify; dn != NULL; dn = dn->dn_next) | 66 | for (dn = dnentry->dn; dn != NULL; dn = dn->dn_next) |
36 | new_mask |= dn->dn_mask & ~DN_MULTISHOT; | 67 | new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT); |
37 | inode->i_dnotify_mask = new_mask; | 68 | entry->mask = new_mask; |
69 | |||
70 | if (old_mask == new_mask) | ||
71 | return; | ||
72 | |||
73 | if (entry->inode) | ||
74 | fsnotify_recalc_inode_mask(entry->inode); | ||
75 | } | ||
76 | |||
77 | /* | ||
78 | * Main fsnotify call where events are delivered to dnotify. | ||
79 | * Find the dnotify mark on the relevant inode, run the list of dnotify structs | ||
80 | * on that mark and determine which of them has expressed interest in receiving | ||
81 | * events of this type. When found, send the signal to the registered process and | ||
82 | * destroy the dnotify struct if it was not registered to receive multiple | ||
83 | * events. | ||
84 | */ | ||
85 | static int dnotify_handle_event(struct fsnotify_group *group, | ||
86 | struct fsnotify_event *event) | ||
87 | { | ||
88 | struct fsnotify_mark_entry *entry = NULL; | ||
89 | struct dnotify_mark_entry *dnentry; | ||
90 | struct inode *to_tell; | ||
91 | struct dnotify_struct *dn; | ||
92 | struct dnotify_struct **prev; | ||
93 | struct fown_struct *fown; | ||
94 | |||
95 | to_tell = event->to_tell; | ||
96 | |||
97 | spin_lock(&to_tell->i_lock); | ||
98 | entry = fsnotify_find_mark_entry(group, to_tell); | ||
99 | spin_unlock(&to_tell->i_lock); | ||
100 | |||
101 | /* unlikely since we already passed dnotify_should_send_event() */ | ||
102 | if (unlikely(!entry)) | ||
103 | return 0; | ||
104 | dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry); | ||
105 | |||
106 | spin_lock(&entry->lock); | ||
107 | prev = &dnentry->dn; | ||
108 | while ((dn = *prev) != NULL) { | ||
109 | if ((dn->dn_mask & event->mask) == 0) { | ||
110 | prev = &dn->dn_next; | ||
111 | continue; | ||
112 | } | ||
113 | fown = &dn->dn_filp->f_owner; | ||
114 | send_sigio(fown, dn->dn_fd, POLL_MSG); | ||
115 | if (dn->dn_mask & FS_DN_MULTISHOT) | ||
116 | prev = &dn->dn_next; | ||
117 | else { | ||
118 | *prev = dn->dn_next; | ||
119 | kmem_cache_free(dnotify_struct_cache, dn); | ||
120 | dnotify_recalc_inode_mask(entry); | ||
121 | } | ||
122 | } | ||
123 | |||
124 | spin_unlock(&entry->lock); | ||
125 | fsnotify_put_mark(entry); | ||
126 | |||
127 | return 0; | ||
128 | } | ||
129 | |||
130 | /* | ||
131 | * Given an inode and mask determine if dnotify would be interested in sending | ||
132 | * userspace notification for that pair. | ||
133 | */ | ||
134 | static bool dnotify_should_send_event(struct fsnotify_group *group, | ||
135 | struct inode *inode, __u32 mask) | ||
136 | { | ||
137 | struct fsnotify_mark_entry *entry; | ||
138 | bool send; | ||
139 | |||
140 | /* !dir_notify_enable should never get here, don't waste time checking | ||
141 | if (!dir_notify_enable) | ||
142 | return 0; */ | ||
143 | |||
144 | /* not a dir, dnotify doesn't care */ | ||
145 | if (!S_ISDIR(inode->i_mode)) | ||
146 | return false; | ||
147 | |||
148 | spin_lock(&inode->i_lock); | ||
149 | entry = fsnotify_find_mark_entry(group, inode); | ||
150 | spin_unlock(&inode->i_lock); | ||
151 | |||
152 | /* no mark means no dnotify watch */ | ||
153 | if (!entry) | ||
154 | return false; | ||
155 | |||
156 | mask = (mask & ~FS_EVENT_ON_CHILD); | ||
157 | send = (mask & entry->mask); | ||
158 | |||
159 | fsnotify_put_mark(entry); /* matches fsnotify_find_mark_entry */ | ||
160 | |||
161 | return send; | ||
162 | } | ||
163 | |||
164 | static void dnotify_free_mark(struct fsnotify_mark_entry *entry) | ||
165 | { | ||
166 | struct dnotify_mark_entry *dnentry = container_of(entry, | ||
167 | struct dnotify_mark_entry, | ||
168 | fsn_entry); | ||
169 | |||
170 | BUG_ON(dnentry->dn); | ||
171 | |||
172 | kmem_cache_free(dnotify_mark_entry_cache, dnentry); | ||
38 | } | 173 | } |
39 | 174 | ||
175 | static struct fsnotify_ops dnotify_fsnotify_ops = { | ||
176 | .handle_event = dnotify_handle_event, | ||
177 | .should_send_event = dnotify_should_send_event, | ||
178 | .free_group_priv = NULL, | ||
179 | .freeing_mark = NULL, | ||
180 | .free_event_priv = NULL, | ||
181 | }; | ||
182 | |||
183 | /* | ||
184 | * Called every time a file is closed. Looks first for a dnotify mark on the | ||
185 | * inode. If one is found run all of the ->dn entries attached to that | ||
186 | * mark for one relevant to this process closing the file and remove that | ||
187 | * dnotify_struct. If that was the last dnotify_struct also remove the | ||
188 | * fsnotify_mark_entry. | ||
189 | */ | ||
40 | void dnotify_flush(struct file *filp, fl_owner_t id) | 190 | void dnotify_flush(struct file *filp, fl_owner_t id) |
41 | { | 191 | { |
192 | struct fsnotify_mark_entry *entry; | ||
193 | struct dnotify_mark_entry *dnentry; | ||
42 | struct dnotify_struct *dn; | 194 | struct dnotify_struct *dn; |
43 | struct dnotify_struct **prev; | 195 | struct dnotify_struct **prev; |
44 | struct inode *inode; | 196 | struct inode *inode; |
@@ -46,145 +198,243 @@ void dnotify_flush(struct file *filp, fl_owner_t id) | |||
46 | inode = filp->f_path.dentry->d_inode; | 198 | inode = filp->f_path.dentry->d_inode; |
47 | if (!S_ISDIR(inode->i_mode)) | 199 | if (!S_ISDIR(inode->i_mode)) |
48 | return; | 200 | return; |
201 | |||
49 | spin_lock(&inode->i_lock); | 202 | spin_lock(&inode->i_lock); |
50 | prev = &inode->i_dnotify; | 203 | entry = fsnotify_find_mark_entry(dnotify_group, inode); |
204 | spin_unlock(&inode->i_lock); | ||
205 | if (!entry) | ||
206 | return; | ||
207 | dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry); | ||
208 | |||
209 | mutex_lock(&dnotify_mark_mutex); | ||
210 | |||
211 | spin_lock(&entry->lock); | ||
212 | prev = &dnentry->dn; | ||
51 | while ((dn = *prev) != NULL) { | 213 | while ((dn = *prev) != NULL) { |
52 | if ((dn->dn_owner == id) && (dn->dn_filp == filp)) { | 214 | if ((dn->dn_owner == id) && (dn->dn_filp == filp)) { |
53 | *prev = dn->dn_next; | 215 | *prev = dn->dn_next; |
54 | redo_inode_mask(inode); | 216 | kmem_cache_free(dnotify_struct_cache, dn); |
55 | kmem_cache_free(dn_cache, dn); | 217 | dnotify_recalc_inode_mask(entry); |
56 | break; | 218 | break; |
57 | } | 219 | } |
58 | prev = &dn->dn_next; | 220 | prev = &dn->dn_next; |
59 | } | 221 | } |
60 | spin_unlock(&inode->i_lock); | 222 | |
223 | spin_unlock(&entry->lock); | ||
224 | |||
225 | /* nothing else could have found us thanks to the dnotify_mark_mutex */ | ||
226 | if (dnentry->dn == NULL) | ||
227 | fsnotify_destroy_mark_by_entry(entry); | ||
228 | |||
229 | fsnotify_recalc_group_mask(dnotify_group); | ||
230 | |||
231 | mutex_unlock(&dnotify_mark_mutex); | ||
232 | |||
233 | fsnotify_put_mark(entry); | ||
234 | } | ||
235 | |||
236 | /* this conversion is done only at watch creation */ | ||
237 | static __u32 convert_arg(unsigned long arg) | ||
238 | { | ||
239 | __u32 new_mask = FS_EVENT_ON_CHILD; | ||
240 | |||
241 | if (arg & DN_MULTISHOT) | ||
242 | new_mask |= FS_DN_MULTISHOT; | ||
243 | if (arg & DN_DELETE) | ||
244 | new_mask |= (FS_DELETE | FS_MOVED_FROM); | ||
245 | if (arg & DN_MODIFY) | ||
246 | new_mask |= FS_MODIFY; | ||
247 | if (arg & DN_ACCESS) | ||
248 | new_mask |= FS_ACCESS; | ||
249 | if (arg & DN_ATTRIB) | ||
250 | new_mask |= FS_ATTRIB; | ||
251 | if (arg & DN_RENAME) | ||
252 | new_mask |= FS_DN_RENAME; | ||
253 | if (arg & DN_CREATE) | ||
254 | new_mask |= (FS_CREATE | FS_MOVED_TO); | ||
255 | |||
256 | return new_mask; | ||
61 | } | 257 | } |
62 | 258 | ||
259 | /* | ||
260 | * If multiple processes watch the same inode with dnotify there is only one | ||
261 | * dnotify mark in inode->i_fsnotify_mark_entries but we chain a dnotify_struct | ||
262 | * onto that mark. This function either attaches the new dnotify_struct onto | ||
263 | * that list, or it |= the mask onto an existing dnofiy_struct. | ||
264 | */ | ||
265 | static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnentry, | ||
266 | fl_owner_t id, int fd, struct file *filp, __u32 mask) | ||
267 | { | ||
268 | struct dnotify_struct *odn; | ||
269 | |||
270 | odn = dnentry->dn; | ||
271 | while (odn != NULL) { | ||
272 | /* adding more events to an existing dnotify_struct? */ | ||
273 | if ((odn->dn_owner == id) && (odn->dn_filp == filp)) { | ||
274 | odn->dn_fd = fd; | ||
275 | odn->dn_mask |= mask; | ||
276 | return -EEXIST; | ||
277 | } | ||
278 | odn = odn->dn_next; | ||
279 | } | ||
280 | |||
281 | dn->dn_mask = mask; | ||
282 | dn->dn_fd = fd; | ||
283 | dn->dn_filp = filp; | ||
284 | dn->dn_owner = id; | ||
285 | dn->dn_next = dnentry->dn; | ||
286 | dnentry->dn = dn; | ||
287 | |||
288 | return 0; | ||
289 | } | ||
290 | |||
291 | /* | ||
292 | * When a process calls fcntl to attach a dnotify watch to a directory it ends | ||
293 | * up here. Allocate both a mark for fsnotify to add and a dnotify_struct to be | ||
294 | * attached to the fsnotify_mark. | ||
295 | */ | ||
63 | int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) | 296 | int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) |
64 | { | 297 | { |
298 | struct dnotify_mark_entry *new_dnentry, *dnentry; | ||
299 | struct fsnotify_mark_entry *new_entry, *entry; | ||
65 | struct dnotify_struct *dn; | 300 | struct dnotify_struct *dn; |
66 | struct dnotify_struct *odn; | ||
67 | struct dnotify_struct **prev; | ||
68 | struct inode *inode; | 301 | struct inode *inode; |
69 | fl_owner_t id = current->files; | 302 | fl_owner_t id = current->files; |
70 | struct file *f; | 303 | struct file *f; |
71 | int error = 0; | 304 | int destroy = 0, error = 0; |
305 | __u32 mask; | ||
306 | |||
307 | /* we use these to tell if we need to kfree */ | ||
308 | new_entry = NULL; | ||
309 | dn = NULL; | ||
310 | |||
311 | if (!dir_notify_enable) { | ||
312 | error = -EINVAL; | ||
313 | goto out_err; | ||
314 | } | ||
72 | 315 | ||
316 | /* a 0 mask means we are explicitly removing the watch */ | ||
73 | if ((arg & ~DN_MULTISHOT) == 0) { | 317 | if ((arg & ~DN_MULTISHOT) == 0) { |
74 | dnotify_flush(filp, id); | 318 | dnotify_flush(filp, id); |
75 | return 0; | 319 | error = 0; |
320 | goto out_err; | ||
76 | } | 321 | } |
77 | if (!dir_notify_enable) | 322 | |
78 | return -EINVAL; | 323 | /* dnotify only works on directories */ |
79 | inode = filp->f_path.dentry->d_inode; | 324 | inode = filp->f_path.dentry->d_inode; |
80 | if (!S_ISDIR(inode->i_mode)) | 325 | if (!S_ISDIR(inode->i_mode)) { |
81 | return -ENOTDIR; | 326 | error = -ENOTDIR; |
82 | dn = kmem_cache_alloc(dn_cache, GFP_KERNEL); | 327 | goto out_err; |
83 | if (dn == NULL) | ||
84 | return -ENOMEM; | ||
85 | spin_lock(&inode->i_lock); | ||
86 | prev = &inode->i_dnotify; | ||
87 | while ((odn = *prev) != NULL) { | ||
88 | if ((odn->dn_owner == id) && (odn->dn_filp == filp)) { | ||
89 | odn->dn_fd = fd; | ||
90 | odn->dn_mask |= arg; | ||
91 | inode->i_dnotify_mask |= arg & ~DN_MULTISHOT; | ||
92 | goto out_free; | ||
93 | } | ||
94 | prev = &odn->dn_next; | ||
95 | } | 328 | } |
96 | 329 | ||
97 | rcu_read_lock(); | 330 | /* expect most fcntl to add new rather than augment old */ |
98 | f = fcheck(fd); | 331 | dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL); |
99 | rcu_read_unlock(); | 332 | if (!dn) { |
100 | /* we'd lost the race with close(), sod off silently */ | 333 | error = -ENOMEM; |
101 | /* note that inode->i_lock prevents reordering problems | 334 | goto out_err; |
102 | * between accesses to descriptor table and ->i_dnotify */ | 335 | } |
103 | if (f != filp) | ||
104 | goto out_free; | ||
105 | 336 | ||
106 | error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); | 337 | /* new fsnotify mark, we expect most fcntl calls to add a new mark */ |
107 | if (error) | 338 | new_dnentry = kmem_cache_alloc(dnotify_mark_entry_cache, GFP_KERNEL); |
108 | goto out_free; | 339 | if (!new_dnentry) { |
340 | error = -ENOMEM; | ||
341 | goto out_err; | ||
342 | } | ||
109 | 343 | ||
110 | dn->dn_mask = arg; | 344 | /* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */ |
111 | dn->dn_fd = fd; | 345 | mask = convert_arg(arg); |
112 | dn->dn_filp = filp; | ||
113 | dn->dn_owner = id; | ||
114 | inode->i_dnotify_mask |= arg & ~DN_MULTISHOT; | ||
115 | dn->dn_next = inode->i_dnotify; | ||
116 | inode->i_dnotify = dn; | ||
117 | spin_unlock(&inode->i_lock); | ||
118 | return 0; | ||
119 | 346 | ||
120 | out_free: | 347 | /* set up the new_entry and new_dnentry */ |
121 | spin_unlock(&inode->i_lock); | 348 | new_entry = &new_dnentry->fsn_entry; |
122 | kmem_cache_free(dn_cache, dn); | 349 | fsnotify_init_mark(new_entry, dnotify_free_mark); |
123 | return error; | 350 | new_entry->mask = mask; |
124 | } | 351 | new_dnentry->dn = NULL; |
125 | 352 | ||
126 | void __inode_dir_notify(struct inode *inode, unsigned long event) | 353 | /* this is needed to prevent the fcntl/close race described below */ |
127 | { | 354 | mutex_lock(&dnotify_mark_mutex); |
128 | struct dnotify_struct * dn; | ||
129 | struct dnotify_struct **prev; | ||
130 | struct fown_struct * fown; | ||
131 | int changed = 0; | ||
132 | 355 | ||
356 | /* add the new_entry or find an old one. */ | ||
133 | spin_lock(&inode->i_lock); | 357 | spin_lock(&inode->i_lock); |
134 | prev = &inode->i_dnotify; | 358 | entry = fsnotify_find_mark_entry(dnotify_group, inode); |
135 | while ((dn = *prev) != NULL) { | ||
136 | if ((dn->dn_mask & event) == 0) { | ||
137 | prev = &dn->dn_next; | ||
138 | continue; | ||
139 | } | ||
140 | fown = &dn->dn_filp->f_owner; | ||
141 | send_sigio(fown, dn->dn_fd, POLL_MSG); | ||
142 | if (dn->dn_mask & DN_MULTISHOT) | ||
143 | prev = &dn->dn_next; | ||
144 | else { | ||
145 | *prev = dn->dn_next; | ||
146 | changed = 1; | ||
147 | kmem_cache_free(dn_cache, dn); | ||
148 | } | ||
149 | } | ||
150 | if (changed) | ||
151 | redo_inode_mask(inode); | ||
152 | spin_unlock(&inode->i_lock); | 359 | spin_unlock(&inode->i_lock); |
153 | } | 360 | if (entry) { |
154 | 361 | dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry); | |
155 | EXPORT_SYMBOL(__inode_dir_notify); | 362 | spin_lock(&entry->lock); |
363 | } else { | ||
364 | fsnotify_add_mark(new_entry, dnotify_group, inode); | ||
365 | spin_lock(&new_entry->lock); | ||
366 | entry = new_entry; | ||
367 | dnentry = new_dnentry; | ||
368 | /* we used new_entry, so don't free it */ | ||
369 | new_entry = NULL; | ||
370 | } | ||
156 | 371 | ||
157 | /* | 372 | rcu_read_lock(); |
158 | * This is hopelessly wrong, but unfixable without API changes. At | 373 | f = fcheck(fd); |
159 | * least it doesn't oops the kernel... | 374 | rcu_read_unlock(); |
160 | * | ||
161 | * To safely access ->d_parent we need to keep d_move away from it. Use the | ||
162 | * dentry's d_lock for this. | ||
163 | */ | ||
164 | void dnotify_parent(struct dentry *dentry, unsigned long event) | ||
165 | { | ||
166 | struct dentry *parent; | ||
167 | 375 | ||
168 | if (!dir_notify_enable) | 376 | /* if (f != filp) means that we lost a race and another task/thread |
169 | return; | 377 | * actually closed the fd we are still playing with before we grabbed |
378 | * the dnotify_mark_mutex and entry->lock. Since closing the fd is the | ||
379 | * only time we clean up the mark entries we need to get our mark off | ||
380 | * the list. */ | ||
381 | if (f != filp) { | ||
382 | /* if we added ourselves, shoot ourselves, it's possible that | ||
383 | * the flush actually did shoot this entry. That's fine too | ||
384 | * since multiple calls to destroy_mark are perfectly safe. If | ||
385 | * we found a dnentry already attached to the inode, just sod | ||
386 | * off silently as the flush at close time dealt with it. | ||
387 | */ | ||
388 | if (dnentry == new_dnentry) | ||
389 | destroy = 1; | ||
390 | goto out; | ||
391 | } | ||
170 | 392 | ||
171 | spin_lock(&dentry->d_lock); | 393 | error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); |
172 | parent = dentry->d_parent; | 394 | if (error) { |
173 | if (parent->d_inode->i_dnotify_mask & event) { | 395 | /* if we added, we must shoot */ |
174 | dget(parent); | 396 | if (dnentry == new_dnentry) |
175 | spin_unlock(&dentry->d_lock); | 397 | destroy = 1; |
176 | __inode_dir_notify(parent->d_inode, event); | 398 | goto out; |
177 | dput(parent); | ||
178 | } else { | ||
179 | spin_unlock(&dentry->d_lock); | ||
180 | } | 399 | } |
400 | |||
401 | error = attach_dn(dn, dnentry, id, fd, filp, mask); | ||
402 | /* !error means that we attached the dn to the dnentry, so don't free it */ | ||
403 | if (!error) | ||
404 | dn = NULL; | ||
405 | /* -EEXIST means that we didn't add this new dn and used an old one. | ||
406 | * that isn't an error (and the unused dn should be freed) */ | ||
407 | else if (error == -EEXIST) | ||
408 | error = 0; | ||
409 | |||
410 | dnotify_recalc_inode_mask(entry); | ||
411 | out: | ||
412 | spin_unlock(&entry->lock); | ||
413 | |||
414 | if (destroy) | ||
415 | fsnotify_destroy_mark_by_entry(entry); | ||
416 | |||
417 | fsnotify_recalc_group_mask(dnotify_group); | ||
418 | |||
419 | mutex_unlock(&dnotify_mark_mutex); | ||
420 | fsnotify_put_mark(entry); | ||
421 | out_err: | ||
422 | if (new_entry) | ||
423 | fsnotify_put_mark(new_entry); | ||
424 | if (dn) | ||
425 | kmem_cache_free(dnotify_struct_cache, dn); | ||
426 | return error; | ||
181 | } | 427 | } |
182 | EXPORT_SYMBOL_GPL(dnotify_parent); | ||
183 | 428 | ||
184 | static int __init dnotify_init(void) | 429 | static int __init dnotify_init(void) |
185 | { | 430 | { |
186 | dn_cache = kmem_cache_create("dnotify_cache", | 431 | dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC); |
187 | sizeof(struct dnotify_struct), 0, SLAB_PANIC, NULL); | 432 | dnotify_mark_entry_cache = KMEM_CACHE(dnotify_mark_entry, SLAB_PANIC); |
433 | |||
434 | dnotify_group = fsnotify_obtain_group(DNOTIFY_GROUP_NUM, | ||
435 | 0, &dnotify_fsnotify_ops); | ||
436 | if (IS_ERR(dnotify_group)) | ||
437 | panic("unable to allocate fsnotify group for dnotify\n"); | ||
188 | return 0; | 438 | return 0; |
189 | } | 439 | } |
190 | 440 | ||
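To tie the rewritten kernel side back to its user-visible behaviour: applications still use the classic fcntl(F_NOTIFY) interface, and the DN_* bits they pass are translated by convert_arg() above into FS_* bits (DN_CREATE becomes FS_CREATE | FS_MOVED_TO, DN_DELETE becomes FS_DELETE | FS_MOVED_FROM, and FS_EVENT_ON_CHILD is always set). A minimal userspace sketch, independent of this patch:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <signal.h>
	#include <stdio.h>
	#include <unistd.h>

	static volatile sig_atomic_t got_event;

	static void on_sigio(int sig)
	{
		got_event = 1;
	}

	int main(void)
	{
		int fd = open(".", O_RDONLY);	/* dnotify only works on directories */

		if (fd < 0)
			return 1;
		signal(SIGIO, on_sigio);
		/* register the watch; the kernel does the __f_setown() for us */
		fcntl(fd, F_NOTIFY, DN_CREATE | DN_DELETE | DN_MULTISHOT);

		pause();	/* SIGIO is delivered when an entry is created or removed */
		if (got_event)
			printf("directory changed\n");
		close(fd);
		return 0;
	}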
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c new file mode 100644 index 000000000000..ec2f7bd76818 --- /dev/null +++ b/fs/notify/fsnotify.c | |||
@@ -0,0 +1,186 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2, or (at your option) | ||
7 | * any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; see the file COPYING. If not, write to | ||
16 | * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
17 | */ | ||
18 | |||
19 | #include <linux/dcache.h> | ||
20 | #include <linux/fs.h> | ||
21 | #include <linux/init.h> | ||
22 | #include <linux/module.h> | ||
23 | #include <linux/srcu.h> | ||
24 | |||
25 | #include <linux/fsnotify_backend.h> | ||
26 | #include "fsnotify.h" | ||
27 | |||
28 | /* | ||
29 | * Clear all of the marks on an inode when it is being evicted from core | ||
30 | */ | ||
31 | void __fsnotify_inode_delete(struct inode *inode) | ||
32 | { | ||
33 | fsnotify_clear_marks_by_inode(inode); | ||
34 | } | ||
35 | EXPORT_SYMBOL_GPL(__fsnotify_inode_delete); | ||
36 | |||
37 | /* | ||
38 | * Given an inode, first check if we care what happens to our children. Inotify | ||
39 | * and dnotify both tell their parents about events. If we care about any event | ||
40 | * on a child we run all of our children and set a dentry flag saying that the | ||
41 | * parent cares. Thus when an event happens on a child it can quickly tell | ||
42 | * if there is a need to find a parent and send the event to the parent. | ||
43 | */ | ||
44 | void __fsnotify_update_child_dentry_flags(struct inode *inode) | ||
45 | { | ||
46 | struct dentry *alias; | ||
47 | int watched; | ||
48 | |||
49 | if (!S_ISDIR(inode->i_mode)) | ||
50 | return; | ||
51 | |||
52 | /* determine if the children should tell inode about their events */ | ||
53 | watched = fsnotify_inode_watches_children(inode); | ||
54 | |||
55 | spin_lock(&dcache_lock); | ||
56 | /* run all of the dentries associated with this inode. Since this is a | ||
57 | * directory, there damn well better only be one item on this list */ | ||
58 | list_for_each_entry(alias, &inode->i_dentry, d_alias) { | ||
59 | struct dentry *child; | ||
60 | |||
61 | /* run all of the children of the original inode and fix their | ||
62 | * d_flags to indicate parental interest (their parent is the | ||
63 | * original inode) */ | ||
64 | list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) { | ||
65 | if (!child->d_inode) | ||
66 | continue; | ||
67 | |||
68 | spin_lock(&child->d_lock); | ||
69 | if (watched) | ||
70 | child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; | ||
71 | else | ||
72 | child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; | ||
73 | spin_unlock(&child->d_lock); | ||
74 | } | ||
75 | } | ||
76 | spin_unlock(&dcache_lock); | ||
77 | } | ||
78 | |||
79 | /* Notify this dentry's parent about a child's events. */ | ||
80 | void __fsnotify_parent(struct dentry *dentry, __u32 mask) | ||
81 | { | ||
82 | struct dentry *parent; | ||
83 | struct inode *p_inode; | ||
84 | bool send = false; | ||
85 | bool should_update_children = false; | ||
86 | |||
87 | if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) | ||
88 | return; | ||
89 | |||
90 | spin_lock(&dentry->d_lock); | ||
91 | parent = dentry->d_parent; | ||
92 | p_inode = parent->d_inode; | ||
93 | |||
94 | if (fsnotify_inode_watches_children(p_inode)) { | ||
95 | if (p_inode->i_fsnotify_mask & mask) { | ||
96 | dget(parent); | ||
97 | send = true; | ||
98 | } | ||
99 | } else { | ||
100 | /* | ||
101 | * The parent doesn't care about events on its children but | ||
102 | * at least one child thought it did. We need to run all the | ||
103 | * children and update their d_flags to let them know p_inode | ||
104 | * doesn't care about them any more. | ||
105 | */ | ||
106 | dget(parent); | ||
107 | should_update_children = true; | ||
108 | } | ||
109 | |||
110 | spin_unlock(&dentry->d_lock); | ||
111 | |||
112 | if (send) { | ||
113 | /* we are notifying a parent so come up with the new mask which | ||
114 | * specifies these are events which came from a child. */ | ||
115 | mask |= FS_EVENT_ON_CHILD; | ||
116 | |||
117 | fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, | ||
118 | dentry->d_name.name, 0); | ||
119 | dput(parent); | ||
120 | } | ||
121 | |||
122 | if (unlikely(should_update_children)) { | ||
123 | __fsnotify_update_child_dentry_flags(p_inode); | ||
124 | dput(parent); | ||
125 | } | ||
126 | } | ||
127 | EXPORT_SYMBOL_GPL(__fsnotify_parent); | ||
128 | |||
129 | /* | ||
130 | * This is the main call to fsnotify. The VFS calls into hook specific functions | ||
131 | * in linux/fsnotify.h. Those functions then in turn call here, and from here we | ||
132 | * call out to all of the registered fsnotify_groups. Those groups can then use the | ||
133 | * notification event in whatever way they feel necessary. | ||
134 | */ | ||
135 | void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *file_name, u32 cookie) | ||
136 | { | ||
137 | struct fsnotify_group *group; | ||
138 | struct fsnotify_event *event = NULL; | ||
139 | int idx; | ||
140 | /* global tests shouldn't care about events on child only the specific event */ | ||
141 | __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); | ||
142 | |||
143 | if (list_empty(&fsnotify_groups)) | ||
144 | return; | ||
145 | |||
146 | if (!(test_mask & fsnotify_mask)) | ||
147 | return; | ||
148 | |||
149 | if (!(test_mask & to_tell->i_fsnotify_mask)) | ||
150 | return; | ||
151 | /* | ||
152 | * SRCU!! the groups list is very very much read only and the path is | ||
153 | * very hot. The VAST majority of events are not going to need to do | ||
154 | * anything other than walk the list so it's crazy to pre-allocate. | ||
155 | */ | ||
156 | idx = srcu_read_lock(&fsnotify_grp_srcu); | ||
157 | list_for_each_entry_rcu(group, &fsnotify_groups, group_list) { | ||
158 | if (test_mask & group->mask) { | ||
159 | if (!group->ops->should_send_event(group, to_tell, mask)) | ||
160 | continue; | ||
161 | if (!event) { | ||
162 | event = fsnotify_create_event(to_tell, mask, data, data_is, file_name, cookie); | ||
163 | /* shit, we OOM'd and now we can't tell, maybe | ||
164 | * someday someone else will want to do something | ||
165 | * here */ | ||
166 | if (!event) | ||
167 | break; | ||
168 | } | ||
169 | group->ops->handle_event(group, event); | ||
170 | } | ||
171 | } | ||
172 | srcu_read_unlock(&fsnotify_grp_srcu, idx); | ||
173 | /* | ||
174 | * fsnotify_create_event() took a reference so the event can't be cleaned | ||
175 | * up while we are still trying to add it to lists, drop that one. | ||
176 | */ | ||
177 | if (event) | ||
178 | fsnotify_put_event(event); | ||
179 | } | ||
180 | EXPORT_SYMBOL_GPL(fsnotify); | ||
181 | |||
182 | static __init int fsnotify_init(void) | ||
183 | { | ||
184 | return init_srcu_struct(&fsnotify_grp_srcu); | ||
185 | } | ||
186 | subsys_initcall(fsnotify_init); | ||
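For context, the "hook specific functions in linux/fsnotify.h" referred to above are thin inline wrappers around fsnotify(). The sketch below is modelled on the fsnotify_create()-style helpers; the helper name and the argument choices are assumptions, only the fsnotify() signature comes from the code above:

	/* illustrative only: how a VFS-side hook would report a newly created child */
	static inline void example_fsnotify_create(struct inode *dir,
						   struct dentry *dentry)
	{
		/* tell anyone watching @dir that a child named dentry was created */
		fsnotify(dir, FS_CREATE, dentry->d_inode, FSNOTIFY_EVENT_INODE,
			 dentry->d_name.name, 0);
	}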
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h new file mode 100644 index 000000000000..4dc240824b2d --- /dev/null +++ b/fs/notify/fsnotify.h | |||
@@ -0,0 +1,34 @@ | |||
1 | #ifndef __FS_NOTIFY_FSNOTIFY_H_ | ||
2 | #define __FS_NOTIFY_FSNOTIFY_H_ | ||
3 | |||
4 | #include <linux/list.h> | ||
5 | #include <linux/fsnotify.h> | ||
6 | #include <linux/srcu.h> | ||
7 | #include <linux/types.h> | ||
8 | |||
9 | /* protects reads of fsnotify_groups */ | ||
10 | extern struct srcu_struct fsnotify_grp_srcu; | ||
11 | /* all groups which receive fsnotify events */ | ||
12 | extern struct list_head fsnotify_groups; | ||
13 | /* all bitwise OR of all event types (FS_*) for all fsnotify_groups */ | ||
14 | extern __u32 fsnotify_mask; | ||
15 | |||
16 | /* destroy all events sitting in this groups notification queue */ | ||
17 | extern void fsnotify_flush_notify(struct fsnotify_group *group); | ||
18 | |||
19 | /* final kfree of a group */ | ||
20 | extern void fsnotify_final_destroy_group(struct fsnotify_group *group); | ||
21 | |||
22 | /* run the list of all marks associated with inode and flag them to be freed */ | ||
23 | extern void fsnotify_clear_marks_by_inode(struct inode *inode); | ||
24 | /* | ||
25 | * update the dentry->d_flags of all of inode's children to indicate if inode cares | ||
26 | * about events that happen to its children. | ||
27 | */ | ||
28 | extern void __fsnotify_update_child_dentry_flags(struct inode *inode); | ||
29 | |||
30 | /* allocate and destroy an event holder to attach events to notification/access queues */ | ||
31 | extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void); | ||
32 | extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder); | ||
33 | |||
34 | #endif /* __FS_NOTIFY_FSNOTIFY_H_ */ | ||
diff --git a/fs/notify/group.c b/fs/notify/group.c new file mode 100644 index 000000000000..0e1677144bc5 --- /dev/null +++ b/fs/notify/group.c | |||
@@ -0,0 +1,254 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2, or (at your option) | ||
7 | * any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; see the file COPYING. If not, write to | ||
16 | * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
17 | */ | ||
18 | |||
19 | #include <linux/list.h> | ||
20 | #include <linux/mutex.h> | ||
21 | #include <linux/slab.h> | ||
22 | #include <linux/srcu.h> | ||
23 | #include <linux/rculist.h> | ||
24 | #include <linux/wait.h> | ||
25 | |||
26 | #include <linux/fsnotify_backend.h> | ||
27 | #include "fsnotify.h" | ||
28 | |||
29 | #include <asm/atomic.h> | ||
30 | |||
31 | /* protects writes to fsnotify_groups and fsnotify_mask */ | ||
32 | static DEFINE_MUTEX(fsnotify_grp_mutex); | ||
33 | /* protects reads while running the fsnotify_groups list */ | ||
34 | struct srcu_struct fsnotify_grp_srcu; | ||
35 | /* all groups registered to receive filesystem notifications */ | ||
36 | LIST_HEAD(fsnotify_groups); | ||
37 | /* bitwise OR of all events (FS_*) interesting to some group on this system */ | ||
38 | __u32 fsnotify_mask; | ||
39 | |||
40 | /* | ||
41 | * When a new group registers or changes its set of interesting events | ||
42 | * this function updates the fsnotify_mask to contain all interesting events | ||
43 | */ | ||
44 | void fsnotify_recalc_global_mask(void) | ||
45 | { | ||
46 | struct fsnotify_group *group; | ||
47 | __u32 mask = 0; | ||
48 | int idx; | ||
49 | |||
50 | idx = srcu_read_lock(&fsnotify_grp_srcu); | ||
51 | list_for_each_entry_rcu(group, &fsnotify_groups, group_list) | ||
52 | mask |= group->mask; | ||
53 | srcu_read_unlock(&fsnotify_grp_srcu, idx); | ||
54 | fsnotify_mask = mask; | ||
55 | } | ||
56 | |||
57 | /* | ||
58 | * Update the group->mask by running all of the marks associated with this | ||
59 | * group and finding the bitwise | of all of the mark->mask. If we change | ||
60 | * the group->mask we need to update the global mask of events interesting | ||
61 | * to the system. | ||
62 | */ | ||
63 | void fsnotify_recalc_group_mask(struct fsnotify_group *group) | ||
64 | { | ||
65 | __u32 mask = 0; | ||
66 | __u32 old_mask = group->mask; | ||
67 | struct fsnotify_mark_entry *entry; | ||
68 | |||
69 | spin_lock(&group->mark_lock); | ||
70 | list_for_each_entry(entry, &group->mark_entries, g_list) | ||
71 | mask |= entry->mask; | ||
72 | spin_unlock(&group->mark_lock); | ||
73 | |||
74 | group->mask = mask; | ||
75 | |||
76 | if (old_mask != mask) | ||
77 | fsnotify_recalc_global_mask(); | ||
78 | } | ||
79 | |||
80 | /* | ||
81 | * Take a reference to a group so things found under the fsnotify_grp_mutex | ||
82 | * can't get freed under us | ||
83 | */ | ||
84 | static void fsnotify_get_group(struct fsnotify_group *group) | ||
85 | { | ||
86 | atomic_inc(&group->refcnt); | ||
87 | } | ||
88 | |||
89 | /* | ||
90 | * Final freeing of a group | ||
91 | */ | ||
92 | void fsnotify_final_destroy_group(struct fsnotify_group *group) | ||
93 | { | ||
94 | /* clear the notification queue of all events */ | ||
95 | fsnotify_flush_notify(group); | ||
96 | |||
97 | if (group->ops->free_group_priv) | ||
98 | group->ops->free_group_priv(group); | ||
99 | |||
100 | kfree(group); | ||
101 | } | ||
102 | |||
103 | /* | ||
104 | * Trying to get rid of a group. We need to first get rid of any outstanding | ||
105 | * allocations and then free the group. Remember that fsnotify_clear_marks_by_group | ||
106 | * could miss marks that are being freed by inode and those marks could still | ||
107 | * hold a reference to this group (via group->num_marks). If we get into that | ||
108 | * situation, the fsnotify_final_destroy_group will get called when that final | ||
109 | * mark is freed. | ||
110 | */ | ||
111 | static void fsnotify_destroy_group(struct fsnotify_group *group) | ||
112 | { | ||
113 | /* clear all inode mark entries for this group */ | ||
114 | fsnotify_clear_marks_by_group(group); | ||
115 | |||
116 | /* past the point of no return, matches the initial value of 1 */ | ||
117 | if (atomic_dec_and_test(&group->num_marks)) | ||
118 | fsnotify_final_destroy_group(group); | ||
119 | } | ||
120 | |||
121 | /* | ||
122 | * Remove this group from the global list of groups that will get events. | ||
123 | * This can be done even if there are still references and things still using | ||
124 | * this group. This just stops the group from getting new events. | ||
125 | */ | ||
126 | static void __fsnotify_evict_group(struct fsnotify_group *group) | ||
127 | { | ||
128 | BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex)); | ||
129 | |||
130 | if (group->on_group_list) | ||
131 | list_del_rcu(&group->group_list); | ||
132 | group->on_group_list = 0; | ||
133 | } | ||
134 | |||
135 | /* | ||
136 | * Called when a group is no longer interested in getting events. This can be | ||
137 | * used if a group is misbehaving or if for some reason a group should no longer | ||
138 | * get any filesystem events. | ||
139 | */ | ||
140 | void fsnotify_evict_group(struct fsnotify_group *group) | ||
141 | { | ||
142 | mutex_lock(&fsnotify_grp_mutex); | ||
143 | __fsnotify_evict_group(group); | ||
144 | mutex_unlock(&fsnotify_grp_mutex); | ||
145 | } | ||
146 | |||
147 | /* | ||
148 | * Drop a reference to a group. Free it if it's through. | ||
149 | */ | ||
150 | void fsnotify_put_group(struct fsnotify_group *group) | ||
151 | { | ||
152 | if (!atomic_dec_and_mutex_lock(&group->refcnt, &fsnotify_grp_mutex)) | ||
153 | return; | ||
154 | |||
155 | /* | ||
156 | * OK, now we know that there are no other users *and* we hold the mutex, | ||
157 | * so no new references will appear | ||
158 | */ | ||
159 | __fsnotify_evict_group(group); | ||
160 | |||
161 | /* | ||
162 | * now it's off the list, so the only thing we might care about is | ||
163 | * srcu access.... | ||
164 | */ | ||
165 | mutex_unlock(&fsnotify_grp_mutex); | ||
166 | synchronize_srcu(&fsnotify_grp_srcu); | ||
167 | |||
168 | /* and now it is really dead. _Nothing_ could be seeing it */ | ||
169 | fsnotify_recalc_global_mask(); | ||
170 | fsnotify_destroy_group(group); | ||
171 | } | ||
172 | |||
173 | /* | ||
174 | * Simply run the fsnotify_groups list and find a group which matches | ||
175 | * the given parameters. If a group is found we take a reference to that | ||
176 | * group. | ||
177 | */ | ||
178 | static struct fsnotify_group *fsnotify_find_group(unsigned int group_num, __u32 mask, | ||
179 | const struct fsnotify_ops *ops) | ||
180 | { | ||
181 | struct fsnotify_group *group_iter; | ||
182 | struct fsnotify_group *group = NULL; | ||
183 | |||
184 | BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex)); | ||
185 | |||
186 | list_for_each_entry_rcu(group_iter, &fsnotify_groups, group_list) { | ||
187 | if (group_iter->group_num == group_num) { | ||
188 | if ((group_iter->mask == mask) && | ||
189 | (group_iter->ops == ops)) { | ||
190 | fsnotify_get_group(group_iter); | ||
191 | group = group_iter; | ||
192 | } else | ||
193 | group = ERR_PTR(-EEXIST); | ||
194 | } | ||
195 | } | ||
196 | return group; | ||
197 | } | ||
198 | |||
199 | /* | ||
200 | * Either finds an existing group which matches the group_num, mask, and ops or | ||
201 | * creates a new group and adds it to the global group list. In either case we | ||
202 | * take a reference for the group returned. | ||
203 | */ | ||
204 | struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask, | ||
205 | const struct fsnotify_ops *ops) | ||
206 | { | ||
207 | struct fsnotify_group *group, *tgroup; | ||
208 | |||
209 | /* very low use, simpler locking if we just always alloc */ | ||
210 | group = kmalloc(sizeof(struct fsnotify_group), GFP_KERNEL); | ||
211 | if (!group) | ||
212 | return ERR_PTR(-ENOMEM); | ||
213 | |||
214 | atomic_set(&group->refcnt, 1); | ||
215 | |||
216 | group->on_group_list = 0; | ||
217 | group->group_num = group_num; | ||
218 | group->mask = mask; | ||
219 | |||
220 | mutex_init(&group->notification_mutex); | ||
221 | INIT_LIST_HEAD(&group->notification_list); | ||
222 | init_waitqueue_head(&group->notification_waitq); | ||
223 | group->q_len = 0; | ||
224 | group->max_events = UINT_MAX; | ||
225 | |||
226 | spin_lock_init(&group->mark_lock); | ||
227 | atomic_set(&group->num_marks, 0); | ||
228 | INIT_LIST_HEAD(&group->mark_entries); | ||
229 | |||
230 | group->ops = ops; | ||
231 | |||
232 | mutex_lock(&fsnotify_grp_mutex); | ||
233 | tgroup = fsnotify_find_group(group_num, mask, ops); | ||
234 | if (tgroup) { | ||
235 | /* group already exists */ | ||
236 | mutex_unlock(&fsnotify_grp_mutex); | ||
237 | /* destroy the new one we made */ | ||
238 | fsnotify_put_group(group); | ||
239 | return tgroup; | ||
240 | } | ||
241 | |||
242 | /* group not found, add a new one */ | ||
243 | list_add_rcu(&group->group_list, &fsnotify_groups); | ||
244 | group->on_group_list = 1; | ||
245 | /* being on the fsnotify_groups list holds one num_marks */ | ||
246 | atomic_inc(&group->num_marks); | ||
247 | |||
248 | mutex_unlock(&fsnotify_grp_mutex); | ||
249 | |||
250 | if (mask) | ||
251 | fsnotify_recalc_global_mask(); | ||
252 | |||
253 | return group; | ||
254 | } | ||
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c new file mode 100644 index 000000000000..c8a07c65482b --- /dev/null +++ b/fs/notify/inode_mark.c | |||
@@ -0,0 +1,426 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2, or (at your option) | ||
7 | * any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; see the file COPYING. If not, write to | ||
16 | * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
17 | */ | ||
18 | |||
19 | /* | ||
20 | * fsnotify inode mark locking/lifetime/and refcnting | ||
21 | * | ||
22 | * REFCNT: | ||
23 | * The mark->refcnt tells how many "things" in the kernel currently are | ||
24 | * referencing this object. The object typically will live inside the kernel | ||
25 | * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task | ||
26 | * which can find this object while holding the appropriate locks can take a reference | ||
27 | * and the object itself is guaranteed to survive until the reference is dropped. | ||
28 | * | ||
29 | * LOCKING: | ||
30 | * There are 3 spinlocks involved with fsnotify inode marks and they MUST | ||
31 | * be taken in order as follows: | ||
32 | * | ||
33 | * entry->lock | ||
34 | * group->mark_lock | ||
35 | * inode->i_lock | ||
36 | * | ||
37 | * entry->lock protects 2 things, entry->group and entry->inode. You must hold | ||
38 | * that lock to dereference either of these things (they could be NULL even with | ||
39 | * the lock) | ||
40 | * | ||
41 | * group->mark_lock protects the mark_entries list anchored inside a given group | ||
42 | * and each entry is hooked via the g_list. It also sorta protects the | ||
43 | * free_g_list, which when used is anchored by a private list on the stack of the | ||
44 | * task which held the group->mark_lock. | ||
45 | * | ||
46 | * inode->i_lock protects the i_fsnotify_mark_entries list anchored inside a | ||
47 | * given inode and each entry is hooked via the i_list. (and sorta the | ||
48 | * free_i_list) | ||
49 | * | ||
50 | * | ||
51 | * LIFETIME: | ||
52 | * Inode marks survive between when they are added to an inode and when their | ||
53 | * refcnt==0. | ||
54 | * | ||
55 | * The inode mark can be cleared for a number of different reasons including: | ||
56 | * - The inode is unlinked for the last time. (fsnotify_inode_remove) | ||
57 | * - The inode is being evicted from cache. (fsnotify_inode_delete) | ||
58 | * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes) | ||
59 | * - Something explicitly requests that it be removed. (fsnotify_destroy_mark_by_entry) | ||
60 | * - The fsnotify_group associated with the mark is going away and all such marks | ||
61 | * need to be cleaned up. (fsnotify_clear_marks_by_group) | ||
62 | * | ||
63 | * Worst case we are given an inode and need to clean up all the marks on that | ||
64 | * inode. We take i_lock and walk the i_fsnotify_mark_entries safely. For each | ||
65 | * mark on the list we take a reference (so the mark can't disappear under us). | ||
66 | * We remove that mark from the inode's list of marks and we add this mark to a | ||
67 | * private list anchored on the stack using i_free_list. At this point we no | ||
68 | * longer fear anything finding the mark using the inode's list of marks. | ||
69 | * | ||
70 | * We can safely and locklessly run the private list on the stack of everything | ||
71 | * we just unattached from the original inode. For each mark on the private list | ||
72 | * we grab the mark->lock and can thus dereference mark->group and mark->inode. If | ||
73 | * we see the group and inode are not NULL we take those locks. Now holding all | ||
74 | * 3 locks we can completely remove the mark from other tasks finding it in the | ||
75 | * future. Remember, 10 things might already be referencing this mark, but they | ||
76 | * better be holding a ref. We drop the reference we took before we unhooked it | ||
77 | * from the inode. When the ref hits 0 we can free the mark. | ||
78 | * | ||
79 | * Very similarly for freeing by group, except we use free_g_list. | ||
80 | * | ||
81 | * This has the very interesting property of being able to run concurrently with | ||
82 | * any (or all) other directions. | ||
83 | */ | ||
84 | |||
85 | #include <linux/fs.h> | ||
86 | #include <linux/init.h> | ||
87 | #include <linux/kernel.h> | ||
88 | #include <linux/module.h> | ||
89 | #include <linux/mutex.h> | ||
90 | #include <linux/slab.h> | ||
91 | #include <linux/spinlock.h> | ||
92 | #include <linux/writeback.h> /* for inode_lock */ | ||
93 | |||
94 | #include <asm/atomic.h> | ||
95 | |||
96 | #include <linux/fsnotify_backend.h> | ||
97 | #include "fsnotify.h" | ||
98 | |||
99 | void fsnotify_get_mark(struct fsnotify_mark_entry *entry) | ||
100 | { | ||
101 | atomic_inc(&entry->refcnt); | ||
102 | } | ||
103 | |||
104 | void fsnotify_put_mark(struct fsnotify_mark_entry *entry) | ||
105 | { | ||
106 | if (atomic_dec_and_test(&entry->refcnt)) | ||
107 | entry->free_mark(entry); | ||
108 | } | ||
109 | |||
110 | /* | ||
111 | * Recalculate the mask of events relevant to a given inode. Caller must hold inode->i_lock. | ||
112 | */ | ||
113 | static void fsnotify_recalc_inode_mask_locked(struct inode *inode) | ||
114 | { | ||
115 | struct fsnotify_mark_entry *entry; | ||
116 | struct hlist_node *pos; | ||
117 | __u32 new_mask = 0; | ||
118 | |||
119 | assert_spin_locked(&inode->i_lock); | ||
120 | |||
121 | hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) | ||
122 | new_mask |= entry->mask; | ||
123 | inode->i_fsnotify_mask = new_mask; | ||
124 | } | ||
125 | |||
126 | /* | ||
127 | * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types | ||
128 | * any notifier is interested in hearing for this inode. | ||
129 | */ | ||
130 | void fsnotify_recalc_inode_mask(struct inode *inode) | ||
131 | { | ||
132 | spin_lock(&inode->i_lock); | ||
133 | fsnotify_recalc_inode_mask_locked(inode); | ||
134 | spin_unlock(&inode->i_lock); | ||
135 | |||
136 | __fsnotify_update_child_dentry_flags(inode); | ||
137 | } | ||
138 | |||
139 | /* | ||
140 | * Any time a mark is getting freed we end up here. | ||
141 | * The caller had better be holding a reference to this mark so we don't actually | ||
142 | * do the final put under the entry->lock | ||
143 | */ | ||
144 | void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry) | ||
145 | { | ||
146 | struct fsnotify_group *group; | ||
147 | struct inode *inode; | ||
148 | |||
149 | spin_lock(&entry->lock); | ||
150 | |||
151 | group = entry->group; | ||
152 | inode = entry->inode; | ||
153 | |||
154 | BUG_ON(group && !inode); | ||
155 | BUG_ON(!group && inode); | ||
156 | |||
157 | /* if !group something else already marked this to die */ | ||
158 | if (!group) { | ||
159 | spin_unlock(&entry->lock); | ||
160 | return; | ||
161 | } | ||
162 | |||
163 | /* 1 from caller and 1 for being on i_list/g_list */ | ||
164 | BUG_ON(atomic_read(&entry->refcnt) < 2); | ||
165 | |||
166 | spin_lock(&group->mark_lock); | ||
167 | spin_lock(&inode->i_lock); | ||
168 | |||
169 | hlist_del_init(&entry->i_list); | ||
170 | entry->inode = NULL; | ||
171 | |||
172 | list_del_init(&entry->g_list); | ||
173 | entry->group = NULL; | ||
174 | |||
175 | fsnotify_put_mark(entry); /* for i_list and g_list */ | ||
176 | |||
177 | /* | ||
178 | * this mark is now off the inode->i_fsnotify_mark_entries list and we | ||
179 | * hold the inode->i_lock, so this is the perfect time to update the | ||
180 | * inode->i_fsnotify_mask | ||
181 | */ | ||
182 | fsnotify_recalc_inode_mask_locked(inode); | ||
183 | |||
184 | spin_unlock(&inode->i_lock); | ||
185 | spin_unlock(&group->mark_lock); | ||
186 | spin_unlock(&entry->lock); | ||
187 | |||
188 | /* | ||
189 | * Some groups like to know that marks are being freed. This is a | ||
190 | * callback to the group function to let it know that this entry | ||
191 | * is being freed. | ||
192 | */ | ||
193 | if (group->ops->freeing_mark) | ||
194 | group->ops->freeing_mark(entry, group); | ||
195 | |||
196 | /* | ||
197 | * __fsnotify_update_child_dentry_flags(inode); | ||
198 | * | ||
199 | * I really want to call that, but we can't; we have no idea if the inode | ||
200 | * still exists the second we drop the entry->lock. | ||
201 | * | ||
202 | * The next time an event arrives at this inode from one of its children | ||
203 | * __fsnotify_parent will see that the inode doesn't care about its | ||
204 | * children and will update all of these flags then. So really this | ||
205 | * is just a lazy update (and could be a perf win...) | ||
206 | */ | ||
207 | |||
208 | |||
209 | iput(inode); | ||
210 | |||
211 | /* | ||
212 | * it's possible that this group tried to destroy itself, but this | ||
213 | * mark was simultaneously being freed by the inode. If that's the | ||
214 | * case, we finish freeing the group here. | ||
215 | */ | ||
216 | if (unlikely(atomic_dec_and_test(&group->num_marks))) | ||
217 | fsnotify_final_destroy_group(group); | ||
218 | } | ||
219 | |||
220 | /* | ||
221 | * Given a group, destroy all of the marks associated with that group. | ||
222 | */ | ||
223 | void fsnotify_clear_marks_by_group(struct fsnotify_group *group) | ||
224 | { | ||
225 | struct fsnotify_mark_entry *lentry, *entry; | ||
226 | LIST_HEAD(free_list); | ||
227 | |||
228 | spin_lock(&group->mark_lock); | ||
229 | list_for_each_entry_safe(entry, lentry, &group->mark_entries, g_list) { | ||
230 | list_add(&entry->free_g_list, &free_list); | ||
231 | list_del_init(&entry->g_list); | ||
232 | fsnotify_get_mark(entry); | ||
233 | } | ||
234 | spin_unlock(&group->mark_lock); | ||
235 | |||
236 | list_for_each_entry_safe(entry, lentry, &free_list, free_g_list) { | ||
237 | fsnotify_destroy_mark_by_entry(entry); | ||
238 | fsnotify_put_mark(entry); | ||
239 | } | ||
240 | } | ||
241 | |||
242 | /* | ||
243 | * Given an inode, destroy all of the marks associated with that inode. | ||
244 | */ | ||
245 | void fsnotify_clear_marks_by_inode(struct inode *inode) | ||
246 | { | ||
247 | struct fsnotify_mark_entry *entry, *lentry; | ||
248 | struct hlist_node *pos, *n; | ||
249 | LIST_HEAD(free_list); | ||
250 | |||
251 | spin_lock(&inode->i_lock); | ||
252 | hlist_for_each_entry_safe(entry, pos, n, &inode->i_fsnotify_mark_entries, i_list) { | ||
253 | list_add(&entry->free_i_list, &free_list); | ||
254 | hlist_del_init(&entry->i_list); | ||
255 | fsnotify_get_mark(entry); | ||
256 | } | ||
257 | spin_unlock(&inode->i_lock); | ||
258 | |||
259 | list_for_each_entry_safe(entry, lentry, &free_list, free_i_list) { | ||
260 | fsnotify_destroy_mark_by_entry(entry); | ||
261 | fsnotify_put_mark(entry); | ||
262 | } | ||
263 | } | ||
264 | |||
265 | /* | ||
266 | * Given a group and inode, find the mark associated with that combination. | ||
267 | * If found, take a reference to that mark and return it; else return NULL. Caller must hold inode->i_lock. | ||
268 | */ | ||
269 | struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group, | ||
270 | struct inode *inode) | ||
271 | { | ||
272 | struct fsnotify_mark_entry *entry; | ||
273 | struct hlist_node *pos; | ||
274 | |||
275 | assert_spin_locked(&inode->i_lock); | ||
276 | |||
277 | hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) { | ||
278 | if (entry->group == group) { | ||
279 | fsnotify_get_mark(entry); | ||
280 | return entry; | ||
281 | } | ||
282 | } | ||
283 | return NULL; | ||
284 | } | ||
285 | |||
286 | /* | ||
287 | * Nothing fancy, just initialize lists and locks and counters. | ||
288 | */ | ||
289 | void fsnotify_init_mark(struct fsnotify_mark_entry *entry, | ||
290 | void (*free_mark)(struct fsnotify_mark_entry *entry)) | ||
291 | |||
292 | { | ||
293 | spin_lock_init(&entry->lock); | ||
294 | atomic_set(&entry->refcnt, 1); | ||
295 | INIT_HLIST_NODE(&entry->i_list); | ||
296 | entry->group = NULL; | ||
297 | entry->mask = 0; | ||
298 | entry->inode = NULL; | ||
299 | entry->free_mark = free_mark; | ||
300 | } | ||
301 | |||
302 | /* | ||
303 | * Attach an initialized mark entry to a given group and inode. | ||
304 | * These marks may be used by the fsnotify backend to determine which | ||
305 | * event types should be delivered to which group and for which inodes. | ||
306 | */ | ||
307 | int fsnotify_add_mark(struct fsnotify_mark_entry *entry, | ||
308 | struct fsnotify_group *group, struct inode *inode) | ||
309 | { | ||
310 | struct fsnotify_mark_entry *lentry; | ||
311 | int ret = 0; | ||
312 | |||
313 | inode = igrab(inode); | ||
314 | if (unlikely(!inode)) | ||
315 | return -EINVAL; | ||
316 | |||
317 | /* | ||
318 | * LOCKING ORDER!!!! | ||
319 | * entry->lock | ||
320 | * group->mark_lock | ||
321 | * inode->i_lock | ||
322 | */ | ||
323 | spin_lock(&entry->lock); | ||
324 | spin_lock(&group->mark_lock); | ||
325 | spin_lock(&inode->i_lock); | ||
326 | |||
327 | entry->group = group; | ||
328 | entry->inode = inode; | ||
329 | |||
330 | lentry = fsnotify_find_mark_entry(group, inode); | ||
331 | if (!lentry) { | ||
332 | hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries); | ||
333 | list_add(&entry->g_list, &group->mark_entries); | ||
334 | |||
335 | fsnotify_get_mark(entry); /* for i_list and g_list */ | ||
336 | |||
337 | atomic_inc(&group->num_marks); | ||
338 | |||
339 | fsnotify_recalc_inode_mask_locked(inode); | ||
340 | } | ||
341 | |||
342 | spin_unlock(&inode->i_lock); | ||
343 | spin_unlock(&group->mark_lock); | ||
344 | spin_unlock(&entry->lock); | ||
345 | |||
346 | if (lentry) { | ||
347 | ret = -EEXIST; | ||
348 | iput(inode); | ||
349 | fsnotify_put_mark(lentry); | ||
350 | } else { | ||
351 | __fsnotify_update_child_dentry_flags(inode); | ||
352 | } | ||
353 | |||
354 | return ret; | ||
355 | } | ||
356 | |||
357 | /** | ||
358 | * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. | ||
359 | * @list: list of inodes being unmounted (sb->s_inodes) | ||
360 | * | ||
361 | * Called with inode_lock held, protecting the unmounting super block's list | ||
362 | * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay. | ||
363 | * We temporarily drop inode_lock, however, and CAN block. | ||
364 | */ | ||
365 | void fsnotify_unmount_inodes(struct list_head *list) | ||
366 | { | ||
367 | struct inode *inode, *next_i, *need_iput = NULL; | ||
368 | |||
369 | list_for_each_entry_safe(inode, next_i, list, i_sb_list) { | ||
370 | struct inode *need_iput_tmp; | ||
371 | |||
372 | /* | ||
373 | * We cannot __iget() an inode in state I_CLEAR, I_FREEING, | ||
374 | * I_WILL_FREE, or I_NEW which is fine because by that point | ||
375 | * the inode cannot have any associated watches. | ||
376 | */ | ||
377 | if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW)) | ||
378 | continue; | ||
379 | |||
380 | /* | ||
381 | * If i_count is zero, the inode cannot have any watches and | ||
382 | * doing an __iget/iput with MS_ACTIVE clear would actually | ||
383 | * evict all inodes with zero i_count from icache which is | ||
384 | * unnecessarily violent and may in fact be illegal to do. | ||
385 | */ | ||
386 | if (!atomic_read(&inode->i_count)) | ||
387 | continue; | ||
388 | |||
389 | need_iput_tmp = need_iput; | ||
390 | need_iput = NULL; | ||
391 | |||
392 | /* In case fsnotify_inode_delete() drops a reference. */ | ||
393 | if (inode != need_iput_tmp) | ||
394 | __iget(inode); | ||
395 | else | ||
396 | need_iput_tmp = NULL; | ||
397 | |||
398 | /* In case the dropping of a reference would nuke next_i. */ | ||
399 | if ((&next_i->i_sb_list != list) && | ||
400 | atomic_read(&next_i->i_count) && | ||
401 | !(next_i->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))) { | ||
402 | __iget(next_i); | ||
403 | need_iput = next_i; | ||
404 | } | ||
405 | |||
406 | /* | ||
407 | * We can safely drop inode_lock here because we hold | ||
408 | * references on both inode and next_i. Also no new inodes | ||
409 | * will be added since the umount has begun. Finally, | ||
410 | * iprune_mutex keeps shrink_icache_memory() away. | ||
411 | */ | ||
412 | spin_unlock(&inode_lock); | ||
413 | |||
414 | if (need_iput_tmp) | ||
415 | iput(need_iput_tmp); | ||
416 | |||
417 | /* for each watch, send FS_UNMOUNT and then remove it */ | ||
418 | fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0); | ||
419 | |||
420 | fsnotify_inode_delete(inode); | ||
421 | |||
422 | iput(inode); | ||
423 | |||
424 | spin_lock(&inode_lock); | ||
425 | } | ||
426 | } | ||
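
As an aside, a minimal sketch (not part of the patch) of how a backend is expected to use the mark lookup API above, assuming the usual <linux/fs.h> and <linux/fsnotify_backend.h> includes: fsnotify_find_mark_entry() must be called with inode->i_lock held and returns a referenced entry, which the caller drops with fsnotify_put_mark(). The helper name example_mark_mask is hypothetical and simply mirrors the pattern inotify_should_send_event() uses elsewhere in this patch.

/* Illustrative only: look up our group's mark on an inode and read its mask. */
static __u32 example_mark_mask(struct fsnotify_group *group, struct inode *inode)
{
	struct fsnotify_mark_entry *entry;
	__u32 mask = 0;

	spin_lock(&inode->i_lock);			/* find must be called under i_lock */
	entry = fsnotify_find_mark_entry(group, inode);	/* takes a reference if found */
	spin_unlock(&inode->i_lock);

	if (entry) {
		mask = entry->mask;
		fsnotify_put_mark(entry);		/* drop the reference find took */
	}
	return mask;
}
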
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig index 446792841023..5356884289a1 100644 --- a/fs/notify/inotify/Kconfig +++ b/fs/notify/inotify/Kconfig | |||
@@ -1,26 +1,30 @@ | |||
1 | config INOTIFY | 1 | config INOTIFY |
2 | bool "Inotify file change notification support" | 2 | bool "Inotify file change notification support" |
3 | default y | 3 | default n |
4 | ---help--- | 4 | ---help--- |
5 | Say Y here to enable inotify support. Inotify is a file change | 5 | Say Y here to enable legacy in-kernel inotify support. Inotify is a |
6 | notification system and a replacement for dnotify. Inotify fixes | 6 | file change notification system. It is a replacement for dnotify. |
7 | numerous shortcomings in dnotify and introduces several new features | 7 | This option only provides the legacy in-kernel inotify API. There |
8 | including multiple file events, one-shot support, and unmount | 8 | are no in-tree kernel users of this interface since it is deprecated. |
9 | notification. | 9 | You only need this if you are loading an out-of-tree kernel module |
10 | that uses inotify. | ||
10 | 11 | ||
11 | For more information, see <file:Documentation/filesystems/inotify.txt> | 12 | For more information, see <file:Documentation/filesystems/inotify.txt> |
12 | 13 | ||
13 | If unsure, say Y. | 14 | If unsure, say N. |
14 | 15 | ||
15 | config INOTIFY_USER | 16 | config INOTIFY_USER |
16 | bool "Inotify support for userspace" | 17 | bool "Inotify support for userspace" |
17 | depends on INOTIFY | 18 | depends on FSNOTIFY |
18 | default y | 19 | default y |
19 | ---help--- | 20 | ---help--- |
20 | Say Y here to enable inotify support for userspace, including the | 21 | Say Y here to enable inotify support for userspace, including the |
21 | associated system calls. Inotify allows monitoring of both files and | 22 | associated system calls. Inotify allows monitoring of both files and |
22 | directories via a single open fd. Events are read from the file | 23 | directories via a single open fd. Events are read from the file |
23 | descriptor, which is also select()- and poll()-able. | 24 | descriptor, which is also select()- and poll()-able. |
25 | Inotify fixes numerous shortcomings in dnotify and introduces several | ||
26 | new features including multiple file events, one-shot support, and | ||
27 | unmount notification. | ||
24 | 28 | ||
25 | For more information, see <file:Documentation/filesystems/inotify.txt> | 29 | For more information, see <file:Documentation/filesystems/inotify.txt> |
26 | 30 | ||
diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile index e290f3bb9d8d..943828171362 100644 --- a/fs/notify/inotify/Makefile +++ b/fs/notify/inotify/Makefile | |||
@@ -1,2 +1,2 @@ | |||
1 | obj-$(CONFIG_INOTIFY) += inotify.o | 1 | obj-$(CONFIG_INOTIFY) += inotify.o |
2 | obj-$(CONFIG_INOTIFY_USER) += inotify_user.o | 2 | obj-$(CONFIG_INOTIFY_USER) += inotify_fsnotify.o inotify_user.o |
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c index 220c13f0d73d..40b1cf914ccb 100644 --- a/fs/notify/inotify/inotify.c +++ b/fs/notify/inotify/inotify.c | |||
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/list.h> | 32 | #include <linux/list.h> |
33 | #include <linux/writeback.h> | 33 | #include <linux/writeback.h> |
34 | #include <linux/inotify.h> | 34 | #include <linux/inotify.h> |
35 | #include <linux/fsnotify_backend.h> | ||
35 | 36 | ||
36 | static atomic_t inotify_cookie; | 37 | static atomic_t inotify_cookie; |
37 | 38 | ||
@@ -905,6 +906,25 @@ EXPORT_SYMBOL_GPL(inotify_rm_watch); | |||
905 | */ | 906 | */ |
906 | static int __init inotify_setup(void) | 907 | static int __init inotify_setup(void) |
907 | { | 908 | { |
909 | BUILD_BUG_ON(IN_ACCESS != FS_ACCESS); | ||
910 | BUILD_BUG_ON(IN_MODIFY != FS_MODIFY); | ||
911 | BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB); | ||
912 | BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE); | ||
913 | BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); | ||
914 | BUILD_BUG_ON(IN_OPEN != FS_OPEN); | ||
915 | BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM); | ||
916 | BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO); | ||
917 | BUILD_BUG_ON(IN_CREATE != FS_CREATE); | ||
918 | BUILD_BUG_ON(IN_DELETE != FS_DELETE); | ||
919 | BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF); | ||
920 | BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF); | ||
921 | BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW); | ||
922 | |||
923 | BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT); | ||
924 | BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR); | ||
925 | BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED); | ||
926 | BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT); | ||
927 | |||
908 | atomic_set(&inotify_cookie, 0); | 928 | atomic_set(&inotify_cookie, 0); |
909 | 929 | ||
910 | return 0; | 930 | return 0; |
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h new file mode 100644 index 000000000000..ea2605a58b8a --- /dev/null +++ b/fs/notify/inotify/inotify.h | |||
@@ -0,0 +1,21 @@ | |||
1 | #include <linux/fsnotify_backend.h> | ||
2 | #include <linux/inotify.h> | ||
3 | #include <linux/slab.h> /* struct kmem_cache */ | ||
4 | |||
5 | extern struct kmem_cache *event_priv_cachep; | ||
6 | |||
7 | struct inotify_event_private_data { | ||
8 | struct fsnotify_event_private_data fsnotify_event_priv_data; | ||
9 | int wd; | ||
10 | }; | ||
11 | |||
12 | struct inotify_inode_mark_entry { | ||
13 | /* fsnotify_mark_entry MUST be the first thing */ | ||
14 | struct fsnotify_mark_entry fsn_entry; | ||
15 | int wd; | ||
16 | }; | ||
17 | |||
18 | extern void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group); | ||
19 | extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv); | ||
20 | |||
21 | extern const struct fsnotify_ops inotify_fsnotify_ops; | ||
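
A short sketch (not part of the patch) of why the header above insists that fsn_entry be the first member: generic fsnotify code only ever hands back a struct fsnotify_mark_entry *, and inotify recovers its wrapper either with container_of(), which works regardless of member position, or with the plain cast used in inotify_free_mark(), which only works because fsn_entry sits at offset zero. The helper name below is hypothetical.

/* Illustrative only: recover the inotify wrapper from the generic mark. */
static inline struct inotify_inode_mark_entry *
example_to_ientry(struct fsnotify_mark_entry *entry)
{
	return container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
}
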
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c new file mode 100644 index 000000000000..7ef75b83247e --- /dev/null +++ b/fs/notify/inotify/inotify_fsnotify.c | |||
@@ -0,0 +1,138 @@ | |||
1 | /* | ||
2 | * fs/notify/inotify/inotify_fsnotify.c - fsnotify backend for inotify | ||
3 | * | ||
4 | * Authors: | ||
5 | * John McCutchan <ttb@tentacle.dhs.org> | ||
6 | * Robert Love <rml@novell.com> | ||
7 | * | ||
8 | * Copyright (C) 2005 John McCutchan | ||
9 | * Copyright 2006 Hewlett-Packard Development Company, L.P. | ||
10 | * | ||
11 | * Copyright (C) 2009 Eric Paris <Red Hat Inc> | ||
12 | * inotify was largely rewritten to make use of the fsnotify infrastructure | ||
13 | * | ||
14 | * This program is free software; you can redistribute it and/or modify it | ||
15 | * under the terms of the GNU General Public License as published by the | ||
16 | * Free Software Foundation; either version 2, or (at your option) any | ||
17 | * later version. | ||
18 | * | ||
19 | * This program is distributed in the hope that it will be useful, but | ||
20 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
22 | * General Public License for more details. | ||
23 | */ | ||
24 | |||
25 | #include <linux/fs.h> /* struct inode */ | ||
26 | #include <linux/fsnotify_backend.h> | ||
27 | #include <linux/inotify.h> | ||
28 | #include <linux/path.h> /* struct path */ | ||
29 | #include <linux/slab.h> /* kmem_* */ | ||
30 | #include <linux/types.h> | ||
31 | |||
32 | #include "inotify.h" | ||
33 | |||
34 | static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_event *event) | ||
35 | { | ||
36 | struct fsnotify_mark_entry *entry; | ||
37 | struct inotify_inode_mark_entry *ientry; | ||
38 | struct inode *to_tell; | ||
39 | struct inotify_event_private_data *event_priv; | ||
40 | struct fsnotify_event_private_data *fsn_event_priv; | ||
41 | int wd, ret; | ||
42 | |||
43 | to_tell = event->to_tell; | ||
44 | |||
45 | spin_lock(&to_tell->i_lock); | ||
46 | entry = fsnotify_find_mark_entry(group, to_tell); | ||
47 | spin_unlock(&to_tell->i_lock); | ||
48 | /* race with watch removal? We already passed should_send */ | ||
49 | if (unlikely(!entry)) | ||
50 | return 0; | ||
51 | ientry = container_of(entry, struct inotify_inode_mark_entry, | ||
52 | fsn_entry); | ||
53 | wd = ientry->wd; | ||
54 | |||
55 | event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); | ||
56 | if (unlikely(!event_priv)) | ||
57 | return -ENOMEM; | ||
58 | |||
59 | fsn_event_priv = &event_priv->fsnotify_event_priv_data; | ||
60 | |||
61 | fsn_event_priv->group = group; | ||
62 | event_priv->wd = wd; | ||
63 | |||
64 | ret = fsnotify_add_notify_event(group, event, fsn_event_priv); | ||
65 | /* EEXIST is not an error */ | ||
66 | if (ret == -EEXIST) | ||
67 | ret = 0; | ||
68 | |||
69 | /* did event_priv get attached? */ | ||
70 | if (list_empty(&fsn_event_priv->event_list)) | ||
71 | inotify_free_event_priv(fsn_event_priv); | ||
72 | |||
73 | /* | ||
74 | * If we hold the entry until after the event is on the queue, | ||
75 | * IN_IGNORED won't be able to overtake this event in the queue. | ||
76 | */ | ||
77 | fsnotify_put_mark(entry); | ||
78 | |||
79 | return ret; | ||
80 | } | ||
81 | |||
82 | static void inotify_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group) | ||
83 | { | ||
84 | inotify_destroy_mark_entry(entry, group); | ||
85 | } | ||
86 | |||
87 | static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, __u32 mask) | ||
88 | { | ||
89 | struct fsnotify_mark_entry *entry; | ||
90 | bool send; | ||
91 | |||
92 | spin_lock(&inode->i_lock); | ||
93 | entry = fsnotify_find_mark_entry(group, inode); | ||
94 | spin_unlock(&inode->i_lock); | ||
95 | if (!entry) | ||
96 | return false; | ||
97 | |||
98 | mask = (mask & ~FS_EVENT_ON_CHILD); | ||
99 | send = (entry->mask & mask); | ||
100 | |||
101 | /* find took a reference */ | ||
102 | fsnotify_put_mark(entry); | ||
103 | |||
104 | return send; | ||
105 | } | ||
106 | |||
107 | static int idr_callback(int id, void *p, void *data) | ||
108 | { | ||
109 | BUG(); | ||
110 | return 0; | ||
111 | } | ||
112 | |||
113 | static void inotify_free_group_priv(struct fsnotify_group *group) | ||
114 | { | ||
115 | /* ideally the idr is empty and we won't hit the BUG in the callback */ | ||
116 | idr_for_each(&group->inotify_data.idr, idr_callback, NULL); | ||
117 | idr_remove_all(&group->inotify_data.idr); | ||
118 | idr_destroy(&group->inotify_data.idr); | ||
119 | } | ||
120 | |||
121 | void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv) | ||
122 | { | ||
123 | struct inotify_event_private_data *event_priv; | ||
124 | |||
125 | |||
126 | event_priv = container_of(fsn_event_priv, struct inotify_event_private_data, | ||
127 | fsnotify_event_priv_data); | ||
128 | |||
129 | kmem_cache_free(event_priv_cachep, event_priv); | ||
130 | } | ||
131 | |||
132 | const struct fsnotify_ops inotify_fsnotify_ops = { | ||
133 | .handle_event = inotify_handle_event, | ||
134 | .should_send_event = inotify_should_send_event, | ||
135 | .free_group_priv = inotify_free_group_priv, | ||
136 | .free_event_priv = inotify_free_event_priv, | ||
137 | .freeing_mark = inotify_freeing_mark, | ||
138 | }; | ||
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 1634319e2404..982a412ac5bc 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c | |||
@@ -8,6 +8,9 @@ | |||
8 | * Copyright (C) 2005 John McCutchan | 8 | * Copyright (C) 2005 John McCutchan |
9 | * Copyright 2006 Hewlett-Packard Development Company, L.P. | 9 | * Copyright 2006 Hewlett-Packard Development Company, L.P. |
10 | * | 10 | * |
11 | * Copyright (C) 2009 Eric Paris <Red Hat Inc> | ||
12 | * inotify was largely rewritten to make use of the fsnotify infrastructure | ||
13 | * | ||
11 | * This program is free software; you can redistribute it and/or modify it | 14 | * This program is free software; you can redistribute it and/or modify it |
12 | * under the terms of the GNU General Public License as published by the | 15 | * under the terms of the GNU General Public License as published by the |
13 | * Free Software Foundation; either version 2, or (at your option) any | 16 | * Free Software Foundation; either version 2, or (at your option) any |
@@ -19,94 +22,48 @@ | |||
19 | * General Public License for more details. | 22 | * General Public License for more details. |
20 | */ | 23 | */ |
21 | 24 | ||
22 | #include <linux/kernel.h> | ||
23 | #include <linux/sched.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/fs.h> | ||
26 | #include <linux/file.h> | 25 | #include <linux/file.h> |
27 | #include <linux/mount.h> | 26 | #include <linux/fs.h> /* struct inode */ |
28 | #include <linux/namei.h> | 27 | #include <linux/fsnotify_backend.h> |
29 | #include <linux/poll.h> | 28 | #include <linux/idr.h> |
30 | #include <linux/init.h> | 29 | #include <linux/init.h> /* module_init */ |
31 | #include <linux/list.h> | ||
32 | #include <linux/inotify.h> | 30 | #include <linux/inotify.h> |
31 | #include <linux/kernel.h> /* roundup() */ | ||
32 | #include <linux/magic.h> /* superblock magic number */ | ||
33 | #include <linux/mount.h> /* mntget */ | ||
34 | #include <linux/namei.h> /* LOOKUP_FOLLOW */ | ||
35 | #include <linux/path.h> /* struct path */ | ||
36 | #include <linux/sched.h> /* struct user */ | ||
37 | #include <linux/slab.h> /* struct kmem_cache */ | ||
33 | #include <linux/syscalls.h> | 38 | #include <linux/syscalls.h> |
34 | #include <linux/magic.h> | 39 | #include <linux/types.h> |
40 | #include <linux/uaccess.h> | ||
41 | #include <linux/poll.h> | ||
42 | #include <linux/wait.h> | ||
35 | 43 | ||
36 | #include <asm/ioctls.h> | 44 | #include "inotify.h" |
37 | 45 | ||
38 | static struct kmem_cache *watch_cachep __read_mostly; | 46 | #include <asm/ioctls.h> |
39 | static struct kmem_cache *event_cachep __read_mostly; | ||
40 | 47 | ||
41 | static struct vfsmount *inotify_mnt __read_mostly; | 48 | static struct vfsmount *inotify_mnt __read_mostly; |
42 | 49 | ||
50 | /* this just sits here and wastes global memory; it is only used to pad userspace messages with zeros */ | ||
51 | static struct inotify_event nul_inotify_event; | ||
52 | |||
43 | /* these are configurable via /proc/sys/fs/inotify/ */ | 53 | /* these are configurable via /proc/sys/fs/inotify/ */ |
44 | static int inotify_max_user_instances __read_mostly; | 54 | static int inotify_max_user_instances __read_mostly; |
45 | static int inotify_max_user_watches __read_mostly; | ||
46 | static int inotify_max_queued_events __read_mostly; | 55 | static int inotify_max_queued_events __read_mostly; |
56 | int inotify_max_user_watches __read_mostly; | ||
47 | 57 | ||
48 | /* | 58 | static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; |
49 | * Lock ordering: | 59 | struct kmem_cache *event_priv_cachep __read_mostly; |
50 | * | 60 | static struct fsnotify_event *inotify_ignored_event; |
51 | * inotify_dev->up_mutex (ensures we don't re-add the same watch) | ||
52 | * inode->inotify_mutex (protects inode's watch list) | ||
53 | * inotify_handle->mutex (protects inotify_handle's watch list) | ||
54 | * inotify_dev->ev_mutex (protects device's event queue) | ||
55 | */ | ||
56 | 61 | ||
57 | /* | 62 | /* |
58 | * Lifetimes of the main data structures: | 63 | * When inotify registers a new group it increments this and uses that |
59 | * | 64 | * value as an offset to set the fsnotify group "name" and priority. |
60 | * inotify_device: Lifetime is managed by reference count, from | ||
61 | * sys_inotify_init() until release. Additional references can bump the count | ||
62 | * via get_inotify_dev() and drop the count via put_inotify_dev(). | ||
63 | * | ||
64 | * inotify_user_watch: Lifetime is from create_watch() to the receipt of an | ||
65 | * IN_IGNORED event from inotify, or when using IN_ONESHOT, to receipt of the | ||
66 | * first event, or to inotify_destroy(). | ||
67 | */ | 65 | */ |
68 | 66 | static atomic_t inotify_grp_num; | |
69 | /* | ||
70 | * struct inotify_device - represents an inotify instance | ||
71 | * | ||
72 | * This structure is protected by the mutex 'mutex'. | ||
73 | */ | ||
74 | struct inotify_device { | ||
75 | wait_queue_head_t wq; /* wait queue for i/o */ | ||
76 | struct mutex ev_mutex; /* protects event queue */ | ||
77 | struct mutex up_mutex; /* synchronizes watch updates */ | ||
78 | struct list_head events; /* list of queued events */ | ||
79 | struct user_struct *user; /* user who opened this dev */ | ||
80 | struct inotify_handle *ih; /* inotify handle */ | ||
81 | struct fasync_struct *fa; /* async notification */ | ||
82 | atomic_t count; /* reference count */ | ||
83 | unsigned int queue_size; /* size of the queue (bytes) */ | ||
84 | unsigned int event_count; /* number of pending events */ | ||
85 | unsigned int max_events; /* maximum number of events */ | ||
86 | }; | ||
87 | |||
88 | /* | ||
89 | * struct inotify_kernel_event - An inotify event, originating from a watch and | ||
90 | * queued for user-space. A list of these is attached to each instance of the | ||
91 | * device. In read(), this list is walked and all events that can fit in the | ||
92 | * buffer are returned. | ||
93 | * | ||
94 | * Protected by dev->ev_mutex of the device in which we are queued. | ||
95 | */ | ||
96 | struct inotify_kernel_event { | ||
97 | struct inotify_event event; /* the user-space event */ | ||
98 | struct list_head list; /* entry in inotify_device's list */ | ||
99 | char *name; /* filename, if any */ | ||
100 | }; | ||
101 | |||
102 | /* | ||
103 | * struct inotify_user_watch - our version of an inotify_watch, we add | ||
104 | * a reference to the associated inotify_device. | ||
105 | */ | ||
106 | struct inotify_user_watch { | ||
107 | struct inotify_device *dev; /* associated device */ | ||
108 | struct inotify_watch wdata; /* inotify watch data */ | ||
109 | }; | ||
110 | 67 | ||
111 | #ifdef CONFIG_SYSCTL | 68 | #ifdef CONFIG_SYSCTL |
112 | 69 | ||
@@ -149,280 +106,36 @@ ctl_table inotify_table[] = { | |||
149 | }; | 106 | }; |
150 | #endif /* CONFIG_SYSCTL */ | 107 | #endif /* CONFIG_SYSCTL */ |
151 | 108 | ||
152 | static inline void get_inotify_dev(struct inotify_device *dev) | 109 | static inline __u32 inotify_arg_to_mask(u32 arg) |
153 | { | ||
154 | atomic_inc(&dev->count); | ||
155 | } | ||
156 | |||
157 | static inline void put_inotify_dev(struct inotify_device *dev) | ||
158 | { | ||
159 | if (atomic_dec_and_test(&dev->count)) { | ||
160 | atomic_dec(&dev->user->inotify_devs); | ||
161 | free_uid(dev->user); | ||
162 | kfree(dev); | ||
163 | } | ||
164 | } | ||
165 | |||
166 | /* | ||
167 | * free_inotify_user_watch - cleans up the watch and its references | ||
168 | */ | ||
169 | static void free_inotify_user_watch(struct inotify_watch *w) | ||
170 | { | ||
171 | struct inotify_user_watch *watch; | ||
172 | struct inotify_device *dev; | ||
173 | |||
174 | watch = container_of(w, struct inotify_user_watch, wdata); | ||
175 | dev = watch->dev; | ||
176 | |||
177 | atomic_dec(&dev->user->inotify_watches); | ||
178 | put_inotify_dev(dev); | ||
179 | kmem_cache_free(watch_cachep, watch); | ||
180 | } | ||
181 | |||
182 | /* | ||
183 | * kernel_event - create a new kernel event with the given parameters | ||
184 | * | ||
185 | * This function can sleep. | ||
186 | */ | ||
187 | static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie, | ||
188 | const char *name) | ||
189 | { | ||
190 | struct inotify_kernel_event *kevent; | ||
191 | |||
192 | kevent = kmem_cache_alloc(event_cachep, GFP_NOFS); | ||
193 | if (unlikely(!kevent)) | ||
194 | return NULL; | ||
195 | |||
196 | /* we hand this out to user-space, so zero it just in case */ | ||
197 | memset(&kevent->event, 0, sizeof(struct inotify_event)); | ||
198 | |||
199 | kevent->event.wd = wd; | ||
200 | kevent->event.mask = mask; | ||
201 | kevent->event.cookie = cookie; | ||
202 | |||
203 | INIT_LIST_HEAD(&kevent->list); | ||
204 | |||
205 | if (name) { | ||
206 | size_t len, rem, event_size = sizeof(struct inotify_event); | ||
207 | |||
208 | /* | ||
209 | * We need to pad the filename so as to properly align an | ||
210 | * array of inotify_event structures. Because the structure is | ||
211 | * small and the common case is a small filename, we just round | ||
212 | * up to the next multiple of the structure's sizeof. This is | ||
213 | * simple and safe for all architectures. | ||
214 | */ | ||
215 | len = strlen(name) + 1; | ||
216 | rem = event_size - len; | ||
217 | if (len > event_size) { | ||
218 | rem = event_size - (len % event_size); | ||
219 | if (len % event_size == 0) | ||
220 | rem = 0; | ||
221 | } | ||
222 | |||
223 | kevent->name = kmalloc(len + rem, GFP_NOFS); | ||
224 | if (unlikely(!kevent->name)) { | ||
225 | kmem_cache_free(event_cachep, kevent); | ||
226 | return NULL; | ||
227 | } | ||
228 | memcpy(kevent->name, name, len); | ||
229 | if (rem) | ||
230 | memset(kevent->name + len, 0, rem); | ||
231 | kevent->event.len = len + rem; | ||
232 | } else { | ||
233 | kevent->event.len = 0; | ||
234 | kevent->name = NULL; | ||
235 | } | ||
236 | |||
237 | return kevent; | ||
238 | } | ||
239 | |||
240 | /* | ||
241 | * inotify_dev_get_event - return the next event in the given dev's queue | ||
242 | * | ||
243 | * Caller must hold dev->ev_mutex. | ||
244 | */ | ||
245 | static inline struct inotify_kernel_event * | ||
246 | inotify_dev_get_event(struct inotify_device *dev) | ||
247 | { | ||
248 | return list_entry(dev->events.next, struct inotify_kernel_event, list); | ||
249 | } | ||
250 | |||
251 | /* | ||
252 | * inotify_dev_get_last_event - return the last event in the given dev's queue | ||
253 | * | ||
254 | * Caller must hold dev->ev_mutex. | ||
255 | */ | ||
256 | static inline struct inotify_kernel_event * | ||
257 | inotify_dev_get_last_event(struct inotify_device *dev) | ||
258 | { | 110 | { |
259 | if (list_empty(&dev->events)) | 111 | __u32 mask; |
260 | return NULL; | ||
261 | return list_entry(dev->events.prev, struct inotify_kernel_event, list); | ||
262 | } | ||
263 | 112 | ||
264 | /* | 113 | /* everything should accept their own ignored and cares about children */ |
265 | * inotify_dev_queue_event - event handler registered with core inotify, adds | 114 | mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD); |
266 | * a new event to the given device | ||
267 | * | ||
268 | * Can sleep (calls kernel_event()). | ||
269 | */ | ||
270 | static void inotify_dev_queue_event(struct inotify_watch *w, u32 wd, u32 mask, | ||
271 | u32 cookie, const char *name, | ||
272 | struct inode *ignored) | ||
273 | { | ||
274 | struct inotify_user_watch *watch; | ||
275 | struct inotify_device *dev; | ||
276 | struct inotify_kernel_event *kevent, *last; | ||
277 | 115 | ||
278 | watch = container_of(w, struct inotify_user_watch, wdata); | 116 | /* mask off the flags used to open the fd */ |
279 | dev = watch->dev; | 117 | mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT)); |
280 | 118 | ||
281 | mutex_lock(&dev->ev_mutex); | 119 | return mask; |
282 | |||
283 | /* we can safely put the watch as we don't reference it while | ||
284 | * generating the event | ||
285 | */ | ||
286 | if (mask & IN_IGNORED || w->mask & IN_ONESHOT) | ||
287 | put_inotify_watch(w); /* final put */ | ||
288 | |||
289 | /* coalescing: drop this event if it is a dupe of the previous */ | ||
290 | last = inotify_dev_get_last_event(dev); | ||
291 | if (last && last->event.mask == mask && last->event.wd == wd && | ||
292 | last->event.cookie == cookie) { | ||
293 | const char *lastname = last->name; | ||
294 | |||
295 | if (!name && !lastname) | ||
296 | goto out; | ||
297 | if (name && lastname && !strcmp(lastname, name)) | ||
298 | goto out; | ||
299 | } | ||
300 | |||
301 | /* the queue overflowed and we already sent the Q_OVERFLOW event */ | ||
302 | if (unlikely(dev->event_count > dev->max_events)) | ||
303 | goto out; | ||
304 | |||
305 | /* if the queue overflows, we need to notify user space */ | ||
306 | if (unlikely(dev->event_count == dev->max_events)) | ||
307 | kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL); | ||
308 | else | ||
309 | kevent = kernel_event(wd, mask, cookie, name); | ||
310 | |||
311 | if (unlikely(!kevent)) | ||
312 | goto out; | ||
313 | |||
314 | /* queue the event and wake up anyone waiting */ | ||
315 | dev->event_count++; | ||
316 | dev->queue_size += sizeof(struct inotify_event) + kevent->event.len; | ||
317 | list_add_tail(&kevent->list, &dev->events); | ||
318 | wake_up_interruptible(&dev->wq); | ||
319 | kill_fasync(&dev->fa, SIGIO, POLL_IN); | ||
320 | |||
321 | out: | ||
322 | mutex_unlock(&dev->ev_mutex); | ||
323 | } | ||
324 | |||
325 | /* | ||
326 | * remove_kevent - cleans up the given kevent | ||
327 | * | ||
328 | * Caller must hold dev->ev_mutex. | ||
329 | */ | ||
330 | static void remove_kevent(struct inotify_device *dev, | ||
331 | struct inotify_kernel_event *kevent) | ||
332 | { | ||
333 | list_del(&kevent->list); | ||
334 | |||
335 | dev->event_count--; | ||
336 | dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len; | ||
337 | } | ||
338 | |||
339 | /* | ||
340 | * free_kevent - frees the given kevent. | ||
341 | */ | ||
342 | static void free_kevent(struct inotify_kernel_event *kevent) | ||
343 | { | ||
344 | kfree(kevent->name); | ||
345 | kmem_cache_free(event_cachep, kevent); | ||
346 | } | ||
347 | |||
348 | /* | ||
349 | * inotify_dev_event_dequeue - destroy an event on the given device | ||
350 | * | ||
351 | * Caller must hold dev->ev_mutex. | ||
352 | */ | ||
353 | static void inotify_dev_event_dequeue(struct inotify_device *dev) | ||
354 | { | ||
355 | if (!list_empty(&dev->events)) { | ||
356 | struct inotify_kernel_event *kevent; | ||
357 | kevent = inotify_dev_get_event(dev); | ||
358 | remove_kevent(dev, kevent); | ||
359 | free_kevent(kevent); | ||
360 | } | ||
361 | } | ||
362 | |||
363 | /* | ||
364 | * find_inode - resolve a user-given path to a specific inode | ||
365 | */ | ||
366 | static int find_inode(const char __user *dirname, struct path *path, | ||
367 | unsigned flags) | ||
368 | { | ||
369 | int error; | ||
370 | |||
371 | error = user_path_at(AT_FDCWD, dirname, flags, path); | ||
372 | if (error) | ||
373 | return error; | ||
374 | /* you can only watch an inode if you have read permissions on it */ | ||
375 | error = inode_permission(path->dentry->d_inode, MAY_READ); | ||
376 | if (error) | ||
377 | path_put(path); | ||
378 | return error; | ||
379 | } | 120 | } |
380 | 121 | ||
381 | /* | 122 | static inline u32 inotify_mask_to_arg(__u32 mask) |
382 | * create_watch - creates a watch on the given device. | ||
383 | * | ||
384 | * Callers must hold dev->up_mutex. | ||
385 | */ | ||
386 | static int create_watch(struct inotify_device *dev, struct inode *inode, | ||
387 | u32 mask) | ||
388 | { | 123 | { |
389 | struct inotify_user_watch *watch; | 124 | return mask & (IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT | IN_IGNORED | |
390 | int ret; | 125 | IN_Q_OVERFLOW); |
391 | |||
392 | if (atomic_read(&dev->user->inotify_watches) >= | ||
393 | inotify_max_user_watches) | ||
394 | return -ENOSPC; | ||
395 | |||
396 | watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL); | ||
397 | if (unlikely(!watch)) | ||
398 | return -ENOMEM; | ||
399 | |||
400 | /* save a reference to device and bump the count to make it official */ | ||
401 | get_inotify_dev(dev); | ||
402 | watch->dev = dev; | ||
403 | |||
404 | atomic_inc(&dev->user->inotify_watches); | ||
405 | |||
406 | inotify_init_watch(&watch->wdata); | ||
407 | ret = inotify_add_watch(dev->ih, &watch->wdata, inode, mask); | ||
408 | if (ret < 0) | ||
409 | free_inotify_user_watch(&watch->wdata); | ||
410 | |||
411 | return ret; | ||
412 | } | 126 | } |
413 | 127 | ||
414 | /* Device Interface */ | 128 | /* inotify userspace file descriptor functions */ |
415 | |||
416 | static unsigned int inotify_poll(struct file *file, poll_table *wait) | 129 | static unsigned int inotify_poll(struct file *file, poll_table *wait) |
417 | { | 130 | { |
418 | struct inotify_device *dev = file->private_data; | 131 | struct fsnotify_group *group = file->private_data; |
419 | int ret = 0; | 132 | int ret = 0; |
420 | 133 | ||
421 | poll_wait(file, &dev->wq, wait); | 134 | poll_wait(file, &group->notification_waitq, wait); |
422 | mutex_lock(&dev->ev_mutex); | 135 | mutex_lock(&group->notification_mutex); |
423 | if (!list_empty(&dev->events)) | 136 | if (!fsnotify_notify_queue_is_empty(group)) |
424 | ret = POLLIN | POLLRDNORM; | 137 | ret = POLLIN | POLLRDNORM; |
425 | mutex_unlock(&dev->ev_mutex); | 138 | mutex_unlock(&group->notification_mutex); |
426 | 139 | ||
427 | return ret; | 140 | return ret; |
428 | } | 141 | } |
@@ -432,26 +145,29 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait) | |||
432 | * enough to fit in "count". Return an error pointer if | 145 | * enough to fit in "count". Return an error pointer if |
433 | * not large enough. | 146 | * not large enough. |
434 | * | 147 | * |
435 | * Called with the device ev_mutex held. | 148 | * Called with the group->notification_mutex held. |
436 | */ | 149 | */ |
437 | static struct inotify_kernel_event *get_one_event(struct inotify_device *dev, | 150 | static struct fsnotify_event *get_one_event(struct fsnotify_group *group, |
438 | size_t count) | 151 | size_t count) |
439 | { | 152 | { |
440 | size_t event_size = sizeof(struct inotify_event); | 153 | size_t event_size = sizeof(struct inotify_event); |
441 | struct inotify_kernel_event *kevent; | 154 | struct fsnotify_event *event; |
442 | 155 | ||
443 | if (list_empty(&dev->events)) | 156 | if (fsnotify_notify_queue_is_empty(group)) |
444 | return NULL; | 157 | return NULL; |
445 | 158 | ||
446 | kevent = inotify_dev_get_event(dev); | 159 | event = fsnotify_peek_notify_event(group); |
447 | if (kevent->name) | 160 | |
448 | event_size += kevent->event.len; | 161 | event_size += roundup(event->name_len, event_size); |
449 | 162 | ||
450 | if (event_size > count) | 163 | if (event_size > count) |
451 | return ERR_PTR(-EINVAL); | 164 | return ERR_PTR(-EINVAL); |
452 | 165 | ||
453 | remove_kevent(dev, kevent); | 166 | /* held the notification_mutex the whole time, so this is the |
454 | return kevent; | 167 | * same event we peeked above */ |
168 | fsnotify_remove_notify_event(group); | ||
169 | |||
170 | return event; | ||
455 | } | 171 | } |
456 | 172 | ||
457 | /* | 173 | /* |
@@ -460,51 +176,90 @@ static struct inotify_kernel_event *get_one_event(struct inotify_device *dev, | |||
460 | * We already checked that the event size is smaller than the | 176 | * We already checked that the event size is smaller than the |
461 | * buffer we had in "get_one_event()" above. | 177 | * buffer we had in "get_one_event()" above. |
462 | */ | 178 | */ |
463 | static ssize_t copy_event_to_user(struct inotify_kernel_event *kevent, | 179 | static ssize_t copy_event_to_user(struct fsnotify_group *group, |
180 | struct fsnotify_event *event, | ||
464 | char __user *buf) | 181 | char __user *buf) |
465 | { | 182 | { |
183 | struct inotify_event inotify_event; | ||
184 | struct fsnotify_event_private_data *fsn_priv; | ||
185 | struct inotify_event_private_data *priv; | ||
466 | size_t event_size = sizeof(struct inotify_event); | 186 | size_t event_size = sizeof(struct inotify_event); |
187 | size_t name_len; | ||
188 | |||
189 | /* we get the inotify watch descriptor from the event private data */ | ||
190 | spin_lock(&event->lock); | ||
191 | fsn_priv = fsnotify_remove_priv_from_event(group, event); | ||
192 | spin_unlock(&event->lock); | ||
193 | |||
194 | if (!fsn_priv) | ||
195 | inotify_event.wd = -1; | ||
196 | else { | ||
197 | priv = container_of(fsn_priv, struct inotify_event_private_data, | ||
198 | fsnotify_event_priv_data); | ||
199 | inotify_event.wd = priv->wd; | ||
200 | inotify_free_event_priv(fsn_priv); | ||
201 | } | ||
202 | |||
203 | /* round up event->name_len so it is a multiple of event_size */ | ||
204 | name_len = roundup(event->name_len, event_size); | ||
205 | inotify_event.len = name_len; | ||
206 | |||
207 | inotify_event.mask = inotify_mask_to_arg(event->mask); | ||
208 | inotify_event.cookie = event->sync_cookie; | ||
467 | 209 | ||
468 | if (copy_to_user(buf, &kevent->event, event_size)) | 210 | /* send the main event */ |
211 | if (copy_to_user(buf, &inotify_event, event_size)) | ||
469 | return -EFAULT; | 212 | return -EFAULT; |
470 | 213 | ||
471 | if (kevent->name) { | 214 | buf += event_size; |
472 | buf += event_size; | ||
473 | 215 | ||
474 | if (copy_to_user(buf, kevent->name, kevent->event.len)) | 216 | /* |
217 | * fsnotify only stores the pathname, so here we have to send the pathname | ||
218 | * and then pad that pathname out to a multiple of sizeof(inotify_event) | ||
219 | * with zeros. I get my zeros from the nul_inotify_event. | ||
220 | */ | ||
221 | if (name_len) { | ||
222 | unsigned int len_to_zero = name_len - event->name_len; | ||
223 | /* copy the path name */ | ||
224 | if (copy_to_user(buf, event->file_name, event->name_len)) | ||
475 | return -EFAULT; | 225 | return -EFAULT; |
226 | buf += event->name_len; | ||
476 | 227 | ||
477 | event_size += kevent->event.len; | 228 | /* fill userspace with 0's from nul_inotify_event */ |
229 | if (copy_to_user(buf, &nul_inotify_event, len_to_zero)) | ||
230 | return -EFAULT; | ||
231 | buf += len_to_zero; | ||
232 | event_size += name_len; | ||
478 | } | 233 | } |
234 | |||
479 | return event_size; | 235 | return event_size; |
480 | } | 236 | } |
481 | 237 | ||
482 | static ssize_t inotify_read(struct file *file, char __user *buf, | 238 | static ssize_t inotify_read(struct file *file, char __user *buf, |
483 | size_t count, loff_t *pos) | 239 | size_t count, loff_t *pos) |
484 | { | 240 | { |
485 | struct inotify_device *dev; | 241 | struct fsnotify_group *group; |
242 | struct fsnotify_event *kevent; | ||
486 | char __user *start; | 243 | char __user *start; |
487 | int ret; | 244 | int ret; |
488 | DEFINE_WAIT(wait); | 245 | DEFINE_WAIT(wait); |
489 | 246 | ||
490 | start = buf; | 247 | start = buf; |
491 | dev = file->private_data; | 248 | group = file->private_data; |
492 | 249 | ||
493 | while (1) { | 250 | while (1) { |
494 | struct inotify_kernel_event *kevent; | 251 | prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE); |
495 | 252 | ||
496 | prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE); | 253 | mutex_lock(&group->notification_mutex); |
497 | 254 | kevent = get_one_event(group, count); | |
498 | mutex_lock(&dev->ev_mutex); | 255 | mutex_unlock(&group->notification_mutex); |
499 | kevent = get_one_event(dev, count); | ||
500 | mutex_unlock(&dev->ev_mutex); | ||
501 | 256 | ||
502 | if (kevent) { | 257 | if (kevent) { |
503 | ret = PTR_ERR(kevent); | 258 | ret = PTR_ERR(kevent); |
504 | if (IS_ERR(kevent)) | 259 | if (IS_ERR(kevent)) |
505 | break; | 260 | break; |
506 | ret = copy_event_to_user(kevent, buf); | 261 | ret = copy_event_to_user(group, kevent, buf); |
507 | free_kevent(kevent); | 262 | fsnotify_put_event(kevent); |
508 | if (ret < 0) | 263 | if (ret < 0) |
509 | break; | 264 | break; |
510 | buf += ret; | 265 | buf += ret; |
@@ -525,7 +280,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf, | |||
525 | schedule(); | 280 | schedule(); |
526 | } | 281 | } |
527 | 282 | ||
528 | finish_wait(&dev->wq, &wait); | 283 | finish_wait(&group->notification_waitq, &wait); |
529 | if (start != buf && ret != -EFAULT) | 284 | if (start != buf && ret != -EFAULT) |
530 | ret = buf - start; | 285 | ret = buf - start; |
531 | return ret; | 286 | return ret; |
@@ -533,25 +288,19 @@ static ssize_t inotify_read(struct file *file, char __user *buf, | |||
533 | 288 | ||
534 | static int inotify_fasync(int fd, struct file *file, int on) | 289 | static int inotify_fasync(int fd, struct file *file, int on) |
535 | { | 290 | { |
536 | struct inotify_device *dev = file->private_data; | 291 | struct fsnotify_group *group = file->private_data; |
537 | 292 | ||
538 | return fasync_helper(fd, file, on, &dev->fa) >= 0 ? 0 : -EIO; | 293 | return fasync_helper(fd, file, on, &group->inotify_data.fa) >= 0 ? 0 : -EIO; |
539 | } | 294 | } |
540 | 295 | ||
541 | static int inotify_release(struct inode *ignored, struct file *file) | 296 | static int inotify_release(struct inode *ignored, struct file *file) |
542 | { | 297 | { |
543 | struct inotify_device *dev = file->private_data; | 298 | struct fsnotify_group *group = file->private_data; |
544 | |||
545 | inotify_destroy(dev->ih); | ||
546 | 299 | ||
547 | /* destroy all of the events on this device */ | 300 | fsnotify_clear_marks_by_group(group); |
548 | mutex_lock(&dev->ev_mutex); | ||
549 | while (!list_empty(&dev->events)) | ||
550 | inotify_dev_event_dequeue(dev); | ||
551 | mutex_unlock(&dev->ev_mutex); | ||
552 | 301 | ||
553 | /* free this device: the put matching the get in inotify_init() */ | 302 | /* free this group, matching get was inotify_init->fsnotify_obtain_group */ |
554 | put_inotify_dev(dev); | 303 | fsnotify_put_group(group); |
555 | 304 | ||
556 | return 0; | 305 | return 0; |
557 | } | 306 | } |
@@ -559,16 +308,27 @@ static int inotify_release(struct inode *ignored, struct file *file) | |||
559 | static long inotify_ioctl(struct file *file, unsigned int cmd, | 308 | static long inotify_ioctl(struct file *file, unsigned int cmd, |
560 | unsigned long arg) | 309 | unsigned long arg) |
561 | { | 310 | { |
562 | struct inotify_device *dev; | 311 | struct fsnotify_group *group; |
312 | struct fsnotify_event_holder *holder; | ||
313 | struct fsnotify_event *event; | ||
563 | void __user *p; | 314 | void __user *p; |
564 | int ret = -ENOTTY; | 315 | int ret = -ENOTTY; |
316 | size_t send_len = 0; | ||
565 | 317 | ||
566 | dev = file->private_data; | 318 | group = file->private_data; |
567 | p = (void __user *) arg; | 319 | p = (void __user *) arg; |
568 | 320 | ||
569 | switch (cmd) { | 321 | switch (cmd) { |
570 | case FIONREAD: | 322 | case FIONREAD: |
571 | ret = put_user(dev->queue_size, (int __user *) p); | 323 | mutex_lock(&group->notification_mutex); |
324 | list_for_each_entry(holder, &group->notification_list, event_list) { | ||
325 | event = holder->event; | ||
326 | send_len += sizeof(struct inotify_event); | ||
327 | send_len += roundup(event->name_len, | ||
328 | sizeof(struct inotify_event)); | ||
329 | } | ||
330 | mutex_unlock(&group->notification_mutex); | ||
331 | ret = put_user(send_len, (int __user *) p); | ||
572 | break; | 332 | break; |
573 | } | 333 | } |
574 | 334 | ||
@@ -576,23 +336,233 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, | |||
576 | } | 336 | } |
577 | 337 | ||
578 | static const struct file_operations inotify_fops = { | 338 | static const struct file_operations inotify_fops = { |
579 | .poll = inotify_poll, | 339 | .poll = inotify_poll, |
580 | .read = inotify_read, | 340 | .read = inotify_read, |
581 | .fasync = inotify_fasync, | 341 | .fasync = inotify_fasync, |
582 | .release = inotify_release, | 342 | .release = inotify_release, |
583 | .unlocked_ioctl = inotify_ioctl, | 343 | .unlocked_ioctl = inotify_ioctl, |
584 | .compat_ioctl = inotify_ioctl, | 344 | .compat_ioctl = inotify_ioctl, |
585 | }; | 345 | }; |
586 | 346 | ||
587 | static const struct inotify_operations inotify_user_ops = { | ||
588 | .handle_event = inotify_dev_queue_event, | ||
589 | .destroy_watch = free_inotify_user_watch, | ||
590 | }; | ||
591 | 347 | ||
348 | /* | ||
349 | * inotify_find_inode - resolve a user-given path to a specific inode | ||
350 | */ | ||
351 | static int inotify_find_inode(const char __user *dirname, struct path *path, unsigned flags) | ||
352 | { | ||
353 | int error; | ||
354 | |||
355 | error = user_path_at(AT_FDCWD, dirname, flags, path); | ||
356 | if (error) | ||
357 | return error; | ||
358 | /* you can only watch an inode if you have read permissions on it */ | ||
359 | error = inode_permission(path->dentry->d_inode, MAY_READ); | ||
360 | if (error) | ||
361 | path_put(path); | ||
362 | return error; | ||
363 | } | ||
364 | |||
365 | /* | ||
366 | * When, for whatever reason, inotify is done with a mark (or what used to be a | ||
367 | * watch) we need to remove that watch from the idr and we need to send IN_IGNORED | ||
368 | * for the given wd. | ||
369 | * | ||
370 | * There is a bit of recursion here. The loop looks like: | ||
371 | * inotify_destroy_mark_entry -> fsnotify_destroy_mark_by_entry -> | ||
372 | * inotify_freeing_mark -> inotify_destroy_mark_entry -> restart | ||
373 | * But the loop is broken in 2 places. fsnotify_destroy_mark_by_entry sets | ||
374 | * entry->group = NULL before the call to inotify_freeing_mark, so the if (egroup) | ||
375 | * test below will not call back to fsnotify again. But even if that test wasn't | ||
376 | * there this would still be safe since fsnotify_destroy_mark_by_entry() is | ||
377 | * safe from recursion. | ||
378 | */ | ||
379 | void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group) | ||
380 | { | ||
381 | struct inotify_inode_mark_entry *ientry; | ||
382 | struct inotify_event_private_data *event_priv; | ||
383 | struct fsnotify_event_private_data *fsn_event_priv; | ||
384 | struct fsnotify_group *egroup; | ||
385 | struct idr *idr; | ||
386 | |||
387 | spin_lock(&entry->lock); | ||
388 | egroup = entry->group; | ||
389 | |||
390 | /* if egroup is set we aren't really done and something might still send events | ||
391 | * for this inode; on the callback we'll send the IN_IGNORED */ | ||
392 | if (egroup) { | ||
393 | spin_unlock(&entry->lock); | ||
394 | fsnotify_destroy_mark_by_entry(entry); | ||
395 | return; | ||
396 | } | ||
397 | spin_unlock(&entry->lock); | ||
398 | |||
399 | ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); | ||
400 | |||
401 | event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); | ||
402 | if (unlikely(!event_priv)) | ||
403 | goto skip_send_ignore; | ||
404 | |||
405 | fsn_event_priv = &event_priv->fsnotify_event_priv_data; | ||
406 | |||
407 | fsn_event_priv->group = group; | ||
408 | event_priv->wd = ientry->wd; | ||
409 | |||
410 | fsnotify_add_notify_event(group, inotify_ignored_event, fsn_event_priv); | ||
411 | |||
412 | /* did the private data get added? */ | ||
413 | if (list_empty(&fsn_event_priv->event_list)) | ||
414 | inotify_free_event_priv(fsn_event_priv); | ||
415 | |||
416 | skip_send_ignore: | ||
417 | |||
418 | /* remove this entry from the idr */ | ||
419 | spin_lock(&group->inotify_data.idr_lock); | ||
420 | idr = &group->inotify_data.idr; | ||
421 | idr_remove(idr, ientry->wd); | ||
422 | spin_unlock(&group->inotify_data.idr_lock); | ||
423 | |||
424 | /* removed from idr, drop that reference */ | ||
425 | fsnotify_put_mark(entry); | ||
426 | } | ||
427 | |||
428 | /* ding dong the mark is dead */ | ||
429 | static void inotify_free_mark(struct fsnotify_mark_entry *entry) | ||
430 | { | ||
431 | struct inotify_inode_mark_entry *ientry = (struct inotify_inode_mark_entry *)entry; | ||
432 | |||
433 | kmem_cache_free(inotify_inode_mark_cachep, ientry); | ||
434 | } | ||
435 | |||
436 | static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) | ||
437 | { | ||
438 | struct fsnotify_mark_entry *entry = NULL; | ||
439 | struct inotify_inode_mark_entry *ientry; | ||
440 | int ret = 0; | ||
441 | int add = (arg & IN_MASK_ADD); | ||
442 | __u32 mask; | ||
443 | __u32 old_mask, new_mask; | ||
444 | |||
445 | /* don't allow invalid bits: we don't want flags set */ | ||
446 | mask = inotify_arg_to_mask(arg); | ||
447 | if (unlikely(!mask)) | ||
448 | return -EINVAL; | ||
449 | |||
450 | ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); | ||
451 | if (unlikely(!ientry)) | ||
452 | return -ENOMEM; | ||
453 | /* we set the mask at the end after attaching it */ | ||
454 | fsnotify_init_mark(&ientry->fsn_entry, inotify_free_mark); | ||
455 | ientry->wd = 0; | ||
456 | |||
457 | find_entry: | ||
458 | spin_lock(&inode->i_lock); | ||
459 | entry = fsnotify_find_mark_entry(group, inode); | ||
460 | spin_unlock(&inode->i_lock); | ||
461 | if (entry) { | ||
462 | kmem_cache_free(inotify_inode_mark_cachep, ientry); | ||
463 | ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); | ||
464 | } else { | ||
465 | if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) { | ||
466 | ret = -ENOSPC; | ||
467 | goto out_err; | ||
468 | } | ||
469 | |||
470 | ret = fsnotify_add_mark(&ientry->fsn_entry, group, inode); | ||
471 | if (ret == -EEXIST) | ||
472 | goto find_entry; | ||
473 | else if (ret) | ||
474 | goto out_err; | ||
475 | |||
476 | entry = &ientry->fsn_entry; | ||
477 | retry: | ||
478 | ret = -ENOMEM; | ||
479 | if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL))) | ||
480 | goto out_err; | ||
481 | |||
482 | spin_lock(&group->inotify_data.idr_lock); | ||
483 | /* if entry is added to the idr we keep the reference obtained | ||
484 | * through fsnotify_add_mark. Remember to drop this reference | ||
485 | * when entry is removed from idr */ | ||
486 | ret = idr_get_new_above(&group->inotify_data.idr, entry, | ||
487 | ++group->inotify_data.last_wd, | ||
488 | &ientry->wd); | ||
489 | spin_unlock(&group->inotify_data.idr_lock); | ||
490 | if (ret) { | ||
491 | if (ret == -EAGAIN) | ||
492 | goto retry; | ||
493 | goto out_err; | ||
494 | } | ||
495 | atomic_inc(&group->inotify_data.user->inotify_watches); | ||
496 | } | ||
497 | |||
498 | spin_lock(&entry->lock); | ||
499 | |||
500 | old_mask = entry->mask; | ||
501 | if (add) { | ||
502 | entry->mask |= mask; | ||
503 | new_mask = entry->mask; | ||
504 | } else { | ||
505 | entry->mask = mask; | ||
506 | new_mask = entry->mask; | ||
507 | } | ||
508 | |||
509 | spin_unlock(&entry->lock); | ||
510 | |||
511 | if (old_mask != new_mask) { | ||
512 | /* more bits in old than in new? */ | ||
513 | int dropped = (old_mask & ~new_mask); | ||
514 | /* more bits in this entry than the inode's mask? */ | ||
515 | int do_inode = (new_mask & ~inode->i_fsnotify_mask); | ||
516 | /* more bits in this entry than the group? */ | ||
517 | int do_group = (new_mask & ~group->mask); | ||
518 | |||
519 | /* update the inode with this new entry */ | ||
520 | if (dropped || do_inode) | ||
521 | fsnotify_recalc_inode_mask(inode); | ||
522 | |||
523 | /* update the group mask with the new mask */ | ||
524 | if (dropped || do_group) | ||
525 | fsnotify_recalc_group_mask(group); | ||
526 | } | ||
527 | |||
528 | return ientry->wd; | ||
529 | |||
530 | out_err: | ||
531 | /* this isn't supposed to happen; just kill the watch */ | ||
532 | if (entry) { | ||
533 | fsnotify_destroy_mark_by_entry(entry); | ||
534 | fsnotify_put_mark(entry); | ||
535 | } | ||
536 | return ret; | ||
537 | } | ||
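To make the dropped/do_inode/do_group bookkeeping above concrete, here is a small self-contained worked example (mask values are illustrative, not taken from the patch):

	/* Self-contained worked example with illustrative values: an existing watch
	 * on IN_CREATE (0x100) is updated without IN_MASK_ADD to IN_DELETE (0x200),
	 * so entry->mask is replaced rather than OR-ed. */
	#include <stdio.h>

	int main(void)
	{
		unsigned int old_mask   = 0x100;	/* IN_CREATE */
		unsigned int new_mask   = 0x200;	/* IN_DELETE */
		unsigned int inode_mask = 0x100;	/* aggregate the inode had before */
		unsigned int group_mask = 0x100;	/* aggregate the group had before */

		unsigned int dropped  = old_mask & ~new_mask;   /* bits only the old mask had */
		unsigned int do_inode = new_mask & ~inode_mask; /* new bits the inode lacks */
		unsigned int do_group = new_mask & ~group_mask; /* new bits the group lacks */

		/* dropped=0x100, do_inode=0x200, do_group=0x200: both recalc helpers run,
		 * just as inotify_update_watch() calls fsnotify_recalc_inode_mask() and
		 * fsnotify_recalc_group_mask() above. */
		printf("dropped=%#x do_inode=%#x do_group=%#x\n", dropped, do_inode, do_group);
		return 0;
	}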
538 | |||
539 | static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsigned int max_events) | ||
540 | { | ||
541 | struct fsnotify_group *group; | ||
542 | unsigned int grp_num; | ||
543 | |||
544 | /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */ | ||
545 | grp_num = (INOTIFY_GROUP_NUM - atomic_inc_return(&inotify_grp_num)); | ||
546 | group = fsnotify_obtain_group(grp_num, 0, &inotify_fsnotify_ops); | ||
547 | if (IS_ERR(group)) | ||
548 | return group; | ||
549 | |||
550 | group->max_events = max_events; | ||
551 | |||
552 | spin_lock_init(&group->inotify_data.idr_lock); | ||
553 | idr_init(&group->inotify_data.idr); | ||
554 | group->inotify_data.last_wd = 0; | ||
555 | group->inotify_data.user = user; | ||
556 | group->inotify_data.fa = NULL; | ||
557 | |||
558 | return group; | ||
559 | } | ||
560 | |||
561 | |||
562 | /* inotify syscalls */ | ||
592 | SYSCALL_DEFINE1(inotify_init1, int, flags) | 563 | SYSCALL_DEFINE1(inotify_init1, int, flags) |
593 | { | 564 | { |
594 | struct inotify_device *dev; | 565 | struct fsnotify_group *group; |
595 | struct inotify_handle *ih; | ||
596 | struct user_struct *user; | 566 | struct user_struct *user; |
597 | struct file *filp; | 567 | struct file *filp; |
598 | int fd, ret; | 568 | int fd, ret; |
@@ -621,45 +591,27 @@ SYSCALL_DEFINE1(inotify_init1, int, flags) | |||
621 | goto out_free_uid; | 591 | goto out_free_uid; |
622 | } | 592 | } |
623 | 593 | ||
624 | dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL); | 594 | /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */ |
625 | if (unlikely(!dev)) { | 595 | group = inotify_new_group(user, inotify_max_queued_events); |
626 | ret = -ENOMEM; | 596 | if (IS_ERR(group)) { |
597 | ret = PTR_ERR(group); | ||
627 | goto out_free_uid; | 598 | goto out_free_uid; |
628 | } | 599 | } |
629 | 600 | ||
630 | ih = inotify_init(&inotify_user_ops); | ||
631 | if (IS_ERR(ih)) { | ||
632 | ret = PTR_ERR(ih); | ||
633 | goto out_free_dev; | ||
634 | } | ||
635 | dev->ih = ih; | ||
636 | dev->fa = NULL; | ||
637 | |||
638 | filp->f_op = &inotify_fops; | 601 | filp->f_op = &inotify_fops; |
639 | filp->f_path.mnt = mntget(inotify_mnt); | 602 | filp->f_path.mnt = mntget(inotify_mnt); |
640 | filp->f_path.dentry = dget(inotify_mnt->mnt_root); | 603 | filp->f_path.dentry = dget(inotify_mnt->mnt_root); |
641 | filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; | 604 | filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; |
642 | filp->f_mode = FMODE_READ; | 605 | filp->f_mode = FMODE_READ; |
643 | filp->f_flags = O_RDONLY | (flags & O_NONBLOCK); | 606 | filp->f_flags = O_RDONLY | (flags & O_NONBLOCK); |
644 | filp->private_data = dev; | 607 | filp->private_data = group; |
645 | 608 | ||
646 | INIT_LIST_HEAD(&dev->events); | ||
647 | init_waitqueue_head(&dev->wq); | ||
648 | mutex_init(&dev->ev_mutex); | ||
649 | mutex_init(&dev->up_mutex); | ||
650 | dev->event_count = 0; | ||
651 | dev->queue_size = 0; | ||
652 | dev->max_events = inotify_max_queued_events; | ||
653 | dev->user = user; | ||
654 | atomic_set(&dev->count, 0); | ||
655 | |||
656 | get_inotify_dev(dev); | ||
657 | atomic_inc(&user->inotify_devs); | 609 | atomic_inc(&user->inotify_devs); |
610 | |||
658 | fd_install(fd, filp); | 611 | fd_install(fd, filp); |
659 | 612 | ||
660 | return fd; | 613 | return fd; |
661 | out_free_dev: | 614 | |
662 | kfree(dev); | ||
663 | out_free_uid: | 615 | out_free_uid: |
664 | free_uid(user); | 616 | free_uid(user); |
665 | put_filp(filp); | 617 | put_filp(filp); |
@@ -676,8 +628,8 @@ SYSCALL_DEFINE0(inotify_init) | |||
676 | SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, | 628 | SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, |
677 | u32, mask) | 629 | u32, mask) |
678 | { | 630 | { |
631 | struct fsnotify_group *group; | ||
679 | struct inode *inode; | 632 | struct inode *inode; |
680 | struct inotify_device *dev; | ||
681 | struct path path; | 633 | struct path path; |
682 | struct file *filp; | 634 | struct file *filp; |
683 | int ret, fput_needed; | 635 | int ret, fput_needed; |
@@ -698,20 +650,20 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, | |||
698 | if (mask & IN_ONLYDIR) | 650 | if (mask & IN_ONLYDIR) |
699 | flags |= LOOKUP_DIRECTORY; | 651 | flags |= LOOKUP_DIRECTORY; |
700 | 652 | ||
701 | ret = find_inode(pathname, &path, flags); | 653 | ret = inotify_find_inode(pathname, &path, flags); |
702 | if (unlikely(ret)) | 654 | if (ret) |
703 | goto fput_and_out; | 655 | goto fput_and_out; |
704 | 656 | ||
705 | /* inode held in place by reference to path; dev by fget on fd */ | 657 | /* inode held in place by reference to path; group by fget on fd */ |
706 | inode = path.dentry->d_inode; | 658 | inode = path.dentry->d_inode; |
707 | dev = filp->private_data; | 659 | group = filp->private_data; |
708 | 660 | ||
709 | mutex_lock(&dev->up_mutex); | 661 | /* create/update an inode mark */ |
710 | ret = inotify_find_update_watch(dev->ih, inode, mask); | 662 | ret = inotify_update_watch(group, inode, mask); |
711 | if (ret == -ENOENT) | 663 | if (unlikely(ret)) |
712 | ret = create_watch(dev, inode, mask); | 664 | goto path_put_and_out; |
713 | mutex_unlock(&dev->up_mutex); | ||
714 | 665 | ||
666 | path_put_and_out: | ||
715 | path_put(&path); | 667 | path_put(&path); |
716 | fput_and_out: | 668 | fput_and_out: |
717 | fput_light(filp, fput_needed); | 669 | fput_light(filp, fput_needed); |
@@ -720,9 +672,10 @@ fput_and_out: | |||
720 | 672 | ||
721 | SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) | 673 | SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) |
722 | { | 674 | { |
675 | struct fsnotify_group *group; | ||
676 | struct fsnotify_mark_entry *entry; | ||
723 | struct file *filp; | 677 | struct file *filp; |
724 | struct inotify_device *dev; | 678 | int ret = 0, fput_needed; |
725 | int ret, fput_needed; | ||
726 | 679 | ||
727 | filp = fget_light(fd, &fput_needed); | 680 | filp = fget_light(fd, &fput_needed); |
728 | if (unlikely(!filp)) | 681 | if (unlikely(!filp)) |
@@ -734,10 +687,20 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) | |||
734 | goto out; | 687 | goto out; |
735 | } | 688 | } |
736 | 689 | ||
737 | dev = filp->private_data; | 690 | group = filp->private_data; |
738 | 691 | ||
739 | /* we free our watch data when we get IN_IGNORED */ | 692 | spin_lock(&group->inotify_data.idr_lock); |
740 | ret = inotify_rm_wd(dev->ih, wd); | 693 | entry = idr_find(&group->inotify_data.idr, wd); |
694 | if (unlikely(!entry)) { | ||
695 | spin_unlock(&group->inotify_data.idr_lock); | ||
696 | ret = -EINVAL; | ||
697 | goto out; | ||
698 | } | ||
699 | fsnotify_get_mark(entry); | ||
700 | spin_unlock(&group->inotify_data.idr_lock); | ||
701 | |||
702 | inotify_destroy_mark_entry(entry, group); | ||
703 | fsnotify_put_mark(entry); | ||
741 | 704 | ||
742 | out: | 705 | out: |
743 | fput_light(filp, fput_needed); | 706 | fput_light(filp, fput_needed); |
@@ -753,9 +716,9 @@ inotify_get_sb(struct file_system_type *fs_type, int flags, | |||
753 | } | 716 | } |
754 | 717 | ||
755 | static struct file_system_type inotify_fs_type = { | 718 | static struct file_system_type inotify_fs_type = { |
756 | .name = "inotifyfs", | 719 | .name = "inotifyfs", |
757 | .get_sb = inotify_get_sb, | 720 | .get_sb = inotify_get_sb, |
758 | .kill_sb = kill_anon_super, | 721 | .kill_sb = kill_anon_super, |
759 | }; | 722 | }; |
760 | 723 | ||
761 | /* | 724 | /* |
@@ -775,18 +738,16 @@ static int __init inotify_user_setup(void) | |||
775 | if (IS_ERR(inotify_mnt)) | 738 | if (IS_ERR(inotify_mnt)) |
776 | panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt)); | 739 | panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt)); |
777 | 740 | ||
741 | inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC); | ||
742 | event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); | ||
743 | inotify_ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0); | ||
744 | if (!inotify_ignored_event) | ||
745 | panic("unable to allocate the inotify ignored event\n"); | ||
746 | |||
778 | inotify_max_queued_events = 16384; | 747 | inotify_max_queued_events = 16384; |
779 | inotify_max_user_instances = 128; | 748 | inotify_max_user_instances = 128; |
780 | inotify_max_user_watches = 8192; | 749 | inotify_max_user_watches = 8192; |
781 | 750 | ||
782 | watch_cachep = kmem_cache_create("inotify_watch_cache", | ||
783 | sizeof(struct inotify_user_watch), | ||
784 | 0, SLAB_PANIC, NULL); | ||
785 | event_cachep = kmem_cache_create("inotify_event_cache", | ||
786 | sizeof(struct inotify_kernel_event), | ||
787 | 0, SLAB_PANIC, NULL); | ||
788 | |||
789 | return 0; | 751 | return 0; |
790 | } | 752 | } |
791 | |||
792 | module_init(inotify_user_setup); | 753 | module_init(inotify_user_setup); |
diff --git a/fs/notify/notification.c b/fs/notify/notification.c new file mode 100644 index 000000000000..959b73e756fd --- /dev/null +++ b/fs/notify/notification.c | |||
@@ -0,0 +1,411 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2, or (at your option) | ||
7 | * any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; see the file COPYING. If not, write to | ||
16 | * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | ||
17 | */ | ||
18 | |||
19 | /* | ||
20 | * Basic idea behind the notification queue: An fsnotify group (like inotify) | ||
21 | * sends the userspace notification about events asynchronously some time after | ||
22 | * the event happened. When inotify gets an event it will need to add that | ||
23 | * event to the group notify queue. Since a single event might need to be on | ||
24 | * multiple groups' notification queues, we can't add the event directly to each | ||
25 | * queue and instead add a small "event_holder" to each queue. This event_holder | ||
26 | * has a pointer back to the original event. Since the majority of events are | ||
27 | * going to end up on one, and only one, notification queue, we embed one | ||
28 | * event_holder into each event. This means we have a single allocation instead | ||
29 | * of always needing two. If the embedded event_holder is already in use by | ||
30 | * another group a new event_holder (from fsnotify_event_holder_cachep) will be | ||
31 | * allocated and used. | ||
32 | */ | ||
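A simplified sketch of the relationship described above, with hypothetical names; the real structures live in include/linux/fsnotify_backend.h and are not reproduced here:

	/* Hypothetical, simplified types for illustration only; the real
	 * fsnotify_event and fsnotify_event_holder are declared in
	 * include/linux/fsnotify_backend.h. */
	#include <linux/list.h>
	#include <asm/atomic.h>

	struct sketch_event;

	struct sketch_event_holder {
		struct sketch_event *event;	/* back-pointer to the queued event */
		struct list_head event_list;	/* links into a group's notification_list */
	};

	struct sketch_event {
		atomic_t refcnt;
		struct sketch_event_holder holder;	/* embedded holder: the common,
							 * single-queue case allocates nothing extra */
		/* mask, path/inode data, file_name, private_data_list, ... */
	};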
33 | |||
34 | #include <linux/fs.h> | ||
35 | #include <linux/init.h> | ||
36 | #include <linux/kernel.h> | ||
37 | #include <linux/list.h> | ||
38 | #include <linux/module.h> | ||
39 | #include <linux/mount.h> | ||
40 | #include <linux/mutex.h> | ||
41 | #include <linux/namei.h> | ||
42 | #include <linux/path.h> | ||
43 | #include <linux/slab.h> | ||
44 | #include <linux/spinlock.h> | ||
45 | |||
46 | #include <asm/atomic.h> | ||
47 | |||
48 | #include <linux/fsnotify_backend.h> | ||
49 | #include "fsnotify.h" | ||
50 | |||
51 | static struct kmem_cache *fsnotify_event_cachep; | ||
52 | static struct kmem_cache *fsnotify_event_holder_cachep; | ||
53 | /* | ||
54 | * This is a magic event we send when the q is too full. Since it doesn't | ||
55 | * hold real event information, we just keep one system wide and use it any time | ||
56 | * it is needed. Its refcnt is set to 1 at kernel init time and will never | ||
57 | * get set to 0, so it will never get 'freed'. | ||
58 | */ | ||
59 | static struct fsnotify_event q_overflow_event; | ||
60 | static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0); | ||
61 | |||
62 | /** | ||
63 | * fsnotify_get_cookie - return a unique cookie for use in synchronizing events. | ||
64 | * Called from fsnotify_move, which is inlined into filesystem modules. | ||
65 | */ | ||
66 | u32 fsnotify_get_cookie(void) | ||
67 | { | ||
68 | return atomic_inc_return(&fsnotify_sync_cookie); | ||
69 | } | ||
70 | EXPORT_SYMBOL_GPL(fsnotify_get_cookie); | ||
71 | |||
72 | /* return true if the notify queue is empty, false otherwise */ | ||
73 | bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) | ||
74 | { | ||
75 | BUG_ON(!mutex_is_locked(&group->notification_mutex)); | ||
76 | return list_empty(&group->notification_list) ? true : false; | ||
77 | } | ||
78 | |||
79 | void fsnotify_get_event(struct fsnotify_event *event) | ||
80 | { | ||
81 | atomic_inc(&event->refcnt); | ||
82 | } | ||
83 | |||
84 | void fsnotify_put_event(struct fsnotify_event *event) | ||
85 | { | ||
86 | if (!event) | ||
87 | return; | ||
88 | |||
89 | if (atomic_dec_and_test(&event->refcnt)) { | ||
90 | if (event->data_type == FSNOTIFY_EVENT_PATH) | ||
91 | path_put(&event->path); | ||
92 | |||
93 | BUG_ON(!list_empty(&event->private_data_list)); | ||
94 | |||
95 | kfree(event->file_name); | ||
96 | kmem_cache_free(fsnotify_event_cachep, event); | ||
97 | } | ||
98 | } | ||
99 | |||
100 | struct fsnotify_event_holder *fsnotify_alloc_event_holder(void) | ||
101 | { | ||
102 | return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL); | ||
103 | } | ||
104 | |||
105 | void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder) | ||
106 | { | ||
107 | kmem_cache_free(fsnotify_event_holder_cachep, holder); | ||
108 | } | ||
109 | |||
110 | /* | ||
111 | * Find the private data that the group previously attached to this event when | ||
112 | * the group added the event to the notification queue (fsnotify_add_notify_event) | ||
113 | */ | ||
114 | struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event) | ||
115 | { | ||
116 | struct fsnotify_event_private_data *lpriv; | ||
117 | struct fsnotify_event_private_data *priv = NULL; | ||
118 | |||
119 | assert_spin_locked(&event->lock); | ||
120 | |||
121 | list_for_each_entry(lpriv, &event->private_data_list, event_list) { | ||
122 | if (lpriv->group == group) { | ||
123 | priv = lpriv; | ||
124 | list_del(&priv->event_list); | ||
125 | break; | ||
126 | } | ||
127 | } | ||
128 | return priv; | ||
129 | } | ||
130 | |||
131 | /* | ||
132 | * Check if 2 events contain the same information. We do not compare private data, | ||
133 | * but at this moment that isn't a problem for any known fsnotify listeners. | ||
134 | */ | ||
135 | static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new) | ||
136 | { | ||
137 | if ((old->mask == new->mask) && | ||
138 | (old->to_tell == new->to_tell) && | ||
139 | (old->data_type == new->data_type)) { | ||
140 | switch (old->data_type) { | ||
141 | case (FSNOTIFY_EVENT_INODE): | ||
142 | if (old->inode == new->inode) | ||
143 | return true; | ||
144 | break; | ||
145 | case (FSNOTIFY_EVENT_PATH): | ||
146 | if ((old->path.mnt == new->path.mnt) && | ||
147 | (old->path.dentry == new->path.dentry)) | ||
148 | return true; | ||
149 | case (FSNOTIFY_EVENT_NONE): | ||
150 | return true; | ||
151 | }; | ||
152 | } | ||
153 | return false; | ||
154 | } | ||
155 | |||
156 | /* | ||
157 | * Add an event to the group notification queue. The group can later pull this | ||
158 | * event off the queue to deal with. If the event is successfully added to the | ||
159 | * group's notification queue, a reference is taken on event. | ||
160 | */ | ||
161 | int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, | ||
162 | struct fsnotify_event_private_data *priv) | ||
163 | { | ||
164 | struct fsnotify_event_holder *holder = NULL; | ||
165 | struct list_head *list = &group->notification_list; | ||
166 | struct fsnotify_event_holder *last_holder; | ||
167 | struct fsnotify_event *last_event; | ||
168 | |||
169 | /* easy to tell if priv was attached to the event */ | ||
170 | INIT_LIST_HEAD(&priv->event_list); | ||
171 | |||
172 | /* | ||
173 | * There is one fsnotify_event_holder embedded inside each fsnotify_event. | ||
174 | * Check if we expect to be able to use that holder. If not, alloc a new | ||
175 | * holder. | ||
176 | * For the overflow event it's possible that something will use the in | ||
177 | * event holder before we get the lock, so we may need to jump back and | ||
178 | * alloc a new holder; this can't happen for most events... | ||
179 | */ | ||
180 | if (!list_empty(&event->holder.event_list)) { | ||
181 | alloc_holder: | ||
182 | holder = fsnotify_alloc_event_holder(); | ||
183 | if (!holder) | ||
184 | return -ENOMEM; | ||
185 | } | ||
186 | |||
187 | mutex_lock(&group->notification_mutex); | ||
188 | |||
189 | if (group->q_len >= group->max_events) { | ||
190 | event = &q_overflow_event; | ||
191 | /* sorry, no private data on the overflow event */ | ||
192 | priv = NULL; | ||
193 | } | ||
194 | |||
195 | spin_lock(&event->lock); | ||
196 | |||
197 | if (list_empty(&event->holder.event_list)) { | ||
198 | if (unlikely(holder)) | ||
199 | fsnotify_destroy_event_holder(holder); | ||
200 | holder = &event->holder; | ||
201 | } else if (unlikely(!holder)) { | ||
202 | /* between the time we checked above and got the lock the in | ||
203 | * event holder was used; go back and get a new one */ | ||
204 | spin_unlock(&event->lock); | ||
205 | mutex_unlock(&group->notification_mutex); | ||
206 | goto alloc_holder; | ||
207 | } | ||
208 | |||
209 | if (!list_empty(list)) { | ||
210 | last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list); | ||
211 | last_event = last_holder->event; | ||
212 | if (event_compare(last_event, event)) { | ||
213 | spin_unlock(&event->lock); | ||
214 | mutex_unlock(&group->notification_mutex); | ||
215 | if (holder != &event->holder) | ||
216 | fsnotify_destroy_event_holder(holder); | ||
217 | return -EEXIST; | ||
218 | } | ||
219 | } | ||
220 | |||
221 | group->q_len++; | ||
222 | holder->event = event; | ||
223 | |||
224 | fsnotify_get_event(event); | ||
225 | list_add_tail(&holder->event_list, list); | ||
226 | if (priv) | ||
227 | list_add_tail(&priv->event_list, &event->private_data_list); | ||
228 | spin_unlock(&event->lock); | ||
229 | mutex_unlock(&group->notification_mutex); | ||
230 | |||
231 | wake_up(&group->notification_waitq); | ||
232 | return 0; | ||
233 | } | ||
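A hedged sketch of what a caller is expected to do with its private data, mirroring the inotify usage earlier in this patch (queue_one_event() and the kfree() stand-in are illustrative, not part of the API):

	/* Sketch of caller-side ownership, assuming the backend already built
	 * event and priv; queue_one_event() and the kfree() stand-in are not
	 * part of the patch. */
	static void queue_one_event(struct fsnotify_group *group,
				    struct fsnotify_event *event,
				    struct fsnotify_event_private_data *priv)
	{
		fsnotify_add_notify_event(group, event, priv);

		/* If the event was merged with the queue tail (-EEXIST), replaced by
		 * the overflow event, or the add failed, priv was never attached to
		 * the event; the caller still owns it and must free it, exactly like
		 * the list_empty() check after fsnotify_add_notify_event() in
		 * inotify_destroy_mark_entry() above. */
		if (list_empty(&priv->event_list))
			kfree(priv);	/* stand-in for the backend's real free helper */
	}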
234 | |||
235 | /* | ||
236 | * Remove and return the first event from the notification list. There is a | ||
237 | * reference held on this event since it was on the list. It is the responsibility | ||
238 | * of the caller to drop this reference. | ||
239 | */ | ||
240 | struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group) | ||
241 | { | ||
242 | struct fsnotify_event *event; | ||
243 | struct fsnotify_event_holder *holder; | ||
244 | |||
245 | BUG_ON(!mutex_is_locked(&group->notification_mutex)); | ||
246 | |||
247 | holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); | ||
248 | |||
249 | event = holder->event; | ||
250 | |||
251 | spin_lock(&event->lock); | ||
252 | holder->event = NULL; | ||
253 | list_del_init(&holder->event_list); | ||
254 | spin_unlock(&event->lock); | ||
255 | |||
256 | /* event == holder means we are referenced through the in event holder */ | ||
257 | if (holder != &event->holder) | ||
258 | fsnotify_destroy_event_holder(holder); | ||
259 | |||
260 | group->q_len--; | ||
261 | |||
262 | return event; | ||
263 | } | ||
264 | |||
265 | /* | ||
266 | * This will not remove the event; that must be done with fsnotify_remove_notify_event() | ||
267 | */ | ||
268 | struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) | ||
269 | { | ||
270 | struct fsnotify_event *event; | ||
271 | struct fsnotify_event_holder *holder; | ||
272 | |||
273 | BUG_ON(!mutex_is_locked(&group->notification_mutex)); | ||
274 | |||
275 | holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); | ||
276 | event = holder->event; | ||
277 | |||
278 | return event; | ||
279 | } | ||
280 | |||
281 | /* | ||
282 | * Called when a group is being torn down to clean up any outstanding | ||
283 | * event notifications. | ||
284 | */ | ||
285 | void fsnotify_flush_notify(struct fsnotify_group *group) | ||
286 | { | ||
287 | struct fsnotify_event *event; | ||
288 | struct fsnotify_event_private_data *priv; | ||
289 | |||
290 | mutex_lock(&group->notification_mutex); | ||
291 | while (!fsnotify_notify_queue_is_empty(group)) { | ||
292 | event = fsnotify_remove_notify_event(group); | ||
293 | /* if they don't implement free_event_priv they better not have attached any */ | ||
294 | if (group->ops->free_event_priv) { | ||
295 | spin_lock(&event->lock); | ||
296 | priv = fsnotify_remove_priv_from_event(group, event); | ||
297 | spin_unlock(&event->lock); | ||
298 | if (priv) | ||
299 | group->ops->free_event_priv(priv); | ||
300 | } | ||
301 | fsnotify_put_event(event); /* matches fsnotify_add_notify_event */ | ||
302 | } | ||
303 | mutex_unlock(&group->notification_mutex); | ||
304 | } | ||
305 | |||
306 | static void initialize_event(struct fsnotify_event *event) | ||
307 | { | ||
308 | event->holder.event = NULL; | ||
309 | INIT_LIST_HEAD(&event->holder.event_list); | ||
310 | atomic_set(&event->refcnt, 1); | ||
311 | |||
312 | spin_lock_init(&event->lock); | ||
313 | |||
314 | event->path.dentry = NULL; | ||
315 | event->path.mnt = NULL; | ||
316 | event->inode = NULL; | ||
317 | event->data_type = FSNOTIFY_EVENT_NONE; | ||
318 | |||
319 | INIT_LIST_HEAD(&event->private_data_list); | ||
320 | |||
321 | event->to_tell = NULL; | ||
322 | |||
323 | event->file_name = NULL; | ||
324 | event->name_len = 0; | ||
325 | |||
326 | event->sync_cookie = 0; | ||
327 | } | ||
328 | |||
329 | /* | ||
330 | * fsnotify_create_event - Allocate a new event which will be sent to each | ||
331 | * group's handle_event function if the group was interested in this | ||
332 | * particular event. | ||
333 | * | ||
334 | * @to_tell the inode which is supposed to receive the event (sometimes a | ||
335 | * parent of the inode to which the event happened). | ||
336 | * @mask what actually happened. | ||
337 | * @data pointer to the object which was actually affected | ||
338 | * @data_type flag indicating whether the data is a file, path, inode, or nothing... | ||
339 | * @name the filename, if available | ||
340 | */ | ||
341 | struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, | ||
342 | int data_type, const char *name, u32 cookie) | ||
343 | { | ||
344 | struct fsnotify_event *event; | ||
345 | |||
346 | event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); | ||
347 | if (!event) | ||
348 | return NULL; | ||
349 | |||
350 | initialize_event(event); | ||
351 | |||
352 | if (name) { | ||
353 | event->file_name = kstrdup(name, GFP_KERNEL); | ||
354 | if (!event->file_name) { | ||
355 | kmem_cache_free(fsnotify_event_cachep, event); | ||
356 | return NULL; | ||
357 | } | ||
358 | event->name_len = strlen(event->file_name); | ||
359 | } | ||
360 | |||
361 | event->sync_cookie = cookie; | ||
362 | event->to_tell = to_tell; | ||
363 | |||
364 | switch (data_type) { | ||
365 | case FSNOTIFY_EVENT_FILE: { | ||
366 | struct file *file = data; | ||
367 | struct path *path = &file->f_path; | ||
368 | event->path.dentry = path->dentry; | ||
369 | event->path.mnt = path->mnt; | ||
370 | path_get(&event->path); | ||
371 | event->data_type = FSNOTIFY_EVENT_PATH; | ||
372 | break; | ||
373 | } | ||
374 | case FSNOTIFY_EVENT_PATH: { | ||
375 | struct path *path = data; | ||
376 | event->path.dentry = path->dentry; | ||
377 | event->path.mnt = path->mnt; | ||
378 | path_get(&event->path); | ||
379 | event->data_type = FSNOTIFY_EVENT_PATH; | ||
380 | break; | ||
381 | } | ||
382 | case FSNOTIFY_EVENT_INODE: | ||
383 | event->inode = data; | ||
384 | event->data_type = FSNOTIFY_EVENT_INODE; | ||
385 | break; | ||
386 | case FSNOTIFY_EVENT_NONE: | ||
387 | event->inode = NULL; | ||
388 | event->path.dentry = NULL; | ||
389 | event->path.mnt = NULL; | ||
390 | break; | ||
391 | default: | ||
392 | BUG(); | ||
393 | } | ||
394 | |||
395 | event->mask = mask; | ||
396 | |||
397 | return event; | ||
398 | } | ||
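For reference, a minimal usage sketch of this allocator (the inode-less call matches how the shared IN_IGNORED event is created later in the patch; dir, child and "newfile" are assumed, illustrative values):

	/* Usage sketch only; dir, child and "newfile" are assumed values supplied
	 * by a hypothetical caller. */
	static void create_event_examples(struct inode *dir, struct inode *child)
	{
		struct fsnotify_event *ignored, *created;

		/* inode-less event, like the shared IN_IGNORED event allocated in
		 * inotify_user_setup() later in this patch */
		ignored = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,
						FSNOTIFY_EVENT_NONE, NULL, 0);

		/* event describing "newfile" appearing in dir, carrying the child inode */
		created = fsnotify_create_event(dir, FS_CREATE, child,
						FSNOTIFY_EVENT_INODE, "newfile", 0);

		if (ignored)
			fsnotify_put_event(ignored);	/* drop the initial reference */
		if (created)
			fsnotify_put_event(created);
	}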
399 | |||
400 | __init int fsnotify_notification_init(void) | ||
401 | { | ||
402 | fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC); | ||
403 | fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC); | ||
404 | |||
405 | initialize_event(&q_overflow_event); | ||
406 | q_overflow_event.mask = FS_Q_OVERFLOW; | ||
407 | |||
408 | return 0; | ||
409 | } | ||
410 | subsys_initcall(fsnotify_notification_init); | ||
411 | |||
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 6aa7c4713536..abaaa1cbf8de 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c | |||
@@ -443,6 +443,8 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) | |||
443 | ntfs_volume *vol = NTFS_SB(sb); | 443 | ntfs_volume *vol = NTFS_SB(sb); |
444 | 444 | ||
445 | ntfs_debug("Entering with remount options string: %s", opt); | 445 | ntfs_debug("Entering with remount options string: %s", opt); |
446 | |||
447 | lock_kernel(); | ||
446 | #ifndef NTFS_RW | 448 | #ifndef NTFS_RW |
447 | /* For read-only compiled driver, enforce read-only flag. */ | 449 | /* For read-only compiled driver, enforce read-only flag. */ |
448 | *flags |= MS_RDONLY; | 450 | *flags |= MS_RDONLY; |
@@ -466,15 +468,18 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) | |||
466 | if (NVolErrors(vol)) { | 468 | if (NVolErrors(vol)) { |
467 | ntfs_error(sb, "Volume has errors and is read-only%s", | 469 | ntfs_error(sb, "Volume has errors and is read-only%s", |
468 | es); | 470 | es); |
471 | unlock_kernel(); | ||
469 | return -EROFS; | 472 | return -EROFS; |
470 | } | 473 | } |
471 | if (vol->vol_flags & VOLUME_IS_DIRTY) { | 474 | if (vol->vol_flags & VOLUME_IS_DIRTY) { |
472 | ntfs_error(sb, "Volume is dirty and read-only%s", es); | 475 | ntfs_error(sb, "Volume is dirty and read-only%s", es); |
476 | unlock_kernel(); | ||
473 | return -EROFS; | 477 | return -EROFS; |
474 | } | 478 | } |
475 | if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { | 479 | if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { |
476 | ntfs_error(sb, "Volume has been modified by chkdsk " | 480 | ntfs_error(sb, "Volume has been modified by chkdsk " |
477 | "and is read-only%s", es); | 481 | "and is read-only%s", es); |
482 | unlock_kernel(); | ||
478 | return -EROFS; | 483 | return -EROFS; |
479 | } | 484 | } |
480 | if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { | 485 | if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { |
@@ -482,11 +487,13 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) | |||
482 | "(0x%x) and is read-only%s", | 487 | "(0x%x) and is read-only%s", |
483 | (unsigned)le16_to_cpu(vol->vol_flags), | 488 | (unsigned)le16_to_cpu(vol->vol_flags), |
484 | es); | 489 | es); |
490 | unlock_kernel(); | ||
485 | return -EROFS; | 491 | return -EROFS; |
486 | } | 492 | } |
487 | if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { | 493 | if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { |
488 | ntfs_error(sb, "Failed to set dirty bit in volume " | 494 | ntfs_error(sb, "Failed to set dirty bit in volume " |
489 | "information flags%s", es); | 495 | "information flags%s", es); |
496 | unlock_kernel(); | ||
490 | return -EROFS; | 497 | return -EROFS; |
491 | } | 498 | } |
492 | #if 0 | 499 | #if 0 |
@@ -506,18 +513,21 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) | |||
506 | ntfs_error(sb, "Failed to empty journal $LogFile%s", | 513 | ntfs_error(sb, "Failed to empty journal $LogFile%s", |
507 | es); | 514 | es); |
508 | NVolSetErrors(vol); | 515 | NVolSetErrors(vol); |
516 | unlock_kernel(); | ||
509 | return -EROFS; | 517 | return -EROFS; |
510 | } | 518 | } |
511 | if (!ntfs_mark_quotas_out_of_date(vol)) { | 519 | if (!ntfs_mark_quotas_out_of_date(vol)) { |
512 | ntfs_error(sb, "Failed to mark quotas out of date%s", | 520 | ntfs_error(sb, "Failed to mark quotas out of date%s", |
513 | es); | 521 | es); |
514 | NVolSetErrors(vol); | 522 | NVolSetErrors(vol); |
523 | unlock_kernel(); | ||
515 | return -EROFS; | 524 | return -EROFS; |
516 | } | 525 | } |
517 | if (!ntfs_stamp_usnjrnl(vol)) { | 526 | if (!ntfs_stamp_usnjrnl(vol)) { |
518 | ntfs_error(sb, "Failed to stamp transaction log " | 527 | ntfs_error(sb, "Failed to stamp transaction log " |
519 | "($UsnJrnl)%s", es); | 528 | "($UsnJrnl)%s", es); |
520 | NVolSetErrors(vol); | 529 | NVolSetErrors(vol); |
530 | unlock_kernel(); | ||
521 | return -EROFS; | 531 | return -EROFS; |
522 | } | 532 | } |
523 | } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { | 533 | } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { |
@@ -533,8 +543,11 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) | |||
533 | 543 | ||
534 | // TODO: Deal with *flags. | 544 | // TODO: Deal with *flags. |
535 | 545 | ||
536 | if (!parse_options(vol, opt)) | 546 | if (!parse_options(vol, opt)) { |
547 | unlock_kernel(); | ||
537 | return -EINVAL; | 548 | return -EINVAL; |
549 | } | ||
550 | unlock_kernel(); | ||
538 | ntfs_debug("Done."); | 551 | ntfs_debug("Done."); |
539 | return 0; | 552 | return 0; |
540 | } | 553 | } |
@@ -2246,6 +2259,9 @@ static void ntfs_put_super(struct super_block *sb) | |||
2246 | ntfs_volume *vol = NTFS_SB(sb); | 2259 | ntfs_volume *vol = NTFS_SB(sb); |
2247 | 2260 | ||
2248 | ntfs_debug("Entering."); | 2261 | ntfs_debug("Entering."); |
2262 | |||
2263 | lock_kernel(); | ||
2264 | |||
2249 | #ifdef NTFS_RW | 2265 | #ifdef NTFS_RW |
2250 | /* | 2266 | /* |
2251 | * Commit all inodes while they are still open in case some of them | 2267 | * Commit all inodes while they are still open in case some of them |
@@ -2373,39 +2389,12 @@ static void ntfs_put_super(struct super_block *sb) | |||
2373 | vol->mftmirr_ino = NULL; | 2389 | vol->mftmirr_ino = NULL; |
2374 | } | 2390 | } |
2375 | /* | 2391 | /* |
2376 | * If any dirty inodes are left, throw away all mft data page cache | 2392 | * We should have no dirty inodes left, due to |
2377 | * pages to allow a clean umount. This should never happen any more | 2393 | * mft.c::ntfs_mft_writepage() cleaning all the dirty pages as |
2378 | * due to mft.c::ntfs_mft_writepage() cleaning all the dirty pages as | 2394 | * the underlying mft records are written out and cleaned. |
2379 | * the underlying mft records are written out and cleaned. If it does, | ||
2380 | * happen anyway, we want to know... | ||
2381 | */ | 2395 | */ |
2382 | ntfs_commit_inode(vol->mft_ino); | 2396 | ntfs_commit_inode(vol->mft_ino); |
2383 | write_inode_now(vol->mft_ino, 1); | 2397 | write_inode_now(vol->mft_ino, 1); |
2384 | if (sb_has_dirty_inodes(sb)) { | ||
2385 | const char *s1, *s2; | ||
2386 | |||
2387 | mutex_lock(&vol->mft_ino->i_mutex); | ||
2388 | truncate_inode_pages(vol->mft_ino->i_mapping, 0); | ||
2389 | mutex_unlock(&vol->mft_ino->i_mutex); | ||
2390 | write_inode_now(vol->mft_ino, 1); | ||
2391 | if (sb_has_dirty_inodes(sb)) { | ||
2392 | static const char *_s1 = "inodes"; | ||
2393 | static const char *_s2 = ""; | ||
2394 | s1 = _s1; | ||
2395 | s2 = _s2; | ||
2396 | } else { | ||
2397 | static const char *_s1 = "mft pages"; | ||
2398 | static const char *_s2 = "They have been thrown " | ||
2399 | "away. "; | ||
2400 | s1 = _s1; | ||
2401 | s2 = _s2; | ||
2402 | } | ||
2403 | ntfs_error(sb, "Dirty %s found at umount time. %sYou should " | ||
2404 | "run chkdsk. Please email " | ||
2405 | "linux-ntfs-dev@lists.sourceforge.net and say " | ||
2406 | "that you saw this message. Thank you.", s1, | ||
2407 | s2); | ||
2408 | } | ||
2409 | #endif /* NTFS_RW */ | 2398 | #endif /* NTFS_RW */ |
2410 | 2399 | ||
2411 | iput(vol->mft_ino); | 2400 | iput(vol->mft_ino); |
@@ -2444,7 +2433,8 @@ static void ntfs_put_super(struct super_block *sb) | |||
2444 | } | 2433 | } |
2445 | sb->s_fs_info = NULL; | 2434 | sb->s_fs_info = NULL; |
2446 | kfree(vol); | 2435 | kfree(vol); |
2447 | return; | 2436 | |
2437 | unlock_kernel(); | ||
2448 | } | 2438 | } |
2449 | 2439 | ||
2450 | /** | 2440 | /** |
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 5c6163f55039..201b40a441fe 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c | |||
@@ -42,6 +42,7 @@ | |||
42 | #include <linux/mount.h> | 42 | #include <linux/mount.h> |
43 | #include <linux/seq_file.h> | 43 | #include <linux/seq_file.h> |
44 | #include <linux/quotaops.h> | 44 | #include <linux/quotaops.h> |
45 | #include <linux/smp_lock.h> | ||
45 | 46 | ||
46 | #define MLOG_MASK_PREFIX ML_SUPER | 47 | #define MLOG_MASK_PREFIX ML_SUPER |
47 | #include <cluster/masklog.h> | 48 | #include <cluster/masklog.h> |
@@ -126,7 +127,6 @@ static int ocfs2_get_sector(struct super_block *sb, | |||
126 | struct buffer_head **bh, | 127 | struct buffer_head **bh, |
127 | int block, | 128 | int block, |
128 | int sect_size); | 129 | int sect_size); |
129 | static void ocfs2_write_super(struct super_block *sb); | ||
130 | static struct inode *ocfs2_alloc_inode(struct super_block *sb); | 130 | static struct inode *ocfs2_alloc_inode(struct super_block *sb); |
131 | static void ocfs2_destroy_inode(struct inode *inode); | 131 | static void ocfs2_destroy_inode(struct inode *inode); |
132 | static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend); | 132 | static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend); |
@@ -141,7 +141,6 @@ static const struct super_operations ocfs2_sops = { | |||
141 | .clear_inode = ocfs2_clear_inode, | 141 | .clear_inode = ocfs2_clear_inode, |
142 | .delete_inode = ocfs2_delete_inode, | 142 | .delete_inode = ocfs2_delete_inode, |
143 | .sync_fs = ocfs2_sync_fs, | 143 | .sync_fs = ocfs2_sync_fs, |
144 | .write_super = ocfs2_write_super, | ||
145 | .put_super = ocfs2_put_super, | 144 | .put_super = ocfs2_put_super, |
146 | .remount_fs = ocfs2_remount, | 145 | .remount_fs = ocfs2_remount, |
147 | .show_options = ocfs2_show_options, | 146 | .show_options = ocfs2_show_options, |
@@ -365,24 +364,12 @@ static struct file_operations ocfs2_osb_debug_fops = { | |||
365 | .llseek = generic_file_llseek, | 364 | .llseek = generic_file_llseek, |
366 | }; | 365 | }; |
367 | 366 | ||
368 | /* | ||
369 | * write_super and sync_fs ripped right out of ext3. | ||
370 | */ | ||
371 | static void ocfs2_write_super(struct super_block *sb) | ||
372 | { | ||
373 | if (mutex_trylock(&sb->s_lock) != 0) | ||
374 | BUG(); | ||
375 | sb->s_dirt = 0; | ||
376 | } | ||
377 | |||
378 | static int ocfs2_sync_fs(struct super_block *sb, int wait) | 367 | static int ocfs2_sync_fs(struct super_block *sb, int wait) |
379 | { | 368 | { |
380 | int status; | 369 | int status; |
381 | tid_t target; | 370 | tid_t target; |
382 | struct ocfs2_super *osb = OCFS2_SB(sb); | 371 | struct ocfs2_super *osb = OCFS2_SB(sb); |
383 | 372 | ||
384 | sb->s_dirt = 0; | ||
385 | |||
386 | if (ocfs2_is_hard_readonly(osb)) | 373 | if (ocfs2_is_hard_readonly(osb)) |
387 | return -EROFS; | 374 | return -EROFS; |
388 | 375 | ||
@@ -595,6 +582,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) | |||
595 | struct mount_options parsed_options; | 582 | struct mount_options parsed_options; |
596 | struct ocfs2_super *osb = OCFS2_SB(sb); | 583 | struct ocfs2_super *osb = OCFS2_SB(sb); |
597 | 584 | ||
585 | lock_kernel(); | ||
586 | |||
598 | if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { | 587 | if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { |
599 | ret = -EINVAL; | 588 | ret = -EINVAL; |
600 | goto out; | 589 | goto out; |
@@ -698,6 +687,7 @@ unlock_osb: | |||
698 | ocfs2_set_journal_params(osb); | 687 | ocfs2_set_journal_params(osb); |
699 | } | 688 | } |
700 | out: | 689 | out: |
690 | unlock_kernel(); | ||
701 | return ret; | 691 | return ret; |
702 | } | 692 | } |
703 | 693 | ||
@@ -1550,9 +1540,13 @@ static void ocfs2_put_super(struct super_block *sb) | |||
1550 | { | 1540 | { |
1551 | mlog_entry("(0x%p)\n", sb); | 1541 | mlog_entry("(0x%p)\n", sb); |
1552 | 1542 | ||
1543 | lock_kernel(); | ||
1544 | |||
1553 | ocfs2_sync_blockdev(sb); | 1545 | ocfs2_sync_blockdev(sb); |
1554 | ocfs2_dismount_volume(sb, 0); | 1546 | ocfs2_dismount_volume(sb, 0); |
1555 | 1547 | ||
1548 | unlock_kernel(); | ||
1549 | |||
1556 | mlog_exit_void(); | 1550 | mlog_exit_void(); |
1557 | } | 1551 | } |
1558 | 1552 | ||
diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 834b2331f6b3..d17e774eaf45 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c | |||
@@ -11,21 +11,6 @@ | |||
11 | #include <linux/mpage.h> | 11 | #include <linux/mpage.h> |
12 | #include "omfs.h" | 12 | #include "omfs.h" |
13 | 13 | ||
14 | static int omfs_sync_file(struct file *file, struct dentry *dentry, | ||
15 | int datasync) | ||
16 | { | ||
17 | struct inode *inode = dentry->d_inode; | ||
18 | int err; | ||
19 | |||
20 | err = sync_mapping_buffers(inode->i_mapping); | ||
21 | if (!(inode->i_state & I_DIRTY)) | ||
22 | return err; | ||
23 | if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) | ||
24 | return err; | ||
25 | err |= omfs_sync_inode(inode); | ||
26 | return err ? -EIO : 0; | ||
27 | } | ||
28 | |||
29 | static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset) | 14 | static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset) |
30 | { | 15 | { |
31 | return (sbi->s_sys_blocksize - offset - | 16 | return (sbi->s_sys_blocksize - offset - |
@@ -344,7 +329,7 @@ struct file_operations omfs_file_operations = { | |||
344 | .aio_read = generic_file_aio_read, | 329 | .aio_read = generic_file_aio_read, |
345 | .aio_write = generic_file_aio_write, | 330 | .aio_write = generic_file_aio_write, |
346 | .mmap = generic_file_mmap, | 331 | .mmap = generic_file_mmap, |
347 | .fsync = omfs_sync_file, | 332 | .fsync = simple_fsync, |
348 | .splice_read = generic_file_splice_read, | 333 | .splice_read = generic_file_splice_read, |
349 | }; | 334 | }; |
350 | 335 | ||
@@ -612,7 +612,7 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode) | |||
612 | 612 | ||
613 | audit_inode(NULL, dentry); | 613 | audit_inode(NULL, dentry); |
614 | 614 | ||
615 | err = mnt_want_write(file->f_path.mnt); | 615 | err = mnt_want_write_file(file); |
616 | if (err) | 616 | if (err) |
617 | goto out_putf; | 617 | goto out_putf; |
618 | mutex_lock(&inode->i_mutex); | 618 | mutex_lock(&inode->i_mutex); |
@@ -761,7 +761,7 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) | |||
761 | if (!file) | 761 | if (!file) |
762 | goto out; | 762 | goto out; |
763 | 763 | ||
764 | error = mnt_want_write(file->f_path.mnt); | 764 | error = mnt_want_write_file(file); |
765 | if (error) | 765 | if (error) |
766 | goto out_fput; | 766 | goto out_fput; |
767 | dentry = file->f_path.dentry; | 767 | dentry = file->f_path.dentry; |
diff --git a/fs/proc/internal.h b/fs/proc/internal.h index f6db9618a888..753ca37002c8 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h | |||
@@ -92,3 +92,28 @@ struct pde_opener { | |||
92 | struct list_head lh; | 92 | struct list_head lh; |
93 | }; | 93 | }; |
94 | void pde_users_dec(struct proc_dir_entry *pde); | 94 | void pde_users_dec(struct proc_dir_entry *pde); |
95 | |||
96 | extern spinlock_t proc_subdir_lock; | ||
97 | |||
98 | struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *); | ||
99 | int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); | ||
100 | unsigned long task_vsize(struct mm_struct *); | ||
101 | int task_statm(struct mm_struct *, int *, int *, int *, int *); | ||
102 | void task_mem(struct seq_file *, struct mm_struct *); | ||
103 | |||
104 | struct proc_dir_entry *de_get(struct proc_dir_entry *de); | ||
105 | void de_put(struct proc_dir_entry *de); | ||
106 | |||
107 | extern struct vfsmount *proc_mnt; | ||
108 | int proc_fill_super(struct super_block *); | ||
109 | struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *); | ||
110 | |||
111 | /* | ||
112 | * These are generic /proc routines that use the internal | ||
113 | * "struct proc_dir_entry" tree to traverse the filesystem. | ||
114 | * | ||
115 | * The /proc root directory has extended versions to take care | ||
116 | * of the /proc/<pid> subdirectories. | ||
117 | */ | ||
118 | int proc_readdir(struct file *, void *, filldir_t); | ||
119 | struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *); | ||
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index de2bba5a3440..fc6c3025befd 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c | |||
@@ -11,6 +11,7 @@ | |||
11 | #include <linux/string.h> | 11 | #include <linux/string.h> |
12 | #include <asm/prom.h> | 12 | #include <asm/prom.h> |
13 | #include <asm/uaccess.h> | 13 | #include <asm/uaccess.h> |
14 | #include "internal.h" | ||
14 | 15 | ||
15 | #ifndef HAVE_ARCH_DEVTREE_FIXUPS | 16 | #ifndef HAVE_ARCH_DEVTREE_FIXUPS |
16 | static inline void set_node_proc_entry(struct device_node *np, | 17 | static inline void set_node_proc_entry(struct device_node *np, |
diff --git a/fs/qnx4/Makefile b/fs/qnx4/Makefile index 502d7fe98bab..e4d408cc5473 100644 --- a/fs/qnx4/Makefile +++ b/fs/qnx4/Makefile | |||
@@ -4,4 +4,4 @@ | |||
4 | 4 | ||
5 | obj-$(CONFIG_QNX4FS_FS) += qnx4.o | 5 | obj-$(CONFIG_QNX4FS_FS) += qnx4.o |
6 | 6 | ||
7 | qnx4-objs := inode.o dir.o namei.o file.o bitmap.o truncate.o fsync.o | 7 | qnx4-objs := inode.o dir.o namei.o file.o bitmap.o truncate.o |
diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c index 8425cf6e9624..e1cd061a25f7 100644 --- a/fs/qnx4/bitmap.c +++ b/fs/qnx4/bitmap.c | |||
@@ -13,14 +13,9 @@ | |||
13 | * 28-06-1998 by Frank Denis : qnx4_free_inode (to be fixed) . | 13 | * 28-06-1998 by Frank Denis : qnx4_free_inode (to be fixed) . |
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/time.h> | ||
17 | #include <linux/fs.h> | ||
18 | #include <linux/qnx4_fs.h> | ||
19 | #include <linux/stat.h> | ||
20 | #include <linux/kernel.h> | ||
21 | #include <linux/string.h> | ||
22 | #include <linux/buffer_head.h> | 16 | #include <linux/buffer_head.h> |
23 | #include <linux/bitops.h> | 17 | #include <linux/bitops.h> |
18 | #include "qnx4.h" | ||
24 | 19 | ||
25 | #if 0 | 20 | #if 0 |
26 | int qnx4_new_block(struct super_block *sb) | 21 | int qnx4_new_block(struct super_block *sb) |
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c index ea9ffefb48ad..003c68f3238b 100644 --- a/fs/qnx4/dir.c +++ b/fs/qnx4/dir.c | |||
@@ -11,14 +11,9 @@ | |||
11 | * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support. | 11 | * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support. |
12 | */ | 12 | */ |
13 | 13 | ||
14 | #include <linux/string.h> | ||
15 | #include <linux/errno.h> | ||
16 | #include <linux/fs.h> | ||
17 | #include <linux/qnx4_fs.h> | ||
18 | #include <linux/stat.h> | ||
19 | #include <linux/smp_lock.h> | 14 | #include <linux/smp_lock.h> |
20 | #include <linux/buffer_head.h> | 15 | #include <linux/buffer_head.h> |
21 | 16 | #include "qnx4.h" | |
22 | 17 | ||
23 | static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir) | 18 | static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir) |
24 | { | 19 | { |
@@ -84,7 +79,7 @@ const struct file_operations qnx4_dir_operations = | |||
84 | { | 79 | { |
85 | .read = generic_read_dir, | 80 | .read = generic_read_dir, |
86 | .readdir = qnx4_readdir, | 81 | .readdir = qnx4_readdir, |
87 | .fsync = file_fsync, | 82 | .fsync = simple_fsync, |
88 | }; | 83 | }; |
89 | 84 | ||
90 | const struct inode_operations qnx4_dir_inode_operations = | 85 | const struct inode_operations qnx4_dir_inode_operations = |
diff --git a/fs/qnx4/file.c b/fs/qnx4/file.c index 867f42b02035..09b170ac936c 100644 --- a/fs/qnx4/file.c +++ b/fs/qnx4/file.c | |||
@@ -12,8 +12,7 @@ | |||
12 | * 27-06-1998 by Frank Denis : file overwriting. | 12 | * 27-06-1998 by Frank Denis : file overwriting. |
13 | */ | 13 | */ |
14 | 14 | ||
15 | #include <linux/fs.h> | 15 | #include "qnx4.h" |
16 | #include <linux/qnx4_fs.h> | ||
17 | 16 | ||
18 | /* | 17 | /* |
19 | * We have mostly NULL's here: the current defaults are ok for | 18 | * We have mostly NULL's here: the current defaults are ok for |
@@ -29,7 +28,7 @@ const struct file_operations qnx4_file_operations = | |||
29 | #ifdef CONFIG_QNX4FS_RW | 28 | #ifdef CONFIG_QNX4FS_RW |
30 | .write = do_sync_write, | 29 | .write = do_sync_write, |
31 | .aio_write = generic_file_aio_write, | 30 | .aio_write = generic_file_aio_write, |
32 | .fsync = qnx4_sync_file, | 31 | .fsync = simple_fsync, |
33 | #endif | 32 | #endif |
34 | }; | 33 | }; |
35 | 34 | ||
diff --git a/fs/qnx4/fsync.c b/fs/qnx4/fsync.c deleted file mode 100644 index aa3b19544bee..000000000000 --- a/fs/qnx4/fsync.c +++ /dev/null | |||
@@ -1,169 +0,0 @@ | |||
1 | /* | ||
2 | * QNX4 file system, Linux implementation. | ||
3 | * | ||
4 | * Version : 0.1 | ||
5 | * | ||
6 | * Using parts of the xiafs filesystem. | ||
7 | * | ||
8 | * History : | ||
9 | * | ||
10 | * 24-03-1998 by Richard Frowijn : first release. | ||
11 | */ | ||
12 | |||
13 | #include <linux/errno.h> | ||
14 | #include <linux/time.h> | ||
15 | #include <linux/stat.h> | ||
16 | #include <linux/fcntl.h> | ||
17 | #include <linux/smp_lock.h> | ||
18 | #include <linux/buffer_head.h> | ||
19 | |||
20 | #include <linux/fs.h> | ||
21 | #include <linux/qnx4_fs.h> | ||
22 | |||
23 | #include <asm/system.h> | ||
24 | |||
25 | /* | ||
26 | * The functions for qnx4 fs file synchronization. | ||
27 | */ | ||
28 | |||
29 | #ifdef CONFIG_QNX4FS_RW | ||
30 | |||
31 | static int sync_block(struct inode *inode, unsigned short *block, int wait) | ||
32 | { | ||
33 | struct buffer_head *bh; | ||
34 | unsigned short tmp; | ||
35 | |||
36 | if (!*block) | ||
37 | return 0; | ||
38 | tmp = *block; | ||
39 | bh = sb_find_get_block(inode->i_sb, *block); | ||
40 | if (!bh) | ||
41 | return 0; | ||
42 | if (*block != tmp) { | ||
43 | brelse(bh); | ||
44 | return 1; | ||
45 | } | ||
46 | if (wait && buffer_req(bh) && !buffer_uptodate(bh)) { | ||
47 | brelse(bh); | ||
48 | return -1; | ||
49 | } | ||
50 | if (wait || !buffer_uptodate(bh) || !buffer_dirty(bh)) { | ||
51 | brelse(bh); | ||
52 | return 0; | ||
53 | } | ||
54 | ll_rw_block(WRITE, 1, &bh); | ||
55 | atomic_dec(&bh->b_count); | ||
56 | return 0; | ||
57 | } | ||
58 | |||
59 | #ifdef WTF | ||
60 | static int sync_iblock(struct inode *inode, unsigned short *iblock, | ||
61 | struct buffer_head **bh, int wait) | ||
62 | { | ||
63 | int rc; | ||
64 | unsigned short tmp; | ||
65 | |||
66 | *bh = NULL; | ||
67 | tmp = *iblock; | ||
68 | if (!tmp) | ||
69 | return 0; | ||
70 | rc = sync_block(inode, iblock, wait); | ||
71 | if (rc) | ||
72 | return rc; | ||
73 | *bh = sb_bread(inode->i_sb, tmp); | ||
74 | if (tmp != *iblock) { | ||
75 | brelse(*bh); | ||
76 | *bh = NULL; | ||
77 | return 1; | ||
78 | } | ||
79 | if (!*bh) | ||
80 | return -1; | ||
81 | return 0; | ||
82 | } | ||
83 | #endif | ||
84 | |||
85 | static int sync_direct(struct inode *inode, int wait) | ||
86 | { | ||
87 | int i; | ||
88 | int rc, err = 0; | ||
89 | |||
90 | for (i = 0; i < 7; i++) { | ||
91 | rc = sync_block(inode, | ||
92 | (unsigned short *) qnx4_raw_inode(inode)->di_first_xtnt.xtnt_blk + i, wait); | ||
93 | if (rc > 0) | ||
94 | break; | ||
95 | if (rc) | ||
96 | err = rc; | ||
97 | } | ||
98 | return err; | ||
99 | } | ||
100 | |||
101 | #ifdef WTF | ||
102 | static int sync_indirect(struct inode *inode, unsigned short *iblock, int wait) | ||
103 | { | ||
104 | int i; | ||
105 | struct buffer_head *ind_bh; | ||
106 | int rc, err = 0; | ||
107 | |||
108 | rc = sync_iblock(inode, iblock, &ind_bh, wait); | ||
109 | if (rc || !ind_bh) | ||
110 | return rc; | ||
111 | |||
112 | for (i = 0; i < 512; i++) { | ||
113 | rc = sync_block(inode, | ||
114 | ((unsigned short *) ind_bh->b_data) + i, | ||
115 | wait); | ||
116 | if (rc > 0) | ||
117 | break; | ||
118 | if (rc) | ||
119 | err = rc; | ||
120 | } | ||
121 | brelse(ind_bh); | ||
122 | return err; | ||
123 | } | ||
124 | |||
125 | static int sync_dindirect(struct inode *inode, unsigned short *diblock, | ||
126 | int wait) | ||
127 | { | ||
128 | int i; | ||
129 | struct buffer_head *dind_bh; | ||
130 | int rc, err = 0; | ||
131 | |||
132 | rc = sync_iblock(inode, diblock, &dind_bh, wait); | ||
133 | if (rc || !dind_bh) | ||
134 | return rc; | ||
135 | |||
136 | for (i = 0; i < 512; i++) { | ||
137 | rc = sync_indirect(inode, | ||
138 | ((unsigned short *) dind_bh->b_data) + i, | ||
139 | wait); | ||
140 | if (rc > 0) | ||
141 | break; | ||
142 | if (rc) | ||
143 | err = rc; | ||
144 | } | ||
145 | brelse(dind_bh); | ||
146 | return err; | ||
147 | } | ||
148 | #endif | ||
149 | |||
150 | int qnx4_sync_file(struct file *file, struct dentry *dentry, int unused) | ||
151 | { | ||
152 | struct inode *inode = dentry->d_inode; | ||
153 | int wait, err = 0; | ||
154 | |||
155 | (void) file; | ||
156 | if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || | ||
157 | S_ISLNK(inode->i_mode))) | ||
158 | return -EINVAL; | ||
159 | |||
160 | lock_kernel(); | ||
161 | for (wait = 0; wait <= 1; wait++) { | ||
162 | err |= sync_direct(inode, wait); | ||
163 | } | ||
164 | err |= qnx4_sync_inode(inode); | ||
165 | unlock_kernel(); | ||
166 | return (err < 0) ? -EIO : 0; | ||
167 | } | ||
168 | |||
169 | #endif | ||
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index fe1f0f31d11c..681df5fcd161 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c | |||
@@ -13,19 +13,15 @@ | |||
13 | */ | 13 | */ |
14 | 14 | ||
15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
16 | #include <linux/types.h> | ||
17 | #include <linux/string.h> | ||
18 | #include <linux/errno.h> | ||
19 | #include <linux/slab.h> | ||
20 | #include <linux/fs.h> | ||
21 | #include <linux/qnx4_fs.h> | ||
22 | #include <linux/init.h> | 16 | #include <linux/init.h> |
17 | #include <linux/slab.h> | ||
23 | #include <linux/highuid.h> | 18 | #include <linux/highuid.h> |
24 | #include <linux/smp_lock.h> | 19 | #include <linux/smp_lock.h> |
25 | #include <linux/pagemap.h> | 20 | #include <linux/pagemap.h> |
26 | #include <linux/buffer_head.h> | 21 | #include <linux/buffer_head.h> |
27 | #include <linux/vfs.h> | 22 | #include <linux/writeback.h> |
28 | #include <asm/uaccess.h> | 23 | #include <linux/statfs.h> |
24 | #include "qnx4.h" | ||
29 | 25 | ||
30 | #define QNX4_VERSION 4 | 26 | #define QNX4_VERSION 4 |
31 | #define QNX4_BMNAME ".bitmap" | 27 | #define QNX4_BMNAME ".bitmap" |
@@ -34,31 +30,6 @@ static const struct super_operations qnx4_sops; | |||
34 | 30 | ||
35 | #ifdef CONFIG_QNX4FS_RW | 31 | #ifdef CONFIG_QNX4FS_RW |
36 | 32 | ||
37 | int qnx4_sync_inode(struct inode *inode) | ||
38 | { | ||
39 | int err = 0; | ||
40 | # if 0 | ||
41 | struct buffer_head *bh; | ||
42 | |||
43 | bh = qnx4_update_inode(inode); | ||
44 | if (bh && buffer_dirty(bh)) | ||
45 | { | ||
46 | sync_dirty_buffer(bh); | ||
47 | if (buffer_req(bh) && !buffer_uptodate(bh)) | ||
48 | { | ||
49 | printk ("IO error syncing qnx4 inode [%s:%08lx]\n", | ||
50 | inode->i_sb->s_id, inode->i_ino); | ||
51 | err = -1; | ||
52 | } | ||
53 | brelse (bh); | ||
54 | } else if (!bh) { | ||
55 | err = -1; | ||
56 | } | ||
57 | # endif | ||
58 | |||
59 | return err; | ||
60 | } | ||
61 | |||
62 | static void qnx4_delete_inode(struct inode *inode) | 33 | static void qnx4_delete_inode(struct inode *inode) |
63 | { | 34 | { |
64 | QNX4DEBUG(("qnx4: deleting inode [%lu]\n", (unsigned long) inode->i_ino)); | 35 | QNX4DEBUG(("qnx4: deleting inode [%lu]\n", (unsigned long) inode->i_ino)); |
@@ -70,15 +41,7 @@ static void qnx4_delete_inode(struct inode *inode) | |||
70 | unlock_kernel(); | 41 | unlock_kernel(); |
71 | } | 42 | } |
72 | 43 | ||
73 | static void qnx4_write_super(struct super_block *sb) | 44 | static int qnx4_write_inode(struct inode *inode, int do_sync) |
74 | { | ||
75 | lock_kernel(); | ||
76 | QNX4DEBUG(("qnx4: write_super\n")); | ||
77 | sb->s_dirt = 0; | ||
78 | unlock_kernel(); | ||
79 | } | ||
80 | |||
81 | static int qnx4_write_inode(struct inode *inode, int unused) | ||
82 | { | 45 | { |
83 | struct qnx4_inode_entry *raw_inode; | 46 | struct qnx4_inode_entry *raw_inode; |
84 | int block, ino; | 47 | int block, ino; |
@@ -115,6 +78,16 @@ static int qnx4_write_inode(struct inode *inode, int unused) | |||
115 | raw_inode->di_ctime = cpu_to_le32(inode->i_ctime.tv_sec); | 78 | raw_inode->di_ctime = cpu_to_le32(inode->i_ctime.tv_sec); |
116 | raw_inode->di_first_xtnt.xtnt_size = cpu_to_le32(inode->i_blocks); | 79 | raw_inode->di_first_xtnt.xtnt_size = cpu_to_le32(inode->i_blocks); |
117 | mark_buffer_dirty(bh); | 80 | mark_buffer_dirty(bh); |
81 | if (do_sync) { | ||
82 | sync_dirty_buffer(bh); | ||
83 | if (buffer_req(bh) && !buffer_uptodate(bh)) { | ||
84 | printk("qnx4: IO error syncing inode [%s:%08x]\n", | ||
85 | inode->i_sb->s_id, ino); | ||
86 | brelse(bh); | ||
87 | unlock_kernel(); | ||
88 | return -EIO; | ||
89 | } | ||
90 | } | ||
118 | brelse(bh); | 91 | brelse(bh); |
119 | unlock_kernel(); | 92 | unlock_kernel(); |
120 | return 0; | 93 | return 0; |
@@ -138,7 +111,6 @@ static const struct super_operations qnx4_sops = | |||
138 | #ifdef CONFIG_QNX4FS_RW | 111 | #ifdef CONFIG_QNX4FS_RW |
139 | .write_inode = qnx4_write_inode, | 112 | .write_inode = qnx4_write_inode, |
140 | .delete_inode = qnx4_delete_inode, | 113 | .delete_inode = qnx4_delete_inode, |
141 | .write_super = qnx4_write_super, | ||
142 | #endif | 114 | #endif |
143 | }; | 115 | }; |
144 | 116 | ||
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c index 775eed3a4085..5972ed214937 100644 --- a/fs/qnx4/namei.c +++ b/fs/qnx4/namei.c | |||
@@ -12,16 +12,9 @@ | |||
12 | * 04-07-1998 by Frank Denis : first step for rmdir/unlink. | 12 | * 04-07-1998 by Frank Denis : first step for rmdir/unlink. |
13 | */ | 13 | */ |
14 | 14 | ||
15 | #include <linux/time.h> | ||
16 | #include <linux/fs.h> | ||
17 | #include <linux/qnx4_fs.h> | ||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/string.h> | ||
20 | #include <linux/stat.h> | ||
21 | #include <linux/fcntl.h> | ||
22 | #include <linux/errno.h> | ||
23 | #include <linux/smp_lock.h> | 15 | #include <linux/smp_lock.h> |
24 | #include <linux/buffer_head.h> | 16 | #include <linux/buffer_head.h> |
17 | #include "qnx4.h" | ||
25 | 18 | ||
26 | 19 | ||
27 | /* | 20 | /* |
@@ -187,7 +180,7 @@ int qnx4_rmdir(struct inode *dir, struct dentry *dentry) | |||
187 | de->di_status = 0; | 180 | de->di_status = 0; |
188 | memset(de->di_fname, 0, sizeof de->di_fname); | 181 | memset(de->di_fname, 0, sizeof de->di_fname); |
189 | de->di_mode = 0; | 182 | de->di_mode = 0; |
190 | mark_buffer_dirty(bh); | 183 | mark_buffer_dirty_inode(bh, dir); |
191 | clear_nlink(inode); | 184 | clear_nlink(inode); |
192 | mark_inode_dirty(inode); | 185 | mark_inode_dirty(inode); |
193 | inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; | 186 | inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; |
@@ -232,7 +225,7 @@ int qnx4_unlink(struct inode *dir, struct dentry *dentry) | |||
232 | de->di_status = 0; | 225 | de->di_status = 0; |
233 | memset(de->di_fname, 0, sizeof de->di_fname); | 226 | memset(de->di_fname, 0, sizeof de->di_fname); |
234 | de->di_mode = 0; | 227 | de->di_mode = 0; |
235 | mark_buffer_dirty(bh); | 228 | mark_buffer_dirty_inode(bh, dir); |
236 | dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; | 229 | dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; |
237 | mark_inode_dirty(dir); | 230 | mark_inode_dirty(dir); |
238 | inode->i_ctime = dir->i_ctime; | 231 | inode->i_ctime = dir->i_ctime; |
diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h new file mode 100644 index 000000000000..9efc089454f6 --- /dev/null +++ b/fs/qnx4/qnx4.h | |||
@@ -0,0 +1,57 @@ | |||
1 | #include <linux/fs.h> | ||
2 | #include <linux/qnx4_fs.h> | ||
3 | |||
4 | #define QNX4_DEBUG 0 | ||
5 | |||
6 | #if QNX4_DEBUG | ||
7 | #define QNX4DEBUG(X) printk X | ||
8 | #else | ||
9 | #define QNX4DEBUG(X) (void) 0 | ||
10 | #endif | ||
11 | |||
12 | struct qnx4_sb_info { | ||
13 | struct buffer_head *sb_buf; /* superblock buffer */ | ||
14 | struct qnx4_super_block *sb; /* our superblock */ | ||
15 | unsigned int Version; /* may be useful */ | ||
16 | struct qnx4_inode_entry *BitMap; /* useful */ | ||
17 | }; | ||
18 | |||
19 | struct qnx4_inode_info { | ||
20 | struct qnx4_inode_entry raw; | ||
21 | loff_t mmu_private; | ||
22 | struct inode vfs_inode; | ||
23 | }; | ||
24 | |||
25 | extern struct inode *qnx4_iget(struct super_block *, unsigned long); | ||
26 | extern struct dentry *qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd); | ||
27 | extern unsigned long qnx4_count_free_blocks(struct super_block *sb); | ||
28 | extern unsigned long qnx4_block_map(struct inode *inode, long iblock); | ||
29 | |||
30 | extern struct buffer_head *qnx4_bread(struct inode *, int, int); | ||
31 | |||
32 | extern const struct inode_operations qnx4_file_inode_operations; | ||
33 | extern const struct inode_operations qnx4_dir_inode_operations; | ||
34 | extern const struct file_operations qnx4_file_operations; | ||
35 | extern const struct file_operations qnx4_dir_operations; | ||
36 | extern int qnx4_is_free(struct super_block *sb, long block); | ||
37 | extern int qnx4_set_bitmap(struct super_block *sb, long block, int busy); | ||
38 | extern int qnx4_create(struct inode *inode, struct dentry *dentry, int mode, struct nameidata *nd); | ||
39 | extern void qnx4_truncate(struct inode *inode); | ||
40 | extern void qnx4_free_inode(struct inode *inode); | ||
41 | extern int qnx4_unlink(struct inode *dir, struct dentry *dentry); | ||
42 | extern int qnx4_rmdir(struct inode *dir, struct dentry *dentry); | ||
43 | |||
44 | static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb) | ||
45 | { | ||
46 | return sb->s_fs_info; | ||
47 | } | ||
48 | |||
49 | static inline struct qnx4_inode_info *qnx4_i(struct inode *inode) | ||
50 | { | ||
51 | return container_of(inode, struct qnx4_inode_info, vfs_inode); | ||
52 | } | ||
53 | |||
54 | static inline struct qnx4_inode_entry *qnx4_raw_inode(struct inode *inode) | ||
55 | { | ||
56 | return &qnx4_i(inode)->raw; | ||
57 | } | ||
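The new fs/qnx4/qnx4.h gathers the declarations every qnx4 .c file previously pulled from scattered <linux/*.h> includes, plus the usual container_of() accessors. Purely as an illustration of how those accessors are used (a generic sketch, not code from the qnx4 sources):

    #include "qnx4.h"

    /* Any code holding the VFS inode can reach the fs-private part of it. */
    static loff_t example_mmu_private(struct inode *inode)
    {
            struct qnx4_inode_info *ei = qnx4_i(inode);   /* container_of() underneath */

            return ei->mmu_private;
    }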
diff --git a/fs/qnx4/truncate.c b/fs/qnx4/truncate.c index 6437c1c3d1dd..d94d9ee241fe 100644 --- a/fs/qnx4/truncate.c +++ b/fs/qnx4/truncate.c | |||
@@ -10,12 +10,8 @@ | |||
10 | * 30-06-1998 by Frank DENIS : ugly filler. | 10 | * 30-06-1998 by Frank DENIS : ugly filler. |
11 | */ | 11 | */ |
12 | 12 | ||
13 | #include <linux/types.h> | ||
14 | #include <linux/errno.h> | ||
15 | #include <linux/fs.h> | ||
16 | #include <linux/qnx4_fs.h> | ||
17 | #include <linux/smp_lock.h> | 13 | #include <linux/smp_lock.h> |
18 | #include <asm/uaccess.h> | 14 | #include "qnx4.h" |
19 | 15 | ||
20 | #ifdef CONFIG_QNX4FS_RW | 16 | #ifdef CONFIG_QNX4FS_RW |
21 | 17 | ||
diff --git a/fs/quota/quota.c b/fs/quota/quota.c index b7f5a468f076..95c5b42384b2 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c | |||
@@ -159,10 +159,14 @@ static int check_quotactl_valid(struct super_block *sb, int type, int cmd, | |||
159 | return error; | 159 | return error; |
160 | } | 160 | } |
161 | 161 | ||
162 | static void quota_sync_sb(struct super_block *sb, int type) | 162 | #ifdef CONFIG_QUOTA |
163 | void sync_quota_sb(struct super_block *sb, int type) | ||
163 | { | 164 | { |
164 | int cnt; | 165 | int cnt; |
165 | 166 | ||
167 | if (!sb->s_qcop->quota_sync) | ||
168 | return; | ||
169 | |||
166 | sb->s_qcop->quota_sync(sb, type); | 170 | sb->s_qcop->quota_sync(sb, type); |
167 | 171 | ||
168 | if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE) | 172 | if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE) |
@@ -191,17 +195,13 @@ static void quota_sync_sb(struct super_block *sb, int type) | |||
191 | } | 195 | } |
192 | mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); | 196 | mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); |
193 | } | 197 | } |
198 | #endif | ||
194 | 199 | ||
195 | void sync_dquots(struct super_block *sb, int type) | 200 | static void sync_dquots(int type) |
196 | { | 201 | { |
202 | struct super_block *sb; | ||
197 | int cnt; | 203 | int cnt; |
198 | 204 | ||
199 | if (sb) { | ||
200 | if (sb->s_qcop->quota_sync) | ||
201 | quota_sync_sb(sb, type); | ||
202 | return; | ||
203 | } | ||
204 | |||
205 | spin_lock(&sb_lock); | 205 | spin_lock(&sb_lock); |
206 | restart: | 206 | restart: |
207 | list_for_each_entry(sb, &super_blocks, s_list) { | 207 | list_for_each_entry(sb, &super_blocks, s_list) { |
@@ -222,8 +222,8 @@ restart: | |||
222 | sb->s_count++; | 222 | sb->s_count++; |
223 | spin_unlock(&sb_lock); | 223 | spin_unlock(&sb_lock); |
224 | down_read(&sb->s_umount); | 224 | down_read(&sb->s_umount); |
225 | if (sb->s_root && sb->s_qcop->quota_sync) | 225 | if (sb->s_root) |
226 | quota_sync_sb(sb, type); | 226 | sync_quota_sb(sb, type); |
227 | up_read(&sb->s_umount); | 227 | up_read(&sb->s_umount); |
228 | spin_lock(&sb_lock); | 228 | spin_lock(&sb_lock); |
229 | if (__put_super_and_need_restart(sb)) | 229 | if (__put_super_and_need_restart(sb)) |
@@ -301,7 +301,10 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, | |||
301 | return sb->s_qcop->set_dqblk(sb, type, id, &idq); | 301 | return sb->s_qcop->set_dqblk(sb, type, id, &idq); |
302 | } | 302 | } |
303 | case Q_SYNC: | 303 | case Q_SYNC: |
304 | sync_dquots(sb, type); | 304 | if (sb) |
305 | sync_quota_sb(sb, type); | ||
306 | else | ||
307 | sync_dquots(type); | ||
305 | return 0; | 308 | return 0; |
306 | 309 | ||
307 | case Q_XQUOTAON: | 310 | case Q_XQUOTAON: |
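In fs/quota/quota.c the old quota_sync_sb()/sync_dquots(sb, type) pair is reshaped: sync_quota_sb() now handles one superblock (and is built whenever CONFIG_QUOTA is set so the new sync code can call it), while the static sync_dquots() walks all superblocks for the device-less Q_SYNC case. Both paths are still reached from userspace through quotactl(); a minimal caller might look like this, where "/dev/sda1" is only an example device:

    #include <sys/quota.h>
    #include <stdio.h>

    int main(void)
    {
            /* Sync the quota files of one mounted filesystem ... */
            if (quotactl(QCMD(Q_SYNC, USRQUOTA), "/dev/sda1", 0, NULL) != 0)
                    perror("quotactl(Q_SYNC, /dev/sda1)");

            /* ... or, with no device given, of every filesystem (sync_dquots path). */
            if (quotactl(QCMD(Q_SYNC, USRQUOTA), NULL, 0, NULL) != 0)
                    perror("quotactl(Q_SYNC, all)");

            return 0;
    }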
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 45ee3d357c70..6d2668fdc384 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c | |||
@@ -44,13 +44,11 @@ static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, | |||
44 | static inline bool is_privroot_deh(struct dentry *dir, | 44 | static inline bool is_privroot_deh(struct dentry *dir, |
45 | struct reiserfs_de_head *deh) | 45 | struct reiserfs_de_head *deh) |
46 | { | 46 | { |
47 | int ret = 0; | ||
48 | #ifdef CONFIG_REISERFS_FS_XATTR | ||
49 | struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root; | 47 | struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root; |
50 | ret = (dir == dir->d_parent && privroot->d_inode && | 48 | if (reiserfs_expose_privroot(dir->d_sb)) |
51 | deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid); | 49 | return 0; |
52 | #endif | 50 | return (dir == dir->d_parent && privroot->d_inode && |
53 | return ret; | 51 | deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid); |
54 | } | 52 | } |
55 | 53 | ||
56 | int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, | 54 | int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, |
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 3567fb9e3fb1..2969773cfc22 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/mount.h> | 28 | #include <linux/mount.h> |
29 | #include <linux/namei.h> | 29 | #include <linux/namei.h> |
30 | #include <linux/crc32.h> | 30 | #include <linux/crc32.h> |
31 | #include <linux/smp_lock.h> | ||
31 | 32 | ||
32 | struct file_system_type reiserfs_fs_type; | 33 | struct file_system_type reiserfs_fs_type; |
33 | 34 | ||
@@ -64,18 +65,15 @@ static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf); | |||
64 | 65 | ||
65 | static int reiserfs_sync_fs(struct super_block *s, int wait) | 66 | static int reiserfs_sync_fs(struct super_block *s, int wait) |
66 | { | 67 | { |
67 | if (!(s->s_flags & MS_RDONLY)) { | 68 | struct reiserfs_transaction_handle th; |
68 | struct reiserfs_transaction_handle th; | 69 | |
69 | reiserfs_write_lock(s); | 70 | reiserfs_write_lock(s); |
70 | if (!journal_begin(&th, s, 1)) | 71 | if (!journal_begin(&th, s, 1)) |
71 | if (!journal_end_sync(&th, s, 1)) | 72 | if (!journal_end_sync(&th, s, 1)) |
72 | reiserfs_flush_old_commits(s); | 73 | reiserfs_flush_old_commits(s); |
73 | s->s_dirt = 0; /* Even if it's not true. | 74 | s->s_dirt = 0; /* Even if it's not true. |
74 | * We'll loop forever in sync_supers otherwise */ | 75 | * We'll loop forever in sync_supers otherwise */ |
75 | reiserfs_write_unlock(s); | 76 | reiserfs_write_unlock(s); |
76 | } else { | ||
77 | s->s_dirt = 0; | ||
78 | } | ||
79 | return 0; | 77 | return 0; |
80 | } | 78 | } |
81 | 79 | ||
@@ -468,6 +466,11 @@ static void reiserfs_put_super(struct super_block *s) | |||
468 | struct reiserfs_transaction_handle th; | 466 | struct reiserfs_transaction_handle th; |
469 | th.t_trans_id = 0; | 467 | th.t_trans_id = 0; |
470 | 468 | ||
469 | lock_kernel(); | ||
470 | |||
471 | if (s->s_dirt) | ||
472 | reiserfs_write_super(s); | ||
473 | |||
471 | /* change file system state to current state if it was mounted with read-write permissions */ | 474 | /* change file system state to current state if it was mounted with read-write permissions */ |
472 | if (!(s->s_flags & MS_RDONLY)) { | 475 | if (!(s->s_flags & MS_RDONLY)) { |
473 | if (!journal_begin(&th, s, 10)) { | 476 | if (!journal_begin(&th, s, 10)) { |
@@ -500,7 +503,7 @@ static void reiserfs_put_super(struct super_block *s) | |||
500 | kfree(s->s_fs_info); | 503 | kfree(s->s_fs_info); |
501 | s->s_fs_info = NULL; | 504 | s->s_fs_info = NULL; |
502 | 505 | ||
503 | return; | 506 | unlock_kernel(); |
504 | } | 507 | } |
505 | 508 | ||
506 | static struct kmem_cache *reiserfs_inode_cachep; | 509 | static struct kmem_cache *reiserfs_inode_cachep; |
@@ -898,6 +901,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin | |||
898 | {"conv",.setmask = 1 << REISERFS_CONVERT}, | 901 | {"conv",.setmask = 1 << REISERFS_CONVERT}, |
899 | {"attrs",.setmask = 1 << REISERFS_ATTRS}, | 902 | {"attrs",.setmask = 1 << REISERFS_ATTRS}, |
900 | {"noattrs",.clrmask = 1 << REISERFS_ATTRS}, | 903 | {"noattrs",.clrmask = 1 << REISERFS_ATTRS}, |
904 | {"expose_privroot", .setmask = 1 << REISERFS_EXPOSE_PRIVROOT}, | ||
901 | #ifdef CONFIG_REISERFS_FS_XATTR | 905 | #ifdef CONFIG_REISERFS_FS_XATTR |
902 | {"user_xattr",.setmask = 1 << REISERFS_XATTRS_USER}, | 906 | {"user_xattr",.setmask = 1 << REISERFS_XATTRS_USER}, |
903 | {"nouser_xattr",.clrmask = 1 << REISERFS_XATTRS_USER}, | 907 | {"nouser_xattr",.clrmask = 1 << REISERFS_XATTRS_USER}, |
@@ -1193,6 +1197,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) | |||
1193 | memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names)); | 1197 | memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names)); |
1194 | #endif | 1198 | #endif |
1195 | 1199 | ||
1200 | lock_kernel(); | ||
1196 | rs = SB_DISK_SUPER_BLOCK(s); | 1201 | rs = SB_DISK_SUPER_BLOCK(s); |
1197 | 1202 | ||
1198 | if (!reiserfs_parse_options | 1203 | if (!reiserfs_parse_options |
@@ -1315,10 +1320,12 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) | |||
1315 | 1320 | ||
1316 | out_ok: | 1321 | out_ok: |
1317 | replace_mount_options(s, new_opts); | 1322 | replace_mount_options(s, new_opts); |
1323 | unlock_kernel(); | ||
1318 | return 0; | 1324 | return 0; |
1319 | 1325 | ||
1320 | out_err: | 1326 | out_err: |
1321 | kfree(new_opts); | 1327 | kfree(new_opts); |
1328 | unlock_kernel(); | ||
1322 | return err; | 1329 | return err; |
1323 | } | 1330 | } |
1324 | 1331 | ||
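Three things happen in the reiserfs hunks above: reiserfs_sync_fs() loses its MS_RDONLY special case because the new sync_filesystem() caller already skips read-only superblocks, put_super and remount take the BKL themselves now that fs/super.c no longer does it for them, and a new "expose_privroot" mount option lets is_privroot_deh() and reiserfs_lookup_privroot() leave the private xattr directory visible. The reiserfs_expose_privroot() helper itself is not part of this diff; presumably it just tests the option bit set by reiserfs_parse_options(), along these lines:

    /* Assumed shape of the helper (it lives in the reiserfs headers, which
     * are not shown here); other reiserfs mount options follow this form. */
    #define reiserfs_expose_privroot(s) \
            (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_EXPOSE_PRIVROOT))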
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 8e7deb0e6964..f3d47d856848 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c | |||
@@ -981,7 +981,8 @@ int reiserfs_lookup_privroot(struct super_block *s) | |||
981 | strlen(PRIVROOT_NAME)); | 981 | strlen(PRIVROOT_NAME)); |
982 | if (!IS_ERR(dentry)) { | 982 | if (!IS_ERR(dentry)) { |
983 | REISERFS_SB(s)->priv_root = dentry; | 983 | REISERFS_SB(s)->priv_root = dentry; |
984 | s->s_root->d_op = &xattr_lookup_poison_ops; | 984 | if (!reiserfs_expose_privroot(s)) |
985 | s->s_root->d_op = &xattr_lookup_poison_ops; | ||
985 | if (dentry->d_inode) | 986 | if (dentry->d_inode) |
986 | dentry->d_inode->i_flags |= S_PRIVATE; | 987 | dentry->d_inode->i_flags |= S_PRIVATE; |
987 | } else | 988 | } else |
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index fc27fbfc5397..1402d2d54f52 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c | |||
@@ -474,6 +474,8 @@ smb_put_super(struct super_block *sb) | |||
474 | { | 474 | { |
475 | struct smb_sb_info *server = SMB_SB(sb); | 475 | struct smb_sb_info *server = SMB_SB(sb); |
476 | 476 | ||
477 | lock_kernel(); | ||
478 | |||
477 | smb_lock_server(server); | 479 | smb_lock_server(server); |
478 | server->state = CONN_INVALID; | 480 | server->state = CONN_INVALID; |
479 | smbiod_unregister_server(server); | 481 | smbiod_unregister_server(server); |
@@ -489,6 +491,8 @@ smb_put_super(struct super_block *sb) | |||
489 | smb_unlock_server(server); | 491 | smb_unlock_server(server); |
490 | put_pid(server->conn_pid); | 492 | put_pid(server->conn_pid); |
491 | kfree(server); | 493 | kfree(server); |
494 | |||
495 | unlock_kernel(); | ||
492 | } | 496 | } |
493 | 497 | ||
494 | static int smb_fill_super(struct super_block *sb, void *raw_data, int silent) | 498 | static int smb_fill_super(struct super_block *sb, void *raw_data, int silent) |
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 0adc624c956f..3b52770f46ff 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c | |||
@@ -338,6 +338,8 @@ static int squashfs_remount(struct super_block *sb, int *flags, char *data) | |||
338 | 338 | ||
339 | static void squashfs_put_super(struct super_block *sb) | 339 | static void squashfs_put_super(struct super_block *sb) |
340 | { | 340 | { |
341 | lock_kernel(); | ||
342 | |||
341 | if (sb->s_fs_info) { | 343 | if (sb->s_fs_info) { |
342 | struct squashfs_sb_info *sbi = sb->s_fs_info; | 344 | struct squashfs_sb_info *sbi = sb->s_fs_info; |
343 | squashfs_cache_delete(sbi->block_cache); | 345 | squashfs_cache_delete(sbi->block_cache); |
@@ -350,6 +352,8 @@ static void squashfs_put_super(struct super_block *sb) | |||
350 | kfree(sb->s_fs_info); | 352 | kfree(sb->s_fs_info); |
351 | sb->s_fs_info = NULL; | 353 | sb->s_fs_info = NULL; |
352 | } | 354 | } |
355 | |||
356 | unlock_kernel(); | ||
353 | } | 357 | } |
354 | 358 | ||
355 | 359 | ||
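The smbfs and squashfs hunks above, like the reiserfs, sysv, udf and ubifs ones elsewhere in this diff, add explicit lock_kernel()/unlock_kernel() around ->put_super(). That compensates for generic_shutdown_super() (see the fs/super.c hunk below) no longer taking the BKL on the filesystem's behalf, so every filesystem that still depends on it now brackets its own teardown. The shape is always the same; a minimal sketch:

    #include <linux/fs.h>
    #include <linux/smp_lock.h>
    #include <linux/slab.h>

    static void example_put_super(struct super_block *sb)
    {
            lock_kernel();          /* the VFS no longer holds the BKL here */

            /* ... release buffers, caches and quota state in sb->s_fs_info ... */

            kfree(sb->s_fs_info);
            sb->s_fs_info = NULL;

            unlock_kernel();
    }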
diff --git a/fs/super.c b/fs/super.c index 1943fdf655fa..83b47416d006 100644 --- a/fs/super.c +++ b/fs/super.c | |||
@@ -28,7 +28,6 @@ | |||
28 | #include <linux/blkdev.h> | 28 | #include <linux/blkdev.h> |
29 | #include <linux/quotaops.h> | 29 | #include <linux/quotaops.h> |
30 | #include <linux/namei.h> | 30 | #include <linux/namei.h> |
31 | #include <linux/buffer_head.h> /* for fsync_super() */ | ||
32 | #include <linux/mount.h> | 31 | #include <linux/mount.h> |
33 | #include <linux/security.h> | 32 | #include <linux/security.h> |
34 | #include <linux/syscalls.h> | 33 | #include <linux/syscalls.h> |
@@ -38,7 +37,6 @@ | |||
38 | #include <linux/kobject.h> | 37 | #include <linux/kobject.h> |
39 | #include <linux/mutex.h> | 38 | #include <linux/mutex.h> |
40 | #include <linux/file.h> | 39 | #include <linux/file.h> |
41 | #include <linux/async.h> | ||
42 | #include <asm/uaccess.h> | 40 | #include <asm/uaccess.h> |
43 | #include "internal.h" | 41 | #include "internal.h" |
44 | 42 | ||
@@ -72,7 +70,6 @@ static struct super_block *alloc_super(struct file_system_type *type) | |||
72 | INIT_HLIST_HEAD(&s->s_anon); | 70 | INIT_HLIST_HEAD(&s->s_anon); |
73 | INIT_LIST_HEAD(&s->s_inodes); | 71 | INIT_LIST_HEAD(&s->s_inodes); |
74 | INIT_LIST_HEAD(&s->s_dentry_lru); | 72 | INIT_LIST_HEAD(&s->s_dentry_lru); |
75 | INIT_LIST_HEAD(&s->s_async_list); | ||
76 | init_rwsem(&s->s_umount); | 73 | init_rwsem(&s->s_umount); |
77 | mutex_init(&s->s_lock); | 74 | mutex_init(&s->s_lock); |
78 | lockdep_set_class(&s->s_umount, &type->s_umount_key); | 75 | lockdep_set_class(&s->s_umount, &type->s_umount_key); |
@@ -285,38 +282,6 @@ void unlock_super(struct super_block * sb) | |||
285 | EXPORT_SYMBOL(lock_super); | 282 | EXPORT_SYMBOL(lock_super); |
286 | EXPORT_SYMBOL(unlock_super); | 283 | EXPORT_SYMBOL(unlock_super); |
287 | 284 | ||
288 | /* | ||
289 | * Write out and wait upon all dirty data associated with this | ||
290 | * superblock. Filesystem data as well as the underlying block | ||
291 | * device. Takes the superblock lock. Requires a second blkdev | ||
292 | * flush by the caller to complete the operation. | ||
293 | */ | ||
294 | void __fsync_super(struct super_block *sb) | ||
295 | { | ||
296 | sync_inodes_sb(sb, 0); | ||
297 | vfs_dq_sync(sb); | ||
298 | lock_super(sb); | ||
299 | if (sb->s_dirt && sb->s_op->write_super) | ||
300 | sb->s_op->write_super(sb); | ||
301 | unlock_super(sb); | ||
302 | if (sb->s_op->sync_fs) | ||
303 | sb->s_op->sync_fs(sb, 1); | ||
304 | sync_blockdev(sb->s_bdev); | ||
305 | sync_inodes_sb(sb, 1); | ||
306 | } | ||
307 | |||
308 | /* | ||
309 | * Write out and wait upon all dirty data associated with this | ||
310 | * superblock. Filesystem data as well as the underlying block | ||
311 | * device. Takes the superblock lock. | ||
312 | */ | ||
313 | int fsync_super(struct super_block *sb) | ||
314 | { | ||
315 | __fsync_super(sb); | ||
316 | return sync_blockdev(sb->s_bdev); | ||
317 | } | ||
318 | EXPORT_SYMBOL_GPL(fsync_super); | ||
319 | |||
320 | /** | 285 | /** |
321 | * generic_shutdown_super - common helper for ->kill_sb() | 286 | * generic_shutdown_super - common helper for ->kill_sb() |
322 | * @sb: superblock to kill | 287 | * @sb: superblock to kill |
@@ -338,21 +303,13 @@ void generic_shutdown_super(struct super_block *sb) | |||
338 | 303 | ||
339 | if (sb->s_root) { | 304 | if (sb->s_root) { |
340 | shrink_dcache_for_umount(sb); | 305 | shrink_dcache_for_umount(sb); |
341 | fsync_super(sb); | 306 | sync_filesystem(sb); |
342 | lock_super(sb); | 307 | get_fs_excl(); |
343 | sb->s_flags &= ~MS_ACTIVE; | 308 | sb->s_flags &= ~MS_ACTIVE; |
344 | 309 | ||
345 | /* | ||
346 | * wait for asynchronous fs operations to finish before going further | ||
347 | */ | ||
348 | async_synchronize_full_domain(&sb->s_async_list); | ||
349 | |||
350 | /* bad name - it should be evict_inodes() */ | 310 | /* bad name - it should be evict_inodes() */ |
351 | invalidate_inodes(sb); | 311 | invalidate_inodes(sb); |
352 | lock_kernel(); | ||
353 | 312 | ||
354 | if (sop->write_super && sb->s_dirt) | ||
355 | sop->write_super(sb); | ||
356 | if (sop->put_super) | 313 | if (sop->put_super) |
357 | sop->put_super(sb); | 314 | sop->put_super(sb); |
358 | 315 | ||
@@ -362,9 +319,7 @@ void generic_shutdown_super(struct super_block *sb) | |||
362 | "Self-destruct in 5 seconds. Have a nice day...\n", | 319 | "Self-destruct in 5 seconds. Have a nice day...\n", |
363 | sb->s_id); | 320 | sb->s_id); |
364 | } | 321 | } |
365 | 322 | put_fs_excl(); | |
366 | unlock_kernel(); | ||
367 | unlock_super(sb); | ||
368 | } | 323 | } |
369 | spin_lock(&sb_lock); | 324 | spin_lock(&sb_lock); |
370 | /* should be initialized for __put_super_and_need_restart() */ | 325 | /* should be initialized for __put_super_and_need_restart() */ |
@@ -441,16 +396,14 @@ void drop_super(struct super_block *sb) | |||
441 | 396 | ||
442 | EXPORT_SYMBOL(drop_super); | 397 | EXPORT_SYMBOL(drop_super); |
443 | 398 | ||
444 | static inline void write_super(struct super_block *sb) | 399 | /** |
445 | { | 400 | * sync_supers - helper for periodic superblock writeback |
446 | lock_super(sb); | 401 | * |
447 | if (sb->s_root && sb->s_dirt) | 402 | * Call the write_super method if present on all dirty superblocks in |
448 | if (sb->s_op->write_super) | 403 | * the system. This is for the periodic writeback used by most older |
449 | sb->s_op->write_super(sb); | 404 | * filesystems. For data integrity superblock writeback use |
450 | unlock_super(sb); | 405 | * sync_filesystems() instead. |
451 | } | 406 | * |
452 | |||
453 | /* | ||
454 | * Note: check the dirty flag before waiting, so we don't | 407 | * Note: check the dirty flag before waiting, so we don't |
455 | * hold up the sync while mounting a device. (The newly | 408 | * hold up the sync while mounting a device. (The newly |
456 | * mounted device won't need syncing.) | 409 | * mounted device won't need syncing.) |
@@ -462,12 +415,15 @@ void sync_supers(void) | |||
462 | spin_lock(&sb_lock); | 415 | spin_lock(&sb_lock); |
463 | restart: | 416 | restart: |
464 | list_for_each_entry(sb, &super_blocks, s_list) { | 417 | list_for_each_entry(sb, &super_blocks, s_list) { |
465 | if (sb->s_dirt) { | 418 | if (sb->s_op->write_super && sb->s_dirt) { |
466 | sb->s_count++; | 419 | sb->s_count++; |
467 | spin_unlock(&sb_lock); | 420 | spin_unlock(&sb_lock); |
421 | |||
468 | down_read(&sb->s_umount); | 422 | down_read(&sb->s_umount); |
469 | write_super(sb); | 423 | if (sb->s_root && sb->s_dirt) |
424 | sb->s_op->write_super(sb); | ||
470 | up_read(&sb->s_umount); | 425 | up_read(&sb->s_umount); |
426 | |||
471 | spin_lock(&sb_lock); | 427 | spin_lock(&sb_lock); |
472 | if (__put_super_and_need_restart(sb)) | 428 | if (__put_super_and_need_restart(sb)) |
473 | goto restart; | 429 | goto restart; |
@@ -476,60 +432,6 @@ restart: | |||
476 | spin_unlock(&sb_lock); | 432 | spin_unlock(&sb_lock); |
477 | } | 433 | } |
478 | 434 | ||
479 | /* | ||
480 | * Call the ->sync_fs super_op against all filesystems which are r/w and | ||
481 | * which implement it. | ||
482 | * | ||
483 | * This operation is careful to avoid the livelock which could easily happen | ||
484 | * if two or more filesystems are being continuously dirtied. s_need_sync_fs | ||
485 | * is used only here. We set it against all filesystems and then clear it as | ||
486 | * we sync them. So redirtied filesystems are skipped. | ||
487 | * | ||
488 | * But if process A is currently running sync_filesystems and then process B | ||
489 | * calls sync_filesystems as well, process B will set all the s_need_sync_fs | ||
490 | * flags again, which will cause process A to resync everything. Fix that with | ||
491 | * a local mutex. | ||
492 | * | ||
493 | * (Fabian) Avoid sync_fs with clean fs & wait mode 0 | ||
494 | */ | ||
495 | void sync_filesystems(int wait) | ||
496 | { | ||
497 | struct super_block *sb; | ||
498 | static DEFINE_MUTEX(mutex); | ||
499 | |||
500 | mutex_lock(&mutex); /* Could be down_interruptible */ | ||
501 | spin_lock(&sb_lock); | ||
502 | list_for_each_entry(sb, &super_blocks, s_list) { | ||
503 | if (!sb->s_op->sync_fs) | ||
504 | continue; | ||
505 | if (sb->s_flags & MS_RDONLY) | ||
506 | continue; | ||
507 | sb->s_need_sync_fs = 1; | ||
508 | } | ||
509 | |||
510 | restart: | ||
511 | list_for_each_entry(sb, &super_blocks, s_list) { | ||
512 | if (!sb->s_need_sync_fs) | ||
513 | continue; | ||
514 | sb->s_need_sync_fs = 0; | ||
515 | if (sb->s_flags & MS_RDONLY) | ||
516 | continue; /* hm. Was remounted r/o meanwhile */ | ||
517 | sb->s_count++; | ||
518 | spin_unlock(&sb_lock); | ||
519 | down_read(&sb->s_umount); | ||
520 | async_synchronize_full_domain(&sb->s_async_list); | ||
521 | if (sb->s_root && (wait || sb->s_dirt)) | ||
522 | sb->s_op->sync_fs(sb, wait); | ||
523 | up_read(&sb->s_umount); | ||
524 | /* restart only when sb is no longer on the list */ | ||
525 | spin_lock(&sb_lock); | ||
526 | if (__put_super_and_need_restart(sb)) | ||
527 | goto restart; | ||
528 | } | ||
529 | spin_unlock(&sb_lock); | ||
530 | mutex_unlock(&mutex); | ||
531 | } | ||
532 | |||
533 | /** | 435 | /** |
534 | * get_super - get the superblock of a device | 436 | * get_super - get the superblock of a device |
535 | * @bdev: device to get the superblock for | 437 | * @bdev: device to get the superblock for |
@@ -616,45 +518,6 @@ out: | |||
616 | } | 518 | } |
617 | 519 | ||
618 | /** | 520 | /** |
619 | * mark_files_ro - mark all files read-only | ||
620 | * @sb: superblock in question | ||
621 | * | ||
622 | * All files are marked read-only. We don't care about pending | ||
623 | * delete files so this should be used in 'force' mode only. | ||
624 | */ | ||
625 | |||
626 | static void mark_files_ro(struct super_block *sb) | ||
627 | { | ||
628 | struct file *f; | ||
629 | |||
630 | retry: | ||
631 | file_list_lock(); | ||
632 | list_for_each_entry(f, &sb->s_files, f_u.fu_list) { | ||
633 | struct vfsmount *mnt; | ||
634 | if (!S_ISREG(f->f_path.dentry->d_inode->i_mode)) | ||
635 | continue; | ||
636 | if (!file_count(f)) | ||
637 | continue; | ||
638 | if (!(f->f_mode & FMODE_WRITE)) | ||
639 | continue; | ||
640 | f->f_mode &= ~FMODE_WRITE; | ||
641 | if (file_check_writeable(f) != 0) | ||
642 | continue; | ||
643 | file_release_write(f); | ||
644 | mnt = mntget(f->f_path.mnt); | ||
645 | file_list_unlock(); | ||
646 | /* | ||
647 | * This can sleep, so we can't hold | ||
648 | * the file_list_lock() spinlock. | ||
649 | */ | ||
650 | mnt_drop_write(mnt); | ||
651 | mntput(mnt); | ||
652 | goto retry; | ||
653 | } | ||
654 | file_list_unlock(); | ||
655 | } | ||
656 | |||
657 | /** | ||
658 | * do_remount_sb - asks filesystem to change mount options. | 521 | * do_remount_sb - asks filesystem to change mount options. |
659 | * @sb: superblock in question | 522 | * @sb: superblock in question |
660 | * @flags: numeric part of options | 523 | * @flags: numeric part of options |
@@ -675,27 +538,31 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) | |||
675 | if (flags & MS_RDONLY) | 538 | if (flags & MS_RDONLY) |
676 | acct_auto_close(sb); | 539 | acct_auto_close(sb); |
677 | shrink_dcache_sb(sb); | 540 | shrink_dcache_sb(sb); |
678 | fsync_super(sb); | 541 | sync_filesystem(sb); |
679 | 542 | ||
680 | /* If we are remounting RDONLY and current sb is read/write, | 543 | /* If we are remounting RDONLY and current sb is read/write, |
681 | make sure there are no rw files opened */ | 544 | make sure there are no rw files opened */ |
682 | if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) { | 545 | if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) { |
683 | if (force) | 546 | if (force) |
684 | mark_files_ro(sb); | 547 | mark_files_ro(sb); |
685 | else if (!fs_may_remount_ro(sb)) | 548 | else if (!fs_may_remount_ro(sb)) { |
549 | unlock_kernel(); | ||
686 | return -EBUSY; | 550 | return -EBUSY; |
551 | } | ||
687 | retval = vfs_dq_off(sb, 1); | 552 | retval = vfs_dq_off(sb, 1); |
688 | if (retval < 0 && retval != -ENOSYS) | 553 | if (retval < 0 && retval != -ENOSYS) { |
554 | unlock_kernel(); | ||
689 | return -EBUSY; | 555 | return -EBUSY; |
556 | } | ||
690 | } | 557 | } |
691 | remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY); | 558 | remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY); |
692 | 559 | ||
693 | if (sb->s_op->remount_fs) { | 560 | if (sb->s_op->remount_fs) { |
694 | lock_super(sb); | ||
695 | retval = sb->s_op->remount_fs(sb, &flags, data); | 561 | retval = sb->s_op->remount_fs(sb, &flags, data); |
696 | unlock_super(sb); | 562 | if (retval) { |
697 | if (retval) | 563 | unlock_kernel(); |
698 | return retval; | 564 | return retval; |
565 | } | ||
699 | } | 566 | } |
700 | sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); | 567 | sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); |
701 | if (remount_rw) | 568 | if (remount_rw) |
@@ -711,18 +578,17 @@ static void do_emergency_remount(struct work_struct *work) | |||
711 | list_for_each_entry(sb, &super_blocks, s_list) { | 578 | list_for_each_entry(sb, &super_blocks, s_list) { |
712 | sb->s_count++; | 579 | sb->s_count++; |
713 | spin_unlock(&sb_lock); | 580 | spin_unlock(&sb_lock); |
714 | down_read(&sb->s_umount); | 581 | down_write(&sb->s_umount); |
715 | if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) { | 582 | if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) { |
716 | /* | 583 | /* |
717 | * ->remount_fs needs lock_kernel(). | 584 | * ->remount_fs needs lock_kernel(). |
718 | * | 585 | * |
719 | * What lock protects sb->s_flags?? | 586 | * What lock protects sb->s_flags?? |
720 | */ | 587 | */ |
721 | lock_kernel(); | ||
722 | do_remount_sb(sb, MS_RDONLY, NULL, 1); | 588 | do_remount_sb(sb, MS_RDONLY, NULL, 1); |
723 | unlock_kernel(); | ||
724 | } | 589 | } |
725 | drop_super(sb); | 590 | up_write(&sb->s_umount); |
591 | put_super(sb); | ||
726 | spin_lock(&sb_lock); | 592 | spin_lock(&sb_lock); |
727 | } | 593 | } |
728 | spin_unlock(&sb_lock); | 594 | spin_unlock(&sb_lock); |
diff --git a/fs/sync.c b/fs/sync.c --- a/fs/sync.c +++ b/fs/sync.c | |||
@@ -13,38 +13,123 @@ | |||
13 | #include <linux/pagemap.h> | 13 | #include <linux/pagemap.h> |
14 | #include <linux/quotaops.h> | 14 | #include <linux/quotaops.h> |
15 | #include <linux/buffer_head.h> | 15 | #include <linux/buffer_head.h> |
16 | #include "internal.h" | ||
16 | 17 | ||
17 | #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ | 18 | #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ |
18 | SYNC_FILE_RANGE_WAIT_AFTER) | 19 | SYNC_FILE_RANGE_WAIT_AFTER) |
19 | 20 | ||
20 | /* | 21 | /* |
21 | * sync everything. Start out by waking pdflush, because that writes back | 22 | * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0) |
22 | * all queues in parallel. | 23 | * just dirties buffers with inodes so we have to submit IO for these buffers |
24 | * via __sync_blockdev(). This also speeds up the wait == 1 case since in that | ||
25 | * case write_inode() functions do sync_dirty_buffer() and thus effectively | ||
26 | * write one block at a time. | ||
23 | */ | 27 | */ |
24 | static void do_sync(unsigned long wait) | 28 | static int __sync_filesystem(struct super_block *sb, int wait) |
25 | { | 29 | { |
26 | wakeup_pdflush(0); | 30 | /* Avoid doing twice syncing and cache pruning for quota sync */ |
27 | sync_inodes(0); /* All mappings, inodes and their blockdevs */ | ||
28 | vfs_dq_sync(NULL); | ||
29 | sync_supers(); /* Write the superblocks */ | ||
30 | sync_filesystems(0); /* Start syncing the filesystems */ | ||
31 | sync_filesystems(wait); /* Waitingly sync the filesystems */ | ||
32 | sync_inodes(wait); /* Mappings, inodes and blockdevs, again. */ | ||
33 | if (!wait) | 31 | if (!wait) |
34 | printk("Emergency Sync complete\n"); | 32 | writeout_quota_sb(sb, -1); |
35 | if (unlikely(laptop_mode)) | 33 | else |
36 | laptop_sync_completion(); | 34 | sync_quota_sb(sb, -1); |
35 | sync_inodes_sb(sb, wait); | ||
36 | if (sb->s_op->sync_fs) | ||
37 | sb->s_op->sync_fs(sb, wait); | ||
38 | return __sync_blockdev(sb->s_bdev, wait); | ||
39 | } | ||
40 | |||
41 | /* | ||
42 | * Write out and wait upon all dirty data associated with this | ||
43 | * superblock. Filesystem data as well as the underlying block | ||
44 | * device. Takes the superblock lock. | ||
45 | */ | ||
46 | int sync_filesystem(struct super_block *sb) | ||
47 | { | ||
48 | int ret; | ||
49 | |||
50 | /* | ||
51 | * We need to be protected against the filesystem going from | ||
52 | * r/o to r/w or vice versa. | ||
53 | */ | ||
54 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); | ||
55 | |||
56 | /* | ||
57 | * No point in syncing out anything if the filesystem is read-only. | ||
58 | */ | ||
59 | if (sb->s_flags & MS_RDONLY) | ||
60 | return 0; | ||
61 | |||
62 | ret = __sync_filesystem(sb, 0); | ||
63 | if (ret < 0) | ||
64 | return ret; | ||
65 | return __sync_filesystem(sb, 1); | ||
66 | } | ||
67 | EXPORT_SYMBOL_GPL(sync_filesystem); | ||
68 | |||
69 | /* | ||
70 | * Sync all the data for all the filesystems (called by sys_sync() and | ||
71 | * emergency sync) | ||
72 | * | ||
73 | * This operation is careful to avoid the livelock which could easily happen | ||
74 | * if two or more filesystems are being continuously dirtied. s_need_sync | ||
75 | * is used only here. We set it against all filesystems and then clear it as | ||
76 | * we sync them. So redirtied filesystems are skipped. | ||
77 | * | ||
78 | * But if process A is currently running sync_filesystems and then process B | ||
79 | * calls sync_filesystems as well, process B will set all the s_need_sync | ||
80 | * flags again, which will cause process A to resync everything. Fix that with | ||
81 | * a local mutex. | ||
82 | */ | ||
83 | static void sync_filesystems(int wait) | ||
84 | { | ||
85 | struct super_block *sb; | ||
86 | static DEFINE_MUTEX(mutex); | ||
87 | |||
88 | mutex_lock(&mutex); /* Could be down_interruptible */ | ||
89 | spin_lock(&sb_lock); | ||
90 | list_for_each_entry(sb, &super_blocks, s_list) | ||
91 | sb->s_need_sync = 1; | ||
92 | |||
93 | restart: | ||
94 | list_for_each_entry(sb, &super_blocks, s_list) { | ||
95 | if (!sb->s_need_sync) | ||
96 | continue; | ||
97 | sb->s_need_sync = 0; | ||
98 | sb->s_count++; | ||
99 | spin_unlock(&sb_lock); | ||
100 | |||
101 | down_read(&sb->s_umount); | ||
102 | if (!(sb->s_flags & MS_RDONLY) && sb->s_root) | ||
103 | __sync_filesystem(sb, wait); | ||
104 | up_read(&sb->s_umount); | ||
105 | |||
106 | /* restart only when sb is no longer on the list */ | ||
107 | spin_lock(&sb_lock); | ||
108 | if (__put_super_and_need_restart(sb)) | ||
109 | goto restart; | ||
110 | } | ||
111 | spin_unlock(&sb_lock); | ||
112 | mutex_unlock(&mutex); | ||
37 | } | 113 | } |
38 | 114 | ||
39 | SYSCALL_DEFINE0(sync) | 115 | SYSCALL_DEFINE0(sync) |
40 | { | 116 | { |
41 | do_sync(1); | 117 | sync_filesystems(0); |
118 | sync_filesystems(1); | ||
119 | if (unlikely(laptop_mode)) | ||
120 | laptop_sync_completion(); | ||
42 | return 0; | 121 | return 0; |
43 | } | 122 | } |
44 | 123 | ||
45 | static void do_sync_work(struct work_struct *work) | 124 | static void do_sync_work(struct work_struct *work) |
46 | { | 125 | { |
47 | do_sync(0); | 126 | /* |
127 | * Sync twice to reduce the possibility we skipped some inodes / pages | ||
128 | * because they were temporarily locked | ||
129 | */ | ||
130 | sync_filesystems(0); | ||
131 | sync_filesystems(0); | ||
132 | printk("Emergency Sync complete\n"); | ||
48 | kfree(work); | 133 | kfree(work); |
49 | } | 134 | } |
50 | 135 | ||
@@ -75,10 +160,8 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync) | |||
75 | 160 | ||
76 | /* sync the superblock to buffers */ | 161 | /* sync the superblock to buffers */ |
77 | sb = inode->i_sb; | 162 | sb = inode->i_sb; |
78 | lock_super(sb); | ||
79 | if (sb->s_dirt && sb->s_op->write_super) | 163 | if (sb->s_dirt && sb->s_op->write_super) |
80 | sb->s_op->write_super(sb); | 164 | sb->s_op->write_super(sb); |
81 | unlock_super(sb); | ||
82 | 165 | ||
83 | /* .. finally sync the buffers to disk */ | 166 | /* .. finally sync the buffers to disk */ |
84 | err = sync_blockdev(sb->s_bdev); | 167 | err = sync_blockdev(sb->s_bdev); |
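The rewritten fs/sync.c above is the core of this series: per-superblock syncing moves out of fs/super.c into sync_filesystem(), which runs a non-waiting __sync_filesystem() pass and then a waiting one, and sys_sync() is reduced to sync_filesystems(0) followed by sync_filesystems(1) over every superblock. Callers such as generic_shutdown_super() and do_remount_sb() in the fs/super.c hunk now simply call sync_filesystem(sb) under s_umount. A sketch of such a caller, hypothetical but mirroring do_remount_sb():

    #include <linux/fs.h>

    /* Sketch: flush a filesystem before changing it, the way do_remount_sb()
     * does in this diff. sync_filesystem() expects s_umount to be held and
     * quietly returns 0 for read-only superblocks. */
    static int example_flush_before_change(struct super_block *sb)
    {
            int err;

            err = sync_filesystem(sb);      /* data, quota, sb, then wait */
            if (err < 0)
                    return err;

            /* ... safe to remount, freeze or tear down the superblock ... */
            return 0;
    }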
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 56f655254bfe..c7798079e644 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c | |||
@@ -24,7 +24,7 @@ static int sysv_readdir(struct file *, void *, filldir_t); | |||
24 | const struct file_operations sysv_dir_operations = { | 24 | const struct file_operations sysv_dir_operations = { |
25 | .read = generic_read_dir, | 25 | .read = generic_read_dir, |
26 | .readdir = sysv_readdir, | 26 | .readdir = sysv_readdir, |
27 | .fsync = sysv_sync_file, | 27 | .fsync = simple_fsync, |
28 | }; | 28 | }; |
29 | 29 | ||
30 | static inline void dir_put_page(struct page *page) | 30 | static inline void dir_put_page(struct page *page) |
diff --git a/fs/sysv/file.c b/fs/sysv/file.c index 589be21d884e..96340c01f4a7 100644 --- a/fs/sysv/file.c +++ b/fs/sysv/file.c | |||
@@ -26,7 +26,7 @@ const struct file_operations sysv_file_operations = { | |||
26 | .write = do_sync_write, | 26 | .write = do_sync_write, |
27 | .aio_write = generic_file_aio_write, | 27 | .aio_write = generic_file_aio_write, |
28 | .mmap = generic_file_mmap, | 28 | .mmap = generic_file_mmap, |
29 | .fsync = sysv_sync_file, | 29 | .fsync = simple_fsync, |
30 | .splice_read = generic_file_splice_read, | 30 | .splice_read = generic_file_splice_read, |
31 | }; | 31 | }; |
32 | 32 | ||
@@ -34,18 +34,3 @@ const struct inode_operations sysv_file_inode_operations = { | |||
34 | .truncate = sysv_truncate, | 34 | .truncate = sysv_truncate, |
35 | .getattr = sysv_getattr, | 35 | .getattr = sysv_getattr, |
36 | }; | 36 | }; |
37 | |||
38 | int sysv_sync_file(struct file * file, struct dentry *dentry, int datasync) | ||
39 | { | ||
40 | struct inode *inode = dentry->d_inode; | ||
41 | int err; | ||
42 | |||
43 | err = sync_mapping_buffers(inode->i_mapping); | ||
44 | if (!(inode->i_state & I_DIRTY)) | ||
45 | return err; | ||
46 | if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) | ||
47 | return err; | ||
48 | |||
49 | err |= sysv_sync_inode(inode); | ||
50 | return err ? -EIO : 0; | ||
51 | } | ||
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index da20b48d350f..479923456a54 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c | |||
@@ -31,15 +31,13 @@ | |||
31 | #include <asm/byteorder.h> | 31 | #include <asm/byteorder.h> |
32 | #include "sysv.h" | 32 | #include "sysv.h" |
33 | 33 | ||
34 | /* This is only called on sync() and umount(), when s_dirt=1. */ | 34 | static int sysv_sync_fs(struct super_block *sb, int wait) |
35 | static void sysv_write_super(struct super_block *sb) | ||
36 | { | 35 | { |
37 | struct sysv_sb_info *sbi = SYSV_SB(sb); | 36 | struct sysv_sb_info *sbi = SYSV_SB(sb); |
38 | unsigned long time = get_seconds(), old_time; | 37 | unsigned long time = get_seconds(), old_time; |
39 | 38 | ||
39 | lock_super(sb); | ||
40 | lock_kernel(); | 40 | lock_kernel(); |
41 | if (sb->s_flags & MS_RDONLY) | ||
42 | goto clean; | ||
43 | 41 | ||
44 | /* | 42 | /* |
45 | * If we are going to write out the super block, | 43 | * If we are going to write out the super block, |
@@ -53,18 +51,30 @@ static void sysv_write_super(struct super_block *sb) | |||
53 | *sbi->s_sb_time = cpu_to_fs32(sbi, time); | 51 | *sbi->s_sb_time = cpu_to_fs32(sbi, time); |
54 | mark_buffer_dirty(sbi->s_bh2); | 52 | mark_buffer_dirty(sbi->s_bh2); |
55 | } | 53 | } |
56 | clean: | 54 | |
57 | sb->s_dirt = 0; | ||
58 | unlock_kernel(); | 55 | unlock_kernel(); |
56 | unlock_super(sb); | ||
57 | |||
58 | return 0; | ||
59 | } | ||
60 | |||
61 | static void sysv_write_super(struct super_block *sb) | ||
62 | { | ||
63 | if (!(sb->s_flags & MS_RDONLY)) | ||
64 | sysv_sync_fs(sb, 1); | ||
65 | else | ||
66 | sb->s_dirt = 0; | ||
59 | } | 67 | } |
60 | 68 | ||
61 | static int sysv_remount(struct super_block *sb, int *flags, char *data) | 69 | static int sysv_remount(struct super_block *sb, int *flags, char *data) |
62 | { | 70 | { |
63 | struct sysv_sb_info *sbi = SYSV_SB(sb); | 71 | struct sysv_sb_info *sbi = SYSV_SB(sb); |
72 | lock_super(sb); | ||
64 | if (sbi->s_forced_ro) | 73 | if (sbi->s_forced_ro) |
65 | *flags |= MS_RDONLY; | 74 | *flags |= MS_RDONLY; |
66 | if (!(*flags & MS_RDONLY)) | 75 | if (!(*flags & MS_RDONLY)) |
67 | sb->s_dirt = 1; | 76 | sb->s_dirt = 1; |
77 | unlock_super(sb); | ||
68 | return 0; | 78 | return 0; |
69 | } | 79 | } |
70 | 80 | ||
@@ -72,6 +82,11 @@ static void sysv_put_super(struct super_block *sb) | |||
72 | { | 82 | { |
73 | struct sysv_sb_info *sbi = SYSV_SB(sb); | 83 | struct sysv_sb_info *sbi = SYSV_SB(sb); |
74 | 84 | ||
85 | lock_kernel(); | ||
86 | |||
87 | if (sb->s_dirt) | ||
88 | sysv_write_super(sb); | ||
89 | |||
75 | if (!(sb->s_flags & MS_RDONLY)) { | 90 | if (!(sb->s_flags & MS_RDONLY)) { |
76 | /* XXX ext2 also updates the state here */ | 91 | /* XXX ext2 also updates the state here */ |
77 | mark_buffer_dirty(sbi->s_bh1); | 92 | mark_buffer_dirty(sbi->s_bh1); |
@@ -84,6 +99,8 @@ static void sysv_put_super(struct super_block *sb) | |||
84 | brelse(sbi->s_bh2); | 99 | brelse(sbi->s_bh2); |
85 | 100 | ||
86 | kfree(sbi); | 101 | kfree(sbi); |
102 | |||
103 | unlock_kernel(); | ||
87 | } | 104 | } |
88 | 105 | ||
89 | static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf) | 106 | static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf) |
@@ -236,7 +253,7 @@ bad_inode: | |||
236 | return ERR_PTR(-EIO); | 253 | return ERR_PTR(-EIO); |
237 | } | 254 | } |
238 | 255 | ||
239 | static struct buffer_head * sysv_update_inode(struct inode * inode) | 256 | int sysv_write_inode(struct inode *inode, int wait) |
240 | { | 257 | { |
241 | struct super_block * sb = inode->i_sb; | 258 | struct super_block * sb = inode->i_sb; |
242 | struct sysv_sb_info * sbi = SYSV_SB(sb); | 259 | struct sysv_sb_info * sbi = SYSV_SB(sb); |
@@ -244,19 +261,21 @@ static struct buffer_head * sysv_update_inode(struct inode * inode) | |||
244 | struct sysv_inode * raw_inode; | 261 | struct sysv_inode * raw_inode; |
245 | struct sysv_inode_info * si; | 262 | struct sysv_inode_info * si; |
246 | unsigned int ino, block; | 263 | unsigned int ino, block; |
264 | int err = 0; | ||
247 | 265 | ||
248 | ino = inode->i_ino; | 266 | ino = inode->i_ino; |
249 | if (!ino || ino > sbi->s_ninodes) { | 267 | if (!ino || ino > sbi->s_ninodes) { |
250 | printk("Bad inode number on dev %s: %d is out of range\n", | 268 | printk("Bad inode number on dev %s: %d is out of range\n", |
251 | inode->i_sb->s_id, ino); | 269 | inode->i_sb->s_id, ino); |
252 | return NULL; | 270 | return -EIO; |
253 | } | 271 | } |
254 | raw_inode = sysv_raw_inode(sb, ino, &bh); | 272 | raw_inode = sysv_raw_inode(sb, ino, &bh); |
255 | if (!raw_inode) { | 273 | if (!raw_inode) { |
256 | printk("unable to read i-node block\n"); | 274 | printk("unable to read i-node block\n"); |
257 | return NULL; | 275 | return -EIO; |
258 | } | 276 | } |
259 | 277 | ||
278 | lock_kernel(); | ||
260 | raw_inode->i_mode = cpu_to_fs16(sbi, inode->i_mode); | 279 | raw_inode->i_mode = cpu_to_fs16(sbi, inode->i_mode); |
261 | raw_inode->i_uid = cpu_to_fs16(sbi, fs_high2lowuid(inode->i_uid)); | 280 | raw_inode->i_uid = cpu_to_fs16(sbi, fs_high2lowuid(inode->i_uid)); |
262 | raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(inode->i_gid)); | 281 | raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(inode->i_gid)); |
@@ -272,38 +291,23 @@ static struct buffer_head * sysv_update_inode(struct inode * inode) | |||
272 | for (block = 0; block < 10+1+1+1; block++) | 291 | for (block = 0; block < 10+1+1+1; block++) |
273 | write3byte(sbi, (u8 *)&si->i_data[block], | 292 | write3byte(sbi, (u8 *)&si->i_data[block], |
274 | &raw_inode->i_data[3*block]); | 293 | &raw_inode->i_data[3*block]); |
294 | unlock_kernel(); | ||
275 | mark_buffer_dirty(bh); | 295 | mark_buffer_dirty(bh); |
276 | return bh; | 296 | if (wait) { |
277 | } | 297 | sync_dirty_buffer(bh); |
278 | 298 | if (buffer_req(bh) && !buffer_uptodate(bh)) { | |
279 | int sysv_write_inode(struct inode * inode, int wait) | 299 | printk ("IO error syncing sysv inode [%s:%08x]\n", |
280 | { | 300 | sb->s_id, ino); |
281 | struct buffer_head *bh; | 301 | err = -EIO; |
282 | lock_kernel(); | 302 | } |
283 | bh = sysv_update_inode(inode); | 303 | } |
284 | brelse(bh); | 304 | brelse(bh); |
285 | unlock_kernel(); | ||
286 | return 0; | 305 | return 0; |
287 | } | 306 | } |
288 | 307 | ||
289 | int sysv_sync_inode(struct inode * inode) | 308 | int sysv_sync_inode(struct inode *inode) |
290 | { | 309 | { |
291 | int err = 0; | 310 | return sysv_write_inode(inode, 1); |
292 | struct buffer_head *bh; | ||
293 | |||
294 | bh = sysv_update_inode(inode); | ||
295 | if (bh && buffer_dirty(bh)) { | ||
296 | sync_dirty_buffer(bh); | ||
297 | if (buffer_req(bh) && !buffer_uptodate(bh)) { | ||
298 | printk ("IO error syncing sysv inode [%s:%08lx]\n", | ||
299 | inode->i_sb->s_id, inode->i_ino); | ||
300 | err = -1; | ||
301 | } | ||
302 | } | ||
303 | else if (!bh) | ||
304 | err = -1; | ||
305 | brelse (bh); | ||
306 | return err; | ||
307 | } | 311 | } |
308 | 312 | ||
309 | static void sysv_delete_inode(struct inode *inode) | 313 | static void sysv_delete_inode(struct inode *inode) |
@@ -347,6 +351,7 @@ const struct super_operations sysv_sops = { | |||
347 | .delete_inode = sysv_delete_inode, | 351 | .delete_inode = sysv_delete_inode, |
348 | .put_super = sysv_put_super, | 352 | .put_super = sysv_put_super, |
349 | .write_super = sysv_write_super, | 353 | .write_super = sysv_write_super, |
354 | .sync_fs = sysv_sync_fs, | ||
350 | .remount_fs = sysv_remount, | 355 | .remount_fs = sysv_remount, |
351 | .statfs = sysv_statfs, | 356 | .statfs = sysv_statfs, |
352 | }; | 357 | }; |
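For sysv the former unconditional write_super path is split in two: sysv_sync_fs() does the actual superblock time update (still under lock_super()/lock_kernel()), and sysv_write_super() becomes a thin wrapper that delegates to it on read-write mounts and merely clears s_dirt otherwise. Several filesystems converted by this series end up with the same wrapper shape, sketched here with example_sync_fs() standing in for the real ->sync_fs() method:

    static int example_sync_fs(struct super_block *sb, int wait)
    {
            /* ... write out the dirty superblock fields, waiting if asked ... */
            return 0;
    }

    static void example_write_super(struct super_block *sb)
    {
            if (!(sb->s_flags & MS_RDONLY))
                    example_sync_fs(sb, 1);   /* periodic writeback waits */
            else
                    sb->s_dirt = 0;           /* nothing to write on a r/o mount */
    }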
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index 5784a318c883..53786eb5cf60 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h | |||
@@ -144,7 +144,6 @@ extern int __sysv_write_begin(struct file *file, struct address_space *mapping, | |||
144 | extern struct inode *sysv_iget(struct super_block *, unsigned int); | 144 | extern struct inode *sysv_iget(struct super_block *, unsigned int); |
145 | extern int sysv_write_inode(struct inode *, int); | 145 | extern int sysv_write_inode(struct inode *, int); |
146 | extern int sysv_sync_inode(struct inode *); | 146 | extern int sysv_sync_inode(struct inode *); |
147 | extern int sysv_sync_file(struct file *, struct dentry *, int); | ||
148 | extern void sysv_set_inode(struct inode *, dev_t); | 147 | extern void sysv_set_inode(struct inode *, dev_t); |
149 | extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *); | 148 | extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *); |
150 | extern int sysv_init_icache(void); | 149 | extern int sysv_init_icache(void); |
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index e9f7a754c4f7..3589eab02a2f 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c | |||
@@ -36,6 +36,7 @@ | |||
36 | #include <linux/mount.h> | 36 | #include <linux/mount.h> |
37 | #include <linux/math64.h> | 37 | #include <linux/math64.h> |
38 | #include <linux/writeback.h> | 38 | #include <linux/writeback.h> |
39 | #include <linux/smp_lock.h> | ||
39 | #include "ubifs.h" | 40 | #include "ubifs.h" |
40 | 41 | ||
41 | /* | 42 | /* |
@@ -447,9 +448,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait) | |||
447 | if (!wait) | 448 | if (!wait) |
448 | return 0; | 449 | return 0; |
449 | 450 | ||
450 | if (sb->s_flags & MS_RDONLY) | ||
451 | return 0; | ||
452 | |||
453 | /* | 451 | /* |
454 | * VFS calls '->sync_fs()' before synchronizing all dirty inodes and | 452 | * VFS calls '->sync_fs()' before synchronizing all dirty inodes and |
455 | * pages, so synchronize them first, then commit the journal. Strictly | 453 | * pages, so synchronize them first, then commit the journal. Strictly |
@@ -1687,6 +1685,9 @@ static void ubifs_put_super(struct super_block *sb) | |||
1687 | 1685 | ||
1688 | ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num, | 1686 | ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num, |
1689 | c->vi.vol_id); | 1687 | c->vi.vol_id); |
1688 | |||
1689 | lock_kernel(); | ||
1690 | |||
1690 | /* | 1691 | /* |
1691 | * The following asserts are only valid if there has not been a failure | 1692 | * The following asserts are only valid if there has not been a failure |
1692 | * of the media. For example, there will be dirty inodes if we failed | 1693 | * of the media. For example, there will be dirty inodes if we failed |
@@ -1753,6 +1754,8 @@ static void ubifs_put_super(struct super_block *sb) | |||
1753 | ubi_close_volume(c->ubi); | 1754 | ubi_close_volume(c->ubi); |
1754 | mutex_unlock(&c->umount_mutex); | 1755 | mutex_unlock(&c->umount_mutex); |
1755 | kfree(c); | 1756 | kfree(c); |
1757 | |||
1758 | unlock_kernel(); | ||
1756 | } | 1759 | } |
1757 | 1760 | ||
1758 | static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) | 1761 | static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) |
@@ -1768,17 +1771,22 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) | |||
1768 | return err; | 1771 | return err; |
1769 | } | 1772 | } |
1770 | 1773 | ||
1774 | lock_kernel(); | ||
1771 | if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { | 1775 | if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { |
1772 | if (c->ro_media) { | 1776 | if (c->ro_media) { |
1773 | ubifs_msg("cannot re-mount due to prior errors"); | 1777 | ubifs_msg("cannot re-mount due to prior errors"); |
1778 | unlock_kernel(); | ||
1774 | return -EROFS; | 1779 | return -EROFS; |
1775 | } | 1780 | } |
1776 | err = ubifs_remount_rw(c); | 1781 | err = ubifs_remount_rw(c); |
1777 | if (err) | 1782 | if (err) { |
1783 | unlock_kernel(); | ||
1778 | return err; | 1784 | return err; |
1785 | } | ||
1779 | } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { | 1786 | } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { |
1780 | if (c->ro_media) { | 1787 | if (c->ro_media) { |
1781 | ubifs_msg("cannot re-mount due to prior errors"); | 1788 | ubifs_msg("cannot re-mount due to prior errors"); |
1789 | unlock_kernel(); | ||
1782 | return -EROFS; | 1790 | return -EROFS; |
1783 | } | 1791 | } |
1784 | ubifs_remount_ro(c); | 1792 | ubifs_remount_ro(c); |
@@ -1793,6 +1801,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) | |||
1793 | } | 1801 | } |
1794 | 1802 | ||
1795 | ubifs_assert(c->lst.taken_empty_lebs > 0); | 1803 | ubifs_assert(c->lst.taken_empty_lebs > 0); |
1804 | unlock_kernel(); | ||
1796 | return 0; | 1805 | return 0; |
1797 | } | 1806 | } |
1798 | 1807 | ||
diff --git a/fs/udf/Makefile b/fs/udf/Makefile index 0d4503f7446d..eb880f66c23a 100644 --- a/fs/udf/Makefile +++ b/fs/udf/Makefile | |||
@@ -5,5 +5,5 @@ | |||
5 | obj-$(CONFIG_UDF_FS) += udf.o | 5 | obj-$(CONFIG_UDF_FS) += udf.o |
6 | 6 | ||
7 | udf-objs := balloc.o dir.o file.o ialloc.o inode.o lowlevel.o namei.o \ | 7 | udf-objs := balloc.o dir.o file.o ialloc.o inode.o lowlevel.o namei.o \ |
8 | partition.o super.o truncate.o symlink.o fsync.o \ | 8 | partition.o super.o truncate.o symlink.o \ |
9 | directory.o misc.o udftime.o unicode.o | 9 | directory.o misc.o udftime.o unicode.o |
diff --git a/fs/udf/dir.c b/fs/udf/dir.c index 2efd4d5291b6..61d9a76a3a69 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c | |||
@@ -210,5 +210,5 @@ const struct file_operations udf_dir_operations = { | |||
210 | .read = generic_read_dir, | 210 | .read = generic_read_dir, |
211 | .readdir = udf_readdir, | 211 | .readdir = udf_readdir, |
212 | .ioctl = udf_ioctl, | 212 | .ioctl = udf_ioctl, |
213 | .fsync = udf_fsync_file, | 213 | .fsync = simple_fsync, |
214 | }; | 214 | }; |
diff --git a/fs/udf/file.c b/fs/udf/file.c index eb91f3b70320..7464305382b5 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c | |||
@@ -209,7 +209,7 @@ const struct file_operations udf_file_operations = { | |||
209 | .write = do_sync_write, | 209 | .write = do_sync_write, |
210 | .aio_write = udf_file_aio_write, | 210 | .aio_write = udf_file_aio_write, |
211 | .release = udf_release_file, | 211 | .release = udf_release_file, |
212 | .fsync = udf_fsync_file, | 212 | .fsync = simple_fsync, |
213 | .splice_read = generic_file_splice_read, | 213 | .splice_read = generic_file_splice_read, |
214 | .llseek = generic_file_llseek, | 214 | .llseek = generic_file_llseek, |
215 | }; | 215 | }; |
diff --git a/fs/udf/fsync.c b/fs/udf/fsync.c deleted file mode 100644 index b2c472b733b8..000000000000 --- a/fs/udf/fsync.c +++ /dev/null | |||
@@ -1,52 +0,0 @@ | |||
1 | /* | ||
2 | * fsync.c | ||
3 | * | ||
4 | * PURPOSE | ||
5 | * Fsync handling routines for the OSTA-UDF(tm) filesystem. | ||
6 | * | ||
7 | * COPYRIGHT | ||
8 | * This file is distributed under the terms of the GNU General Public | ||
9 | * License (GPL). Copies of the GPL can be obtained from: | ||
10 | * ftp://prep.ai.mit.edu/pub/gnu/GPL | ||
11 | * Each contributing author retains all rights to their own work. | ||
12 | * | ||
13 | * (C) 1999-2001 Ben Fennema | ||
14 | * (C) 1999-2000 Stelias Computing Inc | ||
15 | * | ||
16 | * HISTORY | ||
17 | * | ||
18 | * 05/22/99 blf Created. | ||
19 | */ | ||
20 | |||
21 | #include "udfdecl.h" | ||
22 | |||
23 | #include <linux/fs.h> | ||
24 | |||
25 | static int udf_fsync_inode(struct inode *, int); | ||
26 | |||
27 | /* | ||
28 | * File may be NULL when we are called. Perhaps we shouldn't | ||
29 | * even pass file to fsync ? | ||
30 | */ | ||
31 | |||
32 | int udf_fsync_file(struct file *file, struct dentry *dentry, int datasync) | ||
33 | { | ||
34 | struct inode *inode = dentry->d_inode; | ||
35 | |||
36 | return udf_fsync_inode(inode, datasync); | ||
37 | } | ||
38 | |||
39 | static int udf_fsync_inode(struct inode *inode, int datasync) | ||
40 | { | ||
41 | int err; | ||
42 | |||
43 | err = sync_mapping_buffers(inode->i_mapping); | ||
44 | if (!(inode->i_state & I_DIRTY)) | ||
45 | return err; | ||
46 | if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) | ||
47 | return err; | ||
48 | |||
49 | err |= udf_sync_inode(inode); | ||
50 | |||
51 | return err ? -EIO : 0; | ||
52 | } | ||
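udf_fsync_file above, like sysv_sync_file and ufs_sync_file removed elsewhere in this diff, was one of several identical copies of the same few lines, and the file_operations now point at a generic simple_fsync() instead. Judging from the deleted code, the shared pattern is: flush the metadata buffers attached to the mapping, then write the inode itself only if it is dirty (or dirty in a way fdatasync cares about). A reconstruction of that pattern follows; the sync_inode() call is an assumption about how the generic helper writes the inode, since simple_fsync() itself is not shown in this hunk.

    #include <linux/fs.h>
    #include <linux/buffer_head.h>
    #include <linux/writeback.h>

    /* Sketch of a simple_fsync()-style helper, reconstructed from the
     * udf/sysv/ufs routines deleted in this diff. */
    static int example_fsync(struct file *file, struct dentry *dentry, int datasync)
    {
            struct writeback_control wbc = {
                    .sync_mode = WB_SYNC_ALL,
            };
            struct inode *inode = dentry->d_inode;
            int err, ret;

            ret = sync_mapping_buffers(inode->i_mapping);
            if (!(inode->i_state & I_DIRTY))
                    return ret;
            if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
                    return ret;

            err = sync_inode(inode, &wbc);  /* assumed generic inode writeback */
            if (ret == 0)
                    ret = err;
            return ret;
    }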
diff --git a/fs/udf/super.c b/fs/udf/super.c index 0ba44107d8f1..6832135159b6 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c | |||
@@ -568,6 +568,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) | |||
568 | if (!udf_parse_options(options, &uopt, true)) | 568 | if (!udf_parse_options(options, &uopt, true)) |
569 | return -EINVAL; | 569 | return -EINVAL; |
570 | 570 | ||
571 | lock_kernel(); | ||
571 | sbi->s_flags = uopt.flags; | 572 | sbi->s_flags = uopt.flags; |
572 | sbi->s_uid = uopt.uid; | 573 | sbi->s_uid = uopt.uid; |
573 | sbi->s_gid = uopt.gid; | 574 | sbi->s_gid = uopt.gid; |
@@ -581,13 +582,16 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) | |||
581 | *flags |= MS_RDONLY; | 582 | *flags |= MS_RDONLY; |
582 | } | 583 | } |
583 | 584 | ||
584 | if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) | 585 | if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { |
586 | unlock_kernel(); | ||
585 | return 0; | 587 | return 0; |
588 | } | ||
586 | if (*flags & MS_RDONLY) | 589 | if (*flags & MS_RDONLY) |
587 | udf_close_lvid(sb); | 590 | udf_close_lvid(sb); |
588 | else | 591 | else |
589 | udf_open_lvid(sb); | 592 | udf_open_lvid(sb); |
590 | 593 | ||
594 | unlock_kernel(); | ||
591 | return 0; | 595 | return 0; |
592 | } | 596 | } |
593 | 597 | ||
@@ -2062,6 +2066,9 @@ static void udf_put_super(struct super_block *sb) | |||
2062 | struct udf_sb_info *sbi; | 2066 | struct udf_sb_info *sbi; |
2063 | 2067 | ||
2064 | sbi = UDF_SB(sb); | 2068 | sbi = UDF_SB(sb); |
2069 | |||
2070 | lock_kernel(); | ||
2071 | |||
2065 | if (sbi->s_vat_inode) | 2072 | if (sbi->s_vat_inode) |
2066 | iput(sbi->s_vat_inode); | 2073 | iput(sbi->s_vat_inode); |
2067 | if (sbi->s_partitions) | 2074 | if (sbi->s_partitions) |
@@ -2077,6 +2084,8 @@ static void udf_put_super(struct super_block *sb) | |||
2077 | kfree(sbi->s_partmaps); | 2084 | kfree(sbi->s_partmaps); |
2078 | kfree(sb->s_fs_info); | 2085 | kfree(sb->s_fs_info); |
2079 | sb->s_fs_info = NULL; | 2086 | sb->s_fs_info = NULL; |
2087 | |||
2088 | unlock_kernel(); | ||
2080 | } | 2089 | } |
2081 | 2090 | ||
2082 | static int udf_sync_fs(struct super_block *sb, int wait) | 2091 | static int udf_sync_fs(struct super_block *sb, int wait) |
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index cac51b77a5d1..8d46f4294ee7 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h | |||
@@ -223,9 +223,6 @@ extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t, | |||
223 | extern int udf_new_block(struct super_block *, struct inode *, uint16_t, | 223 | extern int udf_new_block(struct super_block *, struct inode *, uint16_t, |
224 | uint32_t, int *); | 224 | uint32_t, int *); |
225 | 225 | ||
226 | /* fsync.c */ | ||
227 | extern int udf_fsync_file(struct file *, struct dentry *, int); | ||
228 | |||
229 | /* directory.c */ | 226 | /* directory.c */ |
230 | extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *, | 227 | extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *, |
231 | struct udf_fileident_bh *, | 228 | struct udf_fileident_bh *, |
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 6321b797061b..6f671f1ac271 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c | |||
@@ -666,6 +666,6 @@ not_empty: | |||
666 | const struct file_operations ufs_dir_operations = { | 666 | const struct file_operations ufs_dir_operations = { |
667 | .read = generic_read_dir, | 667 | .read = generic_read_dir, |
668 | .readdir = ufs_readdir, | 668 | .readdir = ufs_readdir, |
669 | .fsync = ufs_sync_file, | 669 | .fsync = simple_fsync, |
670 | .llseek = generic_file_llseek, | 670 | .llseek = generic_file_llseek, |
671 | }; | 671 | }; |
diff --git a/fs/ufs/file.c b/fs/ufs/file.c index 2bd3a1615714..73655c61240a 100644 --- a/fs/ufs/file.c +++ b/fs/ufs/file.c | |||
@@ -24,31 +24,10 @@ | |||
24 | */ | 24 | */ |
25 | 25 | ||
26 | #include <linux/fs.h> | 26 | #include <linux/fs.h> |
27 | #include <linux/buffer_head.h> /* for sync_mapping_buffers() */ | ||
28 | 27 | ||
29 | #include "ufs_fs.h" | 28 | #include "ufs_fs.h" |
30 | #include "ufs.h" | 29 | #include "ufs.h" |
31 | 30 | ||
32 | |||
33 | int ufs_sync_file(struct file *file, struct dentry *dentry, int datasync) | ||
34 | { | ||
35 | struct inode *inode = dentry->d_inode; | ||
36 | int err; | ||
37 | int ret; | ||
38 | |||
39 | ret = sync_mapping_buffers(inode->i_mapping); | ||
40 | if (!(inode->i_state & I_DIRTY)) | ||
41 | return ret; | ||
42 | if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) | ||
43 | return ret; | ||
44 | |||
45 | err = ufs_sync_inode(inode); | ||
46 | if (ret == 0) | ||
47 | ret = err; | ||
48 | return ret; | ||
49 | } | ||
50 | |||
51 | |||
52 | /* | 31 | /* |
53 | * We have mostly NULL's here: the current defaults are ok for | 32 | * We have mostly NULL's here: the current defaults are ok for |
54 | * the ufs filesystem. | 33 | * the ufs filesystem. |
@@ -62,6 +41,6 @@ const struct file_operations ufs_file_operations = { | |||
62 | .aio_write = generic_file_aio_write, | 41 | .aio_write = generic_file_aio_write, |
63 | .mmap = generic_file_mmap, | 42 | .mmap = generic_file_mmap, |
64 | .open = generic_file_open, | 43 | .open = generic_file_open, |
65 | .fsync = ufs_sync_file, | 44 | .fsync = simple_fsync, |
66 | .splice_read = generic_file_splice_read, | 45 | .splice_read = generic_file_splice_read, |
67 | }; | 46 | }; |
diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 60359291761f..5faed7954d0a 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c | |||
@@ -263,6 +263,7 @@ void ufs_panic (struct super_block * sb, const char * function, | |||
263 | struct ufs_super_block_first * usb1; | 263 | struct ufs_super_block_first * usb1; |
264 | va_list args; | 264 | va_list args; |
265 | 265 | ||
266 | lock_kernel(); | ||
266 | uspi = UFS_SB(sb)->s_uspi; | 267 | uspi = UFS_SB(sb)->s_uspi; |
267 | usb1 = ubh_get_usb_first(uspi); | 268 | usb1 = ubh_get_usb_first(uspi); |
268 | 269 | ||
@@ -594,6 +595,9 @@ static void ufs_put_super_internal(struct super_block *sb) | |||
594 | 595 | ||
595 | 596 | ||
596 | UFSD("ENTER\n"); | 597 | UFSD("ENTER\n"); |
598 | |||
599 | lock_kernel(); | ||
600 | |||
597 | ufs_put_cstotal(sb); | 601 | ufs_put_cstotal(sb); |
598 | size = uspi->s_cssize; | 602 | size = uspi->s_cssize; |
599 | blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; | 603 | blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; |
@@ -621,6 +625,9 @@ static void ufs_put_super_internal(struct super_block *sb) | |||
621 | brelse (sbi->s_ucg[i]); | 625 | brelse (sbi->s_ucg[i]); |
622 | kfree (sbi->s_ucg); | 626 | kfree (sbi->s_ucg); |
623 | kfree (base); | 627 | kfree (base); |
628 | |||
629 | unlock_kernel(); | ||
630 | |||
624 | UFSD("EXIT\n"); | 631 | UFSD("EXIT\n"); |
625 | } | 632 | } |
626 | 633 | ||
@@ -1118,32 +1125,45 @@ failed_nomem: | |||
1118 | return -ENOMEM; | 1125 | return -ENOMEM; |
1119 | } | 1126 | } |
1120 | 1127 | ||
1121 | static void ufs_write_super(struct super_block *sb) | 1128 | static int ufs_sync_fs(struct super_block *sb, int wait) |
1122 | { | 1129 | { |
1123 | struct ufs_sb_private_info * uspi; | 1130 | struct ufs_sb_private_info * uspi; |
1124 | struct ufs_super_block_first * usb1; | 1131 | struct ufs_super_block_first * usb1; |
1125 | struct ufs_super_block_third * usb3; | 1132 | struct ufs_super_block_third * usb3; |
1126 | unsigned flags; | 1133 | unsigned flags; |
1127 | 1134 | ||
1135 | lock_super(sb); | ||
1128 | lock_kernel(); | 1136 | lock_kernel(); |
1137 | |||
1129 | UFSD("ENTER\n"); | 1138 | UFSD("ENTER\n"); |
1139 | |||
1130 | flags = UFS_SB(sb)->s_flags; | 1140 | flags = UFS_SB(sb)->s_flags; |
1131 | uspi = UFS_SB(sb)->s_uspi; | 1141 | uspi = UFS_SB(sb)->s_uspi; |
1132 | usb1 = ubh_get_usb_first(uspi); | 1142 | usb1 = ubh_get_usb_first(uspi); |
1133 | usb3 = ubh_get_usb_third(uspi); | 1143 | usb3 = ubh_get_usb_third(uspi); |
1134 | 1144 | ||
1135 | if (!(sb->s_flags & MS_RDONLY)) { | 1145 | usb1->fs_time = cpu_to_fs32(sb, get_seconds()); |
1136 | usb1->fs_time = cpu_to_fs32(sb, get_seconds()); | 1146 | if ((flags & UFS_ST_MASK) == UFS_ST_SUN || |
1137 | if ((flags & UFS_ST_MASK) == UFS_ST_SUN | 1147 | (flags & UFS_ST_MASK) == UFS_ST_SUNOS || |
1138 | || (flags & UFS_ST_MASK) == UFS_ST_SUNOS | 1148 | (flags & UFS_ST_MASK) == UFS_ST_SUNx86) |
1139 | || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) | 1149 | ufs_set_fs_state(sb, usb1, usb3, |
1140 | ufs_set_fs_state(sb, usb1, usb3, | 1150 | UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); |
1141 | UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); | 1151 | ufs_put_cstotal(sb); |
1142 | ufs_put_cstotal(sb); | ||
1143 | } | ||
1144 | sb->s_dirt = 0; | 1152 | sb->s_dirt = 0; |
1153 | |||
1145 | UFSD("EXIT\n"); | 1154 | UFSD("EXIT\n"); |
1146 | unlock_kernel(); | 1155 | unlock_kernel(); |
1156 | unlock_super(sb); | ||
1157 | |||
1158 | return 0; | ||
1159 | } | ||
1160 | |||
1161 | static void ufs_write_super(struct super_block *sb) | ||
1162 | { | ||
1163 | if (!(sb->s_flags & MS_RDONLY)) | ||
1164 | ufs_sync_fs(sb, 1); | ||
1165 | else | ||
1166 | sb->s_dirt = 0; | ||
1147 | } | 1167 | } |
1148 | 1168 | ||
1149 | static void ufs_put_super(struct super_block *sb) | 1169 | static void ufs_put_super(struct super_block *sb) |
@@ -1152,6 +1172,9 @@ static void ufs_put_super(struct super_block *sb) | |||
1152 | 1172 | ||
1153 | UFSD("ENTER\n"); | 1173 | UFSD("ENTER\n"); |
1154 | 1174 | ||
1175 | if (sb->s_dirt) | ||
1176 | ufs_write_super(sb); | ||
1177 | |||
1155 | if (!(sb->s_flags & MS_RDONLY)) | 1178 | if (!(sb->s_flags & MS_RDONLY)) |
1156 | ufs_put_super_internal(sb); | 1179 | ufs_put_super_internal(sb); |
1157 | 1180 | ||
@@ -1171,7 +1194,9 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) | |||
1171 | struct ufs_super_block_third * usb3; | 1194 | struct ufs_super_block_third * usb3; |
1172 | unsigned new_mount_opt, ufstype; | 1195 | unsigned new_mount_opt, ufstype; |
1173 | unsigned flags; | 1196 | unsigned flags; |
1174 | 1197 | ||
1198 | lock_kernel(); | ||
1199 | lock_super(sb); | ||
1175 | uspi = UFS_SB(sb)->s_uspi; | 1200 | uspi = UFS_SB(sb)->s_uspi; |
1176 | flags = UFS_SB(sb)->s_flags; | 1201 | flags = UFS_SB(sb)->s_flags; |
1177 | usb1 = ubh_get_usb_first(uspi); | 1202 | usb1 = ubh_get_usb_first(uspi); |
@@ -1184,17 +1209,24 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) | |||
1184 | ufstype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE; | 1209 | ufstype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE; |
1185 | new_mount_opt = 0; | 1210 | new_mount_opt = 0; |
1186 | ufs_set_opt (new_mount_opt, ONERROR_LOCK); | 1211 | ufs_set_opt (new_mount_opt, ONERROR_LOCK); |
1187 | if (!ufs_parse_options (data, &new_mount_opt)) | 1212 | if (!ufs_parse_options (data, &new_mount_opt)) { |
1213 | unlock_super(sb); | ||
1214 | unlock_kernel(); | ||
1188 | return -EINVAL; | 1215 | return -EINVAL; |
1216 | } | ||
1189 | if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { | 1217 | if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { |
1190 | new_mount_opt |= ufstype; | 1218 | new_mount_opt |= ufstype; |
1191 | } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { | 1219 | } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { |
1192 | printk("ufstype can't be changed during remount\n"); | 1220 | printk("ufstype can't be changed during remount\n"); |
1221 | unlock_super(sb); | ||
1222 | unlock_kernel(); | ||
1193 | return -EINVAL; | 1223 | return -EINVAL; |
1194 | } | 1224 | } |
1195 | 1225 | ||
1196 | if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { | 1226 | if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { |
1197 | UFS_SB(sb)->s_mount_opt = new_mount_opt; | 1227 | UFS_SB(sb)->s_mount_opt = new_mount_opt; |
1228 | unlock_super(sb); | ||
1229 | unlock_kernel(); | ||
1198 | return 0; | 1230 | return 0; |
1199 | } | 1231 | } |
1200 | 1232 | ||
@@ -1219,6 +1251,8 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) | |||
1219 | #ifndef CONFIG_UFS_FS_WRITE | 1251 | #ifndef CONFIG_UFS_FS_WRITE |
1220 | printk("ufs was compiled with read-only support, " | 1252 | printk("ufs was compiled with read-only support, " |
1221 | "can't be mounted as read-write\n"); | 1253 | "can't be mounted as read-write\n"); |
1254 | unlock_super(sb); | ||
1255 | unlock_kernel(); | ||
1222 | return -EINVAL; | 1256 | return -EINVAL; |
1223 | #else | 1257 | #else |
1224 | if (ufstype != UFS_MOUNT_UFSTYPE_SUN && | 1258 | if (ufstype != UFS_MOUNT_UFSTYPE_SUN && |
@@ -1227,16 +1261,22 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) | |||
1227 | ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && | 1261 | ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && |
1228 | ufstype != UFS_MOUNT_UFSTYPE_UFS2) { | 1262 | ufstype != UFS_MOUNT_UFSTYPE_UFS2) { |
1229 | printk("this ufstype is read-only supported\n"); | 1263 | printk("this ufstype is read-only supported\n"); |
1264 | unlock_super(sb); | ||
1265 | unlock_kernel(); | ||
1230 | return -EINVAL; | 1266 | return -EINVAL; |
1231 | } | 1267 | } |
1232 | if (!ufs_read_cylinder_structures(sb)) { | 1268 | if (!ufs_read_cylinder_structures(sb)) { |
1233 | printk("failed during remounting\n"); | 1269 | printk("failed during remounting\n"); |
1270 | unlock_super(sb); | ||
1271 | unlock_kernel(); | ||
1234 | return -EPERM; | 1272 | return -EPERM; |
1235 | } | 1273 | } |
1236 | sb->s_flags &= ~MS_RDONLY; | 1274 | sb->s_flags &= ~MS_RDONLY; |
1237 | #endif | 1275 | #endif |
1238 | } | 1276 | } |
1239 | UFS_SB(sb)->s_mount_opt = new_mount_opt; | 1277 | UFS_SB(sb)->s_mount_opt = new_mount_opt; |
1278 | unlock_super(sb); | ||
1279 | unlock_kernel(); | ||
1240 | return 0; | 1280 | return 0; |
1241 | } | 1281 | } |
1242 | 1282 | ||
@@ -1352,6 +1392,7 @@ static const struct super_operations ufs_super_ops = { | |||
1352 | .delete_inode = ufs_delete_inode, | 1392 | .delete_inode = ufs_delete_inode, |
1353 | .put_super = ufs_put_super, | 1393 | .put_super = ufs_put_super, |
1354 | .write_super = ufs_write_super, | 1394 | .write_super = ufs_write_super, |
1395 | .sync_fs = ufs_sync_fs, | ||
1355 | .statfs = ufs_statfs, | 1396 | .statfs = ufs_statfs, |
1356 | .remount_fs = ufs_remount, | 1397 | .remount_fs = ufs_remount, |
1357 | .show_options = ufs_show_options, | 1398 | .show_options = ufs_show_options, |
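The ufs hunks above illustrate the superblock write-out refactor applied across this series: the real work moves into a ->sync_fs() method, and ->write_super() (driven by the periodic s_dirt flush) becomes a thin wrapper around it. A generic sketch of that wiring, with hypothetical examplefs_* names used purely for illustration:

#include <linux/fs.h>

/* Hypothetical examplefs_*: shows only the ->write_super()/->sync_fs()
 * split; a real filesystem writes its dirty superblock and summary
 * information where the comment indicates. */
static int examplefs_sync_fs(struct super_block *sb, int wait)
{
	/* write back dirty on-disk superblock state here */
	sb->s_dirt = 0;
	return 0;
}

static void examplefs_write_super(struct super_block *sb)
{
	if (!(sb->s_flags & MS_RDONLY))
		examplefs_sync_fs(sb, 1);
	else
		sb->s_dirt = 0;
}

static const struct super_operations examplefs_super_ops = {
	.write_super	= examplefs_write_super,
	.sync_fs	= examplefs_sync_fs,
	/* remaining methods omitted */
};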
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index d0c4acd4f1f3..644e77e13599 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h | |||
@@ -99,7 +99,6 @@ extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, | |||
99 | extern const struct inode_operations ufs_file_inode_operations; | 99 | extern const struct inode_operations ufs_file_inode_operations; |
100 | extern const struct file_operations ufs_file_operations; | 100 | extern const struct file_operations ufs_file_operations; |
101 | extern const struct address_space_operations ufs_aops; | 101 | extern const struct address_space_operations ufs_aops; |
102 | extern int ufs_sync_file(struct file *, struct dentry *, int); | ||
103 | 102 | ||
104 | /* ialloc.c */ | 103 | /* ialloc.c */ |
105 | extern void ufs_free_inode (struct inode *inode); | 104 | extern void ufs_free_inode (struct inode *inode); |
diff --git a/fs/xattr.c b/fs/xattr.c index d51b8f9db921..1c3d0af59ddf 100644 --- a/fs/xattr.c +++ b/fs/xattr.c | |||
@@ -297,7 +297,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, | |||
297 | return error; | 297 | return error; |
298 | dentry = f->f_path.dentry; | 298 | dentry = f->f_path.dentry; |
299 | audit_inode(NULL, dentry); | 299 | audit_inode(NULL, dentry); |
300 | error = mnt_want_write(f->f_path.mnt); | 300 | error = mnt_want_write_file(f); |
301 | if (!error) { | 301 | if (!error) { |
302 | error = setxattr(dentry, name, value, size, flags); | 302 | error = setxattr(dentry, name, value, size, flags); |
303 | mnt_drop_write(f->f_path.mnt); | 303 | mnt_drop_write(f->f_path.mnt); |
@@ -524,7 +524,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) | |||
524 | return error; | 524 | return error; |
525 | dentry = f->f_path.dentry; | 525 | dentry = f->f_path.dentry; |
526 | audit_inode(NULL, dentry); | 526 | audit_inode(NULL, dentry); |
527 | error = mnt_want_write(f->f_path.mnt); | 527 | error = mnt_want_write_file(f); |
528 | if (!error) { | 528 | if (!error) { |
529 | error = removexattr(dentry, name); | 529 | error = removexattr(dentry, name); |
530 | mnt_drop_write(f->f_path.mnt); | 530 | mnt_drop_write(f->f_path.mnt); |
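The two xattr hunks above replace mnt_want_write() on the file's vfsmount with the mnt_want_write_file() helper, which takes the already-open struct file instead. A minimal sketch of such a helper under the assumption that it is a plain wrapper; the in-tree version may additionally optimise files that were opened for write:

#include <linux/fs.h>
#include <linux/mount.h>

/* Assumption: thin wrapper only, deriving the vfsmount from the open
 * file and taking the write reference against it. */
int mnt_want_write_file(struct file *file)
{
	return mnt_want_write(file->f_path.mnt);
}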
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index bb685269f832..08d6bd9a3947 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c | |||
@@ -1104,15 +1104,6 @@ xfs_fs_put_super( | |||
1104 | kfree(mp); | 1104 | kfree(mp); |
1105 | } | 1105 | } |
1106 | 1106 | ||
1107 | STATIC void | ||
1108 | xfs_fs_write_super( | ||
1109 | struct super_block *sb) | ||
1110 | { | ||
1111 | if (!(sb->s_flags & MS_RDONLY)) | ||
1112 | xfs_sync_fsdata(XFS_M(sb), 0); | ||
1113 | sb->s_dirt = 0; | ||
1114 | } | ||
1115 | |||
1116 | STATIC int | 1107 | STATIC int |
1117 | xfs_fs_sync_super( | 1108 | xfs_fs_sync_super( |
1118 | struct super_block *sb, | 1109 | struct super_block *sb, |
@@ -1137,7 +1128,6 @@ xfs_fs_sync_super( | |||
1137 | error = xfs_quiesce_data(mp); | 1128 | error = xfs_quiesce_data(mp); |
1138 | else | 1129 | else |
1139 | error = xfs_sync_fsdata(mp, 0); | 1130 | error = xfs_sync_fsdata(mp, 0); |
1140 | sb->s_dirt = 0; | ||
1141 | 1131 | ||
1142 | if (unlikely(laptop_mode)) { | 1132 | if (unlikely(laptop_mode)) { |
1143 | int prev_sync_seq = mp->m_sync_seq; | 1133 | int prev_sync_seq = mp->m_sync_seq; |
@@ -1443,7 +1433,6 @@ xfs_fs_fill_super( | |||
1443 | 1433 | ||
1444 | XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname); | 1434 | XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname); |
1445 | 1435 | ||
1446 | sb->s_dirt = 1; | ||
1447 | sb->s_magic = XFS_SB_MAGIC; | 1436 | sb->s_magic = XFS_SB_MAGIC; |
1448 | sb->s_blocksize = mp->m_sb.sb_blocksize; | 1437 | sb->s_blocksize = mp->m_sb.sb_blocksize; |
1449 | sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; | 1438 | sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; |
@@ -1533,7 +1522,6 @@ static struct super_operations xfs_super_operations = { | |||
1533 | .write_inode = xfs_fs_write_inode, | 1522 | .write_inode = xfs_fs_write_inode, |
1534 | .clear_inode = xfs_fs_clear_inode, | 1523 | .clear_inode = xfs_fs_clear_inode, |
1535 | .put_super = xfs_fs_put_super, | 1524 | .put_super = xfs_fs_put_super, |
1536 | .write_super = xfs_fs_write_super, | ||
1537 | .sync_fs = xfs_fs_sync_super, | 1525 | .sync_fs = xfs_fs_sync_super, |
1538 | .freeze_fs = xfs_fs_freeze, | 1526 | .freeze_fs = xfs_fs_freeze, |
1539 | .statfs = xfs_fs_statfs, | 1527 | .statfs = xfs_fs_statfs, |
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 8570b826fedd..bcc39d358ad3 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c | |||
@@ -628,8 +628,6 @@ xfs_trans_apply_sb_deltas( | |||
628 | xfs_trans_log_buf(tp, bp, offsetof(xfs_dsb_t, sb_icount), | 628 | xfs_trans_log_buf(tp, bp, offsetof(xfs_dsb_t, sb_icount), |
629 | offsetof(xfs_dsb_t, sb_frextents) + | 629 | offsetof(xfs_dsb_t, sb_frextents) + |
630 | sizeof(sbp->sb_frextents) - 1); | 630 | sizeof(sbp->sb_frextents) - 1); |
631 | |||
632 | tp->t_mountp->m_super->s_dirt = 1; | ||
633 | } | 631 | } |
634 | 632 | ||
635 | /* | 633 | /* |