311 files changed, 11515 insertions, 8745 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 9f7270f36b2a..525da2e8f73b 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -62,6 +62,16 @@ source "fs/autofs/Kconfig"
 source "fs/autofs4/Kconfig"
 source "fs/fuse/Kconfig"
+config CUSE
+        tristate "Character device in Userspace support"
+        depends on FUSE_FS
+        help
+          This FUSE extension allows character devices to be
+          implemented in userspace.
+          If you want to develop or use a userspace character device
+          based on CUSE, answer Y or M.
 config GENERIC_ACL
        bool
        select FS_POSIX_ACL
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index e0a85dbeeb88..a6665f37f456 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -53,6 +53,7 @@ struct adfs_dir_ops {
        int     (*update)(struct adfs_dir *dir, struct object_info *obj);
        int     (*create)(struct adfs_dir *dir, struct object_info *obj);
        int     (*remove)(struct adfs_dir *dir, struct object_info *obj);
+        int     (*sync)(struct adfs_dir *dir);
        void    (*free)(struct adfs_dir *dir);
 };
@@ -90,7 +91,8 @@ extern const struct dentry_operations adfs_dentry_operations;
 extern struct adfs_dir_ops adfs_f_dir_ops;
 extern struct adfs_dir_ops adfs_fplus_dir_ops;
-extern int adfs_dir_update(struct super_block *sb, struct object_info *obj);
+extern int adfs_dir_update(struct super_block *sb, struct object_info *obj,
+                           int wait);
 /* file.c */
 extern const struct inode_operations adfs_file_inode_operations;
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index e867ccf37246..4d4073447d1a 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -83,7 +83,7 @@ out:
 }
 int
-adfs_dir_update(struct super_block *sb, struct object_info *obj)
+adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait)
 {
        int ret = -EINVAL;
 #ifdef CONFIG_ADFS_FS_RW
@@ -106,6 +106,12 @@ adfs_dir_update(struct super_block *sb, struct object_info *obj)
        ret = ops->update(&dir, obj);
        write_unlock(&adfs_dir_lock);
+        if (wait) {
+                int err = ops->sync(&dir);
+                if (!ret)
+                        ret = err;
+        }
        ops->free(&dir);
 out:
 #endif
@@ -199,7 +205,7 @@ const struct file_operations adfs_dir_operations = {
        .read           = generic_read_dir,
        .llseek         = generic_file_llseek,
        .readdir        = adfs_readdir,
-        .fsync          = file_fsync,
+        .fsync          = simple_fsync,
 };
 static int
diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c
index ea7df2146921..31df6adf0de6 100644
--- a/fs/adfs/dir_f.c
+++ b/fs/adfs/dir_f.c
@@ -437,6 +437,22 @@ bad_dir:
 #endif
 }
+static int
+adfs_f_sync(struct adfs_dir *dir)      /* sync every buffer of @dir; returns 0 or -EIO */
+{
+        int err = 0;
+        int i;
+        for (i = dir->nr_buffers - 1; i >= 0; i--) {    /* last buffer first */
+                struct buffer_head *bh = dir->bh[i];
+                sync_dirty_buffer(bh);
+                if (buffer_req(bh) && !buffer_uptodate(bh))     /* I/O was requested but the buffer is not up to date */
+                        err = -EIO;     /* record the failure; the remaining buffers are still synced */
+        }
+        return err;
+}
 static void
 adfs_f_free(struct adfs_dir *dir)
 {
@@ -456,5 +472,6 @@ struct adfs_dir_ops adfs_f_dir_ops = {
        .setpos         = adfs_f_setpos,
        .getnext        = adfs_f_getnext,
        .update         = adfs_f_update,
+        .sync           = adfs_f_sync,
        .free           = adfs_f_free
 };
diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c
index 1ec644e32df9..139e0f345f18 100644
--- a/fs/adfs/dir_fplus.c
+++ b/fs/adfs/dir_fplus.c
@@ -161,6 +161,22 @@ out:
        return ret;
 }
+static int
+adfs_fplus_sync(struct adfs_dir *dir)  /* sync every buffer of @dir; returns 0 or -EIO */
+{
+        int err = 0;    /* NOTE(review): body is identical to adfs_f_sync() in dir_f.c */
+        int i;
+        for (i = dir->nr_buffers - 1; i >= 0; i--) {    /* last buffer first */
+                struct buffer_head *bh = dir->bh[i];
+                sync_dirty_buffer(bh);
+                if (buffer_req(bh) && !buffer_uptodate(bh))     /* I/O was requested but the buffer is not up to date */
+                        err = -EIO;     /* record the failure; the remaining buffers are still synced */
+        }
+        return err;
+}
 static void
 adfs_fplus_free(struct adfs_dir *dir)
 {
@@ -175,5 +191,6 @@ struct adfs_dir_ops adfs_fplus_dir_ops = {
        .read           = adfs_fplus_read,
        .setpos         = adfs_fplus_setpos,
        .getnext        = adfs_fplus_getnext,
+        .sync           = adfs_fplus_sync,
        .free           = adfs_fplus_free
 };
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index 36e381c6a99a..8224d54a2afb 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -30,7 +30,7 @@ const struct file_operations adfs_file_operations = {
        .read           = do_sync_read,
        .aio_read       = generic_file_aio_read,
        .mmap           = generic_file_mmap,
-        .fsync          = file_fsync,
+        .fsync          = simple_fsync,
        .write          = do_sync_write,
        .aio_write      = generic_file_aio_write,
        .splice_read    = generic_file_splice_read,
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index e647200262a2..05b3a677201d 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -376,7 +376,7 @@ out:
 * The adfs-specific inode data has already been updated by
 * adfs_notify_change()
 */
-int adfs_write_inode(struct inode *inode, int unused)
+int adfs_write_inode(struct inode *inode, int wait)
 {
        struct super_block *sb = inode->i_sb;
        struct object_info obj;
@@ -391,7 +391,7 @@ int adfs_write_inode(struct inode *inode, int unused)
        obj.attr        = ADFS_I(inode)->attr;
        obj.size        = inode->i_size;
-        ret = adfs_dir_update(sb, &obj);
+        ret = adfs_dir_update(sb, &obj, wait);
        unlock_kernel();
        return ret;
 }
diff --git a/fs/adfs/map.c b/fs/adfs/map.c
index 92ab4fbc2031..568081b93f73 100644
--- a/fs/adfs/map.c
+++ b/fs/adfs/map.c
@@ -62,7 +62,7 @@ static DEFINE_RWLOCK(adfs_map_lock);
 #define GET_FRAG_ID(_map,_start,_idmask)                                \
        ({                                                              \
                unsigned char *_m = _map + (_start >> 3);               \
-                u32 _frag = get_unaligned((u32 *)_m);                   \
+                u32 _frag = get_unaligned_le32(_m);                     \
                _frag >>= (_start & 7);                                 \
                _frag & _idmask;                                        \
        })
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index dd9becca4241..0ec5aaf47aa7 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -132,11 +132,15 @@ static void adfs_put_super(struct super_block *sb)
        int i;
        struct adfs_sb_info *asb = ADFS_SB(sb);
+        lock_kernel();
        for (i = 0; i < asb->s_map_size; i++)
                brelse(asb->s_map[i].dm_bh);
        kfree(asb->s_map);
        kfree(asb);
        sb->s_fs_info = NULL;
+        unlock_kernel();
 }
 static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 1a2d5e3c7f4e..e511dc621a2e 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -182,6 +182,7 @@ extern int			 affs_add_entry(struct inode *dir, struct inode *inode, struct dent
 void            affs_free_prealloc(struct inode *inode);
 extern void     affs_truncate(struct inode *);
+int             affs_file_fsync(struct file *, struct dentry *, int);
 /* dir.c */
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index 7b36904dbeac..8ca8f3a55599 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -21,7 +21,7 @@ const struct file_operations affs_dir_operations = {
        .read           = generic_read_dir,
        .llseek         = generic_file_llseek,
        .readdir        = affs_readdir,
-        .fsync          = file_fsync,
+        .fsync          = affs_file_fsync,
 };
 /*
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 9246cb4aa018..184e55c1c9ba 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -34,7 +34,7 @@ const struct file_operations affs_file_operations = {
        .mmap           = generic_file_mmap,
        .open           = affs_file_open,
        .release        = affs_file_release,
-        .fsync          = file_fsync,
+        .fsync          = affs_file_fsync,
        .splice_read    = generic_file_splice_read,
 };
@@ -915,3 +915,15 @@ affs_truncate(struct inode *inode)
        }
        affs_free_prealloc(inode);
 }
+int affs_file_fsync(struct file *filp, struct dentry *dentry, int datasync)
+{       /* .fsync for both affs_file_operations and affs_dir_operations; @filp and @datasync are not used */
+        struct inode * inode = dentry->d_inode;
+        int ret, err;
+        ret = write_inode_now(inode, 0);
+        err = sync_blockdev(inode->i_sb->s_bdev);       /* runs even when the inode write returned an error */
+        if (!ret)
+                ret = err;      /* the first error wins */
+        return ret;
+}
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 63f5183f263b..104fdcb3a7fc 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -16,6 +16,7 @@
 #include <linux/parser.h>
 #include <linux/magic.h>
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 #include "affs.h"
 extern struct timezone sys_tz;
@@ -24,49 +25,67 @@ static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int affs_remount (struct super_block *sb, int *flags, char *data);
 static void
+affs_commit_super(struct super_block *sb, int clean)   /* @clean: value stored in the root block's bm_flag */
+{
+        struct affs_sb_info *sbi = AFFS_SB(sb);
+        struct buffer_head *bh = sbi->s_root_bh;
+        struct affs_root_tail *tail = AFFS_ROOT_TAIL(sb, bh);
+        tail->bm_flag = cpu_to_be32(clean);
+        secs_to_datestamp(get_seconds(), &tail->disk_change);   /* stamp disk_change with the current time */
+        affs_fix_checksum(sb, bh);      /* presumably recomputes the block checksum after the edits above */
+        mark_buffer_dirty(bh);  /* only marks the buffer dirty; no write is started here */
+}
+static void
 affs_put_super(struct super_block *sb)
 {
         struct affs_sb_info *sbi = AFFS_SB(sb);
         pr_debug("AFFS: put_super()\n");
-        if (!(sb->s_flags & MS_RDONLY)) {
+        lock_kernel();  /* held until the unlock_kernel() at the end of the function */
-                AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag = cpu_to_be32(1);
-                secs_to_datestamp(get_seconds(),
+        if (!(sb->s_flags & MS_RDONLY))
-                                  &AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->disk_change);
+                affs_commit_super(sb, 1);       /* writable mount: commit the root block with bm_flag = 1 */
-                affs_fix_checksum(sb, sbi->s_root_bh);
-                mark_buffer_dirty(sbi->s_root_bh);
-        }
         kfree(sbi->s_prefix);
         affs_free_bitmap(sb);
         affs_brelse(sbi->s_root_bh);
         kfree(sbi);
         sb->s_fs_info = NULL;
-        return;
+        unlock_kernel();
 }
 static void
 affs_write_super(struct super_block *sb)
 {
         int clean = 2;
-        struct affs_sb_info *sbi = AFFS_SB(sb);
+        lock_super(sb); /* released by the unlock_super() just before the debug print */
         if (!(sb->s_flags & MS_RDONLY)) {
                 //      if (sbi->s_bitmap[i].bm_bh) {
                 //              if (buffer_dirty(sbi->s_bitmap[i].bm_bh)) {
                 //                      clean = 0;
-                AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag = cpu_to_be32(clean);
+                affs_commit_super(sb, clean);   /* clean is always 2 here: the code that would zero it is commented out */
-                secs_to_datestamp(get_seconds(),
-                                  &AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->disk_change);
-                affs_fix_checksum(sb, sbi->s_root_bh);
-                mark_buffer_dirty(sbi->s_root_bh);
                 sb->s_dirt = !clean;    /* redo until bitmap synced */
         } else
                 sb->s_dirt = 0;
+        unlock_super(sb);
         pr_debug("AFFS: write_super() at %lu, clean=%d\n", get_seconds(), clean);
 }
+static int
+affs_sync_fs(struct super_block *sb, int wait) /* installed as .sync_fs; @wait is not used; always returns 0 */
+{       /* NOTE(review): unlike affs_write_super(), MS_RDONLY is not checked here - confirm this is intended */
+        lock_super(sb);
+        affs_commit_super(sb, 2);       /* same bm_flag value that affs_write_super() commits */
+        sb->s_dirt = 0;
+        unlock_super(sb);
+        return 0;
+}
 static struct kmem_cache * affs_inode_cachep;
 static struct inode *affs_alloc_inode(struct super_block *sb)
@@ -124,6 +143,7 @@ static const struct super_operations affs_sops = {
        .clear_inode    = affs_clear_inode,
        .put_super      = affs_put_super,
        .write_super    = affs_write_super,
+        .sync_fs        = affs_sync_fs,
        .statfs         = affs_statfs,
        .remount_fs     = affs_remount,
        .show_options   = generic_show_options,
@@ -507,6 +527,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
                kfree(new_opts);
                return -EINVAL;
        }
+        lock_kernel();
        replace_mount_options(sb, new_opts);
        sbi->s_flags = mount_flags;
@@ -514,8 +535,10 @@ affs_remount(struct super_block *sb, int *flags, char *data)
        sbi->s_uid   = uid;
        sbi->s_gid   = gid;
-        if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+        if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
+                unlock_kernel();
                return 0;
+        }
        if (*flags & MS_RDONLY) {
                sb->s_dirt = 1;
                while (sb->s_dirt)
@@ -524,6 +547,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
        } else
                res = affs_init_bitmap(sb, flags);
+        unlock_kernel();
        return res;
 }
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 2b9e2d03a390..c52be53f6946 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -244,7 +244,7 @@ static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd)
        case -EBUSY:
                /* someone else made a mount here whilst we were busy */
                while (d_mountpoint(nd->path.dentry) &&
-                       follow_down(&nd->path.mnt, &nd->path.dentry))
+                       follow_down(&nd->path))
                        ;
                err = 0;
        default:
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 76828e5f8a39..ad0514d0115f 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -440,8 +440,12 @@ static void afs_put_super(struct super_block *sb)
        _enter("");
+        lock_kernel();
        afs_put_volume(as->volume);
+        unlock_kernel();
        _leave("");
 }
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
index 4eb4d8dfb2f1..2316e944a109 100644
--- a/fs/autofs/dirhash.c
+++ b/fs/autofs/dirhash.c
@@ -85,13 +85,12 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
                }
                path.mnt = mnt;
                path_get(&path);
-                if (!follow_down(&path.mnt, &path.dentry)) {
+                if (!follow_down(&path)) {
                        path_put(&path);
                        DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
                        continue;
                }
-                while (d_mountpoint(path.dentry) &&
+                while (d_mountpoint(path.dentry) && follow_down(&path)) /* loop body is the lone ';' on the next line */
-                       follow_down(&path.mnt, &path.dentry))
                        ;
                umount_ok = may_umount(path.mnt);
                path_put(&path);
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index b7ff33c63101..8f7cdde41733 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -223,12 +223,12 @@ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify);
 int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
 void autofs4_catatonic_mode(struct autofs_sb_info *);
-static inline int autofs4_follow_mount(struct vfsmount **mnt, struct dentry **dentry)
+static inline int autofs4_follow_mount(struct path *path)
 {
        int res = 0;
-        while (d_mountpoint(*dentry)) {
+        while (d_mountpoint(path->dentry)) {
-                int followed = follow_down(mnt, dentry);
+                int followed = follow_down(path);
                if (!followed)
                        break;
                res = 1;
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 84168c0dcc2d..f3da2eb51f56 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -192,77 +192,42 @@ static int autofs_dev_ioctl_protosubver(struct file *fp,
        return 0;
 }
-/*
+static int find_autofs_mount(const char *pathname,     /* returns 0 with *res set, -ENOENT if nothing matched, or the kern_path() error */
- * Walk down the mount stack looking for an autofs mount that
+                             struct path *res,  /* out: last matching path found, holding its own reference */
- * has the requested device number (aka. new_encode_dev(sb->s_dev).
+                             int test(struct path *path, void *data),   /* predicate: nonzero means @path matches */
- */
+                             void *data)        /* passed through to test() unchanged */
-static int autofs_dev_ioctl_find_super(struct nameidata *nd, dev_t devno)
 {
-        struct dentry *dentry;
+        struct path path;
-        struct inode *inode;
+        int err = kern_path(pathname, 0, &path);       /* lookup flags are 0 */
-        struct super_block *sb;
+        if (err)
-        dev_t s_dev;
+                return err;     /* lookup failed: *res is left untouched */
-        unsigned int err;
         err = -ENOENT;
+        while (path.dentry == path.mnt->mnt_root) {    /* loop only while sitting on the root dentry of a mount */
-        /* Lookup the dentry name at the base of our mount point */
+                if (path.mnt->mnt_sb->s_magic == AUTOFS_SUPER_MAGIC) {
-        dentry = d_lookup(nd->path.dentry, &nd->last);
+                        if (test(&path, data)) {
-        if (!dentry)
+                                path_get(&path);        /* reference that will belong to *res */
-                goto out;
+                                if (!err) /* already found some */
+                                        path_put(res);
-        dput(nd->path.dentry);
+                                *res = path;    /* a later match replaces an earlier one */
-        nd->path.dentry = dentry;
-        /* And follow the mount stack looking for our autofs mount */
-        while (follow_down(&nd->path.mnt, &nd->path.dentry)) {
-                inode = nd->path.dentry->d_inode;
-                if (!inode)
-                        break;
-                sb = inode->i_sb;
-                s_dev = new_encode_dev(sb->s_dev);
-                if (devno == s_dev) {
-                        if (sb->s_magic == AUTOFS_SUPER_MAGIC) {
                                err = 0;
-                                break;
                        }
                }
+                if (!follow_up(&path)) /* move on via follow_up(); stop when it returns 0 */
+                        break;
        }
-out:
+        path_put(&path);       /* drop the reference the walk itself was holding */
        return err;
 }
-/*
+static int test_by_dev(struct path *path, void *p)     /* @p points to a dev_t */
- * Walk down the mount stack looking for an autofs mount that
- * has the requested mount type (ie. indirect, direct or offset).
- */
-static int autofs_dev_ioctl_find_sbi_type(struct nameidata *nd, unsigned int type)
 {
-        struct dentry *dentry;
+        return path->mnt->mnt_sb->s_dev == *(dev_t *)p;        /* nonzero when the mount's superblock has that s_dev */
-        struct autofs_info *ino;
+}
-        unsigned int err;
-        err = -ENOENT;
-        /* Lookup the dentry name at the base of our mount point */
-        dentry = d_lookup(nd->path.dentry, &nd->last);
-        if (!dentry)
-                goto out;
-        dput(nd->path.dentry);
-        nd->path.dentry = dentry;
-        /* And follow the mount stack looking for our autofs mount */
+static int test_by_type(struct path *path, void *p)    /* @p points to an unsigned type mask */
-        while (follow_down(&nd->path.mnt, &nd->path.dentry)) {
+{
-                ino = autofs4_dentry_ino(nd->path.dentry);
+        struct autofs_info *ino = autofs4_dentry_ino(path->dentry);
-                if (ino && ino->sbi->type & type) {
+        return ino && ino->sbi->type & *(unsigned *)p; /* nonzero when ino is set and its sbi type shares a bit with the mask */
-                        err = 0;
-                        break;
-                }
-        }
-out:
-        return err;
 }
 static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file)
@@ -283,31 +248,25 @@ static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file)
 * Open a file descriptor on the autofs mount point corresponding
 * to the given path and device number (aka. new_encode_dev(sb->s_dev)).
 */
-static int autofs_dev_ioctl_open_mountpoint(const char *path, dev_t devid)
+static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid)
 {
-        struct file *filp;
-        struct nameidata nd;
        int err, fd;
        fd = get_unused_fd();
        if (likely(fd >= 0)) {
-                /* Get nameidata of the parent directory */
+                struct file *filp;
-                err = path_lookup(path, LOOKUP_PARENT, &nd);
+                struct path path;
+                err = find_autofs_mount(name, &path, test_by_dev, &devid);
                if (err)
                        goto out;
                /*
-                 * Search down, within the parent, looking for an
+                 * Find autofs super block that has the device number
-                 * autofs super block that has the device number
                 * corresponding to the autofs fs we want to open.
                 */
-                err = autofs_dev_ioctl_find_super(&nd, devid);
-                if (err) {
-                        path_put(&nd.path);
-                        goto out;
-                }
-                filp = dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY,
+                filp = dentry_open(path.dentry, path.mnt, O_RDONLY,
                                   current_cred());
                if (IS_ERR(filp)) {
                        err = PTR_ERR(filp);
@@ -340,7 +299,7 @@ static int autofs_dev_ioctl_openmount(struct file *fp,
        param->ioctlfd = -1;
        path = param->path;
-        devid = param->openmount.devid;
+        devid = new_decode_dev(param->openmount.devid);
        err = 0;
        fd = autofs_dev_ioctl_open_mountpoint(path, devid);
@@ -475,8 +434,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
                                      struct autofs_dev_ioctl *param)
 {
        struct autofs_info *ino;
-        struct nameidata nd;
+        struct path path;
-        const char *path;
        dev_t devid;
        int err = -ENOENT;
@@ -485,32 +443,24 @@ static int autofs_dev_ioctl_requester(struct file *fp,
                goto out;
        }
-        path = param->path;
+        devid = sbi->sb->s_dev;
-        devid = new_encode_dev(sbi->sb->s_dev);
        param->requester.uid = param->requester.gid = -1;
-        /* Get nameidata of the parent directory */
+        err = find_autofs_mount(param->path, &path, test_by_dev, &devid);
-        err = path_lookup(path, LOOKUP_PARENT, &nd);
        if (err)
                goto out;
-        err = autofs_dev_ioctl_find_super(&nd, devid);
+        ino = autofs4_dentry_ino(path.dentry);
-        if (err)
-                goto out_release;
-        ino = autofs4_dentry_ino(nd.path.dentry);
        if (ino) {
                err = 0;
-                autofs4_expire_wait(nd.path.dentry);
+                autofs4_expire_wait(path.dentry);
                spin_lock(&sbi->fs_lock);
                param->requester.uid = ino->uid;
                param->requester.gid = ino->gid;
                spin_unlock(&sbi->fs_lock);
        }
+        path_put(&path);
-out_release:
-        path_put(&nd.path);
 out:
        return err;
 }
@@ -569,8 +519,8 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
                                         struct autofs_sb_info *sbi,
                                         struct autofs_dev_ioctl *param)
 {
-        struct nameidata nd;
+        struct path path;
-        const char *path;
+        const char *name;
        unsigned int type;
        unsigned int devid, magic;
        int err = -ENOENT;
@@ -580,71 +530,46 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
                goto out;
        }
-        path = param->path;
+        name = param->path;
        type = param->ismountpoint.in.type;
        param->ismountpoint.out.devid = devid = 0;
        param->ismountpoint.out.magic = magic = 0;
        if (!fp || param->ioctlfd == -1) {
-                if (autofs_type_any(type)) {
+                if (autofs_type_any(type))
-                        struct super_block *sb;
+                        err = kern_path(name, LOOKUP_FOLLOW, &path);
+                else
-                        err = path_lookup(path, LOOKUP_FOLLOW, &nd);
+                        err = find_autofs_mount(name, &path, test_by_type, &type);
-                        if (err)
+                if (err)
-                                goto out;
+                        goto out;
+                devid = new_encode_dev(path.mnt->mnt_sb->s_dev);
-                        sb = nd.path.dentry->d_sb;
-                        devid = new_encode_dev(sb->s_dev);
-                } else {
-                        struct autofs_info *ino;
-                        err = path_lookup(path, LOOKUP_PARENT, &nd);
-                        if (err)
-                                goto out;
-                        err = autofs_dev_ioctl_find_sbi_type(&nd, type);
-                        if (err)
-                                goto out_release;
-                        ino = autofs4_dentry_ino(nd.path.dentry);
-                        devid = autofs4_get_dev(ino->sbi);
-                }
                err = 0;
-                if (nd.path.dentry->d_inode &&
+                if (path.dentry->d_inode &&
-                    nd.path.mnt->mnt_root == nd.path.dentry) {
+                    path.mnt->mnt_root == path.dentry) {
                        err = 1;
-                        magic = nd.path.dentry->d_inode->i_sb->s_magic;
+                        magic = path.dentry->d_inode->i_sb->s_magic;
                }
        } else {
-                dev_t dev = autofs4_get_dev(sbi);
+                dev_t dev = sbi->sb->s_dev;
-                err = path_lookup(path, LOOKUP_PARENT, &nd);
+                err = find_autofs_mount(name, &path, test_by_dev, &dev);
                if (err)
                        goto out;
-                err = autofs_dev_ioctl_find_super(&nd, dev);
+                devid = new_encode_dev(dev);
-                if (err)
-                        goto out_release;
-                devid = dev;
-                err = have_submounts(nd.path.dentry);
+                err = have_submounts(path.dentry);
-                if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) {
+                if (path.mnt->mnt_mountpoint != path.mnt->mnt_root) {
-                        if (follow_down(&nd.path.mnt, &nd.path.dentry)) {
+                        if (follow_down(&path))
-                                struct inode *inode = nd.path.dentry->d_inode;
+                                magic = path.mnt->mnt_sb->s_magic;
-                                magic = inode->i_sb->s_magic;
-                        }
                }
        }
        param->ismountpoint.out.devid = devid;
        param->ismountpoint.out.magic = magic;
+        path_put(&path);
-out_release:
-        path_put(&nd.path);
 out:
        return err;
 }
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 3077d8f16523..aa39ae83f019 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -48,19 +48,19 @@ static inline int autofs4_can_expire(struct dentry *dentry,
 static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 {
        struct dentry *top = dentry;
+        struct path path = {.mnt = mnt, .dentry = dentry};
        int status = 1;
        DPRINTK("dentry %p %.*s",
                dentry, (int)dentry->d_name.len, dentry->d_name.name);
-        mntget(mnt);
+        path_get(&path);
-        dget(dentry);
-        if (!follow_down(&mnt, &dentry))
+        if (!follow_down(&path))
                goto done;
-        if (is_autofs4_dentry(dentry)) {
+        if (is_autofs4_dentry(path.dentry)) {
-                struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+                struct autofs_sb_info *sbi = autofs4_sbi(path.dentry->d_sb);
                /* This is an autofs submount, we can't expire it */
                if (autofs_type_indirect(sbi->type))
@@ -70,7 +70,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
                 * Otherwise it's an offset mount and we need to check
                 * if we can umount its mount, if there is one.
                 */
-                if (!d_mountpoint(dentry)) {
+                if (!d_mountpoint(path.dentry)) {
                        status = 0;
                        goto done;
                }
@@ -86,8 +86,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
        status = 0;
 done:
        DPRINTK("returning = %d", status);
-        dput(dentry);
+        path_put(&path);
-        mntput(mnt);
        return status;
 }
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index e383bf0334f1..b96a3c57359d 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -181,7 +181,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
                nd->flags);
        /*
         * For an expire of a covered direct or offset mount we need
-         * to beeak out of follow_down() at the autofs mount trigger
+         * to break out of follow_down() at the autofs mount trigger
         * (d_mounted--), so we can see the expiring flag, and manage
         * the blocking and following here until the expire is completed.
         */
@@ -190,7 +190,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
                if (ino->flags & AUTOFS_INF_EXPIRING) {
                        spin_unlock(&sbi->fs_lock);
                        /* Follow down to our covering mount. */
-                        if (!follow_down(&nd->path.mnt, &nd->path.dentry))
+                        if (!follow_down(&nd->path))
                                goto done;
                        goto follow;
                }
@@ -230,8 +230,7 @@ follow:
         * to follow it.
         */
        if (d_mountpoint(dentry)) {
-                if (!autofs4_follow_mount(&nd->path.mnt,
+                if (!autofs4_follow_mount(&nd->path)) {
-                                          &nd->path.dentry)) {
                        status = -ENOENT;
                        goto out_error;
                }
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 76afd0d6b86c..9367b6297d84 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -737,6 +737,8 @@ parse_options(char *options, befs_mount_options * opts)
 static void
 befs_put_super(struct super_block *sb)
 {
+        lock_kernel();
        kfree(BEFS_SB(sb)->mount_opts.iocharset);
        BEFS_SB(sb)->mount_opts.iocharset = NULL;
@@ -747,7 +749,8 @@ befs_put_super(struct super_block *sb)
        kfree(sb->s_fs_info);
        sb->s_fs_info = NULL;
-        return;
+        unlock_kernel();
 }
 /* Allocate private field of the superblock, fill it.
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 4dd1b623f937..54bd07d44e68 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -79,7 +79,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
 const struct file_operations bfs_dir_operations = {
        .read           = generic_read_dir,
        .readdir        = bfs_readdir,
-        .fsync          = file_fsync,
+        .fsync          = simple_fsync,
        .llseek         = generic_file_llseek,
 };
@@ -205,7 +205,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry)
                inode->i_nlink = 1;
        }
        de->ino = 0;
-        mark_buffer_dirty(bh);
+        mark_buffer_dirty_inode(bh, dir);
        dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
        mark_inode_dirty(dir);
        inode->i_ctime = dir->i_ctime;
@@ -267,7 +267,7 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                new_inode->i_ctime = CURRENT_TIME_SEC;
                inode_dec_link_count(new_inode);
        }
-        mark_buffer_dirty(old_bh);
+        mark_buffer_dirty_inode(old_bh, old_dir);
        error = 0;
 end_rename:
@@ -320,7 +320,7 @@ static int bfs_add_entry(struct inode *dir, const unsigned char *name,
                                for (i = 0; i < BFS_NAMELEN; i++)
                                        de->name[i] =
                                                (i < namelen) ? name[i] : 0;
-                                mark_buffer_dirty(bh);
+                                mark_buffer_dirty_inode(bh, dir);
                                brelse(bh);
                                return 0;
                        }
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index cc4062d12ca2..6f60336c6628 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -30,6 +30,7 @@ MODULE_LICENSE("GPL");
 #define dprintf(x...)
 #endif
+static void bfs_write_super(struct super_block *s);
 void dump_imap(const char *prefix, struct super_block *s);
 struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
@@ -97,14 +98,15 @@ error:
        return ERR_PTR(-EIO);
 }
-static int bfs_write_inode(struct inode *inode, int unused)
+static int bfs_write_inode(struct inode *inode, int wait)
 {
+        struct bfs_sb_info *info = BFS_SB(inode->i_sb);
        unsigned int ino = (u16)inode->i_ino;
        unsigned long i_sblock;
        struct bfs_inode *di;
        struct buffer_head *bh;
        int block, off;
-        struct bfs_sb_info *info = BFS_SB(inode->i_sb);
+        int err = 0;
        dprintf("ino=%08x\n", ino);
@@ -145,9 +147,14 @@ static int bfs_write_inode(struct inode *inode, int unused)
        di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1);
        mark_buffer_dirty(bh);
+        if (wait) {
+                sync_dirty_buffer(bh);
+                if (buffer_req(bh) && !buffer_uptodate(bh))
+                        err = -EIO;
+        }
        brelse(bh);
        mutex_unlock(&info->bfs_lock);
-        return 0;
+        return err;
 }
 static void bfs_delete_inode(struct inode *inode)
@@ -209,6 +216,26 @@ static void bfs_delete_inode(struct inode *inode)
        clear_inode(inode);
 }
+static int bfs_sync_fs(struct super_block *sb, int wait)
+{
+        struct bfs_sb_info *info = BFS_SB(sb);
+        mutex_lock(&info->bfs_lock);
+        mark_buffer_dirty(info->si_sbh);
+        sb->s_dirt = 0;
+        mutex_unlock(&info->bfs_lock);
+        return 0;
+}
+static void bfs_write_super(struct super_block *sb)
+{
+        if (!(sb->s_flags & MS_RDONLY))
+                bfs_sync_fs(sb, 1);
+        else
+                sb->s_dirt = 0;
+}
 static void bfs_put_super(struct super_block *s)
 {
        struct bfs_sb_info *info = BFS_SB(s);
@@ -216,11 +243,18 @@ static void bfs_put_super(struct super_block *s)
        if (!info)
                return;
+        lock_kernel();
+        if (s->s_dirt)
+                bfs_write_super(s);
        brelse(info->si_sbh);
        mutex_destroy(&info->bfs_lock);
        kfree(info->si_imap);
        kfree(info);
        s->s_fs_info = NULL;
+        unlock_kernel();
 }
 static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -240,17 +274,6 @@ static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
        return 0;
 }
-static void bfs_write_super(struct super_block *s)
-{
-        struct bfs_sb_info *info = BFS_SB(s);
-        mutex_lock(&info->bfs_lock);
-        if (!(s->s_flags & MS_RDONLY))
-                mark_buffer_dirty(info->si_sbh);
-        s->s_dirt = 0;
-        mutex_unlock(&info->bfs_lock);
-}
 static struct kmem_cache *bfs_inode_cachep;
 static struct inode *bfs_alloc_inode(struct super_block *sb)
@@ -298,6 +321,7 @@ static const struct super_operations bfs_sops = {
        .delete_inode   = bfs_delete_inode,
        .put_super      = bfs_put_super,
        .write_super    = bfs_write_super,
+        .sync_fs        = bfs_sync_fs,
        .statfs         = bfs_statfs,
 };
diff --git a/fs/bio.c b/fs/bio.c
index 98711647ece4..5f80848c320c 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -26,10 +26,9 @@
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
 #include <linux/blktrace_api.h>
-#include <trace/block.h>
 #include <scsi/sg.h>            /* for struct sg_iovec */
-DEFINE_TRACE(block_split);
+#include <trace/events/block.h>
 /*
 * Test patch to inline a certain number of bi_io_vec's inside the bio
@@ -359,9 +358,9 @@ static void bio_kmalloc_destructor(struct bio *bio)
 *
 *   If %__GFP_WAIT is set, then bio_alloc will always be able to allocate
 *   a bio. This is due to the mempool guarantees. To make this work, callers
- *   must never allocate more than 1 bio at the time from this pool. Callers
+ *   must never allocate more than 1 bio at a time from this pool. Callers
 *   that need to allocate more than 1 bio must always submit the previously
- *   allocate bio for IO before attempting to allocate a new one. Failure to
+ *   allocated bio for IO before attempting to allocate a new one. Failure to
 *   do so can cause livelocks under memory pressure.
 *
 **/
@@ -499,11 +498,11 @@ int bio_get_nr_vecs(struct block_device *bdev)
        struct request_queue *q = bdev_get_queue(bdev);
        int nr_pages;
-        nr_pages = ((q->max_sectors << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+        nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-        if (nr_pages > q->max_phys_segments)
+        if (nr_pages > queue_max_phys_segments(q))
-                nr_pages = q->max_phys_segments;
+                nr_pages = queue_max_phys_segments(q);
-        if (nr_pages > q->max_hw_segments)
+        if (nr_pages > queue_max_hw_segments(q))
-                nr_pages = q->max_hw_segments;
+                nr_pages = queue_max_hw_segments(q);
        return nr_pages;
 }
@@ -562,8 +561,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
         * make this too complex.
         */
-        while (bio->bi_phys_segments >= q->max_phys_segments
+        while (bio->bi_phys_segments >= queue_max_phys_segments(q)
-               || bio->bi_phys_segments >= q->max_hw_segments) {
+               || bio->bi_phys_segments >= queue_max_hw_segments(q)) {
                if (retried_segments)
                        return 0;
@@ -634,7 +633,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
                    unsigned int len, unsigned int offset)
 {
-        return __bio_add_page(q, bio, page, len, offset, q->max_hw_sectors);
+        return __bio_add_page(q, bio, page, len, offset,
+                              queue_max_hw_sectors(q));
 }
 /**
@@ -654,7 +654,7 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
                 unsigned int offset)
 {
        struct request_queue *q = bdev_get_queue(bio->bi_bdev);
-        return __bio_add_page(q, bio, page, len, offset, q->max_sectors);
+        return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q));
 }
 struct bio_map_data {
@@ -721,7 +721,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
                while (bv_len && iov_idx < iov_count) {
                        unsigned int bytes;
-                        char *iov_addr;
+                        char __user *iov_addr;
                        bytes = min_t(unsigned int,
                                      iov[iov_idx].iov_len - iov_off, bv_len);
@@ -1201,7 +1201,7 @@ static void bio_copy_kern_endio(struct bio *bio, int err)
                char *addr = page_address(bvec->bv_page);
                int len = bmd->iovecs[i].bv_len;
-                if (read && !err)
+                if (read)
                        memcpy(p, addr, len);
                __free_page(bvec->bv_page);
@@ -1490,11 +1490,12 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
 sector_t bio_sector_offset(struct bio *bio, unsigned short index,
                           unsigned int offset)
 {
-        unsigned int sector_sz = queue_hardsect_size(bio->bi_bdev->bd_disk->queue);
+        unsigned int sector_sz;
        struct bio_vec *bv;
        sector_t sectors;
        int i;
+        sector_sz = queue_logical_block_size(bio->bi_bdev->bd_disk->queue);
        sectors = 0;
        if (index >= bio->bi_idx)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index f45dbc18dd17..3a6d4fb2a329 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -25,6 +25,7 @@
 #include <linux/uio.h>
 #include <linux/namei.h>
 #include <linux/log2.h>
+#include <linux/kmemleak.h>
 #include <asm/uaccess.h>
 #include "internal.h"
@@ -76,7 +77,7 @@ int set_blocksize(struct block_device *bdev, int size)
                return -EINVAL;
        /* Size cannot be smaller than the size supported by the device */
-        if (size < bdev_hardsect_size(bdev))
+        if (size < bdev_logical_block_size(bdev))
                return -EINVAL;
        /* Don't change the size if it is same as current */
@@ -106,7 +107,7 @@ EXPORT_SYMBOL(sb_set_blocksize);
 int sb_min_blocksize(struct super_block *sb, int size)
 {
-        int minsize = bdev_hardsect_size(sb->s_bdev);
+        int minsize = bdev_logical_block_size(sb->s_bdev);
        if (size < minsize)
                size = minsize;
        return sb_set_blocksize(sb, size);
@@ -175,17 +176,22 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
                                iov, offset, nr_segs, blkdev_get_blocks, NULL);
 }
+int __sync_blockdev(struct block_device *bdev, int wait)
+{
+        if (!bdev)
+                return 0;
+        if (!wait)
+                return filemap_flush(bdev->bd_inode->i_mapping);
+        return filemap_write_and_wait(bdev->bd_inode->i_mapping);
+}
 /*
 * Write out and wait upon all the dirty data associated with a block
 * device via its mapping.  Does not take the superblock lock.
 */
 int sync_blockdev(struct block_device *bdev)
 {
-        int ret = 0;
+        return __sync_blockdev(bdev, 1);
-        if (bdev)
-                ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
-        return ret;
 }
 EXPORT_SYMBOL(sync_blockdev);
@@ -198,7 +204,7 @@ int fsync_bdev(struct block_device *bdev)
 {
        struct super_block *sb = get_super(bdev);
        if (sb) {
-                int res = fsync_super(sb);
+                int res = sync_filesystem(sb);
                drop_super(sb);
                return res;
        }
@@ -240,7 +246,7 @@ struct super_block *freeze_bdev(struct block_device *bdev)
                sb->s_frozen = SB_FREEZE_WRITE;
                smp_wmb();
-                __fsync_super(sb);
+                sync_filesystem(sb);
                sb->s_frozen = SB_FREEZE_TRANS;
                smp_wmb();
@@ -492,6 +498,11 @@ void __init bdev_cache_init(void)
        bd_mnt = kern_mount(&bd_type);
        if (IS_ERR(bd_mnt))
                panic("Cannot create bdev pseudo-fs");
+        /*
+         * This vfsmount structure is only used to obtain the
+         * blockdev_superblock, so tell kmemleak not to report it.
+         */
+        kmemleak_not_leak(bd_mnt);
        blockdev_superblock = bd_mnt->mnt_sb;   /* For writeback */
 }
@@ -1111,7 +1122,7 @@ EXPORT_SYMBOL(check_disk_change);
 void bd_set_size(struct block_device *bdev, loff_t size)
 {
-        unsigned bsize = bdev_hardsect_size(bdev);
+        unsigned bsize = bdev_logical_block_size(bdev);
        bdev->bd_inode->i_size = size;
        while (bsize < PAGE_CACHE_SIZE) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5b68330f8585..8612b3a09811 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2322,7 +2322,6 @@ err:
        btrfs_update_inode(trans, root, dir);
        btrfs_drop_nlink(inode);
        ret = btrfs_update_inode(trans, root, inode);
-        dir->i_sb->s_dirt = 1;
 out:
        return ret;
 }
@@ -2806,7 +2805,6 @@ error:
                                      pending_del_nr);
        }
        btrfs_free_path(path);
-        inode->i_sb->s_dirt = 1;
        return ret;
 }
@@ -3768,7 +3766,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
                init_special_inode(inode, inode->i_mode, rdev);
                btrfs_update_inode(trans, root, inode);
        }
-        dir->i_sb->s_dirt = 1;
        btrfs_update_inode_block_group(trans, inode);
        btrfs_update_inode_block_group(trans, dir);
 out_unlock:
@@ -3833,7 +3830,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
                inode->i_op = &btrfs_file_inode_operations;
                BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
        }
-        dir->i_sb->s_dirt = 1;
        btrfs_update_inode_block_group(trans, inode);
        btrfs_update_inode_block_group(trans, dir);
 out_unlock:
@@ -3880,7 +3876,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
        if (err)
                drop_inode = 1;
-        dir->i_sb->s_dirt = 1;
        btrfs_update_inode_block_group(trans, dir);
        err = btrfs_update_inode(trans, root, inode);
@@ -3962,7 +3957,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        d_instantiate(dentry, inode);
        drop_on_err = 0;
-        dir->i_sb->s_dirt = 1;
        btrfs_update_inode_block_group(trans, inode);
        btrfs_update_inode_block_group(trans, dir);
@@ -4991,7 +4985,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
                inode->i_op = &btrfs_file_inode_operations;
                BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
        }
-        dir->i_sb->s_dirt = 1;
        btrfs_update_inode_block_group(trans, inode);
        btrfs_update_inode_block_group(trans, dir);
        if (drop_inode)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 708ac06b953b..9f179d4832d5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -394,10 +394,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
        struct btrfs_root *root = btrfs_sb(sb);
        int ret;
-        if (sb->s_flags & MS_RDONLY)
-                return 0;
-        sb->s_dirt = 0;
        if (!wait) {
                filemap_flush(root->fs_info->btree_inode->i_mapping);
                return 0;
@@ -408,7 +404,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
        trans = btrfs_start_transaction(root, 1);
        ret = btrfs_commit_transaction(trans, root);
-        sb->s_dirt = 0;
        return ret;
 }
@@ -454,11 +449,6 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
        return 0;
 }
-static void btrfs_write_super(struct super_block *sb)
-{
-        sb->s_dirt = 0;
-}
 static int btrfs_test_super(struct super_block *s, void *data)
 {
        struct btrfs_fs_devices *test_fs_devices = data;
@@ -689,7 +679,6 @@ static int btrfs_unfreeze(struct super_block *sb)
 static struct super_operations btrfs_super_ops = {
        .delete_inode   = btrfs_delete_inode,
        .put_super      = btrfs_put_super,
-        .write_super    = btrfs_write_super,
        .sync_fs        = btrfs_sync_fs,
        .show_options   = btrfs_show_options,
        .write_inode    = btrfs_write_inode,
diff --git a/fs/buffer.c b/fs/buffer.c
index 49106127a4aa..a3ef091a45bd 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1085,12 +1085,12 @@ static struct buffer_head *
 __getblk_slow(struct block_device *bdev, sector_t block, int size)
 {
        /* Size must be multiple of hard sectorsize */
-        if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
+        if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
                        (size < 512 || size > PAGE_SIZE))) {
                printk(KERN_ERR "getblk(): invalid block size %d requested\n",
                                        size);
-                printk(KERN_ERR "hardsect size: %d\n",
+                printk(KERN_ERR "logical block size: %d\n",
-                                        bdev_hardsect_size(bdev));
+                                        bdev_logical_block_size(bdev));
                dump_stack();
                return NULL;
@@ -2935,6 +2935,8 @@ int submit_bh(int rw, struct buffer_head * bh)
        BUG_ON(!buffer_locked(bh));
        BUG_ON(!buffer_mapped(bh));
        BUG_ON(!bh->b_end_io);
+        BUG_ON(buffer_delay(bh));
+        BUG_ON(buffer_unwritten(bh));
        /*
         * Mask in barrier bit for a write (could be either a WRITE or a
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 1e962348d111..431accd475a7 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -354,7 +354,9 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache)
        /* make sure all pages pinned by operations on behalf of the netfs are
         * written to disc */
        cachefiles_begin_secure(cache, &saved_cred);
-        ret = fsync_super(cache->mnt->mnt_sb);
+        down_read(&cache->mnt->mnt_sb->s_umount);
+        ret = sync_filesystem(cache->mnt->mnt_sb);
+        up_read(&cache->mnt->mnt_sb->s_umount);
        cachefiles_end_secure(cache, saved_cred);
        if (ret == -EIO)
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 38f71222a552..b7c9d5187a75 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -375,7 +375,6 @@ static int chrdev_open(struct inode *inode, struct file *filp)
                p = inode->i_cdev;
                if (!p) {
                        inode->i_cdev = p = new;
-                        inode->i_cindex = idx;
                        list_add(&inode->i_devices, &p->list);
                        new = NULL;
                } else if (!cdev_get(p))
@@ -405,6 +404,18 @@ static int chrdev_open(struct inode *inode, struct file *filp)
        return ret;
 }
+int cdev_index(struct inode *inode)
+{
+        int idx;
+        struct kobject *kobj;
+        kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx);
+        if (!kobj)
+                return -1;
+        kobject_put(kobj);
+        return idx;
+}
 void cd_forget(struct inode *inode)
 {
        spin_lock(&cdev_lock);
@@ -557,6 +568,7 @@ EXPORT_SYMBOL(cdev_init);
 EXPORT_SYMBOL(cdev_alloc);
 EXPORT_SYMBOL(cdev_del);
 EXPORT_SYMBOL(cdev_add);
+EXPORT_SYMBOL(cdev_index);
 EXPORT_SYMBOL(register_chrdev);
 EXPORT_SYMBOL(unregister_chrdev);
 EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index f20c4069c220..b48689839428 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,12 @@
+Version 1.59
+------------
+Client uses server inode numbers (which are persistent) rather than
+client generated ones by default (mount option "serverino" turned
+on by default if server supports it).  Add forceuid and forcegid
+mount options (so that, when unix extensions are negotiated,
+specifying which uid mounted does not immediately force the
+server's reported uids to be overridden).
 Version 1.58
 ------------
 Guard against buffer overruns in various UCS-2 to UTF-8 string conversions
@@ -10,6 +19,8 @@ we converted from).  Fix endianness of the vcnum field used during
 session setup to distinguish multiple mounts to same server from different
 userids. Raw NTLMSSP fixed (it requires /proc/fs/cifs/experimental
 flag to be set to 2, and mount must enable krb5 to turn on extended security).
+Performance of file create to Samba improved (posix create on lookup
+removes 1 of 2 network requests sent on file create)
 
 Version 1.57
 ------------
diff --git a/fs/cifs/README b/fs/cifs/README
index db208ddb9899..ad92921dbde4 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -262,7 +262,8 @@ A partial list of the supported mount options follows:
                mount.  
  domain        Set the SMB/CIFS workgroup name prepended to the
                username during CIFS session establishment
-  uid           Set the default uid for inodes. For mounts to servers
+  forceuid      Set the default uid for inodes based on the uid
+                passed in. For mounts to servers
                which do support the CIFS Unix extensions, such as a
                properly configured Samba server, the server provides
                the uid, gid and mode so this parameter should  not be
@@ -292,6 +293,12 @@ A partial list of the supported mount options follows:
                the client.  Note that the mount.cifs helper must be
                at version 1.10 or higher to support specifying the uid
                (or gid) in non-numeric form.
+  forcegid      (similar to above but for the groupid instead of uid)
+  uid           Set the default uid for inodes, and indicate to the
+                cifs kernel driver which local user mounted. If the server
+                supports the unix extensions the default uid is
+                not used to fill in the owner fields of inodes (files)
+                unless the "forceuid" parameter is specified.
  gid           Set the default gid for inodes (similar to above).
  file_mode     If CIFS Unix extensions are not supported by the server
                this overrides the default mode for file inodes.
@@ -388,8 +395,13 @@ A partial list of the supported mount options follows:
                or the CIFS Unix Extensions equivalent and for those
                this mount option will have no effect.  Exporting cifs mounts
                under nfsd requires this mount option on the cifs mount.
+                This is now the default if the server supports the
+                required network operation.
  noserverino   Client generates inode numbers (rather than using the actual one
-                from the server) by default.
+                from the server). These inode numbers will vary after
+                unmount or reboot which can confuse some applications,
+                but not all server filesystems support unique inode
+                numbers.
  setuids       If the CIFS Unix extensions are negotiated with the server
                the client will attempt to set the effective uid and gid of
                the local process on newly created files, directories, and
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 83d62759c7c7..3bb11be8b6a8 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -275,7 +275,7 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
        case -EBUSY:
                /* someone else made a mount here whilst we were busy */
                while (d_mountpoint(nd->path.dentry) &&
-                       follow_down(&nd->path.mnt, &nd->path.dentry))
+                       follow_down(&nd->path))
                        ;
                err = 0;
        default:
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 67bf93a40d2e..4a4581cb2b5e 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -23,6 +23,7 @@
 #include <linux/string.h>
 #include <keys/user-type.h>
 #include <linux/key-type.h>
+#include <linux/inet.h>
 #include "cifsglob.h"
 #include "cifs_spnego.h"
 #include "cifs_debug.h"
@@ -73,9 +74,6 @@ struct key_type cifs_spnego_key_type = {
 * strlen(";sec=ntlmsspi") */
 #define MAX_MECH_STR_LEN        13
-/* max possible addr len eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/128 */
-#define MAX_IPV6_ADDR_LEN       43
 /* strlen of "host=" */
 #define HOST_KEY_LEN            5
@@ -102,7 +100,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
           host=hostname sec=mechanism uid=0xFF user=username */
        desc_len = MAX_VER_STR_LEN +
                   HOST_KEY_LEN + strlen(hostname) +
-                   IP_KEY_LEN + MAX_IPV6_ADDR_LEN +
+                   IP_KEY_LEN + INET6_ADDRSTRLEN +
                   MAX_MECH_STR_LEN +
                   UID_KEY_LEN + (sizeof(uid_t) * 2) +
                   USER_KEY_LEN + strlen(sesInfo->userName) + 1;
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 57ecdc83c26f..1403b5d86a73 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -552,130 +552,138 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
        return rc;
 }
+static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
-/* Retrieve an ACL from the server */
+                __u16 fid, u32 *pacllen)
-static struct cifs_ntsd *get_cifs_acl(u32 *pacllen, struct inode *inode,
-                                       const char *path, const __u16 *pfid)
 {
-        struct cifsFileInfo *open_file = NULL;
-        bool unlock_file = false;
-        int xid;
-        int rc = -EIO;
-        __u16 fid;
-        struct super_block *sb;
-        struct cifs_sb_info *cifs_sb;
        struct cifs_ntsd *pntsd = NULL;
+        int xid, rc;
+        xid = GetXid();
+        rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
+        FreeXid(xid);
-        cFYI(1, ("get mode from ACL for %s", path));
-        if (inode == NULL)
+        cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen));
-                return NULL;
+        return pntsd;
+}
+static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
+                const char *path, u32 *pacllen)
+{
+        struct cifs_ntsd *pntsd = NULL;
+        int oplock = 0;
+        int xid, rc;
+        __u16 fid;
        xid = GetXid();
-        if (pfid == NULL)
-                open_file = find_readable_file(CIFS_I(inode));
-        else
-                fid = *pfid;
-        sb = inode->i_sb;
+        rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, READ_CONTROL, 0,
-        if (sb == NULL) {
+                         &fid, &oplock, NULL, cifs_sb->local_nls,
-                FreeXid(xid);
+                         cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
-                return NULL;
+        if (rc) {
-        }
+                cERROR(1, ("Unable to open file to get ACL"));
-        cifs_sb = CIFS_SB(sb);
+                goto out;
-        if (open_file) {
-                unlock_file = true;
-                fid = open_file->netfid;
-        } else if (pfid == NULL) {
-                int oplock = 0;
-                /* open file */
-                rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN,
-                                READ_CONTROL, 0, &fid, &oplock, NULL,
-                                cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
-                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
-                if (rc != 0) {
-                        cERROR(1, ("Unable to open file to get ACL"));
-                        FreeXid(xid);
-                        return NULL;
-                }
        }
        rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
        cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen));
-        if (unlock_file == true) /* find_readable_file increments ref count */
-                atomic_dec(&open_file->wrtPending);
-        else if (pfid == NULL) /* if opened above we have to close the handle */
-                CIFSSMBClose(xid, cifs_sb->tcon, fid);
-        /* else handle was passed in by caller */
+        CIFSSMBClose(xid, cifs_sb->tcon, fid);
+ out:
        FreeXid(xid);
        return pntsd;
 }
-/* Set an ACL on the server */
+/* Retrieve an ACL from the server */
-static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
+static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
-                                struct inode *inode, const char *path)
+                                      struct inode *inode, const char *path,
+                                      u32 *pacllen)
 {
-        struct cifsFileInfo *open_file;
+        struct cifs_ntsd *pntsd = NULL;
-        bool unlock_file = false;
+        struct cifsFileInfo *open_file = NULL;
-        int xid;
-        int rc = -EIO;
-        __u16 fid;
-        struct super_block *sb;
-        struct cifs_sb_info *cifs_sb;
-        cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode));
+        if (inode)
+                open_file = find_readable_file(CIFS_I(inode));
+        if (!open_file)
+                return get_cifs_acl_by_path(cifs_sb, path, pacllen);
-        if (!inode)
+        pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen);
-                return rc;
+        atomic_dec(&open_file->wrtPending);
+        return pntsd;
+}
-        sb = inode->i_sb;
+static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
-        if (sb == NULL)
+                struct cifs_ntsd *pnntsd, u32 acllen)
-                return rc;
+{
+        int xid, rc;
-        cifs_sb = CIFS_SB(sb);
        xid = GetXid();
+        rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
+        FreeXid(xid);
-        open_file = find_readable_file(CIFS_I(inode));
+        cFYI(DBG2, ("SetCIFSACL rc = %d", rc));
-        if (open_file) {
+        return rc;
-                unlock_file = true;
+}
-                fid = open_file->netfid;
-        } else {
+static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
-                int oplock = 0;
+                struct cifs_ntsd *pnntsd, u32 acllen)
-                /* open file */
+{
-                rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN,
+        int oplock = 0;
-                                WRITE_DAC, 0, &fid, &oplock, NULL,
+        int xid, rc;
-                                cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+        __u16 fid;
-                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
-                if (rc != 0) {
+        xid = GetXid();
-                        cERROR(1, ("Unable to open file to set ACL"));
-                        FreeXid(xid);
+        rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, WRITE_DAC, 0,
-                        return rc;
+                         &fid, &oplock, NULL, cifs_sb->local_nls,
-                }
+                         cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+        if (rc) {
+                cERROR(1, ("Unable to open file to set ACL"));
+                goto out;
        }
        rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
        cFYI(DBG2, ("SetCIFSACL rc = %d", rc));
-        if (unlock_file)
-                atomic_dec(&open_file->wrtPending);
-        else
-                CIFSSMBClose(xid, cifs_sb->tcon, fid);
+        CIFSSMBClose(xid, cifs_sb->tcon, fid);
+ out:
        FreeXid(xid);
+        return rc;
+}
+/* Set an ACL on the server */
+static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
+                                struct inode *inode, const char *path)
+{
+        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+        struct cifsFileInfo *open_file;
+        int rc;
+        cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode));
+        open_file = find_readable_file(CIFS_I(inode));
+        if (!open_file)
+                return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
+        rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen);
+        atomic_dec(&open_file->wrtPending);
        return rc;
 }
 /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
-void acl_to_uid_mode(struct inode *inode, const char *path, const __u16 *pfid)
+void acl_to_uid_mode(struct cifs_sb_info *cifs_sb, struct inode *inode,
+                     const char *path, const __u16 *pfid)
 {
        struct cifs_ntsd *pntsd = NULL;
        u32 acllen = 0;
        int rc = 0;
        cFYI(DBG2, ("converting ACL to mode for %s", path));
-        pntsd = get_cifs_acl(&acllen, inode, path, pfid);
+        if (pfid)
+                pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen);
+        else
+                pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen);
        /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */
        if (pntsd)
@@ -698,7 +706,7 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
        cFYI(DBG2, ("set ACL from mode for %s", path));
        /* Get the security descriptor */
-        pntsd = get_cifs_acl(&secdesclen, inode, path, NULL);
+        pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen);
        /* Add three ACEs for owner, group, everyone getting rid of
           other ACEs as chmod disables ACEs and set the security descriptor */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 5e6d35804d73..0d92114195ab 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -146,7 +146,7 @@ cifs_read_super(struct super_block *sb, void *data,
 #endif
        sb->s_blocksize = CIFS_MAX_MSGSIZE;
        sb->s_blocksize_bits = 14;      /* default 2**14 = CIFS_MAX_MSGSIZE */
-        inode = cifs_iget(sb, ROOT_I);
+        inode = cifs_root_iget(sb, ROOT_I);
        if (IS_ERR(inode)) {
                rc = PTR_ERR(inode);
@@ -204,6 +204,9 @@ cifs_put_super(struct super_block *sb)
                cFYI(1, ("Empty cifs superblock info passed to unmount"));
                return;
        }
+        lock_kernel();
        rc = cifs_umount(sb, cifs_sb);
        if (rc)
                cERROR(1, ("cifs_umount failed with return code %d", rc));
@@ -216,7 +219,8 @@ cifs_put_super(struct super_block *sb)
        unload_nls(cifs_sb->local_nls);
        kfree(cifs_sb);
-        return;
+        unlock_kernel();
 }
 static int
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 051b71cfdea9..9570a0e8023f 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -36,7 +36,7 @@ extern void cifs_read_inode(struct inode *);
 /* Functions related to inodes */
 extern const struct inode_operations cifs_dir_inode_ops;
-extern struct inode *cifs_iget(struct super_block *, unsigned long);
+extern struct inode *cifs_root_iget(struct super_block *, unsigned long);
 extern int cifs_create(struct inode *, struct dentry *, int,
                       struct nameidata *);
 extern struct dentry *cifs_lookup(struct inode *, struct dentry *,
@@ -100,5 +100,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
-#define CIFS_VERSION   "1.58"
+#define CIFS_VERSION   "1.59"
 #endif                          /* _CIFSFS_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index fae083930eee..f9452329bcce 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -90,10 +90,10 @@ extern struct oplock_q_entry *AllocOplockQEntry(struct inode *, u16,
                                                 struct cifsTconInfo *);
 extern void DeleteOplockQEntry(struct oplock_q_entry *);
 extern void DeleteTconOplockQEntries(struct cifsTconInfo *);
-extern struct timespec cifs_NTtimeToUnix(u64 utc_nanoseconds_since_1601);
+extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
 extern u64 cifs_UnixTimeToNT(struct timespec);
-extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time);
+extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
-extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time);
+                                      int offset);
 extern int cifs_posix_open(char *full_path, struct inode **pinode,
                           struct super_block *sb, int mode, int oflags,
@@ -108,8 +108,8 @@ extern int cifs_get_inode_info(struct inode **pinode,
 extern int cifs_get_inode_info_unix(struct inode **pinode,
                        const unsigned char *search_path,
                        struct super_block *sb, int xid);
-extern void acl_to_uid_mode(struct inode *inode, const char *path,
+extern void acl_to_uid_mode(struct cifs_sb_info *cifs_sb, struct inode *inode,
-                            const __u16 *pfid);
+                            const char *path, const __u16 *pfid);
 extern int mode_to_acl(struct inode *inode, const char *path, __u64);
 extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index d06260251c30..b84c61d5bca4 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -524,8 +524,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
                        int val, seconds, remain, result;
                        struct timespec ts, utc;
                        utc = CURRENT_TIME;
-                        ts = cnvrtDosUnixTm(le16_to_cpu(rsp->SrvTime.Date),
+                        ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
-                                                le16_to_cpu(rsp->SrvTime.Time));
+                                            rsp->SrvTime.Time, 0);
                        cFYI(1, ("SrvTime %d sec since 1970 (utc: %d) diff: %d",
                                (int)ts.tv_sec, (int)utc.tv_sec,
                                (int)(utc.tv_sec - ts.tv_sec)));
@@ -2427,8 +2427,7 @@ querySymLinkRetry:
        params = 2 /* level */  + 4 /* rsrvd */  + name_len /* incl null */ ;
        pSMB->TotalDataCount = 0;
        pSMB->MaxParameterCount = cpu_to_le16(2);
-        /* BB find exact max data count below from sess structure BB */
+        pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
-        pSMB->MaxDataCount = cpu_to_le16(4000);
        pSMB->MaxSetupCount = 0;
        pSMB->Reserved = 0;
        pSMB->Flags = 0;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 4aa81a507b74..97f4311b9a8e 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -35,6 +35,7 @@
 #include <linux/namei.h>
 #include <asm/uaccess.h>
 #include <asm/processor.h>
+#include <linux/inet.h>
 #include <net/ipv6.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
@@ -61,7 +62,6 @@ struct smb_vol {
        char *domainname;
        char *UNC;
        char *UNCip;
-        char *in6_addr;   /* ipv6 address as human readable form of in6_addr */
        char *iocharset;  /* local code page for mapping to and from Unicode */
        char source_rfc1001_name[16]; /* netbios name of client */
        char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */
@@ -827,14 +827,16 @@ cifs_parse_mount_options(char *options, const char *devname,
        vol->target_rfc1001_name[0] = 0;
        vol->linux_uid = current_uid();  /* use current_euid() instead? */
        vol->linux_gid = current_gid();
-        vol->dir_mode = S_IRWXUGO;
-        /* 2767 perms indicate mandatory locking support */
+        /* default to only allowing write access to owner of the mount */
-        vol->file_mode = (S_IRWXUGO | S_ISGID) & (~S_IXGRP);
+        vol->dir_mode = vol->file_mode = S_IRUGO | S_IXUGO | S_IWUSR;
        /* vol->retry default is 0 (i.e. "soft" limited retry not hard retry) */
        vol->rw = true;
        /* default is always to request posix paths. */
        vol->posix_paths = 1;
+        /* default to using server inode numbers where available */
+        vol->server_ino = 1;
        if (!options)
                return 1;
@@ -955,10 +957,12 @@ cifs_parse_mount_options(char *options, const char *devname,
                                }
                                strcpy(vol->password, value);
                        }
-                } else if (strnicmp(data, "ip", 2) == 0) {
+                } else if (!strnicmp(data, "ip", 2) ||
+                           !strnicmp(data, "addr", 4)) {
                        if (!value || !*value) {
                                vol->UNCip = NULL;
-                        } else if (strnlen(value, 35) < 35) {
+                        } else if (strnlen(value, INET6_ADDRSTRLEN) <
+                                                        INET6_ADDRSTRLEN) {
                                vol->UNCip = value;
                        } else {
                                printk(KERN_WARNING "CIFS: ip address "
@@ -1092,17 +1096,17 @@ cifs_parse_mount_options(char *options, const char *devname,
                                return 1;
                        }
                } else if (strnicmp(data, "uid", 3) == 0) {
-                        if (value && *value) {
+                        if (value && *value)
                                vol->linux_uid =
                                        simple_strtoul(value, &value, 0);
+                } else if (strnicmp(data, "forceuid", 8) == 0) {
                                vol->override_uid = 1;
-                        }
                } else if (strnicmp(data, "gid", 3) == 0) {
-                        if (value && *value) {
+                        if (value && *value)
                                vol->linux_gid =
                                        simple_strtoul(value, &value, 0);
+                } else if (strnicmp(data, "forcegid", 8) == 0) {
                                vol->override_gid = 1;
-                        }
                } else if (strnicmp(data, "file_mode", 4) == 0) {
                        if (value && *value) {
                                vol->file_mode =
@@ -1315,16 +1319,6 @@ cifs_parse_mount_options(char *options, const char *devname,
                        vol->direct_io = 1;
                } else if (strnicmp(data, "forcedirectio", 13) == 0) {
                        vol->direct_io = 1;
-                } else if (strnicmp(data, "in6_addr", 8) == 0) {
-                        if (!value || !*value) {
-                                vol->in6_addr = NULL;
-                        } else if (strnlen(value, 49) == 48) {
-                                vol->in6_addr = value;
-                        } else {
-                                printk(KERN_WARNING "CIFS: ip v6 address not "
-                                                    "48 characters long\n");
-                                return 1;
-                        }
                } else if (strnicmp(data, "noac", 4) == 0) {
                        printk(KERN_WARNING "CIFS: Mount option noac not "
                                "supported. Instead set "
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 302ea15f02e6..06866841b97f 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -241,7 +241,7 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
        /* BB need same check in cifs_create too? */
        /* if not oplocked, invalidate inode pages if mtime or file
           size changed */
-        temp = cifs_NTtimeToUnix(le64_to_cpu(buf->LastWriteTime));
+        temp = cifs_NTtimeToUnix(buf->LastWriteTime);
        if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
                           (file->f_path.dentry->d_inode->i_size ==
                            (loff_t)le64_to_cpu(buf->EndOfFile))) {
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 9c869a6dcba1..fad882b075ba 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -85,10 +85,10 @@ static void cifs_unix_info_to_inode(struct inode *inode,
        __u64 num_of_bytes = le64_to_cpu(info->NumOfBytes);
        __u64 end_of_file = le64_to_cpu(info->EndOfFile);
-        inode->i_atime = cifs_NTtimeToUnix(le64_to_cpu(info->LastAccessTime));
+        inode->i_atime = cifs_NTtimeToUnix(info->LastAccessTime);
        inode->i_mtime =
-                cifs_NTtimeToUnix(le64_to_cpu(info->LastModificationTime));
+                cifs_NTtimeToUnix(info->LastModificationTime);
-        inode->i_ctime = cifs_NTtimeToUnix(le64_to_cpu(info->LastStatusChange));
+        inode->i_ctime = cifs_NTtimeToUnix(info->LastStatusChange);
        inode->i_mode = le64_to_cpu(info->Permissions);
        /*
@@ -554,14 +554,11 @@ int cifs_get_inode_info(struct inode **pinode,
        /* Linux can not store file creation time so ignore it */
        if (pfindData->LastAccessTime)
-                inode->i_atime = cifs_NTtimeToUnix
+                inode->i_atime = cifs_NTtimeToUnix(pfindData->LastAccessTime);
-                        (le64_to_cpu(pfindData->LastAccessTime));
        else /* do not need to use current_fs_time - time not stored */
                inode->i_atime = CURRENT_TIME;
-        inode->i_mtime =
+        inode->i_mtime = cifs_NTtimeToUnix(pfindData->LastWriteTime);
-                    cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime));
+        inode->i_ctime = cifs_NTtimeToUnix(pfindData->ChangeTime);
-        inode->i_ctime =
-            cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
        cFYI(DBG2, ("Attributes came in as 0x%x", attr));
        if (adjustTZ && (pTcon->ses) && (pTcon->ses->server)) {
                inode->i_ctime.tv_sec += pTcon->ses->server->timeAdj;
@@ -629,7 +626,7 @@ int cifs_get_inode_info(struct inode **pinode,
        /* fill in 0777 bits from ACL */
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
                cFYI(1, ("Getting mode bits from ACL"));
-                acl_to_uid_mode(inode, full_path, pfid);
+                acl_to_uid_mode(cifs_sb, inode, full_path, pfid);
        }
 #endif
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
@@ -699,7 +696,7 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
 }
 /* gets root inode */
-struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
+struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
 {
        int xid;
        struct cifs_sb_info *cifs_sb;
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index e2fe998989a3..32d6baa0a54f 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -853,12 +853,12 @@ smbCalcSize_LE(struct smb_hdr *ptr)
 #define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000)
-    /*
+/*
-     * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units)
+ * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units)
-     * into Unix UTC (based 1970-01-01, in seconds).
+ * into Unix UTC (based 1970-01-01, in seconds).
-     */
+ */
 struct timespec
-cifs_NTtimeToUnix(u64 ntutc)
+cifs_NTtimeToUnix(__le64 ntutc)
 {
        struct timespec ts;
        /* BB what about the timezone? BB */
@@ -866,7 +866,7 @@ cifs_NTtimeToUnix(u64 ntutc)
        /* Subtract the NTFS time offset, then convert to 1s intervals. */
        u64 t;
-        t = ntutc - NTFS_TIME_OFFSET;
+        t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET;
        ts.tv_nsec = do_div(t, 10000000) * 100;
        ts.tv_sec = t;
        return ts;
@@ -883,16 +883,12 @@ cifs_UnixTimeToNT(struct timespec t)
 static int total_days_of_prev_months[] =
 {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334};
+struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
-__le64 cnvrtDosCifsTm(__u16 date, __u16 time)
-{
-        return cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm(date, time)));
-}
-struct timespec cnvrtDosUnixTm(__u16 date, __u16 time)
 {
        struct timespec ts;
        int sec, min, days, month, year;
+        u16 date = le16_to_cpu(le_date);
+        u16 time = le16_to_cpu(le_time);
        SMB_TIME *st = (SMB_TIME *)&time;
        SMB_DATE *sd = (SMB_DATE *)&date;
@@ -933,7 +929,7 @@ struct timespec cnvrtDosUnixTm(__u16 date, __u16 time)
                days -= ((year & 0x03) == 0) && (month < 2 ? 1 : 0);
        sec += 24 * 60 * 60 * days;
-        ts.tv_sec = sec;
+        ts.tv_sec = sec + offset;
        /* cFYI(1,("sec after cnvrt dos to unix time %d",sec)); */
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 964e097c8203..86d0055dc529 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -115,17 +115,6 @@ construct_dentry(struct qstr *qstring, struct file *file,
        return rc;
 }
-static void AdjustForTZ(struct cifsTconInfo *tcon, struct inode *inode)
-{
-        if ((tcon) && (tcon->ses) && (tcon->ses->server)) {
-                inode->i_ctime.tv_sec += tcon->ses->server->timeAdj;
-                inode->i_mtime.tv_sec += tcon->ses->server->timeAdj;
-                inode->i_atime.tv_sec += tcon->ses->server->timeAdj;
-        }
-        return;
-}
 static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
                          char *buf, unsigned int *pobject_type, int isNewInode)
 {
@@ -150,26 +139,25 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
                allocation_size = le64_to_cpu(pfindData->AllocationSize);
                end_of_file = le64_to_cpu(pfindData->EndOfFile);
                tmp_inode->i_atime =
-                      cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime));
+                        cifs_NTtimeToUnix(pfindData->LastAccessTime);
                tmp_inode->i_mtime =
-                      cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime));
+                        cifs_NTtimeToUnix(pfindData->LastWriteTime);
                tmp_inode->i_ctime =
-                      cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
+                        cifs_NTtimeToUnix(pfindData->ChangeTime);
        } else { /* legacy, OS2 and DOS style */
-/*              struct timespec ts;*/
+                int offset = cifs_sb->tcon->ses->server->timeAdj;
                FIND_FILE_STANDARD_INFO *pfindData =
                        (FIND_FILE_STANDARD_INFO *)buf;
-                tmp_inode->i_mtime = cnvrtDosUnixTm(
+                tmp_inode->i_mtime = cnvrtDosUnixTm(pfindData->LastWriteDate,
-                                le16_to_cpu(pfindData->LastWriteDate),
+                                                    pfindData->LastWriteTime,
-                                le16_to_cpu(pfindData->LastWriteTime));
+                                                    offset);
-                tmp_inode->i_atime = cnvrtDosUnixTm(
+                tmp_inode->i_atime = cnvrtDosUnixTm(pfindData->LastAccessDate,
-                                le16_to_cpu(pfindData->LastAccessDate),
+                                                    pfindData->LastAccessTime,
-                                le16_to_cpu(pfindData->LastAccessTime));
+                                                    offset);
-                tmp_inode->i_ctime = cnvrtDosUnixTm(
+                tmp_inode->i_ctime = cnvrtDosUnixTm(pfindData->LastWriteDate,
-                                le16_to_cpu(pfindData->LastWriteDate),
+                                                    pfindData->LastWriteTime,
-                                le16_to_cpu(pfindData->LastWriteTime));
+                                                    offset);
-                AdjustForTZ(cifs_sb->tcon, tmp_inode);
                attr = le16_to_cpu(pfindData->Attributes);
                allocation_size = le32_to_cpu(pfindData->AllocationSize);
                end_of_file = le32_to_cpu(pfindData->DataSize);
@@ -331,11 +319,11 @@ static void unix_fill_in_inode(struct inode *tmp_inode,
        local_size  = tmp_inode->i_size;
        tmp_inode->i_atime =
-            cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime));
+            cifs_NTtimeToUnix(pfindData->LastAccessTime);
        tmp_inode->i_mtime =
-            cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastModificationTime));
+            cifs_NTtimeToUnix(pfindData->LastModificationTime);
        tmp_inode->i_ctime =
-            cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastStatusChange));
+            cifs_NTtimeToUnix(pfindData->LastStatusChange);
        tmp_inode->i_mode = le64_to_cpu(pfindData->Permissions);
        /* since we set the inode type below we need to mask off type
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 6a347fbc998a..ffd42815fda1 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -47,6 +47,8 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos,
                      struct pipe_inode_info *pipe, size_t count,
                      unsigned int flags)
 {
+        ssize_t (*splice_read)(struct file *, loff_t *,
+                               struct pipe_inode_info *, size_t, unsigned int);
        struct coda_file_info *cfi;
        struct file *host_file;
@@ -54,10 +56,11 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos,
        BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
        host_file = cfi->cfi_container;
-        if (!host_file->f_op || !host_file->f_op->splice_read)
+        splice_read = host_file->f_op->splice_read;
-                return -EINVAL;
+        if (!splice_read)
+                splice_read = default_file_splice_read;
-        return host_file->f_op->splice_read(host_file, ppos, pipe, count,flags);
+        return splice_read(host_file, ppos, pipe, count, flags);
 }
 static ssize_t
diff --git a/fs/compat.c b/fs/compat.c
index 681ed81e6be0..cdd51a3a7c53 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -471,7 +471,7 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd,
                ret = sys_fcntl(fd, cmd, (unsigned long)&f);
                set_fs(old_fs);
                if (cmd == F_GETLK && ret == 0) {
-                        /* GETLK was successfule and we need to return the data...
+                        /* GETLK was successful and we need to return the data...
                         * but it needs to fit in the compat structure.
                         * l_start shouldn't be too big, unless the original
                         * start + end is greater than COMPAT_OFF_T_MAX, in which
@@ -812,10 +812,8 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
                }
        }
-        lock_kernel();
        retval = do_mount((char*)dev_page, dir_page, (char*)type_page,
                        flags, (void*)data_page);
-        unlock_kernel();
 out4:
        free_page(data_page);
@@ -1488,7 +1486,7 @@ int compat_do_execve(char * filename,
        if (!bprm)
                goto out_files;
-        retval = mutex_lock_interruptible(&current->cred_exec_mutex);
+        retval = mutex_lock_interruptible(&current->cred_guard_mutex);
        if (retval < 0)
                goto out_free;
        current->in_execve = 1;
@@ -1550,7 +1548,7 @@ int compat_do_execve(char * filename,
        /* execve succeeded */
        current->fs->in_exec = 0;
        current->in_execve = 0;
-        mutex_unlock(&current->cred_exec_mutex);
+        mutex_unlock(&current->cred_guard_mutex);
        acct_update_integrals(current);
        free_bprm(bprm);
        if (displaced)
@@ -1573,7 +1571,7 @@ out_unmark:
 out_unlock:
        current->in_execve = 0;
-        mutex_unlock(&current->cred_exec_mutex);
+        mutex_unlock(&current->cred_guard_mutex);
 out_free:
        free_bprm(bprm);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index b83f6bcfa51a..0aac371bff0b 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1765,7 +1765,7 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long a
 /* Since old style bridge ioctl's endup using SIOCDEVPRIVATE
 * for some operations; this forces use of the newer bridge-utils that
- * use compatiable ioctls
+ * use compatible ioctls
 */
 static int old_bridge_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
 {
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index 762d287123ca..da6061a6df40 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -39,6 +39,9 @@ struct configfs_dirent {
        umode_t                 s_mode;
        struct dentry           * s_dentry;
        struct iattr            * s_iattr;
+#ifdef CONFIG_LOCKDEP
+        int                     s_depth;
+#endif
 };
 #define CONFIGFS_ROOT           0x0001
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 05373db21a4e..8e48b52205aa 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -78,11 +78,97 @@ static const struct dentry_operations configfs_dentry_ops = {
        .d_delete       = configfs_d_delete,
 };
+#ifdef CONFIG_LOCKDEP
+/*
+ * Helpers to make lockdep happy with our recursive locking of default groups'
+ * inodes (see configfs_attach_group() and configfs_detach_group()).
+ * We put default groups i_mutexes in separate classes according to their depth
+ * from the youngest non-default group ancestor.
+ *
+ * For a non-default group A having default groups A/B, A/C, and A/C/D, default
+ * groups A/B and A/C will have their inode's mutex in class
+ * default_group_class[0], and default group A/C/D will be in
+ * default_group_class[1].
+ *
+ * The lock classes are declared and assigned in inode.c, according to the
+ * s_depth value.
+ * The s_depth value is initialized to -1, adjusted to >= 0 when attaching
+ * default groups, and reset to -1 when all default groups are attached. During
+ * attachment, if configfs_create() sees s_depth > 0, the lock class of the new
+ * inode's mutex is set to default_group_class[s_depth - 1].
+ */
+static void configfs_init_dirent_depth(struct configfs_dirent *sd)
+{
+        sd->s_depth = -1;
+}
+static void configfs_set_dir_dirent_depth(struct configfs_dirent *parent_sd,
+                                          struct configfs_dirent *sd)
+{
+        int parent_depth = parent_sd->s_depth;
+        if (parent_depth >= 0)
+                sd->s_depth = parent_depth + 1;
+}
+static void
+configfs_adjust_dir_dirent_depth_before_populate(struct configfs_dirent *sd)
+{
+        /*
+         * item's i_mutex class is already set up, so s_depth is now only
+         * used to set new sub-directories s_depth, which is always done
+         * with item's i_mutex locked.
+         */
+        /*
+         *  sd->s_depth == -1 iff we are a non default group.
+         *  else (we are a default group) sd->s_depth > 0 (see
+         *  create_dir()).
+         */
+        if (sd->s_depth == -1)
+                /*
+                 * We are a non default group and we are going to create
+                 * default groups.
+                 */
+                sd->s_depth = 0;
+}
+static void
+configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd)
+{
+        /* We will not create default groups anymore. */
+        sd->s_depth = -1;
+}
+#else /* CONFIG_LOCKDEP */
+static void configfs_init_dirent_depth(struct configfs_dirent *sd)
+{
+}
+static void configfs_set_dir_dirent_depth(struct configfs_dirent *parent_sd,
+                                          struct configfs_dirent *sd)
+{
+}
+static void
+configfs_adjust_dir_dirent_depth_before_populate(struct configfs_dirent *sd)
+{
+}
+static void
+configfs_adjust_dir_dirent_depth_after_populate(struct configfs_dirent *sd)
+{
+}
+#endif /* CONFIG_LOCKDEP */
 /*
 * Allocates a new configfs_dirent and links it to the parent configfs_dirent
 */
-static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * parent_sd,
+static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent *parent_sd,
-                                                void * element)
+                                                   void *element, int type)
 {
        struct configfs_dirent * sd;
@@ -94,6 +180,8 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * pare
        INIT_LIST_HEAD(&sd->s_links);
        INIT_LIST_HEAD(&sd->s_children);
        sd->s_element = element;
+        sd->s_type = type;
+        configfs_init_dirent_depth(sd);
        spin_lock(&configfs_dirent_lock);
        if (parent_sd->s_type & CONFIGFS_USET_DROPPING) {
                spin_unlock(&configfs_dirent_lock);
@@ -138,12 +226,11 @@ int configfs_make_dirent(struct configfs_dirent * parent_sd,
 {
        struct configfs_dirent * sd;
-        sd = configfs_new_dirent(parent_sd, element);
+        sd = configfs_new_dirent(parent_sd, element, type);
        if (IS_ERR(sd))
                return PTR_ERR(sd);
        sd->s_mode = mode;
-        sd->s_type = type;
        sd->s_dentry = dentry;
        if (dentry) {
                dentry->d_fsdata = configfs_get(sd);
@@ -187,6 +274,7 @@ static int create_dir(struct config_item * k, struct dentry * p,
                error = configfs_make_dirent(p->d_fsdata, d, k, mode,
                                             CONFIGFS_DIR | CONFIGFS_USET_CREATING);
        if (!error) {
+                configfs_set_dir_dirent_depth(p->d_fsdata, d->d_fsdata);
                error = configfs_create(d, mode, init_dir);
                if (!error) {
                        inc_nlink(p->d_inode);
@@ -789,11 +877,13 @@ static int configfs_attach_group(struct config_item *parent_item,
                 * error, as rmdir() would.
                 */
                mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
+                configfs_adjust_dir_dirent_depth_before_populate(sd);
                ret = populate_groups(to_config_group(item));
                if (ret) {
                        configfs_detach_item(item);
                        dentry->d_inode->i_flags |= S_DEAD;
                }
+                configfs_adjust_dir_dirent_depth_after_populate(sd);
                mutex_unlock(&dentry->d_inode->i_mutex);
                if (ret)
                        d_delete(dentry);
@@ -916,11 +1006,11 @@ static int configfs_dump(struct configfs_dirent *sd, int level)
 * Note, btw, that this can be called at *any* time, even when a configfs
 * subsystem isn't registered, or when configfs is loading or unloading.
 * Just like configfs_register_subsystem().  So we take the same
- * precautions.  We pin the filesystem.  We lock each i_mutex _in_order_
+ * precautions.  We pin the filesystem.  We lock configfs_dirent_lock.
- * on our way down the tree.  If we can find the target item in the
+ * If we can find the target item in the
 * configfs tree, it must be part of the subsystem tree as well, so we
- * do not need the subsystem semaphore.  Holding the i_mutex chain locks
+ * do not need the subsystem semaphore.  Holding configfs_dirent_lock helps
- * out mkdir() and rmdir(), who might be racing us.
+ * lock out mkdir() and rmdir(), which might be racing us.
 */
 /*
@@ -933,17 +1023,21 @@ static int configfs_dump(struct configfs_dirent *sd, int level)
 * do that so we can unlock it if we find nothing.
 *
 * Here we do a depth-first search of the dentry hierarchy looking for
- * our object.  We take i_mutex on each step of the way down.  IT IS
+ * our object.
- * ESSENTIAL THAT i_mutex LOCKING IS ORDERED.  If we come back up a branch,
+ * We deliberately ignore items tagged as dropping since they are virtually
- * we'll drop the i_mutex.
+ * dead, as well as items in the middle of attachment since they virtually
+ * do not exist yet. This completes the locking out of racing mkdir() and
+ * rmdir().
+ * Note: subdirectories in the middle of attachment start with s_type =
+ * CONFIGFS_DIR|CONFIGFS_USET_CREATING set by create_dir().  When
+ * CONFIGFS_USET_CREATING is set, we ignore the item.  The actual setting of
+ * s_type is done in configfs_new_dirent(), which takes configfs_dirent_lock.
 *
- * If the target is not found, -ENOENT is bubbled up and we have released
+ * If the target is not found, -ENOENT is bubbled up.
- * all locks.  If the target was found, the locks will be cleared by
- * configfs_depend_rollback().
 *
 * This adds a requirement that all config_items be unique!
 *
- * This is recursive because the locking traversal is tricky.  There isn't
+ * This is recursive.  There isn't
 * much on the stack, though, so folks that need this function - be careful
 * about your stack!  Patches will be accepted to make it iterative.
 */
@@ -955,13 +1049,13 @@ static int configfs_depend_prep(struct dentry *origin,
        BUG_ON(!origin || !sd);
-        /* Lock this guy on the way down */
-        mutex_lock(&sd->s_dentry->d_inode->i_mutex);
        if (sd->s_element == target)  /* Boo-yah */
                goto out;
        list_for_each_entry(child_sd, &sd->s_children, s_sibling) {
-                if (child_sd->s_type & CONFIGFS_DIR) {
+                if ((child_sd->s_type & CONFIGFS_DIR) &&
+                    !(child_sd->s_type & CONFIGFS_USET_DROPPING) &&
+                    !(child_sd->s_type & CONFIGFS_USET_CREATING)) {
                        ret = configfs_depend_prep(child_sd->s_dentry,
                                                   target);
                        if (!ret)
@@ -970,33 +1064,12 @@ static int configfs_depend_prep(struct dentry *origin,
        }
        /* We looped all our children and didn't find target */
-        mutex_unlock(&sd->s_dentry->d_inode->i_mutex);
        ret = -ENOENT;
 out:
        return ret;
 }
-/*
- * This is ONLY called if configfs_depend_prep() did its job.  So we can
- * trust the entire path from item back up to origin.
- *
- * We walk backwards from item, unlocking each i_mutex.  We finish by
- * unlocking origin.
- */
-static void configfs_depend_rollback(struct dentry *origin,
-                                     struct config_item *item)
-{
-        struct dentry *dentry = item->ci_dentry;
-        while (dentry != origin) {
-                mutex_unlock(&dentry->d_inode->i_mutex);
-                dentry = dentry->d_parent;
-        }
-        mutex_unlock(&origin->d_inode->i_mutex);
-}
 int configfs_depend_item(struct configfs_subsystem *subsys,
                         struct config_item *target)
 {
@@ -1037,17 +1110,21 @@ int configfs_depend_item(struct configfs_subsystem *subsys,
        /* Ok, now we can trust subsys/s_item */
-        /* Scan the tree, locking i_mutex recursively, return 0 if found */
+        spin_lock(&configfs_dirent_lock);
+        /* Scan the tree, return 0 if found */
        ret = configfs_depend_prep(subsys_sd->s_dentry, target);
        if (ret)
-                goto out_unlock_fs;
+                goto out_unlock_dirent_lock;
-        /* We hold all i_mutexes from the subsystem down to the target */
+        /*
+         * We are sure that the item is not about to be removed by rmdir(), and
+         * not in the middle of attachment by mkdir().
+         */
        p = target->ci_dentry->d_fsdata;
        p->s_dependent_count += 1;
-        configfs_depend_rollback(subsys_sd->s_dentry, target);
+out_unlock_dirent_lock:
+        spin_unlock(&configfs_dirent_lock);
 out_unlock_fs:
        mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex);
@@ -1072,10 +1149,10 @@ void configfs_undepend_item(struct configfs_subsystem *subsys,
        struct configfs_dirent *sd;
        /*
-         * Since we can trust everything is pinned, we just need i_mutex
+         * Since we can trust everything is pinned, we just need
-         * on the item.
+         * configfs_dirent_lock.
         */
-        mutex_lock(&target->ci_dentry->d_inode->i_mutex);
+        spin_lock(&configfs_dirent_lock);
        sd = target->ci_dentry->d_fsdata;
        BUG_ON(sd->s_dependent_count < 1);
@@ -1086,7 +1163,7 @@ void configfs_undepend_item(struct configfs_subsystem *subsys,
         * After this unlock, we cannot trust the item to stay alive!
         * DO NOT REFERENCE item after this unlock.
         */
-        mutex_unlock(&target->ci_dentry->d_inode->i_mutex);
+        spin_unlock(&configfs_dirent_lock);
 }
 EXPORT_SYMBOL(configfs_undepend_item);
@@ -1286,13 +1363,6 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
        if (sd->s_type & CONFIGFS_USET_DEFAULT)
                return -EPERM;
-        /*
-         * Here's where we check for dependents.  We're protected by
-         * i_mutex.
-         */
-        if (sd->s_dependent_count)
-                return -EBUSY;
        /* Get a working ref until we have the child */
        parent_item = configfs_get_config_item(dentry->d_parent);
        subsys = to_config_group(parent_item)->cg_subsys;
@@ -1316,9 +1386,17 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
                mutex_lock(&configfs_symlink_mutex);
                spin_lock(&configfs_dirent_lock);
-                ret = configfs_detach_prep(dentry, &wait_mutex);
+                /*
-                if (ret)
+                 * Here's where we check for dependents.  We're protected by
-                        configfs_detach_rollback(dentry);
+                 * configfs_dirent_lock.
+                 * If no dependent, atomically tag the item as dropping.
+                 */
+                ret = sd->s_dependent_count ? -EBUSY : 0;
+                if (!ret) {
+                        ret = configfs_detach_prep(dentry, &wait_mutex);
+                        if (ret)
+                                configfs_detach_rollback(dentry);
+                }
                spin_unlock(&configfs_dirent_lock);
                mutex_unlock(&configfs_symlink_mutex);
@@ -1429,7 +1507,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file)
         */
        err = -ENOENT;
        if (configfs_dirent_is_ready(parent_sd)) {
-                file->private_data = configfs_new_dirent(parent_sd, NULL);
+                file->private_data = configfs_new_dirent(parent_sd, NULL, 0);
                if (IS_ERR(file->private_data))
                        err = PTR_ERR(file->private_data);
                else
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 5d349d38e056..4921e7426d95 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -33,10 +33,15 @@
 #include <linux/backing-dev.h>
 #include <linux/capability.h>
 #include <linux/sched.h>
+#include <linux/lockdep.h>
 #include <linux/configfs.h>
 #include "configfs_internal.h"
+#ifdef CONFIG_LOCKDEP
+/*
+ * Lock class keys for inode i_mutexes, one per depth level.  The entry for
+ * a dirent of depth N (N > 0) is default_group_class[N - 1], so at most
+ * MAX_LOCK_DEPTH levels can be told apart.
+ */
+static struct lock_class_key default_group_class[MAX_LOCK_DEPTH];
+#endif
 extern struct super_block * configfs_sb;
 static const struct address_space_operations configfs_aops = {
@@ -150,6 +155,38 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
        return inode;
 }
+#ifdef CONFIG_LOCKDEP
+/*
+ * Pick the lockdep class of @inode's i_mutex from the depth recorded in @sd.
+ *
+ * A depth of N (N > 0) selects default_group_class[N - 1].  A depth of zero
+ * or less leaves the lock class of the inode untouched.  When the depth is
+ * larger than the number of entries in default_group_class, no class is set
+ * and an informational message is logged instead.
+ */
+static void configfs_set_inode_lock_class(struct configfs_dirent *sd,
+                                          struct inode *inode)
+{
+        int depth = sd->s_depth;
+        if (depth > 0) {
+                if (depth <= ARRAY_SIZE(default_group_class)) {
+                        lockdep_set_class(&inode->i_mutex,
+                                          &default_group_class[depth - 1]);
+                } else {
+                        /*
+                         * No class is left for this depth.  In practice the
+                         * maximum level of locking depth is already reached
+                         * by then, so just tell the user why lockdep reports
+                         * may be unreliable from here on.
+                         */
+                        printk(KERN_INFO "configfs: Too many levels of inodes"
+                               " for the locking correctness validator.\n");
+                        printk(KERN_INFO "Spurious warnings may appear.\n");
+                }
+        }
+}
+#else /* CONFIG_LOCKDEP */
+/* Empty stub: the caller invokes this unconditionally, without an #ifdef. */
+static void configfs_set_inode_lock_class(struct configfs_dirent *sd,
+                                          struct inode *inode)
+{
+}
+#endif /* CONFIG_LOCKDEP */
 int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *))
 {
        int error = 0;
@@ -162,6 +199,7 @@ int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *
                                        struct inode *p_inode = dentry->d_parent->d_inode;
                                        p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME;
                                }
+                                configfs_set_inode_lock_class(sd, inode);
                                goto Proceed;
                        }
                        else
diff --git a/fs/dcache.c b/fs/dcache.c
index 75659a6fd1f8..9e5cd3c3a6ba 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1910,7 +1910,7 @@ char *__d_path(const struct path *path, struct path *root,
        spin_lock(&vfsmount_lock);
        prepend(&end, &buflen, "\0", 1);
-        if (!IS_ROOT(dentry) && d_unhashed(dentry) &&
+        if (d_unlinked(dentry) &&
                (prepend(&end, &buflen, " (deleted)", 10) != 0))
                        goto Elong;
@@ -2035,7 +2035,7 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
        spin_lock(&dcache_lock);
        prepend(&end, &buflen, "\0", 1);
-        if (!IS_ROOT(dentry) && d_unhashed(dentry) &&
+        if (d_unlinked(dentry) &&
                (prepend(&end, &buflen, "//deleted", 9) != 0))
                        goto Elong;
        if (buflen < 1)
@@ -2097,9 +2097,8 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
        read_unlock(&current->fs->lock);
        error = -ENOENT;
-        /* Has the current directory has been unlinked? */
        spin_lock(&dcache_lock);
-        if (IS_ROOT(pwd.dentry) || !d_unhashed(pwd.dentry)) {
+        if (!d_unlinked(pwd.dentry)) {
                unsigned long len;
                struct path tmp = root;
                char * cwd;
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index c68edb969441..9b1d285f9fe6 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -557,8 +557,10 @@ static int __init init_devpts_fs(void)
        int err = register_filesystem(&devpts_fs_type);
        if (!err) {
                devpts_mnt = kern_mount(&devpts_fs_type);
-                if (IS_ERR(devpts_mnt))
+                if (IS_ERR(devpts_mnt)) {
                        err = PTR_ERR(devpts_mnt);
+                        unregister_filesystem(&devpts_fs_type);
+                }
        }
        return err;
 }
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 05763bbc2050..8b10b87dc01a 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1127,7 +1127,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
                rw = WRITE_ODIRECT;
        if (bdev)
-                bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
+                bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
        if (offset & blocksize_mask) {
                if (bdev)
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 858fba14aaa6..c4dfa1dcc86f 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -49,7 +49,8 @@ static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
        spin_unlock(&ls->ls_recover_list_lock);
        if (!found)
-                de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_KERNEL);
+                de = kzalloc(sizeof(struct dlm_direntry) + len,
+                             ls->ls_allocation);
        return de;
 }
@@ -211,7 +212,7 @@ int dlm_recover_directory(struct dlm_ls *ls)
        dlm_dir_clear(ls);
-        last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
+        last_name = kmalloc(DLM_RESNAME_MAXLEN, ls->ls_allocation);
        if (!last_name)
                goto out;
@@ -322,7 +323,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
        if (namelen > DLM_RESNAME_MAXLEN)
                return -EINVAL;
-        de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_KERNEL);
+        de = kzalloc(sizeof(struct dlm_direntry) + namelen, ls->ls_allocation);
        if (!de)
                return -ENOMEM;
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index cd8e2df3c295..d489fcc86713 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -384,7 +384,7 @@ static void threads_stop(void)
        dlm_astd_stop();
 }
-static int new_lockspace(char *name, int namelen, void **lockspace,
+static int new_lockspace(const char *name, int namelen, void **lockspace,
                         uint32_t flags, int lvblen)
 {
        struct dlm_ls *ls;
@@ -419,16 +419,14 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
                        break;
                }
                ls->ls_create_count++;
-                module_put(THIS_MODULE);
+                *lockspace = ls;
-                error = 1; /* not an error, return 0 */
+                error = 1;
                break;
        }
        spin_unlock(&lslist_lock);
-        if (error < 0)
-                goto out;
        if (error)
-                goto ret_zero;
+                goto out;
        error = -ENOMEM;
@@ -583,7 +581,6 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
        dlm_create_debug_file(ls);
        log_debug(ls, "join complete");
- ret_zero:
        *lockspace = ls;
        return 0;
@@ -614,7 +611,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
        return error;
 }
-int dlm_new_lockspace(char *name, int namelen, void **lockspace,
+int dlm_new_lockspace(const char *name, int namelen, void **lockspace,
                      uint32_t flags, int lvblen)
 {
        int error = 0;
@@ -628,7 +625,9 @@ int dlm_new_lockspace(char *name, int namelen, void **lockspace,
        error = new_lockspace(name, namelen, lockspace, flags, lvblen);
        if (!error)
                ls_count++;
-        else if (!ls_count)
+        if (error > 0)
+                error = 0;
+        if (!ls_count)
                threads_stop();
 out:
        mutex_unlock(&ls_lock);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 609108a83267..cdb580a9c7a2 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -309,6 +309,20 @@ static void lowcomms_state_change(struct sock *sk)
                lowcomms_write_space(sk);
 }
+/*
+ * Hand the connection for @nodeid to lowcomms_connect_sock().
+ *
+ * Returns 0 without doing anything when @nodeid is our own node id, and
+ * -ENOMEM when nodeid2con() cannot provide a connection structure.
+ * Otherwise returns 0 once lowcomms_connect_sock() has been called; the
+ * outcome of that call is not reported to the caller.
+ */
+int dlm_lowcomms_connect_node(int nodeid)
+{
+        struct connection *con;
+        if (nodeid == dlm_our_nodeid())
+                return 0;
+        con = nodeid2con(nodeid, GFP_NOFS);
+        if (!con)
+                return -ENOMEM;
+        lowcomms_connect_sock(con);
+        return 0;
+}
 /* Make a socket active */
 static int add_sock(struct socket *sock, struct connection *con)
 {
@@ -486,7 +500,7 @@ static void process_sctp_notification(struct connection *con,
                                return;
                        }
-                        new_con = nodeid2con(nodeid, GFP_KERNEL);
+                        new_con = nodeid2con(nodeid, GFP_NOFS);
                        if (!new_con)
                                return;
@@ -722,7 +736,7 @@ static int tcp_accept_from_sock(struct connection *con)
         *  the same time and the connections cross on the wire.
         *  In this case we store the incoming one in "othercon"
         */
-        newcon = nodeid2con(nodeid, GFP_KERNEL);
+        newcon = nodeid2con(nodeid, GFP_NOFS);
        if (!newcon) {
                result = -ENOMEM;
                goto accept_err;
@@ -732,7 +746,7 @@ static int tcp_accept_from_sock(struct connection *con)
                struct connection *othercon = newcon->othercon;
                if (!othercon) {
-                        othercon = kmem_cache_zalloc(con_cache, GFP_KERNEL);
+                        othercon = kmem_cache_zalloc(con_cache, GFP_NOFS);
                        if (!othercon) {
                                log_print("failed to allocate incoming socket");
                                mutex_unlock(&newcon->sock_mutex);
@@ -1421,7 +1435,7 @@ static int work_start(void)
+/*
+ * Set the low four bits of con->flags and clear sk_user_data on the
+ * connection's socket.  Both con->sock and con->sock->sk are checked, so a
+ * socket that has no sk attached is skipped rather than dereferenced.
+ * NOTE(review): the meaning of the 0x0F flag bits is not visible here.
+ */
 static void stop_conn(struct connection *con)
 {
        con->flags |= 0x0F;
-        if (con->sock)
+        if (con->sock && con->sock->sk)
                con->sock->sk->sk_user_data = NULL;
 }
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
index a9a9618c0d3f..1311e6426287 100644
--- a/fs/dlm/lowcomms.h
+++ b/fs/dlm/lowcomms.h
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2009 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -19,6 +19,7 @@ void dlm_lowcomms_stop(void);
 int dlm_lowcomms_close(int nodeid);
 void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc);
 void dlm_lowcomms_commit_buffer(void *mh);
+int dlm_lowcomms_connect_node(int nodeid);
 #endif                          /* __LOWCOMMS_DOT_H__ */
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 26133f05ae3a..b128775913b2 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2009 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -17,6 +17,7 @@
 #include "recover.h"
 #include "rcom.h"
 #include "config.h"
+#include "lowcomms.h"
 static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
 {
@@ -45,9 +46,9 @@ static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
 static int dlm_add_member(struct dlm_ls *ls, int nodeid)
 {
        struct dlm_member *memb;
-        int w;
+        int w, error;
-        memb = kzalloc(sizeof(struct dlm_member), GFP_KERNEL);
+        memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation);
        if (!memb)
                return -ENOMEM;
@@ -57,6 +58,12 @@ static int dlm_add_member(struct dlm_ls *ls, int nodeid)
                return w;
        }
+        error = dlm_lowcomms_connect_node(nodeid);
+        if (error < 0) {
+                kfree(memb);
+                return error;
+        }
        memb->nodeid = nodeid;
        memb->weight = w;
        add_ordered_member(ls, memb);
@@ -136,7 +143,7 @@ static void make_member_array(struct dlm_ls *ls)
        ls->ls_total_weight = total;
-        array = kmalloc(sizeof(int) * total, GFP_KERNEL);
+        array = kmalloc(sizeof(int) * total, ls->ls_allocation);
        if (!array)
                return;
@@ -219,7 +226,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
                        continue;
                log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]);
-                memb = kzalloc(sizeof(struct dlm_member), GFP_KERNEL);
+                memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation);
                if (!memb)
                        return -ENOMEM;
                memb->nodeid = rv->new[i];
@@ -334,7 +341,7 @@ int dlm_ls_start(struct dlm_ls *ls)
        int *ids = NULL, *new = NULL;
        int error, ids_count = 0, new_count = 0;
-        rv = kzalloc(sizeof(struct dlm_recover), GFP_KERNEL);
+        rv = kzalloc(sizeof(struct dlm_recover), ls->ls_allocation);
        if (!rv)
                return -ENOMEM;
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
index daa4183fbb84..7a2307c08911 100644
--- a/fs/dlm/requestqueue.c
+++ b/fs/dlm/requestqueue.c
@@ -35,7 +35,7 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms)
        struct rq_entry *e;
        int length = ms->m_header.h_length - sizeof(struct dlm_message);
-        e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
+        e = kmalloc(sizeof(struct rq_entry) + length, ls->ls_allocation);
        if (!e) {
                log_print("dlm_add_requestqueue: out of memory len %d", length);
                return;
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index fa4c7e7d15d9..12d649602d3a 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -27,6 +27,7 @@
 #include <linux/mount.h>
 #include <linux/key.h>
 #include <linux/seq_file.h>
+#include <linux/smp_lock.h>
 #include <linux/file.h>
 #include <linux/crypto.h>
 #include "ecryptfs_kernel.h"
@@ -120,9 +121,13 @@ static void ecryptfs_put_super(struct super_block *sb)
 {
        struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
+        lock_kernel();
        ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
        kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
        ecryptfs_set_superblock_private(sb, NULL);
+        unlock_kernel();
 }
 /**
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 2a701d593d35..3f0e1974abdc 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -16,6 +16,7 @@
 #include <linux/anon_inodes.h>
 #include <linux/eventfd.h>
 #include <linux/syscalls.h>
+#include <linux/module.h>
 struct eventfd_ctx {
        wait_queue_head_t wqh;
@@ -56,6 +57,7 @@ int eventfd_signal(struct file *file, int n)
        return n;
 }
+EXPORT_SYMBOL_GPL(eventfd_signal);
 static int eventfd_release(struct inode *inode, struct file *file)
 {
@@ -197,6 +199,7 @@ struct file *eventfd_fget(int fd)
        return file;
 }
+EXPORT_SYMBOL_GPL(eventfd_fget);
 SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 {
diff --git a/fs/exec.c b/fs/exec.c
index 895823d0149d..e639957d7a57 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -33,6 +33,7 @@
 #include <linux/string.h>
 #include <linux/init.h>
 #include <linux/pagemap.h>
+#include <linux/perf_counter.h>
 #include <linux/highmem.h>
 #include <linux/spinlock.h>
 #include <linux/key.h>
@@ -922,6 +923,7 @@ void set_task_comm(struct task_struct *tsk, char *buf)
        task_lock(tsk);
        strlcpy(tsk->comm, buf, sizeof(tsk->comm));
        task_unlock(tsk);
+        perf_counter_comm(tsk);
 }
 int flush_old_exec(struct linux_binprm * bprm)
@@ -990,6 +992,13 @@ int flush_old_exec(struct linux_binprm * bprm)
        current->personality &= ~bprm->per_clear;
+        /*
+         * Flush performance counters when crossing a
+         * security domain:
+         */
+        if (!get_dumpable(current->mm))
+                perf_counter_exit_task(current);
        /* An exec changes our domain. We are no longer part of the thread
           group */
@@ -1016,7 +1025,7 @@ void install_exec_creds(struct linux_binprm *bprm)
        commit_creds(bprm->cred);
        bprm->cred = NULL;
-        /* cred_exec_mutex must be held at least to this point to prevent
+        /* cred_guard_mutex must be held at least to this point to prevent
         * ptrace_attach() from altering our determination of the task's
         * credentials; any time after this it may be unlocked */
@@ -1026,7 +1035,7 @@ EXPORT_SYMBOL(install_exec_creds);
 /*
 * determine how safe it is to execute the proposed program
- * - the caller must hold current->cred_exec_mutex to protect against
+ * - the caller must hold current->cred_guard_mutex to protect against
 *   PTRACE_ATTACH
 */
 int check_unsafe_exec(struct linux_binprm *bprm)
@@ -1268,7 +1277,7 @@ int do_execve(char * filename,
        if (!bprm)
                goto out_files;
-        retval = mutex_lock_interruptible(&current->cred_exec_mutex);
+        retval = mutex_lock_interruptible(&current->cred_guard_mutex);
        if (retval < 0)
                goto out_free;
        current->in_execve = 1;
@@ -1331,7 +1340,7 @@ int do_execve(char * filename,
        /* execve succeeded */
        current->fs->in_exec = 0;
        current->in_execve = 0;
-        mutex_unlock(&current->cred_exec_mutex);
+        mutex_unlock(&current->cred_guard_mutex);
        acct_update_integrals(current);
        free_bprm(bprm);
        if (displaced)
@@ -1354,7 +1363,7 @@ out_unmark:
 out_unlock:
        current->in_execve = 0;
-        mutex_unlock(&current->cred_exec_mutex);
+        mutex_unlock(&current->cred_guard_mutex);
 out_free:
        free_bprm(bprm);
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index b1512c4bb8c7..24667eedc023 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -175,10 +175,4 @@ int exofs_async_op(struct osd_request *or,
 int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr);
-int osd_req_read_kern(struct osd_request *or,
-        const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
-int osd_req_write_kern(struct osd_request *or,
-        const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
 #endif /*ifndef __EXOFS_COM_H__*/
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index ba8d9fab4693..77d0a295eb1c 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -59,10 +59,9 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
                struct inode *inode)
 {
        struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
-        struct request_queue *req_q = sbi->s_dev->scsi_device->request_queue;
        pcol->sbi = sbi;
-        pcol->req_q = req_q;
+        pcol->req_q = osd_request_queue(sbi->s_dev);
        pcol->inode = inode;
        pcol->expected_pages = expected_pages;
@@ -266,7 +265,7 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
                goto err;
        }
-        osd_req_read(or, &obj, pcol->bio, i_start);
+        osd_req_read(or, &obj, i_start, pcol->bio, pcol->length);
        if (is_sync) {
                exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred);
@@ -522,7 +521,8 @@ static int write_exec(struct page_collect *pcol)
        *pcol_copy = *pcol;
-        osd_req_write(or, &obj, pcol_copy->bio, i_start);
+        pcol_copy->bio->bi_rw |= (1 << BIO_RW); /* FIXME: bio_set_dir() */
+        osd_req_write(or, &obj, i_start, pcol_copy->bio, pcol_copy->length);
        ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred);
        if (unlikely(ret)) {
                EXOFS_ERR("write_exec: exofs_async_op() Faild\n");
diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c
index b249ae97fb15..b3d2ccb87aaa 100644
--- a/fs/exofs/osd.c
+++ b/fs/exofs/osd.c
@@ -50,10 +50,10 @@ int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid)
        /* FIXME: should be include in osd_sense_info */
        if (in_resid)
-                *in_resid = or->in.req ? or->in.req->data_len : 0;
+                *in_resid = or->in.req ? or->in.req->resid_len : 0;
        if (out_resid)
-                *out_resid = or->out.req ? or->out.req->data_len : 0;
+                *out_resid = or->out.req ? or->out.req->resid_len : 0;
        return ret;
 }
@@ -125,29 +125,3 @@ int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
        return -EIO;
 }
-int osd_req_read_kern(struct osd_request *or,
-        const struct osd_obj_id *obj, u64 offset, void* buff, u64 len)
-{
-        struct request_queue *req_q = or->osd_dev->scsi_device->request_queue;
-        struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
-        if (!bio)
-                return -ENOMEM;
-        osd_req_read(or, obj, bio, offset);
-        return 0;
-}
-int osd_req_write_kern(struct osd_request *or,
-        const struct osd_obj_id *obj, u64 offset, void* buff, u64 len)
-{
-        struct request_queue *req_q = or->osd_dev->scsi_device->request_queue;
-        struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
-        if (!bio)
-                return -ENOMEM;
-        osd_req_write(or, obj, bio, offset);
-        return 0;
-}
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 9f1985e857e2..8216c5b77b53 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -200,20 +200,21 @@ static const struct export_operations exofs_export_ops;
 /*
 * Write the superblock to the OSD
 */
-static void exofs_write_super(struct super_block *sb)
+static int exofs_sync_fs(struct super_block *sb, int wait)
 {
        struct exofs_sb_info *sbi;
        struct exofs_fscb *fscb;
        struct osd_request *or;
        struct osd_obj_id obj;
-        int ret;
+        int ret = -ENOMEM;
        fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL);
        if (!fscb) {
                EXOFS_ERR("exofs_write_super: memory allocation failed.\n");
-                return;
+                return -ENOMEM;
        }
+        lock_super(sb);
        lock_kernel();
        sbi = sb->s_fs_info;
        fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
@@ -246,7 +247,17 @@ out:
        if (or)
                osd_end_request(or);
        unlock_kernel();
+        unlock_super(sb);
        kfree(fscb);
+        return ret;
+}
+/*
+ * ->write_super hook.  Unless MS_RDONLY is set in sb->s_flags, write the
+ * superblock out through exofs_sync_fs(sb, 1); its return value is dropped
+ * because this hook returns void.  With MS_RDONLY set nothing is written
+ * and only sb->s_dirt is cleared.
+ */
+static void exofs_write_super(struct super_block *sb)
+{
+        if (!(sb->s_flags & MS_RDONLY))
+                exofs_sync_fs(sb, 1);
+        else
+                sb->s_dirt = 0;
+}
 /*
@@ -258,6 +269,11 @@ static void exofs_put_super(struct super_block *sb)
        int num_pend;
        struct exofs_sb_info *sbi = sb->s_fs_info;
+        lock_kernel();
+        if (sb->s_dirt)
+                exofs_write_super(sb);
        /* make sure there are no pending commands */
        for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
             num_pend = atomic_read(&sbi->s_curr_pending)) {
@@ -271,6 +287,8 @@ static void exofs_put_super(struct super_block *sb)
        osduld_put_device(sbi->s_dev);
        kfree(sb->s_fs_info);
        sb->s_fs_info = NULL;
+        unlock_kernel();
 }
 /*
@@ -484,6 +502,7 @@ static const struct super_operations exofs_sops = {
        .delete_inode   = exofs_delete_inode,
        .put_super      = exofs_put_super,
        .write_super    = exofs_write_super,
+        .sync_fs        = exofs_sync_fs,
        .statfs         = exofs_statfs,
 };
diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile
index e0b2b43c1fdb..f42af45cfd88 100644
--- a/fs/ext2/Makefile
+++ b/fs/ext2/Makefile
@@ -4,7 +4,7 @@
 obj-$(CONFIG_EXT2_FS) += ext2.o
-ext2-y := balloc.o dir.o file.o fsync.o ialloc.o inode.o \
+ext2-y := balloc.o dir.o file.o ialloc.o inode.o \
          ioctl.o namei.o super.o symlink.o
 ext2-$(CONFIG_EXT2_FS_XATTR)     += xattr.o xattr_user.o xattr_trusted.o
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 2999d72153b7..003500498c22 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -720,5 +720,5 @@ const struct file_operations ext2_dir_operations = {
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = ext2_compat_ioctl,
 #endif
-        .fsync          = ext2_sync_file,
+        .fsync          = simple_fsync,
 };
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 3203042b36ef..f2e5811936d0 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -27,7 +27,7 @@ struct ext2_inode_info {
        /*
         * i_block_group is the number of the block group which contains
         * this file's inode.  Constant across the lifetime of the inode,
-         * it is ued for making block allocation decisions - we try to
+         * it is used for making block allocation decisions - we try to
         * place a file's data blocks near its inode block, and new inodes
         * near to their parent directory's inode.
         */
@@ -113,9 +113,6 @@ extern int ext2_empty_dir (struct inode *);
 extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **);
 extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *);
-/* fsync.c */
-extern int ext2_sync_file (struct file *, struct dentry *, int);
 /* ialloc.c */
 extern struct inode * ext2_new_inode (struct inode *, int);
 extern void ext2_free_inode (struct inode *);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 45ed07122182..2b9e47dc9222 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -55,7 +55,7 @@ const struct file_operations ext2_file_operations = {
        .mmap           = generic_file_mmap,
        .open           = generic_file_open,
        .release        = ext2_release_file,
-        .fsync          = ext2_sync_file,
+        .fsync          = simple_fsync,
        .splice_read    = generic_file_splice_read,
        .splice_write   = generic_file_splice_write,
 };
@@ -72,7 +72,7 @@ const struct file_operations ext2_xip_file_operations = {
        .mmap           = xip_file_mmap,
        .open           = generic_file_open,
        .release        = ext2_release_file,
-        .fsync          = ext2_sync_file,
+        .fsync          = simple_fsync,
 };
 #endif
diff --git a/fs/ext2/fsync.c b/fs/ext2/fsync.c
deleted file mode 100644
index fc66c93fcb5c..000000000000
--- a/fs/ext2/fsync.c
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- *  linux/fs/ext2/fsync.c
- *
- *  Copyright (C) 1993  Stephen Tweedie (sct@dcs.ed.ac.uk)
- *  from
- *  Copyright (C) 1992  Remy Card (card@masi.ibp.fr)
- *                      Laboratoire MASI - Institut Blaise Pascal
- *                      Universite Pierre et Marie Curie (Paris VI)
- *  from
- *  linux/fs/minix/truncate.c   Copyright (C) 1991, 1992  Linus Torvalds
- * 
- *  ext2fs fsync primitive
- *
- *  Big-endian to little-endian byte-swapping/bitmaps by
- *        David S. Miller (davem@caip.rutgers.edu), 1995
- * 
- *  Removed unnecessary code duplication for little endian machines
- *  and excessive __inline__s. 
- *        Andi Kleen, 1997
- *
- * Major simplications and cleanup - we only need to do the metadata, because
- * we can depend on generic_block_fdatasync() to sync the data blocks.
- */
-#include "ext2.h"
-#include <linux/buffer_head.h>          /* for sync_mapping_buffers() */
-/*
- *      File may be NULL when we are called. Perhaps we shouldn't
- *      even pass file to fsync ?
- */
-int ext2_sync_file(struct file *file, struct dentry *dentry, int datasync)
-{
-        struct inode *inode = dentry->d_inode;
-        int err;
-        int ret;
-        ret = sync_mapping_buffers(inode->i_mapping);
-        if (!(inode->i_state & I_DIRTY))
-                return ret;
-        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-                return ret;
-        err = ext2_sync_inode(inode);
-        if (ret == 0)
-                ret = err;
-        return ret;
-}
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index acf678831103..29ed682061f6 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -41,8 +41,6 @@ MODULE_AUTHOR("Remy Card and others");
 MODULE_DESCRIPTION("Second Extended Filesystem");
 MODULE_LICENSE("GPL");
-static int ext2_update_inode(struct inode * inode, int do_sync);
 /*
 * Test whether an inode is a fast symlink.
 */
@@ -66,7 +64,7 @@ void ext2_delete_inode (struct inode * inode)
                goto no_delete;
        EXT2_I(inode)->i_dtime  = get_seconds();
        mark_inode_dirty(inode);
-        ext2_update_inode(inode, inode_needs_sync(inode));
+        ext2_write_inode(inode, inode_needs_sync(inode));
        inode->i_size = 0;
        if (inode->i_blocks)
@@ -1337,7 +1335,7 @@ bad_inode:
        return ERR_PTR(ret);
 }
-static int ext2_update_inode(struct inode * inode, int do_sync)
+int ext2_write_inode(struct inode *inode, int do_sync)
 {
        struct ext2_inode_info *ei = EXT2_I(inode);
        struct super_block *sb = inode->i_sb;
@@ -1442,11 +1440,6 @@ static int ext2_update_inode(struct inode * inode, int do_sync)
        return err;
 }
-int ext2_write_inode(struct inode *inode, int wait)
-{
-        return ext2_update_inode(inode, wait);
-}
 int ext2_sync_inode(struct inode *inode)
 {
        struct writeback_control wbc = {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 5c4afe652245..458999638c3d 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -42,6 +42,7 @@ static void ext2_sync_super(struct super_block *sb,
                            struct ext2_super_block *es);
 static int ext2_remount (struct super_block * sb, int * flags, char * data);
 static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
+static int ext2_sync_fs(struct super_block *sb, int wait);
 void ext2_error (struct super_block * sb, const char * function,
                 const char * fmt, ...)
@@ -114,6 +115,11 @@ static void ext2_put_super (struct super_block * sb)
        int i;
        struct ext2_sb_info *sbi = EXT2_SB(sb);
+        lock_kernel();
+        if (sb->s_dirt)
+                ext2_write_super(sb);
        ext2_xattr_put_super(sb);
        if (!(sb->s_flags & MS_RDONLY)) {
                struct ext2_super_block *es = sbi->s_es;
@@ -135,7 +141,7 @@ static void ext2_put_super (struct super_block * sb)
        kfree(sbi->s_blockgroup_lock);
        kfree(sbi);
-        return;
+        unlock_kernel();
 }
 static struct kmem_cache * ext2_inode_cachep;
@@ -304,6 +310,7 @@ static const struct super_operations ext2_sops = {
        .delete_inode   = ext2_delete_inode,
        .put_super      = ext2_put_super,
        .write_super    = ext2_write_super,
+        .sync_fs        = ext2_sync_fs,
        .statfs         = ext2_statfs,
        .remount_fs     = ext2_remount,
        .clear_inode    = ext2_clear_inode,
@@ -1093,6 +1100,7 @@ failed_mount:
        brelse(bh);
 failed_sbi:
        sb->s_fs_info = NULL;
+        kfree(sbi->s_blockgroup_lock);
        kfree(sbi);
        return ret;
 }
@@ -1126,25 +1134,36 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
 * set s_state to EXT2_VALID_FS after some corrections.
 */
-void ext2_write_super (struct super_block * sb)
+static int ext2_sync_fs(struct super_block *sb, int wait)
 {
-        struct ext2_super_block * es;
+        struct ext2_super_block *es = EXT2_SB(sb)->s_es;
        lock_kernel();
-        if (!(sb->s_flags & MS_RDONLY)) {
+        if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) {
-                es = EXT2_SB(sb)->s_es;
+                ext2_debug("setting valid to 0\n");
+                es->s_state &= cpu_to_le16(~EXT2_VALID_FS);
-                if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) {
+                es->s_free_blocks_count =
-                        ext2_debug ("setting valid to 0\n");
+                        cpu_to_le32(ext2_count_free_blocks(sb));
-                        es->s_state &= cpu_to_le16(~EXT2_VALID_FS);
+                es->s_free_inodes_count =
-                        es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
+                        cpu_to_le32(ext2_count_free_inodes(sb));
-                        es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb));
+                es->s_mtime = cpu_to_le32(get_seconds());
-                        es->s_mtime = cpu_to_le32(get_seconds());
+                ext2_sync_super(sb, es);
-                        ext2_sync_super(sb, es);
+        } else {
-                } else
+                ext2_commit_super(sb, es);
-                        ext2_commit_super (sb, es);
        }
        sb->s_dirt = 0;
        unlock_kernel();
+        return 0;
+}
+void ext2_write_super(struct super_block *sb)
+{
+        if (!(sb->s_flags & MS_RDONLY))
+                ext2_sync_fs(sb, 1);
+        else
+                sb->s_dirt = 0;
 }
 static int ext2_remount (struct super_block * sb, int * flags, char * data)
@@ -1156,6 +1175,8 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
        unsigned long old_sb_flags;
        int err;
+        lock_kernel();
        /* Store the old options */
        old_sb_flags = sb->s_flags;
        old_opts.s_mount_opt = sbi->s_mount_opt;
@@ -1191,12 +1212,16 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
                sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
                sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
        }
-        if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+        if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
+                unlock_kernel();
                return 0;
+        }
        if (*flags & MS_RDONLY) {
                if (le16_to_cpu(es->s_state) & EXT2_VALID_FS ||
-                    !(sbi->s_mount_state & EXT2_VALID_FS))
+                    !(sbi->s_mount_state & EXT2_VALID_FS)) {
+                        unlock_kernel();
                        return 0;
+                }
                /*
                 * OK, we are remounting a valid rw partition rdonly, so set
                 * the rdonly flag and then mark the partition as valid again.
@@ -1223,12 +1248,14 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
                        sb->s_flags &= ~MS_RDONLY;
        }
        ext2_sync_super(sb, es);
+        unlock_kernel();
        return 0;
 restore_opts:
        sbi->s_mount_opt = old_opts.s_mount_opt;
        sbi->s_resuid = old_opts.s_resuid;
        sbi->s_resgid = old_opts.s_resgid;
        sb->s_flags = old_sb_flags;
+        unlock_kernel();
        return err;
 }
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 225202db8974..27967f92e820 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -649,7 +649,7 @@ do_more:
                count = overflow;
                goto do_more;
        }
-        sb->s_dirt = 1;
 error_return:
        brelse(bitmap_bh);
        ext3_std_error(sb, err);
@@ -1708,7 +1708,6 @@ allocated:
        if (!fatal)
                fatal = err;
-        sb->s_dirt = 1;
        if (fatal)
                goto out;
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index dd13d60d524b..b39991285136 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -181,7 +181,7 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
        err = ext3_journal_dirty_metadata(handle, bitmap_bh);
        if (!fatal)
                fatal = err;
-        sb->s_dirt = 1;
 error_return:
        brelse(bitmap_bh);
        ext3_std_error(sb, fatal);
@@ -537,7 +537,6 @@ got:
        percpu_counter_dec(&sbi->s_freeinodes_counter);
        if (S_ISDIR(mode))
                percpu_counter_inc(&sbi->s_dirs_counter);
-        sb->s_dirt = 1;
        inode->i_uid = current_fsuid();
        if (test_opt (sb, GRPID))
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index fcfa24361856..b0248c6d5d4c 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2960,7 +2960,6 @@ static int ext3_do_update_inode(handle_t *handle,
                                ext3_update_dynamic_rev(sb);
                                EXT3_SET_RO_COMPAT_FEATURE(sb,
                                        EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
-                                sb->s_dirt = 1;
                                handle->h_sync = 1;
                                err = ext3_journal_dirty_metadata(handle,
                                                EXT3_SB(sb)->s_sbh);
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 78fdf3836370..8a0b26340b54 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -934,7 +934,6 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
                           EXT3_INODES_PER_GROUP(sb));
        ext3_journal_dirty_metadata(handle, sbi->s_sbh);
-        sb->s_dirt = 1;
 exit_journal:
        unlock_super(sb);
@@ -1066,7 +1065,6 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
        }
        es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
        ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
-        sb->s_dirt = 1;
        unlock_super(sb);
        ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count,
                   o_blocks_count + add);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 599dbfe504c3..26aa64dee6aa 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -67,7 +67,6 @@ static const char *ext3_decode_error(struct super_block * sb, int errno,
 static int ext3_remount (struct super_block * sb, int * flags, char * data);
 static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf);
 static int ext3_unfreeze(struct super_block *sb);
-static void ext3_write_super (struct super_block * sb);
 static int ext3_freeze(struct super_block *sb);
 /*
@@ -399,6 +398,8 @@ static void ext3_put_super (struct super_block * sb)
        struct ext3_super_block *es = sbi->s_es;
        int i, err;
+        lock_kernel();
        ext3_xattr_put_super(sb);
        err = journal_destroy(sbi->s_journal);
        sbi->s_journal = NULL;
@@ -447,7 +448,8 @@ static void ext3_put_super (struct super_block * sb)
        sb->s_fs_info = NULL;
        kfree(sbi->s_blockgroup_lock);
        kfree(sbi);
-        return;
+        unlock_kernel();
 }
 static struct kmem_cache *ext3_inode_cachep;
@@ -761,7 +763,6 @@ static const struct super_operations ext3_sops = {
        .dirty_inode    = ext3_dirty_inode,
        .delete_inode   = ext3_delete_inode,
        .put_super      = ext3_put_super,
-        .write_super    = ext3_write_super,
        .sync_fs        = ext3_sync_fs,
        .freeze_fs      = ext3_freeze,
        .unfreeze_fs    = ext3_unfreeze,
@@ -1696,7 +1697,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
                goto failed_mount;
        }
-        hblock = bdev_hardsect_size(sb->s_bdev);
+        hblock = bdev_logical_block_size(sb->s_bdev);
        if (sb->s_blocksize != blocksize) {
                /*
                 * Make sure the blocksize for the filesystem is larger
@@ -1785,7 +1786,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 #else
                es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
 #endif
-                sb->s_dirt = 1;
        }
        if (sbi->s_blocks_per_group > blocksize * 8) {
@@ -2021,6 +2021,7 @@ failed_mount:
        brelse(bh);
 out_fail:
        sb->s_fs_info = NULL;
+        kfree(sbi->s_blockgroup_lock);
        kfree(sbi);
        lock_kernel();
        return ret;
@@ -2119,7 +2120,7 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
        }
        blocksize = sb->s_blocksize;
-        hblock = bdev_hardsect_size(bdev);
+        hblock = bdev_logical_block_size(bdev);
        if (blocksize < hblock) {
                printk(KERN_ERR
                        "EXT3-fs: blocksize too small for journal device.\n");
@@ -2264,7 +2265,6 @@ static int ext3_load_journal(struct super_block *sb,
        if (journal_devnum &&
            journal_devnum != le32_to_cpu(es->s_journal_dev)) {
                es->s_journal_dev = cpu_to_le32(journal_devnum);
-                sb->s_dirt = 1;
                /* Make sure we flush the recovery flag to disk. */
                ext3_commit_super(sb, es, 1);
@@ -2307,7 +2307,6 @@ static int ext3_create_journal(struct super_block * sb,
        EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL);
        es->s_journal_inum = cpu_to_le32(journal_inum);
-        sb->s_dirt = 1;
        /* Make sure we flush the recovery flag to disk. */
        ext3_commit_super(sb, es, 1);
@@ -2353,7 +2352,6 @@ static void ext3_mark_recovery_complete(struct super_block * sb,
        if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) &&
            sb->s_flags & MS_RDONLY) {
                EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
-                sb->s_dirt = 0;
                ext3_commit_super(sb, es, 1);
        }
        unlock_super(sb);
@@ -2412,29 +2410,14 @@ int ext3_force_commit(struct super_block *sb)
                return 0;
        journal = EXT3_SB(sb)->s_journal;
-        sb->s_dirt = 0;
        ret = ext3_journal_force_commit(journal);
        return ret;
 }
-/*
- * Ext3 always journals updates to the superblock itself, so we don't
- * have to propagate any other updates to the superblock on disk at this
- * point.  (We can probably nuke this function altogether, and remove
- * any mention to sb->s_dirt in all of fs/ext3; eventual cleanup...)
- */
-static void ext3_write_super (struct super_block * sb)
-{
-        if (mutex_trylock(&sb->s_lock) != 0)
-                BUG();
-        sb->s_dirt = 0;
-}
 static int ext3_sync_fs(struct super_block *sb, int wait)
 {
        tid_t target;
-        sb->s_dirt = 0;
        if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
                if (wait)
                        log_wait_commit(EXT3_SB(sb)->s_journal, target);
@@ -2450,7 +2433,6 @@ static int ext3_freeze(struct super_block *sb)
 {
        int error = 0;
        journal_t *journal;
-        sb->s_dirt = 0;
        if (!(sb->s_flags & MS_RDONLY)) {
                journal = EXT3_SB(sb)->s_journal;
@@ -2508,7 +2490,10 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
        int i;
 #endif
+        lock_kernel();
        /* Store the original options */
+        lock_super(sb);
        old_sb_flags = sb->s_flags;
        old_opts.s_mount_opt = sbi->s_mount_opt;
        old_opts.s_resuid = sbi->s_resuid;
@@ -2616,6 +2601,8 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
                    old_opts.s_qf_names[i] != sbi->s_qf_names[i])
                        kfree(old_opts.s_qf_names[i]);
 #endif
+        unlock_super(sb);
+        unlock_kernel();
        return 0;
 restore_opts:
        sb->s_flags = old_sb_flags;
@@ -2632,6 +2619,8 @@ restore_opts:
                sbi->s_qf_names[i] = old_opts.s_qf_names[i];
        }
 #endif
+        unlock_super(sb);
+        unlock_kernel();
        return err;
 }
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 83b7be849bd5..545e37c4b91e 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -463,7 +463,6 @@ static void ext3_xattr_update_super_block(handle_t *handle,
        if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) {
                EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR);
-                sb->s_dirt = 1;
                ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
        }
 }
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index a8ff003a00f7..8a34710ecf40 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -5,8 +5,8 @@
 obj-$(CONFIG_EXT4_FS) += ext4.o
 ext4-y  := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
-                   ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
+                ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-                   ext4_jbd2.o migrate.o mballoc.o
+                ext4_jbd2.o migrate.o mballoc.o block_validity.o
 ext4-$(CONFIG_EXT4_FS_XATTR)            += xattr.o xattr_user.o xattr_trusted.o
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)        += acl.o
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 53c72ad85877..e2126d70dff5 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -19,7 +19,6 @@
 #include <linux/buffer_head.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
-#include "group.h"
 #include "mballoc.h"
 /*
@@ -88,6 +87,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
                 ext4_group_t block_group, struct ext4_group_desc *gdp)
 {
        int bit, bit_max;
+        ext4_group_t ngroups = ext4_get_groups_count(sb);
        unsigned free_blocks, group_blocks;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -123,7 +123,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
                bit_max += ext4_bg_num_gdb(sb, block_group);
        }
-        if (block_group == sbi->s_groups_count - 1) {
+        if (block_group == ngroups - 1) {
                /*
                 * Even though mke2fs always initialize first and last group
                 * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need
@@ -131,7 +131,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
                 */
                group_blocks = ext4_blocks_count(sbi->s_es) -
                        le32_to_cpu(sbi->s_es->s_first_data_block) -
-                        (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1));
+                        (EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1));
        } else {
                group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
        }
@@ -205,18 +205,18 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 {
        unsigned int group_desc;
        unsigned int offset;
+        ext4_group_t ngroups = ext4_get_groups_count(sb);
        struct ext4_group_desc *desc;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
-        if (block_group >= sbi->s_groups_count) {
+        if (block_group >= ngroups) {
                ext4_error(sb, "ext4_get_group_desc",
                           "block_group >= groups_count - "
                           "block_group = %u, groups_count = %u",
-                           block_group, sbi->s_groups_count);
+                           block_group, ngroups);
                return NULL;
        }
-        smp_rmb();
        group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
        offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
@@ -326,16 +326,16 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
                unlock_buffer(bh);
                return bh;
        }
-        spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
+        ext4_lock_group(sb, block_group);
        if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
                ext4_init_block_bitmap(sb, bh, block_group, desc);
                set_bitmap_uptodate(bh);
                set_buffer_uptodate(bh);
-                spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+                ext4_unlock_group(sb, block_group);
                unlock_buffer(bh);
                return bh;
        }
-        spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+        ext4_unlock_group(sb, block_group);
        if (buffer_uptodate(bh)) {
                /*
                 * if not uninit if bh is uptodate,
@@ -451,7 +451,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
        down_write(&grp->alloc_sem);
        for (i = 0, blocks_freed = 0; i < count; i++) {
                BUFFER_TRACE(bitmap_bh, "clear bit");
-                if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
+                if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
                                                bit + i, bitmap_bh->b_data)) {
                        ext4_error(sb, __func__,
                                   "bit already cleared for block %llu",
@@ -461,11 +461,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
                        blocks_freed++;
                }
        }
-        spin_lock(sb_bgl_lock(sbi, block_group));
+        ext4_lock_group(sb, block_group);
        blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
        ext4_free_blks_set(sb, desc, blk_free_count);
        desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
-        spin_unlock(sb_bgl_lock(sbi, block_group));
+        ext4_unlock_group(sb, block_group);
        percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
        if (sbi->s_log_groups_per_flex) {
@@ -665,7 +665,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
        ext4_fsblk_t desc_count;
        struct ext4_group_desc *gdp;
        ext4_group_t i;
-        ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+        ext4_group_t ngroups = ext4_get_groups_count(sb);
 #ifdef EXT4FS_DEBUG
        struct ext4_super_block *es;
        ext4_fsblk_t bitmap_count;
@@ -677,7 +677,6 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
        bitmap_count = 0;
        gdp = NULL;
-        smp_rmb();
        for (i = 0; i < ngroups; i++) {
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
@@ -700,7 +699,6 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
        return bitmap_count;
 #else
        desc_count = 0;
-        smp_rmb();
        for (i = 0; i < ngroups; i++) {
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
new file mode 100644
index 000000000000..50784ef07563
--- /dev/null
+++ b/fs/ext4/block_validity.c
@@ -0,0 +1,244 @@
+/*
+ *  linux/fs/ext4/block_validity.c
+ *
+ * Copyright (C) 2009
+ * Theodore Ts'o (tytso@mit.edu)
+ *
+ * Track which blocks in the filesystem are metadata blocks that
+ * should never be used as data blocks by files or directories.
+ */
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/version.h>
+#include <linux/blkdev.h>
+#include <linux/mutex.h>
+#include "ext4.h"
+/*
+ * One contiguous run of "system zone" (metadata) blocks.  Instances are
+ * allocated from ext4_system_zone_cachep and linked into the red-black
+ * tree rooted at sbi->system_blks, keyed by start_blk.
+ */
+struct ext4_system_zone {
+        struct rb_node  node;           /* linkage into sbi->system_blks */
+        ext4_fsblk_t    start_blk;      /* first block of the run */
+        unsigned int    count;          /* number of blocks in the run */
+};
+static struct kmem_cache *ext4_system_zone_cachep;
+/*
+ * Create the slab cache that backs struct ext4_system_zone allocations.
+ * Returns 0 on success, or -ENOMEM if the cache could not be created.
+ */
+int __init init_ext4_system_zone(void)
+{
+        ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone,
+                                             SLAB_RECLAIM_ACCOUNT);
+        return ext4_system_zone_cachep ? 0 : -ENOMEM;
+}
+/* Destroy the slab cache set up by init_ext4_system_zone(). */
+void exit_ext4_system_zone(void)
+{
+        kmem_cache_destroy(ext4_system_zone_cachep);
+}
+/*
+ * Return 1 when entry2 starts at the block immediately following the
+ * last block of entry1 (i.e. the two runs are adjacent), otherwise 0.
+ */
+static inline int can_merge(struct ext4_system_zone *entry1,
+                     struct ext4_system_zone *entry2)
+{
+        return entry1->start_blk + entry1->count == entry2->start_blk;
+}
+/*
+ * Mark a range of blocks as belonging to the "system zone" --- that
+ * is, filesystem metadata blocks which should never be used by
+ * inodes.
+ *
+ * The range [start_blk, start_blk + count) is recorded in the rb-tree
+ * rooted at sbi->system_blks.  If start_blk falls inside an existing
+ * entry, that entry is extended in place when the new range reaches
+ * past its end; otherwise a new entry is allocated and inserted.  The
+ * resulting entry is then merged with its immediate predecessor and
+ * successor when they are exactly adjacent (see can_merge()).
+ *
+ * Returns 0 on success, or -ENOMEM if a new entry cannot be allocated.
+ */
+static int add_system_zone(struct ext4_sb_info *sbi,
+                           ext4_fsblk_t start_blk,
+                           unsigned int count)
+{
+        struct ext4_system_zone *new_entry = NULL, *entry;
+        struct rb_node **n = &sbi->system_blks.rb_node, *node;
+        struct rb_node *parent = NULL, *new_node = NULL;
+        /* Walk down the tree looking for an entry containing start_blk. */
+        while (*n) {
+                parent = *n;
+                entry = rb_entry(parent, struct ext4_system_zone, node);
+                if (start_blk < entry->start_blk)
+                        n = &(*n)->rb_left;
+                else if (start_blk >= (entry->start_blk + entry->count))
+                        n = &(*n)->rb_right;
+                else {
+                        /*
+                         * start_blk lies within this entry: grow the entry
+                         * if the new range ends beyond it, and reuse it
+                         * instead of allocating.
+                         */
+                        if (start_blk + count > (entry->start_blk + 
+                                                 entry->count))
+                                entry->count = (start_blk + count - 
+                                                entry->start_blk);
+                        new_node = *n;
+                        new_entry = rb_entry(new_node, struct ext4_system_zone,
+                                             node);
+                        break;
+                }
+        }
+        /* No containing entry found: insert a new one at the slot *n. */
+        if (!new_entry) {
+                new_entry = kmem_cache_alloc(ext4_system_zone_cachep,
+                                             GFP_KERNEL);
+                if (!new_entry)
+                        return -ENOMEM;
+                new_entry->start_blk = start_blk;
+                new_entry->count = count;
+                new_node = &new_entry->node;
+                rb_link_node(new_node, parent, n);
+                rb_insert_color(new_node, &sbi->system_blks);
+        }
+        /* Can we merge to the left? */
+        node = rb_prev(new_node);
+        if (node) {
+                entry = rb_entry(node, struct ext4_system_zone, node);
+                if (can_merge(entry, new_entry)) {
+                        /* Absorb the predecessor into new_entry, then drop it. */
+                        new_entry->start_blk = entry->start_blk;
+                        new_entry->count += entry->count;
+                        rb_erase(node, &sbi->system_blks);
+                        kmem_cache_free(ext4_system_zone_cachep, entry);
+                }
+        }
+        /* Can we merge to the right? */
+        node = rb_next(new_node);
+        if (node) {
+                entry = rb_entry(node, struct ext4_system_zone, node);
+                if (can_merge(new_entry, entry)) {
+                        /* Absorb the successor into new_entry, then drop it. */
+                        new_entry->count += entry->count;
+                        rb_erase(node, &sbi->system_blks);
+                        kmem_cache_free(ext4_system_zone_cachep, entry);
+                }
+        }
+        return 0;
+}
+/*
+ * Print every system zone as an inclusive "first-last" block range,
+ * comma separated, walking the tree from rb_first() via rb_next().
+ */
+static void debug_print_tree(struct ext4_sb_info *sbi)
+{
+        struct rb_node *cur;
+        const char *sep = "";
+        printk(KERN_INFO "System zones: ");
+        for (cur = rb_first(&sbi->system_blks); cur; cur = rb_next(cur)) {
+                struct ext4_system_zone *zone =
+                        rb_entry(cur, struct ext4_system_zone, node);
+                printk("%s%llu-%llu", sep,
+                       zone->start_blk, zone->start_blk + zone->count - 1);
+                /* Every range after the first is preceded by a separator. */
+                sep = ", ";
+        }
+        printk("\n");
+}
+/*
+ * Build the system-zone tree for @sb.
+ *
+ * When the BLOCK_VALIDITY mount option is off, any existing tree is
+ * released and 0 is returned.  When a tree already exists, it is kept
+ * as is.  Otherwise, for every block group, the superblock/descriptor
+ * area (for selected groups that carry a superblock backup), the block
+ * bitmap, the inode bitmap and the inode table are recorded as system
+ * zones.
+ *
+ * Returns 0 on success or a negative errno from add_system_zone().  On
+ * failure the partially built tree is released, so that a later call
+ * does not mistake it for a complete one via the "tree already exists"
+ * early return.
+ */
+int ext4_setup_system_zone(struct super_block *sb)
+{
+        ext4_group_t ngroups = ext4_get_groups_count(sb);
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct ext4_group_desc *gdp;
+        ext4_group_t i;
+        int flex_size = ext4_flex_bg_size(sbi);
+        int ret;
+        if (!test_opt(sb, BLOCK_VALIDITY)) {
+                if (sbi->system_blks.rb_node)
+                        ext4_release_system_zone(sb);
+                return 0;
+        }
+        if (sbi->system_blks.rb_node)
+                return 0;
+        for (i = 0; i < ngroups; i++) {
+                if (ext4_bg_has_super(sb, i) &&
+                    ((i < 5) || ((i % flex_size) == 0))) {
+                        /* Superblock plus group descriptor blocks. */
+                        ret = add_system_zone(sbi,
+                                        ext4_group_first_block_no(sb, i),
+                                        sbi->s_gdb_count + 1);
+                        if (ret)
+                                goto err;
+                }
+                /*
+                 * NOTE(review): ext4_get_group_desc() can return NULL;
+                 * gdp is passed on unchecked here -- confirm the helpers
+                 * below tolerate that or add a check.
+                 */
+                gdp = ext4_get_group_desc(sb, i, NULL);
+                ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
+                if (ret)
+                        goto err;
+                ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1);
+                if (ret)
+                        goto err;
+                ret = add_system_zone(sbi, ext4_inode_table(sb, gdp),
+                                sbi->s_itb_per_group);
+                if (ret)
+                        goto err;
+        }
+        if (test_opt(sb, DEBUG))
+                debug_print_tree(sbi);
+        return 0;
+err:
+        /* Do not leave a half-populated tree behind. */
+        ext4_release_system_zone(sb);
+        return ret;
+}
+/*
+ * Called when the filesystem is unmounted.
+ *
+ * Frees every entry of the system-zone tree of @sb and leaves the tree
+ * root empty.  Nodes are freed leaf-first without rebalancing.
+ */
+void ext4_release_system_zone(struct super_block *sb)
+{
+        struct ext4_sb_info *sbi = EXT4_SB(sb);
+        struct rb_node *cur = sbi->system_blks.rb_node;
+        while (cur) {
+                struct rb_node *up;
+                /* Keep descending until a node without children is found. */
+                if (cur->rb_left) {
+                        cur = cur->rb_left;
+                        continue;
+                }
+                if (cur->rb_right) {
+                        cur = cur->rb_right;
+                        continue;
+                }
+                /*
+                 * Leaf: remember its parent, free it, clear the parent's
+                 * link to it, then resume from the parent, which may have
+                 * become a leaf itself.
+                 */
+                up = rb_parent(cur);
+                kmem_cache_free(ext4_system_zone_cachep,
+                                rb_entry(cur, struct ext4_system_zone, node));
+                if (!up)
+                        sbi->system_blks.rb_node = NULL;
+                else if (up->rb_left == cur)
+                        up->rb_left = NULL;
+                else if (up->rb_right == cur)
+                        up->rb_right = NULL;
+                cur = up;
+        }
+        sbi->system_blks.rb_node = NULL;
+}
+/*
+ * Returns 1 if the passed-in block region (start_blk,
+ * start_blk+count) is valid; 0 if some part of the block region
+ * overlaps with filesystem metadata blocks.
+ *
+ * A region is also rejected when it starts at or before the first data
+ * block, extends past the end of the filesystem, or when
+ * start_blk + count wraps around (which would otherwise slip past the
+ * end-of-filesystem comparison).
+ */
+int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
+                          unsigned int count)
+{
+        struct ext4_system_zone *entry;
+        struct rb_node *n = sbi->system_blks.rb_node;
+        if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
+            (start_blk + count < start_blk) ||
+            (start_blk + count > ext4_blocks_count(sbi->s_es)))
+                return 0;
+        /* Binary-search the system zones for one intersecting the region. */
+        while (n) {
+                entry = rb_entry(n, struct ext4_system_zone, node);
+                if (start_blk + count - 1 < entry->start_blk)
+                        n = n->rb_left;
+                else if (start_blk >= (entry->start_blk + entry->count))
+                        n = n->rb_right;
+                else
+                        return 0;
+        }
+        return 1;
+}
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index b64789929a65..9dc93168e262 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -131,8 +131,7 @@ static int ext4_readdir(struct file *filp,
                struct buffer_head *bh = NULL;
                map_bh.b_state = 0;
-                err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh,
+                err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0);
-                                                0, 0, 0);
                if (err > 0) {
                        pgoff_t index = map_bh.b_blocknr >>
                                        (PAGE_CACHE_SHIFT - inode->i_blkbits);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d0f15ef56de1..cc7d5edc38c9 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -21,7 +21,14 @@
 #include <linux/magic.h>
 #include <linux/jbd2.h>
 #include <linux/quota.h>
-#include "ext4_i.h"
+#include <linux/rwsem.h>
+#include <linux/rbtree.h>
+#include <linux/seqlock.h>
+#include <linux/mutex.h>
+#include <linux/timer.h>
+#include <linux/wait.h>
+#include <linux/blockgroup_lock.h>
+#include <linux/percpu_counter.h>
 /*
 * The fourth extended filesystem constants/structures
@@ -46,6 +53,19 @@
 #define ext4_debug(f, a...)     do {} while (0)
 #endif
+/* data type for block offset of block group */
+typedef int ext4_grpblk_t;
+/* data type for filesystem-wide blocks number */
+typedef unsigned long long ext4_fsblk_t;
+/* data type for file logical block number */
+typedef __u32 ext4_lblk_t;
+/* data type for block group number */
+typedef unsigned int ext4_group_t;
 /* prefer goal again. length */
 #define EXT4_MB_HINT_MERGE              1
 /* blocks already reserved */
@@ -179,9 +199,6 @@ struct flex_groups {
 #define EXT4_BG_BLOCK_UNINIT    0x0002 /* Block bitmap not in use */
 #define EXT4_BG_INODE_ZEROED    0x0004 /* On-disk itable initialized to zero */
-#ifdef __KERNEL__
-#include "ext4_sb.h"
-#endif
 /*
 * Macro-instructions used to manage group descriptors
 */
@@ -297,10 +314,23 @@ struct ext4_new_group_data {
 };
 /*
- * Following is used by preallocation code to tell get_blocks() that we
+ * Flags used by ext4_get_blocks()
- * want uninitialzed extents.
 */
-#define EXT4_CREATE_UNINITIALIZED_EXT           2
+        /* Allocate any needed blocks and/or convert an uninitialized
+           extent into an initialized one */
+#define EXT4_GET_BLOCKS_CREATE                  0x0001
+        /* Request the creation of an uninitialized extent */
+#define EXT4_GET_BLOCKS_UNINIT_EXT              0x0002
+#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT       (EXT4_GET_BLOCKS_UNINIT_EXT|\
+                                                 EXT4_GET_BLOCKS_CREATE)
+        /* Caller is from the delayed allocation writeout path,
+           so set the magic i_delalloc_reserved_flag after taking the
+           inode allocation semaphore */
+#define EXT4_GET_BLOCKS_DELALLOC_RESERVE        0x0004
+        /* Call ext4_da_update_reserve_space() after successfully
+           allocating the blocks */
+#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE    0x0008
 /*
 * ioctl commands
@@ -516,6 +546,110 @@ do {									       \
 #endif /* defined(__KERNEL__) || defined(__linux__) */
 /*
+ * storage for cached extent
+ */
+struct ext4_ext_cache {
+        ext4_fsblk_t    ec_start;
+        ext4_lblk_t     ec_block;
+        __u32           ec_len; /* must be 32bit to return holes */
+        __u32           ec_type;
+};
+/*
+ * fourth extended file system inode data in memory
+ */
+struct ext4_inode_info {
+        __le32  i_data[15];     /* unconverted */
+        __u32   i_flags;
+        ext4_fsblk_t    i_file_acl;
+        __u32   i_dtime;
+        /*
+         * i_block_group is the number of the block group which contains
+         * this file's inode.  Constant across the lifetime of the inode,
+         * it is used for making block allocation decisions - we try to
+         * place a file's data blocks near its inode block, and new inodes
+         * near to their parent directory's inode.
+         */
+        ext4_group_t    i_block_group;
+        __u32   i_state;                /* Dynamic state flags for ext4 */
+        ext4_lblk_t             i_dir_start_lookup;
+#ifdef CONFIG_EXT4_FS_XATTR
+        /*
+         * Extended attributes can be read independently of the main file
+         * data. Taking i_mutex even when reading would cause contention
+         * between readers of EAs and writers of regular file data, so
+         * instead we synchronize on xattr_sem when reading or changing
+         * EAs.
+         */
+        struct rw_semaphore xattr_sem;
+#endif
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+        struct posix_acl        *i_acl;
+        struct posix_acl        *i_default_acl;
+#endif
+        struct list_head i_orphan;      /* unlinked but open inodes */
+        /*
+         * i_disksize keeps track of what the inode size is ON DISK, not
+         * in memory.  During truncate, i_size is set to the new size by
+         * the VFS prior to calling ext4_truncate(), but the filesystem won't
+         * set i_disksize to 0 until the truncate is actually under way.
+         *
+         * The intent is that i_disksize always represents the blocks which
+         * are used by this file.  This allows recovery to restart truncate
+         * on orphans if we crash during truncate.  We actually write i_disksize
+         * into the on-disk inode when writing inodes out, instead of i_size.
+         *
+         * The only time when i_disksize and i_size may be different is when
+         * a truncate is in progress.  The only things which change i_disksize
+         * are ext4_get_block (growth) and ext4_truncate (shrinkth).
+         */
+        loff_t  i_disksize;
+        /*
+         * i_data_sem is for serialising ext4_truncate() against
+         * ext4_getblock().  In the 2.4 ext2 design, great chunks of inode's
+         * data tree are chopped off during truncate. We can't do that in
+         * ext4 because whenever we perform intermediate commits during
+         * truncate, the inode and all the metadata blocks *must* be in a
+         * consistent state which allows truncation of the orphans to restart
+         * during recovery.  Hence we must fix the get_block-vs-truncate race
+         * by other means, so we have i_data_sem.
+         */
+        struct rw_semaphore i_data_sem;
+        struct inode vfs_inode;
+        struct jbd2_inode jinode;
+        struct ext4_ext_cache i_cached_extent;
+        /*
+         * File creation time. Its function is same as that of
+         * struct timespec i_{a,c,m}time in the generic inode.
+         */
+        struct timespec i_crtime;
+        /* mballoc */
+        struct list_head i_prealloc_list;
+        spinlock_t i_prealloc_lock;
+        /* ialloc */
+        ext4_group_t    i_last_alloc_group;
+        /* allocation reservation info for delalloc */
+        unsigned int i_reserved_data_blocks;
+        unsigned int i_reserved_meta_blocks;
+        unsigned int i_allocated_meta_blocks;
+        unsigned short i_delalloc_reserved_flag;
+        /* on-disk additional length */
+        __u16 i_extra_isize;
+        spinlock_t i_block_reservation_lock;
+};
+/*
 * File system states
 */
 #define EXT4_VALID_FS                   0x0001  /* Unmounted cleanly */
@@ -560,6 +694,7 @@ do {									       \
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
 #define EXT4_MOUNT_DELALLOC             0x8000000 /* Delalloc support */
 #define EXT4_MOUNT_DATA_ERR_ABORT       0x10000000 /* Abort on file data write */
+#define EXT4_MOUNT_BLOCK_VALIDITY       0x20000000 /* Block validity checking */
 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
@@ -689,6 +824,137 @@ struct ext4_super_block {
 };
 #ifdef __KERNEL__
+/*
+ * fourth extended-fs super-block data in memory
+ */
+struct ext4_sb_info {
+        unsigned long s_desc_size;      /* Size of a group descriptor in bytes */
+        unsigned long s_inodes_per_block;/* Number of inodes per block */
+        unsigned long s_blocks_per_group;/* Number of blocks in a group */
+        unsigned long s_inodes_per_group;/* Number of inodes in a group */
+        unsigned long s_itb_per_group;  /* Number of inode table blocks per group */
+        unsigned long s_gdb_count;      /* Number of group descriptor blocks */
+        unsigned long s_desc_per_block; /* Number of group descriptors per block */
+        ext4_group_t s_groups_count;    /* Number of groups in the fs */
+        unsigned long s_overhead_last;  /* Last calculated overhead */
+        unsigned long s_blocks_last;    /* Last seen block count */
+        loff_t s_bitmap_maxbytes;       /* max bytes for bitmap files */
+        struct buffer_head * s_sbh;     /* Buffer containing the super block */
+        struct ext4_super_block *s_es;  /* Pointer to the super block in the buffer */
+        struct buffer_head **s_group_desc;
+        unsigned long  s_mount_opt;
+        ext4_fsblk_t s_sb_block;
+        uid_t s_resuid;
+        gid_t s_resgid;
+        unsigned short s_mount_state;
+        unsigned short s_pad;
+        int s_addr_per_block_bits;
+        int s_desc_per_block_bits;
+        int s_inode_size;
+        int s_first_ino;
+        unsigned int s_inode_readahead_blks;
+        spinlock_t s_next_gen_lock;
+        u32 s_next_generation;
+        u32 s_hash_seed[4];
+        int s_def_hash_version;
+        int s_hash_unsigned;    /* 3 if hash should be unsigned, 0 if not */
+        struct percpu_counter s_freeblocks_counter;
+        struct percpu_counter s_freeinodes_counter;
+        struct percpu_counter s_dirs_counter;
+        struct percpu_counter s_dirtyblocks_counter;
+        struct blockgroup_lock *s_blockgroup_lock;
+        struct proc_dir_entry *s_proc;
+        struct kobject s_kobj;
+        struct completion s_kobj_unregister;
+        /* Journaling */
+        struct inode *s_journal_inode;
+        struct journal_s *s_journal;
+        struct list_head s_orphan;
+        struct mutex s_orphan_lock;
+        struct mutex s_resize_lock;
+        unsigned long s_commit_interval;
+        u32 s_max_batch_time;
+        u32 s_min_batch_time;
+        struct block_device *journal_bdev;
+#ifdef CONFIG_JBD2_DEBUG
+        struct timer_list turn_ro_timer;        /* For turning read-only (crash simulation) */
+        wait_queue_head_t ro_wait_queue;        /* For people waiting for the fs to go read-only */
+#endif
+#ifdef CONFIG_QUOTA
+        char *s_qf_names[MAXQUOTAS];            /* Names of quota files with journalled quota */
+        int s_jquota_fmt;                       /* Format of quota to use */
+#endif
+        unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
+        struct rb_root system_blks;
+#ifdef EXTENTS_STATS
+        /* ext4 extents stats */
+        unsigned long s_ext_min;
+        unsigned long s_ext_max;
+        unsigned long s_depth_max;
+        spinlock_t s_ext_stats_lock;
+        unsigned long s_ext_blocks;
+        unsigned long s_ext_extents;
+#endif
+        /* for buddy allocator */
+        struct ext4_group_info ***s_group_info;
+        struct inode *s_buddy_cache;
+        long s_blocks_reserved;
+        spinlock_t s_reserve_lock;
+        spinlock_t s_md_lock;
+        tid_t s_last_transaction;
+        unsigned short *s_mb_offsets;
+        unsigned int *s_mb_maxs;
+        /* tunables */
+        unsigned long s_stripe;
+        unsigned int s_mb_stream_request;
+        unsigned int s_mb_max_to_scan;
+        unsigned int s_mb_min_to_scan;
+        unsigned int s_mb_stats;
+        unsigned int s_mb_order2_reqs;
+        unsigned int s_mb_group_prealloc;
+        /* where last allocation was done - for stream allocation */
+        unsigned long s_mb_last_group;
+        unsigned long s_mb_last_start;
+        /* history to debug policy */
+        struct ext4_mb_history *s_mb_history;
+        int s_mb_history_cur;
+        int s_mb_history_max;
+        int s_mb_history_num;
+        spinlock_t s_mb_history_lock;
+        int s_mb_history_filter;
+        /* stats for buddy allocator */
+        spinlock_t s_mb_pa_lock;
+        atomic_t s_bal_reqs;    /* number of reqs with len > 1 */
+        atomic_t s_bal_success; /* we found long enough chunks */
+        atomic_t s_bal_allocated;       /* in blocks */
+        atomic_t s_bal_ex_scanned;      /* total extents scanned */
+        atomic_t s_bal_goals;   /* goal hits */
+        atomic_t s_bal_breaks;  /* too long searches */
+        atomic_t s_bal_2orders; /* 2^order hits */
+        spinlock_t s_bal_lock;
+        unsigned long s_mb_buddies_generated;
+        unsigned long long s_mb_generation_time;
+        atomic_t s_mb_lost_chunks;
+        atomic_t s_mb_preallocated;
+        atomic_t s_mb_discarded;
+        /* locality groups */
+        struct ext4_locality_group *s_locality_groups;
+        /* for write statistics */
+        unsigned long s_sectors_written_start;
+        u64 s_kbytes_written;
+        unsigned int s_log_groups_per_flex;
+        struct flex_groups *s_flex_groups;
+};
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
 {
        return sb->s_fs_info;
@@ -704,7 +970,6 @@ static inline struct timespec ext4_current_time(struct inode *inode)
                current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
 }
 static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 {
        return ino == EXT4_ROOT_INO ||
@@ -1014,6 +1279,14 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
                                                    ext4_group_t block_group,
                                                    struct buffer_head ** bh);
 extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
+struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
+                                      ext4_group_t block_group);
+extern unsigned ext4_init_block_bitmap(struct super_block *sb,
+                                       struct buffer_head *bh,
+                                       ext4_group_t group,
+                                       struct ext4_group_desc *desc);
+#define ext4_free_blocks_after_init(sb, group, desc)                    \
+                ext4_init_block_bitmap(sb, NULL, group, desc)
 /* dir.c */
 extern int ext4_check_dir_entry(const char *, struct inode *,
@@ -1038,6 +1311,11 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
 extern unsigned long ext4_count_free_inodes(struct super_block *);
 extern unsigned long ext4_count_dirs(struct super_block *);
 extern void ext4_check_inodes_bitmap(struct super_block *);
+extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
+                                       struct buffer_head *bh,
+                                       ext4_group_t group,
+                                       struct ext4_group_desc *desc);
+extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
 /* mballoc.c */
 extern long ext4_mb_stats;
@@ -1123,6 +1401,8 @@ extern void ext4_abort(struct super_block *, const char *, const char *, ...)
        __attribute__ ((format (printf, 3, 4)));
 extern void ext4_warning(struct super_block *, const char *, const char *, ...)
        __attribute__ ((format (printf, 3, 4)));
+extern void ext4_msg(struct super_block *, const char *, const char *, ...)
+        __attribute__ ((format (printf, 3, 4)));
 extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
                                const char *, const char *, ...)
        __attribute__ ((format (printf, 4, 5)));
@@ -1161,6 +1441,10 @@ extern void ext4_used_dirs_set(struct super_block *sb,
                                struct ext4_group_desc *bg, __u32 count);
 extern void ext4_itable_unused_set(struct super_block *sb,
                                   struct ext4_group_desc *bg, __u32 count);
+extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
+                                   struct ext4_group_desc *gdp);
+extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
+                                       struct ext4_group_desc *gdp);
 static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
 {
@@ -1228,6 +1512,18 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
         return grp_info[indexv][indexh];
 }
+/*
+ * Reading s_groups_count requires using smp_rmb() afterwards.  See
+ * the locking protocol documented in the comments of ext4_group_add()
+ * in resize.c
+ */
+static inline ext4_group_t ext4_get_groups_count(struct super_block *sb)
+{
+        ext4_group_t    ngroups = EXT4_SB(sb)->s_groups_count;
+        smp_rmb();
+        return ngroups;
+}
 static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
                                             ext4_group_t block_group)
@@ -1283,33 +1579,25 @@ struct ext4_group_info {
 };
 #define EXT4_GROUP_INFO_NEED_INIT_BIT   0
-#define EXT4_GROUP_INFO_LOCKED_BIT      1
 #define EXT4_MB_GRP_NEED_INIT(grp)      \
        (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
-static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
+static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
+                                              ext4_group_t group)
 {
-        struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+        return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
-        bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
 }
-static inline void ext4_unlock_group(struct super_block *sb,
+static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
-                                        ext4_group_t group)
 {
-        struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+        spin_lock(ext4_group_lock_ptr(sb, group));
-        bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
 }
-static inline int ext4_is_group_locked(struct super_block *sb,
+static inline void ext4_unlock_group(struct super_block *sb,
                                        ext4_group_t group)
 {
-        struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+        spin_unlock(ext4_group_lock_ptr(sb, group));
-        return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
-                                                &(grinfo->bb_state));
 }
 /*
@@ -1326,11 +1614,21 @@ extern const struct file_operations ext4_file_operations;
 /* namei.c */
 extern const struct inode_operations ext4_dir_inode_operations;
 extern const struct inode_operations ext4_special_inode_operations;
+extern struct dentry *ext4_get_parent(struct dentry *child);
 /* symlink.c */
 extern const struct inode_operations ext4_symlink_inode_operations;
 extern const struct inode_operations ext4_fast_symlink_inode_operations;
+/* block_validity */
+extern void ext4_release_system_zone(struct super_block *sb);
+extern int ext4_setup_system_zone(struct super_block *sb);
+extern int __init init_ext4_system_zone(void);
+extern void exit_ext4_system_zone(void);
+extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
+                                 ext4_fsblk_t start_blk,
+                                 unsigned int count);
 /* extents.c */
 extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
 extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
@@ -1338,17 +1636,15 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
                                       int chunk);
 extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                               ext4_lblk_t iblock, unsigned int max_blocks,
-                               struct buffer_head *bh_result,
+                               struct buffer_head *bh_result, int flags);
-                               int create, int extend_disksize);
 extern void ext4_ext_truncate(struct inode *);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
                          loff_t len);
-extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
+extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
-                        sector_t block, unsigned int max_blocks,
+                           sector_t block, unsigned int max_blocks,
-                        struct buffer_head *bh, int create,
+                           struct buffer_head *bh, int flags);
-                        int extend_disksize, int flag);
 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                        __u64 start, __u64 len);
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
deleted file mode 100644
index 4ce2187123aa..000000000000
--- a/fs/ext4/ext4_i.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- *  ext4_i.h
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- *  from
- *
- *  linux/include/linux/minix_fs_i.h
- *
- *  Copyright (C) 1991, 1992  Linus Torvalds
- */
-#ifndef _EXT4_I
-#define _EXT4_I
-#include <linux/rwsem.h>
-#include <linux/rbtree.h>
-#include <linux/seqlock.h>
-#include <linux/mutex.h>
-/* data type for block offset of block group */
-typedef int ext4_grpblk_t;
-/* data type for filesystem-wide blocks number */
-typedef unsigned long long ext4_fsblk_t;
-/* data type for file logical block number */
-typedef __u32 ext4_lblk_t;
-/* data type for block group number */
-typedef unsigned int ext4_group_t;
-/*
- * storage for cached extent
- */
-struct ext4_ext_cache {
-        ext4_fsblk_t    ec_start;
-        ext4_lblk_t     ec_block;
-        __u32           ec_len; /* must be 32bit to return holes */
-        __u32           ec_type;
-};
-/*
- * fourth extended file system inode data in memory
- */
-struct ext4_inode_info {
-        __le32  i_data[15];     /* unconverted */
-        __u32   i_flags;
-        ext4_fsblk_t    i_file_acl;
-        __u32   i_dtime;
-        /*
-         * i_block_group is the number of the block group which contains
-         * this file's inode.  Constant across the lifetime of the inode,
-         * it is ued for making block allocation decisions - we try to
-         * place a file's data blocks near its inode block, and new inodes
-         * near to their parent directory's inode.
-         */
-        ext4_group_t    i_block_group;
-        __u32   i_state;                /* Dynamic state flags for ext4 */
-        ext4_lblk_t             i_dir_start_lookup;
-#ifdef CONFIG_EXT4_FS_XATTR
-        /*
-         * Extended attributes can be read independently of the main file
-         * data. Taking i_mutex even when reading would cause contention
-         * between readers of EAs and writers of regular file data, so
-         * instead we synchronize on xattr_sem when reading or changing
-         * EAs.
-         */
-        struct rw_semaphore xattr_sem;
-#endif
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
-        struct posix_acl        *i_acl;
-        struct posix_acl        *i_default_acl;
-#endif
-        struct list_head i_orphan;      /* unlinked but open inodes */
-        /*
-         * i_disksize keeps track of what the inode size is ON DISK, not
-         * in memory.  During truncate, i_size is set to the new size by
-         * the VFS prior to calling ext4_truncate(), but the filesystem won't
-         * set i_disksize to 0 until the truncate is actually under way.
-         *
-         * The intent is that i_disksize always represents the blocks which
-         * are used by this file.  This allows recovery to restart truncate
-         * on orphans if we crash during truncate.  We actually write i_disksize
-         * into the on-disk inode when writing inodes out, instead of i_size.
-         *
-         * The only time when i_disksize and i_size may be different is when
-         * a truncate is in progress.  The only things which change i_disksize
-         * are ext4_get_block (growth) and ext4_truncate (shrinkth).
-         */
-        loff_t  i_disksize;
-        /*
-         * i_data_sem is for serialising ext4_truncate() against
-         * ext4_getblock().  In the 2.4 ext2 design, great chunks of inode's
-         * data tree are chopped off during truncate. We can't do that in
-         * ext4 because whenever we perform intermediate commits during
-         * truncate, the inode and all the metadata blocks *must* be in a
-         * consistent state which allows truncation of the orphans to restart
-         * during recovery.  Hence we must fix the get_block-vs-truncate race
-         * by other means, so we have i_data_sem.
-         */
-        struct rw_semaphore i_data_sem;
-        struct inode vfs_inode;
-        struct jbd2_inode jinode;
-        struct ext4_ext_cache i_cached_extent;
-        /*
-         * File creation time. Its function is same as that of
-         * struct timespec i_{a,c,m}time in the generic inode.
-         */
-        struct timespec i_crtime;
-        /* mballoc */
-        struct list_head i_prealloc_list;
-        spinlock_t i_prealloc_lock;
-        /* ialloc */
-        ext4_group_t    i_last_alloc_group;
-        /* allocation reservation info for delalloc */
-        unsigned int i_reserved_data_blocks;
-        unsigned int i_reserved_meta_blocks;
-        unsigned int i_allocated_meta_blocks;
-        unsigned short i_delalloc_reserved_flag;
-        /* on-disk additional length */
-        __u16 i_extra_isize;
-        spinlock_t i_block_reservation_lock;
-};
-#endif  /* _EXT4_I */
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
deleted file mode 100644
index 57b71fefbccf..000000000000
--- a/fs/ext4/ext4_sb.h
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- *  ext4_sb.h
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- *  from
- *
- *  linux/include/linux/minix_fs_sb.h
- *
- *  Copyright (C) 1991, 1992  Linus Torvalds
- */
-#ifndef _EXT4_SB
-#define _EXT4_SB
-#ifdef __KERNEL__
-#include <linux/timer.h>
-#include <linux/wait.h>
-#include <linux/blockgroup_lock.h>
-#include <linux/percpu_counter.h>
-#endif
-#include <linux/rbtree.h>
-/*
- * fourth extended-fs super-block data in memory
- */
-struct ext4_sb_info {
-        unsigned long s_desc_size;      /* Size of a group descriptor in bytes */
-        unsigned long s_inodes_per_block;/* Number of inodes per block */
-        unsigned long s_blocks_per_group;/* Number of blocks in a group */
-        unsigned long s_inodes_per_group;/* Number of inodes in a group */
-        unsigned long s_itb_per_group;  /* Number of inode table blocks per group */
-        unsigned long s_gdb_count;      /* Number of group descriptor blocks */
-        unsigned long s_desc_per_block; /* Number of group descriptors per block */
-        ext4_group_t s_groups_count;    /* Number of groups in the fs */
-        unsigned long s_overhead_last;  /* Last calculated overhead */
-        unsigned long s_blocks_last;    /* Last seen block count */
-        loff_t s_bitmap_maxbytes;       /* max bytes for bitmap files */
-        struct buffer_head * s_sbh;     /* Buffer containing the super block */
-        struct ext4_super_block *s_es;  /* Pointer to the super block in the buffer */
-        struct buffer_head **s_group_desc;
-        unsigned long  s_mount_opt;
-        ext4_fsblk_t s_sb_block;
-        uid_t s_resuid;
-        gid_t s_resgid;
-        unsigned short s_mount_state;
-        unsigned short s_pad;
-        int s_addr_per_block_bits;
-        int s_desc_per_block_bits;
-        int s_inode_size;
-        int s_first_ino;
-        unsigned int s_inode_readahead_blks;
-        spinlock_t s_next_gen_lock;
-        u32 s_next_generation;
-        u32 s_hash_seed[4];
-        int s_def_hash_version;
-        int s_hash_unsigned;    /* 3 if hash should be signed, 0 if not */
-        struct percpu_counter s_freeblocks_counter;
-        struct percpu_counter s_freeinodes_counter;
-        struct percpu_counter s_dirs_counter;
-        struct percpu_counter s_dirtyblocks_counter;
-        struct blockgroup_lock *s_blockgroup_lock;
-        struct proc_dir_entry *s_proc;
-        struct kobject s_kobj;
-        struct completion s_kobj_unregister;
-        /* Journaling */
-        struct inode *s_journal_inode;
-        struct journal_s *s_journal;
-        struct list_head s_orphan;
-        unsigned long s_commit_interval;
-        u32 s_max_batch_time;
-        u32 s_min_batch_time;
-        struct block_device *journal_bdev;
-#ifdef CONFIG_JBD2_DEBUG
-        struct timer_list turn_ro_timer;        /* For turning read-only (crash simulation) */
-        wait_queue_head_t ro_wait_queue;        /* For people waiting for the fs to go read-only */
-#endif
-#ifdef CONFIG_QUOTA
-        char *s_qf_names[MAXQUOTAS];            /* Names of quota files with journalled quota */
-        int s_jquota_fmt;                       /* Format of quota to use */
-#endif
-        unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
-#ifdef EXTENTS_STATS
-        /* ext4 extents stats */
-        unsigned long s_ext_min;
-        unsigned long s_ext_max;
-        unsigned long s_depth_max;
-        spinlock_t s_ext_stats_lock;
-        unsigned long s_ext_blocks;
-        unsigned long s_ext_extents;
-#endif
-        /* for buddy allocator */
-        struct ext4_group_info ***s_group_info;
-        struct inode *s_buddy_cache;
-        long s_blocks_reserved;
-        spinlock_t s_reserve_lock;
-        spinlock_t s_md_lock;
-        tid_t s_last_transaction;
-        unsigned short *s_mb_offsets;
-        unsigned int *s_mb_maxs;
-        /* tunables */
-        unsigned long s_stripe;
-        unsigned int s_mb_stream_request;
-        unsigned int s_mb_max_to_scan;
-        unsigned int s_mb_min_to_scan;
-        unsigned int s_mb_stats;
-        unsigned int s_mb_order2_reqs;
-        unsigned int s_mb_group_prealloc;
-        /* where last allocation was done - for stream allocation */
-        unsigned long s_mb_last_group;
-        unsigned long s_mb_last_start;
-        /* history to debug policy */
-        struct ext4_mb_history *s_mb_history;
-        int s_mb_history_cur;
-        int s_mb_history_max;
-        int s_mb_history_num;
-        spinlock_t s_mb_history_lock;
-        int s_mb_history_filter;
-        /* stats for buddy allocator */
-        spinlock_t s_mb_pa_lock;
-        atomic_t s_bal_reqs;    /* number of reqs with len > 1 */
-        atomic_t s_bal_success; /* we found long enough chunks */
-        atomic_t s_bal_allocated;       /* in blocks */
-        atomic_t s_bal_ex_scanned;      /* total extents scanned */
-        atomic_t s_bal_goals;   /* goal hits */
-        atomic_t s_bal_breaks;  /* too long searches */
-        atomic_t s_bal_2orders; /* 2^order hits */
-        spinlock_t s_bal_lock;
-        unsigned long s_mb_buddies_generated;
-        unsigned long long s_mb_generation_time;
-        atomic_t s_mb_lost_chunks;
-        atomic_t s_mb_preallocated;
-        atomic_t s_mb_discarded;
-        /* locality groups */
-        struct ext4_locality_group *s_locality_groups;
-        /* for write statistics */
-        unsigned long s_sectors_written_start;
-        u64 s_kbytes_written;
-        unsigned int s_log_groups_per_flex;
-        struct flex_groups *s_flex_groups;
-};
-static inline spinlock_t *
-sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
-{
-        return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
-}
-#endif  /* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e3a55eb8b26a..2593f748c3a4 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -326,32 +326,18 @@ ext4_ext_max_entries(struct inode *inode, int depth)
 static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 {
-        ext4_fsblk_t block = ext_pblock(ext), valid_block;
+        ext4_fsblk_t block = ext_pblock(ext);
        int len = ext4_ext_get_actual_len(ext);
-        struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
-        valid_block = le32_to_cpu(es->s_first_data_block) +
+        return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
-                EXT4_SB(inode->i_sb)->s_gdb_count;
-        if (unlikely(block <= valid_block ||
-                     ((block + len) > ext4_blocks_count(es))))
-                return 0;
-        else
-                return 1;
 }
 static int ext4_valid_extent_idx(struct inode *inode,
                                struct ext4_extent_idx *ext_idx)
 {
-        ext4_fsblk_t block = idx_pblock(ext_idx), valid_block;
+        ext4_fsblk_t block = idx_pblock(ext_idx);
-        struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
-        valid_block = le32_to_cpu(es->s_first_data_block) +
+        return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
-                EXT4_SB(inode->i_sb)->s_gdb_count;
-        if (unlikely(block <= valid_block ||
-                     (block >= ext4_blocks_count(es))))
-                return 0;
-        else
-                return 1;
 }
 static int ext4_valid_extent_entries(struct inode *inode,
@@ -2097,12 +2083,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
        ex = EXT_LAST_EXTENT(eh);
        ex_ee_block = le32_to_cpu(ex->ee_block);
-        if (ext4_ext_is_uninitialized(ex))
-                uninitialized = 1;
        ex_ee_len = ext4_ext_get_actual_len(ex);
        while (ex >= EXT_FIRST_EXTENT(eh) &&
                        ex_ee_block + ex_ee_len > start) {
+                if (ext4_ext_is_uninitialized(ex))
+                        uninitialized = 1;
+                else
+                        uninitialized = 0;
                ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len);
                path[depth].p_ext = ex;
@@ -2784,7 +2774,7 @@ fix_extent_len:
 int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                        ext4_lblk_t iblock,
                        unsigned int max_blocks, struct buffer_head *bh_result,
-                        int create, int extend_disksize)
+                        int flags)
 {
        struct ext4_ext_path *path = NULL;
        struct ext4_extent_header *eh;
@@ -2793,7 +2783,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
        int err = 0, depth, ret, cache_type;
        unsigned int allocated = 0;
        struct ext4_allocation_request ar;
-        loff_t disksize;
        __clear_bit(BH_New, &bh_result->b_state);
        ext_debug("blocks %u/%u requested for inode %u\n",
@@ -2803,7 +2792,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
        cache_type = ext4_ext_in_cache(inode, iblock, &newex);
        if (cache_type) {
                if (cache_type == EXT4_EXT_CACHE_GAP) {
-                        if (!create) {
+                        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
                                /*
                                 * block isn't allocated yet and
                                 * user doesn't want to allocate it
@@ -2869,9 +2858,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                                                        EXT4_EXT_CACHE_EXTENT);
                                goto out;
                        }
-                        if (create == EXT4_CREATE_UNINITIALIZED_EXT)
+                        if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
                                goto out;
-                        if (!create) {
+                        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
+                                if (allocated > max_blocks)
+                                        allocated = max_blocks;
                                /*
                                 * We have blocks reserved already.  We
                                 * return allocated blocks so that delalloc
@@ -2879,8 +2870,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                                 * the buffer head will be unmapped so that
                                 * a read from the block returns 0s.
                                 */
-                                if (allocated > max_blocks)
-                                        allocated = max_blocks;
                                set_buffer_unwritten(bh_result);
                                bh_result->b_bdev = inode->i_sb->s_bdev;
                                bh_result->b_blocknr = newblock;
@@ -2903,7 +2892,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
         * requested block isn't allocated yet;
         * we couldn't try to create block if create flag is zero
         */
-        if (!create) {
+        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
                /*
                 * put just found gap into cache to speed up
                 * subsequent requests
@@ -2932,10 +2921,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
         * EXT_UNINIT_MAX_LEN.
         */
        if (max_blocks > EXT_INIT_MAX_LEN &&
-            create != EXT4_CREATE_UNINITIALIZED_EXT)
+            !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
                max_blocks = EXT_INIT_MAX_LEN;
        else if (max_blocks > EXT_UNINIT_MAX_LEN &&
-                 create == EXT4_CREATE_UNINITIALIZED_EXT)
+                 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
                max_blocks = EXT_UNINIT_MAX_LEN;
        /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */
@@ -2966,7 +2955,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
        /* try to insert new extent into found leaf and return */
        ext4_ext_store_pblock(&newex, newblock);
        newex.ee_len = cpu_to_le16(ar.len);
-        if (create == EXT4_CREATE_UNINITIALIZED_EXT)  /* Mark uninitialized */
+        if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)  /* Mark uninitialized */
                ext4_ext_mark_uninitialized(&newex);
        err = ext4_ext_insert_extent(handle, inode, path, &newex);
        if (err) {
@@ -2983,18 +2972,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
        newblock = ext_pblock(&newex);
        allocated = ext4_ext_get_actual_len(&newex);
 outnew:
-        if (extend_disksize) {
-                disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
-                if (disksize > i_size_read(inode))
-                        disksize = i_size_read(inode);
-                if (disksize > EXT4_I(inode)->i_disksize)
-                        EXT4_I(inode)->i_disksize = disksize;
-        }
        set_buffer_new(bh_result);
        /* Cache only when it is _not_ an uninitialized extent */
-        if (create != EXT4_CREATE_UNINITIALIZED_EXT)
+        if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
                ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
                                                EXT4_EXT_CACHE_EXTENT);
 out:
@@ -3150,9 +3131,10 @@ retry:
                        ret = PTR_ERR(handle);
                        break;
                }
-                ret = ext4_get_blocks_wrap(handle, inode, block,
+                map_bh.b_state = 0;
-                                          max_blocks, &map_bh,
+                ret = ext4_get_blocks(handle, inode, block,
-                                          EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
+                                      max_blocks, &map_bh,
+                                      EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
                if (ret <= 0) {
 #ifdef EXT4FS_DEBUG
                        WARN_ON(ret <= 0);
@@ -3195,7 +3177,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
                       void *data)
 {
        struct fiemap_extent_info *fieinfo = data;
-        unsigned long blksize_bits = inode->i_sb->s_blocksize_bits;
+        unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
        __u64   logical;
        __u64   physical;
        __u64   length;
@@ -3242,9 +3224,16 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
         *
         * XXX this might miss a single-block extent at EXT_MAX_BLOCK
         */
-        if (logical + length - 1 == EXT_MAX_BLOCK ||
+        if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK ||
-            ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK)
+            newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) {
+                loff_t size = i_size_read(inode);
+                loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb);
                flags |= FIEMAP_EXTENT_LAST;
+                if ((flags & FIEMAP_EXTENT_DELALLOC) &&
+                    logical+length > size)
+                        length = (size - logical + bs - 1) & ~(bs-1);
+        }
        error = fiemap_fill_next_extent(fieinfo, logical, physical,
                                        length, flags);
@@ -3318,10 +3307,10 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                 * Walk the extent tree gathering extent information.
                 * ext4_ext_fiemap_cb will push extents back to user.
                 */
-                down_write(&EXT4_I(inode)->i_data_sem);
+                down_read(&EXT4_I(inode)->i_data_sem);
                error = ext4_ext_walk_space(inode, start_blk, len_blks,
                                          ext4_ext_fiemap_cb, fieinfo);
-                up_write(&EXT4_I(inode)->i_data_sem);
+                up_read(&EXT4_I(inode)->i_data_sem);
        }
        return error;
diff --git a/fs/ext4/group.h b/fs/ext4/group.h
deleted file mode 100644
index c2c0a8d06d0e..000000000000
--- a/fs/ext4/group.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- *  linux/fs/ext4/group.h
- *
- * Copyright (C) 2007 Cluster File Systems, Inc
- *
- * Author: Andreas Dilger <adilger@clusterfs.com>
- */
-#ifndef _LINUX_EXT4_GROUP_H
-#define _LINUX_EXT4_GROUP_H
-extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
-                                   struct ext4_group_desc *gdp);
-extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
-                                       struct ext4_group_desc *gdp);
-struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
-                                      ext4_group_t block_group);
-extern unsigned ext4_init_block_bitmap(struct super_block *sb,
-                                       struct buffer_head *bh,
-                                       ext4_group_t group,
-                                       struct ext4_group_desc *desc);
-#define ext4_free_blocks_after_init(sb, group, desc)                    \
-                ext4_init_block_bitmap(sb, NULL, group, desc)
-extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
-                                       struct buffer_head *bh,
-                                       ext4_group_t group,
-                                       struct ext4_group_desc *desc);
-extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
-#endif /* _LINUX_EXT4_GROUP_H */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f18e0a08a6b5..3743bd849bce 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -27,7 +27,6 @@
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
-#include "group.h"
 /*
 * ialloc.c contains the inodes allocation and deallocation routines
@@ -123,16 +122,16 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
                unlock_buffer(bh);
                return bh;
        }
-        spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
+        ext4_lock_group(sb, block_group);
        if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
                ext4_init_inode_bitmap(sb, bh, block_group, desc);
                set_bitmap_uptodate(bh);
                set_buffer_uptodate(bh);
-                spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+                ext4_unlock_group(sb, block_group);
                unlock_buffer(bh);
                return bh;
        }
-        spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+        ext4_unlock_group(sb, block_group);
        if (buffer_uptodate(bh)) {
                /*
                 * if not uninit if bh is uptodate,
@@ -247,9 +246,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
                goto error_return;
        /* Ok, now we can actually update the inode bitmaps.. */
-        spin_lock(sb_bgl_lock(sbi, block_group));
+        cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
-        cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
+                                        bit, bitmap_bh->b_data);
-        spin_unlock(sb_bgl_lock(sbi, block_group));
        if (!cleared)
                ext4_error(sb, "ext4_free_inode",
                           "bit already cleared for inode %lu", ino);
@@ -261,7 +259,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
                if (fatal) goto error_return;
                if (gdp) {
-                        spin_lock(sb_bgl_lock(sbi, block_group));
+                        ext4_lock_group(sb, block_group);
                        count = ext4_free_inodes_count(sb, gdp) + 1;
                        ext4_free_inodes_set(sb, gdp, count);
                        if (is_directory) {
@@ -277,7 +275,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
                        }
                        gdp->bg_checksum = ext4_group_desc_csum(sbi,
                                                        block_group, gdp);
-                        spin_unlock(sb_bgl_lock(sbi, block_group));
+                        ext4_unlock_group(sb, block_group);
                        percpu_counter_inc(&sbi->s_freeinodes_counter);
                        if (is_directory)
                                percpu_counter_dec(&sbi->s_dirs_counter);
@@ -316,7 +314,7 @@ error_return:
 static int find_group_dir(struct super_block *sb, struct inode *parent,
                                ext4_group_t *best_group)
 {
-        ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+        ext4_group_t ngroups = ext4_get_groups_count(sb);
        unsigned int freei, avefreei;
        struct ext4_group_desc *desc, *best_desc = NULL;
        ext4_group_t group;
@@ -349,11 +347,10 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_group_desc *desc;
-        struct buffer_head *bh;
        struct flex_groups *flex_group = sbi->s_flex_groups;
        ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
        ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
-        ext4_group_t ngroups = sbi->s_groups_count;
+        ext4_group_t ngroups = ext4_get_groups_count(sb);
        int flex_size = ext4_flex_bg_size(sbi);
        ext4_group_t best_flex = parent_fbg_group;
        int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
@@ -362,7 +359,7 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
        ext4_group_t n_fbg_groups;
        ext4_group_t i;
-        n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >>
+        n_fbg_groups = (ngroups + flex_size - 1) >>
                sbi->s_log_groups_per_flex;
 find_close_to_parent:
@@ -404,7 +401,7 @@ find_close_to_parent:
 found_flexbg:
        for (i = best_flex * flex_size; i < ngroups &&
                     i < (best_flex + 1) * flex_size; i++) {
-                desc = ext4_get_group_desc(sb, i, &bh);
+                desc = ext4_get_group_desc(sb, i, NULL);
                if (ext4_free_inodes_count(sb, desc)) {
                        *best_group = i;
                        goto out;
@@ -478,20 +475,21 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 {
        ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
-        ext4_group_t ngroups = sbi->s_groups_count;
+        ext4_group_t real_ngroups = ext4_get_groups_count(sb);
        int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
        unsigned int freei, avefreei;
        ext4_fsblk_t freeb, avefreeb;
        unsigned int ndirs;
        int max_dirs, min_inodes;
        ext4_grpblk_t min_blocks;
-        ext4_group_t i, grp, g;
+        ext4_group_t i, grp, g, ngroups;
        struct ext4_group_desc *desc;
        struct orlov_stats stats;
        int flex_size = ext4_flex_bg_size(sbi);
+        ngroups = real_ngroups;
        if (flex_size > 1) {
-                ngroups = (ngroups + flex_size - 1) >>
+                ngroups = (real_ngroups + flex_size - 1) >>
                        sbi->s_log_groups_per_flex;
                parent_group >>= sbi->s_log_groups_per_flex;
        }
@@ -543,7 +541,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
                 */
                grp *= flex_size;
                for (i = 0; i < flex_size; i++) {
-                        if (grp+i >= sbi->s_groups_count)
+                        if (grp+i >= real_ngroups)
                                break;
                        desc = ext4_get_group_desc(sb, grp+i, NULL);
                        if (desc && ext4_free_inodes_count(sb, desc)) {
@@ -583,7 +581,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
        }
 fallback:
-        ngroups = sbi->s_groups_count;
+        ngroups = real_ngroups;
        avefreei = freei / ngroups;
 fallback_retry:
        parent_group = EXT4_I(parent)->i_block_group;
@@ -613,9 +611,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
                            ext4_group_t *group, int mode)
 {
        ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
-        ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+        ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
        struct ext4_group_desc *desc;
-        ext4_group_t i, last;
        int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
        /*
@@ -708,10 +705,10 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 /*
 * claim the inode from the inode bitmap. If the group
- * is uninit we need to take the groups's sb_bgl_lock
+ * is uninit we need to take the group's ext4_group_lock
 * and clear the uninit flag. The inode bitmap update
 * and group desc uninit flag clear should be done
- * after holding sb_bgl_lock so that ext4_read_inode_bitmap
+ * after holding ext4_group_lock so that ext4_read_inode_bitmap
 * doesn't race with the ext4_claim_inode
 */
 static int ext4_claim_inode(struct super_block *sb,
@@ -722,7 +719,7 @@ static int ext4_claim_inode(struct super_block *sb,
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
-        spin_lock(sb_bgl_lock(sbi, group));
+        ext4_lock_group(sb, group);
        if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
                /* not a free inode */
                retval = 1;
@@ -731,7 +728,7 @@ static int ext4_claim_inode(struct super_block *sb,
        ino++;
        if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
                        ino > EXT4_INODES_PER_GROUP(sb)) {
-                spin_unlock(sb_bgl_lock(sbi, group));
+                ext4_unlock_group(sb, group);
                ext4_error(sb, __func__,
                           "reserved inode or inode > inodes count - "
                           "block_group = %u, inode=%lu", group,
@@ -780,7 +777,7 @@ static int ext4_claim_inode(struct super_block *sb,
        }
        gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
 err_ret:
-        spin_unlock(sb_bgl_lock(sbi, group));
+        ext4_unlock_group(sb, group);
        return retval;
 }
@@ -799,11 +796,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
        struct super_block *sb;
        struct buffer_head *inode_bitmap_bh = NULL;
        struct buffer_head *group_desc_bh;
-        ext4_group_t group = 0;
+        ext4_group_t ngroups, group = 0;
        unsigned long ino = 0;
        struct inode *inode;
        struct ext4_group_desc *gdp = NULL;
-        struct ext4_super_block *es;
        struct ext4_inode_info *ei;
        struct ext4_sb_info *sbi;
        int ret2, err = 0;
@@ -818,15 +814,14 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
                return ERR_PTR(-EPERM);
        sb = dir->i_sb;
+        ngroups = ext4_get_groups_count(sb);
        trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id,
                   dir->i_ino, mode);
        inode = new_inode(sb);
        if (!inode)
                return ERR_PTR(-ENOMEM);
        ei = EXT4_I(inode);
        sbi = EXT4_SB(sb);
-        es = sbi->s_es;
        if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
                ret2 = find_group_flex(sb, dir, &group);
@@ -856,7 +851,7 @@ got_group:
        if (ret2 == -1)
                goto out;
-        for (i = 0; i < sbi->s_groups_count; i++) {
+        for (i = 0; i < ngroups; i++) {
                err = -EIO;
                gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
@@ -917,7 +912,7 @@ repeat_in_this_group:
                 * group descriptor metadata has not yet been updated.
                 * So we just go onto the next blockgroup.
                 */
-                if (++group == sbi->s_groups_count)
+                if (++group == ngroups)
                        group = 0;
        }
        err = -ENOSPC;
@@ -938,7 +933,7 @@ got:
                }
                free = 0;
-                spin_lock(sb_bgl_lock(sbi, group));
+                ext4_lock_group(sb, group);
                /* recheck and clear flag under lock if we still need to */
                if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
                        free = ext4_free_blocks_after_init(sb, group, gdp);
@@ -947,7 +942,7 @@ got:
                        gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
                                                                gdp);
                }
-                spin_unlock(sb_bgl_lock(sbi, group));
+                ext4_unlock_group(sb, group);
                /* Don't need to dirty bitmap block if we didn't change it */
                if (free) {
@@ -1158,7 +1153,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
 {
        unsigned long desc_count;
        struct ext4_group_desc *gdp;
-        ext4_group_t i;
+        ext4_group_t i, ngroups = ext4_get_groups_count(sb);
 #ifdef EXT4FS_DEBUG
        struct ext4_super_block *es;
        unsigned long bitmap_count, x;
@@ -1168,7 +1163,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
        desc_count = 0;
        bitmap_count = 0;
        gdp = NULL;
-        for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+        for (i = 0; i < ngroups; i++) {
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
@@ -1190,7 +1185,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
        return desc_count;
 #else
        desc_count = 0;
-        for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+        for (i = 0; i < ngroups; i++) {
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
@@ -1205,9 +1200,9 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
 unsigned long ext4_count_dirs(struct super_block * sb)
 {
        unsigned long count = 0;
-        ext4_group_t i;
+        ext4_group_t i, ngroups = ext4_get_groups_count(sb);
-        for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+        for (i = 0; i < ngroups; i++) {
                struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2a9ffd528dd1..875db944b22f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -372,20 +372,21 @@ static int ext4_block_to_path(struct inode *inode,
 }
 static int __ext4_check_blockref(const char *function, struct inode *inode,
-                                 __le32 *p, unsigned int max) {
+                                 __le32 *p, unsigned int max)
+{
-        unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es);
        __le32 *bref = p;
+        unsigned int blk;
        while (bref < p+max) {
-                if (unlikely(le32_to_cpu(*bref) >= maxblocks)) {
+                blk = le32_to_cpu(*bref++);
+                if (blk &&
+                    unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
+                                                    blk, 1))) {
                        ext4_error(inode->i_sb, function,
-                                   "block reference %u >= max (%u) "
+                                   "invalid block reference %u "
-                                   "in inode #%lu, offset=%d",
+                                   "in inode #%lu", blk, inode->i_ino);
-                                   le32_to_cpu(*bref), maxblocks,
-                                   inode->i_ino, (int)(bref-p));
                        return -EIO;
                }
-                bref++;
        }
        return 0;
 }
@@ -892,6 +893,10 @@ err_out:
 }
 /*
+ * The ext4_ind_get_blocks() function handles non-extents inodes
+ * (i.e., using the traditional indirect/double-indirect i_blocks
+ * scheme) for ext4_get_blocks().
+ *
 * Allocation strategy is simple: if we have to allocate something, we will
 * have to go the whole way to leaf. So let's do it before attaching anything
 * to tree, set linkage between the newborn blocks, write them if sync is
@@ -909,15 +914,16 @@ err_out:
 * return = 0, if plain lookup failed.
 * return < 0, error case.
 *
- *
+ * The ext4_ind_get_blocks() function should be called with
- * Need to be called with
+ * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
- * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
+ * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
- * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
+ * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
+ * blocks.
 */
-static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
+static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
                                  ext4_lblk_t iblock, unsigned int maxblocks,
                                  struct buffer_head *bh_result,
-                                  int create, int extend_disksize)
+                                  int flags)
 {
        int err = -EIO;
        ext4_lblk_t offsets[4];
@@ -927,14 +933,11 @@ static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
        int indirect_blks;
        int blocks_to_boundary = 0;
        int depth;
-        struct ext4_inode_info *ei = EXT4_I(inode);
        int count = 0;
        ext4_fsblk_t first_block = 0;
-        loff_t disksize;
        J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
-        J_ASSERT(handle != NULL || create == 0);
+        J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
        depth = ext4_block_to_path(inode, iblock, offsets,
                                        &blocks_to_boundary);
@@ -963,7 +966,7 @@ static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
        }
        /* Next simple case - plain lookup or failed read of indirect block */
-        if (!create || err == -EIO)
+        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
                goto cleanup;
        /*
@@ -997,19 +1000,7 @@ static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
        if (!err)
                err = ext4_splice_branch(handle, inode, iblock,
                                        partial, indirect_blks, count);
-        /*
-         * i_disksize growing is protected by i_data_sem.  Don't forget to
-         * protect it if you're about to implement concurrent
-         * ext4_get_block() -bzzz
-        */
-        if (!err && extend_disksize) {
-                disksize = ((loff_t) iblock + count) << inode->i_blkbits;
-                if (disksize > i_size_read(inode))
-                        disksize = i_size_read(inode);
-                if (disksize > ei->i_disksize)
-                        ei->i_disksize = disksize;
-        }
        if (err)
                goto cleanup;
        set_buffer_new(bh_result);
@@ -1120,8 +1111,23 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
                ext4_discard_preallocations(inode);
 }
+static int check_block_validity(struct inode *inode, sector_t logical,
+                                sector_t phys, int len)
+{
+        if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
+                ext4_error(inode->i_sb, "check_block_validity",
+                           "inode #%lu logical block %llu mapped to %llu "
+                           "(size %d)", inode->i_ino,
+                           (unsigned long long) logical,
+                           (unsigned long long) phys, len);
+                WARN_ON(1);
+                return -EIO;
+        }
+        return 0;
+}
 /*
- * The ext4_get_blocks_wrap() function try to look up the requested blocks,
+ * The ext4_get_blocks() function tries to look up the requested blocks,
 * and returns if the blocks are already mapped.
 *
 * Otherwise it takes the write lock of the i_data_sem and allocate blocks
@@ -1129,7 +1135,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
 * mapped.
 *
 * If file type is extents based, it will call ext4_ext_get_blocks(),
- * Otherwise, call with ext4_get_blocks_handle() to handle indirect mapping
+ * Otherwise, it calls ext4_ind_get_blocks() to handle indirect mapping
 * based files
 *
 * On success, it returns the number of blocks being mapped or allocate.
@@ -1142,9 +1148,9 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
 *
 * It returns the error in case of allocation failure.
 */
-int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
+int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
-                        unsigned int max_blocks, struct buffer_head *bh,
+                    unsigned int max_blocks, struct buffer_head *bh,
-                        int create, int extend_disksize, int flag)
+                    int flags)
 {
        int retval;
@@ -1152,21 +1158,28 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
        clear_buffer_unwritten(bh);
        /*
-         * Try to see if we can get  the block without requesting
+         * Try to see if we can get the block without requesting a new
-         * for new file system block.
+         * file system block.
         */
        down_read((&EXT4_I(inode)->i_data_sem));
        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
                retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
-                                bh, 0, 0);
+                                bh, 0);
        } else {
-                retval = ext4_get_blocks_handle(handle,
+                retval = ext4_ind_get_blocks(handle, inode, block, max_blocks,
-                                inode, block, max_blocks, bh, 0, 0);
+                                             bh, 0);
        }
        up_read((&EXT4_I(inode)->i_data_sem));
+        if (retval > 0 && buffer_mapped(bh)) {
+                int ret = check_block_validity(inode, block,
+                                               bh->b_blocknr, retval);
+                if (ret != 0)
+                        return ret;
+        }
        /* If it is only a block(s) look up */
-        if (!create)
+        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
                return retval;
        /*
@@ -1205,7 +1218,7 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
         * let the underlying get_block() function know to
         * avoid double accounting
         */
-        if (flag)
+        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                EXT4_I(inode)->i_delalloc_reserved_flag = 1;
        /*
         * We need to check for EXT4 here because migrate
@@ -1213,10 +1226,10 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
         */
        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
                retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
-                                bh, create, extend_disksize);
+                                              bh, flags);
        } else {
-                retval = ext4_get_blocks_handle(handle, inode, block,
+                retval = ext4_ind_get_blocks(handle, inode, block,
-                                max_blocks, bh, create, extend_disksize);
+                                             max_blocks, bh, flags);
                if (retval > 0 && buffer_new(bh)) {
                        /*
@@ -1229,18 +1242,23 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
                }
        }
-        if (flag) {
+        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                EXT4_I(inode)->i_delalloc_reserved_flag = 0;
-                /*
-                 * Update reserved blocks/metadata blocks
+        /*
-                 * after successful block allocation
+         * Update reserved blocks/metadata blocks after successful
-                 * which were deferred till now
+         * block allocation which had been deferred till now.
-                 */
+         */
-                if ((retval > 0) && buffer_delay(bh))
+        if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE))
-                        ext4_da_update_reserve_space(inode, retval);
+                ext4_da_update_reserve_space(inode, retval);
-        }
        up_write((&EXT4_I(inode)->i_data_sem));
+        if (retval > 0 && buffer_mapped(bh)) {
+                int ret = check_block_validity(inode, block, 
+                                               bh->b_blocknr, retval);
+                if (ret != 0)
+                        return ret;
+        }
        return retval;
 }
@@ -1268,8 +1286,8 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
                started = 1;
        }
-        ret = ext4_get_blocks_wrap(handle, inode, iblock,
+        ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
-                                        max_blocks, bh_result, create, 0, 0);
+                              create ? EXT4_GET_BLOCKS_CREATE : 0);
        if (ret > 0) {
                bh_result->b_size = (ret << inode->i_blkbits);
                ret = 0;
@@ -1288,17 +1306,19 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 {
        struct buffer_head dummy;
        int fatal = 0, err;
+        int flags = 0;
        J_ASSERT(handle != NULL || create == 0);
        dummy.b_state = 0;
        dummy.b_blocknr = -1000;
        buffer_trace_init(&dummy.b_history);
-        err = ext4_get_blocks_wrap(handle, inode, block, 1,
+        if (create)
-                                        &dummy, create, 1, 0);
+                flags |= EXT4_GET_BLOCKS_CREATE;
+        err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags);
        /*
-         * ext4_get_blocks_handle() returns number of blocks
+         * ext4_get_blocks() returns number of blocks mapped. 0 in
-         * mapped. 0 in case of a HOLE.
+         * case of a HOLE.
         */
        if (err > 0) {
                if (err > 1)
@@ -1439,7 +1459,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
                                struct page **pagep, void **fsdata)
 {
        struct inode *inode = mapping->host;
-        int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
+        int ret, needed_blocks;
        handle_t *handle;
        int retries = 0;
        struct page *page;
@@ -1450,6 +1470,11 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
                   "dev %s ino %lu pos %llu len %u flags %u",
                   inode->i_sb->s_id, inode->i_ino,
                   (unsigned long long) pos, len, flags);
+        /*
+         * Reserve one block more for addition to orphan list in case
+         * we allocate blocks but write fails for some reason
+         */
+        needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
        index = pos >> PAGE_CACHE_SHIFT;
        from = pos & (PAGE_CACHE_SIZE - 1);
        to = from + len;
@@ -1483,15 +1508,30 @@ retry:
        if (ret) {
                unlock_page(page);
-                ext4_journal_stop(handle);
                page_cache_release(page);
                /*
                 * block_write_begin may have instantiated a few blocks
                 * outside i_size.  Trim these off again. Don't need
                 * i_size_read because we hold i_mutex.
+                 *
+                 * Add inode to orphan list in case we crash before
+                 * truncate finishes
                 */
                if (pos + len > inode->i_size)
+                        ext4_orphan_add(handle, inode);
+                ext4_journal_stop(handle);
+                if (pos + len > inode->i_size) {
                        vmtruncate(inode, inode->i_size);
+                        /* 
+                         * If vmtruncate failed early the inode might
+                         * still be on the orphan list; we need to
+                         * make sure the inode is removed from the
+                         * orphan list in that case.
+                         */
+                        if (inode->i_nlink)
+                                ext4_orphan_del(NULL, inode);
+                }
        }
        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -1509,6 +1549,52 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
        return ext4_handle_dirty_metadata(handle, NULL, bh);
 }
+static int ext4_generic_write_end(struct file *file,
+                                struct address_space *mapping,
+                                loff_t pos, unsigned len, unsigned copied,
+                                struct page *page, void *fsdata)
+{
+        int i_size_changed = 0;
+        struct inode *inode = mapping->host;
+        handle_t *handle = ext4_journal_current_handle();
+        copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+        /*
+         * No need to use i_size_read() here, the i_size
+         * cannot change under us because we hold i_mutex.
+         *
+         * But it's important to update i_size while still holding page lock:
+         * page writeout could otherwise come in and zero beyond i_size.
+         */
+        if (pos + copied > inode->i_size) {
+                i_size_write(inode, pos + copied);
+                i_size_changed = 1;
+        }
+        if (pos + copied >  EXT4_I(inode)->i_disksize) {
+                /* We need to mark inode dirty even if
+                 * pos + copied is less than inode->i_size
+                 * but greater than i_disksize (hint: delalloc).
+                 */
+                ext4_update_i_disksize(inode, (pos + copied));
+                i_size_changed = 1;
+        }
+        unlock_page(page);
+        page_cache_release(page);
+        /*
+         * Don't mark the inode dirty under page lock. First, it unnecessarily
+         * makes the holding time of page lock longer. Second, it forces lock
+         * ordering of page lock and transaction start for journaling
+         * filesystems.
+         */
+        if (i_size_changed)
+                ext4_mark_inode_dirty(handle, inode);
+        return copied;
+}
 /*
 * We need to pick up the new inode size which generic_commit_write gave us
 * `file' can be NULL - eg, when called from page_symlink().
@@ -1532,21 +1618,15 @@ static int ext4_ordered_write_end(struct file *file,
        ret = ext4_jbd2_file_inode(handle, inode);
        if (ret == 0) {
-                loff_t new_i_size;
+                ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
-                new_i_size = pos + copied;
-                if (new_i_size > EXT4_I(inode)->i_disksize) {
-                        ext4_update_i_disksize(inode, new_i_size);
-                        /* We need to mark inode dirty even if
-                         * new_i_size is less that inode->i_size
-                         * bu greater than i_disksize.(hint delalloc)
-                         */
-                        ext4_mark_inode_dirty(handle, inode);
-                }
-                ret2 = generic_write_end(file, mapping, pos, len, copied,
                                                        page, fsdata);
                copied = ret2;
+                if (pos + len > inode->i_size)
+                        /* if we have allocated more blocks and copied
+                         * less. We will have blocks allocated outside
+                         * inode->i_size. So truncate them
+                         */
+                        ext4_orphan_add(handle, inode);
                if (ret2 < 0)
                        ret = ret2;
        }
@@ -1554,6 +1634,18 @@ static int ext4_ordered_write_end(struct file *file,
        if (!ret)
                ret = ret2;
+        if (pos + len > inode->i_size) {
+                vmtruncate(inode, inode->i_size);
+                /* 
+                 * If vmtruncate failed early the inode might still be
+                 * on the orphan list; we need to make sure the inode
+                 * is removed from the orphan list in that case.
+                 */
+                if (inode->i_nlink)
+                        ext4_orphan_del(NULL, inode);
+        }
        return ret ? ret : copied;
 }
@@ -1565,25 +1657,21 @@ static int ext4_writeback_write_end(struct file *file,
        handle_t *handle = ext4_journal_current_handle();
        struct inode *inode = mapping->host;
        int ret = 0, ret2;
-        loff_t new_i_size;
        trace_mark(ext4_writeback_write_end,
                   "dev %s ino %lu pos %llu len %u copied %u",
                   inode->i_sb->s_id, inode->i_ino,
                   (unsigned long long) pos, len, copied);
-        new_i_size = pos + copied;
+        ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
-        if (new_i_size > EXT4_I(inode)->i_disksize) {
-                ext4_update_i_disksize(inode, new_i_size);
-                /* We need to mark inode dirty even if
-                 * new_i_size is less that inode->i_size
-                 * bu greater than i_disksize.(hint delalloc)
-                 */
-                ext4_mark_inode_dirty(handle, inode);
-        }
-        ret2 = generic_write_end(file, mapping, pos, len, copied,
                                                        page, fsdata);
        copied = ret2;
+        if (pos + len > inode->i_size)
+                /* if we have allocated more blocks and copied
+                 * less. We will have blocks allocated outside
+                 * inode->i_size. So truncate them
+                 */
+                ext4_orphan_add(handle, inode);
        if (ret2 < 0)
                ret = ret2;
@@ -1591,6 +1679,17 @@ static int ext4_writeback_write_end(struct file *file,
        if (!ret)
                ret = ret2;
+        if (pos + len > inode->i_size) {
+                vmtruncate(inode, inode->i_size);
+                /* 
+                 * If vmtruncate failed early the inode might still be
+                 * on the orphan list; we need to make sure the inode
+                 * is removed from the orphan list in that case.
+                 */
+                if (inode->i_nlink)
+                        ext4_orphan_del(NULL, inode);
+        }
        return ret ? ret : copied;
 }
@@ -1635,10 +1734,27 @@ static int ext4_journalled_write_end(struct file *file,
        }
        unlock_page(page);
+        page_cache_release(page);
+        if (pos + len > inode->i_size)
+                /* if we have allocated more blocks and copied
+                 * less. We will have blocks allocated outside
+                 * inode->i_size. So truncate them
+                 */
+                ext4_orphan_add(handle, inode);
        ret2 = ext4_journal_stop(handle);
        if (!ret)
                ret = ret2;
-        page_cache_release(page);
+        if (pos + len > inode->i_size) {
+                vmtruncate(inode, inode->i_size);
+                /* 
+                 * If vmtruncate failed early the inode might still be
+                 * on the orphan list; we need to make sure the inode
+                 * is removed from the orphan list in that case.
+                 */
+                if (inode->i_nlink)
+                        ext4_orphan_del(NULL, inode);
+        }
        return ret ? ret : copied;
 }
@@ -1852,7 +1968,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 * @logical - first logical block to start assignment with
 *
 * the function goes through all passed space and put actual disk
- * block numbers into buffer heads, dropping BH_Delay
+ * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
 */
 static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
                                 struct buffer_head *exbh)
@@ -1902,16 +2018,24 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
                        do {
                                if (cur_logical >= logical + blocks)
                                        break;
-                                if (buffer_delay(bh)) {
-                                        bh->b_blocknr = pblock;
+                                if (buffer_delay(bh) ||
-                                        clear_buffer_delay(bh);
+                                                buffer_unwritten(bh)) {
-                                        bh->b_bdev = inode->i_sb->s_bdev;
-                                } else if (buffer_unwritten(bh)) {
+                                        BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
-                                        bh->b_blocknr = pblock;
-                                        clear_buffer_unwritten(bh);
+                                        if (buffer_delay(bh)) {
-                                        set_buffer_mapped(bh);
+                                                clear_buffer_delay(bh);
-                                        set_buffer_new(bh);
+                                                bh->b_blocknr = pblock;
-                                        bh->b_bdev = inode->i_sb->s_bdev;
+                                        } else {
+                                                /*
+                                                 * unwritten already should have
+                                                 * blocknr assigned. Verify that
+                                                 */
+                                                clear_buffer_unwritten(bh);
+                                                BUG_ON(bh->b_blocknr != pblock);
+                                        }
                                } else if (buffer_mapped(bh))
                                        BUG_ON(bh->b_blocknr != pblock);
@@ -1990,51 +2114,6 @@ static void ext4_print_free_blocks(struct inode *inode)
        return;
 }
-#define         EXT4_DELALLOC_RSVED     1
-static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
-                                   struct buffer_head *bh_result, int create)
-{
-        int ret;
-        unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
-        loff_t disksize = EXT4_I(inode)->i_disksize;
-        handle_t *handle = NULL;
-        handle = ext4_journal_current_handle();
-        BUG_ON(!handle);
-        ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
-                                   bh_result, create, 0, EXT4_DELALLOC_RSVED);
-        if (ret <= 0)
-                return ret;
-        bh_result->b_size = (ret << inode->i_blkbits);
-        if (ext4_should_order_data(inode)) {
-                int retval;
-                retval = ext4_jbd2_file_inode(handle, inode);
-                if (retval)
-                        /*
-                         * Failed to add inode for ordered mode. Don't
-                         * update file size
-                         */
-                        return retval;
-        }
-        /*
-         * Update on-disk size along with block allocation we don't
-         * use 'extend_disksize' as size may change within already
-         * allocated block -bzzz
-         */
-        disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
-        if (disksize > i_size_read(inode))
-                disksize = i_size_read(inode);
-        if (disksize > EXT4_I(inode)->i_disksize) {
-                ext4_update_i_disksize(inode, disksize);
-                ret = ext4_mark_inode_dirty(handle, inode);
-                return ret;
-        }
-        return 0;
-}
 /*
 * mpage_da_map_blocks - go through given space
 *
@@ -2045,29 +2124,57 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
 */
 static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 {
-        int err = 0;
+        int err, blks, get_blocks_flags;
        struct buffer_head new;
-        sector_t next;
+        sector_t next = mpd->b_blocknr;
+        unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
+        loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
+        handle_t *handle = NULL;
        /*
         * We consider only non-mapped and non-allocated blocks
         */
        if ((mpd->b_state  & (1 << BH_Mapped)) &&
-            !(mpd->b_state & (1 << BH_Delay)))
+                !(mpd->b_state & (1 << BH_Delay)) &&
+                !(mpd->b_state & (1 << BH_Unwritten)))
                return 0;
-        new.b_state = mpd->b_state;
-        new.b_blocknr = 0;
-        new.b_size = mpd->b_size;
-        next = mpd->b_blocknr;
        /*
-         * If we didn't accumulate anything
+         * If we didn't accumulate anything to write simply return
-         * to write simply return
         */
-        if (!new.b_size)
+        if (!mpd->b_size)
                return 0;
-        err = ext4_da_get_block_write(mpd->inode, next, &new, 1);
+        handle = ext4_journal_current_handle();
-        if (err) {
+        BUG_ON(!handle);
+        /*
+         * Call ext4_get_blocks() to allocate any delayed allocation
+         * blocks, or to convert an uninitialized extent to be
+         * initialized (in the case where we have written into
+         * one or more preallocated blocks).
+         *
+         * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
+         * indicate that we are on the delayed allocation path.  This
+         * affects functions in many different parts of the allocation
+         * call path.  This flag exists primarily because we don't
+         * want to change *many* call functions, so ext4_get_blocks()
+         * will set the magic i_delalloc_reserved_flag once the
+         * inode's allocation semaphore is taken.
+         *
+         * If the blocks in question were delalloc blocks, also set
+         * EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE so the delalloc accounting
+         * variables are updated after the blocks have been allocated.
+         */
+        new.b_state = 0;
+        get_blocks_flags = (EXT4_GET_BLOCKS_CREATE |
+                            EXT4_GET_BLOCKS_DELALLOC_RESERVE);
+        if (mpd->b_state & (1 << BH_Delay))
+                get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE;
+        blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
+                               &new, get_blocks_flags);
+        if (blks < 0) {
+                err = blks;
                /*
                 * If get block returns with error we simply
                 * return. Later writepage will redirty the page and
@@ -2100,12 +2207,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
                if (err == -ENOSPC) {
                        ext4_print_free_blocks(mpd->inode);
                }
-                /* invlaidate all the pages */
+                /* invalidate all the pages */
                ext4_da_block_invalidatepages(mpd, next,
                                mpd->b_size >> mpd->inode->i_blkbits);
                return err;
        }
-        BUG_ON(new.b_size == 0);
+        BUG_ON(blks == 0);
+        new.b_size = (blks << mpd->inode->i_blkbits);
        if (buffer_new(&new))
                __unmap_underlying_blocks(mpd->inode, &new);
@@ -2118,6 +2227,23 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
            (mpd->b_state & (1 << BH_Unwritten)))
                mpage_put_bnr_to_bhs(mpd, next, &new);
+        if (ext4_should_order_data(mpd->inode)) {
+                err = ext4_jbd2_file_inode(handle, mpd->inode);
+                if (err)
+                        return err;
+        }
+        /*
+         * Update on-disk size along with block allocation.
+         */
+        disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
+        if (disksize > i_size_read(mpd->inode))
+                disksize = i_size_read(mpd->inode);
+        if (disksize > EXT4_I(mpd->inode)->i_disksize) {
+                ext4_update_i_disksize(mpd->inode, disksize);
+                return ext4_mark_inode_dirty(handle, mpd->inode);
+        }
        return 0;
 }
@@ -2192,6 +2318,17 @@ flush_it:
        return;
 }
+static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+{
+        /*
+         * Dirty buffers only: unmapped is possible for holes, delay
+         * is possible with delayed allocation, and unwritten buffers
+         * are treated like unmapped ones.  'handle' is not used here.
+         */
+        return (!buffer_mapped(bh) || buffer_delay(bh) ||
+                                buffer_unwritten(bh)) && buffer_dirty(bh);
+}
 /*
 * __mpage_da_writepage - finds extent of pages and blocks
 *
@@ -2276,8 +2413,7 @@ static int __mpage_da_writepage(struct page *page,
                         * Otherwise we won't make progress
                         * with the page in ext4_da_writepage
                         */
-                        if (buffer_dirty(bh) &&
+                        if (ext4_bh_unmapped_or_delay(NULL, bh)) {
-                            (!buffer_mapped(bh) || buffer_delay(bh))) {
                                mpage_add_bh_to_extent(mpd, logical,
                                                       bh->b_size,
                                                       bh->b_state);
@@ -2303,8 +2439,16 @@ static int __mpage_da_writepage(struct page *page,
 }
 /*
- * this is a special callback for ->write_begin() only
+ * This is a special get_blocks_t callback which is used by
- * it's intention is to return mapped block or reserve space
+ * ext4_da_write_begin().  It will either return mapped block or
+ * reserve space for a single block.
+ *
+ * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
+ * We also have b_blocknr = -1 and b_bdev initialized properly
+ *
+ * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
+ * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
+ * initialized properly.
 */
 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                                  struct buffer_head *bh_result, int create)
@@ -2323,7 +2467,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
         * preallocated blocks are unmapped but should treated
         * the same as allocated blocks.
         */
-        ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1,  bh_result, 0, 0, 0);
+        ret = ext4_get_blocks(NULL, inode, iblock, 1,  bh_result, 0);
        if ((ret == 0) && !buffer_delay(bh_result)) {
                /* the block isn't (pre)allocated yet, let's reserve space */
                /*
@@ -2340,40 +2484,53 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                set_buffer_delay(bh_result);
        } else if (ret > 0) {
                bh_result->b_size = (ret << inode->i_blkbits);
-                /*
+                if (buffer_unwritten(bh_result)) {
-                 * With sub-block writes into unwritten extents
+                        /* A delayed write to unwritten bh should
-                 * we also need to mark the buffer as new so that
+                         * be marked new and mapped.  Mapped ensures
-                 * the unwritten parts of the buffer gets correctly zeroed.
+                         * that we don't do get_block multiple times
-                 */
+                         * when we write to the same offset and new
-                if (buffer_unwritten(bh_result))
+                         * ensures that we do proper zero out for
+                         * partial write.
+                         */
                        set_buffer_new(bh_result);
+                        set_buffer_mapped(bh_result);
+                }
                ret = 0;
        }
        return ret;
 }
-static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+/*
-{
+ * This function is used as a standard get_block_t callback function
-        /*
+ * when there is no desire to allocate any blocks.  It is used as a
-         * unmapped buffer is possible for holes.
+ * callback function for block_prepare_write(), nobh_writepage(), and
-         * delay buffer is possible with delayed allocation
+ * block_write_full_page().  These functions should only try to map a
-         */
+ * single block at a time.
-        return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh));
+ *
-}
+ * Since this function doesn't do block allocations even if the caller
+ * requests it by passing in create=1, it is critically important that
-static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
+ * any caller checks to make sure that any buffer heads returned
+ * by this function are either all already mapped or marked for
+ * delayed allocation before calling nobh_writepage() or
+ * block_write_full_page().  Otherwise, b_blocknr could be left
+ * uninitialized, and the page write functions will be taken by
+ * surprise.
+ */
+static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
                                   struct buffer_head *bh_result, int create)
 {
        int ret = 0;
        unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+        BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
        /*
         * we don't want to do block allocation in writepage
         * so call get_block_wrap with create = 0
         */
-        ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks,
+        ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
-                                   bh_result, 0, 0, 0);
+        BUG_ON(create && ret == 0);
        if (ret > 0) {
                bh_result->b_size = (ret << inode->i_blkbits);
                ret = 0;
@@ -2382,10 +2539,11 @@ static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
 }
 /*
- * get called vi ext4_da_writepages after taking page lock (have journal handle)
+ * This function can get called via...
- * get called via journal_submit_inode_data_buffers (no journal handle)
+ *   - ext4_da_writepages after taking page lock (have journal handle)
- * get called via shrink_page_list via pdflush (no journal handle)
+ *   - journal_submit_inode_data_buffers (no journal handle)
- * or grab_page_cache when doing write_begin (have journal handle)
+ *   - shrink_page_list via pdflush (no journal handle)
+ *   - grab_page_cache when doing write_begin (have journal handle)
 */
 static int ext4_da_writepage(struct page *page,
                                struct writeback_control *wbc)
@@ -2436,7 +2594,7 @@ static int ext4_da_writepage(struct page *page,
                 * do block allocation here.
                 */
                ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
-                                                ext4_normal_get_block_write);
+                                          noalloc_get_block_write);
                if (!ret) {
                        page_bufs = page_buffers(page);
                        /* check whether all are mapped and non delay */
@@ -2461,11 +2619,10 @@ static int ext4_da_writepage(struct page *page,
        }
        if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
-                ret = nobh_writepage(page, ext4_normal_get_block_write, wbc);
+                ret = nobh_writepage(page, noalloc_get_block_write, wbc);
        else
-                ret = block_write_full_page(page,
+                ret = block_write_full_page(page, noalloc_get_block_write,
-                                                ext4_normal_get_block_write,
+                                            wbc);
-                                                wbc);
        return ret;
 }
@@ -2777,7 +2934,7 @@ retry:
        *pagep = page;
        ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-                                                        ext4_da_get_block_prep);
+                                ext4_da_get_block_prep);
        if (ret < 0) {
                unlock_page(page);
                ext4_journal_stop(handle);
@@ -2815,7 +2972,7 @@ static int ext4_da_should_update_i_disksize(struct page *page,
        for (i = 0; i < idx; i++)
                bh = bh->b_this_page;
-        if (!buffer_mapped(bh) || (buffer_delay(bh)))
+        if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
                return 0;
        return 1;
 }
@@ -3085,12 +3242,10 @@ static int __ext4_normal_writepage(struct page *page,
        struct inode *inode = page->mapping->host;
        if (test_opt(inode->i_sb, NOBH))
-                return nobh_writepage(page,
+                return nobh_writepage(page, noalloc_get_block_write, wbc);
-                                        ext4_normal_get_block_write, wbc);
        else
-                return block_write_full_page(page,
+                return block_write_full_page(page, noalloc_get_block_write,
-                                                ext4_normal_get_block_write,
+                                             wbc);
-                                                wbc);
 }
 static int ext4_normal_writepage(struct page *page,
@@ -3142,7 +3297,7 @@ static int __ext4_journalled_writepage(struct page *page,
        int err;
        ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
-                                        ext4_normal_get_block_write);
+                                  noalloc_get_block_write);
        if (ret != 0)
                goto out_unlock;
@@ -3227,9 +3382,8 @@ static int ext4_journalled_writepage(struct page *page,
                 * really know unless we go poke around in the buffer_heads.
                 * But block_write_full_page will do the right thing.
                 */
-                return block_write_full_page(page,
+                return block_write_full_page(page, noalloc_get_block_write,
-                                                ext4_normal_get_block_write,
+                                             wbc);
-                                                wbc);
        }
 no_write:
        redirty_page_for_writepage(wbc, page);
@@ -3973,7 +4127,8 @@ void ext4_truncate(struct inode *inode)
        if (!ext4_can_truncate(inode))
                return;
-        if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+        if (ei->i_disksize && inode->i_size == 0 &&
+            !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
                ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
@@ -4715,25 +4870,6 @@ int ext4_write_inode(struct inode *inode, int wait)
        return ext4_force_commit(inode->i_sb);
 }
-int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh)
-{
-        int err = 0;
-        mark_buffer_dirty(bh);
-        if (inode && inode_needs_sync(inode)) {
-                sync_dirty_buffer(bh);
-                if (buffer_req(bh) && !buffer_uptodate(bh)) {
-                        ext4_error(inode->i_sb, __func__,
-                                   "IO error syncing inode, "
-                                   "inode=%lu, block=%llu",
-                                   inode->i_ino,
-                                   (unsigned long long)bh->b_blocknr);
-                        err = -EIO;
-                }
-        }
-        return err;
-}
 /*
 * ext4_setattr()
 *
@@ -4930,7 +5066,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 */
 int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 {
-        int groups, gdpblocks;
+        ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
+        int gdpblocks;
        int idxblocks;
        int ret = 0;
@@ -4957,8 +5094,8 @@ int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
                groups += nrblocks;
        gdpblocks = groups;
-        if (groups > EXT4_SB(inode->i_sb)->s_groups_count)
+        if (groups > ngroups)
-                groups = EXT4_SB(inode->i_sb)->s_groups_count;
+                groups = ngroups;
        if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
                gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
@@ -4998,7 +5135,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
 * Calculate the journal credits for a chunk of data modification.
 *
 * This is called from DIO, fallocate or whoever calling
- * ext4_get_blocks_wrap() to map/allocate a chunk of contigous disk blocks.
+ * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks.
 *
 * journal buffers for data blocks are not included here, as DIO
 * and fallocate do no need to journal data buffers.
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f871677a7984..ed8482e22c0e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -372,24 +372,12 @@ static inline void mb_set_bit(int bit, void *addr)
        ext4_set_bit(bit, addr);
 }
-static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr)
-{
-        addr = mb_correct_addr_and_bit(&bit, addr);
-        ext4_set_bit_atomic(lock, bit, addr);
-}
 static inline void mb_clear_bit(int bit, void *addr)
 {
        addr = mb_correct_addr_and_bit(&bit, addr);
        ext4_clear_bit(bit, addr);
 }
-static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
-{
-        addr = mb_correct_addr_and_bit(&bit, addr);
-        ext4_clear_bit_atomic(lock, bit, addr);
-}
 static inline int mb_find_next_zero_bit(void *addr, int max, int start)
 {
        int fix = 0, ret, tmpmax;
@@ -448,7 +436,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
                return;
-        BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group));
+        assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
        for (i = 0; i < count; i++) {
                if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
                        ext4_fsblk_t blocknr;
@@ -472,7 +460,7 @@ static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
                return;
-        BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
+        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
        for (i = 0; i < count; i++) {
                BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
                mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
@@ -739,6 +727,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
 static int ext4_mb_init_cache(struct page *page, char *incore)
 {
+        ext4_group_t ngroups;
        int blocksize;
        int blocks_per_page;
        int groups_per_page;
@@ -757,6 +746,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
        inode = page->mapping->host;
        sb = inode->i_sb;
+        ngroups = ext4_get_groups_count(sb);
        blocksize = 1 << inode->i_blkbits;
        blocks_per_page = PAGE_CACHE_SIZE / blocksize;
@@ -780,7 +770,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
        for (i = 0; i < groups_per_page; i++) {
                struct ext4_group_desc *desc;
-                if (first_group + i >= EXT4_SB(sb)->s_groups_count)
+                if (first_group + i >= ngroups)
                        break;
                err = -EIO;
@@ -801,17 +791,17 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
                        unlock_buffer(bh[i]);
                        continue;
                }
-                spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
+                ext4_lock_group(sb, first_group + i);
                if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
                        ext4_init_block_bitmap(sb, bh[i],
                                                first_group + i, desc);
                        set_bitmap_uptodate(bh[i]);
                        set_buffer_uptodate(bh[i]);
-                        spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
+                        ext4_unlock_group(sb, first_group + i);
                        unlock_buffer(bh[i]);
                        continue;
                }
-                spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
+                ext4_unlock_group(sb, first_group + i);
                if (buffer_uptodate(bh[i])) {
                        /*
                         * if not uninit if bh is uptodate,
@@ -852,7 +842,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
                struct ext4_group_info *grinfo;
                group = (first_block + i) >> 1;
-                if (group >= EXT4_SB(sb)->s_groups_count)
+                if (group >= ngroups)
                        break;
                /*
@@ -1078,7 +1068,7 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
        return 0;
 }
-static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
+static void mb_clear_bits(void *bm, int cur, int len)
 {
        __u32 *addr;
@@ -1091,15 +1081,12 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
                        cur += 32;
                        continue;
                }
-                if (lock)
+                mb_clear_bit(cur, bm);
-                        mb_clear_bit_atomic(lock, cur, bm);
-                else
-                        mb_clear_bit(cur, bm);
                cur++;
        }
 }
-static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
+static void mb_set_bits(void *bm, int cur, int len)
 {
        __u32 *addr;
@@ -1112,10 +1099,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
                        cur += 32;
                        continue;
                }
-                if (lock)
+                mb_set_bit(cur, bm);
-                        mb_set_bit_atomic(lock, cur, bm);
-                else
-                        mb_set_bit(cur, bm);
                cur++;
        }
 }
@@ -1131,7 +1115,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
        struct super_block *sb = e4b->bd_sb;
        BUG_ON(first + count > (sb->s_blocksize << 3));
-        BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group));
+        assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
        mb_check_buddy(e4b);
        mb_free_blocks_double(inode, e4b, first, count);
@@ -1212,7 +1196,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
        int ord;
        void *buddy;
-        BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
+        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
        BUG_ON(ex == NULL);
        buddy = mb_find_buddy(e4b, order, &max);
@@ -1276,7 +1260,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
        BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
        BUG_ON(e4b->bd_group != ex->fe_group);
-        BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
+        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
        mb_check_buddy(e4b);
        mb_mark_used_double(e4b, start, len);
@@ -1330,8 +1314,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
                e4b->bd_info->bb_counters[ord]++;
        }
-        mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group),
+        mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
-                        EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
        mb_check_buddy(e4b);
        return ret;
@@ -1726,7 +1709,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
        unsigned free, fragments;
        unsigned i, bits;
        int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
-        struct ext4_group_desc *desc;
        struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
        BUG_ON(cr < 0 || cr >= 4);
@@ -1742,10 +1724,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
        switch (cr) {
        case 0:
                BUG_ON(ac->ac_2order == 0);
-                /* If this group is uninitialized, skip it initially */
-                desc = ext4_get_group_desc(ac->ac_sb, group, NULL);
-                if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
-                        return 0;
                /* Avoid using the first bg of a flexgroup for data files */
                if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
@@ -1788,6 +1766,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
        int block, pnum;
        int blocks_per_page;
        int groups_per_page;
+        ext4_group_t ngroups = ext4_get_groups_count(sb);
        ext4_group_t first_group;
        struct ext4_group_info *grp;
@@ -1807,7 +1786,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
        /* read all groups the page covers into the cache */
        for (i = 0; i < groups_per_page; i++) {
-                if ((first_group + i) >= EXT4_SB(sb)->s_groups_count)
+                if ((first_group + i) >= ngroups)
                        break;
                grp = ext4_get_group_info(sb, first_group + i);
                /* take all groups write allocation
@@ -1945,8 +1924,7 @@ err:
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
-        ext4_group_t group;
+        ext4_group_t ngroups, group, i;
-        ext4_group_t i;
        int cr;
        int err = 0;
        int bsbits;
@@ -1957,6 +1935,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
        sb = ac->ac_sb;
        sbi = EXT4_SB(sb);
+        ngroups = ext4_get_groups_count(sb);
        BUG_ON(ac->ac_status == AC_STATUS_FOUND);
        /* first, try the goal */
@@ -2017,11 +1996,11 @@ repeat:
                 */
                group = ac->ac_g_ex.fe_group;
-                for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
+                for (i = 0; i < ngroups; group++, i++) {
                        struct ext4_group_info *grp;
                        struct ext4_group_desc *desc;
-                        if (group == EXT4_SB(sb)->s_groups_count)
+                        if (group == ngroups)
                                group = 0;
                        /* quick check to skip empty groups */
@@ -2064,9 +2043,7 @@ repeat:
                        ac->ac_groups_scanned++;
                        desc = ext4_get_group_desc(sb, group, NULL);
-                        if (cr == 0 || (desc->bg_flags &
+                        if (cr == 0)
-                                        cpu_to_le16(EXT4_BG_BLOCK_UNINIT) &&
-                                        ac->ac_2order != 0))
                                ext4_mb_simple_scan_group(ac, &e4b);
                        else if (cr == 1 &&
                                        ac->ac_g_ex.fe_len == sbi->s_stripe)
@@ -2315,12 +2292,10 @@ static struct file_operations ext4_mb_seq_history_fops = {
 static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
 {
        struct super_block *sb = seq->private;
-        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_group_t group;
-        if (*pos < 0 || *pos >= sbi->s_groups_count)
+        if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
                return NULL;
        group = *pos + 1;
        return (void *) ((unsigned long) group);
 }
@@ -2328,11 +2303,10 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
 static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
 {
        struct super_block *sb = seq->private;
-        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_group_t group;
        ++*pos;
-        if (*pos < 0 || *pos >= sbi->s_groups_count)
+        if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
                return NULL;
        group = *pos + 1;
        return (void *) ((unsigned long) group);
@@ -2420,7 +2394,8 @@ static void ext4_mb_history_release(struct super_block *sb)
        if (sbi->s_proc != NULL) {
                remove_proc_entry("mb_groups", sbi->s_proc);
-                remove_proc_entry("mb_history", sbi->s_proc);
+                if (sbi->s_mb_history_max)
+                        remove_proc_entry("mb_history", sbi->s_proc);
        }
        kfree(sbi->s_mb_history);
 }
@@ -2431,17 +2406,17 @@ static void ext4_mb_history_init(struct super_block *sb)
        int i;
        if (sbi->s_proc != NULL) {
-                proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
+                if (sbi->s_mb_history_max)
-                                 &ext4_mb_seq_history_fops, sb);
+                        proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
+                                         &ext4_mb_seq_history_fops, sb);
                proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
                                 &ext4_mb_seq_groups_fops, sb);
        }
-        sbi->s_mb_history_max = 1000;
        sbi->s_mb_history_cur = 0;
        spin_lock_init(&sbi->s_mb_history_lock);
        i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
-        sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
+        sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL;
        /* if we can't allocate history, then we simple won't use it */
 }
@@ -2451,7 +2426,7 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        struct ext4_mb_history h;
-        if (unlikely(sbi->s_mb_history == NULL))
+        if (sbi->s_mb_history == NULL)
                return;
        if (!(ac->ac_op & sbi->s_mb_history_filter))
@@ -2587,6 +2562,7 @@ void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
 static int ext4_mb_init_backend(struct super_block *sb)
 {
+        ext4_group_t ngroups = ext4_get_groups_count(sb);
        ext4_group_t i;
        int metalen;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -2598,7 +2574,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
        struct ext4_group_desc *desc;
        /* This is the number of blocks used by GDT */
-        num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) -
+        num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
                                1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
        /*
@@ -2644,7 +2620,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
        for (i = 0; i < num_meta_group_infos; i++) {
                if ((i + 1) == num_meta_group_infos)
                        metalen = sizeof(*meta_group_info) *
-                                (sbi->s_groups_count -
+                                (ngroups -
                                        (i << EXT4_DESC_PER_BLOCK_BITS(sb)));
                meta_group_info = kmalloc(metalen, GFP_KERNEL);
                if (meta_group_info == NULL) {
@@ -2655,7 +2631,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
                sbi->s_group_info[i] = meta_group_info;
        }
-        for (i = 0; i < sbi->s_groups_count; i++) {
+        for (i = 0; i < ngroups; i++) {
                desc = ext4_get_group_desc(sb, i, NULL);
                if (desc == NULL) {
                        printk(KERN_ERR
@@ -2761,7 +2737,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        return 0;
 }
-/* need to called with ext4 group lock (ext4_lock_group) */
+/* needs to be called with the ext4 group lock held */
 static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
 {
        struct ext4_prealloc_space *pa;
@@ -2781,13 +2757,14 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
 int ext4_mb_release(struct super_block *sb)
 {
+        ext4_group_t ngroups = ext4_get_groups_count(sb);
        ext4_group_t i;
        int num_meta_group_infos;
        struct ext4_group_info *grinfo;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        if (sbi->s_group_info) {
-                for (i = 0; i < sbi->s_groups_count; i++) {
+                for (i = 0; i < ngroups; i++) {
                        grinfo = ext4_get_group_info(sb, i);
 #ifdef DOUBLE_CHECK
                        kfree(grinfo->bb_bitmap);
@@ -2797,7 +2774,7 @@ int ext4_mb_release(struct super_block *sb)
                        ext4_unlock_group(sb, i);
                        kfree(grinfo);
                }
-                num_meta_group_infos = (sbi->s_groups_count +
+                num_meta_group_infos = (ngroups +
                                EXT4_DESC_PER_BLOCK(sb) - 1) >>
                        EXT4_DESC_PER_BLOCK_BITS(sb);
                for (i = 0; i < num_meta_group_infos; i++)
@@ -2984,27 +2961,25 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
                + le32_to_cpu(es->s_first_data_block);
        len = ac->ac_b_ex.fe_len;
-        if (in_range(ext4_block_bitmap(sb, gdp), block, len) ||
+        if (!ext4_data_block_valid(sbi, block, len)) {
-            in_range(ext4_inode_bitmap(sb, gdp), block, len) ||
-            in_range(block, ext4_inode_table(sb, gdp),
-                     EXT4_SB(sb)->s_itb_per_group) ||
-            in_range(block + len - 1, ext4_inode_table(sb, gdp),
-                     EXT4_SB(sb)->s_itb_per_group)) {
                ext4_error(sb, __func__,
-                           "Allocating block %llu in system zone of %d group\n",
+                           "Allocating blocks %llu-%llu which overlap "
-                           block, ac->ac_b_ex.fe_group);
+                           "fs metadata\n", block, block+len);
                /* File system mounted not to panic on error
                 * Fix the bitmap and repeat the block allocation
                 * We leak some of the blocks here.
                 */
-                mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group),
+                ext4_lock_group(sb, ac->ac_b_ex.fe_group);
-                                bitmap_bh->b_data, ac->ac_b_ex.fe_start,
+                mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
-                                ac->ac_b_ex.fe_len);
+                            ac->ac_b_ex.fe_len);
+                ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
                err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
                if (!err)
                        err = -EAGAIN;
                goto out_err;
        }
+        ext4_lock_group(sb, ac->ac_b_ex.fe_group);
 #ifdef AGGRESSIVE_CHECK
        {
                int i;
@@ -3014,9 +2989,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
                }
        }
 #endif
-        spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
+        mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len);
-        mb_set_bits(NULL, bitmap_bh->b_data,
-                                ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
        if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
                gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
                ext4_free_blks_set(sb, gdp,
@@ -3026,7 +2999,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
        len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
        ext4_free_blks_set(sb, gdp, len);
        gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
-        spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
+        ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
        percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
        /*
         * Now reduce the dirty block count also. Should not go negative
@@ -3459,7 +3433,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 * the function goes through all block freed in the group
 * but not yet committed and marks them used in in-core bitmap.
 * buddy must be generated from this bitmap
- * Need to be called with ext4 group lock (ext4_lock_group)
+ * Need to be called with the ext4 group lock held
 */
 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
                                                ext4_group_t group)
@@ -3473,9 +3447,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
        while (n) {
                entry = rb_entry(n, struct ext4_free_data, node);
-                mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
+                mb_set_bits(bitmap, entry->start_blk, entry->count);
-                                bitmap, entry->start_blk,
-                                entry->count);
                n = rb_next(n);
        }
        return;
@@ -3484,7 +3456,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 /*
 * the function goes through all preallocation in this group and marks them
 * used in in-core bitmap. buddy must be generated from this bitmap
- * Need to be called with ext4 group lock (ext4_lock_group)
+ * Need to be called with ext4 group lock held
 */
 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                                        ext4_group_t group)
@@ -3516,8 +3488,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                if (unlikely(len == 0))
                        continue;
                BUG_ON(groupnr != group);
-                mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
+                mb_set_bits(bitmap, start, len);
-                                                bitmap, start, len);
                preallocated += len;
                count++;
        }
@@ -4121,7 +4092,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode,
 static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
 {
        struct super_block *sb = ac->ac_sb;
-        ext4_group_t i;
+        ext4_group_t ngroups, i;
        printk(KERN_ERR "EXT4-fs: Can't allocate:"
                        " Allocation context details:\n");
@@ -4145,7 +4116,8 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
        printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned,
                ac->ac_found);
        printk(KERN_ERR "EXT4-fs: groups: \n");
-        for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+        ngroups = ext4_get_groups_count(sb);
+        for (i = 0; i < ngroups; i++) {
                struct ext4_group_info *grp = ext4_get_group_info(sb, i);
                struct ext4_prealloc_space *pa;
                ext4_grpblk_t start;
@@ -4469,13 +4441,13 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
 {
-        ext4_group_t i;
+        ext4_group_t i, ngroups = ext4_get_groups_count(sb);
        int ret;
        int freed = 0;
        trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d",
                   sb->s_id, needed);
-        for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) {
+        for (i = 0; i < ngroups && needed > 0; i++) {
                ret = ext4_mb_discard_group_preallocations(sb, i, needed);
                freed += ret;
                needed -= ret;
@@ -4859,29 +4831,25 @@ do_more:
                new_entry->group  = block_group;
                new_entry->count = count;
                new_entry->t_tid = handle->h_transaction->t_tid;
                ext4_lock_group(sb, block_group);
-                mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
+                mb_clear_bits(bitmap_bh->b_data, bit, count);
-                                bit, count);
                ext4_mb_free_metadata(handle, &e4b, new_entry);
-                ext4_unlock_group(sb, block_group);
        } else {
-                ext4_lock_group(sb, block_group);
                /* need to update group_info->bb_free and bitmap
                 * with group lock held. generate_buddy look at
                 * them with group lock_held
                 */
-                mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
+                ext4_lock_group(sb, block_group);
-                                bit, count);
+                mb_clear_bits(bitmap_bh->b_data, bit, count);
                mb_free_blocks(inode, &e4b, bit, count);
                ext4_mb_return_to_preallocation(inode, &e4b, block, count);
-                ext4_unlock_group(sb, block_group);
        }
-        spin_lock(sb_bgl_lock(sbi, block_group));
        ret = ext4_free_blks_count(sb, gdp) + count;
        ext4_free_blks_set(sb, gdp, ret);
        gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
-        spin_unlock(sb_bgl_lock(sbi, block_group));
+        ext4_unlock_group(sb, block_group);
        percpu_counter_add(&sbi->s_freeblocks_counter, count);
        if (sbi->s_log_groups_per_flex) {
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index dd9e6cd5f6cf..75e34f69215b 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -23,7 +23,6 @@
 #include <linux/mutex.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
-#include "group.h"
 /*
 * with AGGRESSIVE_CHECK allocator runs consistency checks over
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 22098e1cd085..07eb6649e4fa 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -37,7 +37,6 @@
 #include "ext4.h"
 #include "ext4_jbd2.h"
-#include "namei.h"
 #include "xattr.h"
 #include "acl.h"
@@ -750,7 +749,7 @@ static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
                        ext4fs_dirhash(de->name, de->name_len, &h);
                        map_tail--;
                        map_tail->hash = h.hash;
-                        map_tail->offs = (u16) ((char *) de - base);
+                        map_tail->offs = ((char *) de - base)>>2;
                        map_tail->size = le16_to_cpu(de->rec_len);
                        count++;
                        cond_resched();
@@ -1148,7 +1147,8 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
        unsigned rec_len = 0;
        while (count--) {
-                struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + map->offs);
+                struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) 
+                                                (from + (map->offs<<2));
                rec_len = EXT4_DIR_REC_LEN(de->name_len);
                memcpy (to, de, rec_len);
                ((struct ext4_dir_entry_2 *) to)->rec_len =
@@ -1997,7 +1997,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
        if (!ext4_handle_valid(handle))
                return 0;
-        lock_super(sb);
+        mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
        if (!list_empty(&EXT4_I(inode)->i_orphan))
                goto out_unlock;
@@ -2006,9 +2006,13 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
        /* @@@ FIXME: Observation from aviro:
         * I think I can trigger J_ASSERT in ext4_orphan_add().  We block
-         * here (on lock_super()), so race with ext4_link() which might bump
+         * here (on s_orphan_lock), so race with ext4_link() which might bump
         * ->i_nlink. For, say it, character device. Not a regular file,
         * not a directory, not a symlink and ->i_nlink > 0.
+         *
+         * tytso, 4/25/2009: I'm not sure how that could happen;
+         * shouldn't the fs core protect us from these sort of
+         * unlink()/link() races?
         */
        J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
                  S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
@@ -2045,7 +2049,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
        jbd_debug(4, "orphan inode %lu will point to %d\n",
                        inode->i_ino, NEXT_ORPHAN(inode));
 out_unlock:
-        unlock_super(sb);
+        mutex_unlock(&EXT4_SB(sb)->s_orphan_lock);
        ext4_std_error(inode->i_sb, err);
        return err;
 }
@@ -2066,11 +2070,9 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
        if (!ext4_handle_valid(handle))
                return 0;
-        lock_super(inode->i_sb);
+        mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
-        if (list_empty(&ei->i_orphan)) {
+        if (list_empty(&ei->i_orphan))
-                unlock_super(inode->i_sb);
+                goto out;
-                return 0;
-        }
        ino_next = NEXT_ORPHAN(inode);
        prev = ei->i_orphan.prev;
@@ -2120,7 +2122,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
 out_err:
        ext4_std_error(inode->i_sb, err);
 out:
-        unlock_super(inode->i_sb);
+        mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
        return err;
 out_brelse:
@@ -2533,6 +2535,7 @@ const struct inode_operations ext4_dir_inode_operations = {
        .removexattr    = generic_removexattr,
 #endif
        .permission     = ext4_permission,
+        .fiemap         = ext4_fiemap,
 };
 const struct inode_operations ext4_special_inode_operations = {
diff --git a/fs/ext4/namei.h b/fs/ext4/namei.h
deleted file mode 100644
index 5e4dfff36a00..000000000000
--- a/fs/ext4/namei.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/*  linux/fs/ext4/namei.h
- *
- * Copyright (C) 2005 Simtec Electronics
- *      Ben Dooks <ben@simtec.co.uk>
- *
-*/
-extern struct dentry *ext4_get_parent(struct dentry *child);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 546c7dd869e1..27eb289eea37 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -15,7 +15,6 @@
 #include <linux/slab.h>
 #include "ext4_jbd2.h"
-#include "group.h"
 #define outside(b, first, last) ((b) < (first) || (b) >= (last))
 #define inside(b, first, last)  ((b) >= (first) && (b) < (last))
@@ -193,7 +192,7 @@ static int setup_new_group_blocks(struct super_block *sb,
        if (IS_ERR(handle))
                return PTR_ERR(handle);
-        lock_super(sb);
+        mutex_lock(&sbi->s_resize_lock);
        if (input->group != sbi->s_groups_count) {
                err = -EBUSY;
                goto exit_journal;
@@ -302,7 +301,7 @@ exit_bh:
        brelse(bh);
 exit_journal:
-        unlock_super(sb);
+        mutex_unlock(&sbi->s_resize_lock);
        if ((err2 = ext4_journal_stop(handle)) && !err)
                err = err2;
@@ -643,11 +642,12 @@ exit_free:
 * important part is that the new block and inode counts are in the backup
 * superblocks, and the location of the new group metadata in the GDT backups.
 *
- * We do not need lock_super() for this, because these blocks are not
+ * We do not need to take the s_resize_lock for this, because these
- * otherwise touched by the filesystem code when it is mounted.  We don't
+ * blocks are not otherwise touched by the filesystem code when it is
- * need to worry about last changing from sbi->s_groups_count, because the
+ * mounted.  We don't need to worry about last changing from
- * worst that can happen is that we do not copy the full number of backups
+ * sbi->s_groups_count, because the worst that can happen is that we
- * at this time.  The resize which changed s_groups_count will backup again.
+ * do not copy the full number of backups at this time.  The resize
+ * which changed s_groups_count will backup again.
 */
 static void update_backups(struct super_block *sb,
                           int blk_off, char *data, int size)
@@ -809,7 +809,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
                goto exit_put;
        }
-        lock_super(sb);
+        mutex_lock(&sbi->s_resize_lock);
        if (input->group != sbi->s_groups_count) {
                ext4_warning(sb, __func__,
                             "multiple resizers run on filesystem!");
@@ -840,7 +840,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
        /*
         * OK, now we've set up the new group.  Time to make it active.
         *
-         * Current kernels don't lock all allocations via lock_super(),
+         * We do not lock all allocations via s_resize_lock
         * so we have to be safe wrt. concurrent accesses the group
         * data.  So we need to be careful to set all of the relevant
         * group descriptor data etc. *before* we enable the group.
@@ -900,12 +900,12 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
         *
         * The precise rules we use are:
         *
-         * * Writers of s_groups_count *must* hold lock_super
+         * * Writers of s_groups_count *must* hold s_resize_lock
         * AND
         * * Writers must perform a smp_wmb() after updating all dependent
         *   data and before modifying the groups count
         *
-         * * Readers must hold lock_super() over the access
+         * * Readers must hold s_resize_lock over the access
         * OR
         * * Readers must perform an smp_rmb() after reading the groups count
         *   and before reading any dependent data.
@@ -948,7 +948,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
        sb->s_dirt = 1;
 exit_journal:
-        unlock_super(sb);
+        mutex_unlock(&sbi->s_resize_lock);
        if ((err2 = ext4_journal_stop(handle)) && !err)
                err = err2;
        if (!err) {
@@ -986,7 +986,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
        /* We don't need to worry about locking wrt other resizers just
         * yet: we're going to revalidate es->s_blocks_count after
-         * taking lock_super() below. */
+         * taking the s_resize_lock below. */
        o_blocks_count = ext4_blocks_count(es);
        o_groups_count = EXT4_SB(sb)->s_groups_count;
@@ -1056,11 +1056,11 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
                goto exit_put;
        }
-        lock_super(sb);
+        mutex_lock(&EXT4_SB(sb)->s_resize_lock);
        if (o_blocks_count != ext4_blocks_count(es)) {
                ext4_warning(sb, __func__,
                             "multiple resizers run on filesystem!");
-                unlock_super(sb);
+                mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
                ext4_journal_stop(handle);
                err = -EBUSY;
                goto exit_put;
@@ -1070,14 +1070,14 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
                                                 EXT4_SB(sb)->s_sbh))) {
                ext4_warning(sb, __func__,
                             "error %d on journal write access", err);
-                unlock_super(sb);
+                mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
                ext4_journal_stop(handle);
                goto exit_put;
        }
        ext4_blocks_count_set(es, o_blocks_count + add);
        ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
        sb->s_dirt = 1;
-        unlock_super(sb);
+        mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
        ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
                   o_blocks_count + add);
        /* We add the blocks to the bitmap and set the group need init bit */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2958f4e6f222..012c4251397e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -20,6 +20,7 @@
 #include <linux/string.h>
 #include <linux/fs.h>
 #include <linux/time.h>
+#include <linux/vmalloc.h>
 #include <linux/jbd2.h>
 #include <linux/slab.h>
 #include <linux/init.h>
@@ -45,16 +46,20 @@
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
-#include "namei.h"
-#include "group.h"
+static int default_mb_history_length = 1000;
+module_param_named(default_mb_history_length, default_mb_history_length,
+                   int, 0644);
+MODULE_PARM_DESC(default_mb_history_length,
+                 "Default number of entries saved for mb_history");
 struct proc_dir_entry *ext4_proc_root;
 static struct kset *ext4_kset;
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
                             unsigned long journal_devnum);
-static int ext4_commit_super(struct super_block *sb,
+static int ext4_commit_super(struct super_block *sb, int sync);
-                              struct ext4_super_block *es, int sync);
 static void ext4_mark_recovery_complete(struct super_block *sb,
                                        struct ext4_super_block *es);
 static void ext4_clear_journal_err(struct super_block *sb,
@@ -74,7 +79,7 @@ ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
 {
        return le32_to_cpu(bg->bg_block_bitmap_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-                (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
+                 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
 }
 ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
@@ -82,7 +87,7 @@ ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
 {
        return le32_to_cpu(bg->bg_inode_bitmap_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-                (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
+                 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
 }
 ext4_fsblk_t ext4_inode_table(struct super_block *sb,
@@ -90,7 +95,7 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
 {
        return le32_to_cpu(bg->bg_inode_table_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-                (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
+                 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
 }
 __u32 ext4_free_blks_count(struct super_block *sb,
@@ -98,7 +103,7 @@ __u32 ext4_free_blks_count(struct super_block *sb,
 {
        return le16_to_cpu(bg->bg_free_blocks_count_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-                (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
+                 (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
 }
 __u32 ext4_free_inodes_count(struct super_block *sb,
@@ -106,7 +111,7 @@ __u32 ext4_free_inodes_count(struct super_block *sb,
 {
        return le16_to_cpu(bg->bg_free_inodes_count_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-                (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
+                 (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
 }
 __u32 ext4_used_dirs_count(struct super_block *sb,
@@ -114,7 +119,7 @@ __u32 ext4_used_dirs_count(struct super_block *sb,
 {
        return le16_to_cpu(bg->bg_used_dirs_count_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-                (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
+                 (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
 }
 __u32 ext4_itable_unused_count(struct super_block *sb,
@@ -122,7 +127,7 @@ __u32 ext4_itable_unused_count(struct super_block *sb,
 {
        return le16_to_cpu(bg->bg_itable_unused_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-                (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
+                 (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
 }
 void ext4_block_bitmap_set(struct super_block *sb,
@@ -202,8 +207,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
        journal = EXT4_SB(sb)->s_journal;
        if (journal) {
                if (is_journal_aborted(journal)) {
-                        ext4_abort(sb, __func__,
+                        ext4_abort(sb, __func__, "Detected aborted journal");
-                                   "Detected aborted journal");
                        return ERR_PTR(-EROFS);
                }
                return jbd2_journal_start(journal, nblocks);
@@ -302,10 +306,10 @@ static void ext4_handle_error(struct super_block *sb)
                        jbd2_journal_abort(journal, -EIO);
        }
        if (test_opt(sb, ERRORS_RO)) {
-                printk(KERN_CRIT "Remounting filesystem read-only\n");
+                ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
                sb->s_flags |= MS_RDONLY;
        }
-        ext4_commit_super(sb, es, 1);
+        ext4_commit_super(sb, 1);
        if (test_opt(sb, ERRORS_PANIC))
                panic("EXT4-fs (device %s): panic forced after error\n",
                        sb->s_id);
@@ -395,8 +399,6 @@ void ext4_abort(struct super_block *sb, const char *function,
 {
        va_list args;
-        printk(KERN_CRIT "ext4_abort called.\n");
        va_start(args, fmt);
        printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
        vprintk(fmt, args);
@@ -409,7 +411,7 @@ void ext4_abort(struct super_block *sb, const char *function,
        if (sb->s_flags & MS_RDONLY)
                return;
-        printk(KERN_CRIT "Remounting filesystem read-only\n");
+        ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
        EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
        sb->s_flags |= MS_RDONLY;
        EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
@@ -417,6 +419,18 @@ void ext4_abort(struct super_block *sb, const char *function,
                jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
 }
+/* Print "<prefix>EXT4-fs (<sb->s_id>): <message>"; the newline is added here. */
+void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
+{
+        va_list args;
+        va_start(args, fmt);
+        printk("%sEXT4-fs (%s): ", prefix, sb->s_id);
+        vprintk(fmt, args);
+        printk("\n");
+        va_end(args);
+}
 void ext4_warning(struct super_block *sb, const char *function,
                  const char *fmt, ...)
 {
@@ -431,7 +445,7 @@ void ext4_warning(struct super_block *sb, const char *function,
 }
 void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp,
-                                const char *function, const char *fmt, ...)
+                           const char *function, const char *fmt, ...)
 __releases(bitlock)
 __acquires(bitlock)
 {
@@ -447,7 +461,7 @@ __acquires(bitlock)
        if (test_opt(sb, ERRORS_CONT)) {
                EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
                es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-                ext4_commit_super(sb, es, 0);
+                ext4_commit_super(sb, 0);
                return;
        }
        ext4_unlock_group(sb, grp);
@@ -467,7 +481,6 @@ __acquires(bitlock)
        return;
 }
 void ext4_update_dynamic_rev(struct super_block *sb)
 {
        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
@@ -496,7 +509,7 @@ void ext4_update_dynamic_rev(struct super_block *sb)
 /*
 * Open the external journal device
 */
-static struct block_device *ext4_blkdev_get(dev_t dev)
+static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
 {
        struct block_device *bdev;
        char b[BDEVNAME_SIZE];
@@ -507,7 +520,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev)
        return bdev;
 fail:
-        printk(KERN_ERR "EXT4-fs: failed to open journal device %s: %ld\n",
+        ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld",
                        __bdevname(dev, b), PTR_ERR(bdev));
        return NULL;
 }
@@ -543,8 +556,8 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
 {
        struct list_head *l;
-        printk(KERN_ERR "sb orphan head is %d\n",
+        ext4_msg(sb, KERN_ERR, "sb orphan head is %d",
-               le32_to_cpu(sbi->s_es->s_last_orphan));
+                 le32_to_cpu(sbi->s_es->s_last_orphan));
        printk(KERN_ERR "sb_info orphan list:\n");
        list_for_each(l, &sbi->s_orphan) {
@@ -563,6 +576,12 @@ static void ext4_put_super(struct super_block *sb)
        struct ext4_super_block *es = sbi->s_es;
        int i, err;
+        lock_super(sb);
+        lock_kernel();
+        if (sb->s_dirt)
+                ext4_commit_super(sb, 1);
+        ext4_release_system_zone(sb);
        ext4_mb_release(sb);
        ext4_ext_release(sb);
        ext4_xattr_put_super(sb);
@@ -576,7 +595,7 @@ static void ext4_put_super(struct super_block *sb)
        if (!(sb->s_flags & MS_RDONLY)) {
                EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
                es->s_state = cpu_to_le16(sbi->s_mount_state);
-                ext4_commit_super(sb, es, 1);
+                ext4_commit_super(sb, 1);
        }
        if (sbi->s_proc) {
                remove_proc_entry(sb->s_id, ext4_proc_root);
@@ -586,7 +605,10 @@ static void ext4_put_super(struct super_block *sb)
        for (i = 0; i < sbi->s_gdb_count; i++)
                brelse(sbi->s_group_desc[i]);
        kfree(sbi->s_group_desc);
-        kfree(sbi->s_flex_groups);
+        if (is_vmalloc_addr(sbi->s_flex_groups))
+                vfree(sbi->s_flex_groups);
+        else
+                kfree(sbi->s_flex_groups);
        percpu_counter_destroy(&sbi->s_freeblocks_counter);
        percpu_counter_destroy(&sbi->s_freeinodes_counter);
        percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -625,11 +647,8 @@ static void ext4_put_super(struct super_block *sb)
        unlock_super(sb);
        kobject_put(&sbi->s_kobj);
        wait_for_completion(&sbi->s_kobj_unregister);
-        lock_super(sb);
-        lock_kernel();
        kfree(sbi->s_blockgroup_lock);
        kfree(sbi);
-        return;
 }
 static struct kmem_cache *ext4_inode_cachep;
@@ -644,6 +663,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
        ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
        if (!ei)
                return NULL;
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
        ei->i_acl = EXT4_ACL_NOT_CACHED;
        ei->i_default_acl = EXT4_ACL_NOT_CACHED;
@@ -664,14 +684,16 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
        ei->i_allocated_meta_blocks = 0;
        ei->i_delalloc_reserved_flag = 0;
        spin_lock_init(&(ei->i_block_reservation_lock));
        return &ei->vfs_inode;
 }
 static void ext4_destroy_inode(struct inode *inode)
 {
        if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
-                printk("EXT4 Inode %p: orphan list check failed!\n",
+                ext4_msg(inode->i_sb, KERN_ERR,
-                        EXT4_I(inode));
+                         "Inode %lu (%p): orphan list check failed!",
+                         inode->i_ino, EXT4_I(inode));
                print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
                                EXT4_I(inode), sizeof(struct ext4_inode_info),
                                true);
@@ -870,12 +892,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
                seq_puts(seq, ",noauto_da_alloc");
        ext4_show_quota_options(seq, sb);
        return 0;
 }
 static struct inode *ext4_nfs_get_inode(struct super_block *sb,
-                u64 ino, u32 generation)
+                                        u64 ino, u32 generation)
 {
        struct inode *inode;
@@ -904,14 +926,14 @@ static struct inode *ext4_nfs_get_inode(struct super_block *sb,
 }
 static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid,
-                int fh_len, int fh_type)
+                                        int fh_len, int fh_type)
 {
        return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
                                    ext4_nfs_get_inode);
 }
 static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
-                int fh_len, int fh_type)
+                                        int fh_len, int fh_type)
 {
        return generic_fh_to_parent(sb, fid, fh_len, fh_type,
                                    ext4_nfs_get_inode);
@@ -923,7 +945,8 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
 * which would prevent try_to_free_buffers() from freeing them, we must use
 * jbd2 layer's try_to_free_buffers() function to release them.
 */
-static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_t wait)
+static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
+                                 gfp_t wait)
 {
        journal_t *journal = EXT4_SB(sb)->s_journal;
@@ -992,7 +1015,6 @@ static const struct super_operations ext4_sops = {
        .dirty_inode    = ext4_dirty_inode,
        .delete_inode   = ext4_delete_inode,
        .put_super      = ext4_put_super,
-        .write_super    = ext4_write_super,
        .sync_fs        = ext4_sync_fs,
        .freeze_fs      = ext4_freeze,
        .unfreeze_fs    = ext4_unfreeze,
@@ -1007,6 +1029,25 @@ static const struct super_operations ext4_sops = {
        .bdev_try_to_free_page = bdev_try_to_free_page,
 };
+static const struct super_operations ext4_nojournal_sops = {
+        .alloc_inode    = ext4_alloc_inode,
+        .destroy_inode  = ext4_destroy_inode,
+        .write_inode    = ext4_write_inode,
+        .dirty_inode    = ext4_dirty_inode,
+        .delete_inode   = ext4_delete_inode,
+        .write_super    = ext4_write_super, /* hook removed from ext4_sops */
+        .put_super      = ext4_put_super,
+        .statfs         = ext4_statfs,
+        .remount_fs     = ext4_remount,
+        .clear_inode    = ext4_clear_inode,
+        .show_options   = ext4_show_options,
+#ifdef CONFIG_QUOTA
+        .quota_read     = ext4_quota_read,
+        .quota_write    = ext4_quota_write,
+#endif
+        .bdev_try_to_free_page = bdev_try_to_free_page,
+}; /* unlike ext4_sops: no .sync_fs, .freeze_fs or .unfreeze_fs */
 static const struct export_operations ext4_export_ops = {
        .fh_to_dentry = ext4_fh_to_dentry,
        .fh_to_parent = ext4_fh_to_parent,
@@ -1023,12 +1064,13 @@ enum {
        Opt_journal_update, Opt_journal_dev,
        Opt_journal_checksum, Opt_journal_async_commit,
        Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
-        Opt_data_err_abort, Opt_data_err_ignore,
+        Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length,
        Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
        Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
        Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
        Opt_usrquota, Opt_grpquota, Opt_i_version,
        Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+        Opt_block_validity, Opt_noblock_validity,
        Opt_inode_readahead_blks, Opt_journal_ioprio
 };
@@ -1069,6 +1111,7 @@ static const match_table_t tokens = {
        {Opt_data_writeback, "data=writeback"},
        {Opt_data_err_abort, "data_err=abort"},
        {Opt_data_err_ignore, "data_err=ignore"},
+        {Opt_mb_history_length, "mb_history_length=%u"},
        {Opt_offusrjquota, "usrjquota="},
        {Opt_usrjquota, "usrjquota=%s"},
        {Opt_offgrpjquota, "grpjquota="},
@@ -1087,6 +1130,8 @@ static const match_table_t tokens = {
        {Opt_resize, "resize"},
        {Opt_delalloc, "delalloc"},
        {Opt_nodelalloc, "nodelalloc"},
+        {Opt_block_validity, "block_validity"},
+        {Opt_noblock_validity, "noblock_validity"},
        {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
        {Opt_journal_ioprio, "journal_ioprio=%u"},
        {Opt_auto_da_alloc, "auto_da_alloc=%u"},
@@ -1102,8 +1147,9 @@ static ext4_fsblk_t get_sb_block(void **data)
        if (!options || strncmp(options, "sb=", 3) != 0)
                return 1;       /* Default location */
        options += 3;
-        /*todo: use simple_strtoll with >32bit ext4 */
+        /* TODO: use simple_strtoll with >32bit ext4 */
        sb_block = simple_strtoul(options, &options, 0);
        if (*options && *options != ',') {
                printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
@@ -1113,6 +1159,7 @@ static ext4_fsblk_t get_sb_block(void **data)
        if (*options == ',')
                options++;
        *data = (void *) options;
        return sb_block;
 }
@@ -1206,8 +1253,7 @@ static int parse_options(char *options, struct super_block *sb,
 #else
                case Opt_user_xattr:
                case Opt_nouser_xattr:
-                        printk(KERN_ERR "EXT4 (no)user_xattr options "
+                        ext4_msg(sb, KERN_ERR, "(no)user_xattr options not supported");
-                               "not supported\n");
                        break;
 #endif
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
@@ -1220,8 +1266,7 @@ static int parse_options(char *options, struct super_block *sb,
 #else
                case Opt_acl:
                case Opt_noacl:
-                        printk(KERN_ERR "EXT4 (no)acl options "
+                        ext4_msg(sb, KERN_ERR, "(no)acl options not supported");
-                               "not supported\n");
                        break;
 #endif
                case Opt_journal_update:
@@ -1231,16 +1276,16 @@ static int parse_options(char *options, struct super_block *sb,
                           user to specify an existing inode to be the
                           journal file. */
                        if (is_remount) {
-                                printk(KERN_ERR "EXT4-fs: cannot specify "
+                                ext4_msg(sb, KERN_ERR,
-                                       "journal on remount\n");
+                                         "Cannot specify journal on remount");
                                return 0;
                        }
                        set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
                        break;
                case Opt_journal_dev:
                        if (is_remount) {
-                                printk(KERN_ERR "EXT4-fs: cannot specify "
+                                ext4_msg(sb, KERN_ERR,
-                                       "journal on remount\n");
+                                        "Cannot specify journal on remount");
                                return 0;
                        }
                        if (match_int(&args[0], &option))
@@ -1294,9 +1339,8 @@ static int parse_options(char *options, struct super_block *sb,
                        if (is_remount) {
                                if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS)
                                                != data_opt) {
-                                        printk(KERN_ERR
+                                        ext4_msg(sb, KERN_ERR,
-                                                "EXT4-fs: cannot change data "
+                                                "Cannot change data mode on remount");
-                                                "mode on remount\n");
                                        return 0;
                                }
                        } else {
@@ -1310,6 +1354,13 @@ static int parse_options(char *options, struct super_block *sb,
                case Opt_data_err_ignore:
                        clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
                        break;
+                case Opt_mb_history_length:
+                        if (match_int(&args[0], &option))
+                                return 0;
+                        if (option < 0)
+                                return 0;
+                        sbi->s_mb_history_max = option;
+                        break;
 #ifdef CONFIG_QUOTA
                case Opt_usrjquota:
                        qtype = USRQUOTA;
@@ -1319,31 +1370,31 @@ static int parse_options(char *options, struct super_block *sb,
 set_qf_name:
                        if (sb_any_quota_loaded(sb) &&
                            !sbi->s_qf_names[qtype]) {
-                                printk(KERN_ERR
+                                ext4_msg(sb, KERN_ERR,
-                                       "EXT4-fs: Cannot change journaled "
+                                       "Cannot change journaled "
-                                       "quota options when quota turned on.\n");
+                                       "quota options when quota turned on");
                                return 0;
                        }
                        qname = match_strdup(&args[0]);
                        if (!qname) {
-                                printk(KERN_ERR
+                                ext4_msg(sb, KERN_ERR,
-                                        "EXT4-fs: not enough memory for "
+                                        "Not enough memory for "
-                                        "storing quotafile name.\n");
+                                        "storing quotafile name");
                                return 0;
                        }
                        if (sbi->s_qf_names[qtype] &&
                            strcmp(sbi->s_qf_names[qtype], qname)) {
-                                printk(KERN_ERR
+                                ext4_msg(sb, KERN_ERR,
-                                        "EXT4-fs: %s quota file already "
+                                        "%s quota file already "
-                                        "specified.\n", QTYPE2NAME(qtype));
+                                        "specified", QTYPE2NAME(qtype));
                                kfree(qname);
                                return 0;
                        }
                        sbi->s_qf_names[qtype] = qname;
                        if (strchr(sbi->s_qf_names[qtype], '/')) {
-                                printk(KERN_ERR
+                                ext4_msg(sb, KERN_ERR,
-                                        "EXT4-fs: quotafile must be on "
+                                        "quotafile must be on "
-                                        "filesystem root.\n");
+                                        "filesystem root");
                                kfree(sbi->s_qf_names[qtype]);
                                sbi->s_qf_names[qtype] = NULL;
                                return 0;
@@ -1358,9 +1409,9 @@ set_qf_name:
 clear_qf_name:
                        if (sb_any_quota_loaded(sb) &&
                            sbi->s_qf_names[qtype]) {
-                                printk(KERN_ERR "EXT4-fs: Cannot change "
+                                ext4_msg(sb, KERN_ERR, "Cannot change "
                                        "journaled quota options when "
-                                        "quota turned on.\n");
+                                        "quota turned on");
                                return 0;
                        }
                        /*
@@ -1377,9 +1428,9 @@ clear_qf_name:
 set_qf_format:
                        if (sb_any_quota_loaded(sb) &&
                            sbi->s_jquota_fmt != qfmt) {
-                                printk(KERN_ERR "EXT4-fs: Cannot change "
+                                ext4_msg(sb, KERN_ERR, "Cannot change "
                                        "journaled quota options when "
-                                        "quota turned on.\n");
+                                        "quota turned on");
                                return 0;
                        }
                        sbi->s_jquota_fmt = qfmt;
@@ -1395,8 +1446,8 @@ set_qf_format:
                        break;
                case Opt_noquota:
                        if (sb_any_quota_loaded(sb)) {
-                                printk(KERN_ERR "EXT4-fs: Cannot change quota "
+                                ext4_msg(sb, KERN_ERR, "Cannot change quota "
-                                        "options when quota turned on.\n");
+                                        "options when quota turned on");
                                return 0;
                        }
                        clear_opt(sbi->s_mount_opt, QUOTA);
@@ -1407,8 +1458,8 @@ set_qf_format:
                case Opt_quota:
                case Opt_usrquota:
                case Opt_grpquota:
-                        printk(KERN_ERR
+                        ext4_msg(sb, KERN_ERR,
-                                "EXT4-fs: quota options not supported.\n");
+                                "quota options not supported");
                        break;
                case Opt_usrjquota:
                case Opt_grpjquota:
@@ -1416,9 +1467,8 @@ set_qf_format:
                case Opt_offgrpjquota:
                case Opt_jqfmt_vfsold:
                case Opt_jqfmt_vfsv0:
-                        printk(KERN_ERR
+                        ext4_msg(sb, KERN_ERR,
-                                "EXT4-fs: journaled quota options not "
+                                "journaled quota options not supported");
-                                "supported.\n");
                        break;
                case Opt_noquota:
                        break;
@@ -1443,8 +1493,9 @@ set_qf_format:
                        break;
                case Opt_resize:
                        if (!is_remount) {
-                                printk("EXT4-fs: resize option only available "
+                                ext4_msg(sb, KERN_ERR,
-                                        "for remount\n");
+                                        "resize option only available "
+                                        "for remount");
                                return 0;
                        }
                        if (match_int(&args[0], &option) != 0)
@@ -1474,14 +1525,21 @@ set_qf_format:
                case Opt_delalloc:
                        set_opt(sbi->s_mount_opt, DELALLOC);
                        break;
+                case Opt_block_validity:
+                        set_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
+                        break;
+                case Opt_noblock_validity:
+                        clear_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
+                        break;
                case Opt_inode_readahead_blks:
                        if (match_int(&args[0], &option))
                                return 0;
                        if (option < 0 || option > (1 << 30))
                                return 0;
-                        if (option & (option - 1)) {
-                                printk(KERN_ERR "EXT4-fs: inode_readahead_blks"
-                                       " must be a power of 2\n");
+                        /* 0 stays valid, as it was with the old bit test */
+                        if (option && !is_power_of_2(option)) {
+                                ext4_msg(sb, KERN_ERR,
+                                         "inode_readahead_blks must be a power of 2");
                                return 0;
                        }
                        sbi->s_inode_readahead_blks = option;
@@ -1508,9 +1566,9 @@ set_qf_format:
                                set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
                        break;
                default:
-                        printk(KERN_ERR
+                        ext4_msg(sb, KERN_ERR,
-                               "EXT4-fs: Unrecognized mount option \"%s\" "
+                               "Unrecognized mount option \"%s\" "
-                               "or missing value\n", p);
+                               "or missing value", p);
                        return 0;
                }
        }
@@ -1528,21 +1586,21 @@ set_qf_format:
                                (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) ||
                    (sbi->s_qf_names[GRPQUOTA] &&
                                (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) {
-                        printk(KERN_ERR "EXT4-fs: old and new quota "
+                        ext4_msg(sb, KERN_ERR, "old and new quota "
-                                        "format mixing.\n");
+                                        "format mixing");
                        return 0;
                }
                if (!sbi->s_jquota_fmt) {
-                        printk(KERN_ERR "EXT4-fs: journaled quota format "
+                        ext4_msg(sb, KERN_ERR, "journaled quota format "
-                                        "not specified.\n");
+                                        "not specified");
                        return 0;
                }
        } else {
                if (sbi->s_jquota_fmt) {
-                        printk(KERN_ERR "EXT4-fs: journaled quota format "
+                        ext4_msg(sb, KERN_ERR, "journaled quota format "
                                        "specified with no journaling "
-                                        "enabled.\n");
+                                        "enabled");
                        return 0;
                }
        }
@@ -1557,32 +1615,32 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
        int res = 0;
        if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
-                printk(KERN_ERR "EXT4-fs warning: revision level too high, "
+                ext4_msg(sb, KERN_ERR, "revision level too high, "
-                       "forcing read-only mode\n");
+                         "forcing read-only mode");
                res = MS_RDONLY;
        }
        if (read_only)
                return res;
        if (!(sbi->s_mount_state & EXT4_VALID_FS))
-                printk(KERN_WARNING "EXT4-fs warning: mounting unchecked fs, "
+                ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
-                       "running e2fsck is recommended\n");
+                         "running e2fsck is recommended");
        else if ((sbi->s_mount_state & EXT4_ERROR_FS))
-                printk(KERN_WARNING
+                ext4_msg(sb, KERN_WARNING,
-                       "EXT4-fs warning: mounting fs with errors, "
+                         "warning: mounting fs with errors, "
-                       "running e2fsck is recommended\n");
+                         "running e2fsck is recommended");
        else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
                 le16_to_cpu(es->s_mnt_count) >=
                 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
-                printk(KERN_WARNING
+                ext4_msg(sb, KERN_WARNING,
-                       "EXT4-fs warning: maximal mount count reached, "
+                         "warning: maximal mount count reached, "
-                       "running e2fsck is recommended\n");
+                         "running e2fsck is recommended");
        else if (le32_to_cpu(es->s_checkinterval) &&
                (le32_to_cpu(es->s_lastcheck) +
                        le32_to_cpu(es->s_checkinterval) <= get_seconds()))
-                printk(KERN_WARNING
+                ext4_msg(sb, KERN_WARNING,
-                       "EXT4-fs warning: checktime reached, "
+                         "warning: checktime reached, "
-                       "running e2fsck is recommended\n");
+                         "running e2fsck is recommended");
-        if (!sbi->s_journal) 
+        if (!sbi->s_journal)
                es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
        if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
                es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
@@ -1592,7 +1650,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
        if (sbi->s_journal)
                EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
-        ext4_commit_super(sb, es, 1);
+        ext4_commit_super(sb, 1);
        if (test_opt(sb, DEBUG))
                printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
                                "bpg=%lu, ipg=%lu, mo=%04lx]\n",
@@ -1603,11 +1661,11 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
                        sbi->s_mount_opt);
        if (EXT4_SB(sb)->s_journal) {
-                printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
+                ext4_msg(sb, KERN_INFO, "%s journal on %s",
-                       sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
+                       EXT4_SB(sb)->s_journal->j_inode ? "internal" :
                       "external", EXT4_SB(sb)->s_journal->j_devname);
        } else {
-                printk(KERN_INFO "EXT4 FS on %s, no journal\n", sb->s_id);
+                ext4_msg(sb, KERN_INFO, "no journal");
        }
        return res;
 }
@@ -1616,10 +1674,10 @@ static int ext4_fill_flex_info(struct super_block *sb)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_group_desc *gdp = NULL;
-        struct buffer_head *bh;
        ext4_group_t flex_group_count;
        ext4_group_t flex_group;
        int groups_per_flex = 0;
+        size_t size;
        int i;
        if (!sbi->s_es->s_log_groups_per_flex) {
@@ -1634,16 +1692,21 @@ static int ext4_fill_flex_info(struct super_block *sb)
        flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
                        ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
                              EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex;
-        sbi->s_flex_groups = kzalloc(flex_group_count *
+        size = flex_group_count * sizeof(struct flex_groups);
-                                     sizeof(struct flex_groups), GFP_KERNEL);
+        sbi->s_flex_groups = kzalloc(size, GFP_KERNEL);
+        if (sbi->s_flex_groups == NULL) {
+                sbi->s_flex_groups = vmalloc(size);
+                if (sbi->s_flex_groups)
+                        memset(sbi->s_flex_groups, 0, size);
+        }
        if (sbi->s_flex_groups == NULL) {
-                printk(KERN_ERR "EXT4-fs: not enough memory for "
+                ext4_msg(sb, KERN_ERR, "not enough memory for "
-                                "%u flex groups\n", flex_group_count);
+                                "%u flex groups", flex_group_count);
                goto failed;
        }
        for (i = 0; i < sbi->s_groups_count; i++) {
-                gdp = ext4_get_group_desc(sb, i, &bh);
+                gdp = ext4_get_group_desc(sb, i, NULL);
                flex_group = ext4_flex_group(sbi, i);
                atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
@@ -1724,44 +1787,44 @@ static int ext4_check_descriptors(struct super_block *sb)
                block_bitmap = ext4_block_bitmap(sb, gdp);
                if (block_bitmap < first_block || block_bitmap > last_block) {
-                        printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
+                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                               "Block bitmap for group %u not in group "
-                               "(block %llu)!\n", i, block_bitmap);
+                               "(block %llu)!", i, block_bitmap);
                        return 0;
                }
                inode_bitmap = ext4_inode_bitmap(sb, gdp);
                if (inode_bitmap < first_block || inode_bitmap > last_block) {
-                        printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
+                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                               "Inode bitmap for group %u not in group "
-                               "(block %llu)!\n", i, inode_bitmap);
+                               "(block %llu)!", i, inode_bitmap);
                        return 0;
                }
                inode_table = ext4_inode_table(sb, gdp);
                if (inode_table < first_block ||
                    inode_table + sbi->s_itb_per_group - 1 > last_block) {
-                        printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
+                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                               "Inode table for group %u not in group "
-                               "(block %llu)!\n", i, inode_table);
+                               "(block %llu)!", i, inode_table);
                        return 0;
                }
-                spin_lock(sb_bgl_lock(sbi, i));
+                ext4_lock_group(sb, i);
                if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
-                        printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
+                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
-                               "Checksum for group %u failed (%u!=%u)\n",
+                                 "Checksum for group %u failed (%u!=%u)",
-                               i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
+                                 i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
-                               gdp)), le16_to_cpu(gdp->bg_checksum));
+                                     gdp)), le16_to_cpu(gdp->bg_checksum));
                        if (!(sb->s_flags & MS_RDONLY)) {
-                                spin_unlock(sb_bgl_lock(sbi, i));
+                                ext4_unlock_group(sb, i);
                                return 0;
                        }
                }
-                spin_unlock(sb_bgl_lock(sbi, i));
+                ext4_unlock_group(sb, i);
                if (!flexbg_flag)
                        first_block += EXT4_BLOCKS_PER_GROUP(sb);
        }
        ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
-        sbi->s_es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
+        sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
        return 1;
 }
@@ -1796,8 +1859,8 @@ static void ext4_orphan_cleanup(struct super_block *sb,
        }
        if (bdev_read_only(sb->s_bdev)) {
-                printk(KERN_ERR "EXT4-fs: write access "
+                ext4_msg(sb, KERN_ERR, "write access "
-                        "unavailable, skipping orphan cleanup.\n");
+                        "unavailable, skipping orphan cleanup");
                return;
        }
@@ -1811,8 +1874,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
        }
        if (s_flags & MS_RDONLY) {
-                printk(KERN_INFO "EXT4-fs: %s: orphan cleanup on readonly fs\n",
+                ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
-                       sb->s_id);
                sb->s_flags &= ~MS_RDONLY;
        }
 #ifdef CONFIG_QUOTA
@@ -1823,9 +1885,9 @@ static void ext4_orphan_cleanup(struct super_block *sb,
                if (EXT4_SB(sb)->s_qf_names[i]) {
                        int ret = ext4_quota_on_mount(sb, i);
                        if (ret < 0)
-                                printk(KERN_ERR
+                                ext4_msg(sb, KERN_ERR,
-                                        "EXT4-fs: Cannot turn on journaled "
+                                        "Cannot turn on journaled "
-                                        "quota: error %d\n", ret);
+                                        "quota: error %d", ret);
                }
        }
 #endif
@@ -1842,16 +1904,16 @@ static void ext4_orphan_cleanup(struct super_block *sb,
                list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
                vfs_dq_init(inode);
                if (inode->i_nlink) {
-                        printk(KERN_DEBUG
+                        ext4_msg(sb, KERN_DEBUG,
-                                "%s: truncating inode %lu to %lld bytes\n",
+                                "%s: truncating inode %lu to %lld bytes",
                                __func__, inode->i_ino, inode->i_size);
                        jbd_debug(2, "truncating inode %lu to %lld bytes\n",
                                  inode->i_ino, inode->i_size);
                        ext4_truncate(inode);
                        nr_truncates++;
                } else {
-                        printk(KERN_DEBUG
+                        ext4_msg(sb, KERN_DEBUG,
-                                "%s: deleting unreferenced inode %lu\n",
+                                "%s: deleting unreferenced inode %lu",
                                __func__, inode->i_ino);
                        jbd_debug(2, "deleting unreferenced inode %lu\n",
                                  inode->i_ino);
@@ -1863,11 +1925,11 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 #define PLURAL(x) (x), ((x) == 1) ? "" : "s"
        if (nr_orphans)
-                printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n",
+                ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
-                       sb->s_id, PLURAL(nr_orphans));
+                       PLURAL(nr_orphans));
        if (nr_truncates)
-                printk(KERN_INFO "EXT4-fs: %s: %d truncate%s cleaned up\n",
+                ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
-                       sb->s_id, PLURAL(nr_truncates));
+                       PLURAL(nr_truncates));
 #ifdef CONFIG_QUOTA
        /* Turn quotas off */
        for (i = 0; i < MAXQUOTAS; i++) {
@@ -1877,6 +1939,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 #endif
        sb->s_flags = s_flags; /* Restore MS_RDONLY status */
 }
 /*
 * Maximal extent format file size.
 * Resulting logical blkno at s_maxbytes must fit in our on-disk
@@ -1927,19 +1990,19 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
        loff_t res = EXT4_NDIR_BLOCKS;
        int meta_blocks;
        loff_t upper_limit;
-        /* This is calculated to be the largest file size for a
+        /* This is calculated to be the largest file size for a dense, block
-         * dense, bitmapped file such that the total number of
+         * mapped file such that the file's total number of 512-byte sectors,
-         * sectors in the file, including data and all indirect blocks,
+         * including data and all indirect blocks, does not exceed (2^48 - 1).
-         * does not exceed 2^48 -1
+         *
-         * __u32 i_blocks_lo and _u16 i_blocks_high representing the
+         * __u32 i_blocks_lo and __u16 i_blocks_high represent the total
-         * total number of  512 bytes blocks of the file
+         * number of 512-byte sectors of the file.
         */
        if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
                /*
-                 * !has_huge_files or CONFIG_LBD is not enabled
+                 * !has_huge_files or CONFIG_LBD not enabled implies that
-                 * implies the inode i_block represent total blocks in
+                 * the inode i_blocks field represents total file blocks in
-                 * 512 bytes 32 == size of vfs inode i_blocks * 8
+                 * 2^32 512-byte sectors == size of vfs inode i_blocks * 8
                 */
                upper_limit = (1LL << 32) - 1;
@@ -1981,7 +2044,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
 }
 static ext4_fsblk_t descriptor_loc(struct super_block *sb,
-                                ext4_fsblk_t logical_sb_block, int nr)
+                                   ext4_fsblk_t logical_sb_block, int nr)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_group_t bg, first_meta_bg;
@@ -1995,6 +2058,7 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
        bg = sbi->s_desc_per_block * nr;
        if (ext4_bg_has_super(sb, bg))
                has_super = 1;
        return (has_super + ext4_group_first_block_no(sb, bg));
 }
@@ -2091,8 +2155,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
        if (parse_strtoul(buf, 0x40000000, &t))
                return -EINVAL;
-        /* inode_readahead_blks must be a power of 2 */
+        if (!is_power_of_2(t))
-        if (t & (t-1))
                return -EINVAL;
        sbi->s_inode_readahead_blks = t;
@@ -2100,7 +2163,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
 }
 static ssize_t sbi_ui_show(struct ext4_attr *a,
-                                struct ext4_sb_info *sbi, char *buf)
+                           struct ext4_sb_info *sbi, char *buf)
 {
        unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
@@ -2205,7 +2268,6 @@ static struct kobj_type ext4_ktype = {
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                                __releases(kernel_lock)
                                __acquires(kernel_lock)
 {
        struct buffer_head *bh;
        struct ext4_super_block *es = NULL;
@@ -2256,7 +2318,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
        if (!blocksize) {
-                printk(KERN_ERR "EXT4-fs: unable to set blocksize\n");
+                ext4_msg(sb, KERN_ERR, "unable to set blocksize");
                goto out_fail;
        }
@@ -2272,7 +2334,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        }
        if (!(bh = sb_bread(sb, logical_sb_block))) {
-                printk(KERN_ERR "EXT4-fs: unable to read superblock\n");
+                ext4_msg(sb, KERN_ERR, "unable to read superblock");
                goto out_fail;
        }
        /*
@@ -2321,6 +2383,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
        sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
        sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
+        sbi->s_mb_history_max = default_mb_history_length;
        set_opt(sbi->s_mount_opt, BARRIER);
@@ -2330,7 +2393,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
         */
        set_opt(sbi->s_mount_opt, DELALLOC);
        if (!parse_options((char *) data, sb, &journal_devnum,
                           &journal_ioprio, NULL, 0))
                goto failed_mount;
@@ -2342,9 +2404,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
            (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
             EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
             EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U)))
-                printk(KERN_WARNING
+                ext4_msg(sb, KERN_WARNING,
-                       "EXT4-fs warning: feature flags set on rev 0 fs, "
+                       "feature flags set on rev 0 fs, "
-                       "running e2fsck is recommended\n");
+                       "running e2fsck is recommended");
        /*
         * Check feature flags regardless of the revision level, since we
@@ -2353,16 +2415,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
         */
        features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
        if (features) {
-                printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of "
+                ext4_msg(sb, KERN_ERR,
-                       "unsupported optional features (%x).\n", sb->s_id,
+                        "Couldn't mount because of "
+                        "unsupported optional features (%x)",
                        (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
                        ~EXT4_FEATURE_INCOMPAT_SUPP));
                goto failed_mount;
        }
        features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
        if (!(sb->s_flags & MS_RDONLY) && features) {
-                printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of "
+                ext4_msg(sb, KERN_ERR,
-                       "unsupported optional features (%x).\n", sb->s_id,
+                        "Couldn't mount RDWR because of "
+                        "unsupported optional features (%x)",
                        (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
                        ~EXT4_FEATURE_RO_COMPAT_SUPP));
                goto failed_mount;
@@ -2376,9 +2440,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                 */
                if (sizeof(root->i_blocks) < sizeof(u64) &&
                                !(sb->s_flags & MS_RDONLY)) {
-                        printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge "
+                        ext4_msg(sb, KERN_ERR, "Filesystem with huge "
                                        "files cannot be mounted read-write "
-                                        "without CONFIG_LBD.\n", sb->s_id);
+                                        "without CONFIG_LBD");
                        goto failed_mount;
                }
        }
@@ -2386,17 +2450,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        if (blocksize < EXT4_MIN_BLOCK_SIZE ||
            blocksize > EXT4_MAX_BLOCK_SIZE) {
-                printk(KERN_ERR
+                ext4_msg(sb, KERN_ERR,
-                       "EXT4-fs: Unsupported filesystem blocksize %d on %s.\n",
+                       "Unsupported filesystem blocksize %d", blocksize);
-                       blocksize, sb->s_id);
                goto failed_mount;
        }
        if (sb->s_blocksize != blocksize) {
                /* Validate the filesystem blocksize */
                if (!sb_set_blocksize(sb, blocksize)) {
-                        printk(KERN_ERR "EXT4-fs: bad block size %d.\n",
+                        ext4_msg(sb, KERN_ERR, "bad block size %d",
                                        blocksize);
                        goto failed_mount;
                }
@@ -2406,15 +2468,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                offset = do_div(logical_sb_block, blocksize);
                bh = sb_bread(sb, logical_sb_block);
                if (!bh) {
-                        printk(KERN_ERR
+                        ext4_msg(sb, KERN_ERR,
-                               "EXT4-fs: Can't read superblock on 2nd try.\n");
+                               "Can't read superblock on 2nd try");
                        goto failed_mount;
                }
                es = (struct ext4_super_block *)(((char *)bh->b_data) + offset);
                sbi->s_es = es;
                if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
-                        printk(KERN_ERR
+                        ext4_msg(sb, KERN_ERR,
-                               "EXT4-fs: Magic mismatch, very weird !\n");
+                               "Magic mismatch, very weird!");
                        goto failed_mount;
                }
        }
@@ -2432,30 +2494,33 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
                    (!is_power_of_2(sbi->s_inode_size)) ||
                    (sbi->s_inode_size > blocksize)) {
-                        printk(KERN_ERR
+                        ext4_msg(sb, KERN_ERR,
-                               "EXT4-fs: unsupported inode size: %d\n",
+                               "unsupported inode size: %d",
                               sbi->s_inode_size);
                        goto failed_mount;
                }
                if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
                        sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2);
        }
        sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) {
                if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
                    sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
                    !is_power_of_2(sbi->s_desc_size)) {
-                        printk(KERN_ERR
+                        ext4_msg(sb, KERN_ERR,
-                               "EXT4-fs: unsupported descriptor size %lu\n",
+                               "unsupported descriptor size %lu",
                               sbi->s_desc_size);
                        goto failed_mount;
                }
        } else
                sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
        sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
        sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
        if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0)
                goto cantfind_ext4;
        sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
        if (sbi->s_inodes_per_block == 0)
                goto cantfind_ext4;
@@ -2466,6 +2531,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sbi->s_mount_state = le16_to_cpu(es->s_state);
        sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
        sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
        for (i = 0; i < 4; i++)
                sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
        sbi->s_def_hash_version = es->s_def_hash_version;
@@ -2483,25 +2549,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        }
        if (sbi->s_blocks_per_group > blocksize * 8) {
-                printk(KERN_ERR
+                ext4_msg(sb, KERN_ERR,
-                       "EXT4-fs: #blocks per group too big: %lu\n",
+                       "#blocks per group too big: %lu",
                       sbi->s_blocks_per_group);
                goto failed_mount;
        }
        if (sbi->s_inodes_per_group > blocksize * 8) {
-                printk(KERN_ERR
+                ext4_msg(sb, KERN_ERR,
-                       "EXT4-fs: #inodes per group too big: %lu\n",
+                       "#inodes per group too big: %lu",
                       sbi->s_inodes_per_group);
                goto failed_mount;
        }
        if (ext4_blocks_count(es) >
                    (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
-                printk(KERN_ERR "EXT4-fs: filesystem on %s:"
+                ext4_msg(sb, KERN_ERR, "filesystem"
-                        " too large to mount safely\n", sb->s_id);
+                        " too large to mount safely");
                if (sizeof(sector_t) < 8)
-                        printk(KERN_WARNING "EXT4-fs: CONFIG_LBD not "
+                        ext4_msg(sb, KERN_WARNING, "CONFIG_LBD not enabled");
-                                        "enabled\n");
                goto failed_mount;
        }
@@ -2511,21 +2576,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        /* check blocks count against device size */
        blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
        if (blocks_count && ext4_blocks_count(es) > blocks_count) {
-                printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu "
+                ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
-                       "exceeds size of device (%llu blocks)\n",
+                       "exceeds size of device (%llu blocks)",
                       ext4_blocks_count(es), blocks_count);
                goto failed_mount;
        }
-        /*
+        /*
-         * It makes no sense for the first data block to be beyond the end
+         * It makes no sense for the first data block to be beyond the end
-         * of the filesystem.
+         * of the filesystem.
-         */
+         */
-        if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
+        if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
-                printk(KERN_WARNING "EXT4-fs: bad geometry: first data"
+                ext4_msg(sb, KERN_WARNING, "bad geometry: first data"
-                       "block %u is beyond end of filesystem (%llu)\n",
+                         "block %u is beyond end of filesystem (%llu)",
-                       le32_to_cpu(es->s_first_data_block),
+                         le32_to_cpu(es->s_first_data_block),
-                       ext4_blocks_count(es));
+                         ext4_blocks_count(es));
                goto failed_mount;
        }
        blocks_count = (ext4_blocks_count(es) -
@@ -2533,9 +2598,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                        EXT4_BLOCKS_PER_GROUP(sb) - 1);
        do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
        if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
-                printk(KERN_WARNING "EXT4-fs: groups count too large: %u "
+                ext4_msg(sb, KERN_WARNING, "groups count too large: %u "
                       "(block count %llu, first data block %u, "
-                       "blocks per group %lu)\n", sbi->s_groups_count,
+                       "blocks per group %lu)", sbi->s_groups_count,
                       ext4_blocks_count(es),
                       le32_to_cpu(es->s_first_data_block),
                       EXT4_BLOCKS_PER_GROUP(sb));
@@ -2547,7 +2612,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
                                    GFP_KERNEL);
        if (sbi->s_group_desc == NULL) {
-                printk(KERN_ERR "EXT4-fs: not enough memory\n");
+                ext4_msg(sb, KERN_ERR, "not enough memory");
                goto failed_mount;
        }
@@ -2562,21 +2627,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                block = descriptor_loc(sb, logical_sb_block, i);
                sbi->s_group_desc[i] = sb_bread(sb, block);
                if (!sbi->s_group_desc[i]) {
-                        printk(KERN_ERR "EXT4-fs: "
+                        ext4_msg(sb, KERN_ERR,
-                               "can't read group descriptor %d\n", i);
+                               "can't read group descriptor %d", i);
                        db_count = i;
                        goto failed_mount2;
                }
        }
        if (!ext4_check_descriptors(sb)) {
-                printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
+                ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
                goto failed_mount2;
        }
        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
                if (!ext4_fill_flex_info(sb)) {
-                        printk(KERN_ERR
+                        ext4_msg(sb, KERN_ERR,
-                               "EXT4-fs: unable to initialize "
+                               "unable to initialize "
-                               "flex_bg meta info!\n");
+                               "flex_bg meta info!");
                        goto failed_mount2;
                }
@@ -2598,7 +2663,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
        }
        if (err) {
-                printk(KERN_ERR "EXT4-fs: insufficient memory\n");
+                ext4_msg(sb, KERN_ERR, "insufficient memory");
                goto failed_mount3;
        }
@@ -2607,7 +2672,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        /*
         * set up enough so that it can read an inode
         */
-        sb->s_op = &ext4_sops;
+        if (!test_opt(sb, NOLOAD) &&
+            EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
+                sb->s_op = &ext4_sops;
+        else
+                sb->s_op = &ext4_nojournal_sops;
        sb->s_export_op = &ext4_export_ops;
        sb->s_xattr = ext4_xattr_handlers;
 #ifdef CONFIG_QUOTA
@@ -2615,6 +2684,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sb->dq_op = &ext4_quota_operations;
 #endif
        INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
+        mutex_init(&sbi->s_orphan_lock);
+        mutex_init(&sbi->s_resize_lock);
        sb->s_root = NULL;
@@ -2632,13 +2703,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                        goto failed_mount3;
                if (!(sb->s_flags & MS_RDONLY) &&
                    EXT4_SB(sb)->s_journal->j_failed_commit) {
-                        printk(KERN_CRIT "EXT4-fs error (device %s): "
+                        ext4_msg(sb, KERN_CRIT, "error: "
                               "ext4_fill_super: Journal transaction "
-                               "%u is corrupt\n", sb->s_id,
+                               "%u is corrupt",
                               EXT4_SB(sb)->s_journal->j_failed_commit);
                        if (test_opt(sb, ERRORS_RO)) {
-                                printk(KERN_CRIT
+                                ext4_msg(sb, KERN_CRIT,
-                                       "Mounting filesystem read-only\n");
+                                       "Mounting filesystem read-only");
                                sb->s_flags |= MS_RDONLY;
                                EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
                                es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
@@ -2646,14 +2717,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                        if (test_opt(sb, ERRORS_PANIC)) {
                                EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
                                es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-                                ext4_commit_super(sb, es, 1);
+                                ext4_commit_super(sb, 1);
                                goto failed_mount4;
                        }
                }
        } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
              EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
-                printk(KERN_ERR "EXT4-fs: required journal recovery "
+                ext4_msg(sb, KERN_ERR, "required journal recovery "
-                       "suppressed and not mounted read-only\n");
+                       "suppressed and not mounted read-only");
                goto failed_mount4;
        } else {
                clear_opt(sbi->s_mount_opt, DATA_FLAGS);
@@ -2666,7 +2737,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        if (ext4_blocks_count(es) > 0xffffffffULL &&
            !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
                                       JBD2_FEATURE_INCOMPAT_64BIT)) {
-                printk(KERN_ERR "EXT4-fs: Failed to set 64-bit journal feature\n");
+                ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
                goto failed_mount4;
        }
@@ -2704,8 +2775,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        case EXT4_MOUNT_WRITEBACK_DATA:
                if (!jbd2_journal_check_available_features
                    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
-                        printk(KERN_ERR "EXT4-fs: Journal does not support "
+                        ext4_msg(sb, KERN_ERR, "Journal does not support "
-                               "requested data journaling mode\n");
+                               "requested data journaling mode");
                        goto failed_mount4;
                }
        default:
@@ -2717,8 +2788,8 @@ no_journal:
        if (test_opt(sb, NOBH)) {
                if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
-                        printk(KERN_WARNING "EXT4-fs: Ignoring nobh option - "
+                        ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
-                                "its supported only with writeback mode\n");
+                                "its supported only with writeback mode");
                        clear_opt(sbi->s_mount_opt, NOBH);
                }
        }
@@ -2729,18 +2800,18 @@ no_journal:
        root = ext4_iget(sb, EXT4_ROOT_INO);
        if (IS_ERR(root)) {
-                printk(KERN_ERR "EXT4-fs: get root inode failed\n");
+                ext4_msg(sb, KERN_ERR, "get root inode failed");
                ret = PTR_ERR(root);
                goto failed_mount4;
        }
        if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
                iput(root);
-                printk(KERN_ERR "EXT4-fs: corrupt root inode, run e2fsck\n");
+                ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
                goto failed_mount4;
        }
        sb->s_root = d_alloc_root(root);
        if (!sb->s_root) {
-                printk(KERN_ERR "EXT4-fs: get root dentry failed\n");
+                ext4_msg(sb, KERN_ERR, "get root dentry failed");
                iput(root);
                ret = -ENOMEM;
                goto failed_mount4;
@@ -2769,22 +2840,29 @@ no_journal:
                                                        sbi->s_inode_size) {
                sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
                                                       EXT4_GOOD_OLD_INODE_SIZE;
-                printk(KERN_INFO "EXT4-fs: required extra inode space not"
+                ext4_msg(sb, KERN_INFO, "required extra inode space not "
-                        "available.\n");
+                         "available");
        }
        if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
-                printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
+                ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
-                                "requested data journaling mode\n");
+                         "requested data journaling mode");
                clear_opt(sbi->s_mount_opt, DELALLOC);
        } else if (test_opt(sb, DELALLOC))
-                printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
+                ext4_msg(sb, KERN_INFO, "delayed allocation enabled");
+        err = ext4_setup_system_zone(sb);
+        if (err) {
+                ext4_msg(sb, KERN_ERR, "failed to initialize system "
+                         "zone (%d)", err);
+                goto failed_mount4;
+        }
        ext4_ext_init(sb);
        err = ext4_mb_init(sb, needs_recovery);
        if (err) {
-                printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
+                ext4_msg(sb, KERN_ERR, "failed to initalize mballoc (%d)",
-                       err);
+                         err);
                goto failed_mount4;
        }
@@ -2798,19 +2876,11 @@ no_journal:
                goto failed_mount4;
        };
-        /*
-         * akpm: core read_super() calls in here with the superblock locked.
-         * That deadlocks, because orphan cleanup needs to lock the superblock
-         * in numerous places.  Here we just pop the lock - it's relatively
-         * harmless, because we are now ready to accept write_super() requests,
-         * and aviro says that's the only reason for hanging onto the
-         * superblock lock.
-         */
        EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
        ext4_orphan_cleanup(sb, es);
        EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
        if (needs_recovery) {
-                printk(KERN_INFO "EXT4-fs: recovery complete.\n");
+                ext4_msg(sb, KERN_INFO, "recovery complete");
                ext4_mark_recovery_complete(sb, es);
        }
        if (EXT4_SB(sb)->s_journal) {
@@ -2823,25 +2893,30 @@ no_journal:
        } else
                descr = "out journal";
-        printk(KERN_INFO "EXT4-fs: mounted filesystem %s with%s\n",
+        ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr);
-               sb->s_id, descr);
        lock_kernel();
        return 0;
 cantfind_ext4:
        if (!silent)
-                printk(KERN_ERR "VFS: Can't find ext4 filesystem on dev %s.\n",
+                ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
-                       sb->s_id);
        goto failed_mount;
 failed_mount4:
-        printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id);
+        ext4_msg(sb, KERN_ERR, "mount failed");
+        ext4_release_system_zone(sb);
        if (sbi->s_journal) {
                jbd2_journal_destroy(sbi->s_journal);
                sbi->s_journal = NULL;
        }
 failed_mount3:
+        if (sbi->s_flex_groups) {
+                if (is_vmalloc_addr(sbi->s_flex_groups))
+                        vfree(sbi->s_flex_groups);
+                else
+                        kfree(sbi->s_flex_groups);
+        }
        percpu_counter_destroy(&sbi->s_freeblocks_counter);
        percpu_counter_destroy(&sbi->s_freeinodes_counter);
        percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -2862,6 +2937,7 @@ failed_mount:
        brelse(bh);
 out_fail:
        sb->s_fs_info = NULL;
+        kfree(sbi->s_blockgroup_lock);
        kfree(sbi);
        lock_kernel();
        return ret;
@@ -2906,27 +2982,27 @@ static journal_t *ext4_get_journal(struct super_block *sb,
        journal_inode = ext4_iget(sb, journal_inum);
        if (IS_ERR(journal_inode)) {
-                printk(KERN_ERR "EXT4-fs: no journal found.\n");
+                ext4_msg(sb, KERN_ERR, "no journal found");
                return NULL;
        }
        if (!journal_inode->i_nlink) {
                make_bad_inode(journal_inode);
                iput(journal_inode);
-                printk(KERN_ERR "EXT4-fs: journal inode is deleted.\n");
+                ext4_msg(sb, KERN_ERR, "journal inode is deleted");
                return NULL;
        }
        jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
                  journal_inode, journal_inode->i_size);
        if (!S_ISREG(journal_inode->i_mode)) {
-                printk(KERN_ERR "EXT4-fs: invalid journal inode.\n");
+                ext4_msg(sb, KERN_ERR, "invalid journal inode");
                iput(journal_inode);
                return NULL;
        }
        journal = jbd2_journal_init_inode(journal_inode);
        if (!journal) {
-                printk(KERN_ERR "EXT4-fs: Could not load journal inode\n");
+                ext4_msg(sb, KERN_ERR, "Could not load journal inode");
                iput(journal_inode);
                return NULL;
        }
@@ -2950,22 +3026,22 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
        BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
-        bdev = ext4_blkdev_get(j_dev);
+        bdev = ext4_blkdev_get(j_dev, sb);
        if (bdev == NULL)
                return NULL;
        if (bd_claim(bdev, sb)) {
-                printk(KERN_ERR
+                ext4_msg(sb, KERN_ERR,
-                        "EXT4-fs: failed to claim external journal device.\n");
+                        "failed to claim external journal device");
                blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
                return NULL;
        }
        blocksize = sb->s_blocksize;
-        hblock = bdev_hardsect_size(bdev);
+        hblock = bdev_logical_block_size(bdev);
        if (blocksize < hblock) {
-                printk(KERN_ERR
+                ext4_msg(sb, KERN_ERR,
-                        "EXT4-fs: blocksize too small for journal device.\n");
+                        "blocksize too small for journal device");
                goto out_bdev;
        }
@@ -2973,8 +3049,8 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
        offset = EXT4_MIN_BLOCK_SIZE % blocksize;
        set_blocksize(bdev, blocksize);
        if (!(bh = __bread(bdev, sb_block, blocksize))) {
-                printk(KERN_ERR "EXT4-fs: couldn't read superblock of "
+                ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
-                       "external journal\n");
+                       "external journal");
                goto out_bdev;
        }
@@ -2982,14 +3058,14 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
        if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
            !(le32_to_cpu(es->s_feature_incompat) &
              EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
-                printk(KERN_ERR "EXT4-fs: external journal has "
+                ext4_msg(sb, KERN_ERR, "external journal has "
-                                        "bad superblock\n");
+                                        "bad superblock");
                brelse(bh);
                goto out_bdev;
        }
        if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
-                printk(KERN_ERR "EXT4-fs: journal UUID does not match\n");
+                ext4_msg(sb, KERN_ERR, "journal UUID does not match");
                brelse(bh);
                goto out_bdev;
        }
@@ -3001,25 +3077,26 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
        journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
                                        start, len, blocksize);
        if (!journal) {
-                printk(KERN_ERR "EXT4-fs: failed to create device journal\n");
+                ext4_msg(sb, KERN_ERR, "failed to create device journal");
                goto out_bdev;
        }
        journal->j_private = sb;
        ll_rw_block(READ, 1, &journal->j_sb_buffer);
        wait_on_buffer(journal->j_sb_buffer);
        if (!buffer_uptodate(journal->j_sb_buffer)) {
-                printk(KERN_ERR "EXT4-fs: I/O error on journal device\n");
+                ext4_msg(sb, KERN_ERR, "I/O error on journal device");
                goto out_journal;
        }
        if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
-                printk(KERN_ERR "EXT4-fs: External journal has more than one "
+                ext4_msg(sb, KERN_ERR, "External journal has more than one "
-                                        "user (unsupported) - %d\n",
+                                        "user (unsupported) - %d",
                        be32_to_cpu(journal->j_superblock->s_nr_users));
                goto out_journal;
        }
        EXT4_SB(sb)->journal_bdev = bdev;
        ext4_init_journal_params(sb, journal);
        return journal;
 out_journal:
        jbd2_journal_destroy(journal);
 out_bdev:
@@ -3041,8 +3118,8 @@ static int ext4_load_journal(struct super_block *sb,
        if (journal_devnum &&
            journal_devnum != le32_to_cpu(es->s_journal_dev)) {
-                printk(KERN_INFO "EXT4-fs: external journal device major/minor "
+                ext4_msg(sb, KERN_INFO, "external journal device major/minor "
-                        "numbers have changed\n");
+                        "numbers have changed");
                journal_dev = new_decode_dev(journal_devnum);
        } else
                journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
@@ -3054,24 +3131,23 @@ static int ext4_load_journal(struct super_block *sb,
         * crash?  For recovery, we need to check in advance whether we
         * can get read-write access to the device.
         */
        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
                if (sb->s_flags & MS_RDONLY) {
-                        printk(KERN_INFO "EXT4-fs: INFO: recovery "
+                        ext4_msg(sb, KERN_INFO, "INFO: recovery "
-                                        "required on readonly filesystem.\n");
+                                        "required on readonly filesystem");
                        if (really_read_only) {
-                                printk(KERN_ERR "EXT4-fs: write access "
+                                ext4_msg(sb, KERN_ERR, "write access "
-                                        "unavailable, cannot proceed.\n");
+                                        "unavailable, cannot proceed");
                                return -EROFS;
                        }
-                        printk(KERN_INFO "EXT4-fs: write access will "
+                        ext4_msg(sb, KERN_INFO, "write access will "
-                               "be enabled during recovery.\n");
+                               "be enabled during recovery");
                }
        }
        if (journal_inum && journal_dev) {
-                printk(KERN_ERR "EXT4-fs: filesystem has both journal "
+                ext4_msg(sb, KERN_ERR, "filesystem has both journal "
-                       "and inode journals!\n");
+                       "and inode journals!");
                return -EINVAL;
        }
@@ -3084,14 +3160,14 @@ static int ext4_load_journal(struct super_block *sb,
        }
        if (journal->j_flags & JBD2_BARRIER)
-                printk(KERN_INFO "EXT4-fs: barriers enabled\n");
+                ext4_msg(sb, KERN_INFO, "barriers enabled");
        else
-                printk(KERN_INFO "EXT4-fs: barriers disabled\n");
+                ext4_msg(sb, KERN_INFO, "barriers disabled");
        if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
                err = jbd2_journal_update_format(journal);
                if (err)  {
-                        printk(KERN_ERR "EXT4-fs: error updating journal.\n");
+                        ext4_msg(sb, KERN_ERR, "error updating journal");
                        jbd2_journal_destroy(journal);
                        return err;
                }
@@ -3103,7 +3179,7 @@ static int ext4_load_journal(struct super_block *sb,
                err = jbd2_journal_load(journal);
        if (err) {
-                printk(KERN_ERR "EXT4-fs: error loading journal.\n");
+                ext4_msg(sb, KERN_ERR, "error loading journal");
                jbd2_journal_destroy(journal);
                return err;
        }
@@ -3114,18 +3190,17 @@ static int ext4_load_journal(struct super_block *sb,
        if (journal_devnum &&
            journal_devnum != le32_to_cpu(es->s_journal_dev)) {
                es->s_journal_dev = cpu_to_le32(journal_devnum);
-                sb->s_dirt = 1;
                /* Make sure we flush the recovery flag to disk. */
-                ext4_commit_super(sb, es, 1);
+                ext4_commit_super(sb, 1);
        }
        return 0;
 }
-static int ext4_commit_super(struct super_block *sb,
+static int ext4_commit_super(struct super_block *sb, int sync)
-                              struct ext4_super_block *es, int sync)
 {
+        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
        struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
        int error = 0;
@@ -3140,8 +3215,8 @@ static int ext4_commit_super(struct super_block *sb,
                 * be remapped.  Nothing we can do but to retry the
                 * write and hope for the best.
                 */
-                printk(KERN_ERR "EXT4-fs: previous I/O error to "
+                ext4_msg(sb, KERN_ERR, "previous I/O error to "
-                       "superblock detected for %s.\n", sb->s_id);
+                       "superblock detected");
                clear_buffer_write_io_error(sbh);
                set_buffer_uptodate(sbh);
        }
@@ -3154,7 +3229,7 @@ static int ext4_commit_super(struct super_block *sb,
                                        &EXT4_SB(sb)->s_freeblocks_counter));
        es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
                                        &EXT4_SB(sb)->s_freeinodes_counter));
+        sb->s_dirt = 0;
        BUFFER_TRACE(sbh, "marking dirty");
        mark_buffer_dirty(sbh);
        if (sync) {
@@ -3164,8 +3239,8 @@ static int ext4_commit_super(struct super_block *sb,
                error = buffer_write_io_error(sbh);
                if (error) {
-                        printk(KERN_ERR "EXT4-fs: I/O error while writing "
+                        ext4_msg(sb, KERN_ERR, "I/O error while writing "
-                               "superblock for %s.\n", sb->s_id);
+                               "superblock");
                        clear_buffer_write_io_error(sbh);
                        set_buffer_uptodate(sbh);
                }
@@ -3173,7 +3248,6 @@ static int ext4_commit_super(struct super_block *sb,
        return error;
 }
 /*
 * Have we just finished recovery?  If so, and if we are mounting (or
 * remounting) the filesystem readonly, then we will end up with a
@@ -3192,14 +3266,11 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
        if (jbd2_journal_flush(journal) < 0)
                goto out;
-        lock_super(sb);
        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
            sb->s_flags & MS_RDONLY) {
                EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
-                sb->s_dirt = 0;
+                ext4_commit_super(sb, 1);
-                ext4_commit_super(sb, es, 1);
        }
-        unlock_super(sb);
 out:
        jbd2_journal_unlock_updates(journal);
@@ -3238,7 +3309,7 @@ static void ext4_clear_journal_err(struct super_block *sb,
                EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
                es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-                ext4_commit_super(sb, es, 1);
+                ext4_commit_super(sb, 1);
                jbd2_journal_clear_err(journal);
        }
@@ -3257,29 +3328,17 @@ int ext4_force_commit(struct super_block *sb)
                return 0;
        journal = EXT4_SB(sb)->s_journal;
-        if (journal) {
+        if (journal)
-                sb->s_dirt = 0;
                ret = ext4_journal_force_commit(journal);
-        }
        return ret;
 }
-/*
- * Ext4 always journals updates to the superblock itself, so we don't
- * have to propagate any other updates to the superblock on disk at this
- * point.  (We can probably nuke this function altogether, and remove
- * any mention to sb->s_dirt in all of fs/ext4; eventual cleanup...)
- */
 static void ext4_write_super(struct super_block *sb)
 {
-        if (EXT4_SB(sb)->s_journal) {
+        lock_super(sb);
-                if (mutex_trylock(&sb->s_lock) != 0)
+        ext4_commit_super(sb, 1);
-                        BUG();
+        unlock_super(sb);
-                sb->s_dirt = 0;
-        } else {
-                ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
-        }
 }
 static int ext4_sync_fs(struct super_block *sb, int wait)
@@ -3288,16 +3347,9 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
        tid_t target;
        trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
-        sb->s_dirt = 0;
+        if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
-        if (EXT4_SB(sb)->s_journal) {
+                if (wait)
-                if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal,
+                        jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
-                                              &target)) {
-                        if (wait)
-                                jbd2_log_wait_commit(EXT4_SB(sb)->s_journal,
-                                                     target);
-                }
-        } else {
-                ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
        }
        return ret;
 }
@@ -3310,34 +3362,32 @@ static int ext4_freeze(struct super_block *sb)
 {
        int error = 0;
        journal_t *journal;
-        sb->s_dirt = 0;
-        if (!(sb->s_flags & MS_RDONLY)) {
+        if (sb->s_flags & MS_RDONLY)
-                journal = EXT4_SB(sb)->s_journal;
+                return 0;
-                if (journal) {
+        journal = EXT4_SB(sb)->s_journal;
-                        /* Now we set up the journal barrier. */
-                        jbd2_journal_lock_updates(journal);
-                        /*
+        /* Now we set up the journal barrier. */
-                         * We don't want to clear needs_recovery flag when we
+        jbd2_journal_lock_updates(journal);
-                         * failed to flush the journal.
-                         */
-                        error = jbd2_journal_flush(journal);
-                        if (error < 0)
-                                goto out;
-                }
-                /* Journal blocked and flushed, clear needs_recovery flag. */
+        /*
-                EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+         * Don't clear the needs_recovery flag if we failed to flush
-                error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
+         * the journal.
-                if (error)
+         */
-                        goto out;
+        error = jbd2_journal_flush(journal);
+        if (error < 0) {
+        out:
+                jbd2_journal_unlock_updates(journal);
+                return error;
        }
+        /* Journal blocked and flushed, clear needs_recovery flag. */
+        EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+        error = ext4_commit_super(sb, 1);
+        if (error)
+                goto out;
        return 0;
-out:
-        jbd2_journal_unlock_updates(journal);
-        return error;
 }
 /*
@@ -3346,14 +3396,15 @@ out:
 */
 static int ext4_unfreeze(struct super_block *sb)
 {
-        if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) {
+        if (sb->s_flags & MS_RDONLY)
-                lock_super(sb);
+                return 0;
-                /* Reser the needs_recovery flag before the fs is unlocked. */
-                EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+        lock_super(sb);
-                ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
+        /* Reset the needs_recovery flag before the fs is unlocked. */
-                unlock_super(sb);
+        EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
-                jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+        ext4_commit_super(sb, 1);
-        }
+        unlock_super(sb);
+        jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
        return 0;
 }
@@ -3371,7 +3422,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
        int i;
 #endif
+        lock_kernel();
        /* Store the original options */
+        lock_super(sb);
        old_sb_flags = sb->s_flags;
        old_opts.s_mount_opt = sbi->s_mount_opt;
        old_opts.s_resuid = sbi->s_resuid;
@@ -3432,22 +3486,15 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                            (sbi->s_mount_state & EXT4_VALID_FS))
                                es->s_state = cpu_to_le16(sbi->s_mount_state);
-                        /*
+                        if (sbi->s_journal)
-                         * We have to unlock super so that we can wait for
-                         * transactions.
-                         */
-                        if (sbi->s_journal) {
-                                unlock_super(sb);
                                ext4_mark_recovery_complete(sb, es);
-                                lock_super(sb);
-                        }
                } else {
                        int ret;
                        if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
                                        ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
-                                printk(KERN_WARNING "EXT4-fs: %s: couldn't "
+                                ext4_msg(sb, KERN_WARNING, "couldn't "
                                       "remount RDWR because of unsupported "
-                                       "optional features (%x).\n", sb->s_id,
+                                       "optional features (%x)",
                                (le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
                                        ~EXT4_FEATURE_RO_COMPAT_SUPP));
                                err = -EROFS;
@@ -3456,17 +3503,15 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                        /*
                         * Make sure the group descriptor checksums
-                         * are sane.  If they aren't, refuse to
+                         * are sane.  If they aren't, refuse to remount r/w.
-                         * remount r/w.
                         */
                        for (g = 0; g < sbi->s_groups_count; g++) {
                                struct ext4_group_desc *gdp =
                                        ext4_get_group_desc(sb, g, NULL);
                                if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
-                                        printk(KERN_ERR
+                                        ext4_msg(sb, KERN_ERR,
-               "EXT4-fs: ext4_remount: "
+               "ext4_remount: Checksum for group %u failed (%u!=%u)",
-                "Checksum for group %u failed (%u!=%u)\n",
                g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
                                               le16_to_cpu(gdp->bg_checksum));
                                        err = -EINVAL;
@@ -3480,11 +3525,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                         * require a full umount/remount for now.
                         */
                        if (es->s_last_orphan) {
-                                printk(KERN_WARNING "EXT4-fs: %s: couldn't "
+                                ext4_msg(sb, KERN_WARNING, "Couldn't "
                                       "remount RDWR because of unprocessed "
                                       "orphan inode list.  Please "
-                                       "umount/remount instead.\n",
+                                       "umount/remount instead");
-                                       sb->s_id);
                                err = -EINVAL;
                                goto restore_opts;
                        }
@@ -3504,8 +3548,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                                sb->s_flags &= ~MS_RDONLY;
                }
        }
+        ext4_setup_system_zone(sb);
        if (sbi->s_journal == NULL)
-                ext4_commit_super(sb, es, 1);
+                ext4_commit_super(sb, 1);
 #ifdef CONFIG_QUOTA
        /* Release old quota file names */
@@ -3514,7 +3559,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                    old_opts.s_qf_names[i] != sbi->s_qf_names[i])
                        kfree(old_opts.s_qf_names[i]);
 #endif
+        unlock_super(sb);
+        unlock_kernel();
        return 0;
 restore_opts:
        sb->s_flags = old_sb_flags;
        sbi->s_mount_opt = old_opts.s_mount_opt;
@@ -3532,6 +3580,8 @@ restore_opts:
                sbi->s_qf_names[i] = old_opts.s_qf_names[i];
        }
 #endif
+        unlock_super(sb);
+        unlock_kernel();
        return err;
 }
@@ -3545,9 +3595,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
        if (test_opt(sb, MINIX_DF)) {
                sbi->s_overhead_last = 0;
        } else if (sbi->s_blocks_last != ext4_blocks_count(es)) {
-                ext4_group_t ngroups = sbi->s_groups_count, i;
+                ext4_group_t i, ngroups = ext4_get_groups_count(sb);
                ext4_fsblk_t overhead = 0;
-                smp_rmb();
                /*
                 * Compute the overhead (FS structures).  This is constant
@@ -3599,11 +3648,12 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
               le64_to_cpup((void *)es->s_uuid + sizeof(u64));
        buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
        buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
        return 0;
 }
-/* Helper function for writing quotas on sync - we need to start transaction before quota file
+/* Helper function for writing quotas on sync - we need to start transaction
- * is locked for write. Otherwise the are possible deadlocks:
+ * before quota file is locked for write. Otherwise there are possible deadlocks:
 * Process 1                         Process 2
 * ext4_create()                     quota_sync()
 *   jbd2_journal_start()                  write_dquot()
@@ -3627,7 +3677,7 @@ static int ext4_write_dquot(struct dquot *dquot)
        inode = dquot_to_inode(dquot);
        handle = ext4_journal_start(inode,
-                                        EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
+                                    EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
        if (IS_ERR(handle))
                return PTR_ERR(handle);
        ret = dquot_commit(dquot);
@@ -3643,7 +3693,7 @@ static int ext4_acquire_dquot(struct dquot *dquot)
        handle_t *handle;
        handle = ext4_journal_start(dquot_to_inode(dquot),
-                                        EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
+                                    EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
        if (IS_ERR(handle))
                return PTR_ERR(handle);
        ret = dquot_acquire(dquot);
@@ -3659,7 +3709,7 @@ static int ext4_release_dquot(struct dquot *dquot)
        handle_t *handle;
        handle = ext4_journal_start(dquot_to_inode(dquot),
-                                        EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
+                                    EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
        if (IS_ERR(handle)) {
                /* Release dquot anyway to avoid endless cycle in dqput() */
                dquot_release(dquot);
@@ -3707,7 +3757,7 @@ static int ext4_write_info(struct super_block *sb, int type)
 static int ext4_quota_on_mount(struct super_block *sb, int type)
 {
        return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
-                        EXT4_SB(sb)->s_jquota_fmt, type);
+                                  EXT4_SB(sb)->s_jquota_fmt, type);
 }
 /*
@@ -3738,9 +3788,9 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
        if (EXT4_SB(sb)->s_qf_names[type]) {
                /* Quotafile not in fs root? */
                if (path.dentry->d_parent != sb->s_root)
-                        printk(KERN_WARNING
+                        ext4_msg(sb, KERN_WARNING,
-                                "EXT4-fs: Quota file not on filesystem root. "
+                                "Quota file not on filesystem root. "
-                                "Journaled quota will not work.\n");
+                                "Journaled quota will not work");
        }
        /*
@@ -3823,8 +3873,8 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
        handle_t *handle = journal_current_handle();
        if (EXT4_SB(sb)->s_journal && !handle) {
-                printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)"
+                ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
-                        " cancelled because transaction is not started.\n",
+                        " cancelled because transaction is not started",
                        (unsigned long long)off, (unsigned long long)len);
                return -EIO;
        }
@@ -3878,10 +3928,10 @@ out:
 #endif
-static int ext4_get_sb(struct file_system_type *fs_type,
+static int ext4_get_sb(struct file_system_type *fs_type, int flags,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+                       const char *dev_name, void *data, struct vfsmount *mnt)
 {
-        return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
+        return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
 }
 static struct file_system_type ext4_fs_type = {
@@ -3893,14 +3943,14 @@ static struct file_system_type ext4_fs_type = {
 };
 #ifdef CONFIG_EXT4DEV_COMPAT
-static int ext4dev_get_sb(struct file_system_type *fs_type,
+static int ext4dev_get_sb(struct file_system_type *fs_type, int flags,
-        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+                          const char *dev_name, void *data,struct vfsmount *mnt)
 {
-        printk(KERN_WARNING "EXT4-fs: Update your userspace programs "
+        printk(KERN_WARNING "EXT4-fs (%s): Update your userspace programs "
-               "to mount using ext4\n");
+               "to mount using ext4\n", dev_name);
-        printk(KERN_WARNING "EXT4-fs: ext4dev backwards compatibility "
+        printk(KERN_WARNING "EXT4-fs (%s): ext4dev backwards compatibility "
-               "will go away by 2.6.31\n");
+               "will go away by 2.6.31\n", dev_name);
-        return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
+        return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
 }
 static struct file_system_type ext4dev_fs_type = {
@@ -3917,13 +3967,16 @@ static int __init init_ext4_fs(void)
 {
        int err;
+        err = init_ext4_system_zone();
+        if (err)
+                return err;
        ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
        if (!ext4_kset)
-                return -ENOMEM;
+                goto out4;
        ext4_proc_root = proc_mkdir("fs/ext4", NULL);
        err = init_ext4_mballoc();
        if (err)
-                return err;
+                goto out3;
        err = init_ext4_xattr();
        if (err)
@@ -3948,6 +4001,11 @@ out1:
        exit_ext4_xattr();
 out2:
        exit_ext4_mballoc();
+out3:
+        remove_proc_entry("fs/ext4", NULL);
+        kset_unregister(ext4_kset);
+out4:
+        exit_ext4_system_zone();
        return err;
 }
@@ -3962,6 +4020,7 @@ static void __exit exit_ext4_fs(void)
        exit_ext4_mballoc();
        remove_proc_entry("fs/ext4", NULL);
        kset_unregister(ext4_kset);
+        exit_ext4_system_zone();
 }
 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index b42602298087..923990e4f16e 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -241,7 +241,7 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
        while (*fclus < cluster) {
                /* prevent the infinite loop of cluster chain */
                if (*fclus > limit) {
-                        fat_fs_panic(sb, "%s: detected the cluster chain loop"
+                        fat_fs_error(sb, "%s: detected the cluster chain loop"
                                     " (i_pos %lld)", __func__,
                                     MSDOS_I(inode)->i_pos);
                        nr = -EIO;
@@ -252,7 +252,7 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
                if (nr < 0)
                        goto out;
                else if (nr == FAT_ENT_FREE) {
-                        fat_fs_panic(sb, "%s: invalid cluster chain"
+                        fat_fs_error(sb, "%s: invalid cluster chain"
                                     " (i_pos %lld)", __func__,
                                     MSDOS_I(inode)->i_pos);
                        nr = -EIO;
@@ -285,7 +285,7 @@ static int fat_bmap_cluster(struct inode *inode, int cluster)
        if (ret < 0)
                return ret;
        else if (ret == FAT_ENT_EOF) {
-                fat_fs_panic(sb, "%s: request beyond EOF (i_pos %lld)",
+                fat_fs_error(sb, "%s: request beyond EOF (i_pos %lld)",
                             __func__, MSDOS_I(inode)->i_pos);
                return -EIO;
        }
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 3a7f603b6982..3b8e71b412fd 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -840,7 +840,7 @@ const struct file_operations fat_dir_operations = {
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = fat_compat_dir_ioctl,
 #endif
-        .fsync          = file_fsync,
+        .fsync          = fat_file_fsync,
 };
 static int fat_get_short_entry(struct inode *dir, loff_t *pos,
@@ -967,7 +967,7 @@ static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots)
                        de++;
                        nr_slots--;
                }
-                mark_buffer_dirty(bh);
+                mark_buffer_dirty_inode(bh, dir);
                if (IS_DIRSYNC(dir))
                        err = sync_dirty_buffer(bh);
                brelse(bh);
@@ -1001,7 +1001,7 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo)
                de--;
                nr_slots--;
        }
-        mark_buffer_dirty(bh);
+        mark_buffer_dirty_inode(bh, dir);
        if (IS_DIRSYNC(dir))
                err = sync_dirty_buffer(bh);
        brelse(bh);
@@ -1051,7 +1051,7 @@ static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used,
                }
                memset(bhs[n]->b_data, 0, sb->s_blocksize);
                set_buffer_uptodate(bhs[n]);
-                mark_buffer_dirty(bhs[n]);
+                mark_buffer_dirty_inode(bhs[n], dir);
                n++;
                blknr++;
@@ -1131,7 +1131,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts)
        de[0].size = de[1].size = 0;
        memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de));
        set_buffer_uptodate(bhs[0]);
-        mark_buffer_dirty(bhs[0]);
+        mark_buffer_dirty_inode(bhs[0], dir);
        err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE);
        if (err)
@@ -1193,7 +1193,7 @@ static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots,
                        slots += copy;
                        size -= copy;
                        set_buffer_uptodate(bhs[n]);
-                        mark_buffer_dirty(bhs[n]);
+                        mark_buffer_dirty_inode(bhs[n], dir);
                        if (!size)
                                break;
                        n++;
@@ -1293,7 +1293,7 @@ found:
                for (i = 0; i < long_bhs; i++) {
                        int copy = min_t(int, sb->s_blocksize - offset, size);
                        memcpy(bhs[i]->b_data + offset, slots, copy);
-                        mark_buffer_dirty(bhs[i]);
+                        mark_buffer_dirty_inode(bhs[i], dir);
                        offset = 0;
                        slots += copy;
                        size -= copy;
@@ -1304,7 +1304,7 @@ found:
                        /* Fill the short name slot. */
                        int copy = min_t(int, sb->s_blocksize - offset, size);
                        memcpy(bhs[i]->b_data + offset, slots, copy);
-                        mark_buffer_dirty(bhs[i]);
+                        mark_buffer_dirty_inode(bhs[i], dir);
                        if (IS_DIRSYNC(dir))
                                err = sync_dirty_buffer(bhs[i]);
                }
@@ -1334,7 +1334,7 @@ found:
                        goto error_remove;
                }
                if (dir->i_size & (sbi->cluster_size - 1)) {
-                        fat_fs_panic(sb, "Odd directory size");
+                        fat_fs_error(sb, "Odd directory size");
                        dir->i_size = (dir->i_size + sbi->cluster_size - 1)
                                & ~((loff_t)sbi->cluster_size - 1);
                }
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index ea440d65819c..adb0e72a176d 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -17,6 +17,10 @@
 #define VFAT_SFN_CREATE_WIN95   0x0100 /* emulate win95 rule for create */
 #define VFAT_SFN_CREATE_WINNT   0x0200 /* emulate winnt rule for create */
+#define FAT_ERRORS_CONT         1      /* ignore error and continue */
+#define FAT_ERRORS_PANIC        2      /* panic on error */
+#define FAT_ERRORS_RO           3      /* remount r/o on error */
 struct fat_mount_options {
        uid_t fs_uid;
        gid_t fs_gid;
@@ -26,6 +30,7 @@ struct fat_mount_options {
        char *iocharset;          /* Charset used for filename input/display */
        unsigned short shortname; /* flags for shortname display/create rule */
        unsigned char name_check; /* r = relaxed, n = normal, s = strict */
+        unsigned char errors;     /* On error: continue, panic, remount-ro */
        unsigned short allow_utime;/* permission for setting the [am]time */
        unsigned quiet:1,         /* set = fake successful chmods and chowns */
                 showexec:1,      /* set = only set x bit for com/exe/bat */
@@ -74,6 +79,7 @@ struct msdos_sb_info {
        int fatent_shift;
        struct fatent_operations *fatent_ops;
+        struct inode *fat_inode;
        spinlock_t inode_hash_lock;
        struct hlist_head inode_hashtable[FAT_HASH_SIZE];
@@ -251,6 +257,7 @@ struct fat_entry {
        } u;
        int nr_bhs;
        struct buffer_head *bhs[2];
+        struct inode *fat_inode;
 };
 static inline void fatent_init(struct fat_entry *fatent)
@@ -259,6 +266,7 @@ static inline void fatent_init(struct fat_entry *fatent)
        fatent->entry = 0;
        fatent->u.ent32_p = NULL;
        fatent->bhs[0] = fatent->bhs[1] = NULL;
+        fatent->fat_inode = NULL;
 }
 static inline void fatent_set_entry(struct fat_entry *fatent, int entry)
@@ -275,6 +283,7 @@ static inline void fatent_brelse(struct fat_entry *fatent)
                brelse(fatent->bhs[i]);
        fatent->nr_bhs = 0;
        fatent->bhs[0] = fatent->bhs[1] = NULL;
+        fatent->fat_inode = NULL;
 }
 extern void fat_ent_access_init(struct super_block *sb);
@@ -296,6 +305,8 @@ extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
 extern void fat_truncate(struct inode *inode);
 extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
                       struct kstat *stat);
+extern int fat_file_fsync(struct file *file, struct dentry *dentry,
+                          int datasync);
 /* fat/inode.c */
 extern void fat_attach(struct inode *inode, loff_t i_pos);
@@ -310,7 +321,7 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent,
 extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
                            struct inode *i2);
 /* fat/misc.c */
-extern void fat_fs_panic(struct super_block *s, const char *fmt, ...)
+extern void fat_fs_error(struct super_block *s, const char *fmt, ...)
        __attribute__ ((format (printf, 2, 3))) __cold;
 extern void fat_clusters_flush(struct super_block *sb);
 extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index da6eea47872f..a81037721a6f 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -73,6 +73,8 @@ static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent,
        struct buffer_head **bhs = fatent->bhs;
        WARN_ON(blocknr < MSDOS_SB(sb)->fat_start);
+        fatent->fat_inode = MSDOS_SB(sb)->fat_inode;
        bhs[0] = sb_bread(sb, blocknr);
        if (!bhs[0])
                goto err;
@@ -103,6 +105,7 @@ static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent,
        struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
        WARN_ON(blocknr < MSDOS_SB(sb)->fat_start);
+        fatent->fat_inode = MSDOS_SB(sb)->fat_inode;
        fatent->bhs[0] = sb_bread(sb, blocknr);
        if (!fatent->bhs[0]) {
                printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n",
@@ -167,9 +170,9 @@ static void fat12_ent_put(struct fat_entry *fatent, int new)
        }
        spin_unlock(&fat12_entry_lock);
-        mark_buffer_dirty(fatent->bhs[0]);
+        mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode);
        if (fatent->nr_bhs == 2)
-                mark_buffer_dirty(fatent->bhs[1]);
+                mark_buffer_dirty_inode(fatent->bhs[1], fatent->fat_inode);
 }
 static void fat16_ent_put(struct fat_entry *fatent, int new)
@@ -178,7 +181,7 @@ static void fat16_ent_put(struct fat_entry *fatent, int new)
                new = EOF_FAT16;
        *fatent->u.ent16_p = cpu_to_le16(new);
-        mark_buffer_dirty(fatent->bhs[0]);
+        mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode);
 }
 static void fat32_ent_put(struct fat_entry *fatent, int new)
@@ -189,7 +192,7 @@ static void fat32_ent_put(struct fat_entry *fatent, int new)
        WARN_ON(new & 0xf0000000);
        new |= le32_to_cpu(*fatent->u.ent32_p) & ~0x0fffffff;
        *fatent->u.ent32_p = cpu_to_le32(new);
-        mark_buffer_dirty(fatent->bhs[0]);
+        mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode);
 }
 static int fat12_ent_next(struct fat_entry *fatent)
@@ -345,7 +348,7 @@ int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry)
        if (entry < FAT_START_ENT || sbi->max_cluster <= entry) {
                fatent_brelse(fatent);
-                fat_fs_panic(sb, "invalid access to FAT (entry 0x%08x)", entry);
+                fat_fs_error(sb, "invalid access to FAT (entry 0x%08x)", entry);
                return -EIO;
        }
@@ -381,7 +384,7 @@ static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs,
                        }
                        memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize);
                        set_buffer_uptodate(c_bh);
-                        mark_buffer_dirty(c_bh);
+                        mark_buffer_dirty_inode(c_bh, sbi->fat_inode);
                        if (sb->s_flags & MS_SYNCHRONOUS)
                                err = sync_dirty_buffer(c_bh);
                        brelse(c_bh);
@@ -557,7 +560,7 @@ int fat_free_clusters(struct inode *inode, int cluster)
                        err = cluster;
                        goto error;
                } else if (cluster == FAT_ENT_FREE) {
-                        fat_fs_panic(sb, "%s: deleting FAT entry beyond EOF",
+                        fat_fs_error(sb, "%s: deleting FAT entry beyond EOF",
                                     __func__);
                        err = -EIO;
                        goto error;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 0a7f4a9918b3..b28ea646ff60 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -18,106 +18,112 @@
 #include <linux/security.h>
 #include "fat.h"
-int fat_generic_ioctl(struct inode *inode, struct file *filp,
+static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
-                      unsigned int cmd, unsigned long arg)
+{
+        u32 attr;
+        mutex_lock(&inode->i_mutex);
+        attr = fat_make_attrs(inode);
+        mutex_unlock(&inode->i_mutex);
+        return put_user(attr, user_attr);
+}
+static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
 {
+        struct inode *inode = file->f_path.dentry->d_inode;
        struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
-        u32 __user *user_attr = (u32 __user *)arg;
+        int is_dir = S_ISDIR(inode->i_mode);
+        u32 attr, oldattr;
+        struct iattr ia;
+        int err;
-        switch (cmd) {
+        err = get_user(attr, user_attr);
-        case FAT_IOCTL_GET_ATTRIBUTES:
+        if (err)
-        {
+                goto out;
-                u32 attr;
-                mutex_lock(&inode->i_mutex);
+        mutex_lock(&inode->i_mutex);
-                attr = fat_make_attrs(inode);
+        err = mnt_want_write(file->f_path.mnt);
-                mutex_unlock(&inode->i_mutex);
+        if (err)
+                goto out_unlock_inode;
-                return put_user(attr, user_attr);
+        /*
+         * ATTR_VOLUME and ATTR_DIR cannot be changed; this also
+         * prevents the user from turning us into a VFAT
+         * longname entry.  Also, we obviously can't set
+         * any of the NTFS attributes in the high 24 bits.
+         */
+        attr &= 0xff & ~(ATTR_VOLUME | ATTR_DIR);
+        /* Merge in ATTR_VOLUME and ATTR_DIR */
+        attr |= (MSDOS_I(inode)->i_attrs & ATTR_VOLUME) |
+                (is_dir ? ATTR_DIR : 0);
+        oldattr = fat_make_attrs(inode);
+        /* Equivalent to a chmod() */
+        ia.ia_valid = ATTR_MODE | ATTR_CTIME;
+        ia.ia_ctime = current_fs_time(inode->i_sb);
+        if (is_dir)
+                ia.ia_mode = fat_make_mode(sbi, attr, S_IRWXUGO);
+        else {
+                ia.ia_mode = fat_make_mode(sbi, attr,
+                        S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO));
        }
-        case FAT_IOCTL_SET_ATTRIBUTES:
-        {
-                u32 attr, oldattr;
-                int err, is_dir = S_ISDIR(inode->i_mode);
-                struct iattr ia;
-                err = get_user(attr, user_attr);
+        /* The root directory has no attributes */
-                if (err)
+        if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) {
-                        return err;
+                err = -EINVAL;
+                goto out_drop_write;
+        }
-                mutex_lock(&inode->i_mutex);
+        if (sbi->options.sys_immutable &&
+            ((attr | oldattr) & ATTR_SYS) &&
-                err = mnt_want_write(filp->f_path.mnt);
+            !capable(CAP_LINUX_IMMUTABLE)) {
-                if (err)
+                err = -EPERM;
-                        goto up_no_drop_write;
+                goto out_drop_write;
+        }
-                /*
-                 * ATTR_VOLUME and ATTR_DIR cannot be changed; this also
-                 * prevents the user from turning us into a VFAT
-                 * longname entry.  Also, we obviously can't set
-                 * any of the NTFS attributes in the high 24 bits.
-                 */
-                attr &= 0xff & ~(ATTR_VOLUME | ATTR_DIR);
-                /* Merge in ATTR_VOLUME and ATTR_DIR */
-                attr |= (MSDOS_I(inode)->i_attrs & ATTR_VOLUME) |
-                        (is_dir ? ATTR_DIR : 0);
-                oldattr = fat_make_attrs(inode);
-                /* Equivalent to a chmod() */
-                ia.ia_valid = ATTR_MODE | ATTR_CTIME;
-                ia.ia_ctime = current_fs_time(inode->i_sb);
-                if (is_dir)
-                        ia.ia_mode = fat_make_mode(sbi, attr, S_IRWXUGO);
-                else {
-                        ia.ia_mode = fat_make_mode(sbi, attr,
-                                S_IRUGO | S_IWUGO | (inode->i_mode & S_IXUGO));
-                }
-                /* The root directory has no attributes */
+        /*
-                if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) {
+         * The security check is questionable...  We single
-                        err = -EINVAL;
+         * out the RO attribute for checking by the security
-                        goto up;
+         * module, just because it maps to a file mode.
-                }
+         */
+        err = security_inode_setattr(file->f_path.dentry, &ia);
+        if (err)
+                goto out_drop_write;
-                if (sbi->options.sys_immutable) {
+        /* This MUST be done before doing anything irreversible... */
-                        if ((attr | oldattr) & ATTR_SYS) {
+        err = fat_setattr(file->f_path.dentry, &ia);
-                                if (!capable(CAP_LINUX_IMMUTABLE)) {
+        if (err)
-                                        err = -EPERM;
+                goto out_drop_write;
-                                        goto up;
-                                }
+        fsnotify_change(file->f_path.dentry, ia.ia_valid);
-                        }
+        /* Mirror ATTR_SYS into S_IMMUTABLE; touch no other inode flag. */
+        if (sbi->options.sys_immutable) {
-                }
+                if (attr & ATTR_SYS)
+                        inode->i_flags |= S_IMMUTABLE;
+                else
+                        inode->i_flags &= ~S_IMMUTABLE;
+        }
-                /*
+        fat_save_attrs(inode, attr);
-                 * The security check is questionable...  We single
+        mark_inode_dirty(inode);
-                 * out the RO attribute for checking by the security
+out_drop_write:
-                 * module, just because it maps to a file mode.
+        mnt_drop_write(file->f_path.mnt);
-                 */
+out_unlock_inode:
-                err = security_inode_setattr(filp->f_path.dentry, &ia);
+        mutex_unlock(&inode->i_mutex);
-                if (err)
+out:
-                        goto up;
+        return err;
+}
-                /* This MUST be done before doing anything irreversible... */
-                err = fat_setattr(filp->f_path.dentry, &ia);
-                if (err)
-                        goto up;
-                fsnotify_change(filp->f_path.dentry, ia.ia_valid);
-                if (sbi->options.sys_immutable) {
-                        if (attr & ATTR_SYS)
-                                inode->i_flags |= S_IMMUTABLE;
-                        else
-                                inode->i_flags &= S_IMMUTABLE;
-                }
-                fat_save_attrs(inode, attr);
+int fat_generic_ioctl(struct inode *inode, struct file *filp,
-                mark_inode_dirty(inode);
+                      unsigned int cmd, unsigned long arg)
-up:
+{
-                mnt_drop_write(filp->f_path.mnt);
+        u32 __user *user_attr = (u32 __user *)arg;
-up_no_drop_write:
-                mutex_unlock(&inode->i_mutex);
+        switch (cmd) {
-                return err;
+        case FAT_IOCTL_GET_ATTRIBUTES:
-        }
+                return fat_ioctl_get_attributes(inode, user_attr);
+        case FAT_IOCTL_SET_ATTRIBUTES:
+                return fat_ioctl_set_attributes(filp, user_attr);
        default:
                return -ENOTTY; /* Inappropriate ioctl for device */
        }
@@ -133,6 +139,18 @@ static int fat_file_release(struct inode *inode, struct file *filp)
        return 0;
 }
+int fat_file_fsync(struct file *filp, struct dentry *dentry, int datasync)
+{
+        struct inode *inode = dentry->d_inode;
+        int res, err;
+        res = simple_fsync(filp, dentry, datasync);
+        err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
+        return res ? res : err;
+}
 const struct file_operations fat_file_operations = {
        .llseek         = generic_file_llseek,
        .read           = do_sync_read,
@@ -142,7 +160,7 @@ const struct file_operations fat_file_operations = {
        .mmap           = generic_file_mmap,
        .release        = fat_file_release,
        .ioctl          = fat_generic_ioctl,
-        .fsync          = file_fsync,
+        .fsync          = fat_file_fsync,
        .splice_read    = generic_file_splice_read,
 };
@@ -213,7 +231,7 @@ static int fat_free(struct inode *inode, int skip)
                        fatent_brelse(&fatent);
                        return 0;
                } else if (ret == FAT_ENT_FREE) {
-                        fat_fs_panic(sb,
+                        fat_fs_error(sb,
                                     "%s: invalid cluster chain (i_pos %lld)",
                                     __func__, MSDOS_I(inode)->i_pos);
                        ret = -EIO;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 296785a0dec8..304b411cb8bc 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -76,7 +76,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
                return 0;
        if (iblock != MSDOS_I(inode)->mmu_private >> sb->s_blocksize_bits) {
-                fat_fs_panic(sb, "corrupted file size (i_pos %lld, %lld)",
+                fat_fs_error(sb, "corrupted file size (i_pos %lld, %lld)",
                        MSDOS_I(inode)->i_pos, MSDOS_I(inode)->mmu_private);
                return -EIO;
        }
@@ -441,16 +441,35 @@ static void fat_clear_inode(struct inode *inode)
 static void fat_write_super(struct super_block *sb)
 {
+        lock_super(sb);
        sb->s_dirt = 0;
        if (!(sb->s_flags & MS_RDONLY))
                fat_clusters_flush(sb);
+        unlock_super(sb);
+}
+static int fat_sync_fs(struct super_block *sb, int wait)
+{
+        lock_super(sb);
+        fat_clusters_flush(sb);
+        sb->s_dirt = 0;
+        unlock_super(sb);
+        return 0;
 }
 static void fat_put_super(struct super_block *sb)
 {
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
+        lock_kernel();
+        if (sb->s_dirt)
+                fat_write_super(sb);
+        iput(sbi->fat_inode);
        if (sbi->nls_disk) {
                unload_nls(sbi->nls_disk);
                sbi->nls_disk = NULL;
@@ -467,6 +486,8 @@ static void fat_put_super(struct super_block *sb)
        sb->s_fs_info = NULL;
        kfree(sbi);
+        unlock_kernel();
 }
 static struct kmem_cache *fat_inode_cachep;
@@ -632,6 +653,7 @@ static const struct super_operations fat_sops = {
        .delete_inode   = fat_delete_inode,
        .put_super      = fat_put_super,
        .write_super    = fat_write_super,
+        .sync_fs        = fat_sync_fs,
        .statfs         = fat_statfs,
        .clear_inode    = fat_clear_inode,
        .remount_fs     = fat_remount,
@@ -834,6 +856,12 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt)
                seq_puts(m, ",flush");
        if (opts->tz_utc)
                seq_puts(m, ",tz=UTC");
+        if (opts->errors == FAT_ERRORS_CONT)
+                seq_puts(m, ",errors=continue");
+        else if (opts->errors == FAT_ERRORS_PANIC)
+                seq_puts(m, ",errors=panic");
+        else
+                seq_puts(m, ",errors=remount-ro");
        return 0;
 }
@@ -846,7 +874,8 @@ enum {
        Opt_charset, Opt_shortname_lower, Opt_shortname_win95,
        Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
        Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
-        Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err,
+        Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
+        Opt_err_panic, Opt_err_ro, Opt_err,
 };
 static const match_table_t fat_tokens = {
@@ -869,6 +898,11 @@ static const match_table_t fat_tokens = {
        {Opt_showexec, "showexec"},
        {Opt_debug, "debug"},
        {Opt_immutable, "sys_immutable"},
+        {Opt_flush, "flush"},
+        {Opt_tz_utc, "tz=UTC"},
+        {Opt_err_cont, "errors=continue"},
+        {Opt_err_panic, "errors=panic"},
+        {Opt_err_ro, "errors=remount-ro"},
        {Opt_obsolate, "conv=binary"},
        {Opt_obsolate, "conv=text"},
        {Opt_obsolate, "conv=auto"},
@@ -880,8 +914,6 @@ static const match_table_t fat_tokens = {
        {Opt_obsolate, "cvf_format=%20s"},
        {Opt_obsolate, "cvf_options=%100s"},
        {Opt_obsolate, "posix"},
-        {Opt_flush, "flush"},
-        {Opt_tz_utc, "tz=UTC"},
        {Opt_err, NULL},
 };
 static const match_table_t msdos_tokens = {
@@ -951,6 +983,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
        opts->numtail = 1;
        opts->usefree = opts->nocase = 0;
        opts->tz_utc = 0;
+        opts->errors = FAT_ERRORS_RO;
        *debug = 0;
        if (!options)
@@ -1043,6 +1076,15 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
                case Opt_tz_utc:
                        opts->tz_utc = 1;
                        break;
+                case Opt_err_cont:
+                        opts->errors = FAT_ERRORS_CONT;
+                        break;
+                case Opt_err_panic:
+                        opts->errors = FAT_ERRORS_PANIC;
+                        break;
+                case Opt_err_ro:
+                        opts->errors = FAT_ERRORS_RO;
+                        break;
                /* msdos specific */
                case Opt_dots:
@@ -1174,7 +1216,7 @@ static int fat_read_root(struct inode *inode)
 int fat_fill_super(struct super_block *sb, void *data, int silent,
                   const struct inode_operations *fs_dir_inode_ops, int isvfat)
 {
-        struct inode *root_inode = NULL;
+        struct inode *root_inode = NULL, *fat_inode = NULL;
        struct buffer_head *bh;
        struct fat_boot_sector *b;
        struct msdos_sb_info *sbi;
@@ -1414,6 +1456,11 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
        }
        error = -ENOMEM;
+        fat_inode = new_inode(sb);
+        if (!fat_inode)
+                goto out_fail;
+        MSDOS_I(fat_inode)->i_pos = 0;
+        sbi->fat_inode = fat_inode;
        root_inode = new_inode(sb);
        if (!root_inode)
                goto out_fail;
@@ -1439,6 +1486,8 @@ out_invalid:
                       " on dev %s.\n", sb->s_id);
 out_fail:
+        if (fat_inode)
+                iput(fat_inode);
        if (root_inode)
                iput(root_inode);
        if (sbi->nls_io)
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index ac39ebcc1496..a6c20473dfd7 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -12,14 +12,19 @@
 #include "fat.h"
 /*
- * fat_fs_panic reports a severe file system problem and sets the file system
+ * fat_fs_error reports a file system problem that might indicate data
- * read-only. The file system can be made writable again by remounting it.
+ * corruption/inconsistency. Depending on the 'errors' mount option, either
+ * panic() is called, or only the error message is printed and nothing else
+ * is done, or the filesystem is remounted read-only (default behavior).
+ * In case the file system is remounted read-only, it can be made writable
+ * again by remounting it.
 */
-void fat_fs_panic(struct super_block *s, const char *fmt, ...)
+void fat_fs_error(struct super_block *s, const char *fmt, ...)
 {
+        struct fat_mount_options *opts = &MSDOS_SB(s)->options;
        va_list args;
-        printk(KERN_ERR "FAT: Filesystem panic (dev %s)\n", s->s_id);
+        printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id);
        printk(KERN_ERR "    ");
        va_start(args, fmt);
@@ -27,13 +32,14 @@ void fat_fs_panic(struct super_block *s, const char *fmt, ...)
        va_end(args);
        printk("\n");
-        if (!(s->s_flags & MS_RDONLY)) {
+        if (opts->errors == FAT_ERRORS_PANIC)
+                panic("    FAT fs panic from previous error\n");
+        else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) {
                s->s_flags |= MS_RDONLY;
                printk(KERN_ERR "    File system has been set read-only\n");
        }
 }
+EXPORT_SYMBOL_GPL(fat_fs_error);
-EXPORT_SYMBOL_GPL(fat_fs_panic);
 /* Flushes the number of free clusters on FAT32 */
 /* XXX: Need to write one per FSINFO block.  Currently only writes 1 */
@@ -124,7 +130,7 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
                        mark_inode_dirty(inode);
        }
        if (new_fclus != (inode->i_blocks >> (sbi->cluster_bits - 9))) {
-                fat_fs_panic(sb, "clusters badly computed (%d != %llu)",
+                fat_fs_error(sb, "clusters badly computed (%d != %llu)",
                             new_fclus,
                             (llu)(inode->i_blocks >> (sbi->cluster_bits - 9)));
                fat_cache_inval_inode(inode);
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index da3f361a37dd..82f88733b681 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -544,7 +544,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
                int start = MSDOS_I(new_dir)->i_logstart;
                dotdot_de->start = cpu_to_le16(start);
                dotdot_de->starthi = cpu_to_le16(start >> 16);
-                mark_buffer_dirty(dotdot_bh);
+                mark_buffer_dirty_inode(dotdot_bh, old_inode);
                if (IS_DIRSYNC(new_dir)) {
                        err = sync_dirty_buffer(dotdot_bh);
                        if (err)
@@ -586,7 +586,7 @@ error_dotdot:
                int start = MSDOS_I(old_dir)->i_logstart;
                dotdot_de->start = cpu_to_le16(start);
                dotdot_de->starthi = cpu_to_le16(start >> 16);
-                mark_buffer_dirty(dotdot_bh);
+                mark_buffer_dirty_inode(dotdot_bh, old_inode);
                corrupt |= sync_dirty_buffer(dotdot_bh);
        }
 error_inode:
@@ -608,7 +608,7 @@ error_inode:
                sinfo.bh = NULL;
        }
        if (corrupt < 0) {
-                fat_fs_panic(new_dir->i_sb,
+                fat_fs_error(new_dir->i_sb,
                             "%s: Filesystem corrupted (i_pos %lld)",
                             __func__, sinfo.i_pos);
        }
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index a0e00e3a46e9..8d6fdcfd41df 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -965,7 +965,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
                int start = MSDOS_I(new_dir)->i_logstart;
                dotdot_de->start = cpu_to_le16(start);
                dotdot_de->starthi = cpu_to_le16(start >> 16);
-                mark_buffer_dirty(dotdot_bh);
+                mark_buffer_dirty_inode(dotdot_bh, old_inode);
                if (IS_DIRSYNC(new_dir)) {
                        err = sync_dirty_buffer(dotdot_bh);
                        if (err)
@@ -1009,7 +1009,7 @@ error_dotdot:
                int start = MSDOS_I(old_dir)->i_logstart;
                dotdot_de->start = cpu_to_le16(start);
                dotdot_de->starthi = cpu_to_le16(start >> 16);
-                mark_buffer_dirty(dotdot_bh);
+                mark_buffer_dirty_inode(dotdot_bh, old_inode);
                corrupt |= sync_dirty_buffer(dotdot_bh);
        }
 error_inode:
@@ -1030,7 +1030,7 @@ error_inode:
                sinfo.bh = NULL;
        }
        if (corrupt < 0) {
-                fat_fs_panic(new_dir->i_sb,
+                fat_fs_error(new_dir->i_sb,
                             "%s: Filesystem corrupted (i_pos %lld)",
                             __func__, sinfo.i_pos);
        }
diff --git a/fs/file_table.c b/fs/file_table.c
index 54018fe48840..334ce39881f8 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -214,7 +214,7 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry,
         */
        if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) {
                file_take_write(file);
-                error = mnt_want_write(mnt);
+                error = mnt_clone_write(mnt);
                WARN_ON(error);
        }
        return error;
@@ -399,6 +399,44 @@ too_bad:
        return 0;
 }
+/**
+ *      mark_files_ro - mark all files read-only
+ *      @sb: superblock in question
+ *
+ *      All files are marked read-only.  We don't care about pending
+ *      delete files so this should be used in 'force' mode only.
+ */
+void mark_files_ro(struct super_block *sb)
+{
+        struct file *f;
+retry:
+        file_list_lock();
+        list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
+                struct vfsmount *mnt;
+                if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
+                       continue;
+                if (!file_count(f))
+                        continue;
+                if (!(f->f_mode & FMODE_WRITE))
+                        continue;
+                f->f_mode &= ~FMODE_WRITE;
+                if (file_check_writeable(f) != 0)
+                        continue;
+                file_release_write(f);
+                mnt = mntget(f->f_path.mnt);
+                file_list_unlock();
+                /*
+                 * This can sleep, so we can't hold
+                 * the file_list_lock() spinlock.
+                 */
+                mnt_drop_write(mnt);
+                mntput(mnt);
+                goto retry;
+        }
+        file_list_unlock();
+}
 void __init files_init(unsigned long mempages)
 { 
        int n; 
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 1dacda831577..cdbd1654e4cd 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -80,12 +80,16 @@ vxfs_put_super(struct super_block *sbp)
 {
        struct vxfs_sb_info     *infp = VXFS_SBI(sbp);
+        lock_kernel();
        vxfs_put_fake_inode(infp->vsi_fship);
        vxfs_put_fake_inode(infp->vsi_ilist);
        vxfs_put_fake_inode(infp->vsi_stilist);
        brelse(infp->vsi_bp);
        kfree(infp);
+        unlock_kernel();
 }
 /**
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 91013ff7dd53..40308e98c6a4 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -64,6 +64,28 @@ static void writeback_release(struct backing_dev_info *bdi)
        clear_bit(BDI_pdflush, &bdi->state);
 }
+static noinline void block_dump___mark_inode_dirty(struct inode *inode)
+{
+        if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
+                struct dentry *dentry;
+                const char *name = "?";
+                dentry = d_find_alias(inode);
+                if (dentry) {
+                        spin_lock(&dentry->d_lock);
+                        name = (const char *) dentry->d_name.name;
+                }
+                printk(KERN_DEBUG
+                       "%s(%d): dirtied inode %lu (%s) on %s\n",
+                       current->comm, task_pid_nr(current), inode->i_ino,
+                       name, inode->i_sb->s_id);
+                if (dentry) {
+                        spin_unlock(&dentry->d_lock);
+                        dput(dentry);
+                }
+        }
+}
 /**
 *      __mark_inode_dirty -    internal function
 *      @inode: inode to mark
@@ -114,23 +136,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
        if ((inode->i_state & flags) == flags)
                return;
-        if (unlikely(block_dump)) {
+        if (unlikely(block_dump))
-                struct dentry *dentry = NULL;
+                block_dump___mark_inode_dirty(inode);
-                const char *name = "?";
-                if (!list_empty(&inode->i_dentry)) {
-                        dentry = list_entry(inode->i_dentry.next,
-                                            struct dentry, d_alias);
-                        if (dentry && dentry->d_name.name)
-                                name = (const char *) dentry->d_name.name;
-                }
-                if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev"))
-                        printk(KERN_DEBUG
-                               "%s(%d): dirtied inode %lu (%s) on %s\n",
-                               current->comm, task_pid_nr(current), inode->i_ino,
-                               name, inode->i_sb->s_id);
-        }
        spin_lock(&inode_lock);
        if ((inode->i_state & flags) != flags) {
@@ -289,7 +296,6 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
        int ret;
        BUG_ON(inode->i_state & I_SYNC);
-        WARN_ON(inode->i_state & I_NEW);
        /* Set I_SYNC, reset I_DIRTY */
        dirty = inode->i_state & I_DIRTY;
@@ -314,7 +320,6 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
        }
        spin_lock(&inode_lock);
-        WARN_ON(inode->i_state & I_NEW);
        inode->i_state &= ~I_SYNC;
        if (!(inode->i_state & I_FREEING)) {
                if (!(inode->i_state & I_DIRTY) &&
@@ -679,55 +684,6 @@ void sync_inodes_sb(struct super_block *sb, int wait)
 }
 /**
- * sync_inodes - writes all inodes to disk
- * @wait: wait for completion
- *
- * sync_inodes() goes through each super block's dirty inode list, writes the
- * inodes out, waits on the writeout and puts the inodes back on the normal
- * list.
- *
- * This is for sys_sync().  fsync_dev() uses the same algorithm.  The subtle
- * part of the sync functions is that the blockdev "superblock" is processed
- * last.  This is because the write_inode() function of a typical fs will
- * perform no I/O, but will mark buffers in the blockdev mapping as dirty.
- * What we want to do is to perform all that dirtying first, and then write
- * back all those inode blocks via the blockdev mapping in one sweep.  So the
- * additional (somewhat redundant) sync_blockdev() calls here are to make
- * sure that really happens.  Because if we call sync_inodes_sb(wait=1) with
- * outstanding dirty inodes, the writeback goes block-at-a-time within the
- * filesystem's write_inode().  This is extremely slow.
- */
-static void __sync_inodes(int wait)
-{
-        struct super_block *sb;
-        spin_lock(&sb_lock);
-restart:
-        list_for_each_entry(sb, &super_blocks, s_list) {
-                sb->s_count++;
-                spin_unlock(&sb_lock);
-                down_read(&sb->s_umount);
-                if (sb->s_root) {
-                        sync_inodes_sb(sb, wait);
-                        sync_blockdev(sb->s_bdev);
-                }
-                up_read(&sb->s_umount);
-                spin_lock(&sb_lock);
-                if (__put_super_and_need_restart(sb))
-                        goto restart;
-        }
-        spin_unlock(&sb_lock);
-}
-void sync_inodes(int wait)
-{
-        __sync_inodes(0);
-        if (wait)
-                __sync_inodes(1);
-}
-/**
 * write_inode_now      -       write an inode to disk
 * @inode: inode to write to disk
 * @sync: whether the write should be synchronous or not
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 72437065f6ad..e95eeb445e58 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -3,5 +3,6 @@
 #
 obj-$(CONFIG_FUSE_FS) += fuse.o
+obj-$(CONFIG_CUSE) += cuse.o
 fuse-objs := dev.o dir.o file.o inode.o control.o
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
new file mode 100644
index 000000000000..de792dcf3274
--- /dev/null
+++ b/fs/fuse/cuse.c
@@ -0,0 +1,610 @@
+/*
+ * CUSE: Character device in Userspace
+ *
+ * Copyright (C) 2008-2009  SUSE Linux Products GmbH
+ * Copyright (C) 2008-2009  Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ *
+ * CUSE enables character devices to be implemented from userland much
+ * like FUSE allows filesystems.  On initialization /dev/cuse is
+ * created.  By opening the file and replying to the CUSE_INIT request
+ * userland CUSE server can create a character device.  After that the
+ * operation is very similar to FUSE.
+ *
+ * A CUSE instance involves the following objects.
+ *
+ * cuse_conn    : contains fuse_conn and serves as bonding structure
+ * channel      : file handle connected to the userland CUSE server
+ * cdev         : the implemented character device
+ * dev          : generic device for cdev
+ *
+ * Note that 'channel' is what 'dev' is in FUSE.  As CUSE deals with
+ * devices, it's called 'channel' to reduce confusion.
+ *
+ * channel determines when the character device dies.  When channel is
+ * closed, everything begins to destruct.  The cuse_conn is taken off
+ * the lookup table preventing further access from cdev, cdev and
+ * generic device are removed and the base reference of cuse_conn is
+ * put.
+ *
+ * On each open, the matching cuse_conn is looked up and if found an
+ * additional reference is taken which is released when the file is
+ * closed.
+ */
+#include <linux/fuse.h>
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/kdev_t.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <linux/magic.h>
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/stat.h>
+#include "fuse_i.h"
+#define CUSE_CONNTBL_LEN        64
+struct cuse_conn {
+        struct list_head        list;   /* linked on cuse_conntbl */
+        struct fuse_conn        fc;     /* fuse connection */
+        struct cdev             *cdev;  /* associated character device */
+        struct device           *dev;   /* device representing @cdev */
+        /* init parameters, set once during initialization */
+        bool                    unrestricted_ioctl;
+};
+static DEFINE_SPINLOCK(cuse_lock);              /* protects cuse_conntbl */
+static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN];
+static struct class *cuse_class;
+static struct cuse_conn *fc_to_cc(struct fuse_conn *fc)
+{
+        return container_of(fc, struct cuse_conn, fc);
+}
+static struct list_head *cuse_conntbl_head(dev_t devt)
+{
+        return &cuse_conntbl[(MAJOR(devt) + MINOR(devt)) % CUSE_CONNTBL_LEN];
+}
+/**************************************************************************
+ * CUSE frontend operations
+ *
+ * These are file operations for the character device.
+ *
+ * On open, CUSE opens a file from the FUSE mnt and stores it to
+ * private_data of the open file.  All other ops call FUSE ops on the
+ * FUSE file.
+ */
+static ssize_t cuse_read(struct file *file, char __user *buf, size_t count,
+                         loff_t *ppos)
+{
+        loff_t pos = 0;
+        return fuse_direct_io(file, buf, count, &pos, 0);
+}
+static ssize_t cuse_write(struct file *file, const char __user *buf,
+                          size_t count, loff_t *ppos)
+{
+        loff_t pos = 0;
+        /*
+         * No locking or generic_write_checks(), the server is
+         * responsible for locking and sanity checks.
+         */
+        return fuse_direct_io(file, buf, count, &pos, 1);
+}
+static int cuse_open(struct inode *inode, struct file *file)
+{
+        dev_t devt = inode->i_cdev->dev;
+        struct cuse_conn *cc = NULL, *pos;
+        int rc;
+        /* look up and get the connection */
+        spin_lock(&cuse_lock);
+        list_for_each_entry(pos, cuse_conntbl_head(devt), list)
+                if (pos->dev->devt == devt) {
+                        fuse_conn_get(&pos->fc);
+                        cc = pos;
+                        break;
+                }
+        spin_unlock(&cuse_lock);
+        /* dead? */
+        if (!cc)
+                return -ENODEV;
+        /*
+         * Generic permission check is already done against the chrdev
+         * file, proceed to open.
+         */
+        rc = fuse_do_open(&cc->fc, 0, file, 0);
+        if (rc)
+                fuse_conn_put(&cc->fc);
+        return rc;
+}
+static int cuse_release(struct inode *inode, struct file *file)
+{
+        struct fuse_file *ff = file->private_data;
+        struct fuse_conn *fc = ff->fc;
+        fuse_sync_release(ff, file->f_flags);
+        fuse_conn_put(fc);
+        return 0;
+}
+static long cuse_file_ioctl(struct file *file, unsigned int cmd,
+                            unsigned long arg)
+{
+        struct fuse_file *ff = file->private_data;
+        struct cuse_conn *cc = fc_to_cc(ff->fc);
+        unsigned int flags = 0;
+        if (cc->unrestricted_ioctl)
+                flags |= FUSE_IOCTL_UNRESTRICTED;
+        return fuse_do_ioctl(file, cmd, arg, flags);
+}
+static long cuse_file_compat_ioctl(struct file *file, unsigned int cmd,
+                                   unsigned long arg)
+{
+        struct fuse_file *ff = file->private_data;
+        struct cuse_conn *cc = fc_to_cc(ff->fc);
+        unsigned int flags = FUSE_IOCTL_COMPAT;
+        if (cc->unrestricted_ioctl)
+                flags |= FUSE_IOCTL_UNRESTRICTED;
+        return fuse_do_ioctl(file, cmd, arg, flags);
+}
+static const struct file_operations cuse_frontend_fops = {
+        .owner                  = THIS_MODULE,
+        .read                   = cuse_read,
+        .write                  = cuse_write,
+        .open                   = cuse_open,
+        .release                = cuse_release,
+        .unlocked_ioctl         = cuse_file_ioctl,
+        .compat_ioctl           = cuse_file_compat_ioctl,
+        .poll                   = fuse_file_poll,
+};
+/**************************************************************************
+ * CUSE channel initialization and destruction
+ */
+struct cuse_devinfo {
+        const char              *name;
+};
+/**
+ * cuse_parse_one - parse one key=value pair
+ * @pp: i/o parameter for the current position
+ * @end: points to one past the end of the packed string
+ * @keyp: out parameter for key
+ * @valp: out parameter for value
+ *
+ * *@pp points to packed strings - "key0=val0\0key1=val1\0" which ends
+ * at @end - 1.  This function parses one pair and sets *@keyp to the
+ * start of the key and *@valp to the start of the value.  Note that
+ * the original string is modified such that the key string is
+ * terminated with '\0'.  *@pp is updated to point to the next string.
+ *
+ * RETURNS:
+ * 1 on successful parse, 0 on EOF, -errno on failure.
+ */
+static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp)
+{
+        char *p = *pp;
+        char *key, *val;
+        while (p < end && *p == '\0')
+                p++;
+        if (p == end)
+                return 0;
+        if (end[-1] != '\0') {
+                printk(KERN_ERR "CUSE: info not properly terminated\n");
+                return -EINVAL;
+        }
+        key = val = p;
+        p += strlen(p);
+        if (valp) {
+                strsep(&val, "=");
+                if (!val)
+                        val = key + strlen(key);
+                key = strstrip(key);
+                val = strstrip(val);
+        } else
+                key = strstrip(key);
+        if (!strlen(key)) {
+                printk(KERN_ERR "CUSE: zero length info key specified\n");
+                return -EINVAL;
+        }
+        *pp = p;
+        *keyp = key;
+        if (valp)
+                *valp = val;
+        return 1;
+}
+/**
+ * cuse_parse_devinfo - parse device info
+ * @p: device info string
+ * @len: length of device info string
+ * @devinfo: out parameter for parsed device info
+ *
+ * Parse @p to extract device info and store it into @devinfo.  String
+ * pointed to by @p is modified by parsing and @devinfo points into
+ * it, so @p shouldn't be freed while @devinfo is in use.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo)
+{
+        char *end = p + len;
+        char *key, *val;
+        int rc;
+        while (true) {
+                rc = cuse_parse_one(&p, end, &key, &val);
+                if (rc < 0)
+                        return rc;
+                if (!rc)
+                        break;
+                if (strcmp(key, "DEVNAME") == 0)
+                        devinfo->name = val;
+                else
+                        printk(KERN_WARNING "CUSE: unknown device info \"%s\"\n",
+                               key);
+        }
+        if (!devinfo->name || !strlen(devinfo->name)) {
+                printk(KERN_ERR "CUSE: DEVNAME unspecified\n");
+                return -EINVAL;
+        }
+        return 0;
+}
+static void cuse_gendev_release(struct device *dev)
+{
+        kfree(dev);
+}
+/**
+ * cuse_process_init_reply - finish initializing CUSE channel
+ *
+ * This function creates the character device and sets up all the
+ * required data structures for it.  Please read the comment at the
+ * top of this file for high level overview.
+ */
+static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
+{
+        struct cuse_conn *cc = fc_to_cc(fc);
+        struct cuse_init_out *arg = &req->misc.cuse_init_out;
+        struct page *page = req->pages[0];
+        struct cuse_devinfo devinfo = { };
+        struct device *dev;
+        struct cdev *cdev;
+        dev_t devt;
+        int rc;
+        if (req->out.h.error ||
+            arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) {
+                goto err;
+        }
+        fc->minor = arg->minor;
+        fc->max_read = max_t(unsigned, arg->max_read, 4096);
+        fc->max_write = max_t(unsigned, arg->max_write, 4096);
+        /* parse init reply */
+        cc->unrestricted_ioctl = arg->flags & CUSE_UNRESTRICTED_IOCTL;
+        rc = cuse_parse_devinfo(page_address(page), req->out.args[1].size,
+                                &devinfo);
+        if (rc)
+                goto err;
+        /* determine and reserve devt */
+        devt = MKDEV(arg->dev_major, arg->dev_minor);
+        if (!MAJOR(devt))
+                rc = alloc_chrdev_region(&devt, MINOR(devt), 1, devinfo.name);
+        else
+                rc = register_chrdev_region(devt, 1, devinfo.name);
+        if (rc) {
+                printk(KERN_ERR "CUSE: failed to register chrdev region\n");
+                goto err;
+        }
+        /* devt determined, create device */
+        rc = -ENOMEM;
+        dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+        if (!dev)
+                goto err_region;
+        device_initialize(dev);
+        dev_set_uevent_suppress(dev, 1);
+        dev->class = cuse_class;
+        dev->devt = devt;
+        dev->release = cuse_gendev_release;
+        dev_set_drvdata(dev, cc);
+        dev_set_name(dev, "%s", devinfo.name);
+        rc = device_add(dev);
+        if (rc)
+                goto err_device;
+        /* register cdev */
+        rc = -ENOMEM;
+        cdev = cdev_alloc();
+        if (!cdev)
+                goto err_device;
+        cdev->owner = THIS_MODULE;
+        cdev->ops = &cuse_frontend_fops;
+        rc = cdev_add(cdev, devt, 1);
+        if (rc)
+                goto err_cdev;
+        cc->dev = dev;
+        cc->cdev = cdev;
+        /* make the device available */
+        spin_lock(&cuse_lock);
+        list_add(&cc->list, cuse_conntbl_head(devt));
+        spin_unlock(&cuse_lock);
+        /* announce device availability */
+        dev_set_uevent_suppress(dev, 0);
+        kobject_uevent(&dev->kobj, KOBJ_ADD);
+out:
+        __free_page(page);
+        return;
+err_cdev:
+        cdev_del(cdev);
+err_device:
+        put_device(dev);
+err_region:
+        unregister_chrdev_region(devt, 1);
+err:
+        fc->conn_error = 1;
+        goto out;
+}
+static int cuse_send_init(struct cuse_conn *cc)
+{
+        int rc;
+        struct fuse_req *req;
+        struct page *page;
+        struct fuse_conn *fc = &cc->fc;
+        struct cuse_init_in *arg;
+        BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE);
+        req = fuse_get_req(fc);
+        if (IS_ERR(req)) {
+                rc = PTR_ERR(req);
+                goto err;
+        }
+        rc = -ENOMEM;
+        page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+        if (!page)
+                goto err_put_req;
+        arg = &req->misc.cuse_init_in;
+        arg->major = FUSE_KERNEL_VERSION;
+        arg->minor = FUSE_KERNEL_MINOR_VERSION;
+        arg->flags |= CUSE_UNRESTRICTED_IOCTL;
+        req->in.h.opcode = CUSE_INIT;
+        req->in.numargs = 1;
+        req->in.args[0].size = sizeof(struct cuse_init_in);
+        req->in.args[0].value = arg;
+        req->out.numargs = 2;
+        req->out.args[0].size = sizeof(struct cuse_init_out);
+        req->out.args[0].value = &req->misc.cuse_init_out;
+        req->out.args[1].size = CUSE_INIT_INFO_MAX;
+        req->out.argvar = 1;
+        req->out.argpages = 1;
+        req->pages[0] = page;
+        req->num_pages = 1;
+        req->end = cuse_process_init_reply;
+        fuse_request_send_background(fc, req);
+        return 0;
+err_put_req:
+        fuse_put_request(fc, req);
+err:
+        return rc;
+}
+static void cuse_fc_release(struct fuse_conn *fc)
+{
+        struct cuse_conn *cc = fc_to_cc(fc);
+        kfree(cc);
+}
+/**
+ * cuse_channel_open - open method for /dev/cuse
+ * @inode: inode for /dev/cuse
+ * @file: file struct being opened
+ *
+ * Userland CUSE server can create a CUSE device by opening /dev/cuse
+ * and replying to the initialization request kernel sends.  This
+ * function is responsible for handling CUSE device initialization.
+ * Because the fd opened by this function is used during
+ * initialization, this function only creates cuse_conn and sends
+ * init.  The rest is delegated to cuse_process_init_reply().
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int cuse_channel_open(struct inode *inode, struct file *file)
+{
+        struct cuse_conn *cc;
+        int rc;
+        /* set up cuse_conn */
+        cc = kzalloc(sizeof(*cc), GFP_KERNEL);
+        if (!cc)
+                return -ENOMEM;
+        fuse_conn_init(&cc->fc);
+        INIT_LIST_HEAD(&cc->list);
+        cc->fc.release = cuse_fc_release;
+        cc->fc.connected = 1;
+        cc->fc.blocked = 0;
+        rc = cuse_send_init(cc);
+        if (rc) {
+                fuse_conn_put(&cc->fc);
+                return rc;
+        }
+        file->private_data = &cc->fc;   /* channel owns base reference to cc */
+        return 0;
+}
+/**
+ * cuse_channel_release - release method for /dev/cuse
+ * @inode: inode for /dev/cuse
+ * @file: file struct being closed
+ *
+ * Disconnect the channel, deregister CUSE device and initiate
+ * destruction by putting the default reference.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int cuse_channel_release(struct inode *inode, struct file *file)
+{
+        struct cuse_conn *cc = fc_to_cc(file->private_data);
+        int rc;
+        /* remove from the conntbl, no more access from this point on */
+        spin_lock(&cuse_lock);
+        list_del_init(&cc->list);
+        spin_unlock(&cuse_lock);
+        /* remove device */
+        if (cc->dev)
+                device_unregister(cc->dev);
+        if (cc->cdev) {
+                unregister_chrdev_region(cc->cdev->dev, 1);
+                cdev_del(cc->cdev);
+        }
+        /* kill connection and shutdown channel */
+        fuse_conn_kill(&cc->fc);
+        rc = fuse_dev_release(inode, file);     /* puts the base reference */
+        return rc;
+}
+static struct file_operations cuse_channel_fops; /* initialized during init */
+/**************************************************************************
+ * Misc stuff and module initialization
+ *
+ * CUSE exports the same set of attributes to sysfs as fusectl.
+ */
+static ssize_t cuse_class_waiting_show(struct device *dev,
+                                       struct device_attribute *attr, char *buf)
+{
+        struct cuse_conn *cc = dev_get_drvdata(dev);
+        return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting));
+}
+static ssize_t cuse_class_abort_store(struct device *dev,
+                                      struct device_attribute *attr,
+                                      const char *buf, size_t count)
+{
+        struct cuse_conn *cc = dev_get_drvdata(dev);
+        fuse_abort_conn(&cc->fc);
+        return count;
+}
+static struct device_attribute cuse_class_dev_attrs[] = {
+        __ATTR(waiting, S_IFREG | 0400, cuse_class_waiting_show, NULL),
+        __ATTR(abort, S_IFREG | 0200, NULL, cuse_class_abort_store),
+        { }
+};
+static struct miscdevice cuse_miscdev = {
+        .minor          = MISC_DYNAMIC_MINOR,
+        .name           = "cuse",
+        .fops           = &cuse_channel_fops,
+};
+static int __init cuse_init(void)
+{
+        int i, rc;
+        /* init conntbl */
+        for (i = 0; i < CUSE_CONNTBL_LEN; i++)
+                INIT_LIST_HEAD(&cuse_conntbl[i]);
+        /* inherit and extend fuse_dev_operations */
+        cuse_channel_fops               = fuse_dev_operations;
+        cuse_channel_fops.owner         = THIS_MODULE;
+        cuse_channel_fops.open          = cuse_channel_open;
+        cuse_channel_fops.release       = cuse_channel_release;
+        cuse_class = class_create(THIS_MODULE, "cuse");
+        if (IS_ERR(cuse_class))
+                return PTR_ERR(cuse_class);
+        cuse_class->dev_attrs = cuse_class_dev_attrs;
+        rc = misc_register(&cuse_miscdev);
+        if (rc) {
+                class_destroy(cuse_class);
+                return rc;
+        }
+        return 0;
+}
+static void __exit cuse_exit(void)
+{
+        misc_deregister(&cuse_miscdev);
+        class_destroy(cuse_class);
+}
+module_init(cuse_init);
+module_exit(cuse_exit);
+MODULE_AUTHOR("Tejun Heo <tj@kernel.org>");
+MODULE_DESCRIPTION("Character device in Userspace");
+MODULE_LICENSE("GPL");
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ba76b68c52ff..8fed2ed12f38 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -46,6 +46,7 @@ struct fuse_req *fuse_request_alloc(void)
                fuse_request_init(req);
        return req;
 }
+EXPORT_SYMBOL_GPL(fuse_request_alloc);
 struct fuse_req *fuse_request_alloc_nofs(void)
 {
@@ -124,6 +125,7 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc)
        atomic_dec(&fc->num_waiting);
        return ERR_PTR(err);
 }
+EXPORT_SYMBOL_GPL(fuse_get_req);
 /*
 * Return request in fuse_file->reserved_req.  However that may
@@ -208,6 +210,7 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
                        fuse_request_free(req);
        }
 }
+EXPORT_SYMBOL_GPL(fuse_put_request);
 static unsigned len_args(unsigned numargs, struct fuse_arg *args)
 {
@@ -282,7 +285,7 @@ __releases(&fc->lock)
                        wake_up_all(&fc->blocked_waitq);
                }
                if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
-                    fc->connected) {
+                    fc->connected && fc->bdi_initialized) {
                        clear_bdi_congested(&fc->bdi, READ);
                        clear_bdi_congested(&fc->bdi, WRITE);
                }
@@ -400,6 +403,7 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
        }
        spin_unlock(&fc->lock);
 }
+EXPORT_SYMBOL_GPL(fuse_request_send);
 static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
                                            struct fuse_req *req)
@@ -408,7 +412,8 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
        fc->num_background++;
        if (fc->num_background == FUSE_MAX_BACKGROUND)
                fc->blocked = 1;
-        if (fc->num_background == FUSE_CONGESTION_THRESHOLD) {
+        if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
+            fc->bdi_initialized) {
                set_bdi_congested(&fc->bdi, READ);
                set_bdi_congested(&fc->bdi, WRITE);
        }
@@ -439,6 +444,7 @@ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
        req->isreply = 1;
        fuse_request_send_nowait(fc, req);
 }
+EXPORT_SYMBOL_GPL(fuse_request_send_background);
 /*
 * Called under fc->lock
@@ -1105,8 +1111,9 @@ void fuse_abort_conn(struct fuse_conn *fc)
        }
        spin_unlock(&fc->lock);
 }
+EXPORT_SYMBOL_GPL(fuse_abort_conn);
-static int fuse_dev_release(struct inode *inode, struct file *file)
+int fuse_dev_release(struct inode *inode, struct file *file)
 {
        struct fuse_conn *fc = fuse_get_conn(file);
        if (fc) {
@@ -1120,6 +1127,7 @@ static int fuse_dev_release(struct inode *inode, struct file *file)
        return 0;
 }
+EXPORT_SYMBOL_GPL(fuse_dev_release);
 static int fuse_dev_fasync(int fd, struct file *file, int on)
 {
@@ -1142,6 +1150,7 @@ const struct file_operations fuse_dev_operations = {
        .release        = fuse_dev_release,
        .fasync         = fuse_dev_fasync,
 };
+EXPORT_SYMBOL_GPL(fuse_dev_operations);
 static struct miscdevice fuse_miscdevice = {
        .minor = FUSE_MINOR,
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 8b8eebc5614b..b3089a083d30 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -362,19 +362,6 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
 }
 /*
- * Synchronous release for the case when something goes wrong in CREATE_OPEN
- */
-static void fuse_sync_release(struct fuse_conn *fc, struct fuse_file *ff,
-                              u64 nodeid, int flags)
-{
-        fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE);
-        ff->reserved_req->force = 1;
-        fuse_request_send(fc, ff->reserved_req);
-        fuse_put_request(fc, ff->reserved_req);
-        kfree(ff);
-}
-/*
 * Atomic create+open operation
 *
 * If the filesystem doesn't support this, then fall back to separate
@@ -445,12 +432,14 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
                goto out_free_ff;
        fuse_put_request(fc, req);
+        ff->fh = outopen.fh;
+        ff->nodeid = outentry.nodeid;
+        ff->open_flags = outopen.open_flags;
        inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
                          &outentry.attr, entry_attr_timeout(&outentry), 0);
        if (!inode) {
                flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
-                ff->fh = outopen.fh;
+                fuse_sync_release(ff, flags);
-                fuse_sync_release(fc, ff, outentry.nodeid, flags);
                fuse_send_forget(fc, forget_req, outentry.nodeid, 1);
                return -ENOMEM;
        }
@@ -460,11 +449,11 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
        fuse_invalidate_attr(dir);
        file = lookup_instantiate_filp(nd, entry, generic_file_open);
        if (IS_ERR(file)) {
-                ff->fh = outopen.fh;
+                fuse_sync_release(ff, flags);
-                fuse_sync_release(fc, ff, outentry.nodeid, flags);
                return PTR_ERR(file);
        }
-        fuse_finish_open(inode, file, ff, &outopen);
+        file->private_data = fuse_file_get(ff);
+        fuse_finish_open(inode, file);
        return 0;
 out_free_ff:
@@ -1035,7 +1024,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
        req->out.argpages = 1;
        req->num_pages = 1;
        req->pages[0] = page;
-        fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR);
+        fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, FUSE_READDIR);
        fuse_request_send(fc, req);
        nbytes = req->out.args[0].size;
        err = req->out.h.error;
@@ -1101,12 +1090,14 @@ static void fuse_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
 static int fuse_dir_open(struct inode *inode, struct file *file)
 {
-        return fuse_open_common(inode, file, 1);
+        return fuse_open_common(inode, file, true);
 }
 static int fuse_dir_release(struct inode *inode, struct file *file)
 {
-        return fuse_release_common(inode, file, 1);
+        fuse_release_common(file, FUSE_RELEASEDIR);
+        return 0;
 }
 static int fuse_dir_fsync(struct file *file, struct dentry *de, int datasync)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 06f30e965676..fce6ce694fde 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -12,13 +12,13 @@
 #include <linux/slab.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/module.h>
 static const struct file_operations fuse_direct_io_file_operations;
-static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
+static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
-                          struct fuse_open_out *outargp)
+                          int opcode, struct fuse_open_out *outargp)
 {
-        struct fuse_conn *fc = get_fuse_conn(inode);
        struct fuse_open_in inarg;
        struct fuse_req *req;
        int err;
@@ -31,8 +31,8 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
        inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
        if (!fc->atomic_o_trunc)
                inarg.flags &= ~O_TRUNC;
-        req->in.h.opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
+        req->in.h.opcode = opcode;
-        req->in.h.nodeid = get_node_id(inode);
+        req->in.h.nodeid = nodeid;
        req->in.numargs = 1;
        req->in.args[0].size = sizeof(inarg);
        req->in.args[0].value = &inarg;
@@ -49,22 +49,27 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
 struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
 {
        struct fuse_file *ff;
        ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
-        if (ff) {
+        if (unlikely(!ff))
-                ff->reserved_req = fuse_request_alloc();
+                return NULL;
-                if (!ff->reserved_req) {
-                        kfree(ff);
+        ff->fc = fc;
-                        return NULL;
+        ff->reserved_req = fuse_request_alloc();
-                } else {
+        if (unlikely(!ff->reserved_req)) {
-                        INIT_LIST_HEAD(&ff->write_entry);
+                kfree(ff);
-                        atomic_set(&ff->count, 0);
+                return NULL;
-                        spin_lock(&fc->lock);
-                        ff->kh = ++fc->khctr;
-                        spin_unlock(&fc->lock);
-                }
-                RB_CLEAR_NODE(&ff->polled_node);
-                init_waitqueue_head(&ff->poll_wait);
        }
+        INIT_LIST_HEAD(&ff->write_entry);
+        atomic_set(&ff->count, 0);
+        RB_CLEAR_NODE(&ff->polled_node);
+        init_waitqueue_head(&ff->poll_wait);
+        spin_lock(&fc->lock);
+        ff->kh = ++fc->khctr;
+        spin_unlock(&fc->lock);
        return ff;
 }
@@ -74,7 +79,7 @@ void fuse_file_free(struct fuse_file *ff)
        kfree(ff);
 }
-static struct fuse_file *fuse_file_get(struct fuse_file *ff)
+struct fuse_file *fuse_file_get(struct fuse_file *ff)
 {
        atomic_inc(&ff->count);
        return ff;
@@ -82,40 +87,65 @@ static struct fuse_file *fuse_file_get(struct fuse_file *ff)
 static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
 {
-        dput(req->misc.release.dentry);
+        path_put(&req->misc.release.path);
-        mntput(req->misc.release.vfsmount);
 }
 static void fuse_file_put(struct fuse_file *ff)
 {
        if (atomic_dec_and_test(&ff->count)) {
                struct fuse_req *req = ff->reserved_req;
-                struct inode *inode = req->misc.release.dentry->d_inode;
-                struct fuse_conn *fc = get_fuse_conn(inode);
                req->end = fuse_release_end;
-                fuse_request_send_background(fc, req);
+                fuse_request_send_background(ff->fc, req);
                kfree(ff);
        }
 }
-void fuse_finish_open(struct inode *inode, struct file *file,
+/*
+ * Allocate a fuse_file on @fc, send OPEN (or OPENDIR when @isdir) for
+ * @nodeid and, on success, store the reply's file handle and open flags
+ * in the fuse_file and attach it to @file->private_data.
+ *
+ * Returns 0 on success, -ENOMEM if the fuse_file cannot be allocated,
+ * or the error returned by fuse_send_open().
+ */
+int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
-                      struct fuse_file *ff, struct fuse_open_out *outarg)
+                 bool isdir)
 {
-        if (outarg->open_flags & FOPEN_DIRECT_IO)
+        struct fuse_open_out reply;
+        struct fuse_file *ff = fuse_file_alloc(fc);
+        int err;
+        if (!ff)
+                return -ENOMEM;
+        err = fuse_send_open(fc, nodeid, file,
+                             isdir ? FUSE_OPENDIR : FUSE_OPEN, &reply);
+        if (err) {
+                fuse_file_free(ff);
+                return err;
+        }
+        /* the direct-I/O flag is dropped for directories */
+        if (isdir)
+                reply.open_flags &= ~FOPEN_DIRECT_IO;
+        ff->nodeid = nodeid;
+        ff->fh = reply.fh;
+        ff->open_flags = reply.open_flags;
+        file->private_data = fuse_file_get(ff);
+        return 0;
+}
+EXPORT_SYMBOL_GPL(fuse_do_open);
+void fuse_finish_open(struct inode *inode, struct file *file)
+{
+        struct fuse_file *ff = file->private_data;
+        if (ff->open_flags & FOPEN_DIRECT_IO)
                file->f_op = &fuse_direct_io_file_operations;
-        if (!(outarg->open_flags & FOPEN_KEEP_CACHE))
+        if (!(ff->open_flags & FOPEN_KEEP_CACHE))
                invalidate_inode_pages2(inode->i_mapping);
-        if (outarg->open_flags & FOPEN_NONSEEKABLE)
+        if (ff->open_flags & FOPEN_NONSEEKABLE)
                nonseekable_open(inode, file);
-        ff->fh = outarg->fh;
-        file->private_data = fuse_file_get(ff);
 }
-int fuse_open_common(struct inode *inode, struct file *file, int isdir)
+int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
 {
        struct fuse_conn *fc = get_fuse_conn(inode);
-        struct fuse_open_out outarg;
-        struct fuse_file *ff;
        int err;
        /* VFS checks this, but only _after_ ->open() */
@@ -126,78 +156,85 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir)
        if (err)
                return err;
-        ff = fuse_file_alloc(fc);
+        err = fuse_do_open(fc, get_node_id(inode), file, isdir);
-        if (!ff)
-                return -ENOMEM;
-        err = fuse_send_open(inode, file, isdir, &outarg);
        if (err)
-                fuse_file_free(ff);
+                return err;
-        else {
-                if (isdir)
-                        outarg.open_flags &= ~FOPEN_DIRECT_IO;
-                fuse_finish_open(inode, file, ff, &outarg);
-        }
-        return err;
+        fuse_finish_open(inode, file);
+        return 0;
 }
-void fuse_release_fill(struct fuse_file *ff, u64 nodeid, int flags, int opcode)
+static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
 {
+        struct fuse_conn *fc = ff->fc;
        struct fuse_req *req = ff->reserved_req;
        struct fuse_release_in *inarg = &req->misc.release.in;
+        spin_lock(&fc->lock);
+        list_del(&ff->write_entry);
+        if (!RB_EMPTY_NODE(&ff->polled_node))
+                rb_erase(&ff->polled_node, &fc->polled_files);
+        spin_unlock(&fc->lock);
+        wake_up_interruptible_sync(&ff->poll_wait);
        inarg->fh = ff->fh;
        inarg->flags = flags;
        req->in.h.opcode = opcode;
-        req->in.h.nodeid = nodeid;
+        req->in.h.nodeid = ff->nodeid;
        req->in.numargs = 1;
        req->in.args[0].size = sizeof(struct fuse_release_in);
        req->in.args[0].value = inarg;
 }
-int fuse_release_common(struct inode *inode, struct file *file, int isdir)
+void fuse_release_common(struct file *file, int opcode)
 {
-        struct fuse_file *ff = file->private_data;
+        struct fuse_file *ff;
-        if (ff) {
+        struct fuse_req *req;
-                struct fuse_conn *fc = get_fuse_conn(inode);
-                struct fuse_req *req = ff->reserved_req;
-                fuse_release_fill(ff, get_node_id(inode), file->f_flags,
-                                  isdir ? FUSE_RELEASEDIR : FUSE_RELEASE);
-                /* Hold vfsmount and dentry until release is finished */
+        ff = file->private_data;
-                req->misc.release.vfsmount = mntget(file->f_path.mnt);
+        if (unlikely(!ff))
-                req->misc.release.dentry = dget(file->f_path.dentry);
+                return;
-                spin_lock(&fc->lock);
+        req = ff->reserved_req;
-                list_del(&ff->write_entry);
+        fuse_prepare_release(ff, file->f_flags, opcode);
-                if (!RB_EMPTY_NODE(&ff->polled_node))
-                        rb_erase(&ff->polled_node, &fc->polled_files);
-                spin_unlock(&fc->lock);
-                wake_up_interruptible_sync(&ff->poll_wait);
+        /* Hold vfsmount and dentry until release is finished */
-                /*
+        path_get(&file->f_path);
-                 * Normally this will send the RELEASE request,
+        req->misc.release.path = file->f_path;
-                 * however if some asynchronous READ or WRITE requests
-                 * are outstanding, the sending will be delayed
-                 */
-                fuse_file_put(ff);
-        }
-        /* Return value is ignored by VFS */
+        /*
-        return 0;
+         * Normally this will send the RELEASE request, however if
+         * some asynchronous READ or WRITE requests are outstanding,
+         * the sending will be delayed.
+         */
+        fuse_file_put(ff);
 }
 static int fuse_open(struct inode *inode, struct file *file)
 {
-        return fuse_open_common(inode, file, 0);
+        return fuse_open_common(inode, file, false);
 }
 static int fuse_release(struct inode *inode, struct file *file)
 {
-        return fuse_release_common(inode, file, 0);
+        fuse_release_common(file, FUSE_RELEASE);
+        /* the VFS ignores what ->release() returns */
+        return 0;
+}
+/*
+ * Release @ff synchronously: prepare the reserved request as a
+ * FUSE_RELEASE carrying @flags, mark it forced, send it with
+ * fuse_request_send() instead of queueing it in the background, then
+ * drop the request and free @ff.  Warns if @ff holds more than one
+ * reference.
+ */
+void fuse_sync_release(struct fuse_file *ff, int flags)
+{
+        struct fuse_conn *fc = ff->fc;
+        struct fuse_req *req = ff->reserved_req;
+        WARN_ON(atomic_read(&ff->count) > 1);
+        fuse_prepare_release(ff, flags, FUSE_RELEASE);
+        req->force = 1;
+        fuse_request_send(fc, req);
+        fuse_put_request(fc, req);
+        kfree(ff);
 }
+EXPORT_SYMBOL_GPL(fuse_sync_release);
 /*
 * Scramble the ID space with XTEA, so that the value of the files_struct
@@ -371,8 +408,8 @@ static int fuse_fsync(struct file *file, struct dentry *de, int datasync)
        return fuse_fsync_common(file, de, datasync, 0);
 }
-void fuse_read_fill(struct fuse_req *req, struct file *file,
+void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
-                    struct inode *inode, loff_t pos, size_t count, int opcode)
+                    size_t count, int opcode)
 {
        struct fuse_read_in *inarg = &req->misc.read.in;
        struct fuse_file *ff = file->private_data;
@@ -382,7 +419,7 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
        inarg->size = count;
        inarg->flags = file->f_flags;
        req->in.h.opcode = opcode;
-        req->in.h.nodeid = get_node_id(inode);
+        req->in.h.nodeid = ff->nodeid;
        req->in.numargs = 1;
        req->in.args[0].size = sizeof(struct fuse_read_in);
        req->in.args[0].value = inarg;
@@ -392,12 +429,12 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
 }
 static size_t fuse_send_read(struct fuse_req *req, struct file *file,
-                             struct inode *inode, loff_t pos, size_t count,
+                             loff_t pos, size_t count, fl_owner_t owner)
-                             fl_owner_t owner)
 {
-        struct fuse_conn *fc = get_fuse_conn(inode);
+        struct fuse_file *ff = file->private_data;
+        struct fuse_conn *fc = ff->fc;
-        fuse_read_fill(req, file, inode, pos, count, FUSE_READ);
+        fuse_read_fill(req, file, pos, count, FUSE_READ);
        if (owner != NULL) {
                struct fuse_read_in *inarg = &req->misc.read.in;
@@ -455,7 +492,7 @@ static int fuse_readpage(struct file *file, struct page *page)
        req->out.argpages = 1;
        req->num_pages = 1;
        req->pages[0] = page;
-        num_read = fuse_send_read(req, file, inode, pos, count, NULL);
+        num_read = fuse_send_read(req, file, pos, count, NULL);
        err = req->out.h.error;
        fuse_put_request(fc, req);
@@ -504,19 +541,18 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
                fuse_file_put(req->ff);
 }
-static void fuse_send_readpages(struct fuse_req *req, struct file *file,
+static void fuse_send_readpages(struct fuse_req *req, struct file *file)
-                                struct inode *inode)
 {
-        struct fuse_conn *fc = get_fuse_conn(inode);
+        struct fuse_file *ff = file->private_data;
+        struct fuse_conn *fc = ff->fc;
        loff_t pos = page_offset(req->pages[0]);
        size_t count = req->num_pages << PAGE_CACHE_SHIFT;
        req->out.argpages = 1;
        req->out.page_zeroing = 1;
-        fuse_read_fill(req, file, inode, pos, count, FUSE_READ);
+        fuse_read_fill(req, file, pos, count, FUSE_READ);
        req->misc.read.attr_ver = fuse_get_attr_version(fc);
        if (fc->async_read) {
-                struct fuse_file *ff = file->private_data;
                req->ff = fuse_file_get(ff);
                req->end = fuse_readpages_end;
                fuse_request_send_background(fc, req);
@@ -546,7 +582,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
            (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
             (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
             req->pages[req->num_pages - 1]->index + 1 != page->index)) {
-                fuse_send_readpages(req, data->file, inode);
+                fuse_send_readpages(req, data->file);
                data->req = req = fuse_get_req(fc);
                if (IS_ERR(req)) {
                        unlock_page(page);
@@ -580,7 +616,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
        err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
        if (!err) {
                if (data.req->num_pages)
-                        fuse_send_readpages(data.req, file, inode);
+                        fuse_send_readpages(data.req, file);
                else
                        fuse_put_request(fc, data.req);
        }
@@ -607,24 +643,19 @@ static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
        return generic_file_aio_read(iocb, iov, nr_segs, pos);
 }
-static void fuse_write_fill(struct fuse_req *req, struct file *file,
+static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,
-                            struct fuse_file *ff, struct inode *inode,
+                            loff_t pos, size_t count)
-                            loff_t pos, size_t count, int writepage)
 {
-        struct fuse_conn *fc = get_fuse_conn(inode);
        struct fuse_write_in *inarg = &req->misc.write.in;
        struct fuse_write_out *outarg = &req->misc.write.out;
-        memset(inarg, 0, sizeof(struct fuse_write_in));
        inarg->fh = ff->fh;
        inarg->offset = pos;
        inarg->size = count;
-        inarg->write_flags = writepage ? FUSE_WRITE_CACHE : 0;
-        inarg->flags = file ? file->f_flags : 0;
        req->in.h.opcode = FUSE_WRITE;
-        req->in.h.nodeid = get_node_id(inode);
+        req->in.h.nodeid = ff->nodeid;
        req->in.numargs = 2;
-        if (fc->minor < 9)
+        if (ff->fc->minor < 9)
                req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
        else
                req->in.args[0].size = sizeof(struct fuse_write_in);
@@ -636,13 +667,15 @@ static void fuse_write_fill(struct fuse_req *req, struct file *file,
 }
 static size_t fuse_send_write(struct fuse_req *req, struct file *file,
-                              struct inode *inode, loff_t pos, size_t count,
+                              loff_t pos, size_t count, fl_owner_t owner)
-                              fl_owner_t owner)
 {
-        struct fuse_conn *fc = get_fuse_conn(inode);
+        struct fuse_file *ff = file->private_data;
-        fuse_write_fill(req, file, file->private_data, inode, pos, count, 0);
+        struct fuse_conn *fc = ff->fc;
+        struct fuse_write_in *inarg = &req->misc.write.in;
+        fuse_write_fill(req, ff, pos, count);
+        inarg->flags = file->f_flags;
        if (owner != NULL) {
-                struct fuse_write_in *inarg = &req->misc.write.in;
                inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
                inarg->lock_owner = fuse_lock_owner_id(fc, owner);
        }
@@ -700,7 +733,7 @@ static int fuse_buffered_write(struct file *file, struct inode *inode,
        req->num_pages = 1;
        req->pages[0] = page;
        req->page_offset = offset;
-        nres = fuse_send_write(req, file, inode, pos, count, NULL);
+        nres = fuse_send_write(req, file, pos, count, NULL);
        err = req->out.h.error;
        fuse_put_request(fc, req);
        if (!err && !nres)
@@ -741,7 +774,7 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
        for (i = 0; i < req->num_pages; i++)
                fuse_wait_on_page_writeback(inode, req->pages[i]->index);
-        res = fuse_send_write(req, file, inode, pos, count, NULL);
+        res = fuse_send_write(req, file, pos, count, NULL);
        offset = req->page_offset;
        count = res;
@@ -979,25 +1012,23 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
        return 0;
 }
-static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
+ssize_t fuse_direct_io(struct file *file, const char __user *buf,
-                              size_t count, loff_t *ppos, int write)
+                       size_t count, loff_t *ppos, int write)
 {
-        struct inode *inode = file->f_path.dentry->d_inode;
+        struct fuse_file *ff = file->private_data;
-        struct fuse_conn *fc = get_fuse_conn(inode);
+        struct fuse_conn *fc = ff->fc;
        size_t nmax = write ? fc->max_write : fc->max_read;
        loff_t pos = *ppos;
        ssize_t res = 0;
        struct fuse_req *req;
-        if (is_bad_inode(inode))
-                return -EIO;
        req = fuse_get_req(fc);
        if (IS_ERR(req))
                return PTR_ERR(req);
        while (count) {
                size_t nres;
+                fl_owner_t owner = current->files;
                size_t nbytes = min(count, nmax);
                int err = fuse_get_user_pages(req, buf, &nbytes, write);
                if (err) {
@@ -1006,11 +1037,10 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
                }
                if (write)
-                        nres = fuse_send_write(req, file, inode, pos, nbytes,
+                        nres = fuse_send_write(req, file, pos, nbytes, owner);
-                                               current->files);
                else
-                        nres = fuse_send_read(req, file, inode, pos, nbytes,
+                        nres = fuse_send_read(req, file, pos, nbytes, owner);
-                                              current->files);
                fuse_release_user_pages(req, !write);
                if (req->out.h.error) {
                        if (!res)
@@ -1034,20 +1064,27 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
                }
        }
        fuse_put_request(fc, req);
-        if (res > 0) {
+        if (res > 0)
-                if (write)
-                        fuse_write_update_size(inode, pos);
                *ppos = pos;
-        }
-        fuse_invalidate_attr(inode);
        return res;
 }
+EXPORT_SYMBOL_GPL(fuse_direct_io);
 static ssize_t fuse_direct_read(struct file *file, char __user *buf,
                                      size_t count, loff_t *ppos)
 {
-        return fuse_direct_io(file, buf, count, ppos, 0);
+        struct inode *inode = file->f_path.dentry->d_inode;
+        ssize_t nread;
+        /* fuse_direct_io() does not look at the inode; reject bad ones here */
+        if (is_bad_inode(inode))
+                return -EIO;
+        nread = fuse_direct_io(file, buf, count, ppos, 0);
+        /* attributes are invalidated whether or not the read succeeded */
+        fuse_invalidate_attr(inode);
+        return nread;
 }
 static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
@@ -1055,12 +1092,22 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
 {
        struct inode *inode = file->f_path.dentry->d_inode;
        ssize_t res;
+        if (is_bad_inode(inode))
+                return -EIO;
        /* Don't allow parallel writes to the same file */
        mutex_lock(&inode->i_mutex);
        res = generic_write_checks(file, ppos, &count, 0);
-        if (!res)
+        if (!res) {
                res = fuse_direct_io(file, buf, count, ppos, 1);
+                if (res > 0)
+                        fuse_write_update_size(inode, *ppos);
+        }
        mutex_unlock(&inode->i_mutex);
+        fuse_invalidate_attr(inode);
        return res;
 }
@@ -1177,9 +1224,10 @@ static int fuse_writepage_locked(struct page *page)
        req->ff = fuse_file_get(ff);
        spin_unlock(&fc->lock);
-        fuse_write_fill(req, NULL, ff, inode, page_offset(page), 0, 1);
+        fuse_write_fill(req, ff, page_offset(page), 0);
        copy_highpage(tmp_page, page);
+        req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
        req->in.argpages = 1;
        req->num_pages = 1;
        req->pages[0] = tmp_page;
@@ -1603,12 +1651,11 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
 * limits ioctl data transfers to well-formed ioctls and is the forced
 * behavior for all FUSE servers.
 */
-static long fuse_file_do_ioctl(struct file *file, unsigned int cmd,
+long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
-                               unsigned long arg, unsigned int flags)
+                   unsigned int flags)
 {
-        struct inode *inode = file->f_dentry->d_inode;
        struct fuse_file *ff = file->private_data;
-        struct fuse_conn *fc = get_fuse_conn(inode);
+        struct fuse_conn *fc = ff->fc;
        struct fuse_ioctl_in inarg = {
                .fh = ff->fh,
                .cmd = cmd,
@@ -1627,13 +1674,6 @@ static long fuse_file_do_ioctl(struct file *file, unsigned int cmd,
        /* assume all the iovs returned by client always fits in a page */
        BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
-        if (!fuse_allow_task(fc, current))
-                return -EACCES;
-        err = -EIO;
-        if (is_bad_inode(inode))
-                goto out;
        err = -ENOMEM;
        pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL);
        iov_page = alloc_page(GFP_KERNEL);
@@ -1694,7 +1734,7 @@ static long fuse_file_do_ioctl(struct file *file, unsigned int cmd,
        /* okay, let's send it to the client */
        req->in.h.opcode = FUSE_IOCTL;
-        req->in.h.nodeid = get_node_id(inode);
+        req->in.h.nodeid = ff->nodeid;
        req->in.numargs = 1;
        req->in.args[0].size = sizeof(inarg);
        req->in.args[0].value = &inarg;
@@ -1777,17 +1817,33 @@ static long fuse_file_do_ioctl(struct file *file, unsigned int cmd,
        return err ? err : outarg.result;
 }
+EXPORT_SYMBOL_GPL(fuse_do_ioctl);
+/*
+ * Inode-aware front end shared by the native and compat ioctl handlers.
+ * Returns -EACCES when fuse_allow_task() refuses the current task and
+ * -EIO for a bad inode; otherwise hands the call to fuse_do_ioctl().
+ */
+static long fuse_file_ioctl_common(struct file *file, unsigned int cmd,
+                                   unsigned long arg, unsigned int flags)
+{
+        struct inode *inode = file->f_dentry->d_inode;
+        /* permission check comes first, then inode sanity */
+        if (!fuse_allow_task(get_fuse_conn(inode), current))
+                return -EACCES;
+        if (is_bad_inode(inode))
+                return -EIO;
+        return fuse_do_ioctl(file, cmd, arg, flags);
+}
 static long fuse_file_ioctl(struct file *file, unsigned int cmd,
                            unsigned long arg)
 {
-        return fuse_file_do_ioctl(file, cmd, arg, 0);
+        return fuse_file_ioctl_common(file, cmd, arg, 0);
 }
 static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
                                   unsigned long arg)
 {
-        return fuse_file_do_ioctl(file, cmd, arg, FUSE_IOCTL_COMPAT);
+        return fuse_file_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
 }
 /*
@@ -1841,11 +1897,10 @@ static void fuse_register_polled_file(struct fuse_conn *fc,
        spin_unlock(&fc->lock);
 }
-static unsigned fuse_file_poll(struct file *file, poll_table *wait)
+unsigned fuse_file_poll(struct file *file, poll_table *wait)
 {
-        struct inode *inode = file->f_dentry->d_inode;
        struct fuse_file *ff = file->private_data;
-        struct fuse_conn *fc = get_fuse_conn(inode);
+        struct fuse_conn *fc = ff->fc;
        struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
        struct fuse_poll_out outarg;
        struct fuse_req *req;
@@ -1870,7 +1925,7 @@ static unsigned fuse_file_poll(struct file *file, poll_table *wait)
                return PTR_ERR(req);
        req->in.h.opcode = FUSE_POLL;
-        req->in.h.nodeid = get_node_id(inode);
+        req->in.h.nodeid = ff->nodeid;
        req->in.numargs = 1;
        req->in.args[0].size = sizeof(inarg);
        req->in.args[0].value = &inarg;
@@ -1889,6 +1944,7 @@ static unsigned fuse_file_poll(struct file *file, poll_table *wait)
        }
        return POLLERR;
 }
+EXPORT_SYMBOL_GPL(fuse_file_poll);
 /*
 * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 6fc5aedaa0d5..aaf2f9ff970e 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -97,8 +97,13 @@ struct fuse_inode {
        struct list_head writepages;
 };
+struct fuse_conn;
 /** FUSE specific file data */
 struct fuse_file {
+        /** Fuse connection for this file */
+        struct fuse_conn *fc;
        /** Request reserved for flush and release */
        struct fuse_req *reserved_req;
@@ -108,9 +113,15 @@ struct fuse_file {
        /** File handle used by userspace */
        u64 fh;
+        /** Node id of this file */
+        u64 nodeid;
        /** Refcount */
        atomic_t count;
+        /** FOPEN_* flags returned by open */
+        u32 open_flags;
        /** Entry on inode's write_files list */
        struct list_head write_entry;
@@ -185,8 +196,6 @@ enum fuse_req_state {
        FUSE_REQ_FINISHED
 };
-struct fuse_conn;
 /**
 * A request to the client
 */
@@ -248,11 +257,12 @@ struct fuse_req {
                struct fuse_forget_in forget_in;
                struct {
                        struct fuse_release_in in;
-                        struct vfsmount *vfsmount;
+                        struct path path;
-                        struct dentry *dentry;
                } release;
                struct fuse_init_in init_in;
                struct fuse_init_out init_out;
+                struct cuse_init_in cuse_init_in;
+                struct cuse_init_out cuse_init_out;
                struct {
                        struct fuse_read_in in;
                        u64 attr_ver;
@@ -386,6 +396,9 @@ struct fuse_conn {
        /** Filesystem supports NFS exporting.  Only set in INIT */
        unsigned export_support:1;
+        /** Set if bdi is valid */
+        unsigned bdi_initialized:1;
        /*
         * The following bitfields are only for optimization purposes
         * and hence races in setting them will not cause malfunction
@@ -515,25 +528,24 @@ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
 * Initialize READ or READDIR request
 */
 void fuse_read_fill(struct fuse_req *req, struct file *file,
-                    struct inode *inode, loff_t pos, size_t count, int opcode);
+                    loff_t pos, size_t count, int opcode);
 /**
 * Send OPEN or OPENDIR request
 */
-int fuse_open_common(struct inode *inode, struct file *file, int isdir);
+int fuse_open_common(struct inode *inode, struct file *file, bool isdir);
 struct fuse_file *fuse_file_alloc(struct fuse_conn *fc);
+struct fuse_file *fuse_file_get(struct fuse_file *ff);
 void fuse_file_free(struct fuse_file *ff);
-void fuse_finish_open(struct inode *inode, struct file *file,
+void fuse_finish_open(struct inode *inode, struct file *file);
-                      struct fuse_file *ff, struct fuse_open_out *outarg);
-/** Fill in ff->reserved_req with a RELEASE request */
+void fuse_sync_release(struct fuse_file *ff, int flags);
-void fuse_release_fill(struct fuse_file *ff, u64 nodeid, int flags, int opcode);
 /**
 * Send RELEASE or RELEASEDIR request
 */
-int fuse_release_common(struct inode *inode, struct file *file, int isdir);
+void fuse_release_common(struct file *file, int opcode);
 /**
 * Send FSYNC or FSYNCDIR request
@@ -652,10 +664,12 @@ void fuse_invalidate_entry_cache(struct dentry *entry);
 */
 struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
+void fuse_conn_kill(struct fuse_conn *fc);
 /**
 * Initialize fuse_conn
 */
-int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb);
+void fuse_conn_init(struct fuse_conn *fc);
 /**
 * Release reference to fuse_conn
@@ -694,4 +708,13 @@ void fuse_release_nowrite(struct inode *inode);
 u64 fuse_get_attr_version(struct fuse_conn *fc);
+int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
+                 bool isdir);
+ssize_t fuse_direct_io(struct file *file, const char __user *buf,
+                       size_t count, loff_t *ppos, int write);
+long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
+                   unsigned int flags);
+unsigned fuse_file_poll(struct file *file, poll_table *wait);
+int fuse_dev_release(struct inode *inode, struct file *file);
 #endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 91f7c85f1ffd..f0df55a52929 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -277,11 +277,14 @@ static void fuse_send_destroy(struct fuse_conn *fc)
        }
 }
-static void fuse_put_super(struct super_block *sb)
+static void fuse_bdi_destroy(struct fuse_conn *fc)
 {
-        struct fuse_conn *fc = get_fuse_conn_super(sb);
+        if (fc->bdi_initialized)
+                bdi_destroy(&fc->bdi);
+}
-        fuse_send_destroy(fc);
+void fuse_conn_kill(struct fuse_conn *fc)
+{
        spin_lock(&fc->lock);
        fc->connected = 0;
        fc->blocked = 0;
@@ -295,7 +298,16 @@ static void fuse_put_super(struct super_block *sb)
        list_del(&fc->entry);
        fuse_ctl_remove_conn(fc);
        mutex_unlock(&fuse_mutex);
-        bdi_destroy(&fc->bdi);
+        fuse_bdi_destroy(fc);
+}
+EXPORT_SYMBOL_GPL(fuse_conn_kill);
+static void fuse_put_super(struct super_block *sb)
+{
+        struct fuse_conn *fc = get_fuse_conn_super(sb);
+        fuse_send_destroy(fc);
+        fuse_conn_kill(fc);
        fuse_conn_put(fc);
 }
@@ -466,10 +478,8 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
        return 0;
 }
-int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb)
+void fuse_conn_init(struct fuse_conn *fc)
 {
-        int err;
        memset(fc, 0, sizeof(*fc));
        spin_lock_init(&fc->lock);
        mutex_init(&fc->inst_mutex);
@@ -484,49 +494,12 @@ int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb)
        INIT_LIST_HEAD(&fc->bg_queue);
        INIT_LIST_HEAD(&fc->entry);
        atomic_set(&fc->num_waiting, 0);
-        fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
-        fc->bdi.unplug_io_fn = default_unplug_io_fn;
-        /* fuse does it's own writeback accounting */
-        fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
        fc->khctr = 0;
        fc->polled_files = RB_ROOT;
-        fc->dev = sb->s_dev;
-        err = bdi_init(&fc->bdi);
-        if (err)
-                goto error_mutex_destroy;
-        if (sb->s_bdev) {
-                err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
-                                   MAJOR(fc->dev), MINOR(fc->dev));
-        } else {
-                err = bdi_register_dev(&fc->bdi, fc->dev);
-        }
-        if (err)
-                goto error_bdi_destroy;
-        /*
-         * For a single fuse filesystem use max 1% of dirty +
-         * writeback threshold.
-         *
-         * This gives about 1M of write buffer for memory maps on a
-         * machine with 1G and 10% dirty_ratio, which should be more
-         * than enough.
-         *
-         * Privileged users can raise it by writing to
-         *
-         *    /sys/class/bdi/<bdi>/max_ratio
-         */
-        bdi_set_max_ratio(&fc->bdi, 1);
        fc->reqctr = 0;
        fc->blocked = 1;
        fc->attr_version = 1;
        get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
-        return 0;
- error_bdi_destroy:
-        bdi_destroy(&fc->bdi);
- error_mutex_destroy:
-        mutex_destroy(&fc->inst_mutex);
-        return err;
 }
 EXPORT_SYMBOL_GPL(fuse_conn_init);
@@ -539,12 +512,14 @@ void fuse_conn_put(struct fuse_conn *fc)
                fc->release(fc);
        }
 }
+EXPORT_SYMBOL_GPL(fuse_conn_put);
 struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
 {
        atomic_inc(&fc->count);
        return fc;
 }
+EXPORT_SYMBOL_GPL(fuse_conn_get);
 static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
 {
@@ -797,6 +772,48 @@ static void fuse_free_conn(struct fuse_conn *fc)
        kfree(fc);
 }
+static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
+{
+        int err;
+        fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
+        fc->bdi.unplug_io_fn = default_unplug_io_fn;
+        /* fuse does its own writeback accounting */
+        fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
+        err = bdi_init(&fc->bdi);
+        if (err)
+                return err;
+        fc->bdi_initialized = 1;
+        if (sb->s_bdev) {
+                err =  bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
+                                    MAJOR(fc->dev), MINOR(fc->dev));
+        } else {
+                err = bdi_register_dev(&fc->bdi, fc->dev);
+        }
+        if (err)
+                return err;
+        /*
+         * For a single fuse filesystem use max 1% of dirty +
+         * writeback threshold.
+         *
+         * This gives about 1M of write buffer for memory maps on a
+         * machine with 1G and 10% dirty_ratio, which should be more
+         * than enough.
+         *
+         * Privileged users can raise it by writing to
+         *
+         *    /sys/class/bdi/<bdi>/max_ratio
+         */
+        bdi_set_max_ratio(&fc->bdi, 1);
+        return 0;
+}
 static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 {
        struct fuse_conn *fc;
@@ -843,11 +860,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
        if (!fc)
                goto err_fput;
-        err = fuse_conn_init(fc, sb);
+        fuse_conn_init(fc);
-        if (err) {
-                kfree(fc);
+        fc->dev = sb->s_dev;
-                goto err_fput;
+        err = fuse_bdi_init(fc, sb);
-        }
+        if (err)
+                goto err_put_conn;
        fc->release = fuse_free_conn;
        fc->flags = d.flags;
@@ -911,7 +929,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 err_put_root:
        dput(root_dentry);
 err_put_conn:
-        bdi_destroy(&fc->bdi);
+        fuse_bdi_destroy(fc);
        fuse_conn_put(fc);
 err_fput:
        fput(file);
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 3a981b7f64ca..cad957cdb1e5 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -7,6 +7,7 @@ config GFS2_FS
        select IP_SCTP if DLM_SCTP
        select FS_POSIX_ACL
        select CRC32
+        select SLOW_WORK
        help
          A cluster filesystem.
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index a851ea4bdf70..3da2f1f4f738 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,8 +1,9 @@
+EXTRA_CFLAGS := -I$(src)
 obj-$(CONFIG_GFS2_FS) += gfs2.o
 gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \
        glops.o inode.o log.o lops.o main.o meta_io.o \
-        mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
+        aops.o dentry.o export.o file.o \
-        ops_fstype.o ops_inode.o ops_super.o quota.o \
+        ops_fstype.o ops_inode.o quota.o \
        recovery.o rgrp.o super.o sys.o trans.o util.o
 gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/aops.c
index a6dde1751e17..03ebb439ace0 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/aops.c
@@ -28,7 +28,6 @@
 #include "inode.h"
 #include "log.h"
 #include "meta_io.h"
-#include "ops_address.h"
 #include "quota.h"
 #include "trans.h"
 #include "rgrp.h"
@@ -781,10 +780,12 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
        unlock_page(page);
        page_cache_release(page);
-        if (inode->i_size < to) {
+        if (copied) {
-                i_size_write(inode, to);
+                if (inode->i_size < to) {
-                ip->i_disksize = inode->i_size;
+                        i_size_write(inode, to);
-                di->di_size = cpu_to_be64(inode->i_size);
+                        ip->i_disksize = inode->i_size;
+                }
+                gfs2_dinode_out(ip, di);
                mark_inode_dirty(inode);
        }
@@ -824,7 +825,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
        struct gfs2_sbd *sdp = GFS2_SB(inode);
        struct buffer_head *dibh;
        struct gfs2_alloc *al = ip->i_alloc;
-        struct gfs2_dinode *di;
        unsigned int from = pos & (PAGE_CACHE_SIZE - 1);
        unsigned int to = from + len;
        int ret;
@@ -847,11 +847,10 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
                gfs2_page_add_databufs(ip, page, from, to);
        ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+        if (ret > 0) {
-        if (likely(ret >= 0) && (inode->i_size > ip->i_disksize)) {
+                if (inode->i_size > ip->i_disksize)
-                di = (struct gfs2_dinode *)dibh->b_data;
+                        ip->i_disksize = inode->i_size;
-                ip->i_disksize = inode->i_size;
+                gfs2_dinode_out(ip, dibh->b_data);
-                di->di_size = cpu_to_be64(inode->i_size);
                mark_inode_dirty(inode);
        }
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 3a5d3f883e10..6d47379e794b 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -25,7 +25,7 @@
 #include "trans.h"
 #include "dir.h"
 #include "util.h"
-#include "ops_address.h"
+#include "trace_gfs2.h"
 /* This doesn't need to be that large as max 64 bit pointers in a 4k
 * block is 512, so __u16 is fine for that. It saves stack space to
@@ -136,7 +136,9 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
                   and write it out to disk */
                unsigned int n = 1;
-                block = gfs2_alloc_block(ip, &n);
+                error = gfs2_alloc_block(ip, &block, &n);
+                if (error)
+                        goto out_brelse;
                if (isdir) {
                        gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
                        error = gfs2_dir_get_new_buffer(ip, block, &bh);
@@ -476,8 +478,11 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
        blks = dblks + iblks;
        i = sheight;
        do {
+                int error;
                n = blks - alloced;
-                bn = gfs2_alloc_block(ip, &n);
+                error = gfs2_alloc_block(ip, &bn, &n);
+                if (error)
+                        return error;
                alloced += n;
                if (state != ALLOC_DATA || gfs2_is_jdata(ip))
                        gfs2_trans_add_unrevoke(sdp, bn, n);
@@ -585,6 +590,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
        clear_buffer_mapped(bh_map);
        clear_buffer_new(bh_map);
        clear_buffer_boundary(bh_map);
+        trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
        if (gfs2_is_dir(ip)) {
                bsize = sdp->sd_jbsize;
                arr = sdp->sd_jheightsize;
@@ -619,6 +625,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
        ret = 0;
 out:
        release_metapath(&mp);
+        trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
        bmap_unlock(ip, create);
        return ret;
@@ -1008,7 +1015,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping)
                gfs2_trans_add_bh(ip->i_gl, bh, 0);
        zero_user(page, offset, length);
+        mark_buffer_dirty(bh);
 unlock:
        unlock_page(page);
        page_cache_release(page);
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/dentry.c
index 022c66cd5606..022c66cd5606 100644
--- a/fs/gfs2/ops_dentry.c
+++ b/fs/gfs2/dentry.c
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index aef4d0c06748..297d7e5cebad 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -803,13 +803,20 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
 {
        struct gfs2_inode *ip = GFS2_I(inode);
        unsigned int n = 1;
-        u64 bn = gfs2_alloc_block(ip, &n);
+        u64 bn;
-        struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn);
+        int error;
+        struct buffer_head *bh;
        struct gfs2_leaf *leaf;
        struct gfs2_dirent *dent;
        struct qstr name = { .name = "", .len = 0, .hash = 0 };
+        error = gfs2_alloc_block(ip, &bn, &n);
+        if (error)
+                return NULL;
+        bh = gfs2_meta_new(ip->i_gl, bn);
        if (!bh)
                return NULL;
        gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1);
        gfs2_trans_add_bh(ip->i_gl, bh, 1);
        gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 899763aed217..07ea9529adda 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -582,8 +582,11 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
        struct gfs2_ea_header *ea;
        unsigned int n = 1;
        u64 block;
+        int error;
-        block = gfs2_alloc_block(ip, &n);
+        error = gfs2_alloc_block(ip, &block, &n);
+        if (error)
+                return error;
        gfs2_trans_add_unrevoke(sdp, block, 1);
        *bhp = gfs2_meta_new(ip->i_gl, block);
        gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
@@ -617,6 +620,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
                    struct gfs2_ea_request *er)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+        int error;
        ea->ea_data_len = cpu_to_be32(er->er_data_len);
        ea->ea_name_len = er->er_name_len;
@@ -642,7 +646,9 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
                        int mh_size = sizeof(struct gfs2_meta_header);
                        unsigned int n = 1;
-                        block = gfs2_alloc_block(ip, &n);
+                        error = gfs2_alloc_block(ip, &block, &n);
+                        if (error)
+                                return error;
                        gfs2_trans_add_unrevoke(sdp, block, 1);
                        bh = gfs2_meta_new(ip->i_gl, block);
                        gfs2_trans_add_bh(ip->i_gl, bh, 1);
@@ -963,7 +969,9 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
        } else {
                u64 blk;
                unsigned int n = 1;
-                blk = gfs2_alloc_block(ip, &n);
+                error = gfs2_alloc_block(ip, &blk, &n);
+                if (error)
+                        return error;
                gfs2_trans_add_unrevoke(sdp, blk, 1);
                indbh = gfs2_meta_new(ip->i_gl, blk);
                gfs2_trans_add_bh(ip->i_gl, indbh, 1);
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/export.c
index 9200ef221716..9200ef221716 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/export.c
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/file.c
index 5d82e91887e3..73318a3ce6f1 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/file.c
@@ -39,7 +39,6 @@
 #include "trans.h"
 #include "util.h"
 #include "eaops.h"
-#include "ops_address.h"
 /**
 * gfs2_llseek - seek to a location in a file
@@ -425,33 +424,36 @@ static struct vm_operations_struct gfs2_vm_ops = {
        .page_mkwrite = gfs2_page_mkwrite,
 };
 /**
 * gfs2_mmap -
 * @file: The file to map
 * @vma: The VMA which described the mapping
 *
- * Returns: 0 or error code
+ * There is no need to get a lock here unless we should be updating
+ * atime. We ignore any locking errors since the only consequence is
+ * a missed atime update (which will just be deferred until later).
+ *
+ * Returns: 0
 */
 static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
 {
        struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
-        struct gfs2_holder i_gh;
-        int error;
-        gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
+        if (!(file->f_flags & O_NOATIME)) {
-        error = gfs2_glock_nq(&i_gh);
+                struct gfs2_holder i_gh;
-        if (error) {
+                int error;
-                gfs2_holder_uninit(&i_gh);
-                return error;
-        }
+                gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+                error = gfs2_glock_nq(&i_gh);
+                file_accessed(file);
+                if (error == 0)
+                        gfs2_glock_dq_uninit(&i_gh);
+        }
        vma->vm_ops = &gfs2_vm_ops;
+        vma->vm_flags |= VM_CAN_NONLINEAR;
-        gfs2_glock_dq_uninit(&i_gh);
+        return 0;
-        return error;
 }
 /**
@@ -692,12 +694,10 @@ static void do_unflock(struct file *file, struct file_lock *fl)
 static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
 {
-        struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
        if (!(fl->fl_flags & FL_FLOCK))
                return -ENOLCK;
-        if (__mandatory_lock(&ip->i_inode))
+        if (fl->fl_type & LOCK_MAND)
-                return -ENOLCK;
+                return -EOPNOTSUPP;
        if (fl->fl_type == F_UNLCK) {
                do_unflock(file, fl);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index ff4981090489..297421c0427a 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -39,6 +39,8 @@
 #include "super.h"
 #include "util.h"
 #include "bmap.h"
+#define CREATE_TRACE_POINTS
+#include "trace_gfs2.h"
 struct gfs2_gl_hash_bucket {
        struct hlist_head hb_list;
@@ -155,7 +157,7 @@ static void glock_free(struct gfs2_glock *gl)
        if (aspace)
                gfs2_aspace_put(aspace);
+        trace_gfs2_glock_put(gl);
        sdp->sd_lockstruct.ls_ops->lm_put_lock(gfs2_glock_cachep, gl);
 }
@@ -317,14 +319,17 @@ restart:
                                                return 2;
                                        gh->gh_error = ret;
                                        list_del_init(&gh->gh_list);
+                                        trace_gfs2_glock_queue(gh, 0);
                                        gfs2_holder_wake(gh);
                                        goto restart;
                                }
                                set_bit(HIF_HOLDER, &gh->gh_iflags);
+                                trace_gfs2_promote(gh, 1);
                                gfs2_holder_wake(gh);
                                goto restart;
                        }
                        set_bit(HIF_HOLDER, &gh->gh_iflags);
+                        trace_gfs2_promote(gh, 0);
                        gfs2_holder_wake(gh);
                        continue;
                }
@@ -354,6 +359,7 @@ static inline void do_error(struct gfs2_glock *gl, const int ret)
                else
                        continue;
                list_del_init(&gh->gh_list);
+                trace_gfs2_glock_queue(gh, 0);
                gfs2_holder_wake(gh);
        }
 }
@@ -422,6 +428,7 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
        int rv;
        spin_lock(&gl->gl_spin);
+        trace_gfs2_glock_state_change(gl, state);
        state_change(gl, state);
        gh = find_first_waiter(gl);
@@ -796,22 +803,37 @@ void gfs2_holder_uninit(struct gfs2_holder *gh)
        gh->gh_ip = 0;
 }
-static int just_schedule(void *word)
+/**
+ * gfs2_glock_holder_wait - wait_on_bit() action used when waiting on a holder
+ * @word: unused
+ *
+ * This function and gfs2_glock_demote_wait both show up in the WCHAN
+ * field. They are kept as separate, otherwise identical, functions so
+ * that the WCHAN value is more informative to the user.
+ */
+static int gfs2_glock_holder_wait(void *word)
 {
        schedule();
        return 0;
 }
+static int gfs2_glock_demote_wait(void *word)
+{
+        schedule();
+        return 0;
+}
 static void wait_on_holder(struct gfs2_holder *gh)
 {
        might_sleep();
-        wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE);
+        wait_on_bit(&gh->gh_iflags, HIF_WAIT, gfs2_glock_holder_wait, TASK_UNINTERRUPTIBLE);
 }
 static void wait_on_demote(struct gfs2_glock *gl)
 {
        might_sleep();
-        wait_on_bit(&gl->gl_flags, GLF_DEMOTE, just_schedule, TASK_UNINTERRUPTIBLE);
+        wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, TASK_UNINTERRUPTIBLE);
 }
 /**
@@ -836,6 +858,7 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
                        gl->gl_demote_state != state) {
                gl->gl_demote_state = LM_ST_UNLOCKED;
        }
+        trace_gfs2_demote_rq(gl);
 }
 /**
@@ -921,6 +944,7 @@ fail:
                        goto do_cancel;
                return;
        }
+        trace_gfs2_glock_queue(gh, 1);
        list_add_tail(&gh->gh_list, insert_pt);
 do_cancel:
        gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
@@ -1017,6 +1041,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
                    !test_bit(GLF_DEMOTE, &gl->gl_flags))
                        fast_path = 1;
        }
+        trace_gfs2_glock_queue(gh, 0);
        spin_unlock(&gl->gl_spin);
        if (likely(fast_path))
                return;
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 70f87f43afa2..d5e4ab155ca0 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -310,24 +310,6 @@ static void rgrp_go_unlock(struct gfs2_holder *gh)
 }
 /**
- * rgrp_go_dump - print out an rgrp
- * @seq: The iterator
- * @gl: The glock in question
- *
- */
-static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
-{
-        const struct gfs2_rgrpd *rgd = gl->gl_object;
-        if (rgd == NULL)
-                return 0;
-        gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
-                       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
-                       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
-        return 0;
-}
-/**
 * trans_go_sync - promote/demote the transaction glock
 * @gl: the glock
 * @state: the requested state
@@ -410,7 +392,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
        .go_demote_ok = rgrp_go_demote_ok,
        .go_lock = rgrp_go_lock,
        .go_unlock = rgrp_go_unlock,
-        .go_dump = rgrp_go_dump,
+        .go_dump = gfs2_rgrp_dump,
        .go_type = LM_TYPE_RGRP,
        .go_min_hold_time = HZ / 5,
 };
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 399d1b978049..225347fbff3c 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -12,6 +12,7 @@
 #include <linux/fs.h>
 #include <linux/workqueue.h>
+#include <linux/slow-work.h>
 #include <linux/dlm.h>
 #include <linux/buffer_head.h>
@@ -63,9 +64,12 @@ struct gfs2_log_element {
        const struct gfs2_log_operations *le_ops;
 };
+#define GBF_FULL 1
 struct gfs2_bitmap {
        struct buffer_head *bi_bh;
        char *bi_clone;
+        unsigned long bi_flags;
        u32 bi_offset;
        u32 bi_start;
        u32 bi_len;
@@ -90,10 +94,11 @@ struct gfs2_rgrpd {
        struct gfs2_sbd *rd_sbd;
        unsigned int rd_bh_count;
        u32 rd_last_alloc;
-        unsigned char rd_flags;
+        u32 rd_flags;
-#define GFS2_RDF_CHECK        0x01      /* Need to check for unlinked inodes */
+#define GFS2_RDF_CHECK          0x10000000 /* check for unlinked inodes */
-#define GFS2_RDF_NOALLOC      0x02      /* rg prohibits allocation */
+#define GFS2_RDF_UPTODATE       0x20000000 /* rg is up to date */
-#define GFS2_RDF_UPTODATE     0x04      /* rg is up to date */
+#define GFS2_RDF_ERROR          0x40000000 /* error in rg */
+#define GFS2_RDF_MASK           0xf0000000 /* mask for internal flags */
 };
 enum gfs2_state_bits {
@@ -376,11 +381,11 @@ struct gfs2_journal_extent {
 struct gfs2_jdesc {
        struct list_head jd_list;
        struct list_head extent_list;
+        struct slow_work jd_work;
        struct inode *jd_inode;
+        unsigned long jd_flags;
+#define JDF_RECOVERY 1
        unsigned int jd_jid;
-        int jd_dirty;
        unsigned int jd_blocks;
 };
@@ -390,9 +395,6 @@ struct gfs2_statfs_change_host {
        s64 sc_dinodes;
 };
-#define GFS2_GLOCKD_DEFAULT     1
-#define GFS2_GLOCKD_MAX         16
 #define GFS2_QUOTA_DEFAULT      GFS2_QUOTA_OFF
 #define GFS2_QUOTA_OFF          0
 #define GFS2_QUOTA_ACCOUNT      1
@@ -418,6 +420,7 @@ struct gfs2_args {
        unsigned int ar_data:2;                 /* ordered/writeback */
        unsigned int ar_meta:1;                 /* mount metafs */
        unsigned int ar_discard:1;              /* discard requests */
+        int ar_commit;                          /* Commit interval */
 };
 struct gfs2_tune {
@@ -426,7 +429,6 @@ struct gfs2_tune {
        unsigned int gt_incore_log_blocks;
        unsigned int gt_log_flush_secs;
-        unsigned int gt_recoverd_secs;
        unsigned int gt_logd_secs;
        unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
@@ -447,6 +449,7 @@ enum {
        SDF_JOURNAL_LIVE        = 1,
        SDF_SHUTDOWN            = 2,
        SDF_NOBARRIERS          = 3,
+        SDF_NORECOVERY          = 4,
 };
 #define GFS2_FSNAME_LEN         256
@@ -493,7 +496,6 @@ struct lm_lockstruct {
        unsigned long ls_flags;
        dlm_lockspace_t *ls_dlm;
-        int ls_recover_jid;
        int ls_recover_jid_done;
        int ls_recover_jid_status;
 };
@@ -582,7 +584,6 @@ struct gfs2_sbd {
        /* Daemon stuff */
-        struct task_struct *sd_recoverd_process;
        struct task_struct *sd_logd_process;
        struct task_struct *sd_quotad_process;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 5a31d426116f..2f94bd723698 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -30,7 +30,6 @@
 #include "inode.h"
 #include "log.h"
 #include "meta_io.h"
-#include "ops_address.h"
 #include "quota.h"
 #include "rgrp.h"
 #include "trans.h"
@@ -1047,154 +1046,7 @@ fail:
        return ERR_PTR(error);
 }
-/**
+static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
- * gfs2_rmdiri - Remove a directory
- * @dip: The parent directory of the directory to be removed
- * @name: The name of the directory to be removed
- * @ip: The GFS2 inode of the directory to be removed
- *
- * Assumes Glocks on dip and ip are held
- *
- * Returns: errno
- */
-int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
-                struct gfs2_inode *ip)
-{
-        struct qstr dotname;
-        int error;
-        if (ip->i_entries != 2) {
-                if (gfs2_consist_inode(ip))
-                        gfs2_dinode_print(ip);
-                return -EIO;
-        }
-        error = gfs2_dir_del(dip, name);
-        if (error)
-                return error;
-        error = gfs2_change_nlink(dip, -1);
-        if (error)
-                return error;
-        gfs2_str2qstr(&dotname, ".");
-        error = gfs2_dir_del(ip, &dotname);
-        if (error)
-                return error;
-        gfs2_str2qstr(&dotname, "..");
-        error = gfs2_dir_del(ip, &dotname);
-        if (error)
-                return error;
-        /* It looks odd, but it really should be done twice */
-        error = gfs2_change_nlink(ip, -1);
-        if (error)
-                return error;
-        error = gfs2_change_nlink(ip, -1);
-        if (error)
-                return error;
-        return error;
-}
-/*
- * gfs2_unlink_ok - check to see that a inode is still in a directory
- * @dip: the directory
- * @name: the name of the file
- * @ip: the inode
- *
- * Assumes that the lock on (at least) @dip is held.
- *
- * Returns: 0 if the parent/child relationship is correct, errno if it isn't
- */
-int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
-                   const struct gfs2_inode *ip)
-{
-        int error;
-        if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
-                return -EPERM;
-        if ((dip->i_inode.i_mode & S_ISVTX) &&
-            dip->i_inode.i_uid != current_fsuid() &&
-            ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER))
-                return -EPERM;
-        if (IS_APPEND(&dip->i_inode))
-                return -EPERM;
-        error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
-        if (error)
-                return error;
-        error = gfs2_dir_check(&dip->i_inode, name, ip);
-        if (error)
-                return error;
-        return 0;
-}
-/**
- * gfs2_readlinki - return the contents of a symlink
- * @ip: the symlink's inode
- * @buf: a pointer to the buffer to be filled
- * @len: a pointer to the length of @buf
- *
- * If @buf is too small, a piece of memory is kmalloc()ed and needs
- * to be freed by the caller.
- *
- * Returns: errno
- */
-int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
-{
-        struct gfs2_holder i_gh;
-        struct buffer_head *dibh;
-        unsigned int x;
-        int error;
-        gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
-        error = gfs2_glock_nq(&i_gh);
-        if (error) {
-                gfs2_holder_uninit(&i_gh);
-                return error;
-        }
-        if (!ip->i_disksize) {
-                gfs2_consist_inode(ip);
-                error = -EIO;
-                goto out;
-        }
-        error = gfs2_meta_inode_buffer(ip, &dibh);
-        if (error)
-                goto out;
-        x = ip->i_disksize + 1;
-        if (x > *len) {
-                *buf = kmalloc(x, GFP_NOFS);
-                if (!*buf) {
-                        error = -ENOMEM;
-                        goto out_brelse;
-                }
-        }
-        memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
-        *len = x;
-out_brelse:
-        brelse(dibh);
-out:
-        gfs2_glock_dq_uninit(&i_gh);
-        return error;
-}
-static int
-__gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
 {
        struct buffer_head *dibh;
        int error;
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c30be2b66580..c341aaf67adb 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -11,8 +11,16 @@
 #define __INODE_DOT_H__
 #include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/mm.h>
 #include "util.h"
+extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
+extern int gfs2_internal_read(struct gfs2_inode *ip,
+                              struct file_ra_state *ra_state,
+                              char *buf, loff_t *pos, unsigned size);
+extern void gfs2_set_aops(struct inode *inode);
 static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
 {
        return !ip->i_height;
@@ -73,30 +81,26 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
 }
-void gfs2_set_iop(struct inode *inode);
+extern void gfs2_set_iop(struct inode *inode);
-struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 
+extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 
-                                u64 no_addr, u64 no_formal_ino,
+                                       u64 no_addr, u64 no_formal_ino,
-                                int skip_freeing);
+                                       int skip_freeing);
-struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
+extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
-int gfs2_inode_refresh(struct gfs2_inode *ip);
+extern int gfs2_inode_refresh(struct gfs2_inode *ip);
-int gfs2_dinode_dealloc(struct gfs2_inode *inode);
+extern int gfs2_dinode_dealloc(struct gfs2_inode *inode);
-int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
+extern int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
-struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
+extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
-                           int is_root);
+                                  int is_root);
-struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
+extern struct inode *gfs2_createi(struct gfs2_holder *ghs,
-                           unsigned int mode, dev_t dev);
+                                  const struct qstr *name,
-int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
+                                  unsigned int mode, dev_t dev);
-                struct gfs2_inode *ip);
+extern int gfs2_permission(struct inode *inode, int mask);
-int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
+extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
-                   const struct gfs2_inode *ip);
+extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
-int gfs2_permission(struct inode *inode, int mask);
+extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
-int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
+extern void gfs2_dinode_print(const struct gfs2_inode *ip);
-int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
-struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
-void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
-void gfs2_dinode_print(const struct gfs2_inode *ip);
 extern const struct inode_operations gfs2_file_iops;
 extern const struct inode_operations gfs2_dir_iops;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 98918a756410..13c6237c5f67 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -28,6 +28,7 @@
 #include "meta_io.h"
 #include "util.h"
 #include "dir.h"
+#include "trace_gfs2.h"
 #define PULL 1
@@ -120,7 +121,7 @@ __acquires(&sdp->sd_log_lock)
                        lock_buffer(bh);
                        if (test_clear_buffer_dirty(bh)) {
                                bh->b_end_io = end_buffer_write_sync;
-                                submit_bh(WRITE, bh);
+                                submit_bh(WRITE_SYNC_PLUG, bh);
                        } else {
                                unlock_buffer(bh);
                                brelse(bh);
@@ -313,6 +314,7 @@ int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
                gfs2_log_lock(sdp);
        }
        atomic_sub(blks, &sdp->sd_log_blks_free);
+        trace_gfs2_log_blocks(sdp, -blks);
        gfs2_log_unlock(sdp);
        mutex_unlock(&sdp->sd_log_reserve_mutex);
@@ -333,6 +335,7 @@ void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
        gfs2_log_lock(sdp);
        atomic_add(blks, &sdp->sd_log_blks_free);
+        trace_gfs2_log_blocks(sdp, blks);
        gfs2_assert_withdraw(sdp,
                             atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
        gfs2_log_unlock(sdp);
@@ -558,6 +561,7 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
        gfs2_log_lock(sdp);
        atomic_add(dist, &sdp->sd_log_blks_free);
+        trace_gfs2_log_blocks(sdp, dist);
        gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
        gfs2_log_unlock(sdp);
@@ -604,7 +608,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
        if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
                goto skip_barrier;
        get_bh(bh);
-        submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh);
+        submit_bh(WRITE_SYNC | (1 << BIO_RW_BARRIER) | (1 << BIO_RW_META), bh);
        wait_on_buffer(bh);
        if (buffer_eopnotsupp(bh)) {
                clear_buffer_eopnotsupp(bh);
@@ -664,7 +668,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
                lock_buffer(bh);
                if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
                        bh->b_end_io = end_buffer_write_sync;
-                        submit_bh(WRITE, bh);
+                        submit_bh(WRITE_SYNC_PLUG, bh);
                } else {
                        unlock_buffer(bh);
                        brelse(bh);
@@ -715,6 +719,7 @@ void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
                up_write(&sdp->sd_log_flush_lock);
                return;
        }
+        trace_gfs2_log_flush(sdp, 1);
        ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
        INIT_LIST_HEAD(&ai->ai_ail1_list);
@@ -746,6 +751,7 @@ void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
        else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){
                gfs2_log_lock(sdp);
                atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
+                trace_gfs2_log_blocks(sdp, -1);
                gfs2_log_unlock(sdp);
                log_write_header(sdp, 0, PULL);
        }
@@ -763,8 +769,7 @@ void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
                ai = NULL;
        }
        gfs2_log_unlock(sdp);
+        trace_gfs2_log_flush(sdp, 0);
-        sdp->sd_vfs->s_dirt = 0;
        up_write(&sdp->sd_log_flush_lock);
        kfree(ai);
@@ -788,6 +793,7 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
        gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved);
        unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved;
        atomic_add(unused, &sdp->sd_log_blks_free);
+        trace_gfs2_log_blocks(sdp, unused);
        gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
                             sdp->sd_jdesc->jd_blocks);
        sdp->sd_log_blks_reserved = reserved;
@@ -823,7 +829,6 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
        log_refund(sdp, tr);
        buf_lo_incore_commit(sdp, tr);
-        sdp->sd_vfs->s_dirt = 1;
        up_read(&sdp->sd_log_flush_lock);
        gfs2_log_lock(sdp);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 80e4f5f898bb..9969ff062c5b 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -13,6 +13,8 @@
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
+#include <linux/bio.h>
+#include <linux/fs.h>
 #include "gfs2.h"
 #include "incore.h"
@@ -25,6 +27,7 @@
 #include "rgrp.h"
 #include "trans.h"
 #include "util.h"
+#include "trace_gfs2.h"
 /**
 * gfs2_pin - Pin a buffer in memory
@@ -51,6 +54,7 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
        if (bd->bd_ail)
                list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
        get_bh(bh);
+        trace_gfs2_pin(bd, 1);
 }
 /**
@@ -87,6 +91,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
        bd->bd_ail = ai;
        list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
        clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
+        trace_gfs2_pin(bd, 0);
        gfs2_log_unlock(sdp);
        unlock_buffer(bh);
 }
@@ -189,7 +194,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
                }
                gfs2_log_unlock(sdp);
-                submit_bh(WRITE, bh);
+                submit_bh(WRITE_SYNC_PLUG, bh);
                gfs2_log_lock(sdp);
                n = 0;
@@ -199,7 +204,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
                        gfs2_log_unlock(sdp);
                        lock_buffer(bd2->bd_bh);
                        bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
-                        submit_bh(WRITE, bh);
+                        submit_bh(WRITE_SYNC_PLUG, bh);
                        gfs2_log_lock(sdp);
                        if (++n >= num)
                                break;
@@ -341,7 +346,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
                sdp->sd_log_num_revoke--;
                if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
-                        submit_bh(WRITE, bh);
+                        submit_bh(WRITE_SYNC_PLUG, bh);
                        bh = gfs2_log_get_buf(sdp);
                        mh = (struct gfs2_meta_header *)bh->b_data;
@@ -358,7 +363,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
        }
        gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
-        submit_bh(WRITE, bh);
+        submit_bh(WRITE_SYNC_PLUG, bh);
 }
 static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
@@ -560,7 +565,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
        ptr = bh_log_ptr(bh);
        
        get_bh(bh);
-        submit_bh(WRITE, bh);
+        submit_bh(WRITE_SYNC_PLUG, bh);
        gfs2_log_lock(sdp);
        while(!list_empty(list)) {
                bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list);
@@ -586,7 +591,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
                } else {
                        bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh);
                }
-                submit_bh(WRITE, bh1);
+                submit_bh(WRITE_SYNC_PLUG, bh1);
                gfs2_log_lock(sdp);
                ptr += 2;
        }
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index a6892ed0840a..eacd78a5d082 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -15,6 +15,7 @@
 #include <linux/init.h>
 #include <linux/gfs2_ondisk.h>
 #include <asm/atomic.h>
+#include <linux/slow-work.h>
 #include "gfs2.h"
 #include "incore.h"
@@ -113,12 +114,18 @@ static int __init init_gfs2_fs(void)
        if (error)
                goto fail_unregister;
+        error = slow_work_register_user();
+        if (error)
+                goto fail_slow;
        gfs2_register_debugfs();
        printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__);
        return 0;
+fail_slow:
+        unregister_filesystem(&gfs2meta_fs_type);
 fail_unregister:
        unregister_filesystem(&gfs2_fs_type);
 fail:
@@ -156,6 +163,7 @@ static void __exit exit_gfs2_fs(void)
        gfs2_unregister_debugfs();
        unregister_filesystem(&gfs2_fs_type);
        unregister_filesystem(&gfs2meta_fs_type);
+        slow_work_unregister_user();
        kmem_cache_destroy(gfs2_quotad_cachep);
        kmem_cache_destroy(gfs2_rgrpd_cachep);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 8d6f13256b26..cb8d7a93d5ec 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -31,19 +31,66 @@
 #include "rgrp.h"
 #include "trans.h"
 #include "util.h"
-#include "ops_address.h"
-static int aspace_get_block(struct inode *inode, sector_t lblock,
+static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc)
-                            struct buffer_head *bh_result, int create)
 {
-        gfs2_assert_warn(inode->i_sb->s_fs_info, 0);
+        int err;
-        return -EOPNOTSUPP;
+        struct buffer_head *bh, *head;
-}
+        int nr_underway = 0;
+        int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ?
+                        WRITE_SYNC_PLUG : WRITE));
+        BUG_ON(!PageLocked(page));
+        BUG_ON(!page_has_buffers(page));
+        head = page_buffers(page);
+        bh = head;
+        do {
+                if (!buffer_mapped(bh))
+                        continue;
+                /*
+                 * If it's a fully non-blocking write attempt and we cannot
+                 * lock the buffer then redirty the page.  Note that this can
+                 * potentially cause a busy-wait loop from pdflush and kswapd
+                 * activity, but those code paths have their own higher-level
+                 * throttling.
+                 */
+                if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
+                        lock_buffer(bh);
+                } else if (!trylock_buffer(bh)) {
+                        redirty_page_for_writepage(wbc, page);
+                        continue;
+                }
+                if (test_clear_buffer_dirty(bh)) {
+                        mark_buffer_async_write(bh);
+                } else {
+                        unlock_buffer(bh);
+                }
+        } while ((bh = bh->b_this_page) != head);
+        /*
+         * The page and its buffers are protected by PageWriteback(), so we can
+         * drop the bh refcounts early.
+         */
+        BUG_ON(PageWriteback(page));
+        set_page_writeback(page);
+        do {
+                struct buffer_head *next = bh->b_this_page;
+                if (buffer_async_write(bh)) {
+                        submit_bh(write_op, bh);
+                        nr_underway++;
+                }
+                bh = next;
+        } while (bh != head);
+        unlock_page(page);
-static int gfs2_aspace_writepage(struct page *page,
+        err = 0;
-                                 struct writeback_control *wbc)
+        if (nr_underway == 0)
-{
+                end_page_writeback(page);
-        return block_write_full_page(page, aspace_get_block, wbc);
+        return err;
 }
 static const struct address_space_operations aspace_aops = {
@@ -201,16 +248,32 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
 int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
                   struct buffer_head **bhp)
 {
-        *bhp = gfs2_getbuf(gl, blkno, CREATE);
+        struct gfs2_sbd *sdp = gl->gl_sbd;
-        if (!buffer_uptodate(*bhp)) {
+        struct buffer_head *bh;
-                ll_rw_block(READ_META, 1, bhp);
-                if (flags & DIO_WAIT) {
+        if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-                        int error = gfs2_meta_wait(gl->gl_sbd, *bhp);
+                return -EIO;
-                        if (error) {
-                                brelse(*bhp);
+        *bhp = bh = gfs2_getbuf(gl, blkno, CREATE);
-                                return error;
-                        }
+        lock_buffer(bh);
-                }
+        if (buffer_uptodate(bh)) {
+                unlock_buffer(bh);
+                return 0;
+        }
+        bh->b_end_io = end_buffer_read_sync;
+        get_bh(bh);
+        submit_bh(READ_SYNC | (1 << BIO_RW_META), bh);
+        if (!(flags & DIO_WAIT))
+                return 0;
+        wait_on_buffer(bh);
+        if (unlikely(!buffer_uptodate(bh))) {
+                struct gfs2_trans *tr = current->journal_info;
+                if (tr && tr->tr_touched)
+                        gfs2_io_error_bh(sdp, bh);
+                brelse(bh);
+                return -EIO;
        }
        return 0;
@@ -404,7 +467,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
        if (buffer_uptodate(first_bh))
                goto out;
        if (!buffer_locked(first_bh))
-                ll_rw_block(READ_META, 1, &first_bh);
+                ll_rw_block(READ_SYNC | (1 << BIO_RW_META), 1, &first_bh);
        dblock++;
        extlen--;
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
deleted file mode 100644
index f7e8527a21e0..000000000000
--- a/fs/gfs2/mount.c
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/gfs2_ondisk.h>
-#include <linux/parser.h>
-#include "gfs2.h"
-#include "incore.h"
-#include "super.h"
-#include "sys.h"
-#include "util.h"
-enum {
-        Opt_lockproto,
-        Opt_locktable,
-        Opt_hostdata,
-        Opt_spectator,
-        Opt_ignore_local_fs,
-        Opt_localflocks,
-        Opt_localcaching,
-        Opt_debug,
-        Opt_nodebug,
-        Opt_upgrade,
-        Opt_acl,
-        Opt_noacl,
-        Opt_quota_off,
-        Opt_quota_account,
-        Opt_quota_on,
-        Opt_quota,
-        Opt_noquota,
-        Opt_suiddir,
-        Opt_nosuiddir,
-        Opt_data_writeback,
-        Opt_data_ordered,
-        Opt_meta,
-        Opt_discard,
-        Opt_nodiscard,
-        Opt_err,
-};
-static const match_table_t tokens = {
-        {Opt_lockproto, "lockproto=%s"},
-        {Opt_locktable, "locktable=%s"},
-        {Opt_hostdata, "hostdata=%s"},
-        {Opt_spectator, "spectator"},
-        {Opt_ignore_local_fs, "ignore_local_fs"},
-        {Opt_localflocks, "localflocks"},
-        {Opt_localcaching, "localcaching"},
-        {Opt_debug, "debug"},
-        {Opt_nodebug, "nodebug"},
-        {Opt_upgrade, "upgrade"},
-        {Opt_acl, "acl"},
-        {Opt_noacl, "noacl"},
-        {Opt_quota_off, "quota=off"},
-        {Opt_quota_account, "quota=account"},
-        {Opt_quota_on, "quota=on"},
-        {Opt_quota, "quota"},
-        {Opt_noquota, "noquota"},
-        {Opt_suiddir, "suiddir"},
-        {Opt_nosuiddir, "nosuiddir"},
-        {Opt_data_writeback, "data=writeback"},
-        {Opt_data_ordered, "data=ordered"},
-        {Opt_meta, "meta"},
-        {Opt_discard, "discard"},
-        {Opt_nodiscard, "nodiscard"},
-        {Opt_err, NULL}
-};
-/**
- * gfs2_mount_args - Parse mount options
- * @sdp:
- * @data:
- *
- * Return: errno
- */
-int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
-{
-        char *o;
-        int token;
-        substring_t tmp[MAX_OPT_ARGS];
-        /* Split the options into tokens with the "," character and
-           process them */
-        while (1) {
-                o = strsep(&options, ",");
-                if (o == NULL)
-                        break;
-                if (*o == '\0')
-                        continue;
-                token = match_token(o, tokens, tmp);
-                switch (token) {
-                case Opt_lockproto:
-                        match_strlcpy(args->ar_lockproto, &tmp[0],
-                                      GFS2_LOCKNAME_LEN);
-                        break;
-                case Opt_locktable:
-                        match_strlcpy(args->ar_locktable, &tmp[0],
-                                      GFS2_LOCKNAME_LEN);
-                        break;
-                case Opt_hostdata:
-                        match_strlcpy(args->ar_hostdata, &tmp[0],
-                                      GFS2_LOCKNAME_LEN);
-                        break;
-                case Opt_spectator:
-                        args->ar_spectator = 1;
-                        break;
-                case Opt_ignore_local_fs:
-                        args->ar_ignore_local_fs = 1;
-                        break;
-                case Opt_localflocks:
-                        args->ar_localflocks = 1;
-                        break;
-                case Opt_localcaching:
-                        args->ar_localcaching = 1;
-                        break;
-                case Opt_debug:
-                        args->ar_debug = 1;
-                        break;
-                case Opt_nodebug:
-                        args->ar_debug = 0;
-                        break;
-                case Opt_upgrade:
-                        args->ar_upgrade = 1;
-                        break;
-                case Opt_acl:
-                        args->ar_posix_acl = 1;
-                        break;
-                case Opt_noacl:
-                        args->ar_posix_acl = 0;
-                        break;
-                case Opt_quota_off:
-                case Opt_noquota:
-                        args->ar_quota = GFS2_QUOTA_OFF;
-                        break;
-                case Opt_quota_account:
-                        args->ar_quota = GFS2_QUOTA_ACCOUNT;
-                        break;
-                case Opt_quota_on:
-                case Opt_quota:
-                        args->ar_quota = GFS2_QUOTA_ON;
-                        break;
-                case Opt_suiddir:
-                        args->ar_suiddir = 1;
-                        break;
-                case Opt_nosuiddir:
-                        args->ar_suiddir = 0;
-                        break;
-                case Opt_data_writeback:
-                        args->ar_data = GFS2_DATA_WRITEBACK;
-                        break;
-                case Opt_data_ordered:
-                        args->ar_data = GFS2_DATA_ORDERED;
-                        break;
-                case Opt_meta:
-                        args->ar_meta = 1;
-                        break;
-                case Opt_discard:
-                        args->ar_discard = 1;
-                        break;
-                case Opt_nodiscard:
-                        args->ar_discard = 0;
-                        break;
-                case Opt_err:
-                default:
-                        fs_info(sdp, "invalid mount option: %s\n", o);
-                        return -EINVAL;
-                }
-        }
-        return 0;
-}
diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h
deleted file mode 100644
index 5da21285bba4..000000000000
--- a/fs/gfs2/ops_address.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-#ifndef __OPS_ADDRESS_DOT_H__
-#define __OPS_ADDRESS_DOT_H__
-#include <linux/fs.h>
-#include <linux/buffer_head.h>
-#include <linux/mm.h>
-extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
-extern int gfs2_internal_read(struct gfs2_inode *ip,
-                              struct file_ra_state *ra_state,
-                              char *buf, loff_t *pos, unsigned size);
-extern void gfs2_set_aops(struct inode *inode);
-#endif /* __OPS_ADDRESS_DOT_H__ */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 1ff9473ea753..7bc3c45cd676 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -17,6 +17,7 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/gfs2_ondisk.h>
+#include <linux/slow-work.h>
 #include "gfs2.h"
 #include "incore.h"
@@ -32,6 +33,7 @@
 #include "log.h"
 #include "quota.h"
 #include "dir.h"
+#include "trace_gfs2.h"
 #define DO 0
 #define UNDO 1
@@ -55,8 +57,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
        spin_lock_init(&gt->gt_spin);
        gt->gt_incore_log_blocks = 1024;
-        gt->gt_log_flush_secs = 60;
-        gt->gt_recoverd_secs = 60;
        gt->gt_logd_secs = 1;
        gt->gt_quota_simul_sync = 64;
        gt->gt_quota_warn_period = 10;
@@ -526,11 +526,11 @@ static int init_sb(struct gfs2_sbd *sdp, int silent)
        }
        /* Set up the buffer cache and SB for real */
-        if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) {
+        if (sdp->sd_sb.sb_bsize < bdev_logical_block_size(sb->s_bdev)) {
                ret = -EINVAL;
                fs_err(sdp, "FS block size (%u) is too small for device "
                       "block size (%u)\n",
-                       sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev));
+                       sdp->sd_sb.sb_bsize, bdev_logical_block_size(sb->s_bdev));
                goto out;
        }
        if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
@@ -676,6 +676,7 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
                        break;
                INIT_LIST_HEAD(&jd->extent_list);
+                slow_work_init(&jd->jd_work, &gfs2_recover_ops);
                jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
                if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
                        if (!jd->jd_inode)
@@ -701,14 +702,13 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 {
        struct inode *master = sdp->sd_master_dir->d_inode;
        struct gfs2_holder ji_gh;
-        struct task_struct *p;
        struct gfs2_inode *ip;
        int jindex = 1;
        int error = 0;
        if (undo) {
                jindex = 0;
-                goto fail_recoverd;
+                goto fail_jinode_gh;
        }
        sdp->sd_jindex = gfs2_lookup_simple(master, "jindex");
@@ -776,6 +776,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
                /* Map the extents for this journal's blocks */
                map_journal_extents(sdp);
        }
+        trace_gfs2_log_blocks(sdp, atomic_read(&sdp->sd_log_blks_free));
        if (sdp->sd_lockstruct.ls_first) {
                unsigned int x;
@@ -801,18 +802,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
        gfs2_glock_dq_uninit(&ji_gh);
        jindex = 0;
-        p = kthread_run(gfs2_recoverd, sdp, "gfs2_recoverd");
-        error = IS_ERR(p);
-        if (error) {
-                fs_err(sdp, "can't start recoverd thread: %d\n", error);
-                goto fail_jinode_gh;
-        }
-        sdp->sd_recoverd_process = p;
        return 0;
-fail_recoverd:
-        kthread_stop(sdp->sd_recoverd_process);
 fail_jinode_gh:
        if (!sdp->sd_args.ar_spectator)
                gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
@@ -1165,6 +1156,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
        sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT;
        sdp->sd_args.ar_data = GFS2_DATA_DEFAULT;
+        sdp->sd_args.ar_commit = 60;
        error = gfs2_mount_args(sdp, &sdp->sd_args, data);
        if (error) {
@@ -1172,8 +1164,10 @@ static int fill_super(struct super_block *sb, void *data, int silent)
                goto fail;
        }
-        if (sdp->sd_args.ar_spectator)
+        if (sdp->sd_args.ar_spectator) {
                sb->s_flags |= MS_RDONLY;
+                set_bit(SDF_NORECOVERY, &sdp->sd_flags);
+        }
        if (sdp->sd_args.ar_posix_acl)
                sb->s_flags |= MS_POSIXACL;
@@ -1191,6 +1185,8 @@ static int fill_super(struct super_block *sb, void *data, int silent)
                               GFS2_BASIC_BLOCK_SHIFT;
        sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
+        sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit;
        error = init_names(sdp, silent);
        if (error)
                goto fail;
@@ -1279,9 +1275,22 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
        return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
 }
-static struct super_block *get_gfs2_sb(const char *dev_name)
+static int test_meta_super(struct super_block *s, void *ptr)
+{
+        struct block_device *bdev = ptr;
+        return (bdev == s->s_bdev);
+}
+static int set_meta_super(struct super_block *s, void *ptr)
 {
-        struct super_block *sb;
+        return -EINVAL;
+}
+static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
+                            const char *dev_name, void *data, struct vfsmount *mnt)
+{
+        struct super_block *s;
+        struct gfs2_sbd *sdp;
        struct path path;
        int error;
@@ -1289,30 +1298,17 @@ static struct super_block *get_gfs2_sb(const char *dev_name)
        if (error) {
                printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
                       dev_name, error);
-                return NULL;
+                return error;
        }
-        sb = path.dentry->d_inode->i_sb;
+        s = sget(&gfs2_fs_type, test_meta_super, set_meta_super,
-        if (sb && (sb->s_type == &gfs2_fs_type))
+                 path.dentry->d_inode->i_sb->s_bdev);
-                atomic_inc(&sb->s_active);
-        else
-                sb = NULL;
        path_put(&path);
-        return sb;
+        if (IS_ERR(s)) {
-}
-static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
-                            const char *dev_name, void *data, struct vfsmount *mnt)
-{
-        struct super_block *sb = NULL;
-        struct gfs2_sbd *sdp;
-        sb = get_gfs2_sb(dev_name);
-        if (!sb) {
                printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
-                return -ENOENT;
+                return PTR_ERR(s);
        }
-        sdp = sb->s_fs_info;
+        sdp = s->s_fs_info;
-        mnt->mnt_sb = sb;
+        mnt->mnt_sb = s;
        mnt->mnt_root = dget(sdp->sd_master_dir);
        return 0;
 }
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 1c70fa5168d6..f8bd20baf99c 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -262,6 +262,44 @@ out_parent:
        return error;
 }
+/**
+ * gfs2_unlink_ok - check to see that an inode is still in a directory
+ * @dip: the directory
+ * @name: the name of the file
+ * @ip: the inode
+ *
+ * Assumes that the lock on (at least) @dip is held.
+ *
+ * Returns: 0 if the parent/child relationship is correct, errno if it isn't
+ */
+static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
+                          const struct gfs2_inode *ip)
+{
+        int error;
+        if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
+                return -EPERM;
+        if ((dip->i_inode.i_mode & S_ISVTX) &&
+            dip->i_inode.i_uid != current_fsuid() &&
+            ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER))
+                return -EPERM;
+        if (IS_APPEND(&dip->i_inode))
+                return -EPERM;
+        error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
+        if (error)
+                return error;
+        error = gfs2_dir_check(&dip->i_inode, name, ip);
+        if (error)
+                return error;
+        return 0;
+}
 /**
 * gfs2_unlink - Unlink a file
 * @dir: The inode of the directory containing the file to unlink
@@ -473,6 +511,59 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 }
 /**
+ * gfs2_rmdiri - Remove a directory
+ * @dip: The parent directory holding the entry to be removed
+ * @name: The name of the directory within @dip
+ * @ip: The GFS2 inode of the directory to be removed
+ *
+ * The caller must already hold the glocks on both @dip and @ip.
+ *
+ * @ip is only removed when it holds exactly two entries; any other
+ * count is reported through gfs2_consist_inode(). The steps run in
+ * sequence and the first failure is returned as is, without undoing
+ * the steps that already succeeded.
+ *
+ * Returns: 0 on success, -EIO on an unexpected entry count, otherwise
+ *          the errno of the first step that failed
+ */
+static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
+                       struct gfs2_inode *ip)
+{
+        struct qstr entry;
+        int ret;
+        if (ip->i_entries != 2) {
+                if (gfs2_consist_inode(ip))
+                        gfs2_dinode_print(ip);
+                return -EIO;
+        }
+        /* Drop the entry from the parent and the parent's link to it */
+        ret = gfs2_dir_del(dip, name);
+        if (!ret)
+                ret = gfs2_change_nlink(dip, -1);
+        /* Remove "." and ".." from the directory itself */
+        if (!ret) {
+                gfs2_str2qstr(&entry, ".");
+                ret = gfs2_dir_del(ip, &entry);
+        }
+        if (!ret) {
+                gfs2_str2qstr(&entry, "..");
+                ret = gfs2_dir_del(ip, &entry);
+        }
+        /* The link count of @ip is deliberately decremented twice */
+        if (!ret)
+                ret = gfs2_change_nlink(ip, -1);
+        if (!ret)
+                ret = gfs2_change_nlink(ip, -1);
+        return ret;
+}
+/**
 * gfs2_rmdir - Remove a directory
 * @dir: The parent directory of the directory to be removed
 * @dentry: The dentry of the directory to remove
@@ -885,6 +976,61 @@ out:
 }
 /**
+ * gfs2_readlinki - return the contents of a symlink
+ * @ip: the symlink's inode
+ * @buf: a pointer to the buffer to be filled; *@buf is replaced by a
+ *       newly allocated buffer when the supplied one is too small
+ * @len: a pointer to the length of *@buf on entry; on success it is
+ *       set to the number of bytes copied (i_disksize + 1)
+ *
+ * A shared glock on @ip is taken on entry and dropped again before
+ * returning. The data is copied out of the inode's metadata buffer,
+ * starting directly behind the struct gfs2_dinode header.
+ *
+ * If *@buf is too small, a piece of memory is kmalloc()ed and needs
+ * to be freed by the caller. If that allocation fails, *@buf has
+ * already been overwritten with NULL when -ENOMEM is returned.
+ *
+ * NOTE(review): the copy length is derived from i_disksize only; no
+ * check against the size of the metadata buffer is visible here -
+ * confirm that the size is validated elsewhere.
+ *
+ * Returns: 0 on success, -EIO if the inode has a zero i_disksize,
+ *          -ENOMEM if the allocation fails, otherwise the errno from
+ *          gfs2_glock_nq() or gfs2_meta_inode_buffer()
+ */
+static int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
+{
+        struct gfs2_holder i_gh;
+        struct buffer_head *dibh;
+        unsigned int x;
+        int error;
+        gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
+        error = gfs2_glock_nq(&i_gh);
+        if (error) {
+                gfs2_holder_uninit(&i_gh);
+                return error;
+        }
+        if (!ip->i_disksize) {
+                gfs2_consist_inode(ip);
+                error = -EIO;
+                goto out;
+        }
+        error = gfs2_meta_inode_buffer(ip, &dibh);
+        if (error)
+                goto out;
+        x = ip->i_disksize + 1; /* presumably +1 for the trailing NUL - confirm */
+        if (x > *len) {
+                *buf = kmalloc(x, GFP_NOFS);
+                if (!*buf) {
+                        error = -ENOMEM;
+                        goto out_brelse;
+                }
+        }
+        memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
+        *len = x; /* report the number of bytes copied */
+out_brelse:
+        brelse(dibh);
+out:
+        gfs2_glock_dq_uninit(&i_gh);
+        return error;
+}
+/**
 * gfs2_readlink - Read the value of a symlink
 * @dentry: the symlink
 * @buf: the buffer to read the symlink data into
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
deleted file mode 100644
index 458019569dcb..000000000000
--- a/fs/gfs2/ops_super.c
+++ /dev/null
@@ -1,723 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/statfs.h>
-#include <linux/seq_file.h>
-#include <linux/mount.h>
-#include <linux/kthread.h>
-#include <linux/delay.h>
-#include <linux/gfs2_ondisk.h>
-#include <linux/crc32.h>
-#include <linux/time.h>
-#include "gfs2.h"
-#include "incore.h"
-#include "glock.h"
-#include "inode.h"
-#include "log.h"
-#include "quota.h"
-#include "recovery.h"
-#include "rgrp.h"
-#include "super.h"
-#include "sys.h"
-#include "util.h"
-#include "trans.h"
-#include "dir.h"
-#include "eattr.h"
-#include "bmap.h"
-#include "meta_io.h"
-#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
-/**
- * gfs2_write_inode - Make sure the inode is stable on the disk
- * @inode: The inode
- * @sync: synchronous write flag
- *
- * Returns: errno
- */
-static int gfs2_write_inode(struct inode *inode, int sync)
-{
-        struct gfs2_inode *ip = GFS2_I(inode);
-        struct gfs2_sbd *sdp = GFS2_SB(inode);
-        struct gfs2_holder gh;
-        struct buffer_head *bh;
-        struct timespec atime;
-        struct gfs2_dinode *di;
-        int ret = 0;
-        /* Check this is a "normal" inode, etc */
-        if (!test_bit(GIF_USER, &ip->i_flags) ||
-            (current->flags & PF_MEMALLOC))
-                return 0;
-        ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
-        if (ret)
-                goto do_flush;
-        ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
-        if (ret)
-                goto do_unlock;
-        ret = gfs2_meta_inode_buffer(ip, &bh);
-        if (ret == 0) {
-                di = (struct gfs2_dinode *)bh->b_data;
-                atime.tv_sec = be64_to_cpu(di->di_atime);
-                atime.tv_nsec = be32_to_cpu(di->di_atime_nsec);
-                if (timespec_compare(&inode->i_atime, &atime) > 0) {
-                        gfs2_trans_add_bh(ip->i_gl, bh, 1);
-                        gfs2_dinode_out(ip, bh->b_data);
-                }
-                brelse(bh);
-        }
-        gfs2_trans_end(sdp);
-do_unlock:
-        gfs2_glock_dq_uninit(&gh);
-do_flush:
-        if (sync != 0)
-                gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
-        return ret;
-}
-/**
- * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
- * @sdp: the filesystem
- *
- * Returns: errno
- */
-static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
-{
-        struct gfs2_holder t_gh;
-        int error;
-        gfs2_quota_sync(sdp);
-        gfs2_statfs_sync(sdp);
-        error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
-                                   &t_gh);
-        if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
-                return error;
-        gfs2_meta_syncfs(sdp);
-        gfs2_log_shutdown(sdp);
-        clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
-        if (t_gh.gh_gl)
-                gfs2_glock_dq_uninit(&t_gh);
-        gfs2_quota_cleanup(sdp);
-        return error;
-}
-/**
- * gfs2_put_super - Unmount the filesystem
- * @sb: The VFS superblock
- *
- */
-static void gfs2_put_super(struct super_block *sb)
-{
-        struct gfs2_sbd *sdp = sb->s_fs_info;
-        int error;
-        /*  Unfreeze the filesystem, if we need to  */
-        mutex_lock(&sdp->sd_freeze_lock);
-        if (sdp->sd_freeze_count)
-                gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
-        mutex_unlock(&sdp->sd_freeze_lock);
-        kthread_stop(sdp->sd_quotad_process);
-        kthread_stop(sdp->sd_logd_process);
-        kthread_stop(sdp->sd_recoverd_process);
-        if (!(sb->s_flags & MS_RDONLY)) {
-                error = gfs2_make_fs_ro(sdp);
-                if (error)
-                        gfs2_io_error(sdp);
-        }
-        /*  At this point, we're through modifying the disk  */
-        /*  Release stuff  */
-        iput(sdp->sd_jindex);
-        iput(sdp->sd_inum_inode);
-        iput(sdp->sd_statfs_inode);
-        iput(sdp->sd_rindex);
-        iput(sdp->sd_quota_inode);
-        gfs2_glock_put(sdp->sd_rename_gl);
-        gfs2_glock_put(sdp->sd_trans_gl);
-        if (!sdp->sd_args.ar_spectator) {
-                gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
-                gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
-                gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
-                gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
-                gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
-                iput(sdp->sd_ir_inode);
-                iput(sdp->sd_sc_inode);
-                iput(sdp->sd_qc_inode);
-        }
-        gfs2_glock_dq_uninit(&sdp->sd_live_gh);
-        gfs2_clear_rgrpd(sdp);
-        gfs2_jindex_free(sdp);
-        /*  Take apart glock structures and buffer lists  */
-        gfs2_gl_hash_clear(sdp);
-        /*  Unmount the locking protocol  */
-        gfs2_lm_unmount(sdp);
-        /*  At this point, we're through participating in the lockspace  */
-        gfs2_sys_fs_del(sdp);
-}
-/**
- * gfs2_write_super
- * @sb: the superblock
- *
- */
-static void gfs2_write_super(struct super_block *sb)
-{
-        sb->s_dirt = 0;
-}
-/**
- * gfs2_sync_fs - sync the filesystem
- * @sb: the superblock
- *
- * Flushes the log to disk.
- */
-static int gfs2_sync_fs(struct super_block *sb, int wait)
-{
-        sb->s_dirt = 0;
-        if (wait && sb->s_fs_info)
-                gfs2_log_flush(sb->s_fs_info, NULL);
-        return 0;
-}
-/**
- * gfs2_freeze - prevent further writes to the filesystem
- * @sb: the VFS structure for the filesystem
- *
- */
-static int gfs2_freeze(struct super_block *sb)
-{
-        struct gfs2_sbd *sdp = sb->s_fs_info;
-        int error;
-        if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
-                return -EINVAL;
-        for (;;) {
-                error = gfs2_freeze_fs(sdp);
-                if (!error)
-                        break;
-                switch (error) {
-                case -EBUSY:
-                        fs_err(sdp, "waiting for recovery before freeze\n");
-                        break;
-                default:
-                        fs_err(sdp, "error freezing FS: %d\n", error);
-                        break;
-                }
-                fs_err(sdp, "retrying...\n");
-                msleep(1000);
-        }
-        return 0;
-}
-/**
- * gfs2_unfreeze - reallow writes to the filesystem
- * @sb: the VFS structure for the filesystem
- *
- */
-static int gfs2_unfreeze(struct super_block *sb)
-{
-        gfs2_unfreeze_fs(sb->s_fs_info);
-        return 0;
-}
-/**
- * statfs_fill - fill in the sg for a given RG
- * @rgd: the RG
- * @sc: the sc structure
- *
- * Returns: 0 on success, -ESTALE if the LVB is invalid
- */
-static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
-                            struct gfs2_statfs_change_host *sc)
-{
-        gfs2_rgrp_verify(rgd);
-        sc->sc_total += rgd->rd_data;
-        sc->sc_free += rgd->rd_free;
-        sc->sc_dinodes += rgd->rd_dinodes;
-        return 0;
-}
-/**
- * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
- * @sdp: the filesystem
- * @sc: the sc info that will be returned
- *
- * Any error (other than a signal) will cause this routine to fall back
- * to the synchronous version.
- *
- * FIXME: This really shouldn't busy wait like this.
- *
- * Returns: errno
- */
-static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
-{
-        struct gfs2_holder ri_gh;
-        struct gfs2_rgrpd *rgd_next;
-        struct gfs2_holder *gha, *gh;
-        unsigned int slots = 64;
-        unsigned int x;
-        int done;
-        int error = 0, err;
-        memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
-        gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
-        if (!gha)
-                return -ENOMEM;
-        error = gfs2_rindex_hold(sdp, &ri_gh);
-        if (error)
-                goto out;
-        rgd_next = gfs2_rgrpd_get_first(sdp);
-        for (;;) {
-                done = 1;
-                for (x = 0; x < slots; x++) {
-                        gh = gha + x;
-                        if (gh->gh_gl && gfs2_glock_poll(gh)) {
-                                err = gfs2_glock_wait(gh);
-                                if (err) {
-                                        gfs2_holder_uninit(gh);
-                                        error = err;
-                                } else {
-                                        if (!error)
-                                                error = statfs_slow_fill(
-                                                        gh->gh_gl->gl_object, sc);
-                                        gfs2_glock_dq_uninit(gh);
-                                }
-                        }
-                        if (gh->gh_gl)
-                                done = 0;
-                        else if (rgd_next && !error) {
-                                error = gfs2_glock_nq_init(rgd_next->rd_gl,
-                                                           LM_ST_SHARED,
-                                                           GL_ASYNC,
-                                                           gh);
-                                rgd_next = gfs2_rgrpd_get_next(rgd_next);
-                                done = 0;
-                        }
-                        if (signal_pending(current))
-                                error = -ERESTARTSYS;
-                }
-                if (done)
-                        break;
-                yield();
-        }
-        gfs2_glock_dq_uninit(&ri_gh);
-out:
-        kfree(gha);
-        return error;
-}
-/**
- * gfs2_statfs_i - Do a statfs
- * @sdp: the filesystem
- * @sg: the sg structure
- *
- * Returns: errno
- */
-static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
-{
-        struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
-        struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
-        spin_lock(&sdp->sd_statfs_spin);
-        *sc = *m_sc;
-        sc->sc_total += l_sc->sc_total;
-        sc->sc_free += l_sc->sc_free;
-        sc->sc_dinodes += l_sc->sc_dinodes;
-        spin_unlock(&sdp->sd_statfs_spin);
-        if (sc->sc_free < 0)
-                sc->sc_free = 0;
-        if (sc->sc_free > sc->sc_total)
-                sc->sc_free = sc->sc_total;
-        if (sc->sc_dinodes < 0)
-                sc->sc_dinodes = 0;
-        return 0;
-}
-/**
- * gfs2_statfs - Gather and return stats about the filesystem
- * @sb: The superblock
- * @statfsbuf: The buffer
- *
- * Returns: 0 on success or error code
- */
-static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
-        struct super_block *sb = dentry->d_inode->i_sb;
-        struct gfs2_sbd *sdp = sb->s_fs_info;
-        struct gfs2_statfs_change_host sc;
-        int error;
-        if (gfs2_tune_get(sdp, gt_statfs_slow))
-                error = gfs2_statfs_slow(sdp, &sc);
-        else
-                error = gfs2_statfs_i(sdp, &sc);
-        if (error)
-                return error;
-        buf->f_type = GFS2_MAGIC;
-        buf->f_bsize = sdp->sd_sb.sb_bsize;
-        buf->f_blocks = sc.sc_total;
-        buf->f_bfree = sc.sc_free;
-        buf->f_bavail = sc.sc_free;
-        buf->f_files = sc.sc_dinodes + sc.sc_free;
-        buf->f_ffree = sc.sc_free;
-        buf->f_namelen = GFS2_FNAMESIZE;
-        return 0;
-}
-/**
- * gfs2_remount_fs - called when the FS is remounted
- * @sb:  the filesystem
- * @flags:  the remount flags
- * @data:  extra data passed in (not used right now)
- *
- * Returns: errno
- */
-static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
-{
-        struct gfs2_sbd *sdp = sb->s_fs_info;
-        struct gfs2_args args = sdp->sd_args; /* Default to current settings */
-        int error;
-        error = gfs2_mount_args(sdp, &args, data);
-        if (error)
-                return error;
-        /* Not allowed to change locking details */
-        if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) ||
-            strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) ||
-            strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata))
-                return -EINVAL;
-        /* Some flags must not be changed */
-        if (args_neq(&args, &sdp->sd_args, spectator) ||
-            args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
-            args_neq(&args, &sdp->sd_args, localflocks) ||
-            args_neq(&args, &sdp->sd_args, localcaching) ||
-            args_neq(&args, &sdp->sd_args, meta))
-                return -EINVAL;
-        if (sdp->sd_args.ar_spectator)
-                *flags |= MS_RDONLY;
-        if ((sb->s_flags ^ *flags) & MS_RDONLY) {
-                if (*flags & MS_RDONLY)
-                        error = gfs2_make_fs_ro(sdp);
-                else
-                        error = gfs2_make_fs_rw(sdp);
-                if (error)
-                        return error;
-        }
-        sdp->sd_args = args;
-        if (sdp->sd_args.ar_posix_acl)
-                sb->s_flags |= MS_POSIXACL;
-        else
-                sb->s_flags &= ~MS_POSIXACL;
-        return 0;
-}
-/**
- * gfs2_drop_inode - Drop an inode (test for remote unlink)
- * @inode: The inode to drop
- *
- * If we've received a callback on an iopen lock then its because a
- * remote node tried to deallocate the inode but failed due to this node
- * still having the inode open. Here we mark the link count zero
- * since we know that it must have reached zero if the GLF_DEMOTE flag
- * is set on the iopen glock. If we didn't do a disk read since the
- * remote node removed the final link then we might otherwise miss
- * this event. This check ensures that this node will deallocate the
- * inode's blocks, or alternatively pass the baton on to another
- * node for later deallocation.
- */
-static void gfs2_drop_inode(struct inode *inode)
-{
-        struct gfs2_inode *ip = GFS2_I(inode);
-        if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) {
-                struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
-                if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
-                        clear_nlink(inode);
-        }
-        generic_drop_inode(inode);
-}
-/**
- * gfs2_clear_inode - Deallocate an inode when VFS is done with it
- * @inode: The VFS inode
- *
- */
-static void gfs2_clear_inode(struct inode *inode)
-{
-        struct gfs2_inode *ip = GFS2_I(inode);
-        /* This tells us its a "real" inode and not one which only
-         * serves to contain an address space (see rgrp.c, meta_io.c)
-         * which therefore doesn't have its own glocks.
-         */
-        if (test_bit(GIF_USER, &ip->i_flags)) {
-                ip->i_gl->gl_object = NULL;
-                gfs2_glock_put(ip->i_gl);
-                ip->i_gl = NULL;
-                if (ip->i_iopen_gh.gh_gl) {
-                        ip->i_iopen_gh.gh_gl->gl_object = NULL;
-                        gfs2_glock_dq_uninit(&ip->i_iopen_gh);
-                }
-        }
-}
-static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
-{
-        do {
-                if (d1 == d2)
-                        return 1;
-                d1 = d1->d_parent;
-        } while (!IS_ROOT(d1));
-        return 0;
-}
-/**
- * gfs2_show_options - Show mount options for /proc/mounts
- * @s: seq_file structure
- * @mnt: vfsmount
- *
- * Returns: 0 on success or error code
- */
-static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
-{
-        struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
-        struct gfs2_args *args = &sdp->sd_args;
-        if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
-                seq_printf(s, ",meta");
-        if (args->ar_lockproto[0])
-                seq_printf(s, ",lockproto=%s", args->ar_lockproto);
-        if (args->ar_locktable[0])
-                seq_printf(s, ",locktable=%s", args->ar_locktable);
-        if (args->ar_hostdata[0])
-                seq_printf(s, ",hostdata=%s", args->ar_hostdata);
-        if (args->ar_spectator)
-                seq_printf(s, ",spectator");
-        if (args->ar_ignore_local_fs)
-                seq_printf(s, ",ignore_local_fs");
-        if (args->ar_localflocks)
-                seq_printf(s, ",localflocks");
-        if (args->ar_localcaching)
-                seq_printf(s, ",localcaching");
-        if (args->ar_debug)
-                seq_printf(s, ",debug");
-        if (args->ar_upgrade)
-                seq_printf(s, ",upgrade");
-        if (args->ar_posix_acl)
-                seq_printf(s, ",acl");
-        if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
-                char *state;
-                switch (args->ar_quota) {
-                case GFS2_QUOTA_OFF:
-                        state = "off";
-                        break;
-                case GFS2_QUOTA_ACCOUNT:
-                        state = "account";
-                        break;
-                case GFS2_QUOTA_ON:
-                        state = "on";
-                        break;
-                default:
-                        state = "unknown";
-                        break;
-                }
-                seq_printf(s, ",quota=%s", state);
-        }
-        if (args->ar_suiddir)
-                seq_printf(s, ",suiddir");
-        if (args->ar_data != GFS2_DATA_DEFAULT) {
-                char *state;
-                switch (args->ar_data) {
-                case GFS2_DATA_WRITEBACK:
-                        state = "writeback";
-                        break;
-                case GFS2_DATA_ORDERED:
-                        state = "ordered";
-                        break;
-                default:
-                        state = "unknown";
-                        break;
-                }
-                seq_printf(s, ",data=%s", state);
-        }
-        if (args->ar_discard)
-                seq_printf(s, ",discard");
-        return 0;
-}
-/*
- * We have to (at the moment) hold the inodes main lock to cover
- * the gap between unlocking the shared lock on the iopen lock and
- * taking the exclusive lock. I'd rather do a shared -> exclusive
- * conversion on the iopen lock, but we can change that later. This
- * is safe, just less efficient.
- */
-static void gfs2_delete_inode(struct inode *inode)
-{
-        struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
-        struct gfs2_inode *ip = GFS2_I(inode);
-        struct gfs2_holder gh;
-        int error;
-        if (!test_bit(GIF_USER, &ip->i_flags))
-                goto out;
-        error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
-        if (unlikely(error)) {
-                gfs2_glock_dq_uninit(&ip->i_iopen_gh);
-                goto out;
-        }
-        gfs2_glock_dq_wait(&ip->i_iopen_gh);
-        gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
-        error = gfs2_glock_nq(&ip->i_iopen_gh);
-        if (error)
-                goto out_truncate;
-        if (S_ISDIR(inode->i_mode) &&
-            (ip->i_diskflags & GFS2_DIF_EXHASH)) {
-                error = gfs2_dir_exhash_dealloc(ip);
-                if (error)
-                        goto out_unlock;
-        }
-        if (ip->i_eattr) {
-                error = gfs2_ea_dealloc(ip);
-                if (error)
-                        goto out_unlock;
-        }
-        if (!gfs2_is_stuffed(ip)) {
-                error = gfs2_file_dealloc(ip);
-                if (error)
-                        goto out_unlock;
-        }
-        error = gfs2_dinode_dealloc(ip);
-        if (error)
-                goto out_unlock;
-out_truncate:
-        error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
-        if (error)
-                goto out_unlock;
-        /* Needs to be done before glock release & also in a transaction */
-        truncate_inode_pages(&inode->i_data, 0);
-        gfs2_trans_end(sdp);
-out_unlock:
-        if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
-                gfs2_glock_dq(&ip->i_iopen_gh);
-        gfs2_holder_uninit(&ip->i_iopen_gh);
-        gfs2_glock_dq_uninit(&gh);
-        if (error && error != GLR_TRYFAILED)
-                fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
-out:
-        truncate_inode_pages(&inode->i_data, 0);
-        clear_inode(inode);
-}
-static struct inode *gfs2_alloc_inode(struct super_block *sb)
-{
-        struct gfs2_inode *ip;
-        ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
-        if (ip) {
-                ip->i_flags = 0;
-                ip->i_gl = NULL;
-        }
-        return &ip->i_inode;
-}
-static void gfs2_destroy_inode(struct inode *inode)
-{
-        kmem_cache_free(gfs2_inode_cachep, inode);
-}
-const struct super_operations gfs2_super_ops = {
-        .alloc_inode            = gfs2_alloc_inode,
-        .destroy_inode          = gfs2_destroy_inode,
-        .write_inode            = gfs2_write_inode,
-        .delete_inode           = gfs2_delete_inode,
-        .put_super              = gfs2_put_super,
-        .write_super            = gfs2_write_super,
-        .sync_fs                = gfs2_sync_fs,
-        .freeze_fs              = gfs2_freeze,
-        .unfreeze_fs            = gfs2_unfreeze,
-        .statfs                 = gfs2_statfs,
-        .remount_fs             = gfs2_remount_fs,
-        .clear_inode            = gfs2_clear_inode,
-        .drop_inode             = gfs2_drop_inode,
-        .show_options           = gfs2_show_options,
-};
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 152e6c4a0dca..2e9b9326bfc9 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -60,7 +60,6 @@
 #include "super.h"
 #include "trans.h"
 #include "inode.h"
-#include "ops_address.h"
 #include "util.h"
 #define QUOTA_USER 1
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 247e8f7d6b3d..59d2695509d3 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -13,8 +13,7 @@
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
-#include <linux/kthread.h>
+#include <linux/slow-work.h>
-#include <linux/freezer.h>
 #include "gfs2.h"
 #include "incore.h"
@@ -441,18 +440,25 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
        kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
 }
-/**
+static int gfs2_recover_get_ref(struct slow_work *work)
- * gfs2_recover_journal - recover a given journal
+{
- * @jd: the struct gfs2_jdesc describing the journal
+        struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
- *
+        if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags))
- * Acquire the journal's lock, check to see if the journal is clean, and
+                return -EBUSY;
- * do recovery if necessary.
+        return 0;
- *
+}
- * Returns: errno
- */
-int gfs2_recover_journal(struct gfs2_jdesc *jd)
+static void gfs2_recover_put_ref(struct slow_work *work)
+{       /* .put_ref hook of gfs2_recover_ops: release the JDF_RECOVERY claim */
+        struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
+        clear_bit(JDF_RECOVERY, &jd->jd_flags); /* bit is set in gfs2_recover_get_ref() */
+        smp_mb__after_clear_bit(); /* barrier between the clear and the wakeup */
+        wake_up_bit(&jd->jd_flags, JDF_RECOVERY); /* gfs2_recover_journal() waits on this bit */
+}
+static void gfs2_recover_work(struct slow_work *work)
 {
+        struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
        struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
        struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
        struct gfs2_log_header_host head;
@@ -569,7 +575,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
                gfs2_glock_dq_uninit(&j_gh);
        fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
-        return 0;
+        return;
 fail_gunlock_tr:
        gfs2_glock_dq_uninit(&t_gh);
@@ -584,70 +590,28 @@ fail_gunlock_j:
 fail:
        gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
-        return error;
 }
-static struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
+struct slow_work_ops gfs2_recover_ops = {
-{
+        .get_ref = gfs2_recover_get_ref,
-        struct gfs2_jdesc *jd;
+        .put_ref = gfs2_recover_put_ref,
-        int found = 0;
+        .execute = gfs2_recover_work,
+};
-        spin_lock(&sdp->sd_jindex_spin);
-        list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
-                if (jd->jd_dirty) {
-                        jd->jd_dirty = 0;
-                        found = 1;
-                        break;
-                }
-        }
-        spin_unlock(&sdp->sd_jindex_spin);
-        if (!found)
-                jd = NULL;
-        return jd;
+static int gfs2_recovery_wait(void *word)
-}
-/**
- * gfs2_check_journals - Recover any dirty journals
- * @sdp: the filesystem
- *
- */
-static void gfs2_check_journals(struct gfs2_sbd *sdp)
 {
-        struct gfs2_jdesc *jd;
+        schedule();
+        return 0;
-        for (;;) {
-                jd = gfs2_jdesc_find_dirty(sdp);
-                if (!jd)
-                        break;
-                if (jd != sdp->sd_jdesc)
-                        gfs2_recover_journal(jd);
-        }
 }
-/**
+int gfs2_recover_journal(struct gfs2_jdesc *jd)
- * gfs2_recoverd - Recover dead machine's journals
- * @sdp: Pointer to GFS2 superblock
- *
- */
-int gfs2_recoverd(void *data)
 {
-        struct gfs2_sbd *sdp = data;
+        int rv;
-        unsigned long t;
+        rv = slow_work_enqueue(&jd->jd_work);
+        if (rv)
-        while (!kthread_should_stop()) {
+                return rv;
-                gfs2_check_journals(sdp);
+        wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, TASK_UNINTERRUPTIBLE);
-                t = gfs2_tune_get(sdp,  gt_recoverd_secs) * HZ;
-                if (freezing(current))
-                        refrigerator();
-                schedule_timeout_interruptible(t);
-        }
        return 0;
 }
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index a8218ea15b57..1616ac22569a 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -28,7 +28,7 @@ extern void gfs2_revoke_clean(struct gfs2_sbd *sdp);
 extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
                    struct gfs2_log_header_host *head);
 extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
-extern int gfs2_recoverd(void *data);
+extern struct slow_work_ops gfs2_recover_ops;
 #endif /* __RECOVERY_DOT_H__ */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 565038243fa2..daa4ae341a29 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -29,7 +29,7 @@
 #include "util.h"
 #include "log.h"
 #include "inode.h"
-#include "ops_address.h"
+#include "trace_gfs2.h"
 #define BFITNOENT ((u32)~0)
 #define NO_BLOCK ((u64)~0)
@@ -442,6 +442,7 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd)
        for (x = 0; x < length; x++) {
                bi = rgd->rd_bits + x;
+                bi->bi_flags = 0;
                /* small rgrp; bitmap stored completely in header block */
                if (length == 1) {
                        bytes = bytes_left;
@@ -580,7 +581,6 @@ static int read_rindex_entry(struct gfs2_inode *ip,
        rgd->rd_gl->gl_object = rgd;
        rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
-        rgd->rd_flags |= GFS2_RDF_CHECK;
        return error;
 }
@@ -701,10 +701,9 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
        u32 rg_flags;
        rg_flags = be32_to_cpu(str->rg_flags);
-        if (rg_flags & GFS2_RGF_NOALLOC)
+        rg_flags &= ~GFS2_RDF_MASK;
-                rgd->rd_flags |= GFS2_RDF_NOALLOC;
+        rgd->rd_flags &= GFS2_RDF_MASK;
-        else
+        rgd->rd_flags |= rg_flags;
-                rgd->rd_flags &= ~GFS2_RDF_NOALLOC;
        rgd->rd_free = be32_to_cpu(str->rg_free);
        rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
        rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
@@ -713,11 +712,8 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
 static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
 {
        struct gfs2_rgrp *str = buf;
-        u32 rg_flags = 0;
-        if (rgd->rd_flags & GFS2_RDF_NOALLOC)
+        str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
-                rg_flags |= GFS2_RGF_NOALLOC;
-        str->rg_flags = cpu_to_be32(rg_flags);
        str->rg_free = cpu_to_be32(rgd->rd_free);
        str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
        str->__pad = cpu_to_be32(0);
@@ -775,8 +771,10 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
        }
        if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
+                for (x = 0; x < length; x++)
+                        clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
                gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
-                rgd->rd_flags |= GFS2_RDF_UPTODATE;
+                rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
        }
        spin_lock(&sdp->sd_rindex_spin);
@@ -845,7 +843,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
        struct super_block *sb = sdp->sd_vfs;
        struct block_device *bdev = sb->s_bdev;
        const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize /
-                                           bdev_hardsect_size(sb->s_bdev);
+                                           bdev_logical_block_size(sb->s_bdev);
        u64 blk;
        sector_t start = 0;
        sector_t nr_sects = 0;
@@ -903,6 +901,7 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
                        continue;
                if (sdp->sd_args.ar_discard)
                        gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi);
+                clear_bit(GBF_FULL, &bi->bi_flags);
                memcpy(bi->bi_clone + bi->bi_offset,
                       bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
        }
@@ -942,7 +941,7 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
        struct gfs2_sbd *sdp = rgd->rd_sbd;
        int ret = 0;
-        if (rgd->rd_flags & GFS2_RDF_NOALLOC)
+        if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR))
                return 0;
        spin_lock(&sdp->sd_rindex_spin);
@@ -1315,30 +1314,37 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
 {
        struct gfs2_bitmap *bi = NULL;
        const u32 length = rgd->rd_length;
-        u32 blk = 0;
+        u32 blk = BFITNOENT;
        unsigned int buf, x;
        const unsigned int elen = *n;
-        const u8 *buffer;
+        const u8 *buffer = NULL;
        *n = 0;
        /* Find bitmap block that contains bits for goal block */
        for (buf = 0; buf < length; buf++) {
                bi = rgd->rd_bits + buf;
-                if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
+                /* Convert scope of "goal" from rgrp-wide to within found bit block */
-                        break;
+                if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) {
+                        goal -= bi->bi_start * GFS2_NBBY;
+                        goto do_search;
+                }
        }
+        buf = 0;
+        goal = 0;
-        gfs2_assert(rgd->rd_sbd, buf < length);
+do_search:
-        /* Convert scope of "goal" from rgrp-wide to within found bit block */
-        goal -= bi->bi_start * GFS2_NBBY;
        /* Search (up to entire) bitmap in this rgrp for allocatable block.
           "x <= length", instead of "x < length", because we typically start
           the search in the middle of a bit block, but if we can't find an
           allocatable block anywhere else, we want to be able wrap around and
           search in the first part of our first-searched bit block.  */
        for (x = 0; x <= length; x++) {
+                bi = rgd->rd_bits + buf;
+                if (test_bit(GBF_FULL, &bi->bi_flags) &&
+                    (old_state == GFS2_BLKST_FREE))
+                        goto skip;
                /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone
                   bitmaps, so we must search the originals for that. */
                buffer = bi->bi_bh->b_data + bi->bi_offset;
@@ -1349,33 +1355,39 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
                if (blk != BFITNOENT)
                        break;
+                if ((goal == 0) && (old_state == GFS2_BLKST_FREE))
+                        set_bit(GBF_FULL, &bi->bi_flags);
                /* Try next bitmap block (wrap back to rgrp header if at end) */
-                buf = (buf + 1) % length;
+skip:
-                bi = rgd->rd_bits + buf;
+                buf++;
+                buf %= length;
                goal = 0;
        }
-        if (blk != BFITNOENT && old_state != new_state) {
+        if (blk == BFITNOENT)
-                *n = 1;
+                return blk;
-                gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
+        *n = 1;
+        if (old_state == new_state)
+                goto out;
+        gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
+        gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset,
+                    bi->bi_len, blk, new_state);
+        goal = blk;
+        while (*n < elen) {
+                goal++;
+                if (goal >= (bi->bi_len * GFS2_NBBY))
+                        break;
+                if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
+                    GFS2_BLKST_FREE)
+                        break;
                gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset,
-                            bi->bi_len, blk, new_state);
+                            bi->bi_len, goal, new_state);
-                goal = blk;
+                (*n)++;
-                while (*n < elen) {
-                        goal++;
-                        if (goal >= (bi->bi_len * GFS2_NBBY))
-                                break;
-                        if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
-                            GFS2_BLKST_FREE)
-                                break;
-                        gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone,
-                                    bi->bi_offset, bi->bi_len, goal,
-                                    new_state);
-                        (*n)++;
-                }
        }
+out:
-        return (blk == BFITNOENT) ? blk : (bi->bi_start * GFS2_NBBY) + blk;
+        return (bi->bi_start * GFS2_NBBY) + blk;
 }
 /**
@@ -1435,13 +1447,33 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
 }
 /**
- * gfs2_alloc_block - Allocate a block
+ * gfs2_rgrp_dump - print out an rgrp
+ * @seq: The iterator
+ * @gl: The glock in question
+ *
+ */
+int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) /* seq may be NULL: the rgrp_error path of gfs2_alloc_block() calls it that way */
+{
+        const struct gfs2_rgrpd *rgd = gl->gl_object;
+        if (rgd == NULL) /* glock has no rgrp attached: nothing to print */
+                return 0;
+        gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
+                       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
+                       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
+        return 0;
+}
+/**
+ * gfs2_alloc_block - Allocate one or more blocks
 * @ip: the inode to allocate the block for
+ * @bn: Used to return the starting block number
+ * @n: requested number of blocks/extent length (value/result)
 *
- * Returns: the allocated block
+ * Returns: 0 or error
 */
-u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
+int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct buffer_head *dibh;
@@ -1457,7 +1489,10 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
                goal = rgd->rd_last_alloc;
        blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED, n);
-        BUG_ON(blk == BFITNOENT);
+        /* Since all blocks are reserved in advance, this shouldn't happen */
+        if (blk == BFITNOENT)
+                goto rgrp_error;
        rgd->rd_last_alloc = blk;
        block = rgd->rd_data0 + blk;
@@ -1469,7 +1504,9 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
                di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal);
                brelse(dibh);
        }
-        gfs2_assert_withdraw(sdp, rgd->rd_free >= *n);
+        if (rgd->rd_free < *n)
+                goto rgrp_error;
        rgd->rd_free -= *n;
        gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
@@ -1483,8 +1520,17 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
        spin_lock(&sdp->sd_rindex_spin);
        rgd->rd_free_clone -= *n;
        spin_unlock(&sdp->sd_rindex_spin);
+        trace_gfs2_block_alloc(ip, block, *n, GFS2_BLKST_USED);
+        *bn = block;
+        return 0;
-        return block;
+rgrp_error:
+        fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
+                (unsigned long long)rgd->rd_addr);
+        fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
+        gfs2_rgrp_dump(NULL, rgd->rd_gl);
+        rgd->rd_flags |= GFS2_RDF_ERROR;
+        return -EIO;
 }
 /**
@@ -1526,7 +1572,7 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
        spin_lock(&sdp->sd_rindex_spin);
        rgd->rd_free_clone--;
        spin_unlock(&sdp->sd_rindex_spin);
+        trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE);
        return block;
 }
@@ -1546,7 +1592,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
        rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
        if (!rgd)
                return;
+        trace_gfs2_block_alloc(ip, bstart, blen, GFS2_BLKST_FREE);
        rgd->rd_free += blen;
        gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
@@ -1574,7 +1620,7 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
        rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
        if (!rgd)
                return;
+        trace_gfs2_block_alloc(ip, bstart, blen, GFS2_BLKST_FREE);
        rgd->rd_free += blen;
        gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
@@ -1597,6 +1643,7 @@ void gfs2_unlink_di(struct inode *inode)
        rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED);
        if (!rgd)
                return;
+        trace_gfs2_block_alloc(ip, blkno, 1, GFS2_BLKST_UNLINKED);
        gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
        gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
        gfs2_trans_add_rg(rgd);
@@ -1628,6 +1675,7 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
 void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
 {
        gfs2_free_uninit_di(rgd, ip->i_no_addr);
+        trace_gfs2_block_alloc(ip, ip->i_no_addr, 1, GFS2_BLKST_FREE);
        gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
        gfs2_meta_wipe(ip, ip->i_no_addr, 1);
 }
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 3181c7e624bf..1e76ff0f3e00 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -14,22 +14,22 @@ struct gfs2_rgrpd;
 struct gfs2_sbd;
 struct gfs2_holder;
-void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
+extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
 struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk);
 struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
 struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
-void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
+extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
-int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh);
+extern int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh);
-int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd);
+extern int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd);
-void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd);
+extern void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd);
-void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd);
+extern void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd);
-void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
+extern void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
-struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
+extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
 static inline void gfs2_alloc_put(struct gfs2_inode *ip)
 {
        BUG_ON(ip->i_alloc == NULL);
@@ -37,22 +37,22 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip)
        ip->i_alloc = NULL;
 }
-int gfs2_inplace_reserve_i(struct gfs2_inode *ip,
+extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file,
-                         char *file, unsigned int line);
+                                  unsigned int line);
 #define gfs2_inplace_reserve(ip) \
 gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
-void gfs2_inplace_release(struct gfs2_inode *ip);
+extern void gfs2_inplace_release(struct gfs2_inode *ip);
-unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
+extern unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
-u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n);
+extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
-u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
+extern u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
-void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
+extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
-void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
+extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
-void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
+extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
-void gfs2_unlink_di(struct inode *inode);
+extern void gfs2_unlink_di(struct inode *inode);
 struct gfs2_rgrp_list {
        unsigned int rl_rgrps;
@@ -61,10 +61,11 @@ struct gfs2_rgrp_list {
        struct gfs2_holder *rl_ghs;
 };
-void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
+extern void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
-                    u64 block);
+                           u64 block);
-void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);
+extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);
-void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
+extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
-u64 gfs2_ri_total(struct gfs2_sbd *sdp);
+extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
+extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl);
 #endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 601913e0a482..0a6801336470 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -7,14 +7,20 @@
 * of the GNU General Public License version 2.
 */
+#include <linux/bio.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
-#include <linux/crc32.h>
+#include <linux/statfs.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/bio.h>
+#include <linux/crc32.h>
+#include <linux/time.h>
 #include "gfs2.h"
 #include "incore.h"
@@ -31,6 +37,183 @@
 #include "super.h"
 #include "trans.h"
 #include "util.h"
+#include "sys.h"
+#include "eattr.h"
+#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) /* true when field ar_<x> differs between the two argument structs */
+enum { /* mount option token ids; each one is paired with a pattern in the tokens table */
+        Opt_lockproto,
+        Opt_locktable,
+        Opt_hostdata,
+        Opt_spectator,
+        Opt_ignore_local_fs,
+        Opt_localflocks,
+        Opt_localcaching,
+        Opt_debug,
+        Opt_nodebug,
+        Opt_upgrade,
+        Opt_acl,
+        Opt_noacl,
+        Opt_quota_off,
+        Opt_quota_account,
+        Opt_quota_on,
+        Opt_quota,
+        Opt_noquota,
+        Opt_suiddir,
+        Opt_nosuiddir,
+        Opt_data_writeback,
+        Opt_data_ordered,
+        Opt_meta,
+        Opt_discard,
+        Opt_nodiscard,
+        Opt_commit,
+        Opt_error, /* paired with the NULL pattern that ends the tokens table; rejected as an invalid option by gfs2_mount_args() */
+};
+static const match_table_t tokens = { /* option patterns handed to match_token(); the %s / %d entries capture an argument */
+        {Opt_lockproto, "lockproto=%s"},
+        {Opt_locktable, "locktable=%s"},
+        {Opt_hostdata, "hostdata=%s"},
+        {Opt_spectator, "spectator"},
+        {Opt_ignore_local_fs, "ignore_local_fs"},
+        {Opt_localflocks, "localflocks"},
+        {Opt_localcaching, "localcaching"},
+        {Opt_debug, "debug"},
+        {Opt_nodebug, "nodebug"},
+        {Opt_upgrade, "upgrade"},
+        {Opt_acl, "acl"},
+        {Opt_noacl, "noacl"},
+        {Opt_quota_off, "quota=off"},
+        {Opt_quota_account, "quota=account"},
+        {Opt_quota_on, "quota=on"},
+        {Opt_quota, "quota"},
+        {Opt_noquota, "noquota"},
+        {Opt_suiddir, "suiddir"},
+        {Opt_nosuiddir, "nosuiddir"},
+        {Opt_data_writeback, "data=writeback"},
+        {Opt_data_ordered, "data=ordered"},
+        {Opt_meta, "meta"},
+        {Opt_discard, "discard"},
+        {Opt_nodiscard, "nodiscard"},
+        {Opt_commit, "commit=%d"},
+        {Opt_error, NULL}
+};
+/**
+ * gfs2_mount_args - Parse mount options
+ * @sdp: the filesystem; used here only as the target of fs_info() messages
+ * @args: argument structure the parsed option values are stored into
+ * @options: comma-separated option string; it is split with strsep(), so
+ *           the caller's buffer is modified
+ *
+ * Each non-empty item is matched against the tokens table and the
+ * corresponding ar_* field of @args is updated.  Parsing stops at the first
+ * unrecognised option, or at a "commit=" value that is not a positive
+ * integer; fields set by earlier options stay modified in that case.
+ *
+ * Return: 0 on success, otherwise -EINVAL or the error from match_int()
+ */
+int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
+{
+        char *o;
+        int token;
+        substring_t tmp[MAX_OPT_ARGS];
+        int rv;
+        /* Split the options into tokens with the "," character and
+           process them */
+        while (1) {
+                o = strsep(&options, ",");
+                if (o == NULL)
+                        break;
+                if (*o == '\0')
+                        continue;
+                token = match_token(o, tokens, tmp);
+                switch (token) {
+                case Opt_lockproto:
+                        match_strlcpy(args->ar_lockproto, &tmp[0],
+                                      GFS2_LOCKNAME_LEN);
+                        break;
+                case Opt_locktable:
+                        match_strlcpy(args->ar_locktable, &tmp[0],
+                                      GFS2_LOCKNAME_LEN);
+                        break;
+                case Opt_hostdata:
+                        match_strlcpy(args->ar_hostdata, &tmp[0],
+                                      GFS2_LOCKNAME_LEN);
+                        break;
+                case Opt_spectator:
+                        args->ar_spectator = 1;
+                        break;
+                case Opt_ignore_local_fs:
+                        args->ar_ignore_local_fs = 1;
+                        break;
+                case Opt_localflocks:
+                        args->ar_localflocks = 1;
+                        break;
+                case Opt_localcaching:
+                        args->ar_localcaching = 1;
+                        break;
+                case Opt_debug:
+                        args->ar_debug = 1;
+                        break;
+                case Opt_nodebug:
+                        args->ar_debug = 0;
+                        break;
+                case Opt_upgrade:
+                        args->ar_upgrade = 1;
+                        break;
+                case Opt_acl:
+                        args->ar_posix_acl = 1;
+                        break;
+                case Opt_noacl:
+                        args->ar_posix_acl = 0;
+                        break;
+                case Opt_quota_off:
+                case Opt_noquota:
+                        args->ar_quota = GFS2_QUOTA_OFF;
+                        break;
+                case Opt_quota_account:
+                        args->ar_quota = GFS2_QUOTA_ACCOUNT;
+                        break;
+                case Opt_quota_on:
+                case Opt_quota:
+                        args->ar_quota = GFS2_QUOTA_ON;
+                        break;
+                case Opt_suiddir:
+                        args->ar_suiddir = 1;
+                        break;
+                case Opt_nosuiddir:
+                        args->ar_suiddir = 0;
+                        break;
+                case Opt_data_writeback:
+                        args->ar_data = GFS2_DATA_WRITEBACK;
+                        break;
+                case Opt_data_ordered:
+                        args->ar_data = GFS2_DATA_ORDERED;
+                        break;
+                case Opt_meta:
+                        args->ar_meta = 1;
+                        break;
+                case Opt_discard:
+                        args->ar_discard = 1;
+                        break;
+                case Opt_nodiscard:
+                        args->ar_discard = 0;
+                        break;
+                case Opt_commit:
+                        rv = match_int(&tmp[0], &args->ar_commit);
+                        if (rv || args->ar_commit <= 0) {
+                                fs_info(sdp, "commit mount option requires a positive numeric argument\n");
+                                return rv ? rv : -EINVAL;
+                        }
+                        break;
+                case Opt_error:
+                default:
+                        fs_info(sdp, "invalid mount option: %s\n", o);
+                        return -EINVAL;
+                }
+        }
+        return 0;
+}
 /**
 * gfs2_jindex_free - Clear all the journal index information
@@ -436,3 +619,706 @@ void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
        mutex_unlock(&sdp->sd_freeze_lock);
 }
+/**
+ * gfs2_write_inode - Make sure the inode is stable on the disk
+ * @inode: The inode
+ * @sync: synchronous write flag; when non-zero the log is flushed for the
+ *        inode's glock before returning
+ *
+ * Returns 0 without doing anything if GIF_USER is not set in the inode's
+ * flags or if the current task has PF_MEMALLOC set.  Otherwise the dinode
+ * is written out, under an exclusive glock and inside a transaction, but
+ * only when the in-core atime is later than the atime read from the dinode
+ * buffer.  The log flush requested by @sync is performed on the error
+ * paths as well (failed glock or failed transaction start).
+ *
+ * Returns: errno
+ */
+static int gfs2_write_inode(struct inode *inode, int sync)
+{
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_sbd *sdp = GFS2_SB(inode);
+        struct gfs2_holder gh;
+        struct buffer_head *bh;
+        struct timespec atime;
+        struct gfs2_dinode *di;
+        int ret = 0;
+        /* Check this is a "normal" inode, etc */
+        if (!test_bit(GIF_USER, &ip->i_flags) ||
+            (current->flags & PF_MEMALLOC))
+                return 0;
+        ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+        if (ret)
+                goto do_flush;
+        ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
+        if (ret)
+                goto do_unlock;
+        ret = gfs2_meta_inode_buffer(ip, &bh);
+        if (ret == 0) {
+                di = (struct gfs2_dinode *)bh->b_data;
+                atime.tv_sec = be64_to_cpu(di->di_atime);
+                atime.tv_nsec = be32_to_cpu(di->di_atime_nsec);
+                if (timespec_compare(&inode->i_atime, &atime) > 0) {
+                        gfs2_trans_add_bh(ip->i_gl, bh, 1);
+                        gfs2_dinode_out(ip, bh->b_data);
+                }
+                brelse(bh);
+        }
+        gfs2_trans_end(sdp);
+do_unlock:
+        gfs2_glock_dq_uninit(&gh);
+do_flush:
+        if (sync != 0)
+                gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
+        return ret;
+}
+/**
+ * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
+ * @sdp: the filesystem
+ *
+ * Calls gfs2_quota_sync() and gfs2_statfs_sync(), takes the transaction
+ * glock in shared state, then calls gfs2_meta_syncfs() and
+ * gfs2_log_shutdown(), clears SDF_JOURNAL_LIVE and finishes with
+ * gfs2_quota_cleanup().  If the glock cannot be taken the function returns
+ * early, unless SDF_SHUTDOWN is set: in that case it carries on and the
+ * glock error is still what gets returned at the end.
+ *
+ * Returns: errno
+ */
+static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
+{
+        struct gfs2_holder t_gh;
+        int error;
+        gfs2_quota_sync(sdp);
+        gfs2_statfs_sync(sdp);
+        error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
+                                   &t_gh);
+        if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+                return error;
+        gfs2_meta_syncfs(sdp);
+        gfs2_log_shutdown(sdp);
+        clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+        if (t_gh.gh_gl)
+                gfs2_glock_dq_uninit(&t_gh);
+        gfs2_quota_cleanup(sdp);
+        return error;
+}
+static int gfs2_umount_recovery_wait(void *word) /* action passed to wait_on_bit() by gfs2_put_super(); @word is unused */
+{
+        schedule();
+        return 0;
+}
+/**
+ * gfs2_put_super - Unmount the filesystem
+ * @sb: The VFS superblock
+ *
+ * Drops the freeze holder if a freeze is outstanding, sets SDF_NORECOVERY
+ * and waits for every journal that still has JDF_RECOVERY set, stops the
+ * quotad and logd threads and, unless the superblock is MS_RDONLY, calls
+ * gfs2_make_fs_ro() (a failure there is reported via gfs2_io_error()).
+ * Afterwards the system inodes, glocks and holders are released, the rgrp
+ * and journal index data is freed and the locking protocol is unmounted.
+ *
+ */
+static void gfs2_put_super(struct super_block *sb)
+{
+        struct gfs2_sbd *sdp = sb->s_fs_info;
+        int error;
+        struct gfs2_jdesc *jd;
+        /*  Unfreeze the filesystem, if we need to  */
+        mutex_lock(&sdp->sd_freeze_lock);
+        if (sdp->sd_freeze_count)
+                gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
+        mutex_unlock(&sdp->sd_freeze_lock);
+        /* No more recovery requests */
+        set_bit(SDF_NORECOVERY, &sdp->sd_flags);
+        smp_mb();
+        /* Wait on outstanding recovery */
+restart:
+        spin_lock(&sdp->sd_jindex_spin);
+        list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+                if (!test_bit(JDF_RECOVERY, &jd->jd_flags))
+                        continue;
+                spin_unlock(&sdp->sd_jindex_spin);
+                wait_on_bit(&jd->jd_flags, JDF_RECOVERY,
+                            gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE);
+                goto restart;
+        }
+        spin_unlock(&sdp->sd_jindex_spin);
+        kthread_stop(sdp->sd_quotad_process);
+        kthread_stop(sdp->sd_logd_process);
+        if (!(sb->s_flags & MS_RDONLY)) {
+                error = gfs2_make_fs_ro(sdp);
+                if (error)
+                        gfs2_io_error(sdp);
+        }
+        /*  At this point, we're through modifying the disk  */
+        /*  Release stuff  */
+        iput(sdp->sd_jindex);
+        iput(sdp->sd_inum_inode);
+        iput(sdp->sd_statfs_inode);
+        iput(sdp->sd_rindex);
+        iput(sdp->sd_quota_inode);
+        gfs2_glock_put(sdp->sd_rename_gl);
+        gfs2_glock_put(sdp->sd_trans_gl);
+        if (!sdp->sd_args.ar_spectator) {
+                gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
+                gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
+                gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
+                gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
+                gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
+                iput(sdp->sd_ir_inode);
+                iput(sdp->sd_sc_inode);
+                iput(sdp->sd_qc_inode);
+        }
+        gfs2_glock_dq_uninit(&sdp->sd_live_gh);
+        gfs2_clear_rgrpd(sdp);
+        gfs2_jindex_free(sdp);
+        /*  Take apart glock structures and buffer lists  */
+        gfs2_gl_hash_clear(sdp);
+        /*  Unmount the locking protocol  */
+        gfs2_lm_unmount(sdp);
+        /*  At this point, we're through participating in the lockspace  */
+        gfs2_sys_fs_del(sdp);
+}
+/**
+ * gfs2_sync_fs - sync the filesystem
+ * @sb: the superblock
+ * @wait: the log is flushed only when this is non-zero (and sb->s_fs_info
+ *        is set); otherwise the call does nothing
+ *
+ * Flushes the log to disk.
+ *
+ * Returns: 0 (always)
+ */
+static int gfs2_sync_fs(struct super_block *sb, int wait)
+{
+        if (wait && sb->s_fs_info)
+                gfs2_log_flush(sb->s_fs_info, NULL);
+        return 0;
+}
+/**
+ * gfs2_freeze - prevent further writes to the filesystem
+ * @sb: the VFS structure for the filesystem
+ *
+ * Calls gfs2_freeze_fs() until it succeeds.  Every failure is logged and
+ * retried after a 1000 ms sleep; there is no retry limit.
+ *
+ * Returns: -EINVAL if SDF_SHUTDOWN is set, otherwise 0 once frozen
+ */
+static int gfs2_freeze(struct super_block *sb)
+{
+        struct gfs2_sbd *sdp = sb->s_fs_info;
+        int error;
+        if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+                return -EINVAL;
+        for (;;) {
+                error = gfs2_freeze_fs(sdp);
+                if (!error)
+                        break;
+                switch (error) {
+                case -EBUSY:
+                        fs_err(sdp, "waiting for recovery before freeze\n");
+                        break;
+                default:
+                        fs_err(sdp, "error freezing FS: %d\n", error);
+                        break;
+                }
+                fs_err(sdp, "retrying...\n");
+                msleep(1000);
+        }
+        return 0;
+}
+/**
+ * gfs2_unfreeze - reallow writes to the filesystem
+ * @sb: the VFS structure for the filesystem
+ *
+ * Thin wrapper around gfs2_unfreeze_fs().
+ *
+ * Returns: 0 (always)
+ */
+static int gfs2_unfreeze(struct super_block *sb)
+{
+        gfs2_unfreeze_fs(sb->s_fs_info);
+        return 0;
+}
+/**
+ * statfs_slow_fill - add one resource group's counters to the sc totals
+ * @rgd: the RG
+ * @sc: the sc structure; rd_data, rd_free and rd_dinodes of @rgd are added
+ *      to its sc_total, sc_free and sc_dinodes fields
+ *
+ * gfs2_rgrp_verify() is called on @rgd before the counters are read.
+ *
+ * Returns: 0 (always)
+ */
+static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
+                            struct gfs2_statfs_change_host *sc)
+{
+        gfs2_rgrp_verify(rgd);
+        sc->sc_total += rgd->rd_data;
+        sc->sc_free += rgd->rd_free;
+        sc->sc_dinodes += rgd->rd_dinodes;
+        return 0;
+}
+/**
+ * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
+ * @sdp: the filesystem
+ * @sc: the sc info that will be returned; zeroed before any counting
+ *
+ * Up to 64 resource group glocks are requested at a time (LM_ST_SHARED,
+ * GL_ASYNC).  Each pass polls every slot: a completed holder is waited on,
+ * its rgrp is added to @sc via statfs_slow_fill() (only while no error has
+ * been recorded) and the slot is reused for the next rgrp.  No new locks
+ * are requested once an error is recorded; the loop ends when every slot
+ * is idle.  A pending signal sets the error to -ERESTARTSYS.
+ *
+ * Any error (other than a signal) will cause this routine to fall back
+ * to the synchronous version.
+ * NOTE(review): no such fallback is visible in this function; errors are
+ * simply returned to the caller - confirm whether the sentence above is
+ * stale.
+ *
+ * FIXME: This really shouldn't busy wait like this.
+ *
+ * Returns: errno (-ENOMEM if the holder array cannot be allocated)
+ */
+static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
+{
+        struct gfs2_holder ri_gh;
+        struct gfs2_rgrpd *rgd_next;
+        struct gfs2_holder *gha, *gh;
+        unsigned int slots = 64;
+        unsigned int x;
+        int done;
+        int error = 0, err;
+        memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
+        gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
+        if (!gha)
+                return -ENOMEM;
+        error = gfs2_rindex_hold(sdp, &ri_gh);
+        if (error)
+                goto out;
+        rgd_next = gfs2_rgrpd_get_first(sdp);
+        for (;;) {
+                done = 1;
+                for (x = 0; x < slots; x++) {
+                        gh = gha + x;
+                        if (gh->gh_gl && gfs2_glock_poll(gh)) {
+                                err = gfs2_glock_wait(gh);
+                                if (err) {
+                                        gfs2_holder_uninit(gh);
+                                        error = err;
+                                } else {
+                                        if (!error)
+                                                error = statfs_slow_fill(
+                                                        gh->gh_gl->gl_object, sc);
+                                        gfs2_glock_dq_uninit(gh);
+                                }
+                        }
+                        if (gh->gh_gl)
+                                done = 0;
+                        else if (rgd_next && !error) {
+                                error = gfs2_glock_nq_init(rgd_next->rd_gl,
+                                                           LM_ST_SHARED,
+                                                           GL_ASYNC,
+                                                           gh);
+                                rgd_next = gfs2_rgrpd_get_next(rgd_next);
+                                done = 0;
+                        }
+                        if (signal_pending(current))
+                                error = -ERESTARTSYS;
+                }
+                if (done)
+                        break;
+                yield();
+        }
+        gfs2_glock_dq_uninit(&ri_gh);
+out:
+        kfree(gha);
+        return error;
+}
+/**
+ * gfs2_statfs_i - Do a statfs
+ * @sdp: the filesystem
+ * @sc: the sc structure to fill in
+ *
+ * Copies sd_statfs_master and adds the sd_statfs_local deltas while
+ * holding sd_statfs_spin.  The result is then clamped: sc_free is kept
+ * within 0..sc_total and sc_dinodes is not allowed to go negative.
+ *
+ * Returns: 0 (always)
+ */
+static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
+{
+        struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
+        struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+        spin_lock(&sdp->sd_statfs_spin);
+        *sc = *m_sc;
+        sc->sc_total += l_sc->sc_total;
+        sc->sc_free += l_sc->sc_free;
+        sc->sc_dinodes += l_sc->sc_dinodes;
+        spin_unlock(&sdp->sd_statfs_spin);
+        if (sc->sc_free < 0)
+                sc->sc_free = 0;
+        if (sc->sc_free > sc->sc_total)
+                sc->sc_free = sc->sc_total;
+        if (sc->sc_dinodes < 0)
+                sc->sc_dinodes = 0;
+        return 0;
+}
+/**
+ * gfs2_statfs - Gather and return stats about the filesystem
+ * @dentry: a dentry on the filesystem; its inode's superblock is used
+ * @buf: The buffer to fill in
+ *
+ * Uses gfs2_statfs_slow() when the gt_statfs_slow tunable is set and
+ * gfs2_statfs_i() otherwise.  f_files is reported as dinodes plus free
+ * blocks, and f_ffree as the number of free blocks.
+ *
+ * Returns: 0 on success or error code
+ */
+static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+        struct super_block *sb = dentry->d_inode->i_sb;
+        struct gfs2_sbd *sdp = sb->s_fs_info;
+        struct gfs2_statfs_change_host sc;
+        int error;
+        if (gfs2_tune_get(sdp, gt_statfs_slow))
+                error = gfs2_statfs_slow(sdp, &sc);
+        else
+                error = gfs2_statfs_i(sdp, &sc);
+        if (error)
+                return error;
+        buf->f_type = GFS2_MAGIC;
+        buf->f_bsize = sdp->sd_sb.sb_bsize;
+        buf->f_blocks = sc.sc_total;
+        buf->f_bfree = sc.sc_free;
+        buf->f_bavail = sc.sc_free;
+        buf->f_files = sc.sc_dinodes + sc.sc_free;
+        buf->f_ffree = sc.sc_free;
+        buf->f_namelen = GFS2_FNAMESIZE;
+        return 0;
+}
+/**
+ * gfs2_remount_fs - called when the FS is remounted
+ * @sb:  the filesystem
+ * @flags:  the remount flags
+ * @data:  extra data passed in (handed to gfs2_mount_args())
+ *
+ * Returns: errno
+ */
+static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+        struct gfs2_sbd *sdp = sb->s_fs_info;
+        struct gfs2_args args = sdp->sd_args; /* Default to current settings */
+        struct gfs2_tune *gt = &sdp->sd_tune;
+        int error;
+        spin_lock(&gt->gt_spin);
+        args.ar_commit = gt->gt_log_flush_secs;
+        spin_unlock(&gt->gt_spin);
+        error = gfs2_mount_args(sdp, &args, data);
+        if (error)
+                return error;
+        /* Not allowed to change locking details */
+        if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) ||
+            strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) ||
+            strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata))
+                return -EINVAL;
+        /* Some flags must not be changed */
+        if (args_neq(&args, &sdp->sd_args, spectator) ||
+            args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
+            args_neq(&args, &sdp->sd_args, localflocks) ||
+            args_neq(&args, &sdp->sd_args, localcaching) ||
+            args_neq(&args, &sdp->sd_args, meta))
+                return -EINVAL;
+        if (sdp->sd_args.ar_spectator)
+                *flags |= MS_RDONLY;    /* spectator mounts are forced read-only */
+        if ((sb->s_flags ^ *flags) & MS_RDONLY) {
+                if (*flags & MS_RDONLY)
+                        error = gfs2_make_fs_ro(sdp);
+                else
+                        error = gfs2_make_fs_rw(sdp);
+                if (error)
+                        return error;
+        }
+        sdp->sd_args = args;    /* stored only once the ro/rw switch succeeded */
+        if (sdp->sd_args.ar_posix_acl)
+                sb->s_flags |= MS_POSIXACL;
+        else
+                sb->s_flags &= ~MS_POSIXACL;
+        spin_lock(&gt->gt_spin);
+        gt->gt_log_flush_secs = args.ar_commit;
+        spin_unlock(&gt->gt_spin);
+        return 0;
+}
+/**
+ * gfs2_drop_inode - Drop an inode (test for remote unlink)
+ * @inode: The inode to drop
+ *
+ * If we've received a callback on an iopen lock then it's because a
+ * remote node tried to deallocate the inode but failed due to this node
+ * still having the inode open. Here we mark the link count zero
+ * since we know that it must have reached zero if the GLF_DEMOTE flag
+ * is set on the iopen glock. If we didn't do a disk read since the
+ * remote node removed the final link then we might otherwise miss
+ * this event. This check ensures that this node will deallocate the
+ * inode's blocks, or alternatively pass the baton on to another
+ * node for later deallocation.
+ */
+static void gfs2_drop_inode(struct inode *inode)
+{
+        struct gfs2_inode *ip = GFS2_I(inode);
+        if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) {
+                struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
+                if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
+                        clear_nlink(inode);
+        }
+        generic_drop_inode(inode);
+}
+/**
+ * gfs2_clear_inode - Deallocate an inode when VFS is done with it
+ * @inode: The VFS inode
+ *
+ * For GIF_USER inodes: puts i_gl and, if set, dequeues the iopen holder.
+ */
+static void gfs2_clear_inode(struct inode *inode)
+{
+        struct gfs2_inode *ip = GFS2_I(inode);
+        /* This tells us it's a "real" inode and not one which only
+         * serves to contain an address space (see rgrp.c, meta_io.c)
+         * which therefore doesn't have its own glocks.
+         */
+        if (test_bit(GIF_USER, &ip->i_flags)) {
+                ip->i_gl->gl_object = NULL;
+                gfs2_glock_put(ip->i_gl);
+                ip->i_gl = NULL;
+                if (ip->i_iopen_gh.gh_gl) {
+                        ip->i_iopen_gh.gh_gl->gl_object = NULL;
+                        gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+                }
+        }
+}
+static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
+{
+        do {    /* walk from d1 up the d_parent chain, returning 1 on reaching d2 */
+                if (d1 == d2)
+                        return 1;
+                d1 = d1->d_parent;
+        } while (!IS_ROOT(d1)); /* NOTE(review): a root parent is never tested */
+        return 0;
+}
+/**
+ * gfs2_show_options - Show mount options for /proc/mounts
+ * @s: seq_file structure
+ * @mnt: vfsmount
+ *
+ * Returns: 0 (always; no failure path)
+ */
+static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
+{
+        struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
+        struct gfs2_args *args = &sdp->sd_args;
+        int lfsecs;
+        if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
+                seq_printf(s, ",meta");
+        if (args->ar_lockproto[0])
+                seq_printf(s, ",lockproto=%s", args->ar_lockproto);
+        if (args->ar_locktable[0])
+                seq_printf(s, ",locktable=%s", args->ar_locktable);
+        if (args->ar_hostdata[0])
+                seq_printf(s, ",hostdata=%s", args->ar_hostdata);
+        if (args->ar_spectator)
+                seq_printf(s, ",spectator");
+        if (args->ar_ignore_local_fs)
+                seq_printf(s, ",ignore_local_fs");
+        if (args->ar_localflocks)
+                seq_printf(s, ",localflocks");
+        if (args->ar_localcaching)
+                seq_printf(s, ",localcaching");
+        if (args->ar_debug)
+                seq_printf(s, ",debug");
+        if (args->ar_upgrade)
+                seq_printf(s, ",upgrade");
+        if (args->ar_posix_acl)
+                seq_printf(s, ",acl");
+        if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
+                char *state;
+                switch (args->ar_quota) {
+                case GFS2_QUOTA_OFF:
+                        state = "off";
+                        break;
+                case GFS2_QUOTA_ACCOUNT:
+                        state = "account";
+                        break;
+                case GFS2_QUOTA_ON:
+                        state = "on";
+                        break;
+                default:
+                        state = "unknown";
+                        break;
+                }
+                seq_printf(s, ",quota=%s", state);
+        }
+        if (args->ar_suiddir)
+                seq_printf(s, ",suiddir");
+        if (args->ar_data != GFS2_DATA_DEFAULT) {
+                char *state;
+                switch (args->ar_data) {
+                case GFS2_DATA_WRITEBACK:
+                        state = "writeback";
+                        break;
+                case GFS2_DATA_ORDERED:
+                        state = "ordered";
+                        break;
+                default:
+                        state = "unknown";
+                        break;
+                }
+                seq_printf(s, ",data=%s", state);
+        }
+        if (args->ar_discard)
+                seq_printf(s, ",discard");
+        lfsecs = sdp->sd_tune.gt_log_flush_secs; /* read without gt_spin held */
+        if (lfsecs != 60)       /* a value of 60 is never printed */
+                seq_printf(s, ",commit=%d", lfsecs);
+        return 0;
+}
+/*
+ * We have to (at the moment) hold the inode's main lock to cover
+ * the gap between unlocking the shared lock on the iopen lock and
+ * taking the exclusive lock. I'd rather do a shared -> exclusive
+ * conversion on the iopen lock, but we can change that later. This
+ * is safe, just less efficient.
+ */
+static void gfs2_delete_inode(struct inode *inode)
+{
+        struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
+        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_holder gh;
+        int error;
+        if (!test_bit(GIF_USER, &ip->i_flags))
+                goto out;
+        error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+        if (unlikely(error)) {
+                gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+                goto out;
+        }
+        gfs2_glock_dq_wait(&ip->i_iopen_gh);
+        gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
+        error = gfs2_glock_nq(&ip->i_iopen_gh);
+        if (error)      /* no deallocation is attempted; only truncate pages */
+                goto out_truncate;
+        if (S_ISDIR(inode->i_mode) &&
+            (ip->i_diskflags & GFS2_DIF_EXHASH)) {
+                error = gfs2_dir_exhash_dealloc(ip);
+                if (error)
+                        goto out_unlock;
+        }
+        if (ip->i_eattr) {
+                error = gfs2_ea_dealloc(ip);
+                if (error)
+                        goto out_unlock;
+        }
+        if (!gfs2_is_stuffed(ip)) {
+                error = gfs2_file_dealloc(ip);
+                if (error)
+                        goto out_unlock;
+        }
+        error = gfs2_dinode_dealloc(ip);
+        if (error)
+                goto out_unlock;
+out_truncate:
+        error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
+        if (error)
+                goto out_unlock;
+        /* Needs to be done before glock release & also in a transaction */
+        truncate_inode_pages(&inode->i_data, 0);
+        gfs2_trans_end(sdp);
+out_unlock:
+        if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
+                gfs2_glock_dq(&ip->i_iopen_gh);
+        gfs2_holder_uninit(&ip->i_iopen_gh);
+        gfs2_glock_dq_uninit(&gh);
+        if (error && error != GLR_TRYFAILED && error != -EROFS)
+                fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
+out:
+        truncate_inode_pages(&inode->i_data, 0);
+        clear_inode(inode);
+}
+static struct inode *gfs2_alloc_inode(struct super_block *sb)
+{
+        struct gfs2_inode *ip;  /* container of the VFS inode handed back */
+        ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
+        if (!ip)        /* don't derive &ip->i_inode from a NULL pointer */
+                return NULL;
+        ip->i_flags = 0;
+        ip->i_gl = NULL;
+        return &ip->i_inode;
+}
+static void gfs2_destroy_inode(struct inode *inode)
+{
+        kmem_cache_free(gfs2_inode_cachep, GFS2_I(inode)); /* free the container */
+}
+const struct super_operations gfs2_super_ops = {
+        .alloc_inode            = gfs2_alloc_inode,
+        .destroy_inode          = gfs2_destroy_inode,
+        .write_inode            = gfs2_write_inode,
+        .delete_inode           = gfs2_delete_inode,
+        .put_super              = gfs2_put_super,
+        .sync_fs                = gfs2_sync_fs,
+        .freeze_fs              = gfs2_freeze,
+        .unfreeze_fs            = gfs2_unfreeze,
+        .statfs                 = gfs2_statfs,
+        .remount_fs             = gfs2_remount_fs,
+        .clear_inode            = gfs2_clear_inode,
+        .drop_inode             = gfs2_drop_inode,
+        .show_options           = gfs2_show_options,
+};
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 7655f5025fec..23419dc3027b 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -26,6 +26,36 @@
 #include "util.h"
 #include "glops.h"
+struct gfs2_attr {
+        struct attribute attr;
+        ssize_t (*show)(struct gfs2_sbd *, char *);
+        ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
+};
+static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr,
+                              char *buf)
+{
+        struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
+        struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
+        return a->show ? a->show(sdp, buf) : 0;
+}
+static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
+                               const char *buf, size_t len)
+{
+        struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
+        struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
+        return a->store ? a->store(sdp, buf, len) : len;
+}
+static struct sysfs_ops gfs2_attr_ops = {
+        .show  = gfs2_attr_show,
+        .store = gfs2_attr_store,
+};
+static struct kset *gfs2_kset;
 static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
 {
        return snprintf(buf, PAGE_SIZE, "%u:%u\n",
@@ -212,11 +242,6 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
        return len;
 }
-struct gfs2_attr {
-        struct attribute attr;
-        ssize_t (*show)(struct gfs2_sbd *, char *);
-        ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
-};
 #define GFS2_ATTR(name, mode, show, store) \
 static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
@@ -246,58 +271,11 @@ static struct attribute *gfs2_attrs[] = {
        NULL,
 };
-static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr,
-                              char *buf)
-{
-        struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
-        struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
-        return a->show ? a->show(sdp, buf) : 0;
-}
-static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
-                               const char *buf, size_t len)
-{
-        struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
-        struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
-        return a->store ? a->store(sdp, buf, len) : len;
-}
-static struct sysfs_ops gfs2_attr_ops = {
-        .show  = gfs2_attr_show,
-        .store = gfs2_attr_store,
-};
 static struct kobj_type gfs2_ktype = {
        .default_attrs = gfs2_attrs,
        .sysfs_ops     = &gfs2_attr_ops,
 };
-static struct kset *gfs2_kset;
-/*
- * display struct lm_lockstruct fields
- */
-struct lockstruct_attr {
-        struct attribute attr;
-        ssize_t (*show)(struct gfs2_sbd *, char *);
-};
-#define LOCKSTRUCT_ATTR(name, fmt)                                          \
-static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf)                 \
-{                                                                           \
-        return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_lockstruct.ls_##name); \
-}                                                                           \
-static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name)
-LOCKSTRUCT_ATTR(jid,      "%u\n");
-LOCKSTRUCT_ATTR(first,    "%u\n");
-static struct attribute *lockstruct_attrs[] = {
-        &lockstruct_attr_jid.attr,
-        &lockstruct_attr_first.attr,
-        NULL,
-};
 /*
 * lock_module. Originally from lock_dlm
@@ -359,34 +337,33 @@ static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf)
        return sprintf(buf, "%d\n", ls->ls_first_done);
 }
-static ssize_t recover_show(struct gfs2_sbd *sdp, char *buf)
+static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
-{
-        struct lm_lockstruct *ls = &sdp->sd_lockstruct;
-        return sprintf(buf, "%d\n", ls->ls_recover_jid);
-}
-static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
 {
+        unsigned jid;
        struct gfs2_jdesc *jd;
+        int rv;
+        rv = sscanf(buf, "%u", &jid);
+        if (rv != 1)
+                return -EINVAL;
+        rv = -ESHUTDOWN;
        spin_lock(&sdp->sd_jindex_spin);
+        if (test_bit(SDF_NORECOVERY, &sdp->sd_flags))
+                goto out;
+        rv = -EBUSY;
+        if (sdp->sd_jdesc->jd_jid == jid)
+                goto out;
+        rv = -ENOENT;
        list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
                if (jd->jd_jid != jid)
                        continue;
-                jd->jd_dirty = 1;
+                rv = slow_work_enqueue(&jd->jd_work);
                break;
        }
+out:
        spin_unlock(&sdp->sd_jindex_spin);
-}
+        return rv ? rv : len;
-static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
-{
-        struct lm_lockstruct *ls = &sdp->sd_lockstruct;
-        ls->ls_recover_jid = simple_strtol(buf, NULL, 0);
-        gfs2_jdesc_make_dirty(sdp, ls->ls_recover_jid);
-        if (sdp->sd_recoverd_process)
-                wake_up_process(sdp->sd_recoverd_process);
-        return len;
 }
 static ssize_t recover_done_show(struct gfs2_sbd *sdp, char *buf)
@@ -401,31 +378,31 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf)
        return sprintf(buf, "%d\n", ls->ls_recover_jid_status);
 }
-struct gdlm_attr {
+static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
-        struct attribute attr;
+{
-        ssize_t (*show)(struct gfs2_sbd *sdp, char *);
+        return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid);
-        ssize_t (*store)(struct gfs2_sbd *sdp, const char *, size_t);
+}
-};
 #define GDLM_ATTR(_name,_mode,_show,_store) \
-static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
+static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
-GDLM_ATTR(proto_name,     0444, proto_name_show,     NULL);
+GDLM_ATTR(proto_name,     0444, proto_name_show,        NULL);
-GDLM_ATTR(block,          0644, block_show,          block_store);
+GDLM_ATTR(block,          0644, block_show,             block_store);
-GDLM_ATTR(withdraw,       0644, withdraw_show,       withdraw_store);
+GDLM_ATTR(withdraw,       0644, withdraw_show,          withdraw_store);
-GDLM_ATTR(id,             0444, lkid_show,           NULL);
+GDLM_ATTR(id,             0444, lkid_show,              NULL);
-GDLM_ATTR(first,          0444, lkfirst_show,        NULL);
+GDLM_ATTR(jid,            0444, jid_show,               NULL);
-GDLM_ATTR(first_done,     0444, first_done_show,     NULL);
+GDLM_ATTR(first,          0444, lkfirst_show,           NULL);
-GDLM_ATTR(recover,        0644, recover_show,        recover_store);
+GDLM_ATTR(first_done,     0444, first_done_show,        NULL);
-GDLM_ATTR(recover_done,   0444, recover_done_show,   NULL);
+GDLM_ATTR(recover,        0200, NULL,                   recover_store);
-GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
+GDLM_ATTR(recover_done,   0444, recover_done_show,      NULL);
+GDLM_ATTR(recover_status, 0444, recover_status_show,    NULL);
 static struct attribute *lock_module_attrs[] = {
        &gdlm_attr_proto_name.attr,
        &gdlm_attr_block.attr,
        &gdlm_attr_withdraw.attr,
        &gdlm_attr_id.attr,
-        &lockstruct_attr_jid.attr,
+        &gdlm_attr_jid.attr,
        &gdlm_attr_first.attr,
        &gdlm_attr_first_done.attr,
        &gdlm_attr_recover.attr,
@@ -435,53 +412,6 @@ static struct attribute *lock_module_attrs[] = {
 };
 /*
- * display struct gfs2_args fields
- */
-struct args_attr {
-        struct attribute attr;
-        ssize_t (*show)(struct gfs2_sbd *, char *);
-};
-#define ARGS_ATTR(name, fmt)                                                \
-static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf)                 \
-{                                                                           \
-        return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_args.ar_##name);       \
-}                                                                           \
-static struct args_attr args_attr_##name = __ATTR_RO(name)
-ARGS_ATTR(lockproto,       "%s\n");
-ARGS_ATTR(locktable,       "%s\n");
-ARGS_ATTR(hostdata,        "%s\n");
-ARGS_ATTR(spectator,       "%d\n");
-ARGS_ATTR(ignore_local_fs, "%d\n");
-ARGS_ATTR(localcaching,    "%d\n");
-ARGS_ATTR(localflocks,     "%d\n");
-ARGS_ATTR(debug,           "%d\n");
-ARGS_ATTR(upgrade,         "%d\n");
-ARGS_ATTR(posix_acl,       "%d\n");
-ARGS_ATTR(quota,           "%u\n");
-ARGS_ATTR(suiddir,         "%d\n");
-ARGS_ATTR(data,            "%d\n");
-static struct attribute *args_attrs[] = {
-        &args_attr_lockproto.attr,
-        &args_attr_locktable.attr,
-        &args_attr_hostdata.attr,
-        &args_attr_spectator.attr,
-        &args_attr_ignore_local_fs.attr,
-        &args_attr_localcaching.attr,
-        &args_attr_localflocks.attr,
-        &args_attr_debug.attr,
-        &args_attr_upgrade.attr,
-        &args_attr_posix_acl.attr,
-        &args_attr_quota.attr,
-        &args_attr_suiddir.attr,
-        &args_attr_data.attr,
-        NULL,
-};
-/*
 * get and set struct gfs2_tune fields
 */
@@ -531,14 +461,8 @@ static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field,
        return len;
 }
-struct tune_attr {
-        struct attribute attr;
-        ssize_t (*show)(struct gfs2_sbd *, char *);
-        ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
-};
 #define TUNE_ATTR_3(name, show, store)                                        \
-static struct tune_attr tune_attr_##name = __ATTR(name, 0644, show, store)
+static struct gfs2_attr tune_attr_##name = __ATTR(name, 0644, show, store)
 #define TUNE_ATTR_2(name, store)                                              \
 static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf)                   \
@@ -554,15 +478,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
 }                                                                             \
 TUNE_ATTR_2(name, name##_store)
-#define TUNE_ATTR_DAEMON(name, process)                                       \
-static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
-{                                                                             \
-        ssize_t r = tune_set(sdp, &sdp->sd_tune.gt_##name, 1, buf, len);      \
-        wake_up_process(sdp->sd_##process);                                   \
-        return r;                                                             \
-}                                                                             \
-TUNE_ATTR_2(name, name##_store)
 TUNE_ATTR(incore_log_blocks, 0);
 TUNE_ATTR(log_flush_secs, 0);
 TUNE_ATTR(quota_warn_period, 0);
@@ -574,8 +489,6 @@ TUNE_ATTR(new_files_jdata, 0);
 TUNE_ATTR(quota_simul_sync, 1);
 TUNE_ATTR(stall_secs, 1);
 TUNE_ATTR(statfs_quantum, 1);
-TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
-TUNE_ATTR_DAEMON(logd_secs, logd_process);
 TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
 static struct attribute *tune_attrs[] = {
@@ -589,23 +502,11 @@ static struct attribute *tune_attrs[] = {
        &tune_attr_quota_simul_sync.attr,
        &tune_attr_stall_secs.attr,
        &tune_attr_statfs_quantum.attr,
-        &tune_attr_recoverd_secs.attr,
-        &tune_attr_logd_secs.attr,
        &tune_attr_quota_scale.attr,
        &tune_attr_new_files_jdata.attr,
        NULL,
 };
-static struct attribute_group lockstruct_group = {
-        .name = "lockstruct",
-        .attrs = lockstruct_attrs,
-};
-static struct attribute_group args_group = {
-        .name = "args",
-        .attrs = args_attrs,
-};
 static struct attribute_group tune_group = {
        .name = "tune",
        .attrs = tune_attrs,
@@ -626,17 +527,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
        if (error)
                goto fail;
-        error = sysfs_create_group(&sdp->sd_kobj, &lockstruct_group);
-        if (error)
-                goto fail_reg;
-        error = sysfs_create_group(&sdp->sd_kobj, &args_group);
-        if (error)
-                goto fail_lockstruct;
        error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
        if (error)
-                goto fail_args;
+                goto fail_reg;
        error = sysfs_create_group(&sdp->sd_kobj, &lock_module_group);
        if (error)
@@ -647,10 +540,6 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 fail_tune:
        sysfs_remove_group(&sdp->sd_kobj, &tune_group);
-fail_args:
-        sysfs_remove_group(&sdp->sd_kobj, &args_group);
-fail_lockstruct:
-        sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
 fail_reg:
        kobject_put(&sdp->sd_kobj);
 fail:
@@ -661,8 +550,6 @@ fail:
 void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
 {
        sysfs_remove_group(&sdp->sd_kobj, &tune_group);
-        sysfs_remove_group(&sdp->sd_kobj, &args_group);
-        sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
        sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
        kobject_put(&sdp->sd_kobj);
 }
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
new file mode 100644
index 000000000000..98d6ef1c1dc0
--- /dev/null
+++ b/fs/gfs2/trace_gfs2.h
@@ -0,0 +1,407 @@
+#if !defined(_TRACE_GFS2_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_GFS2_H
+#include <linux/tracepoint.h>
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM gfs2
+#define TRACE_INCLUDE_FILE trace_gfs2
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/dlmconstants.h>
+#include <linux/gfs2_ondisk.h>
+#include "incore.h"
+#include "glock.h"
+#define dlm_state_name(nn) { DLM_LOCK_##nn, #nn }
+#define glock_trace_name(x) __print_symbolic(x,         \
+                            dlm_state_name(IV),         \
+                            dlm_state_name(NL),         \
+                            dlm_state_name(CR),         \
+                            dlm_state_name(CW),         \
+                            dlm_state_name(PR),         \
+                            dlm_state_name(PW),         \
+                            dlm_state_name(EX))
+#define block_state_name(x) __print_symbolic(x,                 \
+                            { GFS2_BLKST_FREE, "free" },        \
+                            { GFS2_BLKST_USED, "used" },        \
+                            { GFS2_BLKST_DINODE, "dinode" },    \
+                            { GFS2_BLKST_UNLINKED, "unlinked" })
+#define show_glock_flags(flags) __print_flags(flags, "",        \
+        {(1UL << GLF_LOCK),                     "l" },          \
+        {(1UL << GLF_DEMOTE),                   "D" },          \
+        {(1UL << GLF_PENDING_DEMOTE),           "d" },          \
+        {(1UL << GLF_DEMOTE_IN_PROGRESS),       "p" },          \
+        {(1UL << GLF_DIRTY),                    "y" },          \
+        {(1UL << GLF_LFLUSH),                   "f" },          \
+        {(1UL << GLF_INVALIDATE_IN_PROGRESS),   "i" },          \
+        {(1UL << GLF_REPLY_PENDING),            "r" },          \
+        {(1UL << GLF_INITIAL),                  "I" },          \
+        {(1UL << GLF_FROZEN),                   "F" })
+#ifndef NUMPTY
+#define NUMPTY
+static inline u8 glock_trace_state(unsigned int state)
+{
+        switch(state) { /* map an LM_ST_* state to the DLM_LOCK_* value traced */
+        case LM_ST_SHARED:
+                return DLM_LOCK_PR;
+        case LM_ST_DEFERRED:
+                return DLM_LOCK_CW;
+        case LM_ST_EXCLUSIVE:
+                return DLM_LOCK_EX;
+        }
+        return DLM_LOCK_NL;     /* every other state value is reported as NL */
+}
+#endif
+/* Section 1 - Locking
+ *
+ * Objectives:
+ * Latency: Remote demote request to state change
+ * Latency: Local lock request to state change
+ * Latency: State change to lock grant
+ * Correctness: Ordering of local lock state vs. I/O requests
+ * Correctness: Responses to remote demote requests
+ */
+/* General glock state change (DLM lock request completes) */
+TRACE_EVENT(gfs2_glock_state_change,
+        TP_PROTO(const struct gfs2_glock *gl, unsigned int new_state),
+        TP_ARGS(gl, new_state),
+        TP_STRUCT__entry(
+                __field(        dev_t,  dev                     )
+                __field(        u64,    glnum                   )
+                __field(        u32,    gltype                  )
+                __field(        u8,     cur_state               )
+                __field(        u8,     new_state               )
+                __field(        u8,     dmt_state               )
+                __field(        u8,     tgt_state               )
+                __field(        unsigned long,  flags           )
+        ),
+        TP_fast_assign(
+                __entry->dev            = gl->gl_sbd->sd_vfs->s_dev;
+                __entry->glnum          = gl->gl_name.ln_number;
+                __entry->gltype         = gl->gl_name.ln_type;
+                __entry->cur_state      = glock_trace_state(gl->gl_state);
+                __entry->new_state      = glock_trace_state(new_state);
+                __entry->tgt_state      = glock_trace_state(gl->gl_target);
+                __entry->dmt_state      = glock_trace_state(gl->gl_demote_state);
+                __entry->flags          = gl->gl_flags;
+        ),
+        TP_printk("%u,%u glock %u:%llu state %s to %s tgt:%s dmt:%s flags:%s",
+                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype,
+                  (unsigned long long)__entry->glnum, /* unsigned: %u / %llu */
+                  glock_trace_name(__entry->cur_state),
+                  glock_trace_name(__entry->new_state),
+                  glock_trace_name(__entry->tgt_state),
+                  glock_trace_name(__entry->dmt_state),
+                  show_glock_flags(__entry->flags))
+);
+/* State change -> unlocked, glock is being deallocated (target shown as IV) */
+TRACE_EVENT(gfs2_glock_put,
+        TP_PROTO(const struct gfs2_glock *gl),
+        TP_ARGS(gl),
+        TP_STRUCT__entry(
+                __field(        dev_t,  dev                     )
+                __field(        u64,    glnum                   )
+                __field(        u32,    gltype                  )
+                __field(        u8,     cur_state               )
+                __field(        unsigned long,  flags           )
+        ),
+        TP_fast_assign(
+                __entry->dev            = gl->gl_sbd->sd_vfs->s_dev;
+                __entry->gltype         = gl->gl_name.ln_type;
+                __entry->glnum          = gl->gl_name.ln_number;
+                __entry->cur_state      = glock_trace_state(gl->gl_state);
+                __entry->flags          = gl->gl_flags;
+        ),
+        TP_printk("%u,%u glock %u:%llu state %s => %s flags:%s",
+                  MAJOR(__entry->dev), MINOR(__entry->dev),
+                  __entry->gltype, (unsigned long long)__entry->glnum,
+                  glock_trace_name(__entry->cur_state),
+                  glock_trace_name(DLM_LOCK_IV),
+                  show_glock_flags(__entry->flags))
+);
+/* Callback (local or remote) requesting lock demotion to gl_demote_state */
+TRACE_EVENT(gfs2_demote_rq,
+        TP_PROTO(const struct gfs2_glock *gl),
+        TP_ARGS(gl),
+        TP_STRUCT__entry(
+                __field(        dev_t,  dev                     )
+                __field(        u64,    glnum                   )
+                __field(        u32,    gltype                  )
+                __field(        u8,     cur_state               )
+                __field(        u8,     dmt_state               )
+                __field(        unsigned long,  flags           )
+        ),
+        TP_fast_assign(
+                __entry->dev            = gl->gl_sbd->sd_vfs->s_dev;
+                __entry->gltype         = gl->gl_name.ln_type;
+                __entry->glnum          = gl->gl_name.ln_number;
+                __entry->cur_state      = glock_trace_state(gl->gl_state);
+                __entry->dmt_state      = glock_trace_state(gl->gl_demote_state);
+                __entry->flags          = gl->gl_flags;
+        ),
+        TP_printk("%u,%u glock %u:%llu demote %s to %s flags:%s",
+                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype,
+                  (unsigned long long)__entry->glnum,
+                  glock_trace_name(__entry->cur_state),
+                  glock_trace_name(__entry->dmt_state),
+                  show_glock_flags(__entry->flags))
+);
+/* Promotion/grant of a glock; non-zero @first prints "first", else "other" */
+TRACE_EVENT(gfs2_promote,
+        TP_PROTO(const struct gfs2_holder *gh, int first),
+        TP_ARGS(gh, first),
+        TP_STRUCT__entry(
+                __field(        dev_t,  dev                     )
+                __field(        u64,    glnum                   )
+                __field(        u32,    gltype                  )
+                __field(        int,    first                   )
+                __field(        u8,     state                   )
+        ),
+        TP_fast_assign(
+                __entry->dev    = gh->gh_gl->gl_sbd->sd_vfs->s_dev;
+                __entry->glnum  = gh->gh_gl->gl_name.ln_number;
+                __entry->gltype = gh->gh_gl->gl_name.ln_type;
+                __entry->first  = first;
+                __entry->state  = glock_trace_state(gh->gh_state);
+        ),
+        TP_printk("%u,%u glock %u:%llu promote %s %s",
+                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype,
+                  (unsigned long long)__entry->glnum,
+                  __entry->first ? "first": "other",
+                  glock_trace_name(__entry->state))
+);
+/*
+ * Queue/dequeue a lock request.
+ *
+ * @gh:    holder the event is reported for; the glock is gh->gh_gl
+ * @queue: non-zero prints "queue", zero prints "dequeue"
+ *
+ * The recorded state is the holder's gh_state passed through
+ * glock_trace_state().
+ */
+TRACE_EVENT(gfs2_glock_queue,
+        TP_PROTO(const struct gfs2_holder *gh, int queue),
+        TP_ARGS(gh, queue),
+        TP_STRUCT__entry(
+                __field(        dev_t,  dev                     )
+                __field(        u64,    glnum                   )
+                __field(        u32,    gltype                  )
+                __field(        int,    queue                   )
+                __field(        u8,     state                   )
+        ),
+        TP_fast_assign(
+                __entry->dev    = gh->gh_gl->gl_sbd->sd_vfs->s_dev;
+                __entry->glnum  = gh->gh_gl->gl_name.ln_number;
+                __entry->gltype = gh->gh_gl->gl_name.ln_type;
+                __entry->queue  = queue;
+                __entry->state  = glock_trace_state(gh->gh_state);
+        ),
+        TP_printk("%u,%u glock %u:%llu %squeue %s",
+                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype,
+                  (unsigned long long)__entry->glnum,
+                  __entry->queue ? "" : "de",
+                  glock_trace_name(__entry->state))
+);
+/* Section 2 - Log/journal
+ *
+ * Objectives:
+ * Latency: Log flush time
+ * Correctness: pin/unpin vs. disk I/O ordering
+ * Performance: Log usage stats
+ */
+/*
+ * Pin/unpin a block in the log.
+ *
+ * @bd:  buffer descriptor; the block number and length are read from
+ *       bd->bd_bh (b_blocknr, b_size), and the value printed as "inode"
+ *       is the glock number bd->bd_gl->gl_name.ln_number
+ * @pin: non-zero prints "pin", zero prints "unpin"
+ */
+TRACE_EVENT(gfs2_pin,
+        TP_PROTO(const struct gfs2_bufdata *bd, int pin),
+        TP_ARGS(bd, pin),
+        TP_STRUCT__entry(
+                __field(        dev_t,  dev                     )
+                __field(        int,    pin                     )
+                __field(        u32,    len                     )
+                __field(        sector_t,       block           )
+                __field(        u64,    ino                     )
+        ),
+        TP_fast_assign(
+                __entry->dev            = bd->bd_gl->gl_sbd->sd_vfs->s_dev;
+                __entry->pin            = pin;
+                __entry->len            = bd->bd_bh->b_size;
+                __entry->block          = bd->bd_bh->b_blocknr;
+                __entry->ino            = bd->bd_gl->gl_name.ln_number;
+        ),
+        TP_printk("%u,%u log %s %llu/%lu inode %llu",
+                  MAJOR(__entry->dev), MINOR(__entry->dev),
+                  __entry->pin ? "pin" : "unpin",
+                  (unsigned long long)__entry->block,
+                  (unsigned long)__entry->len,
+                  (unsigned long long)__entry->ino)
+);
+/*
+ * Flushing the log.
+ *
+ * @sdp:   superblock data; sd_log_sequence is recorded as log_seq
+ * @start: non-zero prints "start", zero prints "end"
+ */
+TRACE_EVENT(gfs2_log_flush,
+        TP_PROTO(const struct gfs2_sbd *sdp, int start),
+        TP_ARGS(sdp, start),
+        TP_STRUCT__entry(
+                __field(        dev_t,  dev                     )
+                __field(        int,    start                   )
+                __field(        u64,    log_seq                 )
+        ),
+        TP_fast_assign(
+                __entry->dev            = sdp->sd_vfs->s_dev;
+                __entry->start          = start;
+                __entry->log_seq        = sdp->sd_log_sequence;
+        ),
+        TP_printk("%u,%u log flush %s %llu",
+                  MAJOR(__entry->dev), MINOR(__entry->dev),
+                  __entry->start ? "start" : "end",
+                  (unsigned long long)__entry->log_seq)
+);
+/*
+ * Reserving/releasing blocks in the log.
+ *
+ * @sdp:    superblock data, used here only for the device number
+ * @blocks: signed block count, printed with %d after "log reserve"
+ *          (presumably negative for a release -- confirm against callers)
+ */
+TRACE_EVENT(gfs2_log_blocks,
+        TP_PROTO(const struct gfs2_sbd *sdp, int blocks),
+        TP_ARGS(sdp, blocks),
+        TP_STRUCT__entry(
+                __field(        dev_t,  dev                     )
+                __field(        int,    blocks                  )
+        ),
+        TP_fast_assign(
+                __entry->dev            = sdp->sd_vfs->s_dev;
+                __entry->blocks         = blocks;
+        ),
+        TP_printk("%u,%u log reserve %d", MAJOR(__entry->dev),
+                  MINOR(__entry->dev), __entry->blocks)
+);
+/* Section 3 - bmap
+ *
+ * Objectives:
+ * Latency: Bmap request time
+ * Performance: Block allocator tracing
+ * Correctness: Test of discard generation vs. blocks allocated
+ */
+/*
+ * Map an extent of blocks, possibly a new allocation.
+ *
+ * @ip:     inode being mapped; i_no_addr is recorded as inum
+ * @bh:     buffer head describing the mapping: b_size is recorded as the
+ *          length, b_state as the flags, and b_blocknr as the physical
+ *          block (0 is recorded when the buffer is not mapped)
+ * @lblock: logical block, recorded as-is
+ * @create: non-zero prints "create ", zero prints "nocreate"
+ * @errno:  value printed last on the line
+ */
+TRACE_EVENT(gfs2_bmap,
+        TP_PROTO(const struct gfs2_inode *ip, const struct buffer_head *bh,
+                sector_t lblock, int create, int errno),
+        TP_ARGS(ip, bh, lblock, create, errno),
+        TP_STRUCT__entry(
+                __field(        dev_t,  dev                     )
+                __field(        sector_t, lblock                )
+                __field(        sector_t, pblock                )
+                __field(        u64,    inum                    )
+                __field(        unsigned long, state            )
+                __field(        u32,    len                     )
+                __field(        int,    create                  )
+                __field(        int,    errno                   )
+        ),
+        TP_fast_assign(
+                __entry->dev            = ip->i_gl->gl_sbd->sd_vfs->s_dev;
+                __entry->lblock         = lblock;
+                __entry->pblock         = buffer_mapped(bh) ?  bh->b_blocknr : 0;
+                __entry->inum           = ip->i_no_addr;
+                __entry->state          = bh->b_state;
+                __entry->len            = bh->b_size;
+                __entry->create         = create;
+                __entry->errno          = errno;
+        ),
+        TP_printk("%u,%u bmap %llu map %llu/%lu to %llu flags:%08lx %s %d",
+                  MAJOR(__entry->dev), MINOR(__entry->dev),
+                  (unsigned long long)__entry->inum,
+                  (unsigned long long)__entry->lblock,
+                  (unsigned long)__entry->len,
+                  (unsigned long long)__entry->pblock,
+                  __entry->state, __entry->create ? "create " : "nocreate",
+                  __entry->errno)
+);
+/*
+ * Keep track of blocks as they are allocated/freed.
+ *
+ * @ip:          inode; i_no_addr is recorded as inum
+ * @block:       block number recorded as the start of the range
+ * @len:         length recorded alongside the start
+ * @block_state: state value, printed via block_state_name()
+ */
+TRACE_EVENT(gfs2_block_alloc,
+        TP_PROTO(const struct gfs2_inode *ip, u64 block, unsigned len,
+                u8 block_state),
+        TP_ARGS(ip, block, len, block_state),
+        TP_STRUCT__entry(
+                __field(        dev_t,  dev                     )
+                __field(        u64,    start                   )
+                __field(        u64,    inum                    )
+                __field(        u32,    len                     )
+                __field(        u8,     block_state             )
+        ),
+        TP_fast_assign(
+                __entry->dev            = ip->i_gl->gl_sbd->sd_vfs->s_dev;
+                __entry->start          = block;
+                __entry->inum           = ip->i_no_addr;
+                __entry->len            = len;
+                __entry->block_state    = block_state;
+        ),
+        TP_printk("%u,%u bmap %llu alloc %llu/%lu %s",
+                  MAJOR(__entry->dev), MINOR(__entry->dev),
+                  (unsigned long long)__entry->inum,
+                  (unsigned long long)__entry->start,
+                  (unsigned long)__entry->len,
+                  block_state_name(__entry->block_state))
+);
+#endif /* _TRACE_GFS2_H */
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#include <trace/define_trace.h>
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 053752d4b27f..4ef0e9fa3549 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -33,6 +33,9 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
        BUG_ON(current->journal_info);
        BUG_ON(blocks == 0 && revokes == 0);
+        if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
+                return -EROFS;
        tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS);
        if (!tr)
                return -ENOMEM;
@@ -54,12 +57,6 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
        if (error)
                goto fail_holder_uninit;
-        if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
-                tr->tr_t_gh.gh_flags |= GL_NOCACHE;
-                error = -EROFS;
-                goto fail_gunlock;
-        }
        error = gfs2_log_reserve(sdp, tr->tr_reserved);
        if (error)
                goto fail_gunlock;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index a36bb749926d..6f833dc8e910 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -49,11 +49,23 @@ MODULE_LICENSE("GPL");
 */
 static void hfs_write_super(struct super_block *sb)
 {
+        lock_super(sb);
        sb->s_dirt = 0;
-        if (sb->s_flags & MS_RDONLY)
-                return;
        /* sync everything to the buffers */
+        if (!(sb->s_flags & MS_RDONLY))
+                hfs_mdb_commit(sb);
+        unlock_super(sb);
+}
+static int hfs_sync_fs(struct super_block *sb, int wait)
+{
+        lock_super(sb);
        hfs_mdb_commit(sb);
+        sb->s_dirt = 0;
+        unlock_super(sb);
+        return 0;
 }
 /*
@@ -65,9 +77,15 @@ static void hfs_write_super(struct super_block *sb)
 */
 static void hfs_put_super(struct super_block *sb)
 {
+        lock_kernel();
+        if (sb->s_dirt)
+                hfs_write_super(sb);
        hfs_mdb_close(sb);
        /* release the MDB's resources */
        hfs_mdb_put(sb);
+        unlock_kernel();
 }
 /*
@@ -164,6 +182,7 @@ static const struct super_operations hfs_super_operations = {
        .clear_inode    = hfs_clear_inode,
        .put_super      = hfs_put_super,
        .write_super    = hfs_write_super,
+        .sync_fs        = hfs_sync_fs,
        .statfs         = hfs_statfs,
        .remount_fs     = hfs_remount,
        .show_options   = hfs_show_options,
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index f2a64020f42e..9fc3af0c0dab 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -152,15 +152,14 @@ static void hfsplus_clear_inode(struct inode *inode)
        }
 }
-static void hfsplus_write_super(struct super_block *sb)
+static int hfsplus_sync_fs(struct super_block *sb, int wait)
 {
        struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
        dprint(DBG_SUPER, "hfsplus_write_super\n");
+        lock_super(sb);
        sb->s_dirt = 0;
-        if (sb->s_flags & MS_RDONLY)
-                /* warn? */
-                return;
        vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks);
        vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc);
@@ -192,6 +191,16 @@ static void hfsplus_write_super(struct super_block *sb)
                }
                HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP;
        }
+        unlock_super(sb);
+        return 0;
+}
+static void hfsplus_write_super(struct super_block *sb)
+{
+        if (!(sb->s_flags & MS_RDONLY))
+                hfsplus_sync_fs(sb, 1);
+        else
+                sb->s_dirt = 0;
 }
 static void hfsplus_put_super(struct super_block *sb)
@@ -199,6 +208,11 @@ static void hfsplus_put_super(struct super_block *sb)
        dprint(DBG_SUPER, "hfsplus_put_super\n");
        if (!sb->s_fs_info)
                return;
+        lock_kernel();
+        if (sb->s_dirt)
+                hfsplus_write_super(sb);
        if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) {
                struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
@@ -218,6 +232,8 @@ static void hfsplus_put_super(struct super_block *sb)
                unload_nls(HFSPLUS_SB(sb).nls);
        kfree(sb->s_fs_info);
        sb->s_fs_info = NULL;
+        unlock_kernel();
 }
 static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -279,6 +295,7 @@ static const struct super_operations hfsplus_sops = {
        .clear_inode    = hfsplus_clear_inode,
        .put_super      = hfsplus_put_super,
        .write_super    = hfsplus_write_super,
+        .sync_fs        = hfsplus_sync_fs,
        .statfs         = hfsplus_statfs,
        .remount_fs     = hfsplus_remount,
        .show_options   = hfsplus_show_options,
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index fc77965be841..f2feaa06bf26 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -13,6 +13,7 @@
 #include <linux/statfs.h>
 #include <linux/magic.h>
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
@@ -99,11 +100,16 @@ int hpfs_stop_cycles(struct super_block *s, int key, int *c1, int *c2,
 static void hpfs_put_super(struct super_block *s)
 {
        struct hpfs_sb_info *sbi = hpfs_sb(s);
+        lock_kernel();
        kfree(sbi->sb_cp_table);
        kfree(sbi->sb_bmp_dir);
        unmark_dirty(s);
        s->s_fs_info = NULL;
        kfree(sbi);
+        unlock_kernel();
 }
 unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
@@ -393,6 +399,8 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
        
        *flags |= MS_NOATIME;
        
+        lock_kernel();
+        lock_super(s);
        uid = sbi->sb_uid; gid = sbi->sb_gid;
        umask = 0777 & ~sbi->sb_mode;
        lowercase = sbi->sb_lowercase; conv = sbi->sb_conv;
@@ -425,9 +433,13 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
        replace_mount_options(s, new_opts);
+        unlock_super(s);
+        unlock_kernel();
        return 0;
 out_err:
+        unlock_super(s);
+        unlock_kernel();
        kfree(new_opts);
        return -EINVAL;
 }
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c1462d43e721..941c8425c10b 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -30,6 +30,7 @@
 #include <linux/dnotify.h>
 #include <linux/statfs.h>
 #include <linux/security.h>
+#include <linux/ima.h>
 #include <asm/uaccess.h>
@@ -986,6 +987,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
                        &hugetlbfs_file_operations);
        if (!file)
                goto out_dentry; /* inode is already attached */
+        ima_counts_get(file);
        return file;
diff --git a/fs/inode.c b/fs/inode.c
index bca0c618fdb3..f643be565df8 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -22,6 +22,7 @@
 #include <linux/cdev.h>
 #include <linux/bootmem.h>
 #include <linux/inotify.h>
+#include <linux/fsnotify.h>
 #include <linux/mount.h>
 #include <linux/async.h>
@@ -189,6 +190,10 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
        inode->i_private = NULL;
        inode->i_mapping = mapping;
+#ifdef CONFIG_FSNOTIFY
+        inode->i_fsnotify_mask = 0;
+#endif
        return inode;
 out_free_security:
@@ -221,6 +226,7 @@ void destroy_inode(struct inode *inode)
        BUG_ON(inode_has_buffers(inode));
        ima_inode_free(inode);
        security_inode_free(inode);
+        fsnotify_inode_delete(inode);
        if (inode->i_sb->s_op->destroy_inode)
                inode->i_sb->s_op->destroy_inode(inode);
        else
@@ -252,6 +258,9 @@ void inode_init_once(struct inode *inode)
        INIT_LIST_HEAD(&inode->inotify_watches);
        mutex_init(&inode->inotify_mutex);
 #endif
+#ifdef CONFIG_FSNOTIFY
+        INIT_HLIST_HEAD(&inode->i_fsnotify_mark_entries);
+#endif
 }
 EXPORT_SYMBOL(inode_init_once);
@@ -398,6 +407,7 @@ int invalidate_inodes(struct super_block *sb)
        mutex_lock(&iprune_mutex);
        spin_lock(&inode_lock);
        inotify_unmount_inodes(&sb->s_inodes);
+        fsnotify_unmount_inodes(&sb->s_inodes);
        busy = invalidate_list(&sb->s_inodes, &throw_away);
        spin_unlock(&inode_lock);
@@ -1398,7 +1408,7 @@ EXPORT_SYMBOL(touch_atime);
 *      for writeback.  Note that this function is meant exclusively for
 *      usage in the file write path of filesystems, and filesystems may
 *      choose to explicitly ignore update via this function with the
- *      S_NOCTIME inode flag, e.g. for network filesystem where these
+ *      S_NOCMTIME inode flag, e.g. for network filesystem where these
 *      timestamps are handled by the server.
 */
@@ -1412,7 +1422,7 @@ void file_update_time(struct file *file)
        if (IS_NOCMTIME(inode))
                return;
-        err = mnt_want_write(file->f_path.mnt);
+        err = mnt_want_write_file(file);
        if (err)
                return;
diff --git a/fs/internal.h b/fs/internal.h
index b4dac4fb6b61..d55ef562f0bb 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -25,6 +25,8 @@ static inline int sb_is_blkdev_sb(struct super_block *sb)
        return sb == blockdev_superblock;
 }
+extern int __sync_blockdev(struct block_device *bdev, int wait);
 #else
 static inline void bdev_cache_init(void)
 {
@@ -34,6 +36,11 @@ static inline int sb_is_blkdev_sb(struct super_block *sb)
 {
        return 0;
 }
+/*
+ * Stub for the #else branch: ignores both arguments and reports
+ * success (0) without doing any work.
+ */
+static inline int __sync_blockdev(struct block_device *bdev, int wait)
+{
+        return 0;
+}
 #endif
 /*
@@ -66,3 +73,13 @@ extern void __init mnt_init(void);
 * fs_struct.c
 */
 extern void chroot_fs_refs(struct path *, struct path *);
+/*
+ * file_table.c
+ */
+extern void mark_files_ro(struct super_block *);
+/*
+ * super.c
+ */
+extern int do_remount_sb(struct super_block *, int, void *, int);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 82d9c42b8bac..286f38dfc6c0 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -414,10 +414,6 @@ static int file_ioctl(struct file *filp, unsigned int cmd,
        switch (cmd) {
        case FIBMAP:
                return ioctl_fibmap(filp, p);
-        case FS_IOC_FIEMAP:
-                return ioctl_fiemap(filp, arg);
-        case FIGETBSZ:
-                return put_user(inode->i_sb->s_blocksize, p);
        case FIONREAD:
                return put_user(i_size_read(inode) - filp->f_pos, p);
        }
@@ -557,6 +553,16 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
                error = ioctl_fsthaw(filp);
                break;
+        case FS_IOC_FIEMAP:
+                return ioctl_fiemap(filp, arg);
+        case FIGETBSZ:
+        {
+                struct inode *inode = filp->f_path.dentry->d_inode;
+                int __user *p = (int __user *)arg;
+                return put_user(inode->i_sb->s_blocksize, p);
+        }
        default:
                if (S_ISREG(filp->f_path.dentry->d_inode->i_mode))
                        error = file_ioctl(filp, cmd, arg);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index b4cbe9603c7d..068b34b5a107 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -42,11 +42,16 @@ static int isofs_dentry_cmp_ms(struct dentry *dentry, struct qstr *a, struct qst
 static void isofs_put_super(struct super_block *sb)
 {
        struct isofs_sb_info *sbi = ISOFS_SB(sb);
 #ifdef CONFIG_JOLIET
+        lock_kernel();
        if (sbi->s_nls_iocharset) {
                unload_nls(sbi->s_nls_iocharset);
                sbi->s_nls_iocharset = NULL;
        }
+        unlock_kernel();
 #endif
        kfree(sbi);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 58144102bf25..62be7d294ec2 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1781,7 +1781,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
 * Journal abort has very specific semantics, which we describe
 * for journal abort.
 *
- * Two internal function, which provide abort to te jbd layer
+ * Two internal functions, which provide abort to the jbd layer
 * itself are here.
 */
@@ -1879,7 +1879,7 @@ void jbd2_journal_abort(journal_t *journal, int errno)
 * int jbd2_journal_errno () - returns the journal's error state.
 * @journal: journal to examine.
 *
- * This is the errno numbet set with jbd2_journal_abort(), the last
+ * This is the errno number set with jbd2_journal_abort(), the last
 * time the journal was mounted - if the journal was stopped
 * without calling abort this will be 0.
 *
@@ -1903,7 +1903,7 @@ int jbd2_journal_errno(journal_t *journal)
 * int jbd2_journal_clear_err () - clears the journal's error state
 * @journal: journal to act on.
 *
- * An error must be cleared or Acked to take a FS out of readonly
+ * An error must be cleared or acked to take a FS out of readonly
 * mode.
 */
 int jbd2_journal_clear_err(journal_t *journal)
@@ -1923,7 +1923,7 @@ int jbd2_journal_clear_err(journal_t *journal)
 * void jbd2_journal_ack_err() - Ack journal err.
 * @journal: journal to act on.
 *
- * An error must be cleared or Acked to take a FS out of readonly
+ * An error must be cleared or acked to take a FS out of readonly
 * mode.
 */
 void jbd2_journal_ack_err(journal_t *journal)
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 249305d65d5b..3451a81b2142 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -20,6 +20,7 @@
 #include <linux/vmalloc.h>
 #include <linux/vfs.h>
 #include <linux/crc32.h>
+#include <linux/smp_lock.h>
 #include "nodelist.h"
 static int jffs2_flash_setup(struct jffs2_sb_info *c);
@@ -387,6 +388,7 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
           This also catches the case where it was stopped and this
           is just a remount to restart it.
           Flush the writebuffer, if neccecary, else we loose it */
+        lock_kernel();
        if (!(sb->s_flags & MS_RDONLY)) {
                jffs2_stop_garbage_collect_thread(c);
                mutex_lock(&c->alloc_sem);
@@ -399,24 +401,10 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
        *flags |= MS_NOATIME;
+        unlock_kernel();
        return 0;
 }
-void jffs2_write_super (struct super_block *sb)
-{
-        struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
-        sb->s_dirt = 0;
-        if (sb->s_flags & MS_RDONLY)
-                return;
-        D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
-        jffs2_garbage_collect_trigger(c);
-        jffs2_erase_pending_blocks(c, 0);
-        jffs2_flush_wbuf_gc(c, 0);
-}
 /* jffs2_new_inode: allocate a new inode and inocache, add it to the hash,
   fill in the raw_inode while you're at it. */
 struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_inode *ri)
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 5e194a5c8e29..2228380c47b9 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -181,7 +181,6 @@ void jffs2_dirty_inode(struct inode *inode);
 struct inode *jffs2_new_inode (struct inode *dir_i, int mode,
                               struct jffs2_raw_inode *ri);
 int jffs2_statfs (struct dentry *, struct kstatfs *);
-void jffs2_write_super (struct super_block *);
 int jffs2_remount_fs (struct super_block *, int *, char *);
 int jffs2_do_fill_super(struct super_block *sb, void *data, int silent);
 void jffs2_gc_release_inode(struct jffs2_sb_info *c,
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 4c4e18c54a51..07a22caf2687 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -53,10 +53,29 @@ static void jffs2_i_init_once(void *foo)
        inode_init_once(&f->vfs_inode);
 }
+/*
+ * Clear the superblock dirty flag and, unless the filesystem is mounted
+ * read-only, trigger garbage collection, erase pending blocks and flush
+ * the write buffer.  The whole sequence runs under lock_super().
+ */
+static void jffs2_write_super(struct super_block *sb)
+{
+        struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
+        lock_super(sb);
+        sb->s_dirt = 0;
+        /* Nothing to write back on a read-only mount. */
+        if (sb->s_flags & MS_RDONLY)
+                goto out;
+        D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
+        jffs2_garbage_collect_trigger(c);
+        jffs2_erase_pending_blocks(c, 0);
+        jffs2_flush_wbuf_gc(c, 0);
+out:
+        unlock_super(sb);
+}
 static int jffs2_sync_fs(struct super_block *sb, int wait)
 {
        struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
+        jffs2_write_super(sb);
        mutex_lock(&c->alloc_sem);
        jffs2_flush_wbuf_pad(c);
        mutex_unlock(&c->alloc_sem);
@@ -174,6 +193,11 @@ static void jffs2_put_super (struct super_block *sb)
        D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n"));
+        lock_kernel();
+        if (sb->s_dirt)
+                jffs2_write_super(sb);
        mutex_lock(&c->alloc_sem);
        jffs2_flush_wbuf_pad(c);
        mutex_unlock(&c->alloc_sem);
@@ -192,6 +216,8 @@ static void jffs2_put_super (struct super_block *sb)
        if (c->mtd->sync)
                c->mtd->sync(c->mtd);
+        unlock_kernel();
        D1(printk(KERN_DEBUG "jffs2_put_super returning\n"));
 }
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 346057218edc..0fc30407f039 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -2571,6 +2571,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
                        txAbort(tid, 0);
                        txEnd(tid);
+                        mutex_unlock(&JFS_IP(ipimap)->commit_mutex);
                        /* release the inode map lock */
                        IWRITE_UNLOCK(ipimap);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 6f21adf9479a..09b1b6ee2186 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -32,6 +32,7 @@
 #include <linux/crc32.h>
 #include <asm/uaccess.h>
 #include <linux/seq_file.h>
+#include <linux/smp_lock.h>
 #include "jfs_incore.h"
 #include "jfs_filsys.h"
@@ -183,6 +184,9 @@ static void jfs_put_super(struct super_block *sb)
        int rc;
        jfs_info("In jfs_put_super");
+        lock_kernel();
        rc = jfs_umount(sb);
        if (rc)
                jfs_err("jfs_umount failed with return code %d", rc);
@@ -195,6 +199,8 @@ static void jfs_put_super(struct super_block *sb)
        sbi->direct_inode = NULL;
        kfree(sbi);
+        unlock_kernel();
 }
 enum {
@@ -370,19 +376,24 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
        s64 newLVSize = 0;
        int rc = 0;
        int flag = JFS_SBI(sb)->flag;
+        int ret;
        if (!parse_options(data, sb, &newLVSize, &flag)) {
                return -EINVAL;
        }
+        lock_kernel();
        if (newLVSize) {
                if (sb->s_flags & MS_RDONLY) {
                        printk(KERN_ERR
                  "JFS: resize requires volume to be mounted read-write\n");
+                        unlock_kernel();
                        return -EROFS;
                }
                rc = jfs_extendfs(sb, newLVSize, 0);
-                if (rc)
+                if (rc) {
+                        unlock_kernel();
                        return rc;
+                }
        }
        if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
@@ -393,23 +404,31 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
                truncate_inode_pages(JFS_SBI(sb)->direct_inode->i_mapping, 0);
                JFS_SBI(sb)->flag = flag;
-                return jfs_mount_rw(sb, 1);
+                ret = jfs_mount_rw(sb, 1);
+                unlock_kernel();
+                return ret;
        }
        if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
                rc = jfs_umount_rw(sb);
                JFS_SBI(sb)->flag = flag;
+                unlock_kernel();
                return rc;
        }
        if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY))
                if (!(sb->s_flags & MS_RDONLY)) {
                        rc = jfs_umount_rw(sb);
-                        if (rc)
+                        if (rc) {
+                                unlock_kernel();
                                return rc;
+                        }
                        JFS_SBI(sb)->flag = flag;
-                        return jfs_mount_rw(sb, 1);
+                        ret = jfs_mount_rw(sb, 1);
+                        unlock_kernel();
+                        return ret;
                }
        JFS_SBI(sb)->flag = flag;
+        unlock_kernel();
        return 0;
 }
@@ -720,8 +739,10 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type,
                blk++;
        }
 out:
-        if (len == towrite)
+        if (len == towrite) {
+                mutex_unlock(&inode->i_mutex);
                return err;
+        }
        if (inode->i_size < off+len-towrite)
                i_size_write(inode, off+len-towrite);
        inode->i_version++;
diff --git a/fs/libfs.c b/fs/libfs.c
index 80046ddf5063..ddfa89948c3f 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -9,6 +9,8 @@
 #include <linux/vfs.h>
 #include <linux/mutex.h>
 #include <linux/exportfs.h>
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>
 #include <asm/uaccess.h>
@@ -807,6 +809,29 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
 }
 EXPORT_SYMBOL_GPL(generic_fh_to_parent);
+/*
+ * Generic fsync helper: sync the mapping's buffers, then write the inode
+ * itself if it is dirty (for datasync, only if I_DIRTY_DATASYNC is set).
+ * Returns the buffer-sync error if there was one, otherwise the result of
+ * sync_inode().
+ */
+int simple_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+        struct inode *inode = dentry->d_inode;
+        struct writeback_control wbc = {
+                .sync_mode = WB_SYNC_ALL,
+                .nr_to_write = 0, /* metadata-only; caller takes care of data */
+        };
+        int buf_err;
+        int ino_err;
+        buf_err = sync_mapping_buffers(inode->i_mapping);
+        /* Skip the inode write when it is clean, or clean enough for datasync. */
+        if (!(inode->i_state & I_DIRTY) ||
+            (datasync && !(inode->i_state & I_DIRTY_DATASYNC)))
+                return buf_err;
+        ino_err = sync_inode(inode, &wbc);
+        /* The first error encountered wins. */
+        return buf_err ? buf_err : ino_err;
+}
+EXPORT_SYMBOL(simple_fsync);
 EXPORT_SYMBOL(dcache_dir_close);
 EXPORT_SYMBOL(dcache_dir_lseek);
 EXPORT_SYMBOL(dcache_dir_open);
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index d4946c4c90e2..e5f206467e40 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -22,7 +22,7 @@ static int minix_readdir(struct file *, void *, filldir_t);
 const struct file_operations minix_dir_operations = {
        .read           = generic_read_dir,
        .readdir        = minix_readdir,
-        .fsync          = minix_sync_file,
+        .fsync          = simple_fsync,
 };
 static inline void dir_put_page(struct page *page)
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 17765f697e50..3eec3e607a87 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -6,15 +6,12 @@
 *  minix regular file handling primitives
 */
-#include <linux/buffer_head.h>          /* for fsync_inode_buffers() */
 #include "minix.h"
 /*
 * We have mostly NULLs here: the current defaults are OK for
 * the minix filesystem.
 */
-int minix_sync_file(struct file *, struct dentry *, int);
 const struct file_operations minix_file_operations = {
        .llseek         = generic_file_llseek,
        .read           = do_sync_read,
@@ -22,7 +19,7 @@ const struct file_operations minix_file_operations = {
        .write          = do_sync_write,
        .aio_write      = generic_file_aio_write,
        .mmap           = generic_file_mmap,
-        .fsync          = minix_sync_file,
+        .fsync          = simple_fsync,
        .splice_read    = generic_file_splice_read,
 };
@@ -30,18 +27,3 @@ const struct inode_operations minix_file_inode_operations = {
        .truncate       = minix_truncate,
        .getattr        = minix_getattr,
 };
-int minix_sync_file(struct file * file, struct dentry *dentry, int datasync)
-{
-        struct inode *inode = dentry->d_inode;
-        int err;
-        err = sync_mapping_buffers(inode->i_mapping);
-        if (!(inode->i_state & I_DIRTY))
-                return err;
-        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-                return err;
-        
-        err |= minix_sync_inode(inode);
-        return err ? -EIO : 0;
-}
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index daad3c2740db..f91a23693597 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -35,6 +35,8 @@ static void minix_put_super(struct super_block *sb)
        int i;
        struct minix_sb_info *sbi = minix_sb(sb);
+        lock_kernel();
        if (!(sb->s_flags & MS_RDONLY)) {
                if (sbi->s_version != MINIX_V3)  /* s_state is now out from V3 sb */
                        sbi->s_ms->s_state = sbi->s_mount_state;
@@ -49,7 +51,7 @@ static void minix_put_super(struct super_block *sb)
        sb->s_fs_info = NULL;
        kfree(sbi);
-        return;
+        unlock_kernel();
 }
 static struct kmem_cache * minix_inode_cachep;
@@ -554,38 +556,25 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode)
        return bh;
 }
-static struct buffer_head *minix_update_inode(struct inode *inode)
+static int minix_write_inode(struct inode *inode, int wait)
-{
-        if (INODE_VERSION(inode) == MINIX_V1)
-                return V1_minix_update_inode(inode);
-        else
-                return V2_minix_update_inode(inode);
-}
-static int minix_write_inode(struct inode * inode, int wait)
-{
-        brelse(minix_update_inode(inode));
-        return 0;
-}
-int minix_sync_inode(struct inode * inode)
 {
        int err = 0;
        struct buffer_head *bh;
-        bh = minix_update_inode(inode);
+        if (INODE_VERSION(inode) == MINIX_V1)
-        if (bh && buffer_dirty(bh))
+                bh = V1_minix_update_inode(inode);
-        {
+        else
+                bh = V2_minix_update_inode(inode);
+        if (!bh)
+                return -EIO;
+        if (wait && buffer_dirty(bh)) {
                sync_dirty_buffer(bh);
-                if (buffer_req(bh) && !buffer_uptodate(bh))
+                if (buffer_req(bh) && !buffer_uptodate(bh)) {
-                {
                        printk("IO error syncing minix inode [%s:%08lx]\n",
                                inode->i_sb->s_id, inode->i_ino);
-                        err = -1;
+                        err = -EIO;
                }
        }
-        else if (!bh)
-                err = -1;
        brelse (bh);
        return err;
 }
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index e6a0b193bea4..cb7fdd11f9a5 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -57,7 +57,6 @@ extern int __minix_write_begin(struct file *file, struct address_space *mapping,
 extern void V1_minix_truncate(struct inode *);
 extern void V2_minix_truncate(struct inode *);
 extern void minix_truncate(struct inode *);
-extern int minix_sync_inode(struct inode *);
 extern void minix_set_inode(struct inode *, dev_t);
 extern int V1_minix_get_block(struct inode *, long, struct buffer_head *, int);
 extern int V2_minix_get_block(struct inode *, long, struct buffer_head *, int);
@@ -72,7 +71,6 @@ extern int minix_empty_dir(struct inode*);
 extern void minix_set_link(struct minix_dir_entry*, struct page*, struct inode*);
 extern struct minix_dir_entry *minix_dotdot(struct inode*, struct page**);
 extern ino_t minix_inode_by_name(struct dentry*);
-extern int minix_sync_file(struct file *, struct dentry *, int);
 extern const struct inode_operations minix_file_inode_operations;
 extern const struct inode_operations minix_dir_inode_operations;
diff --git a/fs/mpage.c b/fs/mpage.c
index 680ba60863ff..42381bd6543b 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -379,7 +379,8 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
        struct buffer_head map_bh;
        unsigned long first_logical_block = 0;
-        clear_buffer_mapped(&map_bh);
+        map_bh.b_state = 0;
+        map_bh.b_size = 0;
        for (page_idx = 0; page_idx < nr_pages; page_idx++) {
                struct page *page = list_entry(pages->prev, struct page, lru);
@@ -412,7 +413,8 @@ int mpage_readpage(struct page *page, get_block_t get_block)
        struct buffer_head map_bh;
        unsigned long first_logical_block = 0;
-        clear_buffer_mapped(&map_bh);
+        map_bh.b_state = 0;
+        map_bh.b_size = 0;
        bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
                        &map_bh, &first_logical_block, get_block);
        if (bio)
diff --git a/fs/namei.c b/fs/namei.c
index 967c3db92724..527119afb6a5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -552,6 +552,17 @@ static __always_inline int link_path_walk(const char *name, struct nameidata *nd
        return result;
 }
+static __always_inline void set_root(struct nameidata *nd)
+{
+        if (!nd->root.mnt) {
+                struct fs_struct *fs = current->fs;
+                read_lock(&fs->lock);
+                nd->root = fs->root;
+                path_get(&nd->root);
+                read_unlock(&fs->lock);
+        }
+}
 static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
 {
        int res = 0;
@@ -560,14 +571,10 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
                goto fail;
        if (*link == '/') {
-                struct fs_struct *fs = current->fs;
+                set_root(nd);
                path_put(&nd->path);
+                nd->path = nd->root;
-                read_lock(&fs->lock);
+                path_get(&nd->root);
-                nd->path = fs->root;
-                path_get(&fs->root);
-                read_unlock(&fs->lock);
        }
        res = link_path_walk(link, nd);
@@ -668,23 +675,23 @@ loop:
        return err;
 }
-int follow_up(struct vfsmount **mnt, struct dentry **dentry)
+int follow_up(struct path *path)
 {
        struct vfsmount *parent;
        struct dentry *mountpoint;
        spin_lock(&vfsmount_lock);
-        parent=(*mnt)->mnt_parent;
+        parent = path->mnt->mnt_parent;
-        if (parent == *mnt) {
+        if (parent == path->mnt) {
                spin_unlock(&vfsmount_lock);
                return 0;
        }
        mntget(parent);
-        mountpoint=dget((*mnt)->mnt_mountpoint);
+        mountpoint = dget(path->mnt->mnt_mountpoint);
        spin_unlock(&vfsmount_lock);
-        dput(*dentry);
+        dput(path->dentry);
-        *dentry = mountpoint;
+        path->dentry = mountpoint;
-        mntput(*mnt);
+        mntput(path->mnt);
-        *mnt = parent;
+        path->mnt = parent;
        return 1;
 }
@@ -695,7 +702,7 @@ static int __follow_mount(struct path *path)
 {
        int res = 0;
        while (d_mountpoint(path->dentry)) {
-                struct vfsmount *mounted = lookup_mnt(path->mnt, path->dentry);
+                struct vfsmount *mounted = lookup_mnt(path);
                if (!mounted)
                        break;
                dput(path->dentry);
@@ -708,32 +715,32 @@ static int __follow_mount(struct path *path)
        return res;
 }
-static void follow_mount(struct vfsmount **mnt, struct dentry **dentry)
+static void follow_mount(struct path *path)
 {
-        while (d_mountpoint(*dentry)) {
+        while (d_mountpoint(path->dentry)) {
-                struct vfsmount *mounted = lookup_mnt(*mnt, *dentry);
+                struct vfsmount *mounted = lookup_mnt(path);
                if (!mounted)
                        break;
-                dput(*dentry);
+                dput(path->dentry);
-                mntput(*mnt);
+                mntput(path->mnt);
-                *mnt = mounted;
+                path->mnt = mounted;
-                *dentry = dget(mounted->mnt_root);
+                path->dentry = dget(mounted->mnt_root);
        }
 }
 /* no need for dcache_lock, as serialization is taken care in
 * namespace.c
 */
-int follow_down(struct vfsmount **mnt, struct dentry **dentry)
+int follow_down(struct path *path)
 {
        struct vfsmount *mounted;
-        mounted = lookup_mnt(*mnt, *dentry);
+        mounted = lookup_mnt(path);
        if (mounted) {
-                dput(*dentry);
+                dput(path->dentry);
-                mntput(*mnt);
+                mntput(path->mnt);
-                *mnt = mounted;
+                path->mnt = mounted;
-                *dentry = dget(mounted->mnt_root);
+                path->dentry = dget(mounted->mnt_root);
                return 1;
        }
        return 0;
@@ -741,19 +748,16 @@ int follow_down(struct vfsmount **mnt, struct dentry **dentry)
 static __always_inline void follow_dotdot(struct nameidata *nd)
 {
-        struct fs_struct *fs = current->fs;
+        set_root(nd);
        while(1) {
                struct vfsmount *parent;
                struct dentry *old = nd->path.dentry;
-                read_lock(&fs->lock);
+                if (nd->path.dentry == nd->root.dentry &&
-                if (nd->path.dentry == fs->root.dentry &&
+                    nd->path.mnt == nd->root.mnt) {
-                    nd->path.mnt == fs->root.mnt) {
-                        read_unlock(&fs->lock);
                        break;
                }
-                read_unlock(&fs->lock);
                spin_lock(&dcache_lock);
                if (nd->path.dentry != nd->path.mnt->mnt_root) {
                        nd->path.dentry = dget(nd->path.dentry->d_parent);
@@ -775,7 +779,7 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
                mntput(nd->path.mnt);
                nd->path.mnt = parent;
        }
-        follow_mount(&nd->path.mnt, &nd->path.dentry);
+        follow_mount(&nd->path);
 }
 /*
@@ -853,7 +857,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
                        err = inode_permission(nd->path.dentry->d_inode,
                                               MAY_EXEC);
                if (!err)
-                        err = ima_path_check(&nd->path, MAY_EXEC);
+                        err = ima_path_check(&nd->path, MAY_EXEC,
+                                             IMA_COUNT_UPDATE);
                if (err)
                        break;
@@ -1016,25 +1021,23 @@ static int path_walk(const char *name, struct nameidata *nd)
        return link_path_walk(name, nd);
 }
-/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
+static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
-static int do_path_lookup(int dfd, const char *name,
-                                unsigned int flags, struct nameidata *nd)
 {
        int retval = 0;
        int fput_needed;
        struct file *file;
-        struct fs_struct *fs = current->fs;
        nd->last_type = LAST_ROOT; /* if there are only slashes... */
        nd->flags = flags;
        nd->depth = 0;
+        nd->root.mnt = NULL;
        if (*name=='/') {
-                read_lock(&fs->lock);
+                set_root(nd);
-                nd->path = fs->root;
+                nd->path = nd->root;
-                path_get(&fs->root);
+                path_get(&nd->root);
-                read_unlock(&fs->lock);
        } else if (dfd == AT_FDCWD) {
+                struct fs_struct *fs = current->fs;
                read_lock(&fs->lock);
                nd->path = fs->pwd;
                path_get(&fs->pwd);
@@ -1062,17 +1065,29 @@ static int do_path_lookup(int dfd, const char *name,
                fput_light(file, fput_needed);
        }
+        return 0;
-        retval = path_walk(name, nd);
+fput_fail:
+        fput_light(file, fput_needed);
+out_fail:
+        return retval;
+}
+/* Returns 0 and nd will be valid on success; returns an error otherwise. */
+static int do_path_lookup(int dfd, const char *name,
+                                unsigned int flags, struct nameidata *nd)
+{
+        int retval = path_init(dfd, name, flags, nd);
+        if (!retval)
+                retval = path_walk(name, nd);
        if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
                                nd->path.dentry->d_inode))
                audit_inode(name, nd->path.dentry);
-out_fail:
+        if (nd->root.mnt) {
+                path_put(&nd->root);
+                nd->root.mnt = NULL;
+        }
        return retval;
-fput_fail:
-        fput_light(file, fput_needed);
-        goto out_fail;
 }
 int path_lookup(const char *name, unsigned int flags,
@@ -1112,14 +1127,18 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
        nd->path.dentry = dentry;
        nd->path.mnt = mnt;
        path_get(&nd->path);
+        nd->root = nd->path;
+        path_get(&nd->root);
        retval = path_walk(name, nd);
        if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
                                nd->path.dentry->d_inode))
                audit_inode(name, nd->path.dentry);
-        return retval;
+        path_put(&nd->root);
+        nd->root.mnt = NULL;
+        return retval;
 }
 /**
@@ -1515,7 +1534,8 @@ int may_open(struct path *path, int acc_mode, int flag)
                return error;
        error = ima_path_check(path,
-                               acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
+                               acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC),
+                               IMA_COUNT_UPDATE);
        if (error)
                return error;
        /*
@@ -1674,9 +1694,18 @@ struct file *do_filp_open(int dfd, const char *pathname,
         /*
          * Create - we need to know the parent.
          */
-        error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd);
+        error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
         if (error)
                 return ERR_PTR(error);
+        error = path_walk(pathname, &nd);
+        if (error) {
+                /* set_root() may have taken a reference on nd.root; drop it */
+                if (nd.root.mnt)
+                        path_put(&nd.root);
+                return ERR_PTR(error);
+        }
+        if (unlikely(!audit_dummy_context()))
+                audit_inode(pathname, nd.path.dentry);
        /*
         * We have the parent and last component. First of all, check
@@ -1804,6 +1829,8 @@ exit:
        if (!IS_ERR(nd.intent.open.file))
                release_open_intent(&nd);
 exit_parent:
+        if (nd.root.mnt)
+                path_put(&nd.root);
        path_put(&nd.path);
        return ERR_PTR(error);
diff --git a/fs/namespace.c b/fs/namespace.c
index 134d494158d9..2dd333b0fe7f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -131,10 +131,20 @@ struct vfsmount *alloc_vfsmnt(const char *name)
                INIT_LIST_HEAD(&mnt->mnt_share);
                INIT_LIST_HEAD(&mnt->mnt_slave_list);
                INIT_LIST_HEAD(&mnt->mnt_slave);
-                atomic_set(&mnt->__mnt_writers, 0);
+#ifdef CONFIG_SMP
+                mnt->mnt_writers = alloc_percpu(int);
+                if (!mnt->mnt_writers)
+                        goto out_free_devname;
+#else
+                mnt->mnt_writers = 0;
+#endif
        }
        return mnt;
+#ifdef CONFIG_SMP
+out_free_devname:
+        kfree(mnt->mnt_devname);
+#endif
 out_free_id:
        mnt_free_id(mnt);
 out_free_cache:
@@ -171,65 +181,38 @@ int __mnt_is_readonly(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL_GPL(__mnt_is_readonly);
-struct mnt_writer {
+static inline void inc_mnt_writers(struct vfsmount *mnt)
-        /*
+{
-         * If holding multiple instances of this lock, they
+#ifdef CONFIG_SMP
-         * must be ordered by cpu number.
+        (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
-         */
+#else
-        spinlock_t lock;
+        mnt->mnt_writers++;
-        struct lock_class_key lock_class; /* compiles out with !lockdep */
+#endif
-        unsigned long count;
+}
-        struct vfsmount *mnt;
-} ____cacheline_aligned_in_smp;
-static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
-static int __init init_mnt_writers(void)
+static inline void dec_mnt_writers(struct vfsmount *mnt)
 {
-        int cpu;
+#ifdef CONFIG_SMP
-        for_each_possible_cpu(cpu) {
+        (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
-                struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
+#else
-                spin_lock_init(&writer->lock);
+        mnt->mnt_writers--;
-                lockdep_set_class(&writer->lock, &writer->lock_class);
+#endif
-                writer->count = 0;
-        }
-        return 0;
 }
-fs_initcall(init_mnt_writers);
-static void unlock_mnt_writers(void)
+static unsigned int count_mnt_writers(struct vfsmount *mnt)
 {
+#ifdef CONFIG_SMP
+        unsigned int count = 0;
        int cpu;
-        struct mnt_writer *cpu_writer;
        for_each_possible_cpu(cpu) {
-                cpu_writer = &per_cpu(mnt_writers, cpu);
+                count += *per_cpu_ptr(mnt->mnt_writers, cpu);
-                spin_unlock(&cpu_writer->lock);
        }
-}
-static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
+        return count;
-{
+#else
-        if (!cpu_writer->mnt)
+        return mnt->mnt_writers;
-                return;
+#endif
-        /*
-         * This is in case anyone ever leaves an invalid,
-         * old ->mnt and a count of 0.
-         */
-        if (!cpu_writer->count)
-                return;
-        atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
-        cpu_writer->count = 0;
-}
- /*
- * must hold cpu_writer->lock
- */
-static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
-                                          struct vfsmount *mnt)
-{
-        if (cpu_writer->mnt == mnt)
-                return;
-        __clear_mnt_count(cpu_writer);
-        cpu_writer->mnt = mnt;
 }
 /*
@@ -253,74 +236,73 @@ static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
 int mnt_want_write(struct vfsmount *mnt)
 {
        int ret = 0;
-        struct mnt_writer *cpu_writer;
-        cpu_writer = &get_cpu_var(mnt_writers);
+        preempt_disable();
-        spin_lock(&cpu_writer->lock);
+        inc_mnt_writers(mnt);
+        /*
+         * The store done by inc_mnt_writers() must be visible before we
+         * pass the MNT_WRITE_HOLD loop below, so that the slowpath can see
+         * our incremented count after it has set MNT_WRITE_HOLD.
+         */
+        smp_mb();
+        while (mnt->mnt_flags & MNT_WRITE_HOLD)
+                cpu_relax();
+        /*
+         * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
+         * be set to match its requirements. So we must not load that until
+         * MNT_WRITE_HOLD is cleared.
+         */
+        smp_rmb();
        if (__mnt_is_readonly(mnt)) {
+                dec_mnt_writers(mnt);
                ret = -EROFS;
                goto out;
        }
-        use_cpu_writer_for_mount(cpu_writer, mnt);
-        cpu_writer->count++;
 out:
-        spin_unlock(&cpu_writer->lock);
+        preempt_enable();
-        put_cpu_var(mnt_writers);
        return ret;
 }
 EXPORT_SYMBOL_GPL(mnt_want_write);
-static void lock_mnt_writers(void)
+/**
-{
+ * mnt_clone_write - get write access to a mount
-        int cpu;
+ * @mnt: the mount on which to take a write
-        struct mnt_writer *cpu_writer;
+ *
+ * This is effectively like mnt_want_write, except
-        for_each_possible_cpu(cpu) {
+ * it must only be used to take an extra write reference
-                cpu_writer = &per_cpu(mnt_writers, cpu);
+ * on a mountpoint that we already know has a write reference
-                spin_lock(&cpu_writer->lock);
+ * on it. This allows some optimisation.
-                __clear_mnt_count(cpu_writer);
+ *
-                cpu_writer->mnt = NULL;
+ * When finished, mnt_drop_write must be called as usual to
-        }
+ * drop the reference.
+ */
+int mnt_clone_write(struct vfsmount *mnt)
+{
+        /* superblock may be r/o */
+        if (__mnt_is_readonly(mnt))
+                return -EROFS;
+        preempt_disable();
+        inc_mnt_writers(mnt);
+        preempt_enable();
+        return 0;
 }
+EXPORT_SYMBOL_GPL(mnt_clone_write);
-/*
+/**
- * These per-cpu write counts are not guaranteed to have
+ * mnt_want_write_file - get write access to a file's mount
- * matched increments and decrements on any given cpu.
+ * @file: the file on whose mount to take a write
- * A file open()ed for write on one cpu and close()d on
+ *
- * another cpu will imbalance this count.  Make sure it
+ * This is like mnt_want_write, but it takes a file and can
- * does not get too far out of whack.
+ * do some optimisations if the file is open for write already
 */
-static void handle_write_count_underflow(struct vfsmount *mnt)
+int mnt_want_write_file(struct file *file)
 {
-        if (atomic_read(&mnt->__mnt_writers) >=
+        if (!(file->f_mode & FMODE_WRITE))
-            MNT_WRITER_UNDERFLOW_LIMIT)
+                return mnt_want_write(file->f_path.mnt);
-                return;
+        else
-        /*
+                return mnt_clone_write(file->f_path.mnt);
-         * It isn't necessary to hold all of the locks
-         * at the same time, but doing it this way makes
-         * us share a lot more code.
-         */
-        lock_mnt_writers();
-        /*
-         * vfsmount_lock is for mnt_flags.
-         */
-        spin_lock(&vfsmount_lock);
-        /*
-         * If coalescing the per-cpu writer counts did not
-         * get us back to a positive writer count, we have
-         * a bug.
-         */
-        if ((atomic_read(&mnt->__mnt_writers) < 0) &&
-            !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
-                WARN(1, KERN_DEBUG "leak detected on mount(%p) writers "
-                                "count: %d\n",
-                        mnt, atomic_read(&mnt->__mnt_writers));
-                /* use the flag to keep the dmesg spam down */
-                mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
-        }
-        spin_unlock(&vfsmount_lock);
-        unlock_mnt_writers();
 }
+EXPORT_SYMBOL_GPL(mnt_want_write_file);
 /**
 * mnt_drop_write - give up write access to a mount
@@ -332,37 +314,9 @@ static void handle_write_count_underflow(struct vfsmount *mnt)
 */
 void mnt_drop_write(struct vfsmount *mnt)
 {
-        int must_check_underflow = 0;
+        preempt_disable();
-        struct mnt_writer *cpu_writer;
+        dec_mnt_writers(mnt);
+        preempt_enable();
-        cpu_writer = &get_cpu_var(mnt_writers);
-        spin_lock(&cpu_writer->lock);
-        use_cpu_writer_for_mount(cpu_writer, mnt);
-        if (cpu_writer->count > 0) {
-                cpu_writer->count--;
-        } else {
-                must_check_underflow = 1;
-                atomic_dec(&mnt->__mnt_writers);
-        }
-        spin_unlock(&cpu_writer->lock);
-        /*
-         * Logically, we could call this each time,
-         * but the __mnt_writers cacheline tends to
-         * be cold, and makes this expensive.
-         */
-        if (must_check_underflow)
-                handle_write_count_underflow(mnt);
-        /*
-         * This could be done right after the spinlock
-         * is taken because the spinlock keeps us on
-         * the cpu, and disables preemption.  However,
-         * putting it here bounds the amount that
-         * __mnt_writers can underflow.  Without it,
-         * we could theoretically wrap __mnt_writers.
-         */
-        put_cpu_var(mnt_writers);
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
@@ -370,24 +324,41 @@ static int mnt_make_readonly(struct vfsmount *mnt)
 {
        int ret = 0;
-        lock_mnt_writers();
+        spin_lock(&vfsmount_lock);
+        mnt->mnt_flags |= MNT_WRITE_HOLD;
        /*
-         * With all the locks held, this value is stable
+         * After storing MNT_WRITE_HOLD, we'll read the counters. This store
+         * should be visible before we do.
         */
-        if (atomic_read(&mnt->__mnt_writers) > 0) {
+        smp_mb();
-                ret = -EBUSY;
-                goto out;
-        }
        /*
-         * nobody can do a successful mnt_want_write() with all
+         * With writers on hold, if this value is zero, then there are
-         * of the counts in MNT_DENIED_WRITE and the locks held.
+         * definitely no active writers (although held writers may subsequently
+         * increment the count, they'll have to wait, and decrement it after
+         * seeing MNT_READONLY).
+         *
+         * It is OK to have counter incremented on one CPU and decremented on
+         * another: the sum will add up correctly. The danger would be when we
+         * sum up each counter, if we read a counter before it is incremented,
+         * but then read another CPU's count which it has been subsequently
+         * decremented from -- we would see more decrements than we should.
+         * MNT_WRITE_HOLD protects against this scenario, because
+         * mnt_want_write first increments count, then smp_mb, then spins on
+         * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
+         * we're counting up here.
         */
-        spin_lock(&vfsmount_lock);
+        if (count_mnt_writers(mnt) > 0)
-        if (!ret)
+                ret = -EBUSY;
+        else
                mnt->mnt_flags |= MNT_READONLY;
+        /*
+         * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
+         * that become unheld will see MNT_READONLY.
+         */
+        smp_wmb();
+        mnt->mnt_flags &= ~MNT_WRITE_HOLD;
        spin_unlock(&vfsmount_lock);
-out:
-        unlock_mnt_writers();
        return ret;
 }
@@ -410,6 +381,9 @@ void free_vfsmnt(struct vfsmount *mnt)
 {
        kfree(mnt->mnt_devname);
        mnt_free_id(mnt);
+#ifdef CONFIG_SMP
+        free_percpu(mnt->mnt_writers);
+#endif
        kmem_cache_free(mnt_cache, mnt);
 }
@@ -442,11 +416,11 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
 * lookup_mnt increments the ref count before returning
 * the vfsmount struct.
 */
-struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
+struct vfsmount *lookup_mnt(struct path *path)
 {
        struct vfsmount *child_mnt;
        spin_lock(&vfsmount_lock);
-        if ((child_mnt = __lookup_mnt(mnt, dentry, 1)))
+        if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1)))
                mntget(child_mnt);
        spin_unlock(&vfsmount_lock);
        return child_mnt;
@@ -604,38 +578,18 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 static inline void __mntput(struct vfsmount *mnt)
 {
-        int cpu;
        struct super_block *sb = mnt->mnt_sb;
        /*
-         * We don't have to hold all of the locks at the
-         * same time here because we know that we're the
-         * last reference to mnt and that no new writers
-         * can come in.
-         */
-        for_each_possible_cpu(cpu) {
-                struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
-                spin_lock(&cpu_writer->lock);
-                if (cpu_writer->mnt != mnt) {
-                        spin_unlock(&cpu_writer->lock);
-                        continue;
-                }
-                atomic_add(cpu_writer->count, &mnt->__mnt_writers);
-                cpu_writer->count = 0;
-                /*
-                 * Might as well do this so that no one
-                 * ever sees the pointer and expects
-                 * it to be valid.
-                 */
-                cpu_writer->mnt = NULL;
-                spin_unlock(&cpu_writer->lock);
-        }
-        /*
         * This probably indicates that somebody messed
         * up a mnt_want/drop_write() pair.  If this
         * happens, the filesystem was probably unable
         * to make r/w->r/o transitions.
         */
-        WARN_ON(atomic_read(&mnt->__mnt_writers));
+        /*
+         * atomic_dec_and_lock() used to deal with ->mnt_count decrements
+         * provides barriers, so count_mnt_writers() below is safe.  AV
+         */
+        WARN_ON(count_mnt_writers(mnt));
        dput(mnt->mnt_root);
        free_vfsmnt(mnt);
        deactivate_super(sb);
@@ -1106,11 +1060,8 @@ static int do_umount(struct vfsmount *mnt, int flags)
                 * we just try to remount it readonly.
                 */
                down_write(&sb->s_umount);
-                if (!(sb->s_flags & MS_RDONLY)) {
+                if (!(sb->s_flags & MS_RDONLY))
-                        lock_kernel();
                        retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
-                        unlock_kernel();
-                }
                up_write(&sb->s_umount);
                return retval;
        }
@@ -1253,11 +1204,11 @@ Enomem:
        return NULL;
 }
-struct vfsmount *collect_mounts(struct vfsmount *mnt, struct dentry *dentry)
+struct vfsmount *collect_mounts(struct path *path)
 {
        struct vfsmount *tree;
        down_write(&namespace_sem);
-        tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE);
+        tree = copy_tree(path->mnt, path->dentry, CL_COPY_ALL | CL_PRIVATE);
        up_write(&namespace_sem);
        return tree;
 }
@@ -1430,7 +1381,7 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
                goto out_unlock;
        err = -ENOENT;
-        if (IS_ROOT(path->dentry) || !d_unhashed(path->dentry))
+        if (!d_unlinked(path->dentry))
                err = attach_recursive_mnt(mnt, path, NULL);
 out_unlock:
        mutex_unlock(&path->dentry->d_inode->i_mutex);
@@ -1601,7 +1552,7 @@ static int do_move_mount(struct path *path, char *old_name)
        down_write(&namespace_sem);
        while (d_mountpoint(path->dentry) &&
-               follow_down(&path->mnt, &path->dentry))
+               follow_down(path))
                ;
        err = -EINVAL;
        if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
@@ -1612,7 +1563,7 @@ static int do_move_mount(struct path *path, char *old_name)
        if (IS_DEADDIR(path->dentry->d_inode))
                goto out1;
-        if (!IS_ROOT(path->dentry) && d_unhashed(path->dentry))
+        if (d_unlinked(path->dentry))
                goto out1;
        err = -EINVAL;
@@ -1676,7 +1627,9 @@ static int do_new_mount(struct path *path, char *type, int flags,
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
+        lock_kernel();
        mnt = do_kern_mount(type, flags, name, data);
+        unlock_kernel();
        if (IS_ERR(mnt))
                return PTR_ERR(mnt);
@@ -1695,10 +1648,10 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
        down_write(&namespace_sem);
        /* Something was mounted here while we slept */
        while (d_mountpoint(path->dentry) &&
-               follow_down(&path->mnt, &path->dentry))
+               follow_down(path))
                ;
        err = -EINVAL;
-        if (!check_mnt(path->mnt))
+        if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
                goto unlock;
        /* Refuse the same filesystem on the same mount point */
@@ -2092,10 +2045,8 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
        if (retval < 0)
                goto out3;
-        lock_kernel();
        retval = do_mount((char *)dev_page, dir_page, (char *)type_page,
                          flags, (void *)data_page);
-        unlock_kernel();
        free_page(data_page);
 out3:
@@ -2175,9 +2126,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
        error = -ENOENT;
        if (IS_DEADDIR(new.dentry->d_inode))
                goto out2;
-        if (d_unhashed(new.dentry) && !IS_ROOT(new.dentry))
+        if (d_unlinked(new.dentry))
                goto out2;
-        if (d_unhashed(old.dentry) && !IS_ROOT(old.dentry))
+        if (d_unlinked(old.dentry))
                goto out2;
        error = -EBUSY;
        if (new.mnt == root.mnt ||
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index d642f0e5b365..b99ce205b1bd 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -736,6 +736,8 @@ static void ncp_put_super(struct super_block *sb)
 {
        struct ncp_server *server = NCP_SBP(sb);
+        lock_kernel();
        ncp_lock_server(server);
        ncp_disconnect(server);
        ncp_unlock_server(server);
@@ -769,6 +771,8 @@ static void ncp_put_super(struct super_block *sb)
        vfree(server->packet);
        sb->s_fs_info = NULL;
        kfree(server);
+        unlock_kernel();
 }
 static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf)
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 64a288ee046d..f01caec84463 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -154,7 +154,7 @@ out_err:
        goto out;
 out_follow:
        while (d_mountpoint(nd->path.dentry) &&
-               follow_down(&nd->path.mnt, &nd->path.dentry))
+               follow_down(&nd->path))
                ;
        err = 0;
        goto out;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index d2d67781c579..26127b69a275 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1813,6 +1813,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
        if (data == NULL)
                return -ENOMEM;
+        lock_kernel();
        /* fill out struct with values from existing mount */
        data->flags = nfss->flags;
        data->rsize = nfss->rsize;
@@ -1837,6 +1838,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
        error = nfs_compare_remount_data(nfss, data);
 out:
        kfree(data);
+        unlock_kernel();
        return error;
 }
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 5839b229cd0e..8b1f8efb4690 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -847,9 +847,8 @@ exp_get_fsid_key(svc_client *clp, int fsid)
        return exp_find_key(clp, FSID_NUM, fsidv, NULL);
 }
-static svc_export *exp_get_by_name(svc_client *clp, struct vfsmount *mnt,
+static svc_export *exp_get_by_name(svc_client *clp, const struct path *path,
-                                   struct dentry *dentry,
+                                     struct cache_req *reqp)
-                                   struct cache_req *reqp)
 {
        struct svc_export *exp, key;
        int err;
@@ -858,8 +857,7 @@ static svc_export *exp_get_by_name(svc_client *clp, struct vfsmount *mnt,
                return ERR_PTR(-ENOENT);
        key.ex_client = clp;
-        key.ex_path.mnt = mnt;
+        key.ex_path = *path;
-        key.ex_path.dentry = dentry;
        exp = svc_export_lookup(&key);
        if (exp == NULL)
@@ -873,24 +871,19 @@ static svc_export *exp_get_by_name(svc_client *clp, struct vfsmount *mnt,
 /*
 * Find the export entry for a given dentry.
 */
-static struct svc_export *exp_parent(svc_client *clp, struct vfsmount *mnt,
+static struct svc_export *exp_parent(svc_client *clp, struct path *path)
-                                     struct dentry *dentry,
-                                     struct cache_req *reqp)
 {
-        svc_export *exp;
+        struct dentry *saved = dget(path->dentry);
+        svc_export *exp = exp_get_by_name(clp, path, NULL);
-        dget(dentry);
-        exp = exp_get_by_name(clp, mnt, dentry, reqp);
+        while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) {
+                struct dentry *parent = dget_parent(path->dentry);
-        while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) {
+                dput(path->dentry);
-                struct dentry *parent;
+                path->dentry = parent;
+                exp = exp_get_by_name(clp, path, NULL);
-                parent = dget_parent(dentry);
-                dput(dentry);
-                dentry = parent;
-                exp = exp_get_by_name(clp, mnt, dentry, reqp);
        }
-        dput(dentry);
+        dput(path->dentry);
+        path->dentry = saved;
        return exp;
 }
@@ -1018,7 +1011,7 @@ exp_export(struct nfsctl_export *nxp)
                goto out_put_clp;
        err = -EINVAL;
-        exp = exp_get_by_name(clp, path.mnt, path.dentry, NULL);
+        exp = exp_get_by_name(clp, &path, NULL);
        memset(&new, 0, sizeof(new));
@@ -1135,7 +1128,7 @@ exp_unexport(struct nfsctl_export *nxp)
                goto out_domain;
        err = -EINVAL;
-        exp = exp_get_by_name(dom, path.mnt, path.dentry, NULL);
+        exp = exp_get_by_name(dom, &path, NULL);
        path_put(&path);
        if (IS_ERR(exp))
                goto out_domain;
@@ -1177,7 +1170,7 @@ exp_rootfh(svc_client *clp, char *name, struct knfsd_fh *f, int maxsize)
        dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n",
                 name, path.dentry, clp->name,
                 inode->i_sb->s_id, inode->i_ino);
-        exp = exp_parent(clp, path.mnt, path.dentry, NULL);
+        exp = exp_parent(clp, &path);
        if (IS_ERR(exp)) {
                err = PTR_ERR(exp);
                goto out;
@@ -1207,7 +1200,7 @@ static struct svc_export *exp_find(struct auth_domain *clp, int fsid_type,
        if (IS_ERR(ek))
                return ERR_CAST(ek);
-        exp = exp_get_by_name(clp, ek->ek_path.mnt, ek->ek_path.dentry, reqp);
+        exp = exp_get_by_name(clp, &ek->ek_path, reqp);
        cache_put(&ek->h, &svc_expkey_cache);
        if (IS_ERR(exp))
@@ -1247,8 +1240,7 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp)
 * use exp_get_by_name() or exp_find().
 */
 struct svc_export *
-rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt,
+rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path)
-                struct dentry *dentry)
 {
        struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT);
@@ -1256,8 +1248,7 @@ rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt,
                goto gss;
        /* First try the auth_unix client: */
-        exp = exp_get_by_name(rqstp->rq_client, mnt, dentry,
+        exp = exp_get_by_name(rqstp->rq_client, path, &rqstp->rq_chandle);
-                                                &rqstp->rq_chandle);
        if (PTR_ERR(exp) == -ENOENT)
                goto gss;
        if (IS_ERR(exp))
@@ -1269,8 +1260,7 @@ gss:
        /* Otherwise, try falling back on gss client */
        if (rqstp->rq_gssclient == NULL)
                return exp;
-        gssexp = exp_get_by_name(rqstp->rq_gssclient, mnt, dentry,
+        gssexp = exp_get_by_name(rqstp->rq_gssclient, path, &rqstp->rq_chandle);
-                                                &rqstp->rq_chandle);
        if (PTR_ERR(gssexp) == -ENOENT)
                return exp;
        if (!IS_ERR(exp))
@@ -1309,23 +1299,19 @@ gss:
 }
 struct svc_export *
-rqst_exp_parent(struct svc_rqst *rqstp, struct vfsmount *mnt,
+rqst_exp_parent(struct svc_rqst *rqstp, struct path *path)
-                struct dentry *dentry)
 {
-        struct svc_export *exp;
+        struct dentry *saved = dget(path->dentry);
+        struct svc_export *exp = rqst_exp_get_by_name(rqstp, path);
-        dget(dentry);
-        exp = rqst_exp_get_by_name(rqstp, mnt, dentry);
+        while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) {
+                struct dentry *parent = dget_parent(path->dentry);
-        while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) {
+                dput(path->dentry);
-                struct dentry *parent;
+                path->dentry = parent;
+                exp = rqst_exp_get_by_name(rqstp, path);
-                parent = dget_parent(dentry);
-                dput(dentry);
-                dentry = parent;
-                exp = rqst_exp_get_by_name(rqstp, mnt, dentry);
        }
-        dput(dentry);
+        dput(path->dentry);
+        path->dentry = saved;
        return exp;
 }
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index b660435978d2..99f835753596 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -55,6 +55,7 @@
 #include <linux/security.h>
 #endif /* CONFIG_NFSD_V4 */
 #include <linux/jhash.h>
+#include <linux/ima.h>
 #include <asm/uaccess.h>
@@ -100,36 +101,35 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
 {
        struct svc_export *exp = *expp, *exp2 = NULL;
        struct dentry *dentry = *dpp;
-        struct vfsmount *mnt = mntget(exp->ex_path.mnt);
+        struct path path = {.mnt = mntget(exp->ex_path.mnt),
-        struct dentry *mounts = dget(dentry);
+                            .dentry = dget(dentry)};
        int err = 0;
-        while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts));
+        while (d_mountpoint(path.dentry) && follow_down(&path))
+                ;
-        exp2 = rqst_exp_get_by_name(rqstp, mnt, mounts);
+        exp2 = rqst_exp_get_by_name(rqstp, &path);
        if (IS_ERR(exp2)) {
                if (PTR_ERR(exp2) != -ENOENT)
                        err = PTR_ERR(exp2);
-                dput(mounts);
+                path_put(&path);
-                mntput(mnt);
                goto out;
        }
        if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) {
                /* successfully crossed mount point */
                /*
-                 * This is subtle: dentry is *not* under mnt at this point.
+                 * This is subtle: path.dentry is *not* on path.mnt
-                 * The only reason we are safe is that original mnt is pinned
+                 * at this point.  The only reason we are safe is that
-                 * down by exp, so we should dput before putting exp.
+                 * the original mnt is pinned down by exp, so we should
+                 * put path *before* putting exp.
                 */
-                dput(dentry);
+                *dpp = path.dentry;
-                *dpp = mounts;
+                path.dentry = dentry;
-                exp_put(exp);
                *expp = exp2;
-        } else {
+                exp2 = exp;
-                exp_put(exp2);
-                dput(mounts);
        }
-        mntput(mnt);
+        path_put(&path);
+        exp_put(exp2);
 out:
        return err;
 }
@@ -168,28 +168,29 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
                        /* checking mountpoint crossing is very different when stepping up */
                        struct svc_export *exp2 = NULL;
                        struct dentry *dp;
-                        struct vfsmount *mnt = mntget(exp->ex_path.mnt);
+                        struct path path = {.mnt = mntget(exp->ex_path.mnt),
-                        dentry = dget(dparent);
+                                            .dentry = dget(dparent)};
-                        while(dentry == mnt->mnt_root && follow_up(&mnt, &dentry))
+                        while (path.dentry == path.mnt->mnt_root &&
+                               follow_up(&path))
                                ;
-                        dp = dget_parent(dentry);
+                        dp = dget_parent(path.dentry);
-                        dput(dentry);
+                        dput(path.dentry);
-                        dentry = dp;
+                        path.dentry = dp;
-                        exp2 = rqst_exp_parent(rqstp, mnt, dentry);
+                        exp2 = rqst_exp_parent(rqstp, &path);
                        if (PTR_ERR(exp2) == -ENOENT) {
-                                dput(dentry);
                                dentry = dget(dparent);
                        } else if (IS_ERR(exp2)) {
                                host_err = PTR_ERR(exp2);
-                                dput(dentry);
+                                path_put(&path);
-                                mntput(mnt);
                                goto out_nfserr;
                        } else {
+                                dentry = dget(path.dentry);
                                exp_put(exp);
                                exp = exp2;
                        }
-                        mntput(mnt);
+                        path_put(&path);
                }
        } else {
                fh_lock(fhp);
@@ -735,6 +736,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
                            flags, cred);
        if (IS_ERR(*filp))
                host_err = PTR_ERR(*filp);
+        else
+                ima_counts_get(*filp);
 out_nfserr:
        err = nfserrno(host_err);
 out:
@@ -2024,6 +2027,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
                                        struct dentry *dentry, int acc)
 {
        struct inode    *inode = dentry->d_inode;
+        struct path     path;
        int             err;
        if (acc == NFSD_MAY_NOP)
@@ -2096,7 +2100,17 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
        if (err == -EACCES && S_ISREG(inode->i_mode) &&
            acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE))
                err = inode_permission(inode, MAY_EXEC);
+        if (err)
+                goto nfsd_out;
+        /* Do integrity (permission) checking now, but defer incrementing
+         * IMA counts to the actual file open.
+         */
+        path.mnt = exp->ex_path.mnt;
+        path.dentry = dentry;
+        err = ima_path_check(&path, acc & (MAY_READ | MAY_WRITE | MAY_EXEC),
+                             IMA_COUNT_LEAVE);
+nfsd_out:
        return err? nfserrno(err) : 0;
 }
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 064279e33bbb..36df60b6d8a4 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -31,21 +31,26 @@
 #include "dat.h"
 #include "alloc.h"
+struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
+{
+        return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode));
+}
 int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
                               __u64 *ptrp)
 {
-        __u64 ptr;
+        sector_t blocknr;
        int ret;
        down_read(&bmap->b_sem);
        ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp);
        if (ret < 0)
                goto out;
-        if (bmap->b_pops->bpop_translate != NULL) {
+        if (NILFS_BMAP_USE_VBN(bmap)) {
-                ret = bmap->b_pops->bpop_translate(bmap, *ptrp, &ptr);
+                ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp,
-                if (ret < 0)
+                                          &blocknr);
-                        goto out;
+                if (!ret)
-                *ptrp = ptr;
+                        *ptrp = blocknr;
        }
 out:
@@ -53,6 +58,16 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
        return ret;
 }
+int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp,
+                             unsigned maxblocks)
+{
+        int ret;
+        down_read(&bmap->b_sem);
+        ret = bmap->b_ops->bop_lookup_contig(bmap, key, ptrp, maxblocks);
+        up_read(&bmap->b_sem);
+        return ret;
+}
 /**
 * nilfs_bmap_lookup - find a record
@@ -101,8 +116,7 @@ static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
                        if (n < 0)
                                return n;
                        ret = nilfs_btree_convert_and_insert(
-                                bmap, key, ptr, keys, ptrs, n,
+                                bmap, key, ptr, keys, ptrs, n);
-                                NILFS_BMAP_LARGE_LOW, NILFS_BMAP_LARGE_HIGH);
                        if (ret == 0)
                                bmap->b_u.u_flags |= NILFS_BMAP_LARGE;
@@ -158,8 +172,7 @@ static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key)
                        if (n < 0)
                                return n;
                        ret = nilfs_direct_delete_and_convert(
-                                bmap, key, keys, ptrs, n,
+                                bmap, key, keys, ptrs, n);
-                                NILFS_BMAP_SMALL_LOW, NILFS_BMAP_SMALL_HIGH);
                        if (ret == 0)
                                bmap->b_u.u_flags &= ~NILFS_BMAP_LARGE;
@@ -417,38 +430,6 @@ void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n)
                mark_inode_dirty(bmap->b_inode);
 }
-int nilfs_bmap_get_block(const struct nilfs_bmap *bmap, __u64 ptr,
-                         struct buffer_head **bhp)
-{
-        return nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache,
-                                ptr, 0, bhp, 0);
-}
-void nilfs_bmap_put_block(const struct nilfs_bmap *bmap,
-                          struct buffer_head *bh)
-{
-        brelse(bh);
-}
-int nilfs_bmap_get_new_block(const struct nilfs_bmap *bmap, __u64 ptr,
-                             struct buffer_head **bhp)
-{
-        int ret;
-        ret = nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache,
-                               ptr, 0, bhp, 1);
-        if (ret < 0)
-                return ret;
-        set_buffer_nilfs_volatile(*bhp);
-        return 0;
-}
-void nilfs_bmap_delete_block(const struct nilfs_bmap *bmap,
-                             struct buffer_head *bh)
-{
-        nilfs_btnode_delete(bh);
-}
 __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
                              const struct buffer_head *bh)
 {
@@ -476,11 +457,6 @@ __u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *bmap, __u64 key)
                return NILFS_BMAP_INVALID_PTR;
 }
-static struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
-{
-        return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode));
-}
 #define NILFS_BMAP_GROUP_DIV    8
 __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap)
 {
@@ -493,64 +469,51 @@ __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap)
                (entries_per_group / NILFS_BMAP_GROUP_DIV);
 }
-static int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap,
+int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap,
-                                      union nilfs_bmap_ptr_req *req)
+                                 union nilfs_bmap_ptr_req *req)
 {
        return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
 }
-static void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap,
+void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap,
-                                      union nilfs_bmap_ptr_req *req)
+                                 union nilfs_bmap_ptr_req *req)
 {
        nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
 }
-static void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap,
+void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap,
-                                     union nilfs_bmap_ptr_req *req)
+                              union nilfs_bmap_ptr_req *req)
 {
        nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
 }
-static int nilfs_bmap_prepare_start_v(struct nilfs_bmap *bmap,
+int nilfs_bmap_start_v(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *req,
-                                      union nilfs_bmap_ptr_req *req)
+                       sector_t blocknr)
 {
-        return nilfs_dat_prepare_start(nilfs_bmap_get_dat(bmap), &req->bpr_req);
+        struct inode *dat = nilfs_bmap_get_dat(bmap);
-}
+        int ret;
-static void nilfs_bmap_commit_start_v(struct nilfs_bmap *bmap,
-                                      union nilfs_bmap_ptr_req *req,
-                                      sector_t blocknr)
-{
-        nilfs_dat_commit_start(nilfs_bmap_get_dat(bmap), &req->bpr_req,
-                               blocknr);
-}
-static void nilfs_bmap_abort_start_v(struct nilfs_bmap *bmap,
+        ret = nilfs_dat_prepare_start(dat, &req->bpr_req);
-                                     union nilfs_bmap_ptr_req *req)
+        if (likely(!ret))
-{
+                nilfs_dat_commit_start(dat, &req->bpr_req, blocknr);
-        nilfs_dat_abort_start(nilfs_bmap_get_dat(bmap), &req->bpr_req);
+        return ret;
 }
-static int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap,
+int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap,
-                                    union nilfs_bmap_ptr_req *req)
+                             union nilfs_bmap_ptr_req *req)
 {
        return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
 }
-static void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap,
+void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap,
-                                    union nilfs_bmap_ptr_req *req)
+                             union nilfs_bmap_ptr_req *req)
-{
-        nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 0);
-}
-static void nilfs_bmap_commit_end_vmdt(struct nilfs_bmap *bmap,
-                                       union nilfs_bmap_ptr_req *req)
 {
-        nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 1);
+        nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req,
+                             bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
 }
-static void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap,
+void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap,
-                                   union nilfs_bmap_ptr_req *req)
+                            union nilfs_bmap_ptr_req *req)
 {
        nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
 }
@@ -566,128 +529,44 @@ int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr)
        return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr);
 }
-int nilfs_bmap_prepare_update(struct nilfs_bmap *bmap,
+int nilfs_bmap_prepare_update_v(struct nilfs_bmap *bmap,
-                              union nilfs_bmap_ptr_req *oldreq,
+                                union nilfs_bmap_ptr_req *oldreq,
-                              union nilfs_bmap_ptr_req *newreq)
+                                union nilfs_bmap_ptr_req *newreq)
 {
+        struct inode *dat = nilfs_bmap_get_dat(bmap);
        int ret;
-        ret = bmap->b_pops->bpop_prepare_end_ptr(bmap, oldreq);
+        ret = nilfs_dat_prepare_end(dat, &oldreq->bpr_req);
        if (ret < 0)
                return ret;
-        ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, newreq);
+        ret = nilfs_dat_prepare_alloc(dat, &newreq->bpr_req);
        if (ret < 0)
-                bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq);
+                nilfs_dat_abort_end(dat, &oldreq->bpr_req);
        return ret;
 }
-void nilfs_bmap_commit_update(struct nilfs_bmap *bmap,
+void nilfs_bmap_commit_update_v(struct nilfs_bmap *bmap,
-                              union nilfs_bmap_ptr_req *oldreq,
+                                union nilfs_bmap_ptr_req *oldreq,
-                              union nilfs_bmap_ptr_req *newreq)
+                                union nilfs_bmap_ptr_req *newreq)
 {
-        bmap->b_pops->bpop_commit_end_ptr(bmap, oldreq);
+        struct inode *dat = nilfs_bmap_get_dat(bmap);
-        bmap->b_pops->bpop_commit_alloc_ptr(bmap, newreq);
-}
-void nilfs_bmap_abort_update(struct nilfs_bmap *bmap,
+        nilfs_dat_commit_end(dat, &oldreq->bpr_req,
-                             union nilfs_bmap_ptr_req *oldreq,
+                             bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
-                             union nilfs_bmap_ptr_req *newreq)
+        nilfs_dat_commit_alloc(dat, &newreq->bpr_req);
-{
-        bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq);
-        bmap->b_pops->bpop_abort_alloc_ptr(bmap, newreq);
 }
-static int nilfs_bmap_translate_v(const struct nilfs_bmap *bmap, __u64 ptr,
+void nilfs_bmap_abort_update_v(struct nilfs_bmap *bmap,
-                                  __u64 *ptrp)
+                               union nilfs_bmap_ptr_req *oldreq,
+                               union nilfs_bmap_ptr_req *newreq)
 {
-        sector_t blocknr;
+        struct inode *dat = nilfs_bmap_get_dat(bmap);
-        int ret;
-        ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), ptr, &blocknr);
-        if (ret < 0)
-                return ret;
-        if (ptrp != NULL)
-                *ptrp = blocknr;
-        return 0;
-}
-static int nilfs_bmap_prepare_alloc_p(struct nilfs_bmap *bmap,
+        nilfs_dat_abort_end(dat, &oldreq->bpr_req);
-                                      union nilfs_bmap_ptr_req *req)
+        nilfs_dat_abort_alloc(dat, &newreq->bpr_req);
-{
-        /* ignore target ptr */
-        req->bpr_ptr = bmap->b_last_allocated_ptr++;
-        return 0;
 }
-static void nilfs_bmap_commit_alloc_p(struct nilfs_bmap *bmap,
-                                      union nilfs_bmap_ptr_req *req)
-{
-        /* do nothing */
-}
-static void nilfs_bmap_abort_alloc_p(struct nilfs_bmap *bmap,
-                                     union nilfs_bmap_ptr_req *req)
-{
-        bmap->b_last_allocated_ptr--;
-}
-static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_v = {
-        .bpop_prepare_alloc_ptr =       nilfs_bmap_prepare_alloc_v,
-        .bpop_commit_alloc_ptr  =       nilfs_bmap_commit_alloc_v,
-        .bpop_abort_alloc_ptr   =       nilfs_bmap_abort_alloc_v,
-        .bpop_prepare_start_ptr =       nilfs_bmap_prepare_start_v,
-        .bpop_commit_start_ptr  =       nilfs_bmap_commit_start_v,
-        .bpop_abort_start_ptr   =       nilfs_bmap_abort_start_v,
-        .bpop_prepare_end_ptr   =       nilfs_bmap_prepare_end_v,
-        .bpop_commit_end_ptr    =       nilfs_bmap_commit_end_v,
-        .bpop_abort_end_ptr     =       nilfs_bmap_abort_end_v,
-        .bpop_translate         =       nilfs_bmap_translate_v,
-};
-static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_vmdt = {
-        .bpop_prepare_alloc_ptr =       nilfs_bmap_prepare_alloc_v,
-        .bpop_commit_alloc_ptr  =       nilfs_bmap_commit_alloc_v,
-        .bpop_abort_alloc_ptr   =       nilfs_bmap_abort_alloc_v,
-        .bpop_prepare_start_ptr =       nilfs_bmap_prepare_start_v,
-        .bpop_commit_start_ptr  =       nilfs_bmap_commit_start_v,
-        .bpop_abort_start_ptr   =       nilfs_bmap_abort_start_v,
-        .bpop_prepare_end_ptr   =       nilfs_bmap_prepare_end_v,
-        .bpop_commit_end_ptr    =       nilfs_bmap_commit_end_vmdt,
-        .bpop_abort_end_ptr     =       nilfs_bmap_abort_end_v,
-        .bpop_translate         =       nilfs_bmap_translate_v,
-};
-static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_p = {
-        .bpop_prepare_alloc_ptr =       nilfs_bmap_prepare_alloc_p,
-        .bpop_commit_alloc_ptr  =       nilfs_bmap_commit_alloc_p,
-        .bpop_abort_alloc_ptr   =       nilfs_bmap_abort_alloc_p,
-        .bpop_prepare_start_ptr =       NULL,
-        .bpop_commit_start_ptr  =       NULL,
-        .bpop_abort_start_ptr   =       NULL,
-        .bpop_prepare_end_ptr   =       NULL,
-        .bpop_commit_end_ptr    =       NULL,
-        .bpop_abort_end_ptr     =       NULL,
-        .bpop_translate         =       NULL,
-};
-static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_gc = {
-        .bpop_prepare_alloc_ptr =       NULL,
-        .bpop_commit_alloc_ptr  =       NULL,
-        .bpop_abort_alloc_ptr   =       NULL,
-        .bpop_prepare_start_ptr =       NULL,
-        .bpop_commit_start_ptr  =       NULL,
-        .bpop_abort_start_ptr   =       NULL,
-        .bpop_prepare_end_ptr   =       NULL,
-        .bpop_commit_end_ptr    =       NULL,
-        .bpop_abort_end_ptr     =       NULL,
-        .bpop_translate         =       NULL,
-};
 static struct lock_class_key nilfs_bmap_dat_lock_key;
 /**
@@ -714,31 +593,26 @@ int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
        bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
        switch (bmap->b_inode->i_ino) {
        case NILFS_DAT_INO:
-                bmap->b_pops = &nilfs_bmap_ptr_ops_p;
+                bmap->b_ptr_type = NILFS_BMAP_PTR_P;
-                bmap->b_last_allocated_key = 0; /* XXX: use macro */
+                bmap->b_last_allocated_key = 0;
                bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT;
                lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
                break;
        case NILFS_CPFILE_INO:
        case NILFS_SUFILE_INO:
-                bmap->b_pops = &nilfs_bmap_ptr_ops_vmdt;
+                bmap->b_ptr_type = NILFS_BMAP_PTR_VS;
-                bmap->b_last_allocated_key = 0; /* XXX: use macro */
+                bmap->b_last_allocated_key = 0;
                bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
                break;
        default:
-                bmap->b_pops = &nilfs_bmap_ptr_ops_v;
+                bmap->b_ptr_type = NILFS_BMAP_PTR_VM;
-                bmap->b_last_allocated_key = 0; /* XXX: use macro */
+                bmap->b_last_allocated_key = 0;
                bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
                break;
        }
        return (bmap->b_u.u_flags & NILFS_BMAP_LARGE) ?
-                nilfs_btree_init(bmap,
+                nilfs_btree_init(bmap) : nilfs_direct_init(bmap);
-                                 NILFS_BMAP_LARGE_LOW,
-                                 NILFS_BMAP_LARGE_HIGH) :
-                nilfs_direct_init(bmap,
-                                  NILFS_BMAP_SMALL_LOW,
-                                  NILFS_BMAP_SMALL_HIGH);
 }
 /**
@@ -764,7 +638,7 @@ void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
        memset(&bmap->b_u, 0, NILFS_BMAP_SIZE);
        init_rwsem(&bmap->b_sem);
        bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
-        bmap->b_pops = &nilfs_bmap_ptr_ops_gc;
+        bmap->b_ptr_type = NILFS_BMAP_PTR_U;
        bmap->b_last_allocated_key = 0;
        bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
        bmap->b_state = 0;
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index 4f2708abb1ba..b2890cdcef12 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -64,6 +64,8 @@ struct nilfs_bmap_stats {
 */
 struct nilfs_bmap_operations {
        int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *);
+        int (*bop_lookup_contig)(const struct nilfs_bmap *, __u64, __u64 *,
+                                 unsigned);
        int (*bop_insert)(struct nilfs_bmap *, __u64, __u64);
        int (*bop_delete)(struct nilfs_bmap *, __u64);
        void (*bop_clear)(struct nilfs_bmap *);
@@ -86,34 +88,6 @@ struct nilfs_bmap_operations {
 };
-/**
- * struct nilfs_bmap_ptr_operations - bmap ptr operation table
- */
-struct nilfs_bmap_ptr_operations {
-        int (*bpop_prepare_alloc_ptr)(struct nilfs_bmap *,
-                                      union nilfs_bmap_ptr_req *);
-        void (*bpop_commit_alloc_ptr)(struct nilfs_bmap *,
-                                      union nilfs_bmap_ptr_req *);
-        void (*bpop_abort_alloc_ptr)(struct nilfs_bmap *,
-                                     union nilfs_bmap_ptr_req *);
-        int (*bpop_prepare_start_ptr)(struct nilfs_bmap *,
-                                      union nilfs_bmap_ptr_req *);
-        void (*bpop_commit_start_ptr)(struct nilfs_bmap *,
-                                      union nilfs_bmap_ptr_req *,
-                                      sector_t);
-        void (*bpop_abort_start_ptr)(struct nilfs_bmap *,
-                                     union nilfs_bmap_ptr_req *);
-        int (*bpop_prepare_end_ptr)(struct nilfs_bmap *,
-                                    union nilfs_bmap_ptr_req *);
-        void (*bpop_commit_end_ptr)(struct nilfs_bmap *,
-                                    union nilfs_bmap_ptr_req *);
-        void (*bpop_abort_end_ptr)(struct nilfs_bmap *,
-                                   union nilfs_bmap_ptr_req *);
-        int (*bpop_translate)(const struct nilfs_bmap *, __u64, __u64 *);
-};
 #define NILFS_BMAP_SIZE         (NILFS_INODE_BMAP_SIZE * sizeof(__le64))
 #define NILFS_BMAP_KEY_BIT      (sizeof(unsigned long) * 8 /* CHAR_BIT */)
 #define NILFS_BMAP_NEW_PTR_INIT \
@@ -131,11 +105,9 @@ static inline int nilfs_bmap_is_new_ptr(unsigned long ptr)
 * @b_sem: semaphore
 * @b_inode: owner of bmap
 * @b_ops: bmap operation table
- * @b_pops: bmap ptr operation table
- * @b_low: low watermark of conversion
- * @b_high: high watermark of conversion
 * @b_last_allocated_key: last allocated key for data block
 * @b_last_allocated_ptr: last allocated ptr for data block
+ * @b_ptr_type: pointer type
 * @b_state: state
 */
 struct nilfs_bmap {
@@ -146,14 +118,22 @@ struct nilfs_bmap {
        struct rw_semaphore b_sem;
        struct inode *b_inode;
        const struct nilfs_bmap_operations *b_ops;
-        const struct nilfs_bmap_ptr_operations *b_pops;
-        __u64 b_low;
-        __u64 b_high;
        __u64 b_last_allocated_key;
        __u64 b_last_allocated_ptr;
+        int b_ptr_type;
        int b_state;
 };
+/* pointer type */
+#define NILFS_BMAP_PTR_P        0       /* physical block number (i.e. LBN) */
+#define NILFS_BMAP_PTR_VS       1       /* virtual block number (single
+                                           version) */
+#define NILFS_BMAP_PTR_VM       2       /* virtual block number (has multiple
+                                           versions) */
+#define NILFS_BMAP_PTR_U        (-1)    /* never perform pointer operations */
+/*
+ * Non-zero for the two virtual pointer types (NILFS_BMAP_PTR_VS and
+ * NILFS_BMAP_PTR_VM); zero for NILFS_BMAP_PTR_P and NILFS_BMAP_PTR_U.
+ */
+#define NILFS_BMAP_USE_VBN(bmap)        ((bmap)->b_ptr_type > 0)
 /* state */
 #define NILFS_BMAP_DIRTY        0x00000001
@@ -162,6 +142,7 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
 int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
 void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
 int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *);
+int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned);
 int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long);
 int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long);
 int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *);
@@ -182,7 +163,67 @@ void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
 /*
 * Internal use only
 */
+struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *);
+int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *,
+                               union nilfs_bmap_ptr_req *);
+void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *,
+                               union nilfs_bmap_ptr_req *);
+void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *,
+                              union nilfs_bmap_ptr_req *);
+/*
+ * nilfs_bmap_prepare_alloc_ptr - first step of allocating a block pointer
+ * @bmap: bmap the pointer is allocated for
+ * @req: pointer request; on the non-virtual path req->bpr_ptr is overwritten
+ *
+ * When @bmap uses virtual block numbers the request is handed to
+ * nilfs_bmap_prepare_alloc_v() and its return value is passed through.
+ * Otherwise the current b_last_allocated_ptr is stored in req->bpr_ptr,
+ * the counter is advanced by one, and 0 is returned.
+ */
+static inline int nilfs_bmap_prepare_alloc_ptr(struct nilfs_bmap *bmap,
+                                               union nilfs_bmap_ptr_req *req)
+{
+        if (NILFS_BMAP_USE_VBN(bmap))
+                return nilfs_bmap_prepare_alloc_v(bmap, req);
+        /* ignore target ptr */
+        req->bpr_ptr = bmap->b_last_allocated_ptr++;
+        return 0;
+}
+/*
+ * nilfs_bmap_commit_alloc_ptr - commit a prepared pointer allocation
+ * @bmap: bmap the pointer was prepared for
+ * @req: the prepared pointer request
+ *
+ * Calls nilfs_bmap_commit_alloc_v() when @bmap uses virtual block numbers;
+ * does nothing for the other pointer types.
+ */
+static inline void nilfs_bmap_commit_alloc_ptr(struct nilfs_bmap *bmap,
+                                               union nilfs_bmap_ptr_req *req)
+{
+        if (NILFS_BMAP_USE_VBN(bmap))
+                nilfs_bmap_commit_alloc_v(bmap, req);
+}
+/*
+ * nilfs_bmap_abort_alloc_ptr - cancel a prepared pointer allocation
+ * @bmap: bmap the pointer was prepared for
+ * @req: the prepared pointer request
+ *
+ * Calls nilfs_bmap_abort_alloc_v() when @bmap uses virtual block numbers.
+ * Otherwise b_last_allocated_ptr is decremented by one, which reverses the
+ * increment performed by nilfs_bmap_prepare_alloc_ptr(); @req itself is not
+ * consulted on that path.
+ */
+static inline void nilfs_bmap_abort_alloc_ptr(struct nilfs_bmap *bmap,
+                                              union nilfs_bmap_ptr_req *req)
+{
+        if (NILFS_BMAP_USE_VBN(bmap))
+                nilfs_bmap_abort_alloc_v(bmap, req);
+        else
+                bmap->b_last_allocated_ptr--;
+}
+int nilfs_bmap_prepare_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
+void nilfs_bmap_commit_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
+void nilfs_bmap_abort_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
+/*
+ * nilfs_bmap_prepare_end_ptr - first step of ending the use of a pointer
+ * @bmap: bmap owning the pointer
+ * @req: pointer request describing the pointer to end
+ *
+ * Returns the result of nilfs_bmap_prepare_end_v() when @bmap uses virtual
+ * block numbers, and 0 (nothing to prepare) for the other pointer types.
+ */
+static inline int nilfs_bmap_prepare_end_ptr(struct nilfs_bmap *bmap,
+                                             union nilfs_bmap_ptr_req *req)
+{
+        return NILFS_BMAP_USE_VBN(bmap) ?
+                nilfs_bmap_prepare_end_v(bmap, req) : 0;
+}
+/*
+ * nilfs_bmap_commit_end_ptr - commit a prepared end-of-pointer request
+ * @bmap: bmap owning the pointer
+ * @req: the prepared pointer request
+ *
+ * Calls nilfs_bmap_commit_end_v() when @bmap uses virtual block numbers;
+ * does nothing for the other pointer types.
+ */
+static inline void nilfs_bmap_commit_end_ptr(struct nilfs_bmap *bmap,
+                                             union nilfs_bmap_ptr_req *req)
+{
+        if (NILFS_BMAP_USE_VBN(bmap))
+                nilfs_bmap_commit_end_v(bmap, req);
+}
+/*
+ * nilfs_bmap_abort_end_ptr - cancel a prepared end-of-pointer request
+ * @bmap: bmap owning the pointer
+ * @req: the prepared pointer request
+ *
+ * Calls nilfs_bmap_abort_end_v() when @bmap uses virtual block numbers;
+ * does nothing for the other pointer types.
+ */
+static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap,
+                                            union nilfs_bmap_ptr_req *req)
+{
+        if (NILFS_BMAP_USE_VBN(bmap))
+                nilfs_bmap_abort_end_v(bmap, req);
+}
+int nilfs_bmap_start_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *,
+                       sector_t);
 int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t);
 int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64);
@@ -193,28 +234,20 @@ __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
 __u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64);
 __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *);
-int nilfs_bmap_prepare_update(struct nilfs_bmap *,
+int nilfs_bmap_prepare_update_v(struct nilfs_bmap *,
-                              union nilfs_bmap_ptr_req *,
+                                union nilfs_bmap_ptr_req *,
-                              union nilfs_bmap_ptr_req *);
+                                union nilfs_bmap_ptr_req *);
-void nilfs_bmap_commit_update(struct nilfs_bmap *,
+void nilfs_bmap_commit_update_v(struct nilfs_bmap *,
-                              union nilfs_bmap_ptr_req *,
+                                union nilfs_bmap_ptr_req *,
-                              union nilfs_bmap_ptr_req *);
+                                union nilfs_bmap_ptr_req *);
-void nilfs_bmap_abort_update(struct nilfs_bmap *,
+void nilfs_bmap_abort_update_v(struct nilfs_bmap *,
-                             union nilfs_bmap_ptr_req *,
+                               union nilfs_bmap_ptr_req *,
-                             union nilfs_bmap_ptr_req *);
+                               union nilfs_bmap_ptr_req *);
 void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int);
 void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int);
-int nilfs_bmap_get_block(const struct nilfs_bmap *, __u64,
-                         struct buffer_head **);
-void nilfs_bmap_put_block(const struct nilfs_bmap *, struct buffer_head *);
-int nilfs_bmap_get_new_block(const struct nilfs_bmap *, __u64,
-                             struct buffer_head **);
-void nilfs_bmap_delete_block(const struct nilfs_bmap *, struct buffer_head *);
 /* Assume that bmap semaphore is locked. */
 static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap)
 {
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 4cc07b2c30e0..7e0b61be212e 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -46,15 +46,18 @@ void nilfs_btnode_cache_init_once(struct address_space *btnc)
        INIT_LIST_HEAD(&btnc->i_mmap_nonlinear);
 }
-static struct address_space_operations def_btnode_aops;
+/*
+ * Address-space operations installed on btnode caches by
+ * nilfs_btnode_cache_init(); only .sync_page is provided.
+ */
+static struct address_space_operations def_btnode_aops = {
+        .sync_page              = block_sync_page,
+};
-void nilfs_btnode_cache_init(struct address_space *btnc)
+/*
+ * nilfs_btnode_cache_init - set up an address_space used as a btnode cache
+ * @btnc: address space to initialize
+ * @bdi: backing device info to attach to @btnc
+ *
+ * Clears host, flags and assoc_mapping, sets the GFP mask to GFP_NOFS,
+ * stores @bdi as the backing device info and installs def_btnode_aops.
+ */
+void nilfs_btnode_cache_init(struct address_space *btnc,
+                             struct backing_dev_info *bdi)
 {
        btnc->host = NULL;  /* can safely set to host inode ? */
        btnc->flags = 0;
        mapping_set_gfp_mask(btnc, GFP_NOFS);
        btnc->assoc_mapping = NULL;
-        btnc->backing_dev_info = &default_backing_dev_info;
+        btnc->backing_dev_info = bdi;
        btnc->a_ops = &def_btnode_aops;
 }
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 35faa86444a7..3e2275172ed6 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -38,7 +38,7 @@ struct nilfs_btnode_chkey_ctxt {
 };
 void nilfs_btnode_cache_init_once(struct address_space *);
-void nilfs_btnode_cache_init(struct address_space *);
+void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
 void nilfs_btnode_cache_clear(struct address_space *);
 int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t,
                              struct buffer_head **, int);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 6b37a2767293..aa412724b64e 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -29,6 +29,7 @@
 #include "btnode.h"
 #include "btree.h"
 #include "alloc.h"
+#include "dat.h"
 /**
 * struct nilfs_btree_path - A path on which B-tree operations are executed
@@ -109,8 +110,7 @@ static void nilfs_btree_clear_path(const struct nilfs_btree *btree,
             level < NILFS_BTREE_LEVEL_MAX;
             level++) {
                if (path[level].bp_bh != NULL) {
-                        nilfs_bmap_put_block(&btree->bt_bmap,
+                        brelse(path[level].bp_bh);
-                                             path[level].bp_bh);
                        path[level].bp_bh = NULL;
                }
                /* sib_bh is released or deleted by prepare or commit
@@ -123,10 +123,29 @@ static void nilfs_btree_clear_path(const struct nilfs_btree *btree,
        }
 }
 /*
 * B-tree node operations
 */
+/*
+ * nilfs_btree_get_block - get the buffer of an existing B-tree node block
+ * @btree: B-tree whose owner's btnode cache is searched
+ * @ptr: pointer identifying the node block
+ * @bhp: receives the buffer head
+ *
+ * Looks the block up through nilfs_btnode_get() on the i_btnode_cache of
+ * the inode info returned by NILFS_BMAP_I() and returns that call's result.
+ * NOTE(review): the final argument 0 presumably means "not a new block"
+ * (nilfs_btree_get_new_block() passes 1) - confirm against
+ * nilfs_btnode_get().
+ */
+static int nilfs_btree_get_block(const struct nilfs_btree *btree, __u64 ptr,
+                                 struct buffer_head **bhp)
+{
+        struct address_space *btnc =
+                &NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
+        return nilfs_btnode_get(btnc, ptr, 0, bhp, 0);
+}
+/*
+ * nilfs_btree_get_new_block - get the buffer for a new B-tree node block
+ * @btree: B-tree whose owner's btnode cache is used
+ * @ptr: pointer assigned to the new node block
+ * @bhp: receives the buffer head
+ *
+ * Same lookup as nilfs_btree_get_block() but with the final argument of
+ * nilfs_btnode_get() set to 1.  When that call returns 0 the buffer is
+ * additionally flagged with set_buffer_nilfs_volatile().  Returns the
+ * result of nilfs_btnode_get().
+ */
+static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
+                                     __u64 ptr, struct buffer_head **bhp)
+{
+        struct address_space *btnc =
+                &NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
+        int ret;
+        ret = nilfs_btnode_get(btnc, ptr, 0, bhp, 1);
+        if (!ret)
+                set_buffer_nilfs_volatile(*bhp);
+        return ret;
+}
 static inline int
 nilfs_btree_node_get_flags(const struct nilfs_btree *btree,
@@ -488,8 +507,7 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
        path[level].bp_index = index;
        for (level--; level >= minlevel; level--) {
-                ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr,
+                ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
-                                           &path[level].bp_bh);
                if (ret < 0)
                        return ret;
                node = nilfs_btree_get_nonroot_node(btree, path, level);
@@ -535,8 +553,7 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
        path[level].bp_index = index;
        for (level--; level > 0; level--) {
-                ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr,
+                ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
-                                           &path[level].bp_bh);
                if (ret < 0)
                        return ret;
                node = nilfs_btree_get_nonroot_node(btree, path, level);
@@ -579,6 +596,87 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
        return ret;
 }
+/*
+ * nilfs_btree_lookup_contig - look up @key and measure the contiguous run
+ * @bmap: bmap, accessed here as a struct nilfs_btree
+ * @key: first key to look up
+ * @ptrp: receives the pointer found for @key (after DAT translation when
+ *        the bmap uses virtual block numbers)
+ * @maxblocks: the scan stops once this many blocks have been counted
+ *
+ * Starting from the entry found for @key, counts how many successive keys
+ * (@key, @key + 1, ...) map to successive pointers.
+ *
+ * Returns the number of blocks counted (at least 1) on success, -ENOMEM if
+ * no path could be allocated, or the negative value returned by the failed
+ * lookup, DAT translation or node read.  *@ptrp is written only on success.
+ *
+ * NOTE(review): cnt starts at 1 and is only tested for equality with
+ * @maxblocks, so maxblocks == 0 does not act as a limit - confirm that
+ * callers never pass 0.
+ */
+static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
+                                     __u64 key, __u64 *ptrp, unsigned maxblocks)
+{
+        struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
+        struct nilfs_btree_path *path;
+        struct nilfs_btree_node *node;
+        struct inode *dat = NULL;
+        __u64 ptr, ptr2;
+        sector_t blocknr;
+        int level = NILFS_BTREE_LEVEL_NODE_MIN;
+        int ret, cnt, index, maxlevel;
+        path = nilfs_btree_alloc_path(btree);
+        if (path == NULL)
+                return -ENOMEM;
+        nilfs_btree_init_path(btree, path);
+        ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
+        if (ret < 0)
+                goto out;
+        /* dat stays NULL unless pointers are virtual and need translating */
+        if (NILFS_BMAP_USE_VBN(bmap)) {
+                dat = nilfs_bmap_get_dat(bmap);
+                ret = nilfs_dat_translate(dat, ptr, &blocknr);
+                if (ret < 0)
+                        goto out;
+                ptr = blocknr;
+        }
+        /* the block found for @key itself */
+        cnt = 1;
+        if (cnt == maxblocks)
+                goto end;
+        maxlevel = nilfs_btree_height(btree) - 1;
+        node = nilfs_btree_get_node(btree, path, level);
+        index = path[level].bp_index + 1;
+        for (;;) {
+                /* walk the remaining entries of the current node */
+                while (index < nilfs_btree_node_get_nchildren(btree, node)) {
+                        if (nilfs_btree_node_get_key(btree, node, index) !=
+                            key + cnt)
+                                goto end;
+                        ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
+                        if (dat) {
+                                ret = nilfs_dat_translate(dat, ptr2, &blocknr);
+                                if (ret < 0)
+                                        goto out;
+                                ptr2 = blocknr;
+                        }
+                        /* stop at a pointer gap or once maxblocks is reached */
+                        if (ptr2 != ptr + cnt || ++cnt == maxblocks)
+                                goto end;
+                        index++;
+                        continue;
+                }
+                /* node exhausted; give up if this level is height - 1 */
+                if (level == maxlevel)
+                        break;
+                /* look-up right sibling node */
+                node = nilfs_btree_get_node(btree, path, level + 1);
+                index = path[level + 1].bp_index + 1;
+                /* go on only if the parent's next child starts at key + cnt */
+                if (index >= nilfs_btree_node_get_nchildren(btree, node) ||
+                    nilfs_btree_node_get_key(btree, node, index) != key + cnt)
+                        break;
+                ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
+                path[level + 1].bp_index = index;
+                /* replace the buffer held at this level with the sibling's */
+                brelse(path[level].bp_bh);
+                path[level].bp_bh = NULL;
+                ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh);
+                if (ret < 0)
+                        goto out;
+                node = nilfs_btree_get_nonroot_node(btree, path, level);
+                index = 0;
+                path[level].bp_index = index;
+        }
+ end:
+        *ptrp = ptr;
+        ret = cnt;
+ out:
+        nilfs_btree_clear_path(btree, path);
+        nilfs_btree_free_path(btree, path);
+        return ret;
+}
 static void nilfs_btree_promote_key(struct nilfs_btree *btree,
                                    struct nilfs_btree_path *path,
                                    int level, __u64 key)
@@ -669,13 +767,13 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
                                nilfs_btree_node_get_key(btree, node, 0));
        if (move) {
-                nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
+                brelse(path[level].bp_bh);
                path[level].bp_bh = path[level].bp_sib_bh;
                path[level].bp_sib_bh = NULL;
                path[level].bp_index += lnchildren;
                path[level + 1].bp_index--;
        } else {
-                nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+                brelse(path[level].bp_sib_bh);
                path[level].bp_sib_bh = NULL;
                path[level].bp_index -= n;
        }
@@ -722,14 +820,14 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
        path[level + 1].bp_index--;
        if (move) {
-                nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
+                brelse(path[level].bp_bh);
                path[level].bp_bh = path[level].bp_sib_bh;
                path[level].bp_sib_bh = NULL;
                path[level].bp_index -=
                        nilfs_btree_node_get_nchildren(btree, node);
                path[level + 1].bp_index++;
        } else {
-                nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+                brelse(path[level].bp_sib_bh);
                path[level].bp_sib_bh = NULL;
        }
@@ -781,7 +879,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
                *keyp = nilfs_btree_node_get_key(btree, right, 0);
                *ptrp = path[level].bp_newreq.bpr_ptr;
-                nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
+                brelse(path[level].bp_bh);
                path[level].bp_bh = path[level].bp_sib_bh;
                path[level].bp_sib_bh = NULL;
        } else {
@@ -790,7 +888,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
                *keyp = nilfs_btree_node_get_key(btree, right, 0);
                *ptrp = path[level].bp_newreq.bpr_ptr;
-                nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+                brelse(path[level].bp_sib_bh);
                path[level].bp_sib_bh = NULL;
        }
@@ -897,12 +995,12 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
        level = NILFS_BTREE_LEVEL_DATA;
        /* allocate a new ptr for data block */
-        if (btree->bt_ops->btop_find_target != NULL)
+        if (NILFS_BMAP_USE_VBN(&btree->bt_bmap))
                path[level].bp_newreq.bpr_ptr =
-                        btree->bt_ops->btop_find_target(btree, path, key);
+                        nilfs_btree_find_target_v(btree, path, key);
-        ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
+        ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
-                &btree->bt_bmap, &path[level].bp_newreq);
+                                           &path[level].bp_newreq);
        if (ret < 0)
                goto err_out_data;
@@ -924,8 +1022,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
                if (pindex > 0) {
                        sibptr = nilfs_btree_node_get_ptr(btree, parent,
                                                          pindex - 1);
-                        ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
+                        ret = nilfs_btree_get_block(btree, sibptr, &bh);
-                                                   &bh);
                        if (ret < 0)
                                goto err_out_child_node;
                        sib = (struct nilfs_btree_node *)bh->b_data;
@@ -936,7 +1033,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
                                stats->bs_nblocks++;
                                goto out;
                        } else
-                                nilfs_bmap_put_block(&btree->bt_bmap, bh);
+                                brelse(bh);
                }
                /* right sibling */
@@ -944,8 +1041,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
                    nilfs_btree_node_get_nchildren(btree, parent) - 1) {
                        sibptr = nilfs_btree_node_get_ptr(btree, parent,
                                                          pindex + 1);
-                        ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
+                        ret = nilfs_btree_get_block(btree, sibptr, &bh);
-                                                   &bh);
                        if (ret < 0)
                                goto err_out_child_node;
                        sib = (struct nilfs_btree_node *)bh->b_data;
@@ -956,19 +1052,19 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
                                stats->bs_nblocks++;
                                goto out;
                        } else
-                                nilfs_bmap_put_block(&btree->bt_bmap, bh);
+                                brelse(bh);
                }
                /* split */
                path[level].bp_newreq.bpr_ptr =
                        path[level - 1].bp_newreq.bpr_ptr + 1;
-                ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
+                ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
-                        &btree->bt_bmap, &path[level].bp_newreq);
+                                                   &path[level].bp_newreq);
                if (ret < 0)
                        goto err_out_child_node;
-                ret = nilfs_bmap_get_new_block(&btree->bt_bmap,
+                ret = nilfs_btree_get_new_block(btree,
-                                               path[level].bp_newreq.bpr_ptr,
+                                                path[level].bp_newreq.bpr_ptr,
-                                               &bh);
+                                                &bh);
                if (ret < 0)
                        goto err_out_curr_node;
@@ -994,12 +1090,12 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
        /* grow */
        path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
-        ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
+        ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
-                &btree->bt_bmap, &path[level].bp_newreq);
+                                           &path[level].bp_newreq);
        if (ret < 0)
                goto err_out_child_node;
-        ret = nilfs_bmap_get_new_block(&btree->bt_bmap,
+        ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr,
-                                       path[level].bp_newreq.bpr_ptr, &bh);
+                                        &bh);
        if (ret < 0)
                goto err_out_curr_node;
@@ -1023,18 +1119,16 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
        /* error */
 err_out_curr_node:
-        btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap,
+        nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq);
-                                                    &path[level].bp_newreq);
 err_out_child_node:
        for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
-                nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh);
+                nilfs_btnode_delete(path[level].bp_sib_bh);
-                btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(
+                nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap,
-                        &btree->bt_bmap, &path[level].bp_newreq);
+                                           &path[level].bp_newreq);
        }
-        btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap,
+        nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq);
-                                                       &path[level].bp_newreq);
 err_out_data:
        *levelp = level;
        stats->bs_nblocks = 0;
@@ -1049,14 +1143,12 @@ static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
        set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
        ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
-        if (btree->bt_ops->btop_set_target != NULL)
+        if (NILFS_BMAP_USE_VBN(&btree->bt_bmap))
-                btree->bt_ops->btop_set_target(btree, key, ptr);
+                nilfs_btree_set_target_v(btree, key, ptr);
        for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
-                if (btree->bt_bmap.b_pops->bpop_commit_alloc_ptr != NULL) {
+                nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap,
-                        btree->bt_bmap.b_pops->bpop_commit_alloc_ptr(
+                                            &path[level - 1].bp_newreq);
-                                &btree->bt_bmap, &path[level - 1].bp_newreq);
-                }
                path[level].bp_op(btree, path, level, &key, &ptr);
        }
@@ -1153,7 +1245,7 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
        nilfs_btree_promote_key(btree, path, level + 1,
                                nilfs_btree_node_get_key(btree, node, 0));
-        nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+        brelse(path[level].bp_sib_bh);
        path[level].bp_sib_bh = NULL;
        path[level].bp_index += n;
 }
@@ -1192,7 +1284,7 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
                                nilfs_btree_node_get_key(btree, right, 0));
        path[level + 1].bp_index--;
-        nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+        brelse(path[level].bp_sib_bh);
        path[level].bp_sib_bh = NULL;
 }
@@ -1221,7 +1313,7 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
        unlock_buffer(path[level].bp_bh);
        unlock_buffer(path[level].bp_sib_bh);
-        nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh);
+        nilfs_btnode_delete(path[level].bp_bh);
        path[level].bp_bh = path[level].bp_sib_bh;
        path[level].bp_sib_bh = NULL;
        path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left);
@@ -1252,7 +1344,7 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree,
        unlock_buffer(path[level].bp_bh);
        unlock_buffer(path[level].bp_sib_bh);
-        nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh);
+        nilfs_btnode_delete(path[level].bp_sib_bh);
        path[level].bp_sib_bh = NULL;
        path[level + 1].bp_index++;
 }
@@ -1276,7 +1368,7 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
        nilfs_btree_node_move_left(btree, root, child, n);
        unlock_buffer(path[level].bp_bh);
-        nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh);
+        nilfs_btnode_delete(path[level].bp_bh);
        path[level].bp_bh = NULL;
 }
@@ -1300,12 +1392,10 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
                path[level].bp_oldreq.bpr_ptr =
                        nilfs_btree_node_get_ptr(btree, node,
                                                 path[level].bp_index);
-                if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
+                ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
-                        ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr(
+                                                 &path[level].bp_oldreq);
-                                &btree->bt_bmap, &path[level].bp_oldreq);
+                if (ret < 0)
-                        if (ret < 0)
+                        goto err_out_child_node;
-                                goto err_out_child_node;
-                }
                if (nilfs_btree_node_get_nchildren(btree, node) >
                    nilfs_btree_node_nchildren_min(btree, node)) {
@@ -1321,8 +1411,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
                        /* left sibling */
                        sibptr = nilfs_btree_node_get_ptr(btree, parent,
                                                          pindex - 1);
-                        ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
+                        ret = nilfs_btree_get_block(btree, sibptr, &bh);
-                                                   &bh);
                        if (ret < 0)
                                goto err_out_curr_node;
                        sib = (struct nilfs_btree_node *)bh->b_data;
@@ -1343,8 +1432,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
                        /* right sibling */
                        sibptr = nilfs_btree_node_get_ptr(btree, parent,
                                                          pindex + 1);
-                        ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
+                        ret = nilfs_btree_get_block(btree, sibptr, &bh);
-                                                   &bh);
                        if (ret < 0)
                                goto err_out_curr_node;
                        sib = (struct nilfs_btree_node *)bh->b_data;
@@ -1381,12 +1469,12 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
        node = nilfs_btree_get_root(btree);
        path[level].bp_oldreq.bpr_ptr =
                nilfs_btree_node_get_ptr(btree, node, path[level].bp_index);
-        if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
-                ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr(
+        ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
-                        &btree->bt_bmap, &path[level].bp_oldreq);
+                                         &path[level].bp_oldreq);
-                if (ret < 0)
+        if (ret < 0)
-                        goto err_out_child_node;
+                goto err_out_child_node;
-        }
        /* child of the root node is deleted */
        path[level].bp_op = nilfs_btree_do_delete;
        stats->bs_nblocks++;
@@ -1398,15 +1486,12 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
        /* error */
 err_out_curr_node:
-        if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL)
+        nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq);
-                btree->bt_bmap.b_pops->bpop_abort_end_ptr(
-                        &btree->bt_bmap, &path[level].bp_oldreq);
 err_out_child_node:
        for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
-                nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+                brelse(path[level].bp_sib_bh);
-                if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL)
+                nilfs_bmap_abort_end_ptr(&btree->bt_bmap,
-                        btree->bt_bmap.b_pops->bpop_abort_end_ptr(
+                                         &path[level].bp_oldreq);
-                                &btree->bt_bmap, &path[level].bp_oldreq);
        }
        *levelp = level;
        stats->bs_nblocks = 0;
@@ -1420,9 +1505,8 @@ static void nilfs_btree_commit_delete(struct nilfs_btree *btree,
        int level;
        for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
-                if (btree->bt_bmap.b_pops->bpop_commit_end_ptr != NULL)
+                nilfs_bmap_commit_end_ptr(&btree->bt_bmap,
-                        btree->bt_bmap.b_pops->bpop_commit_end_ptr(
+                                          &path[level].bp_oldreq);
-                                &btree->bt_bmap, &path[level].bp_oldreq);
                path[level].bp_op(btree, path, level, NULL, NULL);
        }
@@ -1501,7 +1585,7 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
                if (nchildren > 1)
                        return 0;
                ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
-                ret = nilfs_bmap_get_block(bmap, ptr, &bh);
+                ret = nilfs_btree_get_block(btree, ptr, &bh);
                if (ret < 0)
                        return ret;
                node = (struct nilfs_btree_node *)bh->b_data;
@@ -1515,9 +1599,9 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
        nextmaxkey = (nchildren > 1) ?
                nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0;
        if (bh != NULL)
-                nilfs_bmap_put_block(bmap, bh);
+                brelse(bh);
-        return (maxkey == key) && (nextmaxkey < bmap->b_low);
+        return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW);
 }
 static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
@@ -1542,7 +1626,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
                nchildren = nilfs_btree_node_get_nchildren(btree, root);
                WARN_ON(nchildren > 1);
                ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
-                ret = nilfs_bmap_get_block(bmap, ptr, &bh);
+                ret = nilfs_btree_get_block(btree, ptr, &bh);
                if (ret < 0)
                        return ret;
                node = (struct nilfs_btree_node *)bh->b_data;
@@ -1563,7 +1647,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
        }
        if (bh != NULL)
-                nilfs_bmap_put_block(bmap, bh);
+                brelse(bh);
        return nitems;
 }
@@ -1584,10 +1668,10 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
        /* for data */
        /* cannot find near ptr */
-        if (btree->bt_ops->btop_find_target != NULL)
+        if (NILFS_BMAP_USE_VBN(bmap))
-                dreq->bpr_ptr
+                dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key);
-                        = btree->bt_ops->btop_find_target(btree, NULL, key);
-        ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, dreq);
+        ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq);
        if (ret < 0)
                return ret;
@@ -1595,11 +1679,11 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
        stats->bs_nblocks++;
        if (nreq != NULL) {
                nreq->bpr_ptr = dreq->bpr_ptr + 1;
-                ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, nreq);
+                ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq);
                if (ret < 0)
                        goto err_out_dreq;
-                ret = nilfs_bmap_get_new_block(bmap, nreq->bpr_ptr, &bh);
+                ret = nilfs_btree_get_new_block(btree, nreq->bpr_ptr, &bh);
                if (ret < 0)
                        goto err_out_nreq;
@@ -1612,9 +1696,9 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
        /* error */
 err_out_nreq:
-        bmap->b_pops->bpop_abort_alloc_ptr(bmap, nreq);
+        nilfs_bmap_abort_alloc_ptr(bmap, nreq);
 err_out_dreq:
-        bmap->b_pops->bpop_abort_alloc_ptr(bmap, dreq);
+        nilfs_bmap_abort_alloc_ptr(bmap, dreq);
        stats->bs_nblocks = 0;
        return ret;
@@ -1624,7 +1708,7 @@ static void
 nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
                                      __u64 key, __u64 ptr,
                                      const __u64 *keys, const __u64 *ptrs,
-                                      int n, __u64 low, __u64 high,
+                                      int n,
                                      union nilfs_bmap_ptr_req *dreq,
                                      union nilfs_bmap_ptr_req *nreq,
                                      struct buffer_head *bh)
@@ -1642,12 +1726,10 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
        /* convert and insert */
        btree = (struct nilfs_btree *)bmap;
-        nilfs_btree_init(bmap, low, high);
+        nilfs_btree_init(bmap);
        if (nreq != NULL) {
-                if (bmap->b_pops->bpop_commit_alloc_ptr != NULL) {
+                nilfs_bmap_commit_alloc_ptr(bmap, dreq);
-                        bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq);
+                nilfs_bmap_commit_alloc_ptr(bmap, nreq);
-                        bmap->b_pops->bpop_commit_alloc_ptr(bmap, nreq);
-                }
                /* create child node at level 1 */
                lock_buffer(bh);
@@ -1661,7 +1743,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
                        nilfs_bmap_set_dirty(bmap);
                unlock_buffer(bh);
-                nilfs_bmap_put_block(bmap, bh);
+                brelse(bh);
                /* create root node at level 2 */
                node = nilfs_btree_get_root(btree);
@@ -1669,8 +1751,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
                nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
                                      2, 1, &keys[0], &tmpptr);
        } else {
-                if (bmap->b_pops->bpop_commit_alloc_ptr != NULL)
+                nilfs_bmap_commit_alloc_ptr(bmap, dreq);
-                        bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq);
                /* create root node at level 1 */
                node = nilfs_btree_get_root(btree);
@@ -1682,8 +1763,8 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
                        nilfs_bmap_set_dirty(bmap);
        }
-        if (btree->bt_ops->btop_set_target != NULL)
+        if (NILFS_BMAP_USE_VBN(bmap))
-                btree->bt_ops->btop_set_target(btree, key, dreq->bpr_ptr);
+                nilfs_btree_set_target_v(btree, key, dreq->bpr_ptr);
 }
 /**
@@ -1694,13 +1775,10 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
 * @keys:
 * @ptrs:
 * @n:
- * @low:
- * @high:
 */
 int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
                                   __u64 key, __u64 ptr,
-                                   const __u64 *keys, const __u64 *ptrs,
+                                   const __u64 *keys, const __u64 *ptrs, int n)
-                                   int n, __u64 low, __u64 high)
 {
        struct buffer_head *bh;
        union nilfs_bmap_ptr_req dreq, nreq, *di, *ni;
@@ -1725,7 +1803,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
        if (ret < 0)
                return ret;
        nilfs_btree_commit_convert_and_insert(bmap, key, ptr, keys, ptrs, n,
-                                              low, high, di, ni, bh);
+                                              di, ni, bh);
        nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
        return 0;
 }
@@ -1754,9 +1832,9 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
                nilfs_btree_node_get_ptr(btree, parent,
                                         path[level + 1].bp_index);
        path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
-        ret = nilfs_bmap_prepare_update(&btree->bt_bmap,
+        ret = nilfs_bmap_prepare_update_v(&btree->bt_bmap,
-                                        &path[level].bp_oldreq,
+                                          &path[level].bp_oldreq,
-                                        &path[level].bp_newreq);
+                                          &path[level].bp_newreq);
        if (ret < 0)
                return ret;
@@ -1768,9 +1846,9 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
                        &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
                        &path[level].bp_ctxt);
                if (ret < 0) {
-                        nilfs_bmap_abort_update(&btree->bt_bmap,
+                        nilfs_bmap_abort_update_v(&btree->bt_bmap,
-                                                &path[level].bp_oldreq,
+                                                  &path[level].bp_oldreq,
-                                                &path[level].bp_newreq);
+                                                  &path[level].bp_newreq);
                        return ret;
                }
        }
@@ -1784,9 +1862,9 @@ static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
 {
        struct nilfs_btree_node *parent;
-        nilfs_bmap_commit_update(&btree->bt_bmap,
+        nilfs_bmap_commit_update_v(&btree->bt_bmap,
-                                 &path[level].bp_oldreq,
+                                   &path[level].bp_oldreq,
-                                 &path[level].bp_newreq);
+                                   &path[level].bp_newreq);
        if (buffer_nilfs_node(path[level].bp_bh)) {
                nilfs_btnode_commit_change_key(
@@ -1805,9 +1883,9 @@ static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
                                       struct nilfs_btree_path *path,
                                       int level)
 {
-        nilfs_bmap_abort_update(&btree->bt_bmap,
+        nilfs_bmap_abort_update_v(&btree->bt_bmap,
-                                &path[level].bp_oldreq,
+                                  &path[level].bp_oldreq,
-                                &path[level].bp_newreq);
+                                  &path[level].bp_newreq);
        if (buffer_nilfs_node(path[level].bp_bh))
                nilfs_btnode_abort_change_key(
                        &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
@@ -1930,7 +2008,9 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
                goto out;
        }
-        ret = btree->bt_ops->btop_propagate(btree, path, level, bh);
+        ret = NILFS_BMAP_USE_VBN(bmap) ?
+                nilfs_btree_propagate_v(btree, path, level, bh) :
+                nilfs_btree_propagate_p(btree, path, level, bh);
 out:
        nilfs_btree_clear_path(btree, path);
@@ -2066,12 +2146,9 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
        ptr = nilfs_btree_node_get_ptr(btree, parent,
                                       path[level + 1].bp_index);
        req.bpr_ptr = ptr;
-        ret = btree->bt_bmap.b_pops->bpop_prepare_start_ptr(&btree->bt_bmap,
+        ret = nilfs_bmap_start_v(&btree->bt_bmap, &req, blocknr);
-                                                               &req);
+        if (unlikely(ret < 0))
-        if (ret < 0)
                return ret;
-        btree->bt_bmap.b_pops->bpop_commit_start_ptr(&btree->bt_bmap,
-                                                        &req, blocknr);
        key = nilfs_btree_node_get_key(btree, parent,
                                       path[level + 1].bp_index);
@@ -2114,8 +2191,9 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
                goto out;
        }
-        ret = btree->bt_ops->btop_assign(btree, path, level, bh,
+        ret = NILFS_BMAP_USE_VBN(bmap) ?
-                                            blocknr, binfo);
+                nilfs_btree_assign_v(btree, path, level, bh, blocknr, binfo) :
+                nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
 out:
        nilfs_btree_clear_path(btree, path);
@@ -2171,7 +2249,7 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
                WARN_ON(ret == -ENOENT);
                goto out;
        }
-        ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr, &bh);
+        ret = nilfs_btree_get_block(btree, ptr, &bh);
        if (ret < 0) {
                WARN_ON(ret == -ENOENT);
                goto out;
@@ -2179,7 +2257,7 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
        if (!buffer_dirty(bh))
                nilfs_btnode_mark_dirty(bh);
-        nilfs_bmap_put_block(&btree->bt_bmap, bh);
+        brelse(bh);
        if (!nilfs_bmap_dirty(&btree->bt_bmap))
                nilfs_bmap_set_dirty(&btree->bt_bmap);
@@ -2191,6 +2269,7 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
 static const struct nilfs_bmap_operations nilfs_btree_ops = {
        .bop_lookup             =       nilfs_btree_lookup,
+        .bop_lookup_contig      =       nilfs_btree_lookup_contig,
        .bop_insert             =       nilfs_btree_insert,
        .bop_delete             =       nilfs_btree_delete,
        .bop_clear              =       NULL,
@@ -2210,6 +2289,7 @@ static const struct nilfs_bmap_operations nilfs_btree_ops = {
 static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
        .bop_lookup             =       NULL,
+        .bop_lookup_contig      =       NULL,
        .bop_insert             =       NULL,
        .bop_delete             =       NULL,
        .bop_clear              =       NULL,
@@ -2227,43 +2307,13 @@ static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
        .bop_gather_data        =       NULL,
 };
-static const struct nilfs_btree_operations nilfs_btree_ops_v = {
+int nilfs_btree_init(struct nilfs_bmap *bmap)
-        .btop_find_target       =       nilfs_btree_find_target_v,
-        .btop_set_target        =       nilfs_btree_set_target_v,
-        .btop_propagate         =       nilfs_btree_propagate_v,
-        .btop_assign            =       nilfs_btree_assign_v,
-};
-static const struct nilfs_btree_operations nilfs_btree_ops_p = {
-        .btop_find_target       =       NULL,
-        .btop_set_target        =       NULL,
-        .btop_propagate         =       nilfs_btree_propagate_p,
-        .btop_assign            =       nilfs_btree_assign_p,
-};
-int nilfs_btree_init(struct nilfs_bmap *bmap, __u64 low, __u64 high)
 {
-        struct nilfs_btree *btree;
-        btree = (struct nilfs_btree *)bmap;
        bmap->b_ops = &nilfs_btree_ops;
-        bmap->b_low = low;
-        bmap->b_high = high;
-        switch (bmap->b_inode->i_ino) {
-        case NILFS_DAT_INO:
-                btree->bt_ops = &nilfs_btree_ops_p;
-                break;
-        default:
-                btree->bt_ops = &nilfs_btree_ops_v;
-                break;
-        }
        return 0;
 }
 void nilfs_btree_init_gc(struct nilfs_bmap *bmap)
 {
-        bmap->b_low = NILFS_BMAP_LARGE_LOW;
-        bmap->b_high = NILFS_BMAP_LARGE_HIGH;
        bmap->b_ops = &nilfs_btree_ops_gc;
 }
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 4766deb52fb1..0e72bbbc6b64 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -34,28 +34,6 @@ struct nilfs_btree;
 struct nilfs_btree_path;
 /**
- * struct nilfs_btree_operations - B-tree operation table
- */
-struct nilfs_btree_operations {
-        __u64 (*btop_find_target)(const struct nilfs_btree *,
-                                  const struct nilfs_btree_path *, __u64);
-        void (*btop_set_target)(struct nilfs_btree *, __u64, __u64);
-        struct the_nilfs *(*btop_get_nilfs)(struct nilfs_btree *);
-        int (*btop_propagate)(struct nilfs_btree *,
-                              struct nilfs_btree_path *,
-                              int,
-                              struct buffer_head *);
-        int (*btop_assign)(struct nilfs_btree *,
-                           struct nilfs_btree_path *,
-                           int,
-                           struct buffer_head **,
-                           sector_t,
-                           union nilfs_binfo *);
-};
-/**
 * struct nilfs_btree_node - B-tree node
 * @bn_flags: flags
 * @bn_level: level
@@ -80,13 +58,9 @@ struct nilfs_btree_node {
 /**
 * struct nilfs_btree - B-tree structure
 * @bt_bmap: bmap base structure
- * @bt_ops: B-tree operation table
 */
 struct nilfs_btree {
        struct nilfs_bmap bt_bmap;
-        /* B-tree-specific members */
-        const struct nilfs_btree_operations *bt_ops;
 };
@@ -108,10 +82,9 @@ struct nilfs_btree {
 int nilfs_btree_path_cache_init(void);
 void nilfs_btree_path_cache_destroy(void);
-int nilfs_btree_init(struct nilfs_bmap *, __u64, __u64);
+int nilfs_btree_init(struct nilfs_bmap *);
 int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64,
-                                   const __u64 *, const __u64 *,
+                                   const __u64 *, const __u64 *, int);
-                                   int, __u64, __u64);
 void nilfs_btree_init_gc(struct nilfs_bmap *);
 #endif  /* _NILFS_BTREE_H */
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 300f1cdfa862..7d49813f66d6 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -295,10 +295,6 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
                return -EINVAL;
        }
-        /* cannot delete the latest checkpoint */
-        if (start == nilfs_mdt_cno(cpfile) - 1)
-                return -EPERM;
        down_write(&NILFS_MDT(cpfile)->mi_sem);
        ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
@@ -384,9 +380,10 @@ static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile,
 }
 static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
-                                          struct nilfs_cpinfo *ci, size_t nci)
+                                          void *buf, unsigned cisz, size_t nci)
 {
        struct nilfs_checkpoint *cp;
+        struct nilfs_cpinfo *ci = buf;
        struct buffer_head *bh;
        size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
        __u64 cur_cno = nilfs_mdt_cno(cpfile), cno = *cnop;
@@ -410,17 +407,22 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
                kaddr = kmap_atomic(bh->b_page, KM_USER0);
                cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
                for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) {
-                        if (!nilfs_checkpoint_invalid(cp))
+                        if (!nilfs_checkpoint_invalid(cp)) {
-                                nilfs_cpfile_checkpoint_to_cpinfo(
+                                nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp,
-                                        cpfile, cp, &ci[n++]);
+                                                                  ci);
+                                ci = (void *)ci + cisz;
+                                n++;
+                        }
                }
                kunmap_atomic(kaddr, KM_USER0);
                brelse(bh);
        }
        ret = n;
-        if (n > 0)
+        if (n > 0) {
-                *cnop = ci[n - 1].ci_cno + 1;
+                ci = (void *)ci - cisz;
+                *cnop = ci->ci_cno + 1;
+        }
 out:
        up_read(&NILFS_MDT(cpfile)->mi_sem);
@@ -428,11 +430,12 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
 }
 static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
-                                          struct nilfs_cpinfo *ci, size_t nci)
+                                          void *buf, unsigned cisz, size_t nci)
 {
        struct buffer_head *bh;
        struct nilfs_cpfile_header *header;
        struct nilfs_checkpoint *cp;
+        struct nilfs_cpinfo *ci = buf;
        __u64 curr = *cnop, next;
        unsigned long curr_blkoff, next_blkoff;
        void *kaddr;
@@ -472,7 +475,9 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
                if (unlikely(nilfs_checkpoint_invalid(cp) ||
                             !nilfs_checkpoint_snapshot(cp)))
                        break;
-                nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, &ci[n++]);
+                nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, ci);
+                ci = (void *)ci + cisz;
+                n++;
                next = le64_to_cpu(cp->cp_snapshot_list.ssl_next);
                if (next == 0)
                        break; /* reach end of the snapshot list */
@@ -511,13 +516,13 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
 */
 ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode,
-                                struct nilfs_cpinfo *ci, size_t nci)
+                                void *buf, unsigned cisz, size_t nci)
 {
        switch (mode) {
        case NILFS_CHECKPOINT:
-                return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, ci, nci);
+                return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, buf, cisz, nci);
        case NILFS_SNAPSHOT:
-                return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, ci, nci);
+                return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, buf, cisz, nci);
        default:
                return -EINVAL;
        }
@@ -533,20 +538,14 @@ int nilfs_cpfile_delete_checkpoint(struct inode *cpfile, __u64 cno)
        struct nilfs_cpinfo ci;
        __u64 tcno = cno;
        ssize_t nci;
-        int ret;
-        nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, 1);
+        nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, sizeof(ci), 1);
        if (nci < 0)
                return nci;
        else if (nci == 0 || ci.ci_cno != cno)
                return -ENOENT;
+        else if (nilfs_cpinfo_snapshot(&ci))
-        /* cannot delete the latest checkpoint nor snapshots */
+                return -EBUSY;
-        ret = nilfs_cpinfo_snapshot(&ci);
-        if (ret < 0)
-                return ret;
-        else if (ret > 0 || cno == nilfs_mdt_cno(cpfile) - 1)
-                return -EPERM;
        return nilfs_cpfile_delete_checkpoints(cpfile, cno, cno + 1);
 }
@@ -864,11 +863,11 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
        case NILFS_CHECKPOINT:
                /*
                 * Check for protecting existing snapshot mounts:
-                 * bd_mount_sem is used to make this operation atomic and
+                 * ns_mount_mutex is used to make this operation atomic and
                 * exclusive with a new mount job.  Though it doesn't cover
                 * umount, it's enough for the purpose.
                 */
-                down(&nilfs->ns_bdev->bd_mount_sem);
+                mutex_lock(&nilfs->ns_mount_mutex);
                if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) {
                        /* Current implementation does not have to protect
                           plain read-only mounts since they are exclusive
@@ -877,7 +876,7 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
                        ret = -EBUSY;
                } else
                        ret = nilfs_cpfile_clear_snapshot(cpfile, cno);
-                up(&nilfs->ns_bdev->bd_mount_sem);
+                mutex_unlock(&nilfs->ns_mount_mutex);
                return ret;
        case NILFS_SNAPSHOT:
                return nilfs_cpfile_set_snapshot(cpfile, cno);
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index 1a8a1008c342..788a45950197 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -39,7 +39,7 @@ int nilfs_cpfile_delete_checkpoint(struct inode *, __u64);
 int nilfs_cpfile_change_cpmode(struct inode *, __u64, int);
 int nilfs_cpfile_is_snapshot(struct inode *, __u64);
 int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
-ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int,
+ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned,
-                                struct nilfs_cpinfo *, size_t);
+                                size_t);
 #endif  /* _NILFS_CPFILE_H */
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index bb8a5818e7f1..0b2710e2d565 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -92,21 +92,6 @@ void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req)
        nilfs_palloc_abort_alloc_entry(dat, req);
 }
-int nilfs_dat_prepare_free(struct inode *dat, struct nilfs_palloc_req *req)
-{
-        int ret;
-        ret = nilfs_palloc_prepare_free_entry(dat, req);
-        if (ret < 0)
-                return ret;
-        ret = nilfs_dat_prepare_entry(dat, req, 0);
-        if (ret < 0) {
-                nilfs_palloc_abort_free_entry(dat, req);
-                return ret;
-        }
-        return 0;
-}
 void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req)
 {
        struct nilfs_dat_entry *entry;
@@ -391,36 +376,37 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
        return ret;
 }
-ssize_t nilfs_dat_get_vinfo(struct inode *dat, struct nilfs_vinfo *vinfo,
+ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz,
                            size_t nvi)
 {
        struct buffer_head *entry_bh;
        struct nilfs_dat_entry *entry;
+        struct nilfs_vinfo *vinfo = buf;
        __u64 first, last;
        void *kaddr;
        unsigned long entries_per_block = NILFS_MDT(dat)->mi_entries_per_block;
        int i, j, n, ret;
        for (i = 0; i < nvi; i += n) {
-                ret = nilfs_palloc_get_entry_block(dat, vinfo[i].vi_vblocknr,
+                ret = nilfs_palloc_get_entry_block(dat, vinfo->vi_vblocknr,
                                                   0, &entry_bh);
                if (ret < 0)
                        return ret;
                kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
                /* last virtual block number in this block */
-                first = vinfo[i].vi_vblocknr;
+                first = vinfo->vi_vblocknr;
                do_div(first, entries_per_block);
                first *= entries_per_block;
                last = first + entries_per_block - 1;
                for (j = i, n = 0;
-                     j < nvi && vinfo[j].vi_vblocknr >= first &&
+                     j < nvi && vinfo->vi_vblocknr >= first &&
-                             vinfo[j].vi_vblocknr <= last;
+                             vinfo->vi_vblocknr <= last;
-                     j++, n++) {
+                     j++, n++, vinfo = (void *)vinfo + visz) {
                        entry = nilfs_palloc_block_get_entry(
-                                dat, vinfo[j].vi_vblocknr, entry_bh, kaddr);
+                                dat, vinfo->vi_vblocknr, entry_bh, kaddr);
-                        vinfo[j].vi_start = le64_to_cpu(entry->de_start);
+                        vinfo->vi_start = le64_to_cpu(entry->de_start);
-                        vinfo[j].vi_end = le64_to_cpu(entry->de_end);
+                        vinfo->vi_end = le64_to_cpu(entry->de_end);
-                        vinfo[j].vi_blocknr = le64_to_cpu(entry->de_blocknr);
+                        vinfo->vi_blocknr = le64_to_cpu(entry->de_blocknr);
                }
                kunmap_atomic(kaddr, KM_USER0);
                brelse(entry_bh);
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index d9560654a4b7..d328b81eead4 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -47,6 +47,6 @@ void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *);
 int nilfs_dat_mark_dirty(struct inode *, __u64);
 int nilfs_dat_freev(struct inode *, __u64 *, size_t);
 int nilfs_dat_move(struct inode *, __u64, sector_t);
-ssize_t nilfs_dat_get_vinfo(struct inode *, struct nilfs_vinfo *, size_t);
+ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t);
 #endif  /* _NILFS_DAT_H */
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index c6379e482781..342d9765df8d 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -25,6 +25,7 @@
 #include "page.h"
 #include "direct.h"
 #include "alloc.h"
+#include "dat.h"
 static inline __le64 *nilfs_direct_dptrs(const struct nilfs_direct *direct)
 {
@@ -62,6 +63,47 @@ static int nilfs_direct_lookup(const struct nilfs_bmap *bmap,
        return 0;
 }
+static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap,
+                                      __u64 key, __u64 *ptrp,
+                                      unsigned maxblocks)
+{
+        struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
+        struct inode *dat = NULL;
+        __u64 ptr, ptr2;
+        sector_t blocknr;
+        int ret, cnt;
+        if (key > NILFS_DIRECT_KEY_MAX ||
+            (ptr = nilfs_direct_get_ptr(direct, key)) ==
+            NILFS_BMAP_INVALID_PTR)
+                return -ENOENT;
+        if (NILFS_BMAP_USE_VBN(bmap)) {
+                dat = nilfs_bmap_get_dat(bmap);
+                ret = nilfs_dat_translate(dat, ptr, &blocknr);
+                if (ret < 0)
+                        return ret;
+                ptr = blocknr;
+        }
+        maxblocks = min_t(unsigned, maxblocks, NILFS_DIRECT_KEY_MAX - key + 1);
+        for (cnt = 1; cnt < maxblocks &&
+                     (ptr2 = nilfs_direct_get_ptr(direct, key + cnt)) !=
+                     NILFS_BMAP_INVALID_PTR;
+             cnt++) {
+                if (dat) {
+                        ret = nilfs_dat_translate(dat, ptr2, &blocknr);
+                        if (ret < 0)
+                                return ret;
+                        ptr2 = blocknr;
+                }
+                if (ptr2 != ptr + cnt)
+                        break;
+        }
+        *ptrp = ptr;
+        return cnt;
+}
 static __u64
 nilfs_direct_find_target_v(const struct nilfs_direct *direct, __u64 key)
 {
@@ -90,10 +132,9 @@ static int nilfs_direct_prepare_insert(struct nilfs_direct *direct,
 {
        int ret;
-        if (direct->d_ops->dop_find_target != NULL)
+        if (NILFS_BMAP_USE_VBN(&direct->d_bmap))
-                req->bpr_ptr = direct->d_ops->dop_find_target(direct, key);
+                req->bpr_ptr = nilfs_direct_find_target_v(direct, key);
-        ret = direct->d_bmap.b_pops->bpop_prepare_alloc_ptr(&direct->d_bmap,
+        ret = nilfs_bmap_prepare_alloc_ptr(&direct->d_bmap, req);
-                                                               req);
        if (ret < 0)
                return ret;
@@ -111,16 +152,14 @@ static void nilfs_direct_commit_insert(struct nilfs_direct *direct,
        bh = (struct buffer_head *)((unsigned long)ptr);
        set_buffer_nilfs_volatile(bh);
-        if (direct->d_bmap.b_pops->bpop_commit_alloc_ptr != NULL)
+        nilfs_bmap_commit_alloc_ptr(&direct->d_bmap, req);
-                direct->d_bmap.b_pops->bpop_commit_alloc_ptr(
-                        &direct->d_bmap, req);
        nilfs_direct_set_ptr(direct, key, req->bpr_ptr);
        if (!nilfs_bmap_dirty(&direct->d_bmap))
                nilfs_bmap_set_dirty(&direct->d_bmap);
-        if (direct->d_ops->dop_set_target != NULL)
+        if (NILFS_BMAP_USE_VBN(&direct->d_bmap))
-                direct->d_ops->dop_set_target(direct, key, req->bpr_ptr);
+                nilfs_direct_set_target_v(direct, key, req->bpr_ptr);
 }
 static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
@@ -152,25 +191,18 @@ static int nilfs_direct_prepare_delete(struct nilfs_direct *direct,
 {
        int ret;
-        if (direct->d_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
+        req->bpr_ptr = nilfs_direct_get_ptr(direct, key);
-                req->bpr_ptr = nilfs_direct_get_ptr(direct, key);
+        ret = nilfs_bmap_prepare_end_ptr(&direct->d_bmap, req);
-                ret = direct->d_bmap.b_pops->bpop_prepare_end_ptr(
+        if (!ret)
-                        &direct->d_bmap, req);
+                stats->bs_nblocks = 1;
-                if (ret < 0)
+        return ret;
-                        return ret;
-        }
-        stats->bs_nblocks = 1;
-        return 0;
 }
 static void nilfs_direct_commit_delete(struct nilfs_direct *direct,
                                       union nilfs_bmap_ptr_req *req,
                                       __u64 key)
 {
-        if (direct->d_bmap.b_pops->bpop_commit_end_ptr != NULL)
+        nilfs_bmap_commit_end_ptr(&direct->d_bmap, req);
-                direct->d_bmap.b_pops->bpop_commit_end_ptr(
-                        &direct->d_bmap, req);
        nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
 }
@@ -244,8 +276,7 @@ static int nilfs_direct_gather_data(struct nilfs_bmap *bmap,
 }
 int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
-                                    __u64 key, __u64 *keys, __u64 *ptrs,
+                                    __u64 key, __u64 *keys, __u64 *ptrs, int n)
-                                    int n, __u64 low, __u64 high)
 {
        struct nilfs_direct *direct;
        __le64 *dptrs;
@@ -275,8 +306,7 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
                        dptrs[i] = NILFS_BMAP_INVALID_PTR;
        }
-        nilfs_direct_init(bmap, low, high);
+        nilfs_direct_init(bmap);
        return 0;
 }
@@ -293,11 +323,11 @@ static int nilfs_direct_propagate_v(struct nilfs_direct *direct,
        if (!buffer_nilfs_volatile(bh)) {
                oldreq.bpr_ptr = ptr;
                newreq.bpr_ptr = ptr;
-                ret = nilfs_bmap_prepare_update(&direct->d_bmap, &oldreq,
+                ret = nilfs_bmap_prepare_update_v(&direct->d_bmap, &oldreq,
-                                                &newreq);
+                                                  &newreq);
                if (ret < 0)
                        return ret;
-                nilfs_bmap_commit_update(&direct->d_bmap, &oldreq, &newreq);
+                nilfs_bmap_commit_update_v(&direct->d_bmap, &oldreq, &newreq);
                set_buffer_nilfs_volatile(bh);
                nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr);
        } else
@@ -309,12 +339,10 @@ static int nilfs_direct_propagate_v(struct nilfs_direct *direct,
 static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
                                  struct buffer_head *bh)
 {
-        struct nilfs_direct *direct;
+        struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
-        direct = (struct nilfs_direct *)bmap;
+        return NILFS_BMAP_USE_VBN(bmap) ?
-        return (direct->d_ops->dop_propagate != NULL) ?
+                nilfs_direct_propagate_v(direct, bh) : 0;
-                direct->d_ops->dop_propagate(direct, bh) :
-                0;
 }
 static int nilfs_direct_assign_v(struct nilfs_direct *direct,
@@ -327,12 +355,9 @@ static int nilfs_direct_assign_v(struct nilfs_direct *direct,
        int ret;
        req.bpr_ptr = ptr;
-        ret = direct->d_bmap.b_pops->bpop_prepare_start_ptr(
+        ret = nilfs_bmap_start_v(&direct->d_bmap, &req, blocknr);
-                &direct->d_bmap, &req);
+        if (unlikely(ret < 0))
-        if (ret < 0)
                return ret;
-        direct->d_bmap.b_pops->bpop_commit_start_ptr(&direct->d_bmap,
-                                                     &req, blocknr);
        binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
        binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
@@ -377,12 +402,14 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap,
                return -EINVAL;
        }
-        return direct->d_ops->dop_assign(direct, key, ptr, bh,
+        return NILFS_BMAP_USE_VBN(bmap) ?
-                                         blocknr, binfo);
+                nilfs_direct_assign_v(direct, key, ptr, bh, blocknr, binfo) :
+                nilfs_direct_assign_p(direct, key, ptr, bh, blocknr, binfo);
 }
 static const struct nilfs_bmap_operations nilfs_direct_ops = {
        .bop_lookup             =       nilfs_direct_lookup,
+        .bop_lookup_contig      =       nilfs_direct_lookup_contig,
        .bop_insert             =       nilfs_direct_insert,
        .bop_delete             =       nilfs_direct_delete,
        .bop_clear              =       NULL,
@@ -401,36 +428,8 @@ static const struct nilfs_bmap_operations nilfs_direct_ops = {
 };
-static const struct nilfs_direct_operations nilfs_direct_ops_v = {
+int nilfs_direct_init(struct nilfs_bmap *bmap)
-        .dop_find_target        =       nilfs_direct_find_target_v,
-        .dop_set_target         =       nilfs_direct_set_target_v,
-        .dop_propagate          =       nilfs_direct_propagate_v,
-        .dop_assign             =       nilfs_direct_assign_v,
-};
-static const struct nilfs_direct_operations nilfs_direct_ops_p = {
-        .dop_find_target        =       NULL,
-        .dop_set_target         =       NULL,
-        .dop_propagate          =       NULL,
-        .dop_assign             =       nilfs_direct_assign_p,
-};
-int nilfs_direct_init(struct nilfs_bmap *bmap, __u64 low, __u64 high)
 {
-        struct nilfs_direct *direct;
-        direct = (struct nilfs_direct *)bmap;
        bmap->b_ops = &nilfs_direct_ops;
-        bmap->b_low = low;
-        bmap->b_high = high;
-        switch (bmap->b_inode->i_ino) {
-        case NILFS_DAT_INO:
-                direct->d_ops = &nilfs_direct_ops_p;
-                break;
-        default:
-                direct->d_ops = &nilfs_direct_ops_v;
-                break;
-        }
        return 0;
 }
diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h
index 45d2c5cda812..a5ffd66e25d0 100644
--- a/fs/nilfs2/direct.h
+++ b/fs/nilfs2/direct.h
@@ -31,18 +31,6 @@
 struct nilfs_direct;
 /**
- * struct nilfs_direct_operations - direct mapping operation table
- */
-struct nilfs_direct_operations {
-        __u64 (*dop_find_target)(const struct nilfs_direct *, __u64);
-        void (*dop_set_target)(struct nilfs_direct *, __u64, __u64);
-        int (*dop_propagate)(struct nilfs_direct *, struct buffer_head *);
-        int (*dop_assign)(struct nilfs_direct *, __u64, __u64,
-                          struct buffer_head **, sector_t,
-                          union nilfs_binfo *);
-};
-/**
 * struct nilfs_direct_node - direct node
 * @dn_flags: flags
 * @dn_pad: padding
@@ -55,13 +43,9 @@ struct nilfs_direct_node {
 /**
 * struct nilfs_direct - direct mapping
 * @d_bmap: bmap structure
- * @d_ops: direct mapping operation table
 */
 struct nilfs_direct {
        struct nilfs_bmap d_bmap;
-        /* direct-mapping-specific members */
-        const struct nilfs_direct_operations *d_ops;
 };
@@ -70,9 +54,9 @@ struct nilfs_direct {
 #define NILFS_DIRECT_KEY_MAX    (NILFS_DIRECT_NBLOCKS - 1)
-int nilfs_direct_init(struct nilfs_bmap *, __u64, __u64);
+int nilfs_direct_init(struct nilfs_bmap *);
 int nilfs_direct_delete_and_convert(struct nilfs_bmap *, __u64, __u64 *,
-                                    __u64 *, int, __u64, __u64);
+                                    __u64 *, int);
 #endif  /* _NILFS_DIRECT_H */
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 19d2102b6a69..1b3c2bb20da9 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -52,8 +52,9 @@
 #include "dat.h"
 #include "ifile.h"
-static struct address_space_operations def_gcinode_aops = {};
+static struct address_space_operations def_gcinode_aops = {
-/* XXX need def_gcinode_iops/fops? */
+        .sync_page              = block_sync_page,
+};
 /*
 * nilfs_gccache_submit_read_data() - add data buffer and submit read request
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 49ab4a49bb4f..2696d6b513b7 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -43,22 +43,23 @@
 *
 * This function does not issue actual read request of the specified data
 * block. It is done by VFS.
- * Bulk read for direct-io is not supported yet. (should be supported)
 */
 int nilfs_get_block(struct inode *inode, sector_t blkoff,
                    struct buffer_head *bh_result, int create)
 {
        struct nilfs_inode_info *ii = NILFS_I(inode);
-        unsigned long blknum = 0;
+        __u64 blknum = 0;
        int err = 0, ret;
        struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode));
+        unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;
-        /* This exclusion control is a workaround; should be revised */
+        down_read(&NILFS_MDT(dat)->mi_sem);
-        down_read(&NILFS_MDT(dat)->mi_sem);     /* XXX */
+        ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks);
-        ret = nilfs_bmap_lookup(ii->i_bmap, (unsigned long)blkoff, &blknum);
+        up_read(&NILFS_MDT(dat)->mi_sem);
-        up_read(&NILFS_MDT(dat)->mi_sem);       /* XXX */
+        if (ret >= 0) { /* found */
-        if (ret == 0) { /* found */
                map_bh(bh_result, inode->i_sb, blknum);
+                if (ret > 0)
+                        bh_result->b_size = (ret << inode->i_blkbits);
                goto out;
        }
        /* data block was not found */
@@ -240,7 +241,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 struct address_space_operations nilfs_aops = {
        .writepage              = nilfs_writepage,
        .readpage               = nilfs_readpage,
-        /* .sync_page           = nilfs_sync_page, */
+        .sync_page              = block_sync_page,
        .writepages             = nilfs_writepages,
        .set_page_dirty         = nilfs_set_page_dirty,
        .readpages              = nilfs_readpages,
@@ -249,6 +250,7 @@ struct address_space_operations nilfs_aops = {
        /* .releasepage         = nilfs_releasepage, */
        .invalidatepage         = block_invalidatepage,
        .direct_IO              = nilfs_direct_IO,
+        .is_partially_uptodate  = block_is_partially_uptodate,
 };
 struct inode *nilfs_new_inode(struct inode *dir, int mode)
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index d6759b92006f..6ea5f872e2de 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -152,7 +152,7 @@ nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
        down_read(&nilfs->ns_segctor_sem);
        ret = nilfs_cpfile_get_cpinfo(nilfs->ns_cpfile, posp, flags, buf,
-                                      nmembs);
+                                      size, nmembs);
        up_read(&nilfs->ns_segctor_sem);
        return ret;
 }
@@ -182,7 +182,8 @@ nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
        int ret;
        down_read(&nilfs->ns_segctor_sem);
-        ret = nilfs_sufile_get_suinfo(nilfs->ns_sufile, *posp, buf, nmembs);
+        ret = nilfs_sufile_get_suinfo(nilfs->ns_sufile, *posp, buf, size,
+                                      nmembs);
        up_read(&nilfs->ns_segctor_sem);
        return ret;
 }
@@ -212,7 +213,7 @@ nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
        int ret;
        down_read(&nilfs->ns_segctor_sem);
-        ret = nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, nmembs);
+        ret = nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, size, nmembs);
        up_read(&nilfs->ns_segctor_sem);
        return ret;
 }
@@ -435,24 +436,6 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
        return nmembs;
 }
-static int nilfs_ioctl_free_segments(struct the_nilfs *nilfs,
-                                     struct nilfs_argv *argv, void *buf)
-{
-        size_t nmembs = argv->v_nmembs;
-        struct nilfs_sb_info *sbi = nilfs->ns_writer;
-        int ret;
-        if (unlikely(!sbi)) {
-                /* never happens because called for a writable mount */
-                WARN_ON(1);
-                return -EROFS;
-        }
-        ret = nilfs_segctor_add_segments_to_be_freed(
-                NILFS_SC(sbi), buf, nmembs);
-        return (ret < 0) ? ret : nmembs;
-}
 int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
                                       struct nilfs_argv *argv, void **kbufs)
 {
@@ -491,14 +474,6 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
                msg = "cannot mark copying blocks dirty";
                goto failed;
        }
-        ret = nilfs_ioctl_free_segments(nilfs, &argv[4], kbufs[4]);
-        if (ret < 0) {
-                /*
-                 * can safely abort because this operation is atomic.
-                 */
-                msg = "cannot set segments to be freed";
-                goto failed;
-        }
        return 0;
 failed:
@@ -615,7 +590,7 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
        if (copy_from_user(&argv, argp, sizeof(argv)))
                return -EFAULT;
-        if (argv.v_size != membsz)
+        if (argv.v_size < membsz)
                return -EINVAL;
        ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), dofunc);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index bb78745a0e30..3d3ddb3f5177 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -430,6 +430,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
 static struct address_space_operations def_mdt_aops = {
        .writepage              = nilfs_mdt_write_page,
+        .sync_page              = block_sync_page,
 };
 static struct inode_operations def_mdt_iops;
@@ -449,7 +450,7 @@ struct inode *
 nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
                     ino_t ino, gfp_t gfp_mask)
 {
-        struct inode *inode = nilfs_alloc_inode(sb);
+        struct inode *inode = nilfs_alloc_inode_common(nilfs);
        if (!inode)
                return NULL;
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index da6fc0bba2e5..edf6a59d9f2a 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -263,6 +263,7 @@ extern void nilfs_dirty_inode(struct inode *);
 extern struct dentry *nilfs_get_parent(struct dentry *);
 /* super.c */
+extern struct inode *nilfs_alloc_inode_common(struct the_nilfs *);
 extern struct inode *nilfs_alloc_inode(struct super_block *);
 extern void nilfs_destroy_inode(struct inode *);
 extern void nilfs_error(struct super_block *, const char *, const char *, ...)
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 57afa9d24061..d80cc71be749 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -28,7 +28,6 @@
 #include "segment.h"
 #include "sufile.h"
 #include "page.h"
-#include "seglist.h"
 #include "segbuf.h"
 /*
@@ -395,6 +394,24 @@ static void dispose_recovery_list(struct list_head *head)
        }
 }
+struct nilfs_segment_entry {
+        struct list_head        list;
+        __u64                   segnum;
+};
+static int nilfs_segment_list_add(struct list_head *head, __u64 segnum)
+{
+        struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS);
+        if (unlikely(!ent))
+                return -ENOMEM;
+        ent->segnum = segnum;
+        INIT_LIST_HEAD(&ent->list);
+        list_add_tail(&ent->list, head);
+        return 0;
+}
 void nilfs_dispose_segment_list(struct list_head *head)
 {
        while (!list_empty(head)) {
@@ -402,7 +419,7 @@ void nilfs_dispose_segment_list(struct list_head *head)
                        = list_entry(head->next,
                                     struct nilfs_segment_entry, list);
                list_del(&ent->list);
-                nilfs_free_segment_entry(ent);
+                kfree(ent);
        }
 }
@@ -431,12 +448,10 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
        if (unlikely(err))
                goto failed;
-        err = -ENOMEM;
        for (i = 1; i < 4; i++) {
-                ent = nilfs_alloc_segment_entry(segnum[i]);
+                err = nilfs_segment_list_add(head, segnum[i]);
-                if (unlikely(!ent))
+                if (unlikely(err))
                        goto failed;
-                list_add_tail(&ent->list, head);
        }
        /*
@@ -450,7 +465,7 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
                                goto failed;
                }
                list_del(&ent->list);
-                nilfs_free_segment_entry(ent);
+                kfree(ent);
        }
        /* Allocate new segments for recovery */
@@ -791,7 +806,6 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
        u64 seg_seq;
        __u64 segnum, nextnum = 0;
        __u64 cno;
-        struct nilfs_segment_entry *ent;
        LIST_HEAD(segments);
        int empty_seg = 0, scan_newer = 0;
        int ret;
@@ -892,12 +906,9 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
                if (empty_seg++)
                        goto super_root_found; /* found a valid super root */
-                ent = nilfs_alloc_segment_entry(segnum);
+                ret = nilfs_segment_list_add(&segments, segnum);
-                if (unlikely(!ent)) {
+                if (unlikely(ret))
-                        ret = -ENOMEM;
                        goto failed;
-                }
-                list_add_tail(&ent->list, &segments);
                seg_seq++;
                segnum = nextnum;
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
index adccd4fc654e..0776ccc2504a 100644
--- a/fs/nilfs2/sb.h
+++ b/fs/nilfs2/sb.h
@@ -60,6 +60,7 @@ struct nilfs_sb_info {
        struct super_block *s_super;    /* reverse pointer to super_block */
        struct the_nilfs *s_nilfs;
        struct list_head s_list;        /* list head for nilfs->ns_supers */
+        atomic_t s_count;               /* reference count */
        /* Segment constructor */
        struct list_head s_dirty_files; /* dirty files list */
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 1e68821b4a9b..9e3fe17bb96b 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -26,7 +26,6 @@
 #include <linux/crc32.h>
 #include "page.h"
 #include "segbuf.h"
-#include "seglist.h"
 static struct kmem_cache *nilfs_segbuf_cachep;
@@ -394,7 +393,7 @@ int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
                 * Last BIO is always sent through the following
                 * submission.
                 */
-                rw |= (1 << BIO_RW_SYNCIO);
+                rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
                res = nilfs_submit_seg_bio(wi, rw);
                if (unlikely(res))
                        goto failed_bio;
diff --git a/fs/nilfs2/seglist.h b/fs/nilfs2/seglist.h
deleted file mode 100644
index d39df9144e99..000000000000
--- a/fs/nilfs2/seglist.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * seglist.h - expediential structure and routines to handle list of segments
- *             (would be removed in a future release)
- *
- * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
- * Written by Ryusuke Konishi <ryusuke@osrg.net>
- *
- */
-#ifndef _NILFS_SEGLIST_H
-#define _NILFS_SEGLIST_H
-#include <linux/fs.h>
-#include <linux/buffer_head.h>
-#include <linux/nilfs2_fs.h>
-#include "sufile.h"
-struct nilfs_segment_entry {
-        __u64                   segnum;
-#define NILFS_SLH_FREED         0x0001  /* The segment was freed provisonally.
-                                           It must be cancelled if
-                                           construction aborted */
-        unsigned                flags;
-        struct list_head        list;
-        struct buffer_head     *bh_su;
-        struct nilfs_segment_usage *raw_su;
-};
-void nilfs_dispose_segment_list(struct list_head *);
-static inline struct nilfs_segment_entry *
-nilfs_alloc_segment_entry(__u64 segnum)
-{
-        struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS);
-        if (likely(ent)) {
-                ent->segnum = segnum;
-                ent->flags = 0;
-                ent->bh_su = NULL;
-                ent->raw_su = NULL;
-                INIT_LIST_HEAD(&ent->list);
-        }
-        return ent;
-}
-static inline int nilfs_open_segment_entry(struct nilfs_segment_entry *ent,
-                                           struct inode *sufile)
-{
-        return nilfs_sufile_get_segment_usage(sufile, ent->segnum,
-                                              &ent->raw_su, &ent->bh_su);
-}
-static inline void nilfs_close_segment_entry(struct nilfs_segment_entry *ent,
-                                             struct inode *sufile)
-{
-        if (!ent->bh_su)
-                return;
-        nilfs_sufile_put_segment_usage(sufile, ent->segnum, ent->bh_su);
-        ent->bh_su = NULL;
-        ent->raw_su = NULL;
-}
-static inline void nilfs_free_segment_entry(struct nilfs_segment_entry *ent)
-{
-        kfree(ent);
-}
-#endif /* _NILFS_SEGLIST_H */
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 22c7f65c2403..aa977549919e 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -39,7 +39,6 @@
 #include "sufile.h"
 #include "cpfile.h"
 #include "ifile.h"
-#include "seglist.h"
 #include "segbuf.h"
@@ -79,7 +78,8 @@ enum {
 /* State flags of collection */
 #define NILFS_CF_NODE           0x0001  /* Collecting node blocks */
 #define NILFS_CF_IFILE_STARTED  0x0002  /* IFILE stage has started */
-#define NILFS_CF_HISTORY_MASK   (NILFS_CF_IFILE_STARTED)
+#define NILFS_CF_SUFREED        0x0004  /* segment usages have been freed */
+#define NILFS_CF_HISTORY_MASK   (NILFS_CF_IFILE_STARTED | NILFS_CF_SUFREED)
 /* Operations depending on the construction mode and file type */
 struct nilfs_sc_operations {
@@ -810,7 +810,7 @@ static int nilfs_segctor_clean(struct nilfs_sc_info *sci)
 {
        return list_empty(&sci->sc_dirty_files) &&
                !test_bit(NILFS_SC_DIRTY, &sci->sc_flags) &&
-                list_empty(&sci->sc_cleaning_segments) &&
+                sci->sc_nfreesegs == 0 &&
                (!nilfs_doing_gc() || list_empty(&sci->sc_gc_inodes));
 }
@@ -1005,44 +1005,6 @@ static void nilfs_drop_collected_inodes(struct list_head *head)
        }
 }
-static void nilfs_segctor_cancel_free_segments(struct nilfs_sc_info *sci,
-                                               struct inode *sufile)
-{
-        struct list_head *head = &sci->sc_cleaning_segments;
-        struct nilfs_segment_entry *ent;
-        int err;
-        list_for_each_entry(ent, head, list) {
-                if (!(ent->flags & NILFS_SLH_FREED))
-                        break;
-                err = nilfs_sufile_cancel_free(sufile, ent->segnum);
-                WARN_ON(err); /* do not happen */
-                ent->flags &= ~NILFS_SLH_FREED;
-        }
-}
-static int nilfs_segctor_prepare_free_segments(struct nilfs_sc_info *sci,
-                                               struct inode *sufile)
-{
-        struct list_head *head = &sci->sc_cleaning_segments;
-        struct nilfs_segment_entry *ent;
-        int err;
-        list_for_each_entry(ent, head, list) {
-                err = nilfs_sufile_free(sufile, ent->segnum);
-                if (unlikely(err))
-                        return err;
-                ent->flags |= NILFS_SLH_FREED;
-        }
-        return 0;
-}
-static void nilfs_segctor_commit_free_segments(struct nilfs_sc_info *sci)
-{
-        nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
-}
 static int nilfs_segctor_apply_buffers(struct nilfs_sc_info *sci,
                                       struct inode *inode,
                                       struct list_head *listp,
@@ -1161,6 +1123,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
        struct the_nilfs *nilfs = sbi->s_nilfs;
        struct list_head *head;
        struct nilfs_inode_info *ii;
+        size_t ndone;
        int err = 0;
        switch (sci->sc_stage.scnt) {
@@ -1250,10 +1213,16 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
                        break;
                sci->sc_stage.scnt++;  /* Fall through */
        case NILFS_ST_SUFILE:
-                err = nilfs_segctor_prepare_free_segments(sci,
+                err = nilfs_sufile_freev(nilfs->ns_sufile, sci->sc_freesegs,
-                                                          nilfs->ns_sufile);
+                                         sci->sc_nfreesegs, &ndone);
-                if (unlikely(err))
+                if (unlikely(err)) {
+                        nilfs_sufile_cancel_freev(nilfs->ns_sufile,
+                                                  sci->sc_freesegs, ndone,
+                                                  NULL);
                        break;
+                }
+                sci->sc_stage.flags |= NILFS_CF_SUFREED;
                err = nilfs_segctor_scan_file(sci, nilfs->ns_sufile,
                                              &nilfs_sc_file_ops);
                if (unlikely(err))
@@ -1486,7 +1455,15 @@ static void nilfs_segctor_end_construction(struct nilfs_sc_info *sci,
 {
        if (unlikely(err)) {
                nilfs_segctor_free_incomplete_segments(sci, nilfs);
-                nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile);
+                if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
+                        int ret;
+                        ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
+                                                        sci->sc_freesegs,
+                                                        sci->sc_nfreesegs,
+                                                        NULL);
+                        WARN_ON(ret); /* do not happen */
+                }
        }
        nilfs_segctor_clear_segment_buffers(sci);
 }
@@ -1585,7 +1562,13 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
                if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
                        break;
-                nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile);
+                if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
+                        err = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
+                                                        sci->sc_freesegs,
+                                                        sci->sc_nfreesegs,
+                                                        NULL);
+                        WARN_ON(err); /* do not happen */
+                }
                nilfs_segctor_clear_segment_buffers(sci);
                err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
@@ -2224,10 +2207,8 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
                nilfs_segctor_complete_write(sci);
                /* Commit segments */
-                if (has_sr) {
+                if (has_sr)
-                        nilfs_segctor_commit_free_segments(sci);
                        nilfs_segctor_clear_metadata_dirty(sci);
-                }
                nilfs_segctor_end_construction(sci, nilfs, 0);
@@ -2301,48 +2282,6 @@ void nilfs_flush_segment(struct super_block *sb, ino_t ino)
                                        /* assign bit 0 to data files */
 }
-int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *sci,
-                                           __u64 *segnum, size_t nsegs)
-{
-        struct nilfs_segment_entry *ent;
-        struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
-        struct inode *sufile = nilfs->ns_sufile;
-        LIST_HEAD(list);
-        __u64 *pnum;
-        size_t i;
-        int err;
-        for (pnum = segnum, i = 0; i < nsegs; pnum++, i++) {
-                ent = nilfs_alloc_segment_entry(*pnum);
-                if (unlikely(!ent)) {
-                        err = -ENOMEM;
-                        goto failed;
-                }
-                list_add_tail(&ent->list, &list);
-                err = nilfs_open_segment_entry(ent, sufile);
-                if (unlikely(err))
-                        goto failed;
-                if (unlikely(!nilfs_segment_usage_dirty(ent->raw_su)))
-                        printk(KERN_WARNING "NILFS: unused segment is "
-                               "requested to be cleaned (segnum=%llu)\n",
-                               (unsigned long long)ent->segnum);
-                nilfs_close_segment_entry(ent, sufile);
-        }
-        list_splice(&list, sci->sc_cleaning_segments.prev);
-        return 0;
- failed:
-        nilfs_dispose_segment_list(&list);
-        return err;
-}
-void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *sci)
-{
-        nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
-}
 struct nilfs_segctor_wait_request {
        wait_queue_t    wq;
        __u32           seq;
@@ -2607,10 +2546,13 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
        err = nilfs_init_gcdat_inode(nilfs);
        if (unlikely(err))
                goto out_unlock;
        err = nilfs_ioctl_prepare_clean_segments(nilfs, argv, kbufs);
        if (unlikely(err))
                goto out_unlock;
+        sci->sc_freesegs = kbufs[4];
+        sci->sc_nfreesegs = argv[4].v_nmembs;
        list_splice_init(&nilfs->ns_gc_inodes, sci->sc_gc_inodes.prev);
        for (;;) {
@@ -2629,6 +2571,8 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
        }
 out_unlock:
+        sci->sc_freesegs = NULL;
+        sci->sc_nfreesegs = 0;
        nilfs_clear_gcdat_inode(nilfs);
        nilfs_transaction_unlock(sbi);
        return err;
@@ -2835,7 +2779,6 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
        INIT_LIST_HEAD(&sci->sc_dirty_files);
        INIT_LIST_HEAD(&sci->sc_segbufs);
        INIT_LIST_HEAD(&sci->sc_gc_inodes);
-        INIT_LIST_HEAD(&sci->sc_cleaning_segments);
        INIT_LIST_HEAD(&sci->sc_copied_buffers);
        sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
@@ -2901,9 +2844,6 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
                nilfs_dispose_list(sbi, &sci->sc_dirty_files, 1);
        }
-        if (!list_empty(&sci->sc_cleaning_segments))
-                nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
        WARN_ON(!list_empty(&sci->sc_segbufs));
        down_write(&sbi->s_nilfs->ns_segctor_sem);
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 476bdd5df5be..0d2a475a741b 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -90,8 +90,9 @@ struct nilfs_segsum_pointer {
 * @sc_nblk_inc: Block count of current generation
 * @sc_dirty_files: List of files to be written
 * @sc_gc_inodes: List of GC inodes having blocks to be written
- * @sc_cleaning_segments: List of segments to be freed through construction
 * @sc_copied_buffers: List of copied buffers (buffer heads) to freeze data
+ * @sc_freesegs: array of segment numbers to be freed
+ * @sc_nfreesegs: number of segments on @sc_freesegs
 * @sc_dsync_inode: inode whose data pages are written for a sync operation
 * @sc_dsync_start: start byte offset of data pages
 * @sc_dsync_end: end byte offset of data pages (inclusive)
@@ -131,9 +132,11 @@ struct nilfs_sc_info {
        struct list_head        sc_dirty_files;
        struct list_head        sc_gc_inodes;
-        struct list_head        sc_cleaning_segments;
        struct list_head        sc_copied_buffers;
+        __u64                  *sc_freesegs;
+        size_t                  sc_nfreesegs;
        struct nilfs_inode_info *sc_dsync_inode;
        loff_t                  sc_dsync_start;
        loff_t                  sc_dsync_end;
@@ -225,10 +228,6 @@ extern void nilfs_flush_segment(struct super_block *, ino_t);
 extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *,
                                void **);
-extern int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *,
-                                                  __u64 *, size_t);
-extern void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *);
 extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *);
 extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
@@ -240,5 +239,6 @@ extern int nilfs_search_super_root(struct the_nilfs *, struct nilfs_sb_info *,
 extern int nilfs_recover_logical_segments(struct the_nilfs *,
                                          struct nilfs_sb_info *,
                                          struct nilfs_recovery_info *);
+extern void nilfs_dispose_segment_list(struct list_head *);
 #endif /* _NILFS_SEGMENT_H */
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 98e68677f045..37994d4a59cc 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -18,6 +18,7 @@
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 *
 * Written by Koji Sato <koji@osrg.net>.
+ * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
 */
 #include <linux/kernel.h>
@@ -108,6 +109,102 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,
        nilfs_mdt_mark_buffer_dirty(header_bh);
 }
+/**
+ * nilfs_sufile_updatev - modify multiple segment usages at a time
+ * @sufile: inode of segment usage file
+ * @segnumv: array of segment numbers
+ * @nsegs: size of @segnumv array
+ * @create: creation flag
+ * @ndone: place to store number of modified segments on @segnumv
+ * @dofunc: primitive operation for the update
+ *
+ * Description: nilfs_sufile_updatev() repeatedly calls @dofunc
+ * against the given array of segments.  The @dofunc is called with
+ * buffers of a header block and the sufile block in which the target
+ * segment usage entry is contained.  If @ndone is given, the number
+ * of successfully modified segments from the head is stored in the
+ * place @ndone points to.
+ *
+ * Return Value: On success, zero is returned.  On error, one of the
+ * following negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - Given segment usage is in hole block (may be returned if
+ *            @create is zero)
+ *
+ * %-EINVAL - Invalid segment usage number
+ */
+int nilfs_sufile_updatev(struct inode *sufile, __u64 *segnumv, size_t nsegs,
+                         int create, size_t *ndone,
+                         void (*dofunc)(struct inode *, __u64,
+                                        struct buffer_head *,
+                                        struct buffer_head *))
+{
+        struct buffer_head *header_bh, *bh;
+        unsigned long blkoff, prev_blkoff;
+        __u64 *seg;
+        size_t nerr = 0, n = 0;
+        int ret = 0;
+        if (unlikely(nsegs == 0))
+                goto out;
+        down_write(&NILFS_MDT(sufile)->mi_sem);
+        for (seg = segnumv; seg < segnumv + nsegs; seg++) {
+                if (unlikely(*seg >= nilfs_sufile_get_nsegments(sufile))) {
+                        printk(KERN_WARNING
+                               "%s: invalid segment number: %llu\n", __func__,
+                               (unsigned long long)*seg);
+                        nerr++;
+                }
+        }
+        if (nerr > 0) {
+                ret = -EINVAL;
+                goto out_sem;
+        }
+        ret = nilfs_sufile_get_header_block(sufile, &header_bh);
+        if (ret < 0)
+                goto out_sem;
+        seg = segnumv;
+        blkoff = nilfs_sufile_get_blkoff(sufile, *seg);
+        ret = nilfs_mdt_get_block(sufile, blkoff, create, NULL, &bh);
+        if (ret < 0)
+                goto out_header;
+        for (;;) {
+                dofunc(sufile, *seg, header_bh, bh);
+                if (++seg >= segnumv + nsegs)
+                        break;
+                prev_blkoff = blkoff;
+                blkoff = nilfs_sufile_get_blkoff(sufile, *seg);
+                if (blkoff == prev_blkoff)
+                        continue;
+                /* get different block */
+                brelse(bh);
+                ret = nilfs_mdt_get_block(sufile, blkoff, create, NULL, &bh);
+                if (unlikely(ret < 0))
+                        goto out_header;
+        }
+        brelse(bh);
+ out_header:
+        n = seg - segnumv;
+        brelse(header_bh);
+ out_sem:
+        up_write(&NILFS_MDT(sufile)->mi_sem);
+ out:
+        if (ndone)
+                *ndone = n;
+        return ret;
+}
 int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create,
                        void (*dofunc)(struct inode *, __u64,
                                       struct buffer_head *,
@@ -490,7 +587,8 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
 * nilfs_sufile_get_suinfo -
 * @sufile: inode of segment usage file
 * @segnum: segment number to start looking
- * @si: array of suinfo
+ * @buf: array of suinfo
+ * @sisz: byte size of suinfo
 * @nsi: size of suinfo array
 *
 * Description:
@@ -502,11 +600,12 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
 *
 * %-ENOMEM - Insufficient amount of memory available.
 */
-ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum,
+ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
-                                struct nilfs_suinfo *si, size_t nsi)
+                                unsigned sisz, size_t nsi)
 {
        struct buffer_head *su_bh;
        struct nilfs_segment_usage *su;
+        struct nilfs_suinfo *si = buf;
        size_t susz = NILFS_MDT(sufile)->mi_entry_size;
        struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs;
        void *kaddr;
@@ -531,20 +630,22 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum,
                        if (ret != -ENOENT)
                                goto out;
                        /* hole */
-                        memset(&si[i], 0, sizeof(struct nilfs_suinfo) * n);
+                        memset(si, 0, sisz * n);
+                        si = (void *)si + sisz * n;
                        continue;
                }
                kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
                su = nilfs_sufile_block_get_segment_usage(
                        sufile, segnum, su_bh, kaddr);
-                for (j = 0; j < n; j++, su = (void *)su + susz) {
+                for (j = 0; j < n;
-                        si[i + j].sui_lastmod = le64_to_cpu(su->su_lastmod);
+                     j++, su = (void *)su + susz, si = (void *)si + sisz) {
-                        si[i + j].sui_nblocks = le32_to_cpu(su->su_nblocks);
+                        si->sui_lastmod = le64_to_cpu(su->su_lastmod);
-                        si[i + j].sui_flags = le32_to_cpu(su->su_flags) &
+                        si->sui_nblocks = le32_to_cpu(su->su_nblocks);
+                        si->sui_flags = le32_to_cpu(su->su_flags) &
                                ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
                        if (nilfs_segment_is_active(nilfs, segnum + j))
-                                si[i + j].sui_flags |=
+                                si->sui_flags |=
                                        (1UL << NILFS_SEGMENT_USAGE_ACTIVE);
                }
                kunmap_atomic(kaddr, KM_USER0);
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index a2e2efd4ade1..a2c4d76c3366 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -43,43 +43,27 @@ void nilfs_sufile_put_segment_usage(struct inode *, __u64,
                                    struct buffer_head *);
 int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
 int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *);
-ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, struct nilfs_suinfo *,
+ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned,
                                size_t);
+int nilfs_sufile_updatev(struct inode *, __u64 *, size_t, int, size_t *,
+                         void (*dofunc)(struct inode *, __u64,
+                                        struct buffer_head *,
+                                        struct buffer_head *));
 int nilfs_sufile_update(struct inode *, __u64, int,
                        void (*dofunc)(struct inode *, __u64,
                                       struct buffer_head *,
                                       struct buffer_head *));
-void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
-                                 struct buffer_head *);
 void nilfs_sufile_do_scrap(struct inode *, __u64, struct buffer_head *,
                           struct buffer_head *);
 void nilfs_sufile_do_free(struct inode *, __u64, struct buffer_head *,
                          struct buffer_head *);
+void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
+                                 struct buffer_head *);
 void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
                               struct buffer_head *);
 /**
- * nilfs_sufile_cancel_free -
- * @sufile: inode of segment usage file
- * @segnum: segment number
- *
- * Description:
- *
- * Return Value: On success, 0 is returned. On error, one of the following
- * negative error codes is returned.
- *
- * %-EIO - I/O error.
- *
- * %-ENOMEM - Insufficient amount of memory available.
- */
-static inline int nilfs_sufile_cancel_free(struct inode *sufile, __u64 segnum)
-{
-        return nilfs_sufile_update(sufile, segnum, 0,
-                                   nilfs_sufile_do_cancel_free);
-}
-/**
 * nilfs_sufile_scrap - make a segment garbage
 * @sufile: inode of segment usage file
 * @segnum: segment number to be freed
@@ -100,6 +84,38 @@ static inline int nilfs_sufile_free(struct inode *sufile, __u64 segnum)
 }
 /**
+ * nilfs_sufile_freev - free segments
+ * @sufile: inode of segment usage file
+ * @segnumv: array of segment numbers
+ * @nsegs: size of @segnumv array
+ * @ndone: place to store the number of freed segments
+ */
+static inline int nilfs_sufile_freev(struct inode *sufile, __u64 *segnumv,
+                                     size_t nsegs, size_t *ndone)
+{
+        return nilfs_sufile_updatev(sufile, segnumv, nsegs, 0, ndone,
+                                    nilfs_sufile_do_free);
+}
+/**
+ * nilfs_sufile_cancel_freev - reallocate freeing segments
+ * @sufile: inode of segment usage file
+ * @segnumv: array of segment numbers
+ * @nsegs: size of @segnumv array
+ * @ndone: place to store the number of cancelled segments
+ *
+ * Return Value: On success, 0 is returned. On error, a negative error code
+ * is returned.
+ */
+static inline int nilfs_sufile_cancel_freev(struct inode *sufile,
+                                            __u64 *segnumv, size_t nsegs,
+                                            size_t *ndone)
+{
+        return nilfs_sufile_updatev(sufile, segnumv, nsegs, 0, ndone,
+                                    nilfs_sufile_do_cancel_free);
+}
+/**
 * nilfs_sufile_set_error - mark a segment as erroneous
 * @sufile: inode of segment usage file
 * @segnum: segment number
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 6989b03e97ab..ab785f85aa50 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -65,9 +65,8 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
                   "(NILFS)");
 MODULE_LICENSE("GPL");
+static void nilfs_write_super(struct super_block *sb);
 static int nilfs_remount(struct super_block *sb, int *flags, char *data);
-static int test_exclusive_mount(struct file_system_type *fs_type,
-                                struct block_device *bdev, int flags);
 /**
 * nilfs_error() - report failure condition on a filesystem
@@ -134,7 +133,7 @@ void nilfs_warning(struct super_block *sb, const char *function,
 static struct kmem_cache *nilfs_inode_cachep;
-struct inode *nilfs_alloc_inode(struct super_block *sb)
+struct inode *nilfs_alloc_inode_common(struct the_nilfs *nilfs)
 {
        struct nilfs_inode_info *ii;
@@ -144,10 +143,15 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
        ii->i_bh = NULL;
        ii->i_state = 0;
        ii->vfs_inode.i_version = 1;
-        nilfs_btnode_cache_init(&ii->i_btnode_cache);
+        nilfs_btnode_cache_init(&ii->i_btnode_cache, nilfs->ns_bdi);
        return &ii->vfs_inode;
 }
+struct inode *nilfs_alloc_inode(struct super_block *sb)
+{
+        return nilfs_alloc_inode_common(NILFS_SB(sb)->s_nilfs);
+}
 void nilfs_destroy_inode(struct inode *inode)
 {
        kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
@@ -315,6 +319,11 @@ static void nilfs_put_super(struct super_block *sb)
        struct nilfs_sb_info *sbi = NILFS_SB(sb);
        struct the_nilfs *nilfs = sbi->s_nilfs;
+        lock_kernel();
+        if (sb->s_dirt)
+                nilfs_write_super(sb);
        nilfs_detach_segment_constructor(sbi);
        if (!(sb->s_flags & MS_RDONLY)) {
@@ -323,12 +332,18 @@ static void nilfs_put_super(struct super_block *sb)
                nilfs_commit_super(sbi, 1);
                up_write(&nilfs->ns_sem);
        }
+        down_write(&nilfs->ns_super_sem);
+        if (nilfs->ns_current == sbi)
+                nilfs->ns_current = NULL;
+        up_write(&nilfs->ns_super_sem);
        nilfs_detach_checkpoint(sbi);
        put_nilfs(sbi->s_nilfs);
        sbi->s_super = NULL;
        sb->s_fs_info = NULL;
-        kfree(sbi);
+        nilfs_put_sbinfo(sbi);
+        unlock_kernel();
 }
 /**
@@ -383,6 +398,8 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
 {
        int err = 0;
+        nilfs_write_super(sb);
        /* This function is called when super block should be written back */
        if (wait)
                err = nilfs_construct_segment(sb);
@@ -396,9 +413,9 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
        struct buffer_head *bh_cp;
        int err;
-        down_write(&nilfs->ns_sem);
+        down_write(&nilfs->ns_super_sem);
        list_add(&sbi->s_list, &nilfs->ns_supers);
-        up_write(&nilfs->ns_sem);
+        up_write(&nilfs->ns_super_sem);
        sbi->s_ifile = nilfs_mdt_new(
                nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP);
@@ -436,9 +453,9 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
        nilfs_mdt_destroy(sbi->s_ifile);
        sbi->s_ifile = NULL;
-        down_write(&nilfs->ns_sem);
+        down_write(&nilfs->ns_super_sem);
        list_del_init(&sbi->s_list);
-        up_write(&nilfs->ns_sem);
+        up_write(&nilfs->ns_super_sem);
        return err;
 }
@@ -450,9 +467,9 @@ void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi)
        nilfs_mdt_clear(sbi->s_ifile);
        nilfs_mdt_destroy(sbi->s_ifile);
        sbi->s_ifile = NULL;
-        down_write(&nilfs->ns_sem);
+        down_write(&nilfs->ns_super_sem);
        list_del_init(&sbi->s_list);
-        up_write(&nilfs->ns_sem);
+        up_write(&nilfs->ns_super_sem);
 }
 static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi)
@@ -752,7 +769,7 @@ int nilfs_store_magic_and_option(struct super_block *sb,
 * @silent: silent mode flag
 * @nilfs: the_nilfs struct
 *
- * This function is called exclusively by bd_mount_mutex.
+ * This function is called exclusively by nilfs->ns_mount_mutex.
 * So, the recovery process is protected from other simultaneous mounts.
 */
 static int
@@ -773,6 +790,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
        get_nilfs(nilfs);
        sbi->s_nilfs = nilfs;
        sbi->s_super = sb;
+        atomic_set(&sbi->s_count, 1);
        err = init_nilfs(nilfs, sbi, (char *)data);
        if (err)
@@ -870,6 +888,11 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
                goto failed_root;
        }
+        down_write(&nilfs->ns_super_sem);
+        if (!nilfs_test_opt(sbi, SNAPSHOT))
+                nilfs->ns_current = sbi;
+        up_write(&nilfs->ns_super_sem);
        return 0;
 failed_root:
@@ -885,7 +908,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 failed_sbi:
        put_nilfs(nilfs);
        sb->s_fs_info = NULL;
-        kfree(sbi);
+        nilfs_put_sbinfo(sbi);
        return err;
 }
@@ -898,6 +921,9 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
        struct nilfs_mount_options old_opts;
        int err;
+        lock_kernel();
+        down_write(&nilfs->ns_super_sem);
        old_sb_flags = sb->s_flags;
        old_opts.mount_opt = sbi->s_mount_opt;
        old_opts.snapshot_cno = sbi->s_snapshot_cno;
@@ -945,14 +971,12 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
                 * store the current valid flag.  (It may have been changed
                 * by fsck since we originally mounted the partition.)
                 */
-                down(&sb->s_bdev->bd_mount_sem);
+                if (nilfs->ns_current && nilfs->ns_current != sbi) {
-                /* Check existing RW-mount */
-                if (test_exclusive_mount(sb->s_type, sb->s_bdev, 0)) {
                        printk(KERN_WARNING "NILFS (device %s): couldn't "
-                               "remount because a RW-mount exists.\n",
+                               "remount because an RW-mount exists.\n",
                               sb->s_id);
                        err = -EBUSY;
-                        goto rw_remount_failed;
+                        goto restore_opts;
                }
                if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) {
                        printk(KERN_WARNING "NILFS (device %s): couldn't "
@@ -960,7 +984,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
                               "the latest one.\n",
                               sb->s_id);
                        err = -EINVAL;
-                        goto rw_remount_failed;
+                        goto restore_opts;
                }
                sb->s_flags &= ~MS_RDONLY;
                nilfs_clear_opt(sbi, SNAPSHOT);
@@ -968,28 +992,31 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
                err = nilfs_attach_segment_constructor(sbi);
                if (err)
-                        goto rw_remount_failed;
+                        goto restore_opts;
                down_write(&nilfs->ns_sem);
                nilfs_setup_super(sbi);
                up_write(&nilfs->ns_sem);
-                up(&sb->s_bdev->bd_mount_sem);
+                nilfs->ns_current = sbi;
        }
 out:
+        up_write(&nilfs->ns_super_sem);
+        unlock_kernel();
        return 0;
- rw_remount_failed:
-        up(&sb->s_bdev->bd_mount_sem);
 restore_opts:
        sb->s_flags = old_sb_flags;
        sbi->s_mount_opt = old_opts.mount_opt;
        sbi->s_snapshot_cno = old_opts.snapshot_cno;
+        up_write(&nilfs->ns_super_sem);
+        unlock_kernel();
        return err;
 }
 struct nilfs_super_data {
        struct block_device *bdev;
+        struct nilfs_sb_info *sbi;
        __u64 cno;
        int flags;
 };
@@ -1048,33 +1075,7 @@ static int nilfs_test_bdev_super(struct super_block *s, void *data)
 {
        struct nilfs_super_data *sd = data;
-        return s->s_bdev == sd->bdev;
+        return sd->sbi && s->s_fs_info == (void *)sd->sbi;
-}
-static int nilfs_test_bdev_super2(struct super_block *s, void *data)
-{
-        struct nilfs_super_data *sd = data;
-        int ret;
-        if (s->s_bdev != sd->bdev)
-                return 0;
-        if (!((s->s_flags | sd->flags) & MS_RDONLY))
-                return 1; /* Reuse an old R/W-mode super_block */
-        if (s->s_flags & sd->flags & MS_RDONLY) {
-                if (down_read_trylock(&s->s_umount)) {
-                        ret = s->s_root &&
-                                (sd->cno == NILFS_SB(s)->s_snapshot_cno);
-                        up_read(&s->s_umount);
-                        /*
-                         * This path is locked with sb_lock by sget().
-                         * So, drop_super() causes deadlock.
-                         */
-                        return ret;
-                }
-        }
-        return 0;
 }
 static int
@@ -1082,8 +1083,8 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
             const char *dev_name, void *data, struct vfsmount *mnt)
 {
        struct nilfs_super_data sd;
-        struct super_block *s, *s2;
+        struct super_block *s;
-        struct the_nilfs *nilfs = NULL;
+        struct the_nilfs *nilfs;
        int err, need_to_close = 1;
        sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type);
@@ -1095,7 +1096,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
         * much more information than normal filesystems to identify mount
         * instance.  For snapshot mounts, not only a mount type (ro-mount
         * or rw-mount) but also a checkpoint number is required.
-         * The results are passed in sget() using nilfs_super_data.
         */
        sd.cno = 0;
        sd.flags = flags;
@@ -1104,64 +1104,59 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
                goto failed;
        }
-        /*
+        nilfs = find_or_create_nilfs(sd.bdev);
-         * once the super is inserted into the list by sget, s_umount
+        if (!nilfs) {
-         * will protect the lockfs code from trying to start a snapshot
+                err = -ENOMEM;
-         * while we are mounting
+                goto failed;
-         */
-        down(&sd.bdev->bd_mount_sem);
-        if (!sd.cno &&
-            (err = test_exclusive_mount(fs_type, sd.bdev, flags ^ MS_RDONLY))) {
-                err = (err < 0) ? : -EBUSY;
-                goto failed_unlock;
        }
-        /*
+        mutex_lock(&nilfs->ns_mount_mutex);
-         * Phase-1: search any existent instance and get the_nilfs
-         */
-        s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd);
-        if (IS_ERR(s))
-                goto error_s;
-        if (!s->s_root) {
-                err = -ENOMEM;
-                nilfs = alloc_nilfs(sd.bdev);
-                if (!nilfs)
-                        goto cancel_new;
-        } else {
-                struct nilfs_sb_info *sbi = NILFS_SB(s);
+        if (!sd.cno) {
                /*
-                 * s_umount protects super_block from unmount process;
+                 * Check if an exclusive mount exists or not.
-                 * It covers pointers of nilfs_sb_info and the_nilfs.
+                 * Snapshot mounts coexist with a current mount
+                 * (i.e. rw-mount or ro-mount), whereas rw-mount and
+                 * ro-mount are mutually exclusive.
                 */
-                nilfs = sbi->s_nilfs;
+                down_read(&nilfs->ns_super_sem);
-                get_nilfs(nilfs);
+                if (nilfs->ns_current &&
-                up_write(&s->s_umount);
+                    ((nilfs->ns_current->s_super->s_flags ^ flags)
+                     & MS_RDONLY)) {
+                        up_read(&nilfs->ns_super_sem);
+                        err = -EBUSY;
+                        goto failed_unlock;
+                }
+                up_read(&nilfs->ns_super_sem);
+        }
-                /*
+        /*
-                 * Phase-2: search specified snapshot or R/W mode super_block
+         * Find existing nilfs_sb_info struct
-                 */
+         */
-                if (!sd.cno)
+        sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno);
-                        /* trying to get the latest checkpoint.  */
-                        sd.cno = nilfs_last_cno(nilfs);
-                s2 = sget(fs_type, nilfs_test_bdev_super2,
+        if (!sd.cno)
-                          nilfs_set_bdev_super, &sd);
+                /* trying to get the latest checkpoint.  */
-                deactivate_super(s);
+                sd.cno = nilfs_last_cno(nilfs);
-                /*
-                 * Although deactivate_super() invokes close_bdev_exclusive() at
+        /*
-                 * kill_block_super().  Here, s is an existent mount; we need
+         * Get super block instance holding the nilfs_sb_info struct.
-                 * one more close_bdev_exclusive() call.
+         * A new instance is allocated if no existing mount is present or
-                 */
+         * existing instance has been unmounted.
-                s = s2;
+         */
-                if (IS_ERR(s))
+        s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd);
-                        goto error_s;
+        if (sd.sbi)
+                nilfs_put_sbinfo(sd.sbi);
+        if (IS_ERR(s)) {
+                err = PTR_ERR(s);
+                goto failed_unlock;
        }
        if (!s->s_root) {
                char b[BDEVNAME_SIZE];
+                /* New superblock instance created */
                s->s_flags = flags;
                strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
                sb_set_blocksize(s, block_size(sd.bdev));
@@ -1172,26 +1167,18 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
                s->s_flags |= MS_ACTIVE;
                need_to_close = 0;
-        } else if (!(s->s_flags & MS_RDONLY)) {
-                err = -EBUSY;
        }
-        up(&sd.bdev->bd_mount_sem);
+        mutex_unlock(&nilfs->ns_mount_mutex);
        put_nilfs(nilfs);
        if (need_to_close)
                close_bdev_exclusive(sd.bdev, flags);
        simple_set_mnt(mnt, s);
        return 0;
- error_s:
-        up(&sd.bdev->bd_mount_sem);
-        if (nilfs)
-                put_nilfs(nilfs);
-        close_bdev_exclusive(sd.bdev, flags);
-        return PTR_ERR(s);
 failed_unlock:
-        up(&sd.bdev->bd_mount_sem);
+        mutex_unlock(&nilfs->ns_mount_mutex);
+        put_nilfs(nilfs);
 failed:
        close_bdev_exclusive(sd.bdev, flags);
@@ -1199,70 +1186,18 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 cancel_new:
        /* Abandoning the newly allocated superblock */
-        up(&sd.bdev->bd_mount_sem);
+        mutex_unlock(&nilfs->ns_mount_mutex);
-        if (nilfs)
+        put_nilfs(nilfs);
-                put_nilfs(nilfs);
        up_write(&s->s_umount);
        deactivate_super(s);
        /*
         * deactivate_super() invokes close_bdev_exclusive().
         * We must finish all post-cleaning before this call;
-         * put_nilfs() and unlocking bd_mount_sem need the block device.
+         * put_nilfs() needs the block device.
         */
        return err;
 }
-static int nilfs_test_bdev_super3(struct super_block *s, void *data)
-{
-        struct nilfs_super_data *sd = data;
-        int ret;
-        if (s->s_bdev != sd->bdev)
-                return 0;
-        if (down_read_trylock(&s->s_umount)) {
-                ret = (s->s_flags & MS_RDONLY) && s->s_root &&
-                        nilfs_test_opt(NILFS_SB(s), SNAPSHOT);
-                up_read(&s->s_umount);
-                if (ret)
-                        return 0; /* ignore snapshot mounts */
-        }
-        return !((sd->flags ^ s->s_flags) & MS_RDONLY);
-}
-static int __false_bdev_super(struct super_block *s, void *data)
-{
-#if 0 /* XXX: workaround for lock debug. This is not good idea */
-        up_write(&s->s_umount);
-#endif
-        return -EFAULT;
-}
-/**
- * test_exclusive_mount - check whether an exclusive RW/RO mount exists or not.
- * fs_type: filesystem type
- * bdev: block device
- * flag: 0 (check rw-mount) or MS_RDONLY (check ro-mount)
- * res: pointer to an integer to store result
- *
- * This function must be called within a section protected by bd_mount_mutex.
- */
-static int test_exclusive_mount(struct file_system_type *fs_type,
-                                struct block_device *bdev, int flags)
-{
-        struct super_block *s;
-        struct nilfs_super_data sd = { .flags = flags, .bdev = bdev };
-        s = sget(fs_type, nilfs_test_bdev_super3, __false_bdev_super, &sd);
-        if (IS_ERR(s)) {
-                if (PTR_ERR(s) != -EFAULT)
-                        return PTR_ERR(s);
-                return 0;  /* Not found */
-        }
-        up_write(&s->s_umount);
-        deactivate_super(s);
-        return 1;  /* Found */
-}
 struct file_system_type nilfs_fs_type = {
        .owner    = THIS_MODULE,
        .name     = "nilfs2",
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 7f65b3be4aa9..8b8889825716 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -32,9 +32,12 @@
 #include "cpfile.h"
 #include "sufile.h"
 #include "dat.h"
-#include "seglist.h"
 #include "segbuf.h"
+static LIST_HEAD(nilfs_objects);
+static DEFINE_SPINLOCK(nilfs_lock);
 void nilfs_set_last_segment(struct the_nilfs *nilfs,
                            sector_t start_blocknr, u64 seq, __u64 cno)
 {
@@ -55,7 +58,7 @@ void nilfs_set_last_segment(struct the_nilfs *nilfs,
 * Return Value: On success, pointer to the_nilfs is returned.
 * On error, NULL is returned.
 */
-struct the_nilfs *alloc_nilfs(struct block_device *bdev)
+static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 {
        struct the_nilfs *nilfs;
@@ -68,7 +71,10 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
        atomic_set(&nilfs->ns_writer_refcount, -1);
        atomic_set(&nilfs->ns_ndirtyblks, 0);
        init_rwsem(&nilfs->ns_sem);
+        init_rwsem(&nilfs->ns_super_sem);
+        mutex_init(&nilfs->ns_mount_mutex);
        mutex_init(&nilfs->ns_writer_mutex);
+        INIT_LIST_HEAD(&nilfs->ns_list);
        INIT_LIST_HEAD(&nilfs->ns_supers);
        spin_lock_init(&nilfs->ns_last_segment_lock);
        nilfs->ns_gc_inodes_h = NULL;
@@ -78,6 +84,45 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 }
 /**
+ * find_or_create_nilfs - find or create nilfs object
+ * @bdev: block device to which the_nilfs is related
+ *
+ * find_or_create_nilfs() looks up an existing nilfs object created on
+ * the device and takes a reference to it.  If no nilfs object is found
+ * on the device, a new nilfs object is allocated.
+ *
+ * Return Value: On success, pointer to the nilfs object is returned.
+ * On error, NULL is returned.
+ */
+struct the_nilfs *find_or_create_nilfs(struct block_device *bdev)
+{
+        struct the_nilfs *nilfs, *new = NULL;
+ retry:
+        spin_lock(&nilfs_lock);
+        list_for_each_entry(nilfs, &nilfs_objects, ns_list) {
+                if (nilfs->ns_bdev == bdev) {
+                        get_nilfs(nilfs);
+                        spin_unlock(&nilfs_lock);
+                        if (new)
+                                put_nilfs(new);
+                        return nilfs; /* existing object */
+                }
+        }
+        if (new) {
+                list_add_tail(&new->ns_list, &nilfs_objects);
+                spin_unlock(&nilfs_lock);
+                return new; /* new object */
+        }
+        spin_unlock(&nilfs_lock);
+        new = alloc_nilfs(bdev);
+        if (new)
+                goto retry;
+        return NULL; /* insufficient memory */
+}
+/**
 * put_nilfs - release a reference to the_nilfs
 * @nilfs: the_nilfs structure to be released
 *
@@ -86,13 +131,20 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 */
 void put_nilfs(struct the_nilfs *nilfs)
 {
-        if (!atomic_dec_and_test(&nilfs->ns_count))
+        spin_lock(&nilfs_lock);
+        if (!atomic_dec_and_test(&nilfs->ns_count)) {
+                spin_unlock(&nilfs_lock);
                return;
+        }
+        list_del_init(&nilfs->ns_list);
+        spin_unlock(&nilfs_lock);
        /*
-         * Increment of ns_count never occur below because the caller
+         * Increment of ns_count never occurs below because the caller
         * of get_nilfs() holds at least one reference to the_nilfs.
         * Thus its exclusion control is not required here.
         */
        might_sleep();
        if (nilfs_loaded(nilfs)) {
                nilfs_mdt_clear(nilfs->ns_sufile);
@@ -515,7 +567,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
        blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
        if (sb->s_blocksize != blocksize) {
-                int hw_blocksize = bdev_hardsect_size(sb->s_bdev);
+                int hw_blocksize = bdev_logical_block_size(sb->s_bdev);
                if (blocksize < hw_blocksize) {
                        printk(KERN_ERR
@@ -613,13 +665,63 @@ int nilfs_near_disk_full(struct the_nilfs *nilfs)
        return ret;
 }
+/**
+ * nilfs_find_sbinfo - find existing nilfs_sb_info structure
+ * @nilfs: nilfs object
+ * @rw_mount: mount type (non-zero value for read/write mount)
+ * @cno: checkpoint number (zero for read-only mount)
+ *
+ * nilfs_find_sbinfo() returns the nilfs_sb_info structure which
+ * @rw_mount and @cno (in case of snapshots) matched.  If no instance
+ * was found, NULL is returned.  Although the super block instance can
+ * be unmounted after this function returns, the nilfs_sb_info struct
+ * is kept on memory until nilfs_put_sbinfo() is called.
+ */
+struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *nilfs,
+                                        int rw_mount, __u64 cno)
+{
+        struct nilfs_sb_info *sbi;
+        down_read(&nilfs->ns_super_sem);
+        /*
+         * The SNAPSHOT flag and sb->s_flags are supposed to be
+         * protected with nilfs->ns_super_sem.
+         */
+        sbi = nilfs->ns_current;
+        if (rw_mount) {
+                if (sbi && !(sbi->s_super->s_flags & MS_RDONLY))
+                        goto found; /* read/write mount */
+                else
+                        goto out;
+        } else if (cno == 0) {
+                if (sbi && (sbi->s_super->s_flags & MS_RDONLY))
+                        goto found; /* read-only mount */
+                else
+                        goto out;
+        }
+        list_for_each_entry(sbi, &nilfs->ns_supers, s_list) {
+                if (nilfs_test_opt(sbi, SNAPSHOT) &&
+                    sbi->s_snapshot_cno == cno)
+                        goto found; /* snapshot mount */
+        }
+ out:
+        up_read(&nilfs->ns_super_sem);
+        return NULL;
+ found:
+        atomic_inc(&sbi->s_count);
+        up_read(&nilfs->ns_super_sem);
+        return sbi;
+}
 int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno,
                                int snapshot_mount)
 {
        struct nilfs_sb_info *sbi;
        int ret = 0;
-        down_read(&nilfs->ns_sem);
+        down_read(&nilfs->ns_super_sem);
        if (cno == 0 || cno > nilfs->ns_cno)
                goto out_unlock;
@@ -636,6 +738,6 @@ int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno,
                ret++;
 out_unlock:
-        up_read(&nilfs->ns_sem);
+        up_read(&nilfs->ns_super_sem);
        return ret;
 }
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 30fe58778d05..e8adbffc626f 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -43,12 +43,16 @@ enum {
 * struct the_nilfs - struct to supervise multiple nilfs mount points
 * @ns_flags: flags
 * @ns_count: reference count
+ * @ns_list: list entry on nilfs_objects
 * @ns_bdev: block device
 * @ns_bdi: backing dev info
 * @ns_writer: back pointer to writable nilfs_sb_info
 * @ns_sem: semaphore for shared states
+ * @ns_super_sem: semaphore for global operations across super block instances
+ * @ns_mount_mutex: mutex protecting mount process of nilfs
 * @ns_writer_mutex: mutex protecting ns_writer attach/detach
 * @ns_writer_refcount: number of referrers on ns_writer
+ * @ns_current: back pointer to current mount
 * @ns_sbh: buffer heads of on-disk super blocks
 * @ns_sbp: pointers to super block data
 * @ns_sbwtime: previous write time of super blocks
@@ -88,15 +92,24 @@ enum {
 struct the_nilfs {
        unsigned long           ns_flags;
        atomic_t                ns_count;
+        struct list_head        ns_list;
        struct block_device    *ns_bdev;
        struct backing_dev_info *ns_bdi;
        struct nilfs_sb_info   *ns_writer;
        struct rw_semaphore     ns_sem;
+        struct rw_semaphore     ns_super_sem;
+        struct mutex            ns_mount_mutex;
        struct mutex            ns_writer_mutex;
        atomic_t                ns_writer_refcount;
        /*
+         * components protected by ns_super_sem
+         */
+        struct nilfs_sb_info   *ns_current;
+        struct list_head        ns_supers;
+        /*
         * used for
         * - loading the latest checkpoint exclusively.
         * - allocating a new full segment.
@@ -108,7 +121,6 @@ struct the_nilfs {
        time_t                  ns_sbwtime[2];
        unsigned                ns_sbsize;
        unsigned                ns_mount_state;
-        struct list_head        ns_supers;
        /*
         * Following fields are dedicated to a writable FS-instance.
@@ -191,11 +203,12 @@ THE_NILFS_FNS(DISCONTINUED, discontinued)
 #define NILFS_ALTSB_FREQ        60  /* spare superblock */
 void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
-struct the_nilfs *alloc_nilfs(struct block_device *);
+struct the_nilfs *find_or_create_nilfs(struct block_device *);
 void put_nilfs(struct the_nilfs *);
 int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
 int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
 int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
+struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64);
 int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
 int nilfs_near_disk_full(struct the_nilfs *);
 void nilfs_fall_back_super_block(struct the_nilfs *);
@@ -238,6 +251,12 @@ nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
        mutex_unlock(&nilfs->ns_writer_mutex);
 }
+/*
+ * nilfs_put_sbinfo - drop a reference to a nilfs_sb_info structure
+ * @sbi: nilfs_sb_info whose s_count is decremented
+ *
+ * Pairs with the atomic_inc() of s_count done by nilfs_find_sbinfo().
+ * The structure is freed only when the count drops to zero, i.e. when
+ * the caller held the last reference.
+ */
+static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
+{
+        if (atomic_dec_and_test(&sbi->s_count))
+                kfree(sbi);
+}
 static inline void
 nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum,
                        sector_t *seg_start, sector_t *seg_end)
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index 50914d7303c6..31dac7e3b0f1 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -1,2 +1,15 @@
+config FSNOTIFY
+        bool "Filesystem notification backend"
+        default y
+        ---help---
+           fsnotify is a backend for filesystem notification.  fsnotify does
+           not provide any userspace interface but does provide the basis
+           needed for other notification schemes such as dnotify, inotify,
+           and fanotify.
+           Say Y here to enable fsnotify support.
+           If unsure, say Y.
 source "fs/notify/dnotify/Kconfig"
 source "fs/notify/inotify/Kconfig"
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
index 5a95b6010ce7..0922cc826c46 100644
--- a/fs/notify/Makefile
+++ b/fs/notify/Makefile
@@ -1,2 +1,4 @@
+obj-$(CONFIG_FSNOTIFY)          += fsnotify.o notification.o group.o inode_mark.o
 obj-y                   += dnotify/
 obj-y                   += inotify/
diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig
index 26adf5dfa646..904ff8d5405a 100644
--- a/fs/notify/dnotify/Kconfig
+++ b/fs/notify/dnotify/Kconfig
@@ -1,5 +1,6 @@
 config DNOTIFY
        bool "Dnotify support"
+        depends on FSNOTIFY
        default y
        help
          Dnotify is a directory-based per-fd file change notification system
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index b0aa2cde80bd..828a889be909 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -3,6 +3,9 @@
 *
 * Copyright (C) 2000,2001,2002 Stephen Rothwell
 *
+ * Copyright (C) 2009 Eric Paris <Red Hat Inc>
+ * dnotify was largely rewritten to use the new fsnotify infrastructure
+ *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2, or (at your option) any
@@ -21,24 +24,173 @@
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/fdtable.h>
+#include <linux/fsnotify_backend.h>
 int dir_notify_enable __read_mostly = 1;
-static struct kmem_cache *dn_cache __read_mostly;
+static struct kmem_cache *dnotify_struct_cache __read_mostly;
+static struct kmem_cache *dnotify_mark_entry_cache __read_mostly;
+static struct fsnotify_group *dnotify_group __read_mostly;
+static DEFINE_MUTEX(dnotify_mark_mutex);
+/*
+ * dnotify will attach one of these to each inode (i_fsnotify_mark_entries) which
+ * is being watched by dnotify.  If multiple userspace applications are watching
+ * the same directory with dnotify their information is chained in dn
+ */
+struct dnotify_mark_entry {
+        struct fsnotify_mark_entry fsn_entry;
+        struct dnotify_struct *dn;
+};
-static void redo_inode_mask(struct inode *inode)
+/*
+ * When a process starts or stops watching an inode the set of events which
+ * dnotify cares about for that inode may change.  This function runs the
+ * list of everything receiving dnotify events about this directory and calculates
+ * the set of all those events.  After it updates what dnotify is interested in
+ * it calls the fsnotify function so it can update the set of all events relevant
+ * to this inode.
+ */
+static void dnotify_recalc_inode_mask(struct fsnotify_mark_entry *entry)
 {
-        unsigned long new_mask;
+        __u32 new_mask, old_mask;
        struct dnotify_struct *dn;
+        struct dnotify_mark_entry *dnentry  = container_of(entry,
+                                                           struct dnotify_mark_entry,
+                                                           fsn_entry);
+        assert_spin_locked(&entry->lock);
+        old_mask = entry->mask;
        new_mask = 0;
-        for (dn = inode->i_dnotify; dn != NULL; dn = dn->dn_next)
+        for (dn = dnentry->dn; dn != NULL; dn = dn->dn_next)
-                new_mask |= dn->dn_mask & ~DN_MULTISHOT;
+                new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT);
-        inode->i_dnotify_mask = new_mask;
+        entry->mask = new_mask;
+        if (old_mask == new_mask)
+                return;
+        if (entry->inode)
+                fsnotify_recalc_inode_mask(entry->inode);
+}
+/*
+ * Main fsnotify call where events are delivered to dnotify.
+ * Find the dnotify mark on the relevant inode, run the list of dnotify structs
+ * on that mark and determine which of them has expressed interest in receiving
+ * events of this type.  For each match, signal the owner of the watching file
+ * via send_sigio() and destroy the dnotify struct if it was not registered to
+ * receive multiple events (FS_DN_MULTISHOT not set).
+ *
+ * Always returns 0, including when no mark is found on the inode.
+ */
+static int dnotify_handle_event(struct fsnotify_group *group,
+                                struct fsnotify_event *event)
+{
+        struct fsnotify_mark_entry *entry = NULL;
+        struct dnotify_mark_entry *dnentry;
+        struct inode *to_tell;
+        struct dnotify_struct *dn;
+        struct dnotify_struct **prev;
+        struct fown_struct *fown;
+        to_tell = event->to_tell;
+        spin_lock(&to_tell->i_lock);
+        entry = fsnotify_find_mark_entry(group, to_tell);
+        spin_unlock(&to_tell->i_lock);
+        /* unlikely since we already passed dnotify_should_send_event() */
+        if (unlikely(!entry))
+                return 0;
+        dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
+        spin_lock(&entry->lock);
+        prev = &dnentry->dn; /* prev always points at the link to the current dn */
+        while ((dn = *prev) != NULL) {
+                if ((dn->dn_mask & event->mask) == 0) { /* not interested in this event */
+                        prev = &dn->dn_next;
+                        continue;
+                }
+                fown = &dn->dn_filp->f_owner;
+                send_sigio(fown, dn->dn_fd, POLL_MSG);
+                if (dn->dn_mask & FS_DN_MULTISHOT) /* multishot watch: keep it chained */
+                        prev = &dn->dn_next;
+                else { /* one-shot watch: unlink it, free it, recompute the mark's mask */
+                        *prev = dn->dn_next;
+                        kmem_cache_free(dnotify_struct_cache, dn);
+                        dnotify_recalc_inode_mask(entry);
+                }
+        }
+        spin_unlock(&entry->lock);
+        fsnotify_put_mark(entry); /* matches fsnotify_find_mark_entry */
+        return 0;
+}
+/*
+ * Given an inode and mask determine if dnotify would be interested in sending
+ * userspace notification for that pair.
+ *
+ * @group: group whose mark is looked up on @inode
+ * @inode: inode the event is about; only directories are considered
+ * @mask:  event bits; FS_EVENT_ON_CHILD is cleared before the comparison
+ *
+ * Returns false if @inode is not a directory or carries no mark for @group;
+ * otherwise returns whether the remaining bits of @mask overlap the mark's
+ * mask.
+ */
+static bool dnotify_should_send_event(struct fsnotify_group *group,
+                                      struct inode *inode, __u32 mask)
+{
+        struct fsnotify_mark_entry *entry;
+        bool send;
+        /* !dir_notify_enable should never get here, don't waste time checking
+        if (!dir_notify_enable)
+                return 0; */
+        /* not a dir, dnotify doesn't care */
+        if (!S_ISDIR(inode->i_mode))
+                return false;
+        spin_lock(&inode->i_lock);
+        entry = fsnotify_find_mark_entry(group, inode);
+        spin_unlock(&inode->i_lock);
+        /* no mark means no dnotify watch */
+        if (!entry)
+                return false;
+        mask = (mask & ~FS_EVENT_ON_CHILD); /* compare on the event bits only */
+        send = (mask & entry->mask); /* non-zero overlap converts to true */
+        fsnotify_put_mark(entry); /* matches fsnotify_find_mark_entry */
+        return send;
+}
+static void dnotify_free_mark(struct fsnotify_mark_entry *entry)
+{
+        struct dnotify_mark_entry *dnentry = container_of(entry,
+                                                          struct dnotify_mark_entry,
+                                                          fsn_entry);
+        BUG_ON(dnentry->dn);
+        kmem_cache_free(dnotify_mark_entry_cache, dnentry);
 }
+static struct fsnotify_ops dnotify_fsnotify_ops = {
+        .handle_event = dnotify_handle_event,
+        .should_send_event = dnotify_should_send_event,
+        .free_group_priv = NULL,
+        .freeing_mark = NULL,
+        .free_event_priv = NULL,
+};
+/*
+ * Called every time a file is closed.  Looks first for a dnotify mark on the
+ * inode.  If one is found run all of the ->dn entries attached to that
+ * mark for one relevant to this process closing the file and remove that
+ * dnotify_struct.  If that was the last dnotify_struct also remove the
+ * fsnotify_mark_entry.
+ */
 void dnotify_flush(struct file *filp, fl_owner_t id)
 {
+        struct fsnotify_mark_entry *entry;
+        struct dnotify_mark_entry *dnentry;
        struct dnotify_struct *dn;
        struct dnotify_struct **prev;
        struct inode *inode;
@@ -46,145 +198,243 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
        inode = filp->f_path.dentry->d_inode;
        if (!S_ISDIR(inode->i_mode))
                return;
        spin_lock(&inode->i_lock);
-        prev = &inode->i_dnotify;
+        entry = fsnotify_find_mark_entry(dnotify_group, inode);
+        spin_unlock(&inode->i_lock);
+        if (!entry)
+                return;
+        dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
+        mutex_lock(&dnotify_mark_mutex);
+        spin_lock(&entry->lock);
+        prev = &dnentry->dn;
        while ((dn = *prev) != NULL) {
                if ((dn->dn_owner == id) && (dn->dn_filp == filp)) {
                        *prev = dn->dn_next;
-                        redo_inode_mask(inode);
+                        kmem_cache_free(dnotify_struct_cache, dn);
-                        kmem_cache_free(dn_cache, dn);
+                        dnotify_recalc_inode_mask(entry);
                        break;
                }
                prev = &dn->dn_next;
        }
-        spin_unlock(&inode->i_lock);
+        spin_unlock(&entry->lock);
+        /* nothing else could have found us thanks to the dnotify_mark_mutex */
+        if (dnentry->dn == NULL)
+                fsnotify_destroy_mark_by_entry(entry);
+        fsnotify_recalc_group_mask(dnotify_group);
+        mutex_unlock(&dnotify_mark_mutex);
+        fsnotify_put_mark(entry);
+}
+/* this conversion is done only at watch creation */
+static __u32 convert_arg(unsigned long arg)
+{
+        __u32 new_mask = FS_EVENT_ON_CHILD;
+        if (arg & DN_MULTISHOT)
+                new_mask |= FS_DN_MULTISHOT;
+        if (arg & DN_DELETE)
+                new_mask |= (FS_DELETE | FS_MOVED_FROM);
+        if (arg & DN_MODIFY)
+                new_mask |= FS_MODIFY;
+        if (arg & DN_ACCESS)
+                new_mask |= FS_ACCESS;
+        if (arg & DN_ATTRIB)
+                new_mask |= FS_ATTRIB;
+        if (arg & DN_RENAME)
+                new_mask |= FS_DN_RENAME;
+        if (arg & DN_CREATE)
+                new_mask |= (FS_CREATE | FS_MOVED_TO);
+        return new_mask;
 }
+/*
+ * If multiple processes watch the same inode with dnotify there is only one
+ * dnotify mark in inode->i_fsnotify_mark_entries but we chain a dnotify_struct
+ * onto that mark.  This function either attaches the new dnotify_struct onto
+ * that list, or it ORs (|=) the mask into an existing dnotify_struct.
+ *
+ * Returns 0 when @dn was filled in and linked at the head of @dnentry's chain.
+ * Returns -EEXIST when an entry with the same @id and @filp already existed;
+ * that entry's fd and mask are updated and @dn is left untouched.
+ */
+static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnentry,
+                     fl_owner_t id, int fd, struct file *filp, __u32 mask)
+{
+        struct dnotify_struct *odn;
+        odn = dnentry->dn;
+        while (odn != NULL) {
+                /* adding more events to existing dnotify_struct? */
+                if ((odn->dn_owner == id) && (odn->dn_filp == filp)) {
+                        odn->dn_fd = fd;
+                        odn->dn_mask |= mask;
+                        return -EEXIST;
+                }
+                odn = odn->dn_next;
+        }
+        dn->dn_mask = mask;
+        dn->dn_fd = fd;
+        dn->dn_filp = filp;
+        dn->dn_owner = id;
+        dn->dn_next = dnentry->dn; /* push onto the front of the chain */
+        dnentry->dn = dn;
+        return 0;
+}
+/*
+ * When a process calls fcntl to attach a dnotify watch to a directory it ends
+ * up here.  Allocate both a mark for fsnotify to add and a dnotify_struct to be
+ * attached to the fsnotify_mark.
+ */
 int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 {
+        struct dnotify_mark_entry *new_dnentry, *dnentry;
+        struct fsnotify_mark_entry *new_entry, *entry;
        struct dnotify_struct *dn;
-        struct dnotify_struct *odn;
-        struct dnotify_struct **prev;
        struct inode *inode;
        fl_owner_t id = current->files;
        struct file *f;
-        int error = 0;
+        int destroy = 0, error = 0;
+        __u32 mask;
+        /* we use these to tell if we need to kfree */
+        new_entry = NULL;
+        dn = NULL;
+        if (!dir_notify_enable) {
+                error = -EINVAL;
+                goto out_err;
+        }
+        /* a 0 mask means we are explicitly removing the watch */
        if ((arg & ~DN_MULTISHOT) == 0) {
                dnotify_flush(filp, id);
-                return 0;
+                error = 0;
+                goto out_err;
        }
-        if (!dir_notify_enable)
-                return -EINVAL;
+        /* dnotify only works on directories */
        inode = filp->f_path.dentry->d_inode;
-        if (!S_ISDIR(inode->i_mode))
+        if (!S_ISDIR(inode->i_mode)) {
-                return -ENOTDIR;
+                error = -ENOTDIR;
-        dn = kmem_cache_alloc(dn_cache, GFP_KERNEL);
+                goto out_err;
-        if (dn == NULL)
-                return -ENOMEM;
-        spin_lock(&inode->i_lock);
-        prev = &inode->i_dnotify;
-        while ((odn = *prev) != NULL) {
-                if ((odn->dn_owner == id) && (odn->dn_filp == filp)) {
-                        odn->dn_fd = fd;
-                        odn->dn_mask |= arg;
-                        inode->i_dnotify_mask |= arg & ~DN_MULTISHOT;
-                        goto out_free;
-                }
-                prev = &odn->dn_next;
        }
-        rcu_read_lock();
+        /* expect most fcntl to add new rather than augment old */
-        f = fcheck(fd);
+        dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL);
-        rcu_read_unlock();
+        if (!dn) {
-        /* we'd lost the race with close(), sod off silently */
+                error = -ENOMEM;
-        /* note that inode->i_lock prevents reordering problems
+                goto out_err;
-         * between accesses to descriptor table and ->i_dnotify */
+        }
-        if (f != filp)
-                goto out_free;
-        error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
+        /* new fsnotify mark, we expect most fcntl calls to add a new mark */
-        if (error)
+        new_dnentry = kmem_cache_alloc(dnotify_mark_entry_cache, GFP_KERNEL);
-                goto out_free;
+        if (!new_dnentry) {
+                error = -ENOMEM;
+                goto out_err;
+        }
-        dn->dn_mask = arg;
+        /* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */
-        dn->dn_fd = fd;
+        mask = convert_arg(arg);
-        dn->dn_filp = filp;
-        dn->dn_owner = id;
-        inode->i_dnotify_mask |= arg & ~DN_MULTISHOT;
-        dn->dn_next = inode->i_dnotify;
-        inode->i_dnotify = dn;
-        spin_unlock(&inode->i_lock);
-        return 0;
-out_free:
+        /* set up the new_entry and new_dnentry */
-        spin_unlock(&inode->i_lock);
+        new_entry = &new_dnentry->fsn_entry;
-        kmem_cache_free(dn_cache, dn);
+        fsnotify_init_mark(new_entry, dnotify_free_mark);
-        return error;
+        new_entry->mask = mask;
-}
+        new_dnentry->dn = NULL;
-void __inode_dir_notify(struct inode *inode, unsigned long event)
+        /* this is needed to prevent the fcntl/close race described below */
-{
+        mutex_lock(&dnotify_mark_mutex);
-        struct dnotify_struct * dn;
-        struct dnotify_struct **prev;
-        struct fown_struct *    fown;
-        int                     changed = 0;
+        /* add the new_entry or find an old one. */
        spin_lock(&inode->i_lock);
-        prev = &inode->i_dnotify;
+        entry = fsnotify_find_mark_entry(dnotify_group, inode);
-        while ((dn = *prev) != NULL) {
-                if ((dn->dn_mask & event) == 0) {
-                        prev = &dn->dn_next;
-                        continue;
-                }
-                fown = &dn->dn_filp->f_owner;
-                send_sigio(fown, dn->dn_fd, POLL_MSG);
-                if (dn->dn_mask & DN_MULTISHOT)
-                        prev = &dn->dn_next;
-                else {
-                        *prev = dn->dn_next;
-                        changed = 1;
-                        kmem_cache_free(dn_cache, dn);
-                }
-        }
-        if (changed)
-                redo_inode_mask(inode);
        spin_unlock(&inode->i_lock);
-}
+        if (entry) {
+                dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
-EXPORT_SYMBOL(__inode_dir_notify);
+                spin_lock(&entry->lock);
+        } else {
+                fsnotify_add_mark(new_entry, dnotify_group, inode);
+                spin_lock(&new_entry->lock);
+                entry = new_entry;
+                dnentry = new_dnentry;
+                /* we used new_entry, so don't free it */
+                new_entry = NULL;
+        }
-/*
+        rcu_read_lock();
- * This is hopelessly wrong, but unfixable without API changes.  At
+        f = fcheck(fd);
- * least it doesn't oops the kernel...
+        rcu_read_unlock();
- *
- * To safely access ->d_parent we need to keep d_move away from it.  Use the
- * dentry's d_lock for this.
- */
-void dnotify_parent(struct dentry *dentry, unsigned long event)
-{
-        struct dentry *parent;
-        if (!dir_notify_enable)
+        /* if (f != filp) means that we lost a race and another task/thread
-                return;
+         * actually closed the fd we are still playing with before we grabbed
+         * the dnotify_mark_mutex and entry->lock.  Since closing the fd is the
+         * only time we clean up the mark entries we need to get our mark off
+         * the list. */
+        if (f != filp) {
+                /* if we added ourselves, shoot ourselves, it's possible that
+                 * the flush actually did shoot this entry.  That's fine too
+                 * since multiple calls to destroy_mark is perfectly safe, if
+                 * we found a dnentry already attached to the inode, just sod
+                 * off silently as the flush at close time dealt with it.
+                 */
+                if (dnentry == new_dnentry)
+                        destroy = 1;
+                goto out;
+        }
-        spin_lock(&dentry->d_lock);
+        error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
-        parent = dentry->d_parent;
+        if (error) {
-        if (parent->d_inode->i_dnotify_mask & event) {
+                /* if we added, we must shoot */
-                dget(parent);
+                if (dnentry == new_dnentry)
-                spin_unlock(&dentry->d_lock);
+                        destroy = 1;
-                __inode_dir_notify(parent->d_inode, event);
+                goto out;
-                dput(parent);
-        } else {
-                spin_unlock(&dentry->d_lock);
        }
+        error = attach_dn(dn, dnentry, id, fd, filp, mask);
+        /* !error means that we attached the dn to the dnentry, so don't free it */
+        if (!error)
+                dn = NULL;
+        /* -EEXIST means that we didn't add this new dn and used an old one.
+         * that isn't an error (and the unused dn should be freed) */
+        else if (error == -EEXIST)
+                error = 0;
+        dnotify_recalc_inode_mask(entry);
+out:
+        spin_unlock(&entry->lock);
+        if (destroy)
+                fsnotify_destroy_mark_by_entry(entry);
+        fsnotify_recalc_group_mask(dnotify_group);
+        mutex_unlock(&dnotify_mark_mutex);
+        fsnotify_put_mark(entry);
+out_err:
+        if (new_entry)
+                fsnotify_put_mark(new_entry);
+        if (dn)
+                kmem_cache_free(dnotify_struct_cache, dn);
+        return error;
 }
-EXPORT_SYMBOL_GPL(dnotify_parent);
 static int __init dnotify_init(void)
 {
-        dn_cache = kmem_cache_create("dnotify_cache",
+        dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC);
-                sizeof(struct dnotify_struct), 0, SLAB_PANIC, NULL);
+        dnotify_mark_entry_cache = KMEM_CACHE(dnotify_mark_entry, SLAB_PANIC);
+        dnotify_group = fsnotify_obtain_group(DNOTIFY_GROUP_NUM,
+                                              0, &dnotify_fsnotify_ops);
+        if (IS_ERR(dnotify_group))
+                panic("unable to allocate fsnotify group for dnotify\n");
        return 0;
 }
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
new file mode 100644
index 000000000000..ec2f7bd76818
--- /dev/null
+++ b/fs/notify/fsnotify.c
@@ -0,0 +1,186 @@
+/*
+ *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/srcu.h>
+#include <linux/fsnotify_backend.h>
+#include "fsnotify.h"
+/*
+ * Clear all of the marks on an inode when it is being evicted from core
+ */
+void __fsnotify_inode_delete(struct inode *inode)
+{
+        fsnotify_clear_marks_by_inode(inode);
+}
+EXPORT_SYMBOL_GPL(__fsnotify_inode_delete);
+/*
+ * Given an inode, first check if we care what happens to our children.  Inotify
+ * and dnotify both tell their parents about events.  If we care about any event
+ * on a child we run all of our children and set a dentry flag saying that the
+ * parent cares.  Thus when an event happens on a child it can quickly tell
+ * if there is a need to find a parent and send the event to the parent.
+ *
+ * Does nothing for non-directory inodes.  When the inode does not watch its
+ * children the flag is cleared on each child instead of set.
+ */
+void __fsnotify_update_child_dentry_flags(struct inode *inode)
+{
+        struct dentry *alias;
+        int watched;
+        if (!S_ISDIR(inode->i_mode))
+                return;
+        /* determine if the children should tell inode about their events */
+        watched = fsnotify_inode_watches_children(inode);
+        spin_lock(&dcache_lock);
+        /* run all of the dentries associated with this inode.  Since this is a
+         * directory, there damn well better only be one item on this list */
+        list_for_each_entry(alias, &inode->i_dentry, d_alias) {
+                struct dentry *child;
+                /* run all of the children of the original inode and fix their
+                 * d_flags to indicate parental interest (their parent is the
+                 * original inode) */
+                list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
+                        if (!child->d_inode) /* skip children with no inode attached */
+                                continue;
+                        spin_lock(&child->d_lock);
+                        if (watched)
+                                child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
+                        else
+                                child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
+                        spin_unlock(&child->d_lock);
+                }
+        }
+        spin_unlock(&dcache_lock);
+}
+/*
+ * Notify this dentry's parent about a child's events.
+ *
+ * Returns immediately unless the dentry is flagged
+ * DCACHE_FSNOTIFY_PARENT_WATCHED.  If the parent inode watches its children
+ * and its i_fsnotify_mask overlaps @mask, the event is forwarded to the parent
+ * with FS_EVENT_ON_CHILD added.  If the parent no longer watches its children,
+ * the stale flag is cleared on all of the parent's children instead.
+ */
+void __fsnotify_parent(struct dentry *dentry, __u32 mask)
+{
+        struct dentry *parent;
+        struct inode *p_inode;
+        bool send = false;
+        bool should_update_children = false;
+        if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
+                return;
+        spin_lock(&dentry->d_lock);
+        parent = dentry->d_parent;
+        p_inode = parent->d_inode;
+        if (fsnotify_inode_watches_children(p_inode)) {
+                if (p_inode->i_fsnotify_mask & mask) {
+                        dget(parent); /* hold parent until the dput() after fsnotify() */
+                        send = true;
+                }
+        } else {
+                /*
+                 * The parent doesn't care about events on its children but
+                 * at least one child thought it did.  We need to run all the
+                 * children and update their d_flags to let them know p_inode
+                 * doesn't care about them any more.
+                 */
+                dget(parent); /* hold parent until the dput() after the flag update */
+                should_update_children = true;
+        }
+        spin_unlock(&dentry->d_lock);
+        if (send) {
+                /* we are notifying a parent so come up with the new mask which
+                 * specifies these are events which came from a child. */
+                mask |= FS_EVENT_ON_CHILD;
+                fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
+                         dentry->d_name.name, 0);
+                dput(parent);
+        }
+        if (unlikely(should_update_children)) {
+                __fsnotify_update_child_dentry_flags(p_inode);
+                dput(parent);
+        }
+}
+EXPORT_SYMBOL_GPL(__fsnotify_parent);
+/*
+ * This is the main call to fsnotify.  The VFS calls into hook specific functions
+ * in linux/fsnotify.h.  Those functions then in turn call here.  This function
+ * calls out to every registered fsnotify_group.  Those groups can then use the
+ * notification event in whatever means they feel necessary.
+ *
+ * @to_tell: inode the event is reported against; its i_fsnotify_mask is tested
+ * @mask:    event bits; FS_EVENT_ON_CHILD is ignored for the interest tests but
+ *           is passed through unchanged to the groups and to the event
+ * @data, @data_is, @file_name, @cookie: handed to fsnotify_create_event() as-is
+ *
+ * Returns without doing anything if there are no groups, or if neither the
+ * global fsnotify_mask nor the inode's mask contains any of the event bits.
+ */
+void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *file_name, u32 cookie)
+{
+        struct fsnotify_group *group;
+        struct fsnotify_event *event = NULL;
+        int idx;
+        /* global tests shouldn't care about events on child, only the specific event */
+        __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
+        if (list_empty(&fsnotify_groups))
+                return;
+        if (!(test_mask & fsnotify_mask))
+                return;
+        if (!(test_mask & to_tell->i_fsnotify_mask))
+                return;
+        /*
+         * SRCU!!  the groups list is very very much read only and the path is
+         * very hot.  The VAST majority of events are not going to need to do
+         * anything other than walk the list so it's crazy to pre-allocate.
+         */
+        idx = srcu_read_lock(&fsnotify_grp_srcu);
+        list_for_each_entry_rcu(group, &fsnotify_groups, group_list) {
+                if (test_mask & group->mask) {
+                        if (!group->ops->should_send_event(group, to_tell, mask))
+                                continue;
+                        if (!event) { /* create the event lazily, once, for the first interested group */
+                                event = fsnotify_create_event(to_tell, mask, data, data_is, file_name, cookie);
+                                /* event creation failed (OOM) so we can't
+                                 * deliver it; maybe someday someone else will
+                                 * want to do something here */
+                                if (!event)
+                                        break;
+                        }
+                        group->ops->handle_event(group, event);
+                }
+        }
+        srcu_read_unlock(&fsnotify_grp_srcu, idx);
+        /*
+         * fsnotify_create_event() took a reference so the event can't be cleaned
+         * up while we are still trying to add it to lists, drop that one.
+         */
+        if (event)
+                fsnotify_put_event(event);
+}
+EXPORT_SYMBOL_GPL(fsnotify);
+static __init int fsnotify_init(void)
+{
+        return init_srcu_struct(&fsnotify_grp_srcu);
+}
+subsys_initcall(fsnotify_init);
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
new file mode 100644
index 000000000000..4dc240824b2d
--- /dev/null
+++ b/fs/notify/fsnotify.h
@@ -0,0 +1,34 @@
+#ifndef __FS_NOTIFY_FSNOTIFY_H_
+#define __FS_NOTIFY_FSNOTIFY_H_
+#include <linux/list.h>
+#include <linux/fsnotify.h>
+#include <linux/srcu.h>
+#include <linux/types.h>
+/* protects reads of fsnotify_groups */
+extern struct srcu_struct fsnotify_grp_srcu;
+/* all groups which receive fsnotify events */
+extern struct list_head fsnotify_groups;
+/* bitwise OR of all event types (FS_*) for all fsnotify_groups */
+extern __u32 fsnotify_mask;
+/* destroy all events sitting in this group's notification queue */
+extern void fsnotify_flush_notify(struct fsnotify_group *group);
+/* final kfree of a group */
+extern void fsnotify_final_destroy_group(struct fsnotify_group *group);
+/* run the list of all marks associated with inode and flag them to be freed */
+extern void fsnotify_clear_marks_by_inode(struct inode *inode);
+/*
+ * update the dentry->d_flags of all of inode's children to indicate if inode cares
+ * about events that happen to its children.
+ */
+extern void __fsnotify_update_child_dentry_flags(struct inode *inode);
+/* allocate and destroy an event holder to attach events to notification/access queues */
+extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void);
+extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder);
+#endif  /* __FS_NOTIFY_FSNOTIFY_H_ */
diff --git a/fs/notify/group.c b/fs/notify/group.c
new file mode 100644
index 000000000000..0e1677144bc5
--- /dev/null
+++ b/fs/notify/group.c
@@ -0,0 +1,254 @@
+/*
+ *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/srcu.h>
+#include <linux/rculist.h>
+#include <linux/wait.h>
+#include <linux/fsnotify_backend.h>
+#include "fsnotify.h"
+#include <asm/atomic.h>
+/* protects writes to fsnotify_groups and fsnotify_mask */
+static DEFINE_MUTEX(fsnotify_grp_mutex);
+/* protects reads while running the fsnotify_groups list */
+struct srcu_struct fsnotify_grp_srcu;
+/* all groups registered to receive filesystem notifications */
+LIST_HEAD(fsnotify_groups);
+/* bitwise OR of all events (FS_*) interesting to some group on this system */
+__u32 fsnotify_mask;
+/*
+ * When a new group registers or changes its set of interesting events
+ * this recomputes fsnotify_mask as the OR of every listed group's mask.
+ */
+void fsnotify_recalc_global_mask(void)
+{
+        struct fsnotify_group *group;
+        __u32 mask = 0;
+        int idx;
+        idx = srcu_read_lock(&fsnotify_grp_srcu);
+        list_for_each_entry_rcu(group, &fsnotify_groups, group_list)
+                mask |= group->mask;
+        srcu_read_unlock(&fsnotify_grp_srcu, idx);
+        fsnotify_mask = mask; /* published after the SRCU read section ends */
+}
+/*
+ * Update the group->mask by running all of the marks associated with this
+ * group and finding the bitwise | of all of the mark->mask.  The list walk is
+ * done under group->mark_lock.  If the resulting mask differs from the old
+ * group->mask we also recalculate the global mask of interesting events.
+ */
+void fsnotify_recalc_group_mask(struct fsnotify_group *group)
+{
+        __u32 mask = 0;
+        __u32 old_mask = group->mask;
+        struct fsnotify_mark_entry *entry;
+        spin_lock(&group->mark_lock);
+        list_for_each_entry(entry, &group->mark_entries, g_list)
+                mask |= entry->mask;
+        spin_unlock(&group->mark_lock);
+        group->mask = mask; /* stored after mark_lock has been released */
+        if (old_mask != mask)
+                fsnotify_recalc_global_mask();
+}
+/*
+ * Take a reference (group->refcnt) on a group so that a group found while
+ * holding fsnotify_grp_mutex can't get freed under us.
+ */
+static void fsnotify_get_group(struct fsnotify_group *group)
+{
+        atomic_inc(&group->refcnt);
+}
+/*
+ * Final freeing of a group, done when group->num_marks drops to zero.
+ */
+void fsnotify_final_destroy_group(struct fsnotify_group *group)
+{
+        /* clear the notification queue of all events */
+        fsnotify_flush_notify(group);
+        if (group->ops->free_group_priv) /* optional backend callback */
+                group->ops->free_group_priv(group);
+        kfree(group);
+}
+/*
+ * Trying to get rid of a group.  We need to first get rid of any outstanding
+ * allocations and then free the group.  Remember that fsnotify_clear_marks_by_group
+ * could miss marks that are being freed by inode and those marks could still
+ * hold a reference to this group (via group->num_marks).  If we get into that
+ * situation, fsnotify_final_destroy_group will get called when that final
+ * mark is freed.
+ */
+static void fsnotify_destroy_group(struct fsnotify_group *group)
+{
+        /* clear all inode mark entries for this group */
+        fsnotify_clear_marks_by_group(group);
+        /* past the point of no return, matches the initial value of 1 */
+        if (atomic_dec_and_test(&group->num_marks))
+                fsnotify_final_destroy_group(group);
+}
+/*
+ * Remove this group from the global list of groups that will get events.
+ * This can be done even if there are still references and things still using
+ * this group; it just stops new events.  Caller holds fsnotify_grp_mutex.
+ */
+static void __fsnotify_evict_group(struct fsnotify_group *group)
+{
+        BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex));
+        if (group->on_group_list) /* only unlink a group that is actually listed */
+                list_del_rcu(&group->group_list);
+        group->on_group_list = 0;
+}
+/*
+ * Called when a group is no longer interested in getting events.  This can be
+ * used if a group is misbehaving or if for some reason a group should no longer
+ * get any filesystem events.  Takes fsnotify_grp_mutex around the removal.
+ */
+void fsnotify_evict_group(struct fsnotify_group *group)
+{
+        mutex_lock(&fsnotify_grp_mutex);
+        __fsnotify_evict_group(group);
+        mutex_unlock(&fsnotify_grp_mutex);
+}
+/*
+ * Drop a reference to a group.  On the final put, evict and destroy the group.
+ */
+void fsnotify_put_group(struct fsnotify_group *group)
+{
+        if (!atomic_dec_and_mutex_lock(&group->refcnt, &fsnotify_grp_mutex))
+                return;
+        /*
+         * OK, now we know that there are no other users *and* we hold the
+         * mutex, so no new references will appear
+         */
+        __fsnotify_evict_group(group);
+        /*
+         * now it's off the list, so the only thing we might care about is
+         * srcu access....
+         */
+        mutex_unlock(&fsnotify_grp_mutex);
+        synchronize_srcu(&fsnotify_grp_srcu); /* wait for SRCU readers of the group list */
+        /* and now it is really dead. _Nothing_ could be seeing it */
+        fsnotify_recalc_global_mask();
+        fsnotify_destroy_group(group);
+}
+/*
+ * Run the fsnotify_groups list (caller holds fsnotify_grp_mutex) looking for
+ * group_num.  Returns NULL if absent, the group with a reference taken if mask
+ * and ops also match, or ERR_PTR(-EEXIST) if group_num exists but they differ.
+ */
+static struct fsnotify_group *fsnotify_find_group(unsigned int group_num, __u32 mask,
+                                                  const struct fsnotify_ops *ops)
+{
+        struct fsnotify_group *group_iter;
+        struct fsnotify_group *group = NULL;
+        BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex));
+        list_for_each_entry_rcu(group_iter, &fsnotify_groups, group_list) {
+                if (group_iter->group_num == group_num) {
+                        if ((group_iter->mask == mask) &&
+                            (group_iter->ops == ops)) {
+                                fsnotify_get_group(group_iter); /* reference handed to the caller */
+                                group = group_iter;
+                        } else
+                                group = ERR_PTR(-EEXIST);
+                }
+        }
+        return group;
+}
+/*
+ * Either finds an existing group which matches the group_num, mask, and ops or
+ * creates a new group and adds it to the global group list.  Returns the group
+ * with a reference held, or ERR_PTR(-ENOMEM) / ERR_PTR(-EEXIST) on failure.
+ */
+struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask,
+                                             const struct fsnotify_ops *ops)
+{
+        struct fsnotify_group *group, *tgroup;
+        /* very low use, simpler locking if we just always alloc */
+        group = kmalloc(sizeof(struct fsnotify_group), GFP_KERNEL);
+        if (!group)
+                return ERR_PTR(-ENOMEM);
+        atomic_set(&group->refcnt, 1);
+        group->on_group_list = 0;
+        group->group_num = group_num;
+        group->mask = mask;
+        mutex_init(&group->notification_mutex);
+        INIT_LIST_HEAD(&group->notification_list);
+        init_waitqueue_head(&group->notification_waitq);
+        group->q_len = 0;
+        group->max_events = UINT_MAX;
+        spin_lock_init(&group->mark_lock);
+        atomic_set(&group->num_marks, 0);
+        INIT_LIST_HEAD(&group->mark_entries);
+        group->ops = ops;
+        mutex_lock(&fsnotify_grp_mutex);
+        tgroup = fsnotify_find_group(group_num, mask, ops);
+        if (tgroup) {
+                /* group already exists, or group_num clashes (ERR_PTR) */
+                mutex_unlock(&fsnotify_grp_mutex);
+                /* never published; put_group would drop num_marks 0 -> -1 and leak */
+                kfree(group);
+                return tgroup;
+        }
+        /* group not found, add a new one */
+        list_add_rcu(&group->group_list, &fsnotify_groups);
+        group->on_group_list = 1;
+        /* being on the fsnotify_groups list holds one num_marks */
+        atomic_inc(&group->num_marks);
+        mutex_unlock(&fsnotify_grp_mutex);
+        if (mask)
+                fsnotify_recalc_global_mask();
+        return group;
+}
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
new file mode 100644
index 000000000000..c8a07c65482b
--- /dev/null
+++ b/fs/notify/inode_mark.c
@@ -0,0 +1,426 @@
+/*
+ *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+/*
+ * fsnotify inode mark locking/lifetime/and refcnting
+ *
+ * REFCNT:
+ * The mark->refcnt tells how many "things" in the kernel currently are
+ * referencing this object.  The object typically will live inside the kernel
+ * with a refcnt of 2, one for each list it is on (i_list, g_list).  Any task
+ * which can find this object holding the appropriate locks, can take a reference
+ * and the object itself is guaranteed to survive until the reference is dropped.
+ *
+ * LOCKING:
+ * There are 3 spinlocks involved with fsnotify inode marks and they MUST
+ * be taken in order as follows:
+ *
+ * entry->lock
+ * group->mark_lock
+ * inode->i_lock
+ *
+ * entry->lock protects 2 things, entry->group and entry->inode.  You must hold
+ * that lock to dereference either of these things (they could be NULL even with
+ * the lock)
+ *
+ * group->mark_lock protects the mark_entries list anchored inside a given group
+ * and each entry is hooked via the g_list.  It also sorta protects the
+ * free_g_list, which when used is anchored by a private list on the stack of the
+ * task which held the group->mark_lock.
+ *
+ * inode->i_lock protects the i_fsnotify_mark_entries list anchored inside a
+ * given inode and each entry is hooked via the i_list. (and sorta the
+ * free_i_list)
+ *
+ *
+ * LIFETIME:
+ * Inode marks survive between when they are added to an inode and when their
+ * refcnt==0.
+ *
+ * The inode mark can be cleared for a number of different reasons including:
+ * - The inode is unlinked for the last time.  (fsnotify_inode_remove)
+ * - The inode is being evicted from cache. (fsnotify_inode_delete)
+ * - The fs the inode is on is unmounted.  (fsnotify_inode_delete/fsnotify_unmount_inodes)
+ * - Something explicitly requests that it be removed.  (fsnotify_destroy_mark_by_entry)
+ * - The fsnotify_group associated with the mark is going away and all such marks
+ *   need to be cleaned up. (fsnotify_clear_marks_by_group)
+ *
+ * Worst case we are given an inode and need to clean up all the marks on that
+ * inode.  We take i_lock and walk the i_fsnotify_mark_entries safely.  For each
+ * mark on the list we take a reference (so the mark can't disappear under us).
+ * We remove that mark from the inode's list of marks and we add this mark to a
+ * private list anchored on the stack using free_i_list.  At this point we no
+ * longer fear anything finding the mark using the inode's list of marks.
+ *
+ * We can safely and locklessly run the private list on the stack of everything
+ * we just unattached from the original inode.  For each mark on the private list
+ * we grab the mark->lock and can thus dereference mark->group and mark->inode.  If
+ * we see the group and inode are not NULL we take those locks.  Now holding all
+ * 3 locks we can completely remove the mark from other tasks finding it in the
+ * future.  Remember, 10 things might already be referencing this mark, but they
+ * better be holding a ref.  We drop our reference we took before we unhooked it
+ * from the inode.  When the ref hits 0 we can free the mark.
+ *
+ * Very similarly for freeing by group, except we use free_g_list.
+ *
+ * This has the very interesting property of being able to run concurrently with
+ * any (or all) other directions.
+ */
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/writeback.h> /* for inode_lock */
+#include <asm/atomic.h>
+#include <linux/fsnotify_backend.h>
+#include "fsnotify.h"
+void fsnotify_get_mark(struct fsnotify_mark_entry *entry)
+{
+        atomic_inc(&entry->refcnt); /* take one reference on the mark */
+}
+void fsnotify_put_mark(struct fsnotify_mark_entry *entry)
+{
+        if (atomic_dec_and_test(&entry->refcnt)) /* last reference dropped */
+                entry->free_mark(entry); /* callback installed by fsnotify_init_mark() */
+}
+/*
+ * Recalculate the mask of events relevant to a given inode; i_lock held.
+ */
+static void fsnotify_recalc_inode_mask_locked(struct inode *inode)
+{
+        struct fsnotify_mark_entry *entry;
+        struct hlist_node *pos;
+        __u32 new_mask = 0;
+        assert_spin_locked(&inode->i_lock);
+        hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list)
+                new_mask |= entry->mask; /* OR together every mark on this inode */
+        inode->i_fsnotify_mask = new_mask;
+}
+/*
+ * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types
+ * any notifier wants for this inode, then refresh the children's dentry flags.
+ */
+void fsnotify_recalc_inode_mask(struct inode *inode)
+{
+        spin_lock(&inode->i_lock);
+        fsnotify_recalc_inode_mask_locked(inode);
+        spin_unlock(&inode->i_lock);
+        __fsnotify_update_child_dentry_flags(inode); /* called after i_lock is dropped */
+}
+/*
+ * Any time a mark is getting freed we end up here.
+ * The caller had better be holding a reference to this mark so we don't actually
+ * do the final put under the entry->lock.
+ */
+void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry)
+{
+        struct fsnotify_group *group;
+        struct inode *inode;
+        spin_lock(&entry->lock);
+        group = entry->group;
+        inode = entry->inode;
+        BUG_ON(group && !inode);
+        BUG_ON(!group && inode);
+        /* if !group something else already marked this to die */
+        if (!group) {
+                spin_unlock(&entry->lock);
+                return;
+        }
+        /* 1 from caller and 1 for being on i_list/g_list */
+        BUG_ON(atomic_read(&entry->refcnt) < 2);
+        spin_lock(&group->mark_lock);
+        spin_lock(&inode->i_lock);
+        hlist_del_init(&entry->i_list);
+        entry->inode = NULL;
+        list_del_init(&entry->g_list);
+        entry->group = NULL;
+        fsnotify_put_mark(entry); /* for i_list and g_list */
+        /*
+         * this mark is now off the inode->i_fsnotify_mark_entries list and we
+         * hold the inode->i_lock, so this is the perfect time to update the
+         * inode->i_fsnotify_mask
+         */
+        fsnotify_recalc_inode_mask_locked(inode);
+        spin_unlock(&inode->i_lock);
+        spin_unlock(&group->mark_lock);
+        spin_unlock(&entry->lock);
+        /*
+         * Some groups like to know that marks are being freed.  This is a
+         * callback to the group function to let it know that this entry
+         * is being freed.
+         */
+        if (group->ops->freeing_mark)
+                group->ops->freeing_mark(entry, group);
+        /*
+         * __fsnotify_update_child_dentry_flags(inode);
+         *
+         * I really want to call that, but we can't, we have no idea if the inode
+         * still exists the second we drop the entry->lock.
+         *
+         * The next time an event arrives at this inode from one of its children
+         * __fsnotify_parent will see that the inode doesn't care about its
+         * children and will update all of these flags then.  So really this
+         * is just a lazy update (and could be a perf win...)
+         */
+        iput(inode);
+        /*
+         * it's possible that this group tried to destroy itself, but
+         * this mark was simultaneously being freed by inode.  If that's the
+         * case, we finish freeing the group here.
+         */
+        if (unlikely(atomic_dec_and_test(&group->num_marks)))
+                fsnotify_final_destroy_group(group);
+}
+/*
+ * Given a group, move its marks to a private list, then destroy each one.
+ */
+void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
+{
+        struct fsnotify_mark_entry *lentry, *entry;
+        LIST_HEAD(free_list);
+        spin_lock(&group->mark_lock);
+        list_for_each_entry_safe(entry, lentry, &group->mark_entries, g_list) {
+                list_add(&entry->free_g_list, &free_list);
+                list_del_init(&entry->g_list);
+                fsnotify_get_mark(entry); /* dropped again after the destroy below */
+        }
+        spin_unlock(&group->mark_lock);
+        list_for_each_entry_safe(entry, lentry, &free_list, free_g_list) {
+                fsnotify_destroy_mark_by_entry(entry);
+                fsnotify_put_mark(entry);
+        }
+}
+/*
+ * Given an inode, move its marks to a private list, then destroy each one.
+ */
+void fsnotify_clear_marks_by_inode(struct inode *inode)
+{
+        struct fsnotify_mark_entry *entry, *lentry;
+        struct hlist_node *pos, *n;
+        LIST_HEAD(free_list);
+        spin_lock(&inode->i_lock);
+        hlist_for_each_entry_safe(entry, pos, n, &inode->i_fsnotify_mark_entries, i_list) {
+                list_add(&entry->free_i_list, &free_list);
+                hlist_del_init(&entry->i_list);
+                fsnotify_get_mark(entry); /* dropped again after the destroy below */
+        }
+        spin_unlock(&inode->i_lock);
+        list_for_each_entry_safe(entry, lentry, &free_list, free_i_list) {
+                fsnotify_destroy_mark_by_entry(entry);
+                fsnotify_put_mark(entry);
+        }
+}
+/*
+ * Given a group and inode, find the mark associated with that combination.
+ * If found, take a reference and return it, else NULL.  Needs inode->i_lock.
+ */
+struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group,
+                                                     struct inode *inode)
+{
+        struct fsnotify_mark_entry *entry;
+        struct hlist_node *pos;
+        assert_spin_locked(&inode->i_lock);
+        hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) {
+                if (entry->group == group) {
+                        fsnotify_get_mark(entry); /* caller must fsnotify_put_mark() it */
+                        return entry;
+                }
+        }
+        return NULL;
+}
+/*
+ * Initialize a mark: refcnt 1, no group/inode/mask, and its free callback.
+ */
+void fsnotify_init_mark(struct fsnotify_mark_entry *entry,
+                        void (*free_mark)(struct fsnotify_mark_entry *entry))
+{
+        spin_lock_init(&entry->lock);
+        atomic_set(&entry->refcnt, 1);
+        INIT_HLIST_NODE(&entry->i_list);
+        entry->group = NULL;
+        entry->mask = 0;
+        entry->inode = NULL;
+        entry->free_mark = free_mark; /* invoked by fsnotify_put_mark() on the last put */
+}
+/*
+ * Attach an initialized mark entry to a given group and inode.  Returns 0 on
+ * success, -EINVAL if the inode can't be grabbed, or -EEXIST if the group
+ * already has a mark on this inode (entry is then left fully unattached).
+ */
+int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
+                      struct fsnotify_group *group, struct inode *inode)
+{
+        struct fsnotify_mark_entry *lentry;
+        int ret = 0;
+        inode = igrab(inode);
+        if (unlikely(!inode))
+                return -EINVAL;
+        /*
+         * LOCKING ORDER!!!!
+         * entry->lock
+         * group->mark_lock
+         * inode->i_lock
+         */
+        spin_lock(&entry->lock);
+        spin_lock(&group->mark_lock);
+        spin_lock(&inode->i_lock);
+        lentry = fsnotify_find_mark_entry(group, inode);
+        if (!lentry) {
+                entry->group = group; /* set only once really attached, so that */
+                entry->inode = inode; /* destroy_mark_by_entry can trust them */
+                hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries);
+                list_add(&entry->g_list, &group->mark_entries);
+                fsnotify_get_mark(entry); /* for i_list and g_list */
+                atomic_inc(&group->num_marks);
+                fsnotify_recalc_inode_mask_locked(inode);
+        }
+        spin_unlock(&inode->i_lock);
+        spin_unlock(&group->mark_lock);
+        spin_unlock(&entry->lock);
+        if (lentry) {
+                ret = -EEXIST;
+                iput(inode); /* drop the igrab() reference taken above */
+                fsnotify_put_mark(lentry); /* drop the ref from fsnotify_find_mark_entry() */
+        } else {
+                __fsnotify_update_child_dentry_flags(inode);
+        }
+        return ret;
+}
+/**
+ * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
+ * @list: list of inodes being unmounted (sb->s_inodes)
+ *
+ * Called with inode_lock held, protecting the unmounting super block's list
+ * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay.
+ * inode_lock is dropped around each notification (which CAN block) and retaken.
+ */
+void fsnotify_unmount_inodes(struct list_head *list)
+{
+        struct inode *inode, *next_i, *need_iput = NULL;
+        list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
+                struct inode *need_iput_tmp;
+                /*
+                 * We cannot __iget() an inode in state I_CLEAR, I_FREEING,
+                 * I_WILL_FREE, or I_NEW which is fine because by that point
+                 * the inode cannot have any associated watches.
+                 */
+                if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW))
+                        continue;
+                /*
+                 * If i_count is zero, the inode cannot have any watches and
+                 * doing an __iget/iput with MS_ACTIVE clear would actually
+                 * evict all inodes with zero i_count from icache which is
+                 * unnecessarily violent and may in fact be illegal to do.
+                 */
+                if (!atomic_read(&inode->i_count))
+                        continue;
+                need_iput_tmp = need_iput; /* reference carried over from an earlier pass, if any */
+                need_iput = NULL;
+                /* In case fsnotify_inode_delete() drops a reference. */
+                if (inode != need_iput_tmp)
+                        __iget(inode);
+                else
+                        need_iput_tmp = NULL; /* carried reference is on this inode: keep it */
+                /* In case the dropping of a reference would nuke next_i. */
+                if ((&next_i->i_sb_list != list) &&
+                    atomic_read(&next_i->i_count) &&
+                    !(next_i->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))) {
+                        __iget(next_i);
+                        need_iput = next_i;
+                }
+                /*
+                 * We can safely drop inode_lock here because we hold
+                 * references on both inode and next_i.  Also no new inodes
+                 * will be added since the umount has begun.  Finally,
+                 * iprune_mutex keeps shrink_icache_memory() away.
+                 */
+                spin_unlock(&inode_lock);
+                if (need_iput_tmp)
+                        iput(need_iput_tmp);
+                /* for each watch, send FS_UNMOUNT and then remove it */
+                fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+                fsnotify_inode_delete(inode);
+                iput(inode);
+                spin_lock(&inode_lock);
+        }
+}
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index 446792841023..5356884289a1 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -1,26 +1,30 @@
 config INOTIFY
        bool "Inotify file change notification support"
-        default y
+        default n
        ---help---
-          Say Y here to enable inotify support.  Inotify is a file change
+          Say Y here to enable legacy in kernel inotify support.  Inotify is a
-          notification system and a replacement for dnotify.  Inotify fixes
+          file change notification system.  It is a replacement for dnotify.
-          numerous shortcomings in dnotify and introduces several new features
+          This option only provides the legacy inotify in kernel API.  There
-          including multiple file events, one-shot support, and unmount
+          are no in tree kernel users of this interface since it is deprecated.
-          notification.
+          You only need this if you are loading an out of tree kernel module
+          that uses inotify.
          For more information, see <file:Documentation/filesystems/inotify.txt>
-          If unsure, say Y.
+          If unsure, say N.
 config INOTIFY_USER
        bool "Inotify support for userspace"
-        depends on INOTIFY
+        depends on FSNOTIFY
        default y
        ---help---
          Say Y here to enable inotify support for userspace, including the
          associated system calls.  Inotify allows monitoring of both files and
          directories via a single open fd.  Events are read from the file
          descriptor, which is also select()- and poll()-able.
+          Inotify fixes numerous shortcomings in dnotify and introduces several
+          new features including multiple file events, one-shot support, and
+          unmount notification.
          For more information, see <file:Documentation/filesystems/inotify.txt>
diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile
index e290f3bb9d8d..943828171362 100644
--- a/fs/notify/inotify/Makefile
+++ b/fs/notify/inotify/Makefile
@@ -1,2 +1,2 @@
 obj-$(CONFIG_INOTIFY)           += inotify.o
-obj-$(CONFIG_INOTIFY_USER)      += inotify_user.o
+obj-$(CONFIG_INOTIFY_USER)      += inotify_fsnotify.o inotify_user.o
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
index 220c13f0d73d..40b1cf914ccb 100644
--- a/fs/notify/inotify/inotify.c
+++ b/fs/notify/inotify/inotify.c
@@ -32,6 +32,7 @@
 #include <linux/list.h>
 #include <linux/writeback.h>
 #include <linux/inotify.h>
+#include <linux/fsnotify_backend.h>
 static atomic_t inotify_cookie;
@@ -905,6 +906,25 @@ EXPORT_SYMBOL_GPL(inotify_rm_watch);
 */
 static int __init inotify_setup(void)
 {
+        BUILD_BUG_ON(IN_ACCESS != FS_ACCESS);
+        BUILD_BUG_ON(IN_MODIFY != FS_MODIFY);
+        BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB);
+        BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE);
+        BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
+        BUILD_BUG_ON(IN_OPEN != FS_OPEN);
+        BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM);
+        BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO);
+        BUILD_BUG_ON(IN_CREATE != FS_CREATE);
+        BUILD_BUG_ON(IN_DELETE != FS_DELETE);
+        BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF);
+        BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF);
+        BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
+        BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT);
+        BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR);
+        BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
+        BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
        atomic_set(&inotify_cookie, 0);
        return 0;
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
new file mode 100644
index 000000000000..ea2605a58b8a
--- /dev/null
+++ b/fs/notify/inotify/inotify.h
@@ -0,0 +1,21 @@
+#include <linux/fsnotify_backend.h>
+#include <linux/inotify.h>
+#include <linux/slab.h> /* struct kmem_cache */
+extern struct kmem_cache *event_priv_cachep;
+struct inotify_event_private_data {
+        struct fsnotify_event_private_data fsnotify_event_priv_data;
+        int wd; /* presumably the inotify watch descriptor for this event - TODO confirm */
+};
+struct inotify_inode_mark_entry {
+        /* fsnotify_mark_entry MUST be the first thing; recovered via container_of() */
+        struct fsnotify_mark_entry fsn_entry;
+        int wd; /* read by inotify_handle_event(); presumably the watch descriptor */
+};
+extern void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group);
+extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv);
+extern const struct fsnotify_ops inotify_fsnotify_ops;
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
new file mode 100644
index 000000000000..7ef75b83247e
--- /dev/null
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -0,0 +1,138 @@
+/*
+ * fs/notify/inotify/inotify_fsnotify.c - inotify support for userspace
+ *
+ * Authors:
+ *      John McCutchan  <ttb@tentacle.dhs.org>
+ *      Robert Love     <rml@novell.com>
+ *
+ * Copyright (C) 2005 John McCutchan
+ * Copyright 2006 Hewlett-Packard Development Company, L.P.
+ *
+ * Copyright (C) 2009 Eric Paris <Red Hat Inc>
+ * inotify was largely rewritten to make use of the fsnotify infrastructure
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/fs.h> /* struct inode */
+#include <linux/fsnotify_backend.h>
+#include <linux/inotify.h>
+#include <linux/path.h> /* struct path */
+#include <linux/slab.h> /* kmem_* */
+#include <linux/types.h>
+#include "inotify.h"
+/*
+ * fsnotify ->handle_event callback for inotify.
+ *
+ * Looks up this group's mark on event->to_tell, allocates inotify private
+ * data carrying the mark's watch descriptor and asks fsnotify to queue the
+ * event on the group.
+ *
+ * Returns 0 when no mark is found, -ENOMEM when the private data cannot be
+ * allocated, and otherwise the result of fsnotify_add_notify_event() with
+ * -EEXIST mapped to 0.
+ */
+static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_event *event)
+{
+        struct fsnotify_mark_entry *entry;
+        struct inotify_inode_mark_entry *ientry;
+        struct inode *to_tell;
+        struct inotify_event_private_data *event_priv;
+        struct fsnotify_event_private_data *fsn_event_priv;
+        int wd, ret;
+        to_tell = event->to_tell;
+        /* the find takes a reference on the mark; it is dropped at out_put_mark */
+        spin_lock(&to_tell->i_lock);
+        entry = fsnotify_find_mark_entry(group, to_tell);
+        spin_unlock(&to_tell->i_lock);
+        /* race with watch removal?  We already passed should_send */
+        if (unlikely(!entry))
+                return 0;
+        ientry = container_of(entry, struct inotify_inode_mark_entry,
+                              fsn_entry);
+        wd = ientry->wd;
+        event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL);
+        if (unlikely(!event_priv)) {
+                /* do not leak the mark reference on allocation failure */
+                ret = -ENOMEM;
+                goto out_put_mark;
+        }
+        fsn_event_priv = &event_priv->fsnotify_event_priv_data;
+        fsn_event_priv->group = group;
+        event_priv->wd = wd;
+        ret = fsnotify_add_notify_event(group, event, fsn_event_priv);
+        /* EEXIST is not an error */
+        if (ret == -EEXIST)
+                ret = 0;
+        /* did event_priv get attached? */
+        if (list_empty(&fsn_event_priv->event_list))
+                inotify_free_event_priv(fsn_event_priv);
+out_put_mark:
+        /*
+         * If we hold the entry until after the event is on the queue
+         * IN_IGNORED won't be able to pass this event in the queue
+         */
+        fsnotify_put_mark(entry);
+        return ret;
+}
+/*
+ * fsnotify ->freeing_mark callback for inotify: hands the mark entry and its
+ * group straight to inotify_destroy_mark_entry().
+ */
+static void inotify_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group)
+{
+        inotify_destroy_mark_entry(entry, group);
+}
+/*
+ * fsnotify ->should_send_event callback for inotify.
+ *
+ * Returns true when this group has a mark on the inode whose mask shares at
+ * least one bit with the event mask (FS_EVENT_ON_CHILD excluded), and false
+ * when there is no mark or no bit in common.
+ */
+static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, __u32 mask)
+{
+        struct fsnotify_mark_entry *mark;
+        bool wanted;
+        /* the lookup is done under the inode's i_lock */
+        spin_lock(&inode->i_lock);
+        mark = fsnotify_find_mark_entry(group, inode);
+        spin_unlock(&inode->i_lock);
+        if (mark == NULL)
+                return false;
+        /* FS_EVENT_ON_CHILD is stripped before comparing against the mark */
+        wanted = (mark->mask & (mask & ~FS_EVENT_ON_CHILD)) != 0;
+        /* drop the reference the find took */
+        fsnotify_put_mark(mark);
+        return wanted;
+}
+/*
+ * idr_for_each() callback used by inotify_free_group_priv().  Being called
+ * at all means the idr still holds an entry, which is treated as a bug.
+ */
+static int idr_callback(int id, void *p, void *data)
+{
+        BUG();
+        return 0;
+}
+/*
+ * fsnotify ->free_group_priv callback for inotify: tears down the idr kept
+ * in the group's inotify_data.
+ */
+static void inotify_free_group_priv(struct fsnotify_group *group)
+{
+        struct idr *idp = &group->inotify_data.idr;
+        /* the idr should already be empty; any leftover entry hits the BUG in idr_callback */
+        idr_for_each(idp, idr_callback, NULL);
+        idr_remove_all(idp);
+        idr_destroy(idp);
+}
+/*
+ * Free inotify's per-event private data.  The argument is the fsnotify part
+ * embedded in a struct inotify_event_private_data; the containing object is
+ * returned to event_priv_cachep.
+ */
+void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv)
+{
+        kmem_cache_free(event_priv_cachep,
+                        container_of(fsn_event_priv,
+                                     struct inotify_event_private_data,
+                                     fsnotify_event_priv_data));
+}
+/* the fsnotify callbacks implemented by inotify in this file */
+const struct fsnotify_ops inotify_fsnotify_ops = {
+        .handle_event = inotify_handle_event,
+        .should_send_event = inotify_should_send_event,
+        .free_group_priv = inotify_free_group_priv,
+        .free_event_priv = inotify_free_event_priv,
+        .freeing_mark = inotify_freeing_mark,
+};
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 1634319e2404..982a412ac5bc 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -8,6 +8,9 @@
 * Copyright (C) 2005 John McCutchan
 * Copyright 2006 Hewlett-Packard Development Company, L.P.
 *
+ * Copyright (C) 2009 Eric Paris <Red Hat Inc>
+ * inotify was largely rewritten to make use of the fsnotify infrastructure
+ *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2, or (at your option) any
@@ -19,94 +22,48 @@
 * General Public License for more details.
 */
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
 #include <linux/file.h>
-#include <linux/mount.h>
+#include <linux/fs.h> /* struct inode */
-#include <linux/namei.h>
+#include <linux/fsnotify_backend.h>
-#include <linux/poll.h>
+#include <linux/idr.h>
-#include <linux/init.h>
+#include <linux/init.h> /* module_init */
-#include <linux/list.h>
 #include <linux/inotify.h>
+#include <linux/kernel.h> /* roundup() */
+#include <linux/magic.h> /* superblock magic number */
+#include <linux/mount.h> /* mntget */
+#include <linux/namei.h> /* LOOKUP_FOLLOW */
+#include <linux/path.h> /* struct path */
+#include <linux/sched.h> /* struct user */
+#include <linux/slab.h> /* struct kmem_cache */
 #include <linux/syscalls.h>
-#include <linux/magic.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/poll.h>
+#include <linux/wait.h>
-#include <asm/ioctls.h>
+#include "inotify.h"
-static struct kmem_cache *watch_cachep __read_mostly;
+#include <asm/ioctls.h>
-static struct kmem_cache *event_cachep __read_mostly;
 static struct vfsmount *inotify_mnt __read_mostly;
+/* this just sits here and wastes global memory.  used to just pad userspace messages with zeros */
+static struct inotify_event nul_inotify_event;
 /* these are configurable via /proc/sys/fs/inotify/ */
 static int inotify_max_user_instances __read_mostly;
-static int inotify_max_user_watches __read_mostly;
 static int inotify_max_queued_events __read_mostly;
+int inotify_max_user_watches __read_mostly;
-/*
+static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
- * Lock ordering:
+struct kmem_cache *event_priv_cachep __read_mostly;
- *
+static struct fsnotify_event *inotify_ignored_event;
- * inotify_dev->up_mutex (ensures we don't re-add the same watch)
- *      inode->inotify_mutex (protects inode's watch list)
- *              inotify_handle->mutex (protects inotify_handle's watch list)
- *                      inotify_dev->ev_mutex (protects device's event queue)
- */
 /*
- * Lifetimes of the main data structures:
+ * When inotify registers a new group it increments this and uses that
- *
+ * value as an offset to set the fsnotify group "name" and priority.
- * inotify_device: Lifetime is managed by reference count, from
- * sys_inotify_init() until release.  Additional references can bump the count
- * via get_inotify_dev() and drop the count via put_inotify_dev().
- *
- * inotify_user_watch: Lifetime is from create_watch() to the receipt of an
- * IN_IGNORED event from inotify, or when using IN_ONESHOT, to receipt of the
- * first event, or to inotify_destroy().
 */
+static atomic_t inotify_grp_num;
-/*
- * struct inotify_device - represents an inotify instance
- *
- * This structure is protected by the mutex 'mutex'.
- */
-struct inotify_device {
-        wait_queue_head_t       wq;             /* wait queue for i/o */
-        struct mutex            ev_mutex;       /* protects event queue */
-        struct mutex            up_mutex;       /* synchronizes watch updates */
-        struct list_head        events;         /* list of queued events */
-        struct user_struct      *user;          /* user who opened this dev */
-        struct inotify_handle   *ih;            /* inotify handle */
-        struct fasync_struct    *fa;            /* async notification */
-        atomic_t                count;          /* reference count */
-        unsigned int            queue_size;     /* size of the queue (bytes) */
-        unsigned int            event_count;    /* number of pending events */
-        unsigned int            max_events;     /* maximum number of events */
-};
-/*
- * struct inotify_kernel_event - An inotify event, originating from a watch and
- * queued for user-space.  A list of these is attached to each instance of the
- * device.  In read(), this list is walked and all events that can fit in the
- * buffer are returned.
- *
- * Protected by dev->ev_mutex of the device in which we are queued.
- */
-struct inotify_kernel_event {
-        struct inotify_event    event;  /* the user-space event */
-        struct list_head        list;   /* entry in inotify_device's list */
-        char                    *name;  /* filename, if any */
-};
-/*
- * struct inotify_user_watch - our version of an inotify_watch, we add
- * a reference to the associated inotify_device.
- */
-struct inotify_user_watch {
-        struct inotify_device   *dev;   /* associated device */
-        struct inotify_watch    wdata;  /* inotify watch data */
-};
 #ifdef CONFIG_SYSCTL
@@ -149,280 +106,36 @@ ctl_table inotify_table[] = {
 };
 #endif /* CONFIG_SYSCTL */
-static inline void get_inotify_dev(struct inotify_device *dev)
+static inline __u32 inotify_arg_to_mask(u32 arg)
-{
-        atomic_inc(&dev->count);
-}
-static inline void put_inotify_dev(struct inotify_device *dev)
-{
-        if (atomic_dec_and_test(&dev->count)) {
-                atomic_dec(&dev->user->inotify_devs);
-                free_uid(dev->user);
-                kfree(dev);
-        }
-}
-/*
- * free_inotify_user_watch - cleans up the watch and its references
- */
-static void free_inotify_user_watch(struct inotify_watch *w)
-{
-        struct inotify_user_watch *watch;
-        struct inotify_device *dev;
-        watch = container_of(w, struct inotify_user_watch, wdata);
-        dev = watch->dev;
-        atomic_dec(&dev->user->inotify_watches);
-        put_inotify_dev(dev);
-        kmem_cache_free(watch_cachep, watch);
-}
-/*
- * kernel_event - create a new kernel event with the given parameters
- *
- * This function can sleep.
- */
-static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie,
-                                                  const char *name)
-{
-        struct inotify_kernel_event *kevent;
-        kevent = kmem_cache_alloc(event_cachep, GFP_NOFS);
-        if (unlikely(!kevent))
-                return NULL;
-        /* we hand this out to user-space, so zero it just in case */
-        memset(&kevent->event, 0, sizeof(struct inotify_event));
-        kevent->event.wd = wd;
-        kevent->event.mask = mask;
-        kevent->event.cookie = cookie;
-        INIT_LIST_HEAD(&kevent->list);
-        if (name) {
-                size_t len, rem, event_size = sizeof(struct inotify_event);
-                /*
-                 * We need to pad the filename so as to properly align an
-                 * array of inotify_event structures.  Because the structure is
-                 * small and the common case is a small filename, we just round
-                 * up to the next multiple of the structure's sizeof.  This is
-                 * simple and safe for all architectures.
-                 */
-                len = strlen(name) + 1;
-                rem = event_size - len;
-                if (len > event_size) {
-                        rem = event_size - (len % event_size);
-                        if (len % event_size == 0)
-                                rem = 0;
-                }
-                kevent->name = kmalloc(len + rem, GFP_NOFS);
-                if (unlikely(!kevent->name)) {
-                        kmem_cache_free(event_cachep, kevent);
-                        return NULL;
-                }
-                memcpy(kevent->name, name, len);
-                if (rem)
-                        memset(kevent->name + len, 0, rem);
-                kevent->event.len = len + rem;
-        } else {
-                kevent->event.len = 0;
-                kevent->name = NULL;
-        }
-        return kevent;
-}
-/*
- * inotify_dev_get_event - return the next event in the given dev's queue
- *
- * Caller must hold dev->ev_mutex.
- */
-static inline struct inotify_kernel_event *
-inotify_dev_get_event(struct inotify_device *dev)
-{
-        return list_entry(dev->events.next, struct inotify_kernel_event, list);
-}
-/*
- * inotify_dev_get_last_event - return the last event in the given dev's queue
- *
- * Caller must hold dev->ev_mutex.
- */
-static inline struct inotify_kernel_event *
-inotify_dev_get_last_event(struct inotify_device *dev)
 {
-        if (list_empty(&dev->events))
+        __u32 mask;
-                return NULL;
-        return list_entry(dev->events.prev, struct inotify_kernel_event, list);
-}
-/*
+        /* everything should accept its own ignored event and care about children */
- * inotify_dev_queue_event - event handler registered with core inotify, adds
+        mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD);
- * a new event to the given device
- *
- * Can sleep (calls kernel_event()).
- */
-static void inotify_dev_queue_event(struct inotify_watch *w, u32 wd, u32 mask,
-                                    u32 cookie, const char *name,
-                                    struct inode *ignored)
-{
-        struct inotify_user_watch *watch;
-        struct inotify_device *dev;
-        struct inotify_kernel_event *kevent, *last;
-        watch = container_of(w, struct inotify_user_watch, wdata);
+        /* mask off the flags used to open the fd */
-        dev = watch->dev;
+        mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT));
-        mutex_lock(&dev->ev_mutex);
+        return mask;
-        /* we can safely put the watch as we don't reference it while
-         * generating the event
-         */
-        if (mask & IN_IGNORED || w->mask & IN_ONESHOT)
-                put_inotify_watch(w); /* final put */
-        /* coalescing: drop this event if it is a dupe of the previous */
-        last = inotify_dev_get_last_event(dev);
-        if (last && last->event.mask == mask && last->event.wd == wd &&
-                        last->event.cookie == cookie) {
-                const char *lastname = last->name;
-                if (!name && !lastname)
-                        goto out;
-                if (name && lastname && !strcmp(lastname, name))
-                        goto out;
-        }
-        /* the queue overflowed and we already sent the Q_OVERFLOW event */
-        if (unlikely(dev->event_count > dev->max_events))
-                goto out;
-        /* if the queue overflows, we need to notify user space */
-        if (unlikely(dev->event_count == dev->max_events))
-                kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL);
-        else
-                kevent = kernel_event(wd, mask, cookie, name);
-        if (unlikely(!kevent))
-                goto out;
-        /* queue the event and wake up anyone waiting */
-        dev->event_count++;
-        dev->queue_size += sizeof(struct inotify_event) + kevent->event.len;
-        list_add_tail(&kevent->list, &dev->events);
-        wake_up_interruptible(&dev->wq);
-        kill_fasync(&dev->fa, SIGIO, POLL_IN);
-out:
-        mutex_unlock(&dev->ev_mutex);
-}
-/*
- * remove_kevent - cleans up the given kevent
- *
- * Caller must hold dev->ev_mutex.
- */
-static void remove_kevent(struct inotify_device *dev,
-                          struct inotify_kernel_event *kevent)
-{
-        list_del(&kevent->list);
-        dev->event_count--;
-        dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len;
-}
-/*
- * free_kevent - frees the given kevent.
- */
-static void free_kevent(struct inotify_kernel_event *kevent)
-{
-        kfree(kevent->name);
-        kmem_cache_free(event_cachep, kevent);
-}
-/*
- * inotify_dev_event_dequeue - destroy an event on the given device
- *
- * Caller must hold dev->ev_mutex.
- */
-static void inotify_dev_event_dequeue(struct inotify_device *dev)
-{
-        if (!list_empty(&dev->events)) {
-                struct inotify_kernel_event *kevent;
-                kevent = inotify_dev_get_event(dev);
-                remove_kevent(dev, kevent);
-                free_kevent(kevent);
-        }
-}
-/*
- * find_inode - resolve a user-given path to a specific inode
- */
-static int find_inode(const char __user *dirname, struct path *path,
-                      unsigned flags)
-{
-        int error;
-        error = user_path_at(AT_FDCWD, dirname, flags, path);
-        if (error)
-                return error;
-        /* you can only watch an inode if you have read permissions on it */
-        error = inode_permission(path->dentry->d_inode, MAY_READ);
-        if (error)
-                path_put(path);
-        return error;
 }
-/*
+static inline u32 inotify_mask_to_arg(__u32 mask)
- * create_watch - creates a watch on the given device.
- *
- * Callers must hold dev->up_mutex.
- */
-static int create_watch(struct inotify_device *dev, struct inode *inode,
-                        u32 mask)
 {
-        struct inotify_user_watch *watch;
+        return mask & (IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT | IN_IGNORED |
-        int ret;
+                       IN_Q_OVERFLOW);
-        if (atomic_read(&dev->user->inotify_watches) >=
-                        inotify_max_user_watches)
-                return -ENOSPC;
-        watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL);
-        if (unlikely(!watch))
-                return -ENOMEM;
-        /* save a reference to device and bump the count to make it official */
-        get_inotify_dev(dev);
-        watch->dev = dev;
-        atomic_inc(&dev->user->inotify_watches);
-        inotify_init_watch(&watch->wdata);
-        ret = inotify_add_watch(dev->ih, &watch->wdata, inode, mask);
-        if (ret < 0)
-                free_inotify_user_watch(&watch->wdata);
-        return ret;
 }
-/* Device Interface */
+/* inotify userspace file descriptor functions */
 static unsigned int inotify_poll(struct file *file, poll_table *wait)
 {
-        struct inotify_device *dev = file->private_data;
+        struct fsnotify_group *group = file->private_data;
        int ret = 0;
-        poll_wait(file, &dev->wq, wait);
+        poll_wait(file, &group->notification_waitq, wait);
-        mutex_lock(&dev->ev_mutex);
+        mutex_lock(&group->notification_mutex);
-        if (!list_empty(&dev->events))
+        if (!fsnotify_notify_queue_is_empty(group))
                ret = POLLIN | POLLRDNORM;
-        mutex_unlock(&dev->ev_mutex);
+        mutex_unlock(&group->notification_mutex);
        return ret;
 }
@@ -432,26 +145,29 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait)
 * enough to fit in "count". Return an error pointer if
 * not large enough.
 *
- * Called with the device ev_mutex held.
+ * Called with the group->notification_mutex held.
 */
-static struct inotify_kernel_event *get_one_event(struct inotify_device *dev,
+static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
-                                                  size_t count)
+                                            size_t count)
 {
        size_t event_size = sizeof(struct inotify_event);
-        struct inotify_kernel_event *kevent;
+        struct fsnotify_event *event;
-        if (list_empty(&dev->events))
+        if (fsnotify_notify_queue_is_empty(group))
                return NULL;
-        kevent = inotify_dev_get_event(dev);
+        event = fsnotify_peek_notify_event(group);
-        if (kevent->name)
-                event_size += kevent->event.len;
+        event_size += roundup(event->name_len, event_size);
        if (event_size > count)
                return ERR_PTR(-EINVAL);
-        remove_kevent(dev, kevent);
+        /* held the notification_mutex the whole time, so this is the
-        return kevent;
+         * same event we peeked above */
+        fsnotify_remove_notify_event(group);
+        return event;
 }
 /*
@@ -460,51 +176,90 @@ static struct inotify_kernel_event *get_one_event(struct inotify_device *dev,
 * We already checked that the event size is smaller than the
 * buffer we had in "get_one_event()" above.
 */
-static ssize_t copy_event_to_user(struct inotify_kernel_event *kevent,
+static ssize_t copy_event_to_user(struct fsnotify_group *group,
+                                  struct fsnotify_event *event,
                                  char __user *buf)
 {
+        struct inotify_event inotify_event;
+        struct fsnotify_event_private_data *fsn_priv;
+        struct inotify_event_private_data *priv;
        size_t event_size = sizeof(struct inotify_event);
+        size_t name_len;
+        /* we get the inotify watch descriptor from the event private data */
+        spin_lock(&event->lock);
+        fsn_priv = fsnotify_remove_priv_from_event(group, event);
+        spin_unlock(&event->lock);
+        if (!fsn_priv)
+                inotify_event.wd = -1;
+        else {
+                priv = container_of(fsn_priv, struct inotify_event_private_data,
+                                    fsnotify_event_priv_data);
+                inotify_event.wd = priv->wd;
+                inotify_free_event_priv(fsn_priv);
+        }
+        /* round up event->name_len so it is a multiple of event_size */
+        name_len = roundup(event->name_len, event_size);
+        inotify_event.len = name_len;
+        inotify_event.mask = inotify_mask_to_arg(event->mask);
+        inotify_event.cookie = event->sync_cookie;
-        if (copy_to_user(buf, &kevent->event, event_size))
+        /* send the main event */
+        if (copy_to_user(buf, &inotify_event, event_size))
                return -EFAULT;
-        if (kevent->name) {
+        buf += event_size;
-                buf += event_size;
-                if (copy_to_user(buf, kevent->name, kevent->event.len))
+        /*
+         * fsnotify only stores the pathname, so here we have to send the pathname
+         * and then pad that pathname out to a multiple of sizeof(inotify_event)
+         * with zeros.  I get my zeros from the nul_inotify_event.
+         */
+        if (name_len) {
+                unsigned int len_to_zero = name_len - event->name_len;
+                /* copy the path name */
+                if (copy_to_user(buf, event->file_name, event->name_len))
                        return -EFAULT;
+                buf += event->name_len;
-                event_size += kevent->event.len;
+                /* fill userspace with 0's from nul_inotify_event */
+                if (copy_to_user(buf, &nul_inotify_event, len_to_zero))
+                        return -EFAULT;
+                buf += len_to_zero;
+                event_size += name_len;
        }
        return event_size;
 }
 static ssize_t inotify_read(struct file *file, char __user *buf,
                            size_t count, loff_t *pos)
 {
-        struct inotify_device *dev;
+        struct fsnotify_group *group;
+        struct fsnotify_event *kevent;
        char __user *start;
        int ret;
        DEFINE_WAIT(wait);
        start = buf;
-        dev = file->private_data;
+        group = file->private_data;
        while (1) {
-                struct inotify_kernel_event *kevent;
+                prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE);
-                prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
+                mutex_lock(&group->notification_mutex);
+                kevent = get_one_event(group, count);
-                mutex_lock(&dev->ev_mutex);
+                mutex_unlock(&group->notification_mutex);
-                kevent = get_one_event(dev, count);
-                mutex_unlock(&dev->ev_mutex);
                if (kevent) {
                        ret = PTR_ERR(kevent);
                        if (IS_ERR(kevent))
                                break;
-                        ret = copy_event_to_user(kevent, buf);
+                        ret = copy_event_to_user(group, kevent, buf);
-                        free_kevent(kevent);
+                        fsnotify_put_event(kevent);
                        if (ret < 0)
                                break;
                        buf += ret;
@@ -525,7 +280,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
                schedule();
        }
-        finish_wait(&dev->wq, &wait);
+        finish_wait(&group->notification_waitq, &wait);
        if (start != buf && ret != -EFAULT)
                ret = buf - start;
        return ret;
@@ -533,25 +288,19 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
 static int inotify_fasync(int fd, struct file *file, int on)
 {
-        struct inotify_device *dev = file->private_data;
+        struct fsnotify_group *group = file->private_data;
-        return fasync_helper(fd, file, on, &dev->fa) >= 0 ? 0 : -EIO;
+        return fasync_helper(fd, file, on, &group->inotify_data.fa) >= 0 ? 0 : -EIO;
 }
 static int inotify_release(struct inode *ignored, struct file *file)
 {
-        struct inotify_device *dev = file->private_data;
+        struct fsnotify_group *group = file->private_data;
-        inotify_destroy(dev->ih);
-        /* destroy all of the events on this device */
+        fsnotify_clear_marks_by_group(group);
-        mutex_lock(&dev->ev_mutex);
-        while (!list_empty(&dev->events))
-                inotify_dev_event_dequeue(dev);
-        mutex_unlock(&dev->ev_mutex);
-        /* free this device: the put matching the get in inotify_init() */
+        /* free this group, matching get was inotify_init->fsnotify_obtain_group */
-        put_inotify_dev(dev);
+        fsnotify_put_group(group);
        return 0;
 }
@@ -559,16 +308,27 @@ static int inotify_release(struct inode *ignored, struct file *file)
 static long inotify_ioctl(struct file *file, unsigned int cmd,
                          unsigned long arg)
 {
-        struct inotify_device *dev;
+        struct fsnotify_group *group;
+        struct fsnotify_event_holder *holder;
+        struct fsnotify_event *event;
        void __user *p;
        int ret = -ENOTTY;
+        size_t send_len = 0;
-        dev = file->private_data;
+        group = file->private_data;
        p = (void __user *) arg;
        switch (cmd) {
        case FIONREAD:
-                ret = put_user(dev->queue_size, (int __user *) p);
+                mutex_lock(&group->notification_mutex);
+                list_for_each_entry(holder, &group->notification_list, event_list) {
+                        event = holder->event;
+                        send_len += sizeof(struct inotify_event);
+                        send_len += roundup(event->name_len,
+                                             sizeof(struct inotify_event));
+                }
+                mutex_unlock(&group->notification_mutex);
+                ret = put_user(send_len, (int __user *) p);
                break;
        }
@@ -576,23 +336,233 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
 }
 static const struct file_operations inotify_fops = {
-        .poll           = inotify_poll,
+        .poll           = inotify_poll,
-        .read           = inotify_read,
+        .read           = inotify_read,
-        .fasync         = inotify_fasync,
+        .fasync         = inotify_fasync,
-        .release        = inotify_release,
+        .release        = inotify_release,
-        .unlocked_ioctl = inotify_ioctl,
+        .unlocked_ioctl = inotify_ioctl,
        .compat_ioctl   = inotify_ioctl,
 };
-static const struct inotify_operations inotify_user_ops = {
-        .handle_event   = inotify_dev_queue_event,
-        .destroy_watch  = free_inotify_user_watch,
-};
+/*
+ * inotify_find_inode - resolve a user-given path and check that the caller
+ * is allowed to read the inode it names.
+ *
+ * Returns 0 with @path filled in (the caller must path_put() it), or the
+ * error from the lookup or the permission check with no reference left held.
+ */
+static int inotify_find_inode(const char __user *dirname, struct path *path, unsigned flags)
+{
+        int ret = user_path_at(AT_FDCWD, dirname, flags, path);
+        if (ret)
+                return ret;
+        /* you can only watch an inode if you have read permissions on it */
+        ret = inode_permission(path->dentry->d_inode, MAY_READ);
+        if (!ret)
+                return 0;
+        /* not allowed to watch it: give back the reference the lookup took */
+        path_put(path);
+        return ret;
+}
+/*
+ * When, for whatever reason, inotify is done with a mark (or what used to be a
+ * watch) we need to remove that watch from the idr and we need to send IN_IGNORED
+ * for the given wd.
+ *
+ * @entry: the mark being torn down
+ * @group: the inotify group which owns the idr and receives the IN_IGNORED
+ *
+ * There is a bit of recursion here.  The loop looks like:
+ *      inotify_destroy_mark_entry -> fsnotify_destroy_mark_by_entry ->
+ *      inotify_freeing_mark -> inotify_destroy_mark_entry -> restart
+ * But the loop is broken in 2 places.  fsnotify_destroy_mark_by_entry sets
+ * entry->group = NULL before the call to inotify_freeing_mark, so the if (egroup)
+ * test below will not call back to fsnotify again.  But even if that test wasn't
+ * there this would still be safe since fsnotify_destroy_mark_by_entry() is
+ * safe from recursion.
+ */
+void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group)
+{
+        struct inotify_inode_mark_entry *ientry;
+        struct inotify_event_private_data *event_priv;
+        struct fsnotify_event_private_data *fsn_event_priv;
+        struct fsnotify_group *egroup;
+        struct idr *idr;
+        /* sample entry->group under the entry lock; it is only used as a flag */
+        spin_lock(&entry->lock);
+        egroup = entry->group;
+        /* if egroup we aren't really done and something might still send events
+         * for this inode, on the callback we'll send the IN_IGNORED */
+        if (egroup) {
+                spin_unlock(&entry->lock);
+                fsnotify_destroy_mark_by_entry(entry);
+                return;
+        }
+        spin_unlock(&entry->lock);
+        ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
+        /* allocation failure only loses the IN_IGNORED; the wd is still released below */
+        event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL);
+        if (unlikely(!event_priv))
+                goto skip_send_ignore;
+        fsn_event_priv = &event_priv->fsnotify_event_priv_data;
+        fsn_event_priv->group = group;
+        event_priv->wd = ientry->wd;
+        /* the return value is not checked; the list_empty() test below detects failure */
+        fsnotify_add_notify_event(group, inotify_ignored_event, fsn_event_priv);
+        /* did the private data get added? */
+        if (list_empty(&fsn_event_priv->event_list))
+                inotify_free_event_priv(fsn_event_priv);
+skip_send_ignore:
+        /* remove this entry from the idr */
+        spin_lock(&group->inotify_data.idr_lock);
+        idr = &group->inotify_data.idr;
+        idr_remove(idr, ientry->wd);
+        spin_unlock(&group->inotify_data.idr_lock);
+        /* removed from idr, drop that reference */
+        fsnotify_put_mark(entry);
+}
+/*
+ * ding dong the mark is dead
+ *
+ * Free callback handed to fsnotify_init_mark(): releases the inotify mark
+ * which embeds @entry back to inotify_inode_mark_cachep.
+ */
+static void inotify_free_mark(struct fsnotify_mark_entry *entry)
+{
+        /* use container_of like the rest of this file rather than a raw cast,
+         * so this keeps working if fsn_entry is ever not the first member */
+        struct inotify_inode_mark_entry *ientry;
+        ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
+        kmem_cache_free(inotify_inode_mark_cachep, ientry);
+}
+/*
+ * inotify_update_watch - add a watch on @inode for @group, or update the mask
+ * of the watch the group already has on that inode.
+ *
+ * @arg is the mask as given by userspace.  With IN_MASK_ADD the new bits are
+ * OR'd into an existing watch, otherwise they replace its mask.
+ *
+ * Returns the watch descriptor on success.  Returns -EINVAL if @arg contains
+ * no valid event bits, -ENOMEM on allocation failure, -ENOSPC if the user
+ * is at the inotify_max_user_watches limit, or the error from
+ * fsnotify_add_mark() / idr_get_new_above().
+ */
+static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg)
+{
+        struct fsnotify_mark_entry *entry = NULL;
+        struct inotify_inode_mark_entry *ientry;
+        int ret = 0;
+        int add = (arg & IN_MASK_ADD);
+        __u32 mask;
+        __u32 old_mask, new_mask;
+        /* don't allow invalid bits: we don't want flags set */
+        mask = inotify_arg_to_mask(arg);
+        if (unlikely(!mask))
+                return -EINVAL;
+        /* allocate up front, it is thrown away if a mark already exists */
+        ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
+        if (unlikely(!ientry))
+                return -ENOMEM;
+        /* we set the mask at the end after attaching it */
+        fsnotify_init_mark(&ientry->fsn_entry, inotify_free_mark);
+        ientry->wd = 0;
+find_entry:
+        spin_lock(&inode->i_lock);
+        entry = fsnotify_find_mark_entry(group, inode);
+        spin_unlock(&inode->i_lock);
+        if (entry) {
+                /* already watched: drop our spare and work on the existing mark */
+                kmem_cache_free(inotify_inode_mark_cachep, ientry);
+                ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
+        } else {
+                if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) {
+                        ret = -ENOSPC;
+                        goto out_err;
+                }
+                ret = fsnotify_add_mark(&ientry->fsn_entry, group, inode);
+                if (ret == -EEXIST)
+                        goto find_entry;
+                else if (ret)
+                        goto out_err;
+                entry = &ientry->fsn_entry;
+retry:
+                ret = -ENOMEM;
+                if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
+                        goto out_err;
+                spin_lock(&group->inotify_data.idr_lock);
+                /* if entry is added to the idr we keep the reference obtained
+                 * through fsnotify_mark_add.  remember to drop this reference
+                 * when entry is removed from idr */
+                ret = idr_get_new_above(&group->inotify_data.idr, entry,
+                                        ++group->inotify_data.last_wd,
+                                        &ientry->wd);
+                spin_unlock(&group->inotify_data.idr_lock);
+                if (ret) {
+                        if (ret == -EAGAIN)
+                                goto retry;
+                        goto out_err;
+                }
+                atomic_inc(&group->inotify_data.user->inotify_watches);
+        }
+        spin_lock(&entry->lock);
+        old_mask = entry->mask;
+        if (add) {
+                entry->mask |= mask;
+                new_mask = entry->mask;
+        } else {
+                entry->mask = mask;
+                new_mask = entry->mask;
+        }
+        spin_unlock(&entry->lock);
+        if (old_mask != new_mask) {
+                /* more bits in old than in new? */
+                int dropped = (old_mask & ~new_mask);
+                /* more bits in this entry than the inode's mask? */
+                int do_inode = (new_mask & ~inode->i_fsnotify_mask);
+                /* more bits in this entry than the group? */
+                int do_group = (new_mask & ~group->mask);
+                /* update the inode with this new entry */
+                if (dropped || do_inode)
+                        fsnotify_recalc_inode_mask(inode);
+                /* update the group mask with the new mask */
+                if (dropped || do_group)
+                        fsnotify_recalc_group_mask(group);
+        }
+        return ientry->wd;
+out_err:
+        /* see this isn't supposed to happen, just kill the watch */
+        if (entry) {
+                fsnotify_destroy_mark_by_entry(entry);
+                fsnotify_put_mark(entry);
+        } else {
+                /* We get here with entry == NULL only when our own mark was
+                 * never attached (watch limit hit, or fsnotify_add_mark()
+                 * failed), so nothing else will ever free it.  Release it the
+                 * same way the "mark already exists" path above does.
+                 * NOTE(review): assumes fsnotify_add_mark() leaves the mark
+                 * untouched on every failure, as it does for -EEXIST. */
+                kmem_cache_free(inotify_inode_mark_cachep, ientry);
+        }
+        return ret;
+}
+/*
+ * inotify_new_group - obtain an fsnotify group for a new inotify instance and
+ * set up its inotify specific state.
+ *
+ * @user: stored in the group's inotify_data
+ * @max_events: stored as the group's max_events queue limit
+ *
+ * Returns the group, or the ERR_PTR() from fsnotify_obtain_group().
+ */
+static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsigned int max_events)
+{
+        struct fsnotify_group *grp;
+        /* the group number counts down from INOTIFY_GROUP_NUM, one step per call */
+        unsigned int num = INOTIFY_GROUP_NUM - atomic_inc_return(&inotify_grp_num);
+        /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */
+        grp = fsnotify_obtain_group(num, 0, &inotify_fsnotify_ops);
+        if (IS_ERR(grp))
+                return grp;
+        grp->max_events = max_events;
+        /* inotify private data: owner, async notification list, wd allocator */
+        grp->inotify_data.user = user;
+        grp->inotify_data.fa = NULL;
+        grp->inotify_data.last_wd = 0;
+        idr_init(&grp->inotify_data.idr);
+        spin_lock_init(&grp->inotify_data.idr_lock);
+        return grp;
+}
+/* inotify syscalls */
 SYSCALL_DEFINE1(inotify_init1, int, flags)
 {
-        struct inotify_device *dev;
+        struct fsnotify_group *group;
-        struct inotify_handle *ih;
        struct user_struct *user;
        struct file *filp;
        int fd, ret;
@@ -621,45 +591,27 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
                goto out_free_uid;
        }
-        dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL);
+        /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */
-        if (unlikely(!dev)) {
+        group = inotify_new_group(user, inotify_max_queued_events);
-                ret = -ENOMEM;
+        if (IS_ERR(group)) {
+                ret = PTR_ERR(group);
                goto out_free_uid;
        }
-        ih = inotify_init(&inotify_user_ops);
-        if (IS_ERR(ih)) {
-                ret = PTR_ERR(ih);
-                goto out_free_dev;
-        }
-        dev->ih = ih;
-        dev->fa = NULL;
        filp->f_op = &inotify_fops;
        filp->f_path.mnt = mntget(inotify_mnt);
        filp->f_path.dentry = dget(inotify_mnt->mnt_root);
        filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
        filp->f_mode = FMODE_READ;
        filp->f_flags = O_RDONLY | (flags & O_NONBLOCK);
-        filp->private_data = dev;
+        filp->private_data = group;
-        INIT_LIST_HEAD(&dev->events);
-        init_waitqueue_head(&dev->wq);
-        mutex_init(&dev->ev_mutex);
-        mutex_init(&dev->up_mutex);
-        dev->event_count = 0;
-        dev->queue_size = 0;
-        dev->max_events = inotify_max_queued_events;
-        dev->user = user;
-        atomic_set(&dev->count, 0);
-        get_inotify_dev(dev);
        atomic_inc(&user->inotify_devs);
        fd_install(fd, filp);
        return fd;
-out_free_dev:
-        kfree(dev);
 out_free_uid:
        free_uid(user);
        put_filp(filp);
@@ -676,8 +628,8 @@ SYSCALL_DEFINE0(inotify_init)
 SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
                u32, mask)
 {
+        struct fsnotify_group *group;
        struct inode *inode;
-        struct inotify_device *dev;
        struct path path;
        struct file *filp;
        int ret, fput_needed;
@@ -698,20 +650,20 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
        if (mask & IN_ONLYDIR)
                flags |= LOOKUP_DIRECTORY;
-        ret = find_inode(pathname, &path, flags);
+        ret = inotify_find_inode(pathname, &path, flags);
-        if (unlikely(ret))
+        if (ret)
                goto fput_and_out;
-        /* inode held in place by reference to path; dev by fget on fd */
+        /* inode held in place by reference to path; group by fget on fd */
        inode = path.dentry->d_inode;
-        dev = filp->private_data;
+        group = filp->private_data;
-        mutex_lock(&dev->up_mutex);
+        /* create/update an inode mark */
-        ret = inotify_find_update_watch(dev->ih, inode, mask);
+        ret = inotify_update_watch(group, inode, mask);
-        if (ret == -ENOENT)
+        if (unlikely(ret))
-                ret = create_watch(dev, inode, mask);
+                goto path_put_and_out;
-        mutex_unlock(&dev->up_mutex);
+path_put_and_out:
        path_put(&path);
 fput_and_out:
        fput_light(filp, fput_needed);
@@ -720,9 +672,10 @@ fput_and_out:
 SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
 {
+        struct fsnotify_group *group;
+        struct fsnotify_mark_entry *entry;
        struct file *filp;
-        struct inotify_device *dev;
+        int ret = 0, fput_needed;
-        int ret, fput_needed;
        filp = fget_light(fd, &fput_needed);
        if (unlikely(!filp))
@@ -734,10 +687,20 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
                goto out;
        }
-        dev = filp->private_data;
+        group = filp->private_data;
-        /* we free our watch data when we get IN_IGNORED */
+        spin_lock(&group->inotify_data.idr_lock);
-        ret = inotify_rm_wd(dev->ih, wd);
+        entry = idr_find(&group->inotify_data.idr, wd);
+        if (unlikely(!entry)) {
+                spin_unlock(&group->inotify_data.idr_lock);
+                ret = -EINVAL;
+                goto out;
+        }
+        fsnotify_get_mark(entry);
+        spin_unlock(&group->inotify_data.idr_lock);
+        inotify_destroy_mark_entry(entry, group);
+        fsnotify_put_mark(entry);
 out:
        fput_light(filp, fput_needed);
@@ -753,9 +716,9 @@ inotify_get_sb(struct file_system_type *fs_type, int flags,
 }
 static struct file_system_type inotify_fs_type = {
-    .name           = "inotifyfs",
+    .name       = "inotifyfs",
-    .get_sb         = inotify_get_sb,
+    .get_sb     = inotify_get_sb,
-    .kill_sb        = kill_anon_super,
+    .kill_sb    = kill_anon_super,
 };
 /*
@@ -775,18 +738,16 @@ static int __init inotify_user_setup(void)
        if (IS_ERR(inotify_mnt))
                panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
+        inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC);
+        event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
+        inotify_ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0);
+        if (!inotify_ignored_event)
+                panic("unable to allocate the inotify ignored event\n");
        inotify_max_queued_events = 16384;
        inotify_max_user_instances = 128;
        inotify_max_user_watches = 8192;
-        watch_cachep = kmem_cache_create("inotify_watch_cache",
-                                         sizeof(struct inotify_user_watch),
-                                         0, SLAB_PANIC, NULL);
-        event_cachep = kmem_cache_create("inotify_event_cache",
-                                         sizeof(struct inotify_kernel_event),
-                                         0, SLAB_PANIC, NULL);
        return 0;
 }
 module_init(inotify_user_setup);
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
new file mode 100644
index 000000000000..959b73e756fd
--- /dev/null
+++ b/fs/notify/notification.c
@@ -0,0 +1,411 @@
+/*
+ *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+/*
+ * Basic idea behind the notification queue: An fsnotify group (like inotify)
+ * sends the userspace notification about events asynchronously some time after
+ * the event happened.  When inotify gets an event it will need to add that
+ * event to the group notify queue.  Since a single event might need to be on
+ * multiple group's notification queues we can't add the event directly to each
+ * queue and instead add a small "event_holder" to each queue.  This event_holder
+ * has a pointer back to the original event.  Since the majority of events are
+ * going to end up on one, and only one, notification queue we embed one
+ * event_holder into each event.  This means we have a single allocation instead
+ * of always needing two.  If the embedded event_holder is already in use by
+ * another group a new event_holder (from fsnotify_event_holder_cachep) will be
+ * allocated and used.
+ */
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/mutex.h>
+#include <linux/namei.h>
+#include <linux/path.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <asm/atomic.h>
+#include <linux/fsnotify_backend.h>
+#include "fsnotify.h"
+static struct kmem_cache *fsnotify_event_cachep;
+static struct kmem_cache *fsnotify_event_holder_cachep;
+/*
+ * This is a magic event we send when the q is too full.  Since it doesn't
+ * hold real event information we just keep one system wide and use it any time
+ * it is needed.  Its refcnt is set to 1 at kernel init time and will never
+ * get set to 0 so it will never get 'freed'
+ */
+static struct fsnotify_event q_overflow_event;
+static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0);
+/**
+ * fsnotify_get_cookie - hand out the next value of the global sync cookie.
+ *
+ * The counter is bumped atomically on every call.  Called from
+ * fsnotify_move, which is inlined into filesystem modules, hence the export.
+ */
+u32 fsnotify_get_cookie(void)
+{
+        u32 cookie = atomic_inc_return(&fsnotify_sync_cookie);
+        return cookie;
+}
+EXPORT_SYMBOL_GPL(fsnotify_get_cookie);
+/*
+ * Tell whether @group has no queued notifications.  The caller must hold
+ * group->notification_mutex; this is enforced with BUG_ON().
+ */
+bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
+{
+        BUG_ON(!mutex_is_locked(&group->notification_mutex));
+        return list_empty(&group->notification_list);
+}
+/* take an additional reference on @event; drop it with fsnotify_put_event() */
+void fsnotify_get_event(struct fsnotify_event *event)
+{
+        atomic_inc(&event->refcnt);
+}
+/*
+ * Drop one reference on @event (NULL is accepted and ignored).  When the last
+ * reference goes away the path reference (if the event carries a path), the
+ * copied file name and the event itself are released.
+ */
+void fsnotify_put_event(struct fsnotify_event *event)
+{
+        /* nothing to do for NULL or while other references remain */
+        if (!event || !atomic_dec_and_test(&event->refcnt))
+                return;
+        if (event->data_type == FSNOTIFY_EVENT_PATH)
+                path_put(&event->path);
+        /* every group must have detached its private data by now */
+        BUG_ON(!list_empty(&event->private_data_list));
+        kfree(event->file_name);
+        kmem_cache_free(fsnotify_event_cachep, event);
+}
+/*
+ * Allocate a standalone event holder (GFP_KERNEL) from
+ * fsnotify_event_holder_cachep.  The result may be NULL; callers check for it.
+ */
+struct fsnotify_event_holder *fsnotify_alloc_event_holder(void)
+{
+        return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL);
+}
+/*
+ * Return a holder to fsnotify_event_holder_cachep.  Callers in this file never
+ * pass the holder embedded in an event (&event->holder), only allocated ones.
+ */
+void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder)
+{
+        kmem_cache_free(fsnotify_event_holder_cachep, holder);
+}
+/*
+ * Detach and return the private data which @group attached to @event when it
+ * queued the event (fsnotify_add_notify_event), or NULL if the group has none
+ * on this event.  The caller must hold event->lock.
+ */
+struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event)
+{
+        struct fsnotify_event_private_data *cur;
+        assert_spin_locked(&event->lock);
+        list_for_each_entry(cur, &event->private_data_list, event_list) {
+                if (cur->group != group)
+                        continue;
+                /* found this group's data: unlink it and hand it back */
+                list_del(&cur->event_list);
+                return cur;
+        }
+        return NULL;
+}
+/*
+ * Check if 2 events contain the same information.  We do not compare private data
+ * but at this moment that isn't a problem for any known fsnotify listeners.
+ *
+ * Two events match when mask, to_tell and data_type are equal and the object
+ * they refer to (inode, or mnt + dentry of the path) is the same.  Events
+ * without an object match on mask/to_tell/data_type alone.
+ */
+static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new)
+{
+        if ((old->mask == new->mask) &&
+            (old->to_tell == new->to_tell) &&
+            (old->data_type == new->data_type)) {
+                switch (old->data_type) {
+                case (FSNOTIFY_EVENT_INODE):
+                        if (old->inode == new->inode)
+                                return true;
+                        break;
+                case (FSNOTIFY_EVENT_PATH):
+                        if ((old->path.mnt == new->path.mnt) &&
+                            (old->path.dentry == new->path.dentry))
+                                return true;
+                        /* different paths are different events: do not fall
+                         * through into the FSNOTIFY_EVENT_NONE case */
+                        break;
+                case (FSNOTIFY_EVENT_NONE):
+                        return true;
+                }
+        }
+        return false;
+}
+/*
+ * Add an event to the group notification queue.  The group can later pull this
+ * event off the queue to deal with.  If the event is successfully added to the
+ * group's notification queue, a reference is taken on event.
+ *
+ * If the queue already holds max_events entries the shared overflow event is
+ * queued in place of @event and @priv is not attached.
+ *
+ * Returns 0 when something was queued, -ENOMEM if a holder could not be
+ * allocated and -EEXIST if the event matches the one at the tail of the queue
+ * (nothing is queued in that case).  @priv->event_list is always initialised,
+ * so callers can use list_empty() on it to tell whether @priv was attached.
+ */
+int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event,
+                              struct fsnotify_event_private_data *priv)
+{
+        struct fsnotify_event_holder *holder = NULL;
+        struct list_head *list = &group->notification_list;
+        struct fsnotify_event_holder *last_holder;
+        struct fsnotify_event *last_event;
+        /* easy to tell if priv was attached to the event */
+        INIT_LIST_HEAD(&priv->event_list);
+        /*
+         * There is one fsnotify_event_holder embedded inside each fsnotify_event.
+         * Check if we expect to be able to use that holder.  If not alloc a new
+         * holder.
+         * For the overflow event it's possible that something will use the in
+         * event holder before we get the lock so we may need to jump back and
+         * alloc a new holder, this can't happen for most events...
+         */
+        if (!list_empty(&event->holder.event_list)) {
+alloc_holder:
+                holder = fsnotify_alloc_event_holder();
+                if (!holder)
+                        return -ENOMEM;
+        }
+        mutex_lock(&group->notification_mutex);
+        /* queue full: report the overflow instead of the real event */
+        if (group->q_len >= group->max_events) {
+                event = &q_overflow_event;
+                /* sorry, no private data on the overflow event */
+                priv = NULL;
+        }
+        spin_lock(&event->lock);
+        if (list_empty(&event->holder.event_list)) {
+                /* the embedded holder is free after all, a preallocated one is not needed */
+                if (unlikely(holder))
+                        fsnotify_destroy_event_holder(holder);
+                holder = &event->holder;
+        } else if (unlikely(!holder)) {
+                /* between the time we checked above and got the lock the in
+                 * event holder was used, go back and get a new one */
+                spin_unlock(&event->lock);
+                mutex_unlock(&group->notification_mutex);
+                goto alloc_holder;
+        }
+        /* only the tail of the queue is checked for a duplicate */
+        if (!list_empty(list)) {
+                last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list);
+                last_event = last_holder->event;
+                if (event_compare(last_event, event)) {
+                        spin_unlock(&event->lock);
+                        mutex_unlock(&group->notification_mutex);
+                        /* the embedded holder belongs to the event, only free allocated ones */
+                        if (holder != &event->holder)
+                                fsnotify_destroy_event_holder(holder);
+                        return -EEXIST;
+                }
+        }
+        group->q_len++;
+        holder->event = event;
+        /* the queue holds its own reference, dropped by whoever dequeues the event */
+        fsnotify_get_event(event);
+        list_add_tail(&holder->event_list, list);
+        if (priv)
+                list_add_tail(&priv->event_list, &event->private_data_list);
+        spin_unlock(&event->lock);
+        mutex_unlock(&group->notification_mutex);
+        /* let anyone sleeping on the group's wait queue know there is data */
+        wake_up(&group->notification_waitq);
+        return 0;
+}
+/*
+ * Remove and return the first event from the notification list.  There is a
+ * reference held on this event since it was on the list.  It is the responsibility
+ * of the caller to drop this reference.
+ *
+ * The caller must hold group->notification_mutex.  No check for an empty
+ * queue is made here, so the caller has to make sure there is an event.
+ */
+struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group)
+{
+        struct fsnotify_event *event;
+        struct fsnotify_event_holder *holder;
+        BUG_ON(!mutex_is_locked(&group->notification_mutex));
+        holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list);
+        event = holder->event;
+        /* unlink under event->lock; list_del_init leaves the holder reading as unused */
+        spin_lock(&event->lock);
+        holder->event = NULL;
+        list_del_init(&holder->event_list);
+        spin_unlock(&event->lock);
+        /* holder == &event->holder means the event was queued through its
+         * embedded holder, which must not be freed; any other holder was
+         * allocated separately and is released here */
+        if (holder != &event->holder)
+                fsnotify_destroy_event_holder(holder);
+        group->q_len--;
+        return event;
+}
+/*
+ * Return the event at the head of the group's queue without dequeuing it;
+ * removal must be done with fsnotify_remove_notify_event().  The caller must
+ * hold group->notification_mutex and no extra reference is taken.
+ */
+struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group)
+{
+        struct fsnotify_event_holder *head;
+        BUG_ON(!mutex_is_locked(&group->notification_mutex));
+        head = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list);
+        return head->event;
+}
+/*
+ * Called when a group is being torn down to clean up any outstanding
+ * event notifications.
+ *
+ * Every queued event is dequeued, the group's private data (if any) is handed
+ * to the group's free_event_priv callback and the queue's reference on the
+ * event is dropped.  Takes group->notification_mutex for the whole walk.
+ */
+void fsnotify_flush_notify(struct fsnotify_group *group)
+{
+        struct fsnotify_event *event;
+        struct fsnotify_event_private_data *priv;
+        mutex_lock(&group->notification_mutex);
+        while (!fsnotify_notify_queue_is_empty(group)) {
+                event = fsnotify_remove_notify_event(group);
+                /* if they don't implement free_event_priv they better not have attached any */
+                if (group->ops->free_event_priv) {
+                        /* the private data list is protected by event->lock */
+                        spin_lock(&event->lock);
+                        priv = fsnotify_remove_priv_from_event(group, event);
+                        spin_unlock(&event->lock);
+                        if (priv)
+                                group->ops->free_event_priv(priv);
+                }
+                fsnotify_put_event(event); /* matches fsnotify_add_notify_event */
+        }
+        mutex_unlock(&group->notification_mutex);
+}
+/*
+ * Put an event into its pristine state: one reference, not queued anywhere,
+ * no private data, no object, no name and no cookie.  The mask is left for
+ * the caller to fill in.
+ */
+static void initialize_event(struct fsnotify_event *event)
+{
+        /* lifetime and locking */
+        atomic_set(&event->refcnt, 1);
+        spin_lock_init(&event->lock);
+        /* the embedded holder and the private data list both start out unused */
+        event->holder.event = NULL;
+        INIT_LIST_HEAD(&event->holder.event_list);
+        INIT_LIST_HEAD(&event->private_data_list);
+        /* nothing attached yet */
+        event->data_type = FSNOTIFY_EVENT_NONE;
+        event->to_tell = NULL;
+        event->inode = NULL;
+        event->path.mnt = NULL;
+        event->path.dentry = NULL;
+        event->file_name = NULL;
+        event->name_len = 0;
+        event->sync_cookie = 0;
+}
+/*
+ * fsnotify_create_event - Allocate a new event which will be sent to each
+ * group's handle_event function if the group was interested in this
+ * particular event.
+ *
+ * @to_tell the inode which is supposed to receive the event (sometimes a
+ *      parent of the inode to which the event happened).
+ * @mask what actually happened.
+ * @data pointer to the object which was actually affected
+ * @data_type flag indication if the data is a file, path, inode, nothing...
+ * @name the filename, if available; it is copied into the event
+ * @cookie stored in the event as its sync_cookie
+ *
+ * Returns the new event holding a single reference, or NULL if memory could
+ * not be allocated.  An unknown @data_type is a BUG().
+ */
+struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data,
+                                             int data_type, const char *name, u32 cookie)
+{
+        struct fsnotify_event *event;
+        struct path *src;
+        event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL);
+        if (!event)
+                return NULL;
+        initialize_event(event);
+        if (name) {
+                /* keep a private copy, the caller's string may go away */
+                event->file_name = kstrdup(name, GFP_KERNEL);
+                if (!event->file_name)
+                        goto out_free;
+                event->name_len = strlen(event->file_name);
+        }
+        event->to_tell = to_tell;
+        event->mask = mask;
+        event->sync_cookie = cookie;
+        switch (data_type) {
+        case FSNOTIFY_EVENT_FILE:
+        case FSNOTIFY_EVENT_PATH:
+                /* a file is recorded by its f_path, so both end up as PATH events */
+                if (data_type == FSNOTIFY_EVENT_FILE)
+                        src = &((struct file *)data)->f_path;
+                else
+                        src = data;
+                event->path.dentry = src->dentry;
+                event->path.mnt = src->mnt;
+                /* pin the path, released again in fsnotify_put_event() */
+                path_get(&event->path);
+                event->data_type = FSNOTIFY_EVENT_PATH;
+                break;
+        case FSNOTIFY_EVENT_INODE:
+                event->inode = data;
+                event->data_type = FSNOTIFY_EVENT_INODE;
+                break;
+        case FSNOTIFY_EVENT_NONE:
+                event->inode = NULL;
+                event->path.dentry = NULL;
+                event->path.mnt = NULL;
+                break;
+        default:
+                BUG();
+        }
+        return event;
+out_free:
+        kmem_cache_free(fsnotify_event_cachep, event);
+        return NULL;
+}
+/*
+ * Boot time setup of the notification queue code: prepare the static overflow
+ * event and create the slab caches for events and standalone event holders
+ * (SLAB_PANIC is passed for both caches).  Always returns 0.
+ */
+__init int fsnotify_notification_init(void)
+{
+        /* the overflow event is static, it only needs initialising and a mask */
+        initialize_event(&q_overflow_event);
+        q_overflow_event.mask = FS_Q_OVERFLOW;
+        fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC);
+        fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC);
+        return 0;
+}
+subsys_initcall(fsnotify_notification_init);
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index f76951dcd4a6..abaaa1cbf8de 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -25,7 +25,7 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/spinlock.h>
-#include <linux/blkdev.h>       /* For bdev_hardsect_size(). */
+#include <linux/blkdev.h>       /* For bdev_logical_block_size(). */
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
@@ -443,6 +443,8 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
        ntfs_volume *vol = NTFS_SB(sb);
        ntfs_debug("Entering with remount options string: %s", opt);
+        lock_kernel();
 #ifndef NTFS_RW
        /* For read-only compiled driver, enforce read-only flag. */
        *flags |= MS_RDONLY;
@@ -466,15 +468,18 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
                if (NVolErrors(vol)) {
                        ntfs_error(sb, "Volume has errors and is read-only%s",
                                        es);
+                        unlock_kernel();
                        return -EROFS;
                }
                if (vol->vol_flags & VOLUME_IS_DIRTY) {
                        ntfs_error(sb, "Volume is dirty and read-only%s", es);
+                        unlock_kernel();
                        return -EROFS;
                }
                if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
                        ntfs_error(sb, "Volume has been modified by chkdsk "
                                        "and is read-only%s", es);
+                        unlock_kernel();
                        return -EROFS;
                }
                if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
@@ -482,11 +487,13 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
                                        "(0x%x) and is read-only%s",
                                        (unsigned)le16_to_cpu(vol->vol_flags),
                                        es);
+                        unlock_kernel();
                        return -EROFS;
                }
                if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
                        ntfs_error(sb, "Failed to set dirty bit in volume "
                                        "information flags%s", es);
+                        unlock_kernel();
                        return -EROFS;
                }
 #if 0
@@ -506,18 +513,21 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
                        ntfs_error(sb, "Failed to empty journal $LogFile%s",
                                        es);
                        NVolSetErrors(vol);
+                        unlock_kernel();
                        return -EROFS;
                }
                if (!ntfs_mark_quotas_out_of_date(vol)) {
                        ntfs_error(sb, "Failed to mark quotas out of date%s",
                                        es);
                        NVolSetErrors(vol);
+                        unlock_kernel();
                        return -EROFS;
                }
                if (!ntfs_stamp_usnjrnl(vol)) {
                        ntfs_error(sb, "Failed to stamp transation log "
                                        "($UsnJrnl)%s", es);
                        NVolSetErrors(vol);
+                        unlock_kernel();
                        return -EROFS;
                }
        } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
@@ -533,8 +543,11 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
        // TODO: Deal with *flags.
-        if (!parse_options(vol, opt))
+        if (!parse_options(vol, opt)) {
+                unlock_kernel();
                return -EINVAL;
+        }
+        unlock_kernel();
        ntfs_debug("Done.");
        return 0;
 }
@@ -2246,6 +2259,9 @@ static void ntfs_put_super(struct super_block *sb)
        ntfs_volume *vol = NTFS_SB(sb);
        ntfs_debug("Entering.");
+        lock_kernel();
 #ifdef NTFS_RW
        /*
         * Commit all inodes while they are still open in case some of them
@@ -2373,39 +2389,12 @@ static void ntfs_put_super(struct super_block *sb)
                vol->mftmirr_ino = NULL;
        }
        /*
-         * If any dirty inodes are left, throw away all mft data page cache
+         * We should have no dirty inodes left, due to
-         * pages to allow a clean umount.  This should never happen any more
+         * mft.c::ntfs_mft_writepage() cleaning all the dirty pages as
-         * due to mft.c::ntfs_mft_writepage() cleaning all the dirty pages as
+         * the underlying mft records are written out and cleaned.
-         * the underlying mft records are written out and cleaned.  If it does,
-         * happen anyway, we want to know...
         */
        ntfs_commit_inode(vol->mft_ino);
        write_inode_now(vol->mft_ino, 1);
-        if (sb_has_dirty_inodes(sb)) {
-                const char *s1, *s2;
-                mutex_lock(&vol->mft_ino->i_mutex);
-                truncate_inode_pages(vol->mft_ino->i_mapping, 0);
-                mutex_unlock(&vol->mft_ino->i_mutex);
-                write_inode_now(vol->mft_ino, 1);
-                if (sb_has_dirty_inodes(sb)) {
-                        static const char *_s1 = "inodes";
-                        static const char *_s2 = "";
-                        s1 = _s1;
-                        s2 = _s2;
-                } else {
-                        static const char *_s1 = "mft pages";
-                        static const char *_s2 = "They have been thrown "
-                                        "away.  ";
-                        s1 = _s1;
-                        s2 = _s2;
-                }
-                ntfs_error(sb, "Dirty %s found at umount time.  %sYou should "
-                                "run chkdsk.  Please email "
-                                "linux-ntfs-dev@lists.sourceforge.net and say "
-                                "that you saw this message.  Thank you.", s1,
-                                s2);
-        }
 #endif /* NTFS_RW */
        iput(vol->mft_ino);
@@ -2444,7 +2433,8 @@ static void ntfs_put_super(struct super_block *sb)
        }
        sb->s_fs_info = NULL;
        kfree(vol);
-        return;
+        unlock_kernel();
 }
 /**
@@ -2785,13 +2775,13 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
                goto err_out_now;
        /* We support sector sizes up to the PAGE_CACHE_SIZE. */
-        if (bdev_hardsect_size(sb->s_bdev) > PAGE_CACHE_SIZE) {
+        if (bdev_logical_block_size(sb->s_bdev) > PAGE_CACHE_SIZE) {
                if (!silent)
                        ntfs_error(sb, "Device has unsupported sector size "
                                        "(%i).  The maximum supported sector "
                                        "size on this architecture is %lu "
                                        "bytes.",
-                                        bdev_hardsect_size(sb->s_bdev),
+                                        bdev_logical_block_size(sb->s_bdev),
                                        PAGE_CACHE_SIZE);
                goto err_out_now;
        }
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 4f85eceab376..09cc25d04611 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1371,7 +1371,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
        bdevname(reg->hr_bdev, reg->hr_dev_name);
-        sectsize = bdev_hardsect_size(reg->hr_bdev);
+        sectsize = bdev_logical_block_size(reg->hr_bdev);
        if (sectsize != reg->hr_block_bytes) {
                mlog(ML_ERROR,
                     "blocksize %u incorrect for device, expected %d",
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 79ff8d9d37e0..201b40a441fe 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -42,6 +42,7 @@
 #include <linux/mount.h>
 #include <linux/seq_file.h>
 #include <linux/quotaops.h>
+#include <linux/smp_lock.h>
 #define MLOG_MASK_PREFIX ML_SUPER
 #include <cluster/masklog.h>
@@ -126,7 +127,6 @@ static int ocfs2_get_sector(struct super_block *sb,
                            struct buffer_head **bh,
                            int block,
                            int sect_size);
-static void ocfs2_write_super(struct super_block *sb);
 static struct inode *ocfs2_alloc_inode(struct super_block *sb);
 static void ocfs2_destroy_inode(struct inode *inode);
 static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend);
@@ -141,7 +141,6 @@ static const struct super_operations ocfs2_sops = {
        .clear_inode    = ocfs2_clear_inode,
        .delete_inode   = ocfs2_delete_inode,
        .sync_fs        = ocfs2_sync_fs,
-        .write_super    = ocfs2_write_super,
        .put_super      = ocfs2_put_super,
        .remount_fs     = ocfs2_remount,
        .show_options   = ocfs2_show_options,
@@ -365,24 +364,12 @@ static struct file_operations ocfs2_osb_debug_fops = {
        .llseek =       generic_file_llseek,
 };
-/*
- * write_super and sync_fs ripped right out of ext3.
- */
-static void ocfs2_write_super(struct super_block *sb)
-{
-        if (mutex_trylock(&sb->s_lock) != 0)
-                BUG();
-        sb->s_dirt = 0;
-}
 static int ocfs2_sync_fs(struct super_block *sb, int wait)
 {
        int status;
        tid_t target;
        struct ocfs2_super *osb = OCFS2_SB(sb);
-        sb->s_dirt = 0;
        if (ocfs2_is_hard_readonly(osb))
                return -EROFS;
@@ -595,6 +582,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
        struct mount_options parsed_options;
        struct ocfs2_super *osb = OCFS2_SB(sb);
+        lock_kernel();
        if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) {
                ret = -EINVAL;
                goto out;
@@ -698,6 +687,7 @@ unlock_osb:
                        ocfs2_set_journal_params(osb);
        }
 out:
+        unlock_kernel();
        return ret;
 }
@@ -713,7 +703,7 @@ static int ocfs2_sb_probe(struct super_block *sb,
        *bh = NULL;
        /* may be > 512 */
-        *sector_size = bdev_hardsect_size(sb->s_bdev);
+        *sector_size = bdev_logical_block_size(sb->s_bdev);
        if (*sector_size > OCFS2_MAX_BLOCKSIZE) {
                mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n",
                     *sector_size, OCFS2_MAX_BLOCKSIZE);
@@ -1550,9 +1540,13 @@ static void ocfs2_put_super(struct super_block *sb)
 {
        mlog_entry("(0x%p)\n", sb);
+        lock_kernel();
        ocfs2_sync_blockdev(sb);
        ocfs2_dismount_volume(sb, 0);
+        unlock_kernel();
        mlog_exit_void();
 }
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 834b2331f6b3..d17e774eaf45 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -11,21 +11,6 @@
 #include <linux/mpage.h>
 #include "omfs.h"
-static int omfs_sync_file(struct file *file, struct dentry *dentry,
-                int datasync)
-{
-        struct inode *inode = dentry->d_inode;
-        int err;
-        err = sync_mapping_buffers(inode->i_mapping);
-        if (!(inode->i_state & I_DIRTY))
-                return err;
-        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-                return err;
-        err |= omfs_sync_inode(inode);
-        return err ? -EIO : 0;
-}
 static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset)
 {
        return (sbi->s_sys_blocksize - offset -
@@ -344,7 +329,7 @@ struct file_operations omfs_file_operations = {
        .aio_read = generic_file_aio_read,
        .aio_write = generic_file_aio_write,
        .mmap = generic_file_mmap,
-        .fsync = omfs_sync_file,
+        .fsync = simple_fsync,
        .splice_read = generic_file_splice_read,
 };
diff --git a/fs/open.c b/fs/open.c
index bdfbf03615a4..7200e23d9258 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -612,7 +612,7 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode)
        audit_inode(NULL, dentry);
-        err = mnt_want_write(file->f_path.mnt);
+        err = mnt_want_write_file(file);
        if (err)
                goto out_putf;
        mutex_lock(&inode->i_mutex);
@@ -761,7 +761,7 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
        if (!file)
                goto out;
-        error = mnt_want_write(file->f_path.mnt);
+        error = mnt_want_write_file(file);
        if (error)
                goto out_fput;
        dentry = file->f_path.dentry;
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 99e33ef40be4..1a9c7878f864 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -219,6 +219,13 @@ ssize_t part_size_show(struct device *dev,
        return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
 }
+/*
+ * Show handler for the "alignment_offset" device attribute: prints the
+ * partition's alignment_offset as an unsigned decimal plus a newline into
+ * @buf and returns whatever sprintf() returns.
+ */
+ssize_t part_alignment_offset_show(struct device *dev,
+                                   struct device_attribute *attr, char *buf)
+{
+        unsigned long long offset = dev_to_part(dev)->alignment_offset;
+        return sprintf(buf, "%llu\n", offset);
+}
 ssize_t part_stat_show(struct device *dev,
                       struct device_attribute *attr, char *buf)
 {
@@ -272,6 +279,7 @@ ssize_t part_fail_store(struct device *dev,
 static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
 static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
+static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
@@ -282,6 +290,7 @@ static struct attribute *part_attrs[] = {
        &dev_attr_partition.attr,
        &dev_attr_start.attr,
        &dev_attr_size.attr,
+        &dev_attr_alignment_offset.attr,
        &dev_attr_stat.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
        &dev_attr_fail.attr,
@@ -383,6 +392,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
        pdev = part_to_dev(p);
        p->start_sect = start;
+        p->alignment_offset = queue_sector_alignment_offset(disk->queue, start);
        p->nr_sects = len;
        p->partno = partno;
        p->policy = get_disk_ro(disk);
@@ -546,27 +556,49 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
        /* add partitions */
        for (p = 1; p < state->limit; p++) {
-                sector_t size = state->parts[p].size;
+                sector_t size, from;
-                sector_t from = state->parts[p].from;
+try_scan:
+                size = state->parts[p].size;
                if (!size)
                        continue;
+                from = state->parts[p].from;
                if (from >= get_capacity(disk)) {
                        printk(KERN_WARNING
                               "%s: p%d ignored, start %llu is behind the end of the disk\n",
                               disk->disk_name, p, (unsigned long long) from);
                        continue;
                }
                if (from + size > get_capacity(disk)) {
-                        /*
+                        struct block_device_operations *bdops = disk->fops;
-                         * we can not ignore partitions of broken tables
+                        unsigned long long capacity;
-                         * created by for example camera firmware, but we
-                         * limit them to the end of the disk to avoid
-                         * creating invalid block devices
-                         */
                        printk(KERN_WARNING
-                               "%s: p%d size %llu limited to end of disk\n",
+                               "%s: p%d size %llu exceeds device capacity, ",
                               disk->disk_name, p, (unsigned long long) size);
-                        size = get_capacity(disk) - from;
+                        if (bdops->set_capacity &&
+                            (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) {
+                                printk(KERN_CONT "enabling native capacity\n");
+                                capacity = bdops->set_capacity(disk, ~0ULL);
+                                disk->flags |= GENHD_FL_NATIVE_CAPACITY;
+                                if (capacity > get_capacity(disk)) {
+                                        set_capacity(disk, capacity);
+                                        check_disk_size_change(disk, bdev);
+                                        bdev->bd_invalidated = 0;
+                                }
+                                goto try_scan;
+                        } else {
+                                /*
+                                 * we can not ignore partitions of broken tables
+                                 * created by for example camera firmware, but
+                                 * we limit them to the end of the disk to avoid
+                                 * creating invalid block devices
+                                 */
+                                printk(KERN_CONT "limited to end of disk\n");
+                                size = get_capacity(disk) - from;
+                        }
                }
                part = add_partition(disk, p, from, size,
                                     state->parts[p].flags);
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 46297683cd34..fc71aab08460 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -76,7 +76,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
        Sector sect;
        res = 0;
-        blocksize = bdev_hardsect_size(bdev);
+        blocksize = bdev_logical_block_size(bdev);
        if (blocksize <= 0)
                goto out_exit;
        i_size = i_size_read(bdev->bd_inode);
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 796511886f28..0028d2ef0662 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -110,7 +110,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
        Sector sect;
        unsigned char *data;
        u32 this_sector, this_size;
-        int sector_size = bdev_hardsect_size(bdev) / 512;
+        int sector_size = bdev_logical_block_size(bdev) / 512;
        int loopct = 0;         /* number of links followed
                                   without finding a data partition */
        int i;
@@ -415,7 +415,7 @@ static struct {
 
 int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 {
-        int sector_size = bdev_hardsect_size(bdev) / 512;
+        int sector_size = bdev_logical_block_size(bdev) / 512;
        Sector sect;
        unsigned char *data;
        struct partition *p;
diff --git a/fs/pipe.c b/fs/pipe.c
index 13414ec45b8d..f7dd21ad85a6 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -302,6 +302,20 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *info,
        return 0;
 }
+/**
+ * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
+ * @pipe:       the pipe that the buffer belongs to (not used by this
+ *              implementation)
+ * @buf:        the buffer to put a reference to
+ *
+ * Description:
+ *      This function releases a reference to @buf by calling
+ *      page_cache_release() on the page held in @buf->page.
+ */
+void generic_pipe_buf_release(struct pipe_inode_info *pipe,
+                              struct pipe_buffer *buf)
+{
+        page_cache_release(buf->page);
+}
 static const struct pipe_buf_operations anon_pipe_buf_ops = {
        .can_merge = 1,
        .map = generic_pipe_buf_map,
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3326bbf9ab95..1539e630c47d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2128,9 +2128,15 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
        if (copy_from_user(page, buf, count))
                goto out_free;
+        /* Guard against adverse ptrace interaction */
+        length = mutex_lock_interruptible(&task->cred_guard_mutex);
+        if (length < 0)
+                goto out_free;
        length = security_setprocattr(task,
                                      (char*)file->f_path.dentry->d_name.name,
                                      (void*)page, count);
+        mutex_unlock(&task->cred_guard_mutex);
 out_free:
        free_page((unsigned long) page);
 out:
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index f6db9618a888..753ca37002c8 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -92,3 +92,28 @@ struct pde_opener {
        struct list_head lh;
 };
 void pde_users_dec(struct proc_dir_entry *pde);
+extern spinlock_t proc_subdir_lock;
+struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *);
+int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
+unsigned long task_vsize(struct mm_struct *);
+int task_statm(struct mm_struct *, int *, int *, int *, int *);
+void task_mem(struct seq_file *, struct mm_struct *);
+struct proc_dir_entry *de_get(struct proc_dir_entry *de);
+void de_put(struct proc_dir_entry *de);
+extern struct vfsmount *proc_mnt;
+int proc_fill_super(struct super_block *);
+struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *);
+/*
+ * These are generic /proc routines that use the internal
+ * "struct proc_dir_entry" tree to traverse the filesystem.
+ *
+ * The /proc root directory has extended versions to take care
+ * of the /proc/<pid> subdirectories.
+ */
+int proc_readdir(struct file *, void *, filldir_t);
+struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *);
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 9bca39cf99ee..1afa4dd4cae2 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -12,20 +12,14 @@
 static int loadavg_proc_show(struct seq_file *m, void *v)
 {
-        int a, b, c;
+        unsigned long avnrun[3];
-        unsigned long seq;
-        do {
+        get_avenrun(avnrun, FIXED_1/200, 0);
-                seq = read_seqbegin(&xtime_lock);
-                a = avenrun[0] + (FIXED_1/200);
-                b = avenrun[1] + (FIXED_1/200);
-                c = avenrun[2] + (FIXED_1/200);
-        } while (read_seqretry(&xtime_lock, seq));
-        seq_printf(m, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
+        seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n",
-                LOAD_INT(a), LOAD_FRAC(a),
+                LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
-                LOAD_INT(b), LOAD_FRAC(b),
+                LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
-                LOAD_INT(c), LOAD_FRAC(c),
+                LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
                nr_running(), nr_threads,
                task_active_pid_ns(current)->last_pid);
        return 0;
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index de2bba5a3440..fc6c3025befd 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -11,6 +11,7 @@
 #include <linux/string.h>
 #include <asm/prom.h>
 #include <asm/uaccess.h>
+#include "internal.h"
 #ifndef HAVE_ARCH_DEVTREE_FIXUPS
 static inline void set_node_proc_entry(struct device_node *np,
diff --git a/fs/qnx4/Makefile b/fs/qnx4/Makefile
index 502d7fe98bab..e4d408cc5473 100644
--- a/fs/qnx4/Makefile
+++ b/fs/qnx4/Makefile
@@ -4,4 +4,4 @@
 obj-$(CONFIG_QNX4FS_FS) += qnx4.o
-qnx4-objs := inode.o dir.o namei.o file.o bitmap.o truncate.o fsync.o
+qnx4-objs := inode.o dir.o namei.o file.o bitmap.o truncate.o
diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c
index 8425cf6e9624..e1cd061a25f7 100644
--- a/fs/qnx4/bitmap.c
+++ b/fs/qnx4/bitmap.c
@@ -13,14 +13,9 @@
 * 28-06-1998 by Frank Denis : qnx4_free_inode (to be fixed) .
 */
-#include <linux/time.h>
-#include <linux/fs.h>
-#include <linux/qnx4_fs.h>
-#include <linux/stat.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
 #include <linux/buffer_head.h>
 #include <linux/bitops.h>
+#include "qnx4.h"
 #if 0
 int qnx4_new_block(struct super_block *sb)
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index ea9ffefb48ad..003c68f3238b 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -11,14 +11,9 @@
 * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support.
 */
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/qnx4_fs.h>
-#include <linux/stat.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
+#include "qnx4.h"
 static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
 {
@@ -84,7 +79,7 @@ const struct file_operations qnx4_dir_operations =
 {
        .read           = generic_read_dir,
        .readdir        = qnx4_readdir,
-        .fsync          = file_fsync,
+        .fsync          = simple_fsync,
 };
 const struct inode_operations qnx4_dir_inode_operations =
diff --git a/fs/qnx4/file.c b/fs/qnx4/file.c
index 867f42b02035..09b170ac936c 100644
--- a/fs/qnx4/file.c
+++ b/fs/qnx4/file.c
@@ -12,8 +12,7 @@
 * 27-06-1998 by Frank Denis : file overwriting.
 */
-#include <linux/fs.h>
+#include "qnx4.h"
-#include <linux/qnx4_fs.h>
 /*
 * We have mostly NULL's here: the current defaults are ok for
@@ -29,7 +28,7 @@ const struct file_operations qnx4_file_operations =
 #ifdef CONFIG_QNX4FS_RW
        .write          = do_sync_write,
        .aio_write      = generic_file_aio_write,
-        .fsync          = qnx4_sync_file,
+        .fsync          = simple_fsync,
 #endif
 };
diff --git a/fs/qnx4/fsync.c b/fs/qnx4/fsync.c
deleted file mode 100644
index aa3b19544bee..000000000000
--- a/fs/qnx4/fsync.c
+++ /dev/null
@@ -1,169 +0,0 @@
-/* 
- * QNX4 file system, Linux implementation.
- * 
- * Version : 0.1
- * 
- * Using parts of the xiafs filesystem.
- * 
- * History :
- * 
- * 24-03-1998 by Richard Frowijn : first release.
- */
-#include <linux/errno.h>
-#include <linux/time.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
-#include <linux/smp_lock.h>
-#include <linux/buffer_head.h>
-#include <linux/fs.h>
-#include <linux/qnx4_fs.h>
-#include <asm/system.h>
-/*
- * The functions for qnx4 fs file synchronization.
- */
-#ifdef CONFIG_QNX4FS_RW
-static int sync_block(struct inode *inode, unsigned short *block, int wait)
-{
-        struct buffer_head *bh;
-        unsigned short tmp;
-        if (!*block)
-                return 0;
-        tmp = *block;
-        bh = sb_find_get_block(inode->i_sb, *block);
-        if (!bh)
-                return 0;
-        if (*block != tmp) {
-                brelse(bh);
-                return 1;
-        }
-        if (wait && buffer_req(bh) && !buffer_uptodate(bh)) {
-                brelse(bh);
-                return -1;
-        }
-        if (wait || !buffer_uptodate(bh) || !buffer_dirty(bh)) {
-                brelse(bh);
-                return 0;
-        }
-        ll_rw_block(WRITE, 1, &bh);
-        atomic_dec(&bh->b_count);
-        return 0;
-}
-#ifdef WTF
-static int sync_iblock(struct inode *inode, unsigned short *iblock,
-                       struct buffer_head **bh, int wait)
-{
-        int rc;
-        unsigned short tmp;
-        *bh = NULL;
-        tmp = *iblock;
-        if (!tmp)
-                return 0;
-        rc = sync_block(inode, iblock, wait);
-        if (rc)
-                return rc;
-        *bh = sb_bread(inode->i_sb, tmp);
-        if (tmp != *iblock) {
-                brelse(*bh);
-                *bh = NULL;
-                return 1;
-        }
-        if (!*bh)
-                return -1;
-        return 0;
-}
-#endif
-static int sync_direct(struct inode *inode, int wait)
-{
-        int i;
-        int rc, err = 0;
-        for (i = 0; i < 7; i++) {
-                rc = sync_block(inode,
-                                (unsigned short *) qnx4_raw_inode(inode)->di_first_xtnt.xtnt_blk + i, wait);
-                if (rc > 0)
-                        break;
-                if (rc)
-                        err = rc;
-        }
-        return err;
-}
-#ifdef WTF
-static int sync_indirect(struct inode *inode, unsigned short *iblock, int wait)
-{
-        int i;
-        struct buffer_head *ind_bh;
-        int rc, err = 0;
-        rc = sync_iblock(inode, iblock, &ind_bh, wait);
-        if (rc || !ind_bh)
-                return rc;
-        for (i = 0; i < 512; i++) {
-                rc = sync_block(inode,
-                                ((unsigned short *) ind_bh->b_data) + i,
-                                wait);
-                if (rc > 0)
-                        break;
-                if (rc)
-                        err = rc;
-        }
-        brelse(ind_bh);
-        return err;
-}
-static int sync_dindirect(struct inode *inode, unsigned short *diblock,
-                          int wait)
-{
-        int i;
-        struct buffer_head *dind_bh;
-        int rc, err = 0;
-        rc = sync_iblock(inode, diblock, &dind_bh, wait);
-        if (rc || !dind_bh)
-                return rc;
-        for (i = 0; i < 512; i++) {
-                rc = sync_indirect(inode,
-                                ((unsigned short *) dind_bh->b_data) + i,
-                                   wait);
-                if (rc > 0)
-                        break;
-                if (rc)
-                        err = rc;
-        }
-        brelse(dind_bh);
-        return err;
-}
-#endif
-int qnx4_sync_file(struct file *file, struct dentry *dentry, int unused)
-{
-        struct inode *inode = dentry->d_inode;
-        int wait, err = 0;
-        
-        (void) file;
-        if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
-              S_ISLNK(inode->i_mode)))
-                return -EINVAL;
-        lock_kernel();
-        for (wait = 0; wait <= 1; wait++) {
-                err |= sync_direct(inode, wait);
-        }
-        err |= qnx4_sync_inode(inode);
-        unlock_kernel();
-        return (err < 0) ? -EIO : 0;
-}
-#endif
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index fe1f0f31d11c..681df5fcd161 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -13,19 +13,15 @@
 */
 #include <linux/module.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
-#include <linux/qnx4_fs.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/highuid.h>
 #include <linux/smp_lock.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
-#include <linux/vfs.h>
+#include <linux/writeback.h>
-#include <asm/uaccess.h>
+#include <linux/statfs.h>
+#include "qnx4.h"
 #define QNX4_VERSION  4
 #define QNX4_BMNAME   ".bitmap"
@@ -34,31 +30,6 @@ static const struct super_operations qnx4_sops;
 #ifdef CONFIG_QNX4FS_RW
-int qnx4_sync_inode(struct inode *inode)
-{
-        int err = 0;
-# if 0
-        struct buffer_head *bh;
-        bh = qnx4_update_inode(inode);
-        if (bh && buffer_dirty(bh))
-        {
-                sync_dirty_buffer(bh);
-                if (buffer_req(bh) && !buffer_uptodate(bh))
-                {
-                        printk ("IO error syncing qnx4 inode [%s:%08lx]\n",
-                                inode->i_sb->s_id, inode->i_ino);
-                        err = -1;
-                }
-                brelse (bh);
-        } else if (!bh) {
-                err = -1;
-        }
-# endif
-        return err;
-}
 static void qnx4_delete_inode(struct inode *inode)
 {
        QNX4DEBUG(("qnx4: deleting inode [%lu]\n", (unsigned long) inode->i_ino));
@@ -70,15 +41,7 @@ static void qnx4_delete_inode(struct inode *inode)
        unlock_kernel();
 }
-static void qnx4_write_super(struct super_block *sb)
+static int qnx4_write_inode(struct inode *inode, int do_sync)
-{
-        lock_kernel();
-        QNX4DEBUG(("qnx4: write_super\n"));
-        sb->s_dirt = 0;
-        unlock_kernel();
-}
-static int qnx4_write_inode(struct inode *inode, int unused)
 {
        struct qnx4_inode_entry *raw_inode;
        int block, ino;
@@ -115,6 +78,16 @@ static int qnx4_write_inode(struct inode *inode, int unused)
        raw_inode->di_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
        raw_inode->di_first_xtnt.xtnt_size = cpu_to_le32(inode->i_blocks);
        mark_buffer_dirty(bh);
+        if (do_sync) {
+                sync_dirty_buffer(bh);
+                if (buffer_req(bh) && !buffer_uptodate(bh)) {
+                        printk("qnx4: IO error syncing inode [%s:%08x]\n",
+                                        inode->i_sb->s_id, ino);
+                        brelse(bh);
+                        unlock_kernel();
+                        return -EIO;
+                }
+        }
        brelse(bh);
        unlock_kernel();
        return 0;
@@ -138,7 +111,6 @@ static const struct super_operations qnx4_sops =
 #ifdef CONFIG_QNX4FS_RW
        .write_inode    = qnx4_write_inode,
        .delete_inode   = qnx4_delete_inode,
-        .write_super    = qnx4_write_super,
 #endif
 };
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index 775eed3a4085..5972ed214937 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -12,16 +12,9 @@
 * 04-07-1998 by Frank Denis : first step for rmdir/unlink.
 */
-#include <linux/time.h>
-#include <linux/fs.h>
-#include <linux/qnx4_fs.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
-#include <linux/errno.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
+#include "qnx4.h"
 /*
@@ -187,7 +180,7 @@ int qnx4_rmdir(struct inode *dir, struct dentry *dentry)
        de->di_status = 0;
        memset(de->di_fname, 0, sizeof de->di_fname);
        de->di_mode = 0;
-        mark_buffer_dirty(bh);
+        mark_buffer_dirty_inode(bh, dir);
        clear_nlink(inode);
        mark_inode_dirty(inode);
        inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
@@ -232,7 +225,7 @@ int qnx4_unlink(struct inode *dir, struct dentry *dentry)
        de->di_status = 0;
        memset(de->di_fname, 0, sizeof de->di_fname);
        de->di_mode = 0;
-        mark_buffer_dirty(bh);
+        mark_buffer_dirty_inode(bh, dir);
        dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
        mark_inode_dirty(dir);
        inode->i_ctime = dir->i_ctime;
diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h
new file mode 100644
index 000000000000..9efc089454f6
--- /dev/null
+++ b/fs/qnx4/qnx4.h
@@ -0,0 +1,57 @@
+#ifndef FS_QNX4_QNX4_H
+#define FS_QNX4_QNX4_H
+/*
+ * Private declarations shared by the qnx4 filesystem sources.  The include
+ * guard keeps the structures and inline helpers below from being defined
+ * twice if this header is pulled in more than once.
+ */
+#include <linux/fs.h>
+#include <linux/qnx4_fs.h>
+#define QNX4_DEBUG 0
+/*
+ * QNX4DEBUG(X) expands to "printk X" when QNX4_DEBUG is non-zero, so X has
+ * to be a parenthesised printk argument list; otherwise it is a no-op.
+ */
+#if QNX4_DEBUG
+#define QNX4DEBUG(X) printk X
+#else
+#define QNX4DEBUG(X) (void) 0
+#endif
+/* Per-superblock data; qnx4_sb() fetches it from sb->s_fs_info. */
+struct qnx4_sb_info {
+        struct buffer_head      *sb_buf;        /* superblock buffer */
+        struct qnx4_super_block *sb;            /* our superblock */
+        unsigned int            Version;        /* may be useful */
+        struct qnx4_inode_entry *BitMap;        /* useful */
+};
+/*
+ * Per-inode data.  The VFS inode is embedded, which is what lets qnx4_i()
+ * get back to this structure with container_of().
+ */
+struct qnx4_inode_info {
+        struct qnx4_inode_entry raw;
+        loff_t mmu_private;
+        struct inode vfs_inode;
+};
+extern struct inode *qnx4_iget(struct super_block *, unsigned long);
+extern struct dentry *qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd);
+extern unsigned long qnx4_count_free_blocks(struct super_block *sb);
+extern unsigned long qnx4_block_map(struct inode *inode, long iblock);
+extern struct buffer_head *qnx4_bread(struct inode *, int, int);
+extern const struct inode_operations qnx4_file_inode_operations;
+extern const struct inode_operations qnx4_dir_inode_operations;
+extern const struct file_operations qnx4_file_operations;
+extern const struct file_operations qnx4_dir_operations;
+extern int qnx4_is_free(struct super_block *sb, long block);
+extern int qnx4_set_bitmap(struct super_block *sb, long block, int busy);
+extern int qnx4_create(struct inode *inode, struct dentry *dentry, int mode, struct nameidata *nd);
+extern void qnx4_truncate(struct inode *inode);
+extern void qnx4_free_inode(struct inode *inode);
+extern int qnx4_unlink(struct inode *dir, struct dentry *dentry);
+extern int qnx4_rmdir(struct inode *dir, struct dentry *dentry);
+/* Return the qnx4 private data stored in @sb->s_fs_info. */
+static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb)
+{
+        return sb->s_fs_info;
+}
+/* Map a VFS inode to the qnx4_inode_info that embeds it. */
+static inline struct qnx4_inode_info *qnx4_i(struct inode *inode)
+{
+        return container_of(inode, struct qnx4_inode_info, vfs_inode);
+}
+/* Return the raw inode entry kept alongside @inode. */
+static inline struct qnx4_inode_entry *qnx4_raw_inode(struct inode *inode)
+{
+        return &qnx4_i(inode)->raw;
+}
+#endif /* FS_QNX4_QNX4_H */
diff --git a/fs/qnx4/truncate.c b/fs/qnx4/truncate.c
index 6437c1c3d1dd..d94d9ee241fe 100644
--- a/fs/qnx4/truncate.c
+++ b/fs/qnx4/truncate.c
@@ -10,12 +10,8 @@
 * 30-06-1998 by Frank DENIS : ugly filler.
 */
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/qnx4_fs.h>
 #include <linux/smp_lock.h>
-#include <asm/uaccess.h>
+#include "qnx4.h"
 #ifdef CONFIG_QNX4FS_RW
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index b7f5a468f076..95c5b42384b2 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -159,10 +159,14 @@ static int check_quotactl_valid(struct super_block *sb, int type, int cmd,
        return error;
 }
-static void quota_sync_sb(struct super_block *sb, int type)
+#ifdef CONFIG_QUOTA
+void sync_quota_sb(struct super_block *sb, int type)
 {
        int cnt;
+        if (!sb->s_qcop->quota_sync)
+                return;
        sb->s_qcop->quota_sync(sb, type);
        if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)
@@ -191,17 +195,13 @@ static void quota_sync_sb(struct super_block *sb, int type)
        }
        mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 }
+#endif
-void sync_dquots(struct super_block *sb, int type)
+static void sync_dquots(int type)
 {
+        struct super_block *sb;
        int cnt;
-        if (sb) {
-                if (sb->s_qcop->quota_sync)
-                        quota_sync_sb(sb, type);
-                return;
-        }
        spin_lock(&sb_lock);
 restart:
        list_for_each_entry(sb, &super_blocks, s_list) {
@@ -222,8 +222,8 @@ restart:
                sb->s_count++;
                spin_unlock(&sb_lock);
                down_read(&sb->s_umount);
-                if (sb->s_root && sb->s_qcop->quota_sync)
+                if (sb->s_root)
-                        quota_sync_sb(sb, type);
+                        sync_quota_sb(sb, type);
                up_read(&sb->s_umount);
                spin_lock(&sb_lock);
                if (__put_super_and_need_restart(sb))
@@ -301,7 +301,10 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
                        return sb->s_qcop->set_dqblk(sb, type, id, &idq);
                }
                case Q_SYNC:
-                        sync_dquots(sb, type);
+                        if (sb)
+                                sync_quota_sb(sb, type);
+                        else
+                                sync_dquots(type);
                        return 0;
                case Q_XQUOTAON:
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 3a6b193d8444..0ff7566c767c 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -202,9 +202,12 @@ static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)
                                return -EINVAL;
                        opts->mode = option & S_IALLUGO;
                        break;
-                default:
+                /*
-                        printk(KERN_ERR "ramfs: bad mount option: %s\n", p);
+                 * We might like to report bad mount options here;
-                        return -EINVAL;
+                 * but traditionally ramfs has ignored all mount options,
+                 * and as it is used as a !CONFIG_SHMEM simple substitute
+                 * for tmpfs, better continue to ignore other mount options.
+                 */
                }
        }
diff --git a/fs/read_write.c b/fs/read_write.c
index 9d1e76bb9ee1..6c8c55dec2bc 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -805,12 +805,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
                goto out;
        if (!(in_file->f_mode & FMODE_READ))
                goto fput_in;
-        retval = -EINVAL;
-        in_inode = in_file->f_path.dentry->d_inode;
-        if (!in_inode)
-                goto fput_in;
-        if (!in_file->f_op || !in_file->f_op->splice_read)
-                goto fput_in;
        retval = -ESPIPE;
        if (!ppos)
                ppos = &in_file->f_pos;
@@ -834,6 +828,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
        retval = -EINVAL;
        if (!out_file->f_op || !out_file->f_op->sendpage)
                goto fput_out;
+        in_inode = in_file->f_path.dentry->d_inode;
        out_inode = out_file->f_path.dentry->d_inode;
        retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
        if (retval < 0)
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 45ee3d357c70..6d2668fdc384 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -44,13 +44,11 @@ static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
 static inline bool is_privroot_deh(struct dentry *dir,
                                   struct reiserfs_de_head *deh)
 {
-        int ret = 0;
-#ifdef CONFIG_REISERFS_FS_XATTR
        struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root;
-        ret = (dir == dir->d_parent && privroot->d_inode &&
+        if (reiserfs_expose_privroot(dir->d_sb))
-               deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid);
+                return 0;
-#endif
+        return (dir == dir->d_parent && privroot->d_inode &&
-        return ret;
+                deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid);
 }
 int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 3567fb9e3fb1..2969773cfc22 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -28,6 +28,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/crc32.h>
+#include <linux/smp_lock.h>
 struct file_system_type reiserfs_fs_type;
@@ -64,18 +65,15 @@ static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int reiserfs_sync_fs(struct super_block *s, int wait)
 {
-        if (!(s->s_flags & MS_RDONLY)) {
+        struct reiserfs_transaction_handle th;
-                struct reiserfs_transaction_handle th;
-                reiserfs_write_lock(s);
+        reiserfs_write_lock(s);
-                if (!journal_begin(&th, s, 1))
+        if (!journal_begin(&th, s, 1))
-                        if (!journal_end_sync(&th, s, 1))
+                if (!journal_end_sync(&th, s, 1))
-                                reiserfs_flush_old_commits(s);
+                        reiserfs_flush_old_commits(s);
-                s->s_dirt = 0;  /* Even if it's not true.
+        s->s_dirt = 0;  /* Even if it's not true.
-                                 * We'll loop forever in sync_supers otherwise */
+                         * We'll loop forever in sync_supers otherwise */
-                reiserfs_write_unlock(s);
+        reiserfs_write_unlock(s);
-        } else {
-                s->s_dirt = 0;
-        }
        return 0;
 }
@@ -468,6 +466,11 @@ static void reiserfs_put_super(struct super_block *s)
        struct reiserfs_transaction_handle th;
        th.t_trans_id = 0;
+        lock_kernel();
+        if (s->s_dirt)
+                reiserfs_write_super(s);
        /* change file system state to current state if it was mounted with read-write permissions */
        if (!(s->s_flags & MS_RDONLY)) {
                if (!journal_begin(&th, s, 10)) {
@@ -500,7 +503,7 @@ static void reiserfs_put_super(struct super_block *s)
        kfree(s->s_fs_info);
        s->s_fs_info = NULL;
-        return;
+        unlock_kernel();
 }
 static struct kmem_cache *reiserfs_inode_cachep;
@@ -898,6 +901,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
                {"conv",.setmask = 1 << REISERFS_CONVERT},
                {"attrs",.setmask = 1 << REISERFS_ATTRS},
                {"noattrs",.clrmask = 1 << REISERFS_ATTRS},
+                {"expose_privroot", .setmask = 1 << REISERFS_EXPOSE_PRIVROOT},
 #ifdef CONFIG_REISERFS_FS_XATTR
                {"user_xattr",.setmask = 1 << REISERFS_XATTRS_USER},
                {"nouser_xattr",.clrmask = 1 << REISERFS_XATTRS_USER},
@@ -1193,6 +1197,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
        memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names));
 #endif
+        lock_kernel();
        rs = SB_DISK_SUPER_BLOCK(s);
        if (!reiserfs_parse_options
@@ -1315,10 +1320,12 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 out_ok:
        replace_mount_options(s, new_opts);
+        unlock_kernel();
        return 0;
 out_err:
        kfree(new_opts);
+        unlock_kernel();
        return err;
 }
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 8e7deb0e6964..f3d47d856848 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -981,7 +981,8 @@ int reiserfs_lookup_privroot(struct super_block *s)
                                strlen(PRIVROOT_NAME));
        if (!IS_ERR(dentry)) {
                REISERFS_SB(s)->priv_root = dentry;
-                s->s_root->d_op = &xattr_lookup_poison_ops;
+                if (!reiserfs_expose_privroot(s))
+                        s->s_root->d_op = &xattr_lookup_poison_ops;
                if (dentry->d_inode)
                        dentry->d_inode->i_flags |= S_PRIVATE;
        } else
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index fc27fbfc5397..1402d2d54f52 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -474,6 +474,8 @@ smb_put_super(struct super_block *sb)
 {
        struct smb_sb_info *server = SMB_SB(sb);
+        lock_kernel();
        smb_lock_server(server);
        server->state = CONN_INVALID;
        smbiod_unregister_server(server);
@@ -489,6 +491,8 @@ smb_put_super(struct super_block *sb)
        smb_unlock_server(server);
        put_pid(server->conn_pid);
        kfree(server);
+        unlock_kernel();
 }
 static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
diff --git a/fs/splice.c b/fs/splice.c
index 666953d59a35..73766d24f97b 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -507,9 +507,131 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
        return ret;
 }
 EXPORT_SYMBOL(generic_file_splice_read);
+/*
+ * Buffer operations attached to the pages that default_file_splice_read()
+ * allocates and hands to the pipe: merging is disabled and every hook is
+ * the generic pipe-buffer helper.
+ */
+static const struct pipe_buf_operations default_pipe_buf_ops = {
+        .get = generic_pipe_buf_get,
+        .release = generic_pipe_buf_release,
+        .steal = generic_pipe_buf_steal,
+        .confirm = generic_pipe_buf_confirm,
+        .map = generic_pipe_buf_map,
+        .unmap = generic_pipe_buf_unmap,
+        .can_merge = 0,
+};
+/*
+ * Run vfs_readv() on an iovec array that lives in kernel memory.  @offset
+ * is received by value, so whatever position update vfs_readv() makes is
+ * not visible to the caller.  Returns the vfs_readv() result unchanged.
+ */
+static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
+                            unsigned long vlen, loff_t offset)
+{
+        const struct iovec __user *uvec;
+        mm_segment_t saved_fs = get_fs();
+        ssize_t nread;
+        set_fs(get_ds());
+        /* The set_fs() above is what makes this __user cast valid. */
+        uvec = (const struct iovec __user *)vec;
+        nread = vfs_readv(file, uvec, vlen, &offset);
+        set_fs(saved_fs);
+        return nread;
+}
+/*
+ * Run vfs_write() on a buffer that lives in kernel memory.  @pos is
+ * received by value, so the position update made by vfs_write() stays
+ * local.  Returns the vfs_write() result unchanged.
+ */
+static ssize_t kernel_write(struct file *file, const char *buf, size_t count,
+                            loff_t pos)
+{
+        /* Only valid as a user pointer while set_fs(get_ds()) is in effect. */
+        const char __user *ubuf = (const char __user *)buf;
+        mm_segment_t saved_fs = get_fs();
+        ssize_t nwritten;
+        set_fs(get_ds());
+        nwritten = vfs_write(file, ubuf, count, &pos);
+        set_fs(saved_fs);
+        return nwritten;
+}
+/**
+ * default_file_splice_read - splice data from a file into a pipe via readv
+ * @in:         file to read from
+ * @ppos:       position in @in to read at; advanced by the number of bytes
+ *              spliced when that number is positive
+ * @pipe:       pipe that receives the data
+ * @len:        number of bytes requested
+ * @flags:      splice flags, stored in the splice_pipe_desc
+ *
+ * Allocates up to PIPE_BUFFERS pages, fills them with one kernel_readv()
+ * call and passes the pages that received data to splice_to_pipe().
+ *
+ * Returns the splice_to_pipe() result, 0 if the read produced no data,
+ * -ENOMEM if a page could not be allocated, or the negative value
+ * returned by kernel_readv().
+ */
+ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
+                                 struct pipe_inode_info *pipe, size_t len,
+                                 unsigned int flags)
+{
+        unsigned int nr_pages;
+        unsigned int nr_freed;
+        size_t offset;
+        struct page *pages[PIPE_BUFFERS];
+        struct partial_page partial[PIPE_BUFFERS];
+        struct iovec vec[PIPE_BUFFERS];
+        ssize_t res;
+        size_t this_len;
+        int error;
+        int i;
+        struct splice_pipe_desc spd = {
+                .pages = pages,
+                .partial = partial,
+                .flags = flags,
+                .ops = &default_pipe_buf_ops,
+                .spd_release = spd_release_page,
+        };
+        offset = *ppos & ~PAGE_CACHE_MASK;
+        nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+        /*
+         * One page per iovec entry.  The first entry is capped at
+         * PAGE_CACHE_SIZE - offset bytes, later ones at a full page.
+         * spd.nr_pages counts the pages allocated so far, so the error
+         * path frees exactly those.
+         */
+        for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) {
+                struct page *page;
+                page = alloc_page(GFP_USER);
+                error = -ENOMEM;
+                if (!page)
+                        goto err;
+                this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
+                vec[i].iov_base = (void __user *) page_address(page);
+                vec[i].iov_len = this_len;
+                pages[i] = page;
+                spd.nr_pages++;
+                len -= this_len;
+                offset = 0;
+        }
+        res = kernel_readv(in, vec, spd.nr_pages, *ppos);
+        if (res < 0) {
+                error = res;
+                goto err;
+        }
+        /* Nothing was read: release the pages and report 0. */
+        error = 0;
+        if (!res)
+                goto err;
+        /*
+         * Spread the bytes read over the pages in order.  Once res reaches
+         * zero every remaining page gets a zero length, so the pages freed
+         * here are always the trailing ones and shrinking the count below
+         * leaves only filled pages in the descriptor.
+         */
+        nr_freed = 0;
+        for (i = 0; i < spd.nr_pages; i++) {
+                this_len = min_t(size_t, vec[i].iov_len, res);
+                partial[i].offset = 0;
+                partial[i].len = this_len;
+                if (!this_len) {
+                        __free_page(pages[i]);
+                        pages[i] = NULL;
+                        nr_freed++;
+                }
+                res -= this_len;
+        }
+        spd.nr_pages -= nr_freed;
+        res = splice_to_pipe(pipe, &spd);
+        if (res > 0)
+                *ppos += res;
+        return res;
+err:
+        for (i = 0; i < spd.nr_pages; i++)
+                __free_page(pages[i]);
+        return error;
+}
+EXPORT_SYMBOL(default_file_splice_read);
 /*
 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
 * using sendpage(). Return the number of bytes sent.
@@ -881,6 +1003,36 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 EXPORT_SYMBOL(generic_file_splice_write);
+/*
+ * Actor passed to splice_from_pipe(): write sd->len bytes of one pipe
+ * buffer to sd->u.file at sd->pos using kernel_write().  Returns the
+ * ->confirm() error if the buffer cannot be confirmed, otherwise the
+ * kernel_write() result.
+ */
+static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
+                          struct splice_desc *sd)
+{
+        int ret = buf->ops->confirm(pipe, buf);
+        if (!ret) {
+                /* The buffer stays mapped only for the duration of the write. */
+                void *kaddr = buf->ops->map(pipe, buf, 0);
+                ret = kernel_write(sd->u.file, kaddr + buf->offset,
+                                   sd->len, sd->pos);
+                buf->ops->unmap(pipe, buf, kaddr);
+        }
+        return ret;
+}
+/*
+ * splice_write fallback: drain the pipe into @out through write_pipe_buf()
+ * and advance *@ppos by the number of bytes written when that number is
+ * positive.  Returns the splice_from_pipe() result.
+ */
+static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
+                                         struct file *out, loff_t *ppos,
+                                         size_t len, unsigned int flags)
+{
+        ssize_t spliced = splice_from_pipe(pipe, out, ppos, len, flags,
+                                           write_pipe_buf);
+        if (spliced <= 0)
+                return spliced;
+        *ppos += spliced;
+        return spliced;
+}
 /**
 * generic_splice_sendpage - splice data from a pipe to a socket
 * @pipe:       pipe to splice from
@@ -908,11 +1060,10 @@ EXPORT_SYMBOL(generic_splice_sendpage);
 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
                           loff_t *ppos, size_t len, unsigned int flags)
 {
+        ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
+                                loff_t *, size_t, unsigned int);
        int ret;
-        if (unlikely(!out->f_op || !out->f_op->splice_write))
-                return -EINVAL;
        if (unlikely(!(out->f_mode & FMODE_WRITE)))
                return -EBADF;
@@ -923,7 +1074,11 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
        if (unlikely(ret < 0))
                return ret;
-        return out->f_op->splice_write(pipe, out, ppos, len, flags);
+        splice_write = out->f_op->splice_write;
+        if (!splice_write)
+                splice_write = default_file_splice_write;
+        return splice_write(pipe, out, ppos, len, flags);
 }
 /*
@@ -933,11 +1088,10 @@ static long do_splice_to(struct file *in, loff_t *ppos,
                         struct pipe_inode_info *pipe, size_t len,
                         unsigned int flags)
 {
+        ssize_t (*splice_read)(struct file *, loff_t *,
+                               struct pipe_inode_info *, size_t, unsigned int);
        int ret;
-        if (unlikely(!in->f_op || !in->f_op->splice_read))
-                return -EINVAL;
        if (unlikely(!(in->f_mode & FMODE_READ)))
                return -EBADF;
@@ -945,7 +1099,11 @@ static long do_splice_to(struct file *in, loff_t *ppos,
        if (unlikely(ret < 0))
                return ret;
-        return in->f_op->splice_read(in, ppos, pipe, len, flags);
+        splice_read = in->f_op->splice_read;
+        if (!splice_read)
+                splice_read = default_file_splice_read;
+        return splice_read(in, ppos, pipe, len, flags);
 }
 /**
@@ -1112,6 +1270,9 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
        return ret;
 }
+static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
+                               struct pipe_inode_info *opipe,
+                               size_t len, unsigned int flags);
 /*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
@@ -1132,12 +1293,32 @@ static long do_splice(struct file *in, loff_t __user *off_in,
                      struct file *out, loff_t __user *off_out,
                      size_t len, unsigned int flags)
 {
-        struct pipe_inode_info *pipe;
+        struct pipe_inode_info *ipipe;
+        struct pipe_inode_info *opipe;
        loff_t offset, *off;
        long ret;
-        pipe = pipe_info(in->f_path.dentry->d_inode);
+        ipipe = pipe_info(in->f_path.dentry->d_inode);
-        if (pipe) {
+        opipe = pipe_info(out->f_path.dentry->d_inode);
+        if (ipipe && opipe) {
+                if (off_in || off_out)
+                        return -ESPIPE;
+                if (!(in->f_mode & FMODE_READ))
+                        return -EBADF;
+                if (!(out->f_mode & FMODE_WRITE))
+                        return -EBADF;
+                /* Splicing to self would be fun, but... */
+                if (ipipe == opipe)
+                        return -EINVAL;
+                return splice_pipe_to_pipe(ipipe, opipe, len, flags);
+        }
+        if (ipipe) {
                if (off_in)
                        return -ESPIPE;
                if (off_out) {
@@ -1149,7 +1330,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
                } else
                        off = &out->f_pos;
-                ret = do_splice_from(pipe, out, off, len, flags);
+                ret = do_splice_from(ipipe, out, off, len, flags);
                if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
                        ret = -EFAULT;
@@ -1157,8 +1338,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
                return ret;
        }
-        pipe = pipe_info(out->f_path.dentry->d_inode);
+        if (opipe) {
-        if (pipe) {
                if (off_out)
                        return -ESPIPE;
                if (off_in) {
@@ -1170,7 +1350,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
                } else
                        off = &in->f_pos;
-                ret = do_splice_to(in, off, pipe, len, flags);
+                ret = do_splice_to(in, off, opipe, len, flags);
                if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
                        ret = -EFAULT;
@@ -1511,7 +1691,7 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
 * Make sure there's data to read. Wait for input if we can, otherwise
 * return an appropriate error.
 */
-static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
+static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 {
        int ret;
@@ -1549,7 +1729,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 * Make sure there's writeable room. Wait for room if we can, otherwise
 * return an appropriate error.
 */
-static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
+static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 {
        int ret;
@@ -1587,6 +1767,124 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 }
 /*
+ * Splice contents of ipipe to opipe.
+ *
+ * Transfers up to len bytes.  An input buffer that fits within the
+ * remaining length is moved to opipe as a whole; otherwise an extra
+ * reference is taken, opipe receives a buffer covering only the remaining
+ * length, and the input buffer's offset and length are advanced past it.
+ *
+ * Returns the number of bytes transferred, or:
+ *   - the non-zero result of ipipe_prep() or opipe_prep();
+ *   - -EPIPE if opipe has no readers and nothing had been transferred
+ *     (SIGPIPE is sent to current whenever opipe has no readers);
+ *   - -EAGAIN if no progress is possible and SPLICE_F_NONBLOCK is set;
+ *   - 0 if ipipe is empty and has no writers before anything was moved.
+ */
+static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
+                               struct pipe_inode_info *opipe,
+                               size_t len, unsigned int flags)
+{
+        struct pipe_buffer *ibuf, *obuf;
+        int ret = 0, nbuf;
+        bool input_wakeup = false;
+retry:
+        ret = ipipe_prep(ipipe, flags);
+        if (ret)
+                return ret;
+        ret = opipe_prep(opipe, flags);
+        if (ret)
+                return ret;
+        /*
+         * Potential ABBA deadlock, work around it by ordering lock
+         * grabbing by pipe info address. Otherwise two different processes
+         * could deadlock (one moving data from A -> B, the other from
+         * B -> A).
+         */
+        pipe_double_lock(ipipe, opipe);
+        do {
+                if (!opipe->readers) {
+                        send_sig(SIGPIPE, current, 0);
+                        if (!ret)
+                                ret = -EPIPE;
+                        break;
+                }
+                if (!ipipe->nrbufs && !ipipe->writers)
+                        break;
+                /*
+                 * Cannot make any progress, because either the input
+                 * pipe is empty or the output pipe is full.
+                 */
+                if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) {
+                        /* Already processed some buffers, break */
+                        if (ret)
+                                break;
+                        if (flags & SPLICE_F_NONBLOCK) {
+                                ret = -EAGAIN;
+                                break;
+                        }
+                        /*
+                         * We raced with another reader/writer and haven't
+                         * managed to process any buffers.  A zero return
+                         * value means EOF, so retry instead.  Both locks
+                         * are dropped first; the retry path goes through
+                         * the prep calls again.
+                         */
+                        pipe_unlock(ipipe);
+                        pipe_unlock(opipe);
+                        goto retry;
+                }
+                ibuf = ipipe->bufs + ipipe->curbuf;
+                nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS;
+                obuf = opipe->bufs + nbuf;
+                if (len >= ibuf->len) {
+                        /*
+                         * Simply move the whole buffer from ipipe to opipe.
+                         * The input slot's ops are cleared and the slot is
+                         * consumed, so writers on ipipe get woken later.
+                         */
+                        *obuf = *ibuf;
+                        ibuf->ops = NULL;
+                        opipe->nrbufs++;
+                        ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS;
+                        ipipe->nrbufs--;
+                        input_wakeup = true;
+                } else {
+                        /*
+                         * Get a reference to this pipe buffer,
+                         * so we can copy the contents over.
+                         */
+                        ibuf->ops->get(ipipe, ibuf);
+                        *obuf = *ibuf;
+                        /*
+                         * Don't inherit the gift flag, we need to
+                         * prevent multiple steals of this page.
+                         */
+                        obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
+                        obuf->len = len;
+                        opipe->nrbufs++;
+                        ibuf->offset += obuf->len;
+                        ibuf->len -= obuf->len;
+                }
+                ret += obuf->len;
+                len -= obuf->len;
+        } while (len);
+        pipe_unlock(ipipe);
+        pipe_unlock(opipe);
+        /*
+         * If we put data in the output pipe, wakeup any potential readers.
+         */
+        if (ret > 0) {
+                smp_mb();
+                if (waitqueue_active(&opipe->wait))
+                        wake_up_interruptible(&opipe->wait);
+                kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
+        }
+        if (input_wakeup)
+                wakeup_pipe_writers(ipipe);
+        return ret;
+}
+/*
 * Link contents of ipipe to opipe.
 */
 static int link_pipe(struct pipe_inode_info *ipipe,
@@ -1690,9 +1988,9 @@ static long do_tee(struct file *in, struct file *out, size_t len,
                 * Keep going, unless we encounter an error. The ipipe/opipe
                 * ordering doesn't really matter.
                 */
-                ret = link_ipipe_prep(ipipe, flags);
+                ret = ipipe_prep(ipipe, flags);
                if (!ret) {
-                        ret = link_opipe_prep(opipe, flags);
+                        ret = opipe_prep(opipe, flags);
                        if (!ret)
                                ret = link_pipe(ipipe, opipe, len, flags);
                }
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 0adc624c956f..3b52770f46ff 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -338,6 +338,8 @@ static int squashfs_remount(struct super_block *sb, int *flags, char *data)
 static void squashfs_put_super(struct super_block *sb)
 {
+        lock_kernel();
        if (sb->s_fs_info) {
                struct squashfs_sb_info *sbi = sb->s_fs_info;
                squashfs_cache_delete(sbi->block_cache);
@@ -350,6 +352,8 @@ static void squashfs_put_super(struct super_block *sb)
                kfree(sb->s_fs_info);
                sb->s_fs_info = NULL;
        }
+        unlock_kernel();
 }
diff --git a/fs/super.c b/fs/super.c
index 1943fdf655fa..83b47416d006 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -28,7 +28,6 @@
 #include <linux/blkdev.h>
 #include <linux/quotaops.h>
 #include <linux/namei.h>
-#include <linux/buffer_head.h>          /* for fsync_super() */
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
@@ -38,7 +37,6 @@
 #include <linux/kobject.h>
 #include <linux/mutex.h>
 #include <linux/file.h>
-#include <linux/async.h>
 #include <asm/uaccess.h>
 #include "internal.h"
@@ -72,7 +70,6 @@ static struct super_block *alloc_super(struct file_system_type *type)
                INIT_HLIST_HEAD(&s->s_anon);
                INIT_LIST_HEAD(&s->s_inodes);
                INIT_LIST_HEAD(&s->s_dentry_lru);
-                INIT_LIST_HEAD(&s->s_async_list);
                init_rwsem(&s->s_umount);
                mutex_init(&s->s_lock);
                lockdep_set_class(&s->s_umount, &type->s_umount_key);
@@ -285,38 +282,6 @@ void unlock_super(struct super_block * sb)
 EXPORT_SYMBOL(lock_super);
 EXPORT_SYMBOL(unlock_super);
-/*
- * Write out and wait upon all dirty data associated with this
- * superblock.  Filesystem data as well as the underlying block
- * device.  Takes the superblock lock.  Requires a second blkdev
- * flush by the caller to complete the operation.
- */
-void __fsync_super(struct super_block *sb)
-{
-        sync_inodes_sb(sb, 0);
-        vfs_dq_sync(sb);
-        lock_super(sb);
-        if (sb->s_dirt && sb->s_op->write_super)
-                sb->s_op->write_super(sb);
-        unlock_super(sb);
-        if (sb->s_op->sync_fs)
-                sb->s_op->sync_fs(sb, 1);
-        sync_blockdev(sb->s_bdev);
-        sync_inodes_sb(sb, 1);
-}
-/*
- * Write out and wait upon all dirty data associated with this
- * superblock.  Filesystem data as well as the underlying block
- * device.  Takes the superblock lock.
- */
-int fsync_super(struct super_block *sb)
-{
-        __fsync_super(sb);
-        return sync_blockdev(sb->s_bdev);
-}
-EXPORT_SYMBOL_GPL(fsync_super);
 /**
 *      generic_shutdown_super  -       common helper for ->kill_sb()
 *      @sb: superblock to kill
@@ -338,21 +303,13 @@ void generic_shutdown_super(struct super_block *sb)
        if (sb->s_root) {
                shrink_dcache_for_umount(sb);
-                fsync_super(sb);
+                sync_filesystem(sb);
-                lock_super(sb);
+                get_fs_excl();
                sb->s_flags &= ~MS_ACTIVE;
-                /*
-                 * wait for asynchronous fs operations to finish before going further
-                 */
-                async_synchronize_full_domain(&sb->s_async_list);
                /* bad name - it should be evict_inodes() */
                invalidate_inodes(sb);
-                lock_kernel();
-                if (sop->write_super && sb->s_dirt)
-                        sop->write_super(sb);
                if (sop->put_super)
                        sop->put_super(sb);
@@ -362,9 +319,7 @@ void generic_shutdown_super(struct super_block *sb)
                           "Self-destruct in 5 seconds.  Have a nice day...\n",
                           sb->s_id);
                }
+                put_fs_excl();
-                unlock_kernel();
-                unlock_super(sb);
        }
        spin_lock(&sb_lock);
        /* should be initialized for __put_super_and_need_restart() */
@@ -441,16 +396,14 @@ void drop_super(struct super_block *sb)
 EXPORT_SYMBOL(drop_super);
-static inline void write_super(struct super_block *sb)
+/**
-{
+ * sync_supers - helper for periodic superblock writeback
-        lock_super(sb);
+ *
-        if (sb->s_root && sb->s_dirt)
+ * Call the write_super method if present on all dirty superblocks in
-                if (sb->s_op->write_super)
+ * the system.  This is for the periodic writeback used by most older
-                        sb->s_op->write_super(sb);
+ * filesystems.  For data integrity superblock writeback use
-        unlock_super(sb);
+ * sync_filesystems() instead.
-}
+ *
-/*
 * Note: check the dirty flag before waiting, so we don't
 * hold up the sync while mounting a device. (The newly
 * mounted device won't need syncing.)
@@ -462,12 +415,15 @@ void sync_supers(void)
        spin_lock(&sb_lock);
 restart:
        list_for_each_entry(sb, &super_blocks, s_list) {
-                if (sb->s_dirt) {
+                if (sb->s_op->write_super && sb->s_dirt) {
                        sb->s_count++;
                        spin_unlock(&sb_lock);
                        down_read(&sb->s_umount);
-                        write_super(sb);
+                        if (sb->s_root && sb->s_dirt)
+                                sb->s_op->write_super(sb);
                        up_read(&sb->s_umount);
                        spin_lock(&sb_lock);
                        if (__put_super_and_need_restart(sb))
                                goto restart;
@@ -476,60 +432,6 @@ restart:
        spin_unlock(&sb_lock);
 }
-/*
- * Call the ->sync_fs super_op against all filesystems which are r/w and
- * which implement it.
- *
- * This operation is careful to avoid the livelock which could easily happen
- * if two or more filesystems are being continuously dirtied.  s_need_sync_fs
- * is used only here.  We set it against all filesystems and then clear it as
- * we sync them.  So redirtied filesystems are skipped.
- *
- * But if process A is currently running sync_filesystems and then process B
- * calls sync_filesystems as well, process B will set all the s_need_sync_fs
- * flags again, which will cause process A to resync everything.  Fix that with
- * a local mutex.
- *
- * (Fabian) Avoid sync_fs with clean fs & wait mode 0
- */
-void sync_filesystems(int wait)
-{
-        struct super_block *sb;
-        static DEFINE_MUTEX(mutex);
-        mutex_lock(&mutex);             /* Could be down_interruptible */
-        spin_lock(&sb_lock);
-        list_for_each_entry(sb, &super_blocks, s_list) {
-                if (!sb->s_op->sync_fs)
-                        continue;
-                if (sb->s_flags & MS_RDONLY)
-                        continue;
-                sb->s_need_sync_fs = 1;
-        }
-restart:
-        list_for_each_entry(sb, &super_blocks, s_list) {
-                if (!sb->s_need_sync_fs)
-                        continue;
-                sb->s_need_sync_fs = 0;
-                if (sb->s_flags & MS_RDONLY)
-                        continue;       /* hm.  Was remounted r/o meanwhile */
-                sb->s_count++;
-                spin_unlock(&sb_lock);
-                down_read(&sb->s_umount);
-                async_synchronize_full_domain(&sb->s_async_list);
-                if (sb->s_root && (wait || sb->s_dirt))
-                        sb->s_op->sync_fs(sb, wait);
-                up_read(&sb->s_umount);
-                /* restart only when sb is no longer on the list */
-                spin_lock(&sb_lock);
-                if (__put_super_and_need_restart(sb))
-                        goto restart;
-        }
-        spin_unlock(&sb_lock);
-        mutex_unlock(&mutex);
-}
 /**
 *      get_super - get the superblock of a device
 *      @bdev: device to get the superblock for
@@ -616,45 +518,6 @@ out:
 }
 /**
- *      mark_files_ro - mark all files read-only
- *      @sb: superblock in question
- *
- *      All files are marked read-only.  We don't care about pending
- *      delete files so this should be used in 'force' mode only.
- */
-static void mark_files_ro(struct super_block *sb)
-{
-        struct file *f;
-retry:
-        file_list_lock();
-        list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
-                struct vfsmount *mnt;
-                if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
-                       continue;
-                if (!file_count(f))
-                        continue;
-                if (!(f->f_mode & FMODE_WRITE))
-                        continue;
-                f->f_mode &= ~FMODE_WRITE;
-                if (file_check_writeable(f) != 0)
-                        continue;
-                file_release_write(f);
-                mnt = mntget(f->f_path.mnt);
-                file_list_unlock();
-                /*
-                 * This can sleep, so we can't hold
-                 * the file_list_lock() spinlock.
-                 */
-                mnt_drop_write(mnt);
-                mntput(mnt);
-                goto retry;
-        }
-        file_list_unlock();
-}
-/**
 *      do_remount_sb - asks filesystem to change mount options.
 *      @sb:    superblock in question
 *      @flags: numeric part of options
@@ -675,27 +538,31 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
        if (flags & MS_RDONLY)
                acct_auto_close(sb);
        shrink_dcache_sb(sb);
-        fsync_super(sb);
+        sync_filesystem(sb);
        /* If we are remounting RDONLY and current sb is read/write,
           make sure there are no rw files opened */
        if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) {
                if (force)
                        mark_files_ro(sb);
-                else if (!fs_may_remount_ro(sb))
+                else if (!fs_may_remount_ro(sb)) {
+                        unlock_kernel();
                        return -EBUSY;
+                }
                retval = vfs_dq_off(sb, 1);
-                if (retval < 0 && retval != -ENOSYS)
+                if (retval < 0 && retval != -ENOSYS) {
+                        unlock_kernel();
                        return -EBUSY;
+                }
        }
        remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
        if (sb->s_op->remount_fs) {
-                lock_super(sb);
                retval = sb->s_op->remount_fs(sb, &flags, data);
-                unlock_super(sb);
+                if (retval) {
-                if (retval)
+                        unlock_kernel();
                        return retval;
+                }
        }
        sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
        if (remount_rw)
@@ -711,18 +578,17 @@ static void do_emergency_remount(struct work_struct *work)
        list_for_each_entry(sb, &super_blocks, s_list) {
                sb->s_count++;
                spin_unlock(&sb_lock);
-                down_read(&sb->s_umount);
+                down_write(&sb->s_umount);
                if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) {
                        /*
                         * ->remount_fs needs lock_kernel().
                         *
                         * What lock protects sb->s_flags??
                         */
-                        lock_kernel();
                        do_remount_sb(sb, MS_RDONLY, NULL, 1);
-                        unlock_kernel();
                }
-                drop_super(sb);
+                up_write(&sb->s_umount);
+                put_super(sb);
                spin_lock(&sb_lock);
        }
        spin_unlock(&sb_lock);
diff --git a/fs/sync.c b/fs/sync.c
index 7abc65fbf21d..dd200025af85 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -13,38 +13,123 @@
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
 #include <linux/buffer_head.h>
+#include "internal.h"
 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
                        SYNC_FILE_RANGE_WAIT_AFTER)
 /*
- * sync everything.  Start out by waking pdflush, because that writes back
+ * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0)
- * all queues in parallel.
+ * just dirties buffers with inodes so we have to submit IO for these buffers
+ * via __sync_blockdev(). This also speeds up the wait == 1 case since in that
+ * case write_inode() functions do sync_dirty_buffer() and thus effectively
+ * write one block at a time.
 */
-static void do_sync(unsigned long wait)
+static int __sync_filesystem(struct super_block *sb, int wait)
 {
-        wakeup_pdflush(0);
+        /* Avoid doing syncing and cache pruning twice for quota sync */
-        sync_inodes(0);         /* All mappings, inodes and their blockdevs */
-        vfs_dq_sync(NULL);
-        sync_supers();          /* Write the superblocks */
-        sync_filesystems(0);    /* Start syncing the filesystems */
-        sync_filesystems(wait); /* Waitingly sync the filesystems */
-        sync_inodes(wait);      /* Mappings, inodes and blockdevs, again. */
        if (!wait)
-                printk("Emergency Sync complete\n");
+                writeout_quota_sb(sb, -1);
-        if (unlikely(laptop_mode))
+        else
-                laptop_sync_completion();
+                sync_quota_sb(sb, -1);
+        sync_inodes_sb(sb, wait);
+        if (sb->s_op->sync_fs)
+                sb->s_op->sync_fs(sb, wait);
+        return __sync_blockdev(sb->s_bdev, wait);
+}
+/*
+ * Write out and wait upon all dirty data associated with this
+ * superblock.  Filesystem data as well as the underlying block
+ * device.  The caller must hold sb->s_umount.
+ */
+int sync_filesystem(struct super_block *sb)
+{
+        int ret;
+        /*
+         * We need to be protected against the filesystem going from
+         * r/o to r/w or vice versa.
+         */
+        WARN_ON(!rwsem_is_locked(&sb->s_umount));
+        /*
+         * No point in syncing out anything if the filesystem is read-only.
+         */
+        if (sb->s_flags & MS_RDONLY)
+                return 0;
+        ret = __sync_filesystem(sb, 0);
+        if (ret < 0)
+                return ret;
+        return __sync_filesystem(sb, 1);
+}
+EXPORT_SYMBOL_GPL(sync_filesystem);
+/*
+ * Sync all the data for all the filesystems (called by sys_sync() and
+ * emergency sync)
+ *
+ * This operation is careful to avoid the livelock which could easily happen
+ * if two or more filesystems are being continuously dirtied.  s_need_sync
+ * is used only here.  We set it against all filesystems and then clear it as
+ * we sync them.  So redirtied filesystems are skipped.
+ *
+ * But if process A is currently running sync_filesystems and then process B
+ * calls sync_filesystems as well, process B will set all the s_need_sync
+ * flags again, which will cause process A to resync everything.  Fix that with
+ * a local mutex.
+ */
+static void sync_filesystems(int wait)
+{
+        struct super_block *sb;
+        static DEFINE_MUTEX(mutex);
+        mutex_lock(&mutex);             /* Could be down_interruptible */
+        spin_lock(&sb_lock);
+        list_for_each_entry(sb, &super_blocks, s_list)
+                sb->s_need_sync = 1;
+restart:
+        list_for_each_entry(sb, &super_blocks, s_list) {
+                if (!sb->s_need_sync)
+                        continue;
+                sb->s_need_sync = 0;
+                sb->s_count++;
+                spin_unlock(&sb_lock);
+                down_read(&sb->s_umount);
+                if (!(sb->s_flags & MS_RDONLY) && sb->s_root)
+                        __sync_filesystem(sb, wait);
+                up_read(&sb->s_umount);
+                /* restart only when sb is no longer on the list */
+                spin_lock(&sb_lock);
+                if (__put_super_and_need_restart(sb))
+                        goto restart;
+        }
+        spin_unlock(&sb_lock);
+        mutex_unlock(&mutex);
 }
 SYSCALL_DEFINE0(sync)
 {
-        do_sync(1);
+        sync_filesystems(0);
+        sync_filesystems(1);
+        if (unlikely(laptop_mode))
+                laptop_sync_completion();
        return 0;
 }
 static void do_sync_work(struct work_struct *work)
 {
-        do_sync(0);
+        /*
+         * Sync twice to reduce the possibility we skipped some inodes / pages
+         * because they were temporarily locked
+         */
+        sync_filesystems(0);
+        sync_filesystems(0);
+        printk("Emergency Sync complete\n");
        kfree(work);
 }
@@ -75,10 +160,8 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
        /* sync the superblock to buffers */
        sb = inode->i_sb;
-        lock_super(sb);
        if (sb->s_dirt && sb->s_op->write_super)
                sb->s_op->write_super(sb);
-        unlock_super(sb);
        /* .. finally sync the buffers to disk */
        err = sync_blockdev(sb->s_bdev);
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 56f655254bfe..c7798079e644 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -24,7 +24,7 @@ static int sysv_readdir(struct file *, void *, filldir_t);
 const struct file_operations sysv_dir_operations = {
        .read           = generic_read_dir,
        .readdir        = sysv_readdir,
-        .fsync          = sysv_sync_file,
+        .fsync          = simple_fsync,
 };
 static inline void dir_put_page(struct page *page)
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 589be21d884e..96340c01f4a7 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -26,7 +26,7 @@ const struct file_operations sysv_file_operations = {
        .write          = do_sync_write,
        .aio_write      = generic_file_aio_write,
        .mmap           = generic_file_mmap,
-        .fsync          = sysv_sync_file,
+        .fsync          = simple_fsync,
        .splice_read    = generic_file_splice_read,
 };
@@ -34,18 +34,3 @@ const struct inode_operations sysv_file_inode_operations = {
        .truncate       = sysv_truncate,
        .getattr        = sysv_getattr,
 };
-int sysv_sync_file(struct file * file, struct dentry *dentry, int datasync)
-{
-        struct inode *inode = dentry->d_inode;
-        int err;
-        err = sync_mapping_buffers(inode->i_mapping);
-        if (!(inode->i_state & I_DIRTY))
-                return err;
-        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-                return err;
-        
-        err |= sysv_sync_inode(inode);
-        return err ? -EIO : 0;
-}
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index da20b48d350f..479923456a54 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -31,15 +31,13 @@
 #include <asm/byteorder.h>
 #include "sysv.h"
-/* This is only called on sync() and umount(), when s_dirt=1. */
+static int sysv_sync_fs(struct super_block *sb, int wait)
-static void sysv_write_super(struct super_block *sb)
 {
        struct sysv_sb_info *sbi = SYSV_SB(sb);
        unsigned long time = get_seconds(), old_time;
+        lock_super(sb);
        lock_kernel();
-        if (sb->s_flags & MS_RDONLY)
-                goto clean;
        /*
         * If we are going to write out the super block,
@@ -53,18 +51,30 @@ static void sysv_write_super(struct super_block *sb)
                *sbi->s_sb_time = cpu_to_fs32(sbi, time);
                mark_buffer_dirty(sbi->s_bh2);
        }
-clean:
-        sb->s_dirt = 0;
        unlock_kernel();
+        unlock_super(sb);
+        return 0;
+}
+static void sysv_write_super(struct super_block *sb)
+{
+        if (!(sb->s_flags & MS_RDONLY))
+                sysv_sync_fs(sb, 1);
+        else
+                sb->s_dirt = 0;
 }
 static int sysv_remount(struct super_block *sb, int *flags, char *data)
 {
        struct sysv_sb_info *sbi = SYSV_SB(sb);
+        lock_super(sb);
        if (sbi->s_forced_ro)
                *flags |= MS_RDONLY;
        if (!(*flags & MS_RDONLY))
                sb->s_dirt = 1;
+        unlock_super(sb);
        return 0;
 }
@@ -72,6 +82,11 @@ static void sysv_put_super(struct super_block *sb)
 {
        struct sysv_sb_info *sbi = SYSV_SB(sb);
+        lock_kernel();
+        if (sb->s_dirt)
+                sysv_write_super(sb);
        if (!(sb->s_flags & MS_RDONLY)) {
                /* XXX ext2 also updates the state here */
                mark_buffer_dirty(sbi->s_bh1);
@@ -84,6 +99,8 @@ static void sysv_put_super(struct super_block *sb)
                brelse(sbi->s_bh2);
        kfree(sbi);
+        unlock_kernel();
 }
 static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -236,7 +253,7 @@ bad_inode:
        return ERR_PTR(-EIO);
 }
-static struct buffer_head * sysv_update_inode(struct inode * inode)
+int sysv_write_inode(struct inode *inode, int wait)
 {
        struct super_block * sb = inode->i_sb;
        struct sysv_sb_info * sbi = SYSV_SB(sb);
@@ -244,19 +261,21 @@ static struct buffer_head * sysv_update_inode(struct inode * inode)
        struct sysv_inode * raw_inode;
        struct sysv_inode_info * si;
        unsigned int ino, block;
+        int err = 0;
        ino = inode->i_ino;
        if (!ino || ino > sbi->s_ninodes) {
                printk("Bad inode number on dev %s: %d is out of range\n",
                       inode->i_sb->s_id, ino);
-                return NULL;
+                return -EIO;
        }
        raw_inode = sysv_raw_inode(sb, ino, &bh);
        if (!raw_inode) {
                printk("unable to read i-node block\n");
-                return NULL;
+                return -EIO;
        }
+        lock_kernel();
        raw_inode->i_mode = cpu_to_fs16(sbi, inode->i_mode);
        raw_inode->i_uid = cpu_to_fs16(sbi, fs_high2lowuid(inode->i_uid));
        raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(inode->i_gid));
@@ -272,38 +291,23 @@ static struct buffer_head * sysv_update_inode(struct inode * inode)
        for (block = 0; block < 10+1+1+1; block++)
                write3byte(sbi, (u8 *)&si->i_data[block],
                        &raw_inode->i_data[3*block]);
+        unlock_kernel();
        mark_buffer_dirty(bh);
-        return bh;
+        if (wait) {
-}
+                sync_dirty_buffer(bh);
+                if (buffer_req(bh) && !buffer_uptodate(bh)) {
-int sysv_write_inode(struct inode * inode, int wait)
+                        printk ("IO error syncing sysv inode [%s:%08x]\n",
-{
+                                sb->s_id, ino);
-        struct buffer_head *bh;
+                        err = -EIO;
-        lock_kernel();
+                }
-        bh = sysv_update_inode(inode);
+        }
        brelse(bh);
-        unlock_kernel();
        return 0;
 }
-int sysv_sync_inode(struct inode * inode)
+int sysv_sync_inode(struct inode *inode)
 {
-        int err = 0;
+        return sysv_write_inode(inode, 1);
-        struct buffer_head *bh;
-        bh = sysv_update_inode(inode);
-        if (bh && buffer_dirty(bh)) {
-                sync_dirty_buffer(bh);
-                if (buffer_req(bh) && !buffer_uptodate(bh)) {
-                        printk ("IO error syncing sysv inode [%s:%08lx]\n",
-                                inode->i_sb->s_id, inode->i_ino);
-                        err = -1;
-                }
-        }
-        else if (!bh)
-                err = -1;
-        brelse (bh);
-        return err;
 }
 static void sysv_delete_inode(struct inode *inode)
@@ -347,6 +351,7 @@ const struct super_operations sysv_sops = {
        .delete_inode   = sysv_delete_inode,
        .put_super      = sysv_put_super,
        .write_super    = sysv_write_super,
+        .sync_fs        = sysv_sync_fs,
        .remount_fs     = sysv_remount,
        .statfs         = sysv_statfs,
 };
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 5784a318c883..53786eb5cf60 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -144,7 +144,6 @@ extern int __sysv_write_begin(struct file *file, struct address_space *mapping,
 extern struct inode *sysv_iget(struct super_block *, unsigned int);
 extern int sysv_write_inode(struct inode *, int);
 extern int sysv_sync_inode(struct inode *);
-extern int sysv_sync_file(struct file *, struct dentry *, int);
 extern void sysv_set_inode(struct inode *, dev_t);
 extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 extern int sysv_init_icache(void);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index e9f7a754c4f7..3589eab02a2f 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -36,6 +36,7 @@
 #include <linux/mount.h>
 #include <linux/math64.h>
 #include <linux/writeback.h>
+#include <linux/smp_lock.h>
 #include "ubifs.h"
 /*
@@ -447,9 +448,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
        if (!wait)
                return 0;
-        if (sb->s_flags & MS_RDONLY)
-                return 0;
        /*
         * VFS calls '->sync_fs()' before synchronizing all dirty inodes and
         * pages, so synchronize them first, then commit the journal. Strictly
@@ -1687,6 +1685,9 @@ static void ubifs_put_super(struct super_block *sb)
        ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num,
                  c->vi.vol_id);
+        lock_kernel();
        /*
         * The following asserts are only valid if there has not been a failure
         * of the media. For example, there will be dirty inodes if we failed
@@ -1753,6 +1754,8 @@ static void ubifs_put_super(struct super_block *sb)
        ubi_close_volume(c->ubi);
        mutex_unlock(&c->umount_mutex);
        kfree(c);
+        unlock_kernel();
 }
 static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
@@ -1768,17 +1771,22 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
                return err;
        }
+        lock_kernel();
        if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
                if (c->ro_media) {
                        ubifs_msg("cannot re-mount due to prior errors");
+                        unlock_kernel();
                        return -EROFS;
                }
                err = ubifs_remount_rw(c);
-                if (err)
+                if (err) {
+                        unlock_kernel();
                        return err;
+                }
        } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
                if (c->ro_media) {
                        ubifs_msg("cannot re-mount due to prior errors");
+                        unlock_kernel();
                        return -EROFS;
                }
                ubifs_remount_ro(c);
@@ -1793,6 +1801,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
        }
        ubifs_assert(c->lst.taken_empty_lebs > 0);
+        unlock_kernel();
        return 0;
 }
diff --git a/fs/udf/Makefile b/fs/udf/Makefile
index 0d4503f7446d..eb880f66c23a 100644
--- a/fs/udf/Makefile
+++ b/fs/udf/Makefile
@@ -5,5 +5,5 @@
 obj-$(CONFIG_UDF_FS) += udf.o
 udf-objs     := balloc.o dir.o file.o ialloc.o inode.o lowlevel.o namei.o \
-                partition.o super.o truncate.o symlink.o fsync.o \
+                partition.o super.o truncate.o symlink.o \
                directory.o misc.o udftime.o unicode.o
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 2efd4d5291b6..61d9a76a3a69 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -210,5 +210,5 @@ const struct file_operations udf_dir_operations = {
        .read                   = generic_read_dir,
        .readdir                = udf_readdir,
        .ioctl                  = udf_ioctl,
-        .fsync                  = udf_fsync_file,
+        .fsync                  = simple_fsync,
 };
diff --git a/fs/udf/file.c b/fs/udf/file.c
index eb91f3b70320..7464305382b5 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -209,7 +209,7 @@ const struct file_operations udf_file_operations = {
        .write                  = do_sync_write,
        .aio_write              = udf_file_aio_write,
        .release                = udf_release_file,
-        .fsync                  = udf_fsync_file,
+        .fsync                  = simple_fsync,
        .splice_read            = generic_file_splice_read,
        .llseek                 = generic_file_llseek,
 };
diff --git a/fs/udf/fsync.c b/fs/udf/fsync.c
deleted file mode 100644
index b2c472b733b8..000000000000
--- a/fs/udf/fsync.c
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * fsync.c
- *
- * PURPOSE
- *  Fsync handling routines for the OSTA-UDF(tm) filesystem.
- *
- * COPYRIGHT
- *  This file is distributed under the terms of the GNU General Public
- *  License (GPL). Copies of the GPL can be obtained from:
- *      ftp://prep.ai.mit.edu/pub/gnu/GPL
- *  Each contributing author retains all rights to their own work.
- *
- *  (C) 1999-2001 Ben Fennema
- *  (C) 1999-2000 Stelias Computing Inc
- *
- * HISTORY
- *
- *  05/22/99 blf  Created.
- */
-#include "udfdecl.h"
-#include <linux/fs.h>
-static int udf_fsync_inode(struct inode *, int);
-/*
- *      File may be NULL when we are called. Perhaps we shouldn't
- *      even pass file to fsync ?
- */
-int udf_fsync_file(struct file *file, struct dentry *dentry, int datasync)
-{
-        struct inode *inode = dentry->d_inode;
-        return udf_fsync_inode(inode, datasync);
-}
-static int udf_fsync_inode(struct inode *inode, int datasync)
-{
-        int err;
-        err = sync_mapping_buffers(inode->i_mapping);
-        if (!(inode->i_state & I_DIRTY))
-                return err;
-        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-                return err;
-        err |= udf_sync_inode(inode);
-        return err ? -EIO : 0;
-}
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 72348cc855a4..6832135159b6 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -568,6 +568,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
        if (!udf_parse_options(options, &uopt, true))
                return -EINVAL;
+        lock_kernel();
        sbi->s_flags = uopt.flags;
        sbi->s_uid   = uopt.uid;
        sbi->s_gid   = uopt.gid;
@@ -581,13 +582,16 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
                        *flags |= MS_RDONLY;
        }
-        if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+        if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
+                unlock_kernel();
                return 0;
+        }
        if (*flags & MS_RDONLY)
                udf_close_lvid(sb);
        else
                udf_open_lvid(sb);
+        unlock_kernel();
        return 0;
 }
@@ -1915,7 +1919,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
        if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) {
                ret = udf_load_vrs(sb, &uopt, silent, &fileset);
        } else {
-                uopt.blocksize = bdev_hardsect_size(sb->s_bdev);
+                uopt.blocksize = bdev_logical_block_size(sb->s_bdev);
                ret = udf_load_vrs(sb, &uopt, silent, &fileset);
                if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) {
                        if (!silent)
@@ -2062,6 +2066,9 @@ static void udf_put_super(struct super_block *sb)
        struct udf_sb_info *sbi;
        sbi = UDF_SB(sb);
+        lock_kernel();
        if (sbi->s_vat_inode)
                iput(sbi->s_vat_inode);
        if (sbi->s_partitions)
@@ -2077,6 +2084,8 @@ static void udf_put_super(struct super_block *sb)
        kfree(sbi->s_partmaps);
        kfree(sb->s_fs_info);
        sb->s_fs_info = NULL;
+        unlock_kernel();
 }
 static int udf_sync_fs(struct super_block *sb, int wait)
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index cac51b77a5d1..8d46f4294ee7 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -223,9 +223,6 @@ extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t,
 extern int udf_new_block(struct super_block *, struct inode *, uint16_t,
                         uint32_t, int *);
-/* fsync.c */
-extern int udf_fsync_file(struct file *, struct dentry *, int);
 /* directory.c */
 extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *,
                                                struct udf_fileident_bh *,
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 6321b797061b..6f671f1ac271 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -666,6 +666,6 @@ not_empty:
 const struct file_operations ufs_dir_operations = {
        .read           = generic_read_dir,
        .readdir        = ufs_readdir,
-        .fsync          = ufs_sync_file,
+        .fsync          = simple_fsync,
        .llseek         = generic_file_llseek,
 };
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 2bd3a1615714..73655c61240a 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -24,31 +24,10 @@
 */
 #include <linux/fs.h>
-#include <linux/buffer_head.h>  /* for sync_mapping_buffers() */
 #include "ufs_fs.h"
 #include "ufs.h"
-int ufs_sync_file(struct file *file, struct dentry *dentry, int datasync)
-{
-        struct inode *inode = dentry->d_inode;
-        int err;
-        int ret;
-        ret = sync_mapping_buffers(inode->i_mapping);
-        if (!(inode->i_state & I_DIRTY))
-                return ret;
-        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-                return ret;
-        err = ufs_sync_inode(inode);
-        if (ret == 0)
-                ret = err;
-        return ret;
-}
 /*
 * We have mostly NULL's here: the current defaults are ok for
 * the ufs filesystem.
@@ -62,6 +41,6 @@ const struct file_operations ufs_file_operations = {
        .aio_write      = generic_file_aio_write,
        .mmap           = generic_file_mmap,
        .open           = generic_file_open,
-        .fsync          = ufs_sync_file,
+        .fsync          = simple_fsync,
        .splice_read    = generic_file_splice_read,
 };
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 60359291761f..5faed7954d0a 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -263,6 +263,7 @@ void ufs_panic (struct super_block * sb, const char * function,
        struct ufs_super_block_first * usb1;
        va_list args;
        
+        lock_kernel();
        uspi = UFS_SB(sb)->s_uspi;
        usb1 = ubh_get_usb_first(uspi);
        
@@ -594,6 +595,9 @@ static void ufs_put_super_internal(struct super_block *sb)
        
        UFSD("ENTER\n");
+        lock_kernel();
        ufs_put_cstotal(sb);
        size = uspi->s_cssize;
        blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
@@ -621,6 +625,9 @@ static void ufs_put_super_internal(struct super_block *sb)
                brelse (sbi->s_ucg[i]);
        kfree (sbi->s_ucg);
        kfree (base);
+        unlock_kernel();
        UFSD("EXIT\n");
 }
@@ -1118,32 +1125,45 @@ failed_nomem:
        return -ENOMEM;
 }
-static void ufs_write_super(struct super_block *sb)
+static int ufs_sync_fs(struct super_block *sb, int wait)
 {
        struct ufs_sb_private_info * uspi;
        struct ufs_super_block_first * usb1;
        struct ufs_super_block_third * usb3;
        unsigned flags;
+        lock_super(sb);
        lock_kernel();
        UFSD("ENTER\n");
        flags = UFS_SB(sb)->s_flags;
        uspi = UFS_SB(sb)->s_uspi;
        usb1 = ubh_get_usb_first(uspi);
        usb3 = ubh_get_usb_third(uspi);
-        if (!(sb->s_flags & MS_RDONLY)) {
+        usb1->fs_time = cpu_to_fs32(sb, get_seconds());
-                usb1->fs_time = cpu_to_fs32(sb, get_seconds());
+        if ((flags & UFS_ST_MASK) == UFS_ST_SUN  ||
-                if ((flags & UFS_ST_MASK) == UFS_ST_SUN 
+            (flags & UFS_ST_MASK) == UFS_ST_SUNOS ||
-                  || (flags & UFS_ST_MASK) == UFS_ST_SUNOS
+            (flags & UFS_ST_MASK) == UFS_ST_SUNx86)
-                  || (flags & UFS_ST_MASK) == UFS_ST_SUNx86)
+                ufs_set_fs_state(sb, usb1, usb3,
-                        ufs_set_fs_state(sb, usb1, usb3,
+                                UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time));
-                                        UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time));
+        ufs_put_cstotal(sb);
-                ufs_put_cstotal(sb);
-        }
        sb->s_dirt = 0;
        UFSD("EXIT\n");
        unlock_kernel();
+        unlock_super(sb);
+        return 0;
+}
+static void ufs_write_super(struct super_block *sb)
+{
+        if (!(sb->s_flags & MS_RDONLY))
+                ufs_sync_fs(sb, 1);
+        else
+                sb->s_dirt = 0;
 }
 static void ufs_put_super(struct super_block *sb)
@@ -1152,6 +1172,9 @@ static void ufs_put_super(struct super_block *sb)
                
        UFSD("ENTER\n");
+        if (sb->s_dirt)
+                ufs_write_super(sb);
        if (!(sb->s_flags & MS_RDONLY))
                ufs_put_super_internal(sb);
        
@@ -1171,7 +1194,9 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
        struct ufs_super_block_third * usb3;
        unsigned new_mount_opt, ufstype;
        unsigned flags;
-        
+        lock_kernel();
+        lock_super(sb);
        uspi = UFS_SB(sb)->s_uspi;
        flags = UFS_SB(sb)->s_flags;
        usb1 = ubh_get_usb_first(uspi);
@@ -1184,17 +1209,24 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
        ufstype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE;
        new_mount_opt = 0;
        ufs_set_opt (new_mount_opt, ONERROR_LOCK);
-        if (!ufs_parse_options (data, &new_mount_opt))
+        if (!ufs_parse_options (data, &new_mount_opt)) {
+                unlock_super(sb);
+                unlock_kernel();
                return -EINVAL;
+        }
        if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) {
                new_mount_opt |= ufstype;
        } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
                printk("ufstype can't be changed during remount\n");
+                unlock_super(sb);
+                unlock_kernel();
                return -EINVAL;
        }
        if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
                UFS_SB(sb)->s_mount_opt = new_mount_opt;
+                unlock_super(sb);
+                unlock_kernel();
                return 0;
        }
        
@@ -1219,6 +1251,8 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 #ifndef CONFIG_UFS_FS_WRITE
                printk("ufs was compiled with read-only support, "
                "can't be mounted as read-write\n");
+                unlock_super(sb);
+                unlock_kernel();
                return -EINVAL;
 #else
                if (ufstype != UFS_MOUNT_UFSTYPE_SUN && 
@@ -1227,16 +1261,22 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
                    ufstype != UFS_MOUNT_UFSTYPE_SUNx86 &&
                    ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
                        printk("this ufstype is read-only supported\n");
+                        unlock_super(sb);
+                        unlock_kernel();
                        return -EINVAL;
                }
                if (!ufs_read_cylinder_structures(sb)) {
                        printk("failed during remounting\n");
+                        unlock_super(sb);
+                        unlock_kernel();
                        return -EPERM;
                }
                sb->s_flags &= ~MS_RDONLY;
 #endif
        }
        UFS_SB(sb)->s_mount_opt = new_mount_opt;
+        unlock_super(sb);
+        unlock_kernel();
        return 0;
 }
@@ -1352,6 +1392,7 @@ static const struct super_operations ufs_super_ops = {
        .delete_inode   = ufs_delete_inode,
        .put_super      = ufs_put_super,
        .write_super    = ufs_write_super,
+        .sync_fs        = ufs_sync_fs,
        .statfs         = ufs_statfs,
        .remount_fs     = ufs_remount,
        .show_options   = ufs_show_options,
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index d0c4acd4f1f3..644e77e13599 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -99,7 +99,6 @@ extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
 extern const struct inode_operations ufs_file_inode_operations;
 extern const struct file_operations ufs_file_operations;
 extern const struct address_space_operations ufs_aops;
-extern int ufs_sync_file(struct file *, struct dentry *, int);
 /* ialloc.c */
 extern void ufs_free_inode (struct inode *inode);
diff --git a/fs/xattr.c b/fs/xattr.c
index d51b8f9db921..1c3d0af59ddf 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -297,7 +297,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
                return error;
        dentry = f->f_path.dentry;
        audit_inode(NULL, dentry);
-        error = mnt_want_write(f->f_path.mnt);
+        error = mnt_want_write_file(f);
        if (!error) {
                error = setxattr(dentry, name, value, size, flags);
                mnt_drop_write(f->f_path.mnt);
@@ -524,7 +524,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
                return error;
        dentry = f->f_path.dentry;
        audit_inode(NULL, dentry);
-        error = mnt_want_write(f->f_path.mnt);
+        error = mnt_want_write_file(f);
        if (!error) {
                error = removexattr(dentry, name);
                mnt_drop_write(f->f_path.mnt);
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 29228f5899cd..480f28127f09 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -39,6 +39,7 @@ config XFS_QUOTA
 config XFS_POSIX_ACL
        bool "XFS POSIX ACL support"
        depends on XFS_FS
+        select FS_POSIX_ACL
        help
          POSIX Access Control Lists (ACLs) support permissions for users and
          groups beyond the owner/group/world scheme.
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 60f107e47fe9..7a59daed1782 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -40,7 +40,7 @@ xfs-$(CONFIG_PROC_FS)		+= quota/xfs_qm_stats.o
 endif
 xfs-$(CONFIG_XFS_RT)            += xfs_rtalloc.o
-xfs-$(CONFIG_XFS_POSIX_ACL)     += xfs_acl.o
+xfs-$(CONFIG_XFS_POSIX_ACL)     += $(XFS_LINUX)/xfs_acl.o
 xfs-$(CONFIG_PROC_FS)           += $(XFS_LINUX)/xfs_stats.o
 xfs-$(CONFIG_SYSCTL)            += $(XFS_LINUX)/xfs_sysctl.o
 xfs-$(CONFIG_COMPAT)            += $(XFS_LINUX)/xfs_ioctl32.o
@@ -88,8 +88,7 @@ xfs-y				+= xfs_alloc.o \
                                   xfs_utils.o \
                                   xfs_vnodeops.o \
                                   xfs_rw.o \
-                                   xfs_dmops.o \
+                                   xfs_dmops.o
-                                   xfs_qmops.o
 xfs-$(CONFIG_XFS_TRACE)         += xfs_btree_trace.o \
                                   xfs_dir2_trace.o
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
new file mode 100644
index 000000000000..1e9d1246eebc
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -0,0 +1,523 @@
+/*
+ * Copyright (c) 2008, Christoph Hellwig
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_acl.h"
+#include "xfs_attr.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_vnodeops.h"
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#define XFS_ACL_NOT_CACHED      ((void *)-1)
+/*
+ * Locking scheme:
+ *  - all ACL updates are protected by inode->i_mutex, which is taken before
+ *    calling into this file.
+ *  - access and updates to the ip->i_acl and ip->i_default_acl pointers are
+ *    protected by inode->i_lock.
+ */
+/*
+ * Convert an on-disk (big-endian) XFS ACL into a freshly allocated in-core
+ * posix_acl.
+ *
+ * Returns the new ACL, which the caller drops with posix_acl_release(), or
+ * an ERR_PTR(): -ENOMEM if the allocation fails, -EINVAL if the on-disk
+ * entry count exceeds XFS_ACL_MAX_ENTRIES or an entry carries an unknown tag.
+ */
+STATIC struct posix_acl *
+xfs_acl_from_disk(struct xfs_acl *aclp)
+{
+        struct posix_acl_entry *acl_e;
+        struct posix_acl *acl;
+        struct xfs_acl_entry *ace;
+        unsigned int count, i;
+        /*
+         * acl_cnt comes from the on-disk attribute and cannot be trusted.
+         * Reject anything larger than XFS_ACL_MAX_ENTRIES before it is used
+         * to size the allocation and to index aclp->acl_entry[], otherwise
+         * a corrupt count makes the loop below read past the end of the
+         * buffer.  The count is kept unsigned so that a value with the top
+         * bit set cannot slip past the check as a negative number.
+         */
+        count = be32_to_cpu(aclp->acl_cnt);
+        if (count > XFS_ACL_MAX_ENTRIES)
+                return ERR_PTR(-EINVAL);
+        acl = posix_acl_alloc(count, GFP_KERNEL);
+        if (!acl)
+                return ERR_PTR(-ENOMEM);
+        for (i = 0; i < count; i++) {
+                acl_e = &acl->a_entries[i];
+                ace = &aclp->acl_entry[i];
+                /*
+                 * The tag is 32 bits on disk and 16 bits in core.
+                 *
+                 * Because every access to it goes through the core
+                 * format first this is not a problem.
+                 */
+                acl_e->e_tag = be32_to_cpu(ace->ae_tag);
+                acl_e->e_perm = be16_to_cpu(ace->ae_perm);
+                switch (acl_e->e_tag) {
+                case ACL_USER:
+                case ACL_GROUP:
+                        /* Only named user/group entries carry an id. */
+                        acl_e->e_id = be32_to_cpu(ace->ae_id);
+                        break;
+                case ACL_USER_OBJ:
+                case ACL_GROUP_OBJ:
+                case ACL_MASK:
+                case ACL_OTHER:
+                        acl_e->e_id = ACL_UNDEFINED_ID;
+                        break;
+                default:
+                        /* Unknown tag: treat the whole ACL as invalid. */
+                        goto fail;
+                }
+        }
+        return acl;
+fail:
+        posix_acl_release(acl);
+        return ERR_PTR(-EINVAL);
+}
+/*
+ * Serialise an in-core posix_acl into the on-disk XFS ACL layout.  The
+ * entry count, tags and ids are stored as 32-bit big-endian values, the
+ * permissions as 16-bit big-endian values.
+ */
+STATIC void
+xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl)
+{
+        const struct posix_acl_entry *src = acl->a_entries;
+        struct xfs_acl_entry *dst = aclp->acl_entry;
+        int n;
+        aclp->acl_cnt = cpu_to_be32(acl->a_count);
+        for (n = 0; n < acl->a_count; n++, src++, dst++) {
+                dst->ae_tag = cpu_to_be32(src->e_tag);
+                dst->ae_id = cpu_to_be32(src->e_id);
+                dst->ae_perm = cpu_to_be16(src->e_perm);
+        }
+}
+/*
+ * Install a new reference to @acl as the cached ACL pointer of an inode.
+ *
+ * The attribute itself is read from and written to disk without any lock
+ * held, so another thread may have raced with us and cached a value of its
+ * own in the meantime.  Whatever real ACL is cached at this point is
+ * released and superseded by ours; all of this happens under i_lock.
+ */
+STATIC void
+xfs_update_cached_acl(struct inode *inode, struct posix_acl **p_acl,
+                struct posix_acl *acl)
+{
+        struct posix_acl *old;
+        spin_lock(&inode->i_lock);
+        old = *p_acl;
+        if (old != NULL && old != XFS_ACL_NOT_CACHED)
+                posix_acl_release(old);
+        *p_acl = posix_acl_dup(acl);
+        spin_unlock(&inode->i_lock);
+}
+/*
+ * Look up the ACL of the given type (ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT)
+ * for an inode, consulting the per-inode cache before going to disk.
+ *
+ * Returns a referenced posix_acl that the caller must release, NULL if the
+ * inode has no such ACL (or the attribute could not be read), or an
+ * ERR_PTR() for an unknown type, an allocation failure or an on-disk ACL
+ * that xfs_acl_from_disk() rejects.
+ */
+struct posix_acl *
+xfs_get_acl(struct inode *inode, int type)
+{
+        struct xfs_inode *ip = XFS_I(inode);
+        struct posix_acl *acl = NULL, **p_acl;
+        struct xfs_acl *xfs_acl;
+        int len = sizeof(struct xfs_acl);
+        char *ea_name;
+        int error;
+        switch (type) {
+        case ACL_TYPE_ACCESS:
+                ea_name = SGI_ACL_FILE;
+                p_acl = &ip->i_acl;
+                break;
+        case ACL_TYPE_DEFAULT:
+                ea_name = SGI_ACL_DEFAULT;
+                p_acl = &ip->i_default_acl;
+                break;
+        default:
+                return ERR_PTR(-EINVAL);
+        }
+        /*
+         * Anything other than XFS_ACL_NOT_CACHED is a valid cache entry.
+         * That includes NULL, the negative entry recorded below when the
+         * attribute does not exist.  Return it right away, no need to go
+         * out to the disk.
+         */
+        spin_lock(&inode->i_lock);
+        if (*p_acl != XFS_ACL_NOT_CACHED) {
+                acl = posix_acl_dup(*p_acl);
+                spin_unlock(&inode->i_lock);
+                return acl;
+        }
+        spin_unlock(&inode->i_lock);
+        xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL);
+        if (!xfs_acl)
+                return ERR_PTR(-ENOMEM);
+        error = -xfs_attr_get(ip, ea_name, (char *)xfs_acl, &len, ATTR_ROOT);
+        if (error) {
+                /*
+                 * If the attribute doesn't exist make sure we have a negative
+                 * cache entry, for any other error assume it is transient and
+                 * leave the cache entry as XFS_ACL_NOT_CACHED.
+                 */
+                if (error == -ENOATTR) {
+                        acl = NULL;
+                        goto out_update_cache;
+                }
+                goto out;
+        }
+        acl = xfs_acl_from_disk(xfs_acl);
+        if (IS_ERR(acl))
+                goto out;
+ out_update_cache:
+        xfs_update_cached_acl(inode, p_acl, acl);
+ out:
+        kfree(xfs_acl);
+        return acl;
+}
+/*
+ * Store @acl as the ACL of the given type on @inode, or remove that ACL
+ * when @acl is NULL, and refresh the cached pointer on success.
+ *
+ * Returns 0 or a negative errno: -EOPNOTSUPP for symlinks, -EACCES when a
+ * default ACL is set on a non-directory (removing one there is a no-op),
+ * -EINVAL for an unknown type, -ENOMEM, or the attribute code's error.
+ */
+STATIC int
+xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
+{
+        struct xfs_inode *ip = XFS_I(inode);
+        struct posix_acl **p_acl;
+        struct xfs_acl *disk_acl;
+        char *ea_name;
+        int error, len;
+        if (S_ISLNK(inode->i_mode))
+                return -EOPNOTSUPP;
+        if (type == ACL_TYPE_ACCESS) {
+                ea_name = SGI_ACL_FILE;
+                p_acl = &ip->i_acl;
+        } else if (type == ACL_TYPE_DEFAULT) {
+                if (!S_ISDIR(inode->i_mode))
+                        return acl ? -EACCES : 0;
+                ea_name = SGI_ACL_DEFAULT;
+                p_acl = &ip->i_default_acl;
+        } else {
+                return -EINVAL;
+        }
+        if (!acl) {
+                /*
+                 * No ACL given: remove the attribute.  It is fine if it
+                 * was not there to begin with.
+                 */
+                error = -xfs_attr_remove(ip, ea_name, ATTR_ROOT);
+                if (error == -ENOATTR)
+                        error = 0;
+        } else {
+                disk_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL);
+                if (!disk_acl)
+                        return -ENOMEM;
+                xfs_acl_to_disk(disk_acl, acl);
+                /* Write only the entries in use, not the unused tail. */
+                len = sizeof(struct xfs_acl) -
+                        (sizeof(struct xfs_acl_entry) *
+                         (XFS_ACL_MAX_ENTRIES - acl->a_count));
+                error = -xfs_attr_set(ip, ea_name, (char *)disk_acl,
+                                len, ATTR_ROOT);
+                kfree(disk_acl);
+        }
+        if (error)
+                return error;
+        xfs_update_cached_acl(inode, p_acl, acl);
+        return 0;
+}
+/*
+ * ACL permission check callback handed to generic_permission().
+ *
+ * Returns -EAGAIN when the inode has no attribute fork or no access ACL,
+ * the error from the ACL lookup if that fails, and otherwise the result
+ * of posix_acl_permission().
+ */
+int
+xfs_check_acl(struct inode *inode, int mask)
+{
+        struct xfs_inode *ip = XFS_I(inode);
+        struct posix_acl *acl;
+        int ret;
+        xfs_itrace_entry(ip);
+        /*
+         * Without an attribute fork there cannot be an ACL on this inode,
+         * so the lookup can be skipped altogether.
+         */
+        if (!XFS_IFORK_Q(ip))
+                return -EAGAIN;
+        acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
+        if (IS_ERR(acl))
+                return PTR_ERR(acl);
+        if (!acl)
+                return -EAGAIN;
+        ret = posix_acl_permission(inode, acl, mask);
+        posix_acl_release(acl);
+        return ret;
+}
+/*
+ * Change the inode mode through xfs_setattr() with XFS_ATTR_NOACL set.
+ * Nothing is done, and 0 is returned, when the mode is already @mode.
+ */
+static int
+xfs_set_mode(struct inode *inode, mode_t mode)
+{
+        struct iattr iattr;
+        if (mode == inode->i_mode)
+                return 0;
+        iattr.ia_valid = ATTR_MODE;
+        iattr.ia_mode = mode;
+        return -xfs_setattr(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
+}
+/*
+ * Return 1 if a root-namespace attribute called @name can be looked up on
+ * the inode, 0 otherwise.  ATTR_KERNOVAL is passed with a NULL buffer, so
+ * no value is copied out.
+ */
+static int
+xfs_acl_exists(struct inode *inode, char *name)
+{
+        int len = sizeof(struct xfs_acl);
+        int error;
+        error = xfs_attr_get(XFS_I(inode), name, NULL, &len,
+                             ATTR_ROOT|ATTR_KERNOVAL);
+        return !error;
+}
+/*
+ * Return non-zero if the inode carries an access ACL attribute
+ * (SGI_ACL_FILE), 0 otherwise.
+ */
+int
+posix_acl_access_exists(struct inode *inode)
+{
+        return xfs_acl_exists(inode, SGI_ACL_FILE);
+}
+/*
+ * Return non-zero if the inode is a directory carrying a default ACL
+ * attribute (SGI_ACL_DEFAULT).  Non-directories always yield 0.
+ */
+int
+posix_acl_default_exists(struct inode *inode)
+{
+        return S_ISDIR(inode->i_mode) ?
+                xfs_acl_exists(inode, SGI_ACL_DEFAULT) : 0;
+}
+/*
+ * Apply the parent's default ACL to a newly created inode.
+ *
+ * A directory gets @default_acl as its own default ACL.  A clone of it is
+ * then run through posix_acl_create_masq() to derive the new mode, and is
+ * stored as the access ACL only if the mode bits cannot express it.
+ *
+ * No need for i_mutex because the inode is not yet exposed to the VFS.
+ */
+int
+xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl)
+{
+        struct posix_acl *clone;
+        mode_t mode;
+        int error, need_acl;
+        if (S_ISDIR(inode->i_mode)) {
+                error = xfs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl);
+                if (error)
+                        return error;
+        }
+        clone = posix_acl_clone(default_acl, GFP_KERNEL);
+        if (!clone)
+                return -ENOMEM;
+        mode = inode->i_mode;
+        error = posix_acl_create_masq(clone, &mode);
+        if (error < 0)
+                goto out_release_clone;
+        /*
+         * A positive return from posix_acl_create_masq means there is a
+         * permission to inherit that the Unix mode bits cannot represent,
+         * so an ACL really has to be set.
+         */
+        need_acl = (error > 0);
+        error = xfs_set_mode(inode, mode);
+        if (!error && need_acl)
+                error = xfs_set_acl(inode, ACL_TYPE_ACCESS, clone);
+ out_release_clone:
+        posix_acl_release(clone);
+        return error;
+}
+/*
+ * Bring the access ACL of an inode in line with its (already updated)
+ * mode.  Returns 0 if the inode has no access ACL, -EOPNOTSUPP for
+ * symlinks, or a negative errno from the lookup, clone, masquerade or
+ * store step.
+ */
+int
+xfs_acl_chmod(struct inode *inode)
+{
+        struct posix_acl *acl, *clone;
+        int error;
+        if (S_ISLNK(inode->i_mode))
+                return -EOPNOTSUPP;
+        acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
+        if (!acl)
+                return 0;
+        if (IS_ERR(acl))
+                return PTR_ERR(acl);
+        /* Work on a private copy; our reference to the original is dropped. */
+        clone = posix_acl_clone(acl, GFP_KERNEL);
+        posix_acl_release(acl);
+        if (!clone)
+                return -ENOMEM;
+        error = posix_acl_chmod_masq(clone, inode->i_mode);
+        if (error == 0)
+                error = xfs_set_acl(inode, ACL_TYPE_ACCESS, clone);
+        posix_acl_release(clone);
+        return error;
+}
+/*
+ * Mark both cached ACL pointers of a new inode as "not cached yet".
+ * The inode is not live at this point, so no locking is needed.
+ */
+void
+xfs_inode_init_acls(struct xfs_inode *ip)
+{
+        ip->i_default_acl = XFS_ACL_NOT_CACHED;
+        ip->i_acl = XFS_ACL_NOT_CACHED;
+}
+/*
+ * Drop the cached ACL references of an inode.  The inode is no longer
+ * live and is about to be freed, so no locking is needed.  Pointers still
+ * set to XFS_ACL_NOT_CACHED are not real ACLs and are left alone.
+ */
+void
+xfs_inode_clear_acls(struct xfs_inode *ip)
+{
+        struct posix_acl *access = ip->i_acl;
+        struct posix_acl *dflt = ip->i_default_acl;
+        if (access != XFS_ACL_NOT_CACHED)
+                posix_acl_release(access);
+        if (dflt != XFS_ACL_NOT_CACHED)
+                posix_acl_release(dflt);
+}
+/*
+ * System xattr handlers.
+ *
+ * Currently Posix ACLs are the only system namespace extended attribute
+ * handlers supported by XFS, so we just implement the handlers here.
+ * If we ever support other system extended attributes this will need
+ * some refactoring.
+ */
+/*
+ * Map the xattr name suffix to an ACL type: "posix_acl_access" gives
+ * ACL_TYPE_ACCESS, "posix_acl_default" gives ACL_TYPE_DEFAULT, and any
+ * other name gives -EINVAL.
+ */
+static int
+xfs_decode_acl(const char *name)
+{
+        if (!strcmp(name, "posix_acl_access"))
+                return ACL_TYPE_ACCESS;
+        if (!strcmp(name, "posix_acl_default"))
+                return ACL_TYPE_DEFAULT;
+        return -EINVAL;
+}
+/*
+ * ->get handler for the system xattr namespace: fetch the ACL selected by
+ * @name and encode it into @value with posix_acl_to_xattr().
+ *
+ * Returns that function's result, -ENODATA if the inode has no such ACL,
+ * or a negative errno for an unknown name or a failed lookup.
+ */
+static int
+xfs_xattr_system_get(struct inode *inode, const char *name,
+                void *value, size_t size)
+{
+        struct posix_acl *acl;
+        int type = xfs_decode_acl(name);
+        int ret;
+        if (type < 0)
+                return type;
+        acl = xfs_get_acl(inode, type);
+        if (IS_ERR(acl))
+                return PTR_ERR(acl);
+        if (!acl)
+                return -ENODATA;
+        ret = posix_acl_to_xattr(acl, value, size);
+        posix_acl_release(acl);
+        return ret;
+}
+/*
+ * ->set handler for the system xattr namespace: set or remove the ACL
+ * selected by @name.
+ *
+ * A NULL @value removes the ACL.  XATTR_CREATE is rejected with -EINVAL.
+ * A default ACL on a non-directory yields -EACCES when a value is given
+ * and 0 otherwise.  The caller must own the inode or have CAP_FOWNER,
+ * else -EPERM.  ACLs with more than XFS_ACL_MAX_ENTRIES entries are
+ * rejected with -EINVAL.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int
+xfs_xattr_system_set(struct inode *inode, const char *name,
+                const void *value, size_t size, int flags)
+{
+        struct posix_acl *acl = NULL;
+        int error = 0, type;
+        type = xfs_decode_acl(name);
+        if (type < 0)
+                return type;
+        if (flags & XATTR_CREATE)
+                return -EINVAL;
+        if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
+                return value ? -EACCES : 0;
+        if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER))
+                return -EPERM;
+        /* No value: fall through to xfs_set_acl() with acl == NULL. */
+        if (!value)
+                goto set_acl;
+        acl = posix_acl_from_xattr(value, size);
+        if (!acl) {
+                /*
+                 * acl_set_file(3) may request that we set default ACLs with
+                 * zero length -- defend (gracefully) against that here.
+                 */
+                goto out;
+        }
+        if (IS_ERR(acl)) {
+                error = PTR_ERR(acl);
+                goto out;
+        }
+        error = posix_acl_valid(acl);
+        if (error)
+                goto out_release;
+        error = -EINVAL;
+        if (acl->a_count > XFS_ACL_MAX_ENTRIES)
+                goto out_release;
+        if (type == ACL_TYPE_ACCESS) {
+                mode_t mode = inode->i_mode;
+                error = posix_acl_equiv_mode(acl, &mode);
+                /*
+                 * A result <= 0 means no ACL gets stored: the ACL is
+                 * dropped here and, unless this was an error, the code
+                 * below sets the mode and removes any existing ACL.
+                 * NOTE(review): presumably 0 means the mode bits alone
+                 * represent the ACL -- confirm against the
+                 * posix_acl_equiv_mode() documentation.
+                 */
+                if (error <= 0) {
+                        posix_acl_release(acl);
+                        acl = NULL;
+                        if (error < 0)
+                                return error;
+                }
+                error = xfs_set_mode(inode, mode);
+                if (error)
+                        goto out_release;
+        }
+ set_acl:
+        error = xfs_set_acl(inode, type, acl);
+ out_release:
+        /* acl may be NULL here; it is passed to posix_acl_release() as is. */
+        posix_acl_release(acl);
+ out:
+        return error;
+}
+/*
+ * xattr handler for names under XATTR_SYSTEM_PREFIX; get and set are
+ * routed to the POSIX ACL handlers in this file.
+ */
+struct xattr_handler xfs_xattr_system_handler = {
+        .prefix = XATTR_SYSTEM_PREFIX,
+        .get    = xfs_xattr_system_get,
+        .set    = xfs_xattr_system_set,
+};
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index e28800a9f2b5..1418b916fc27 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1501,7 +1501,7 @@ xfs_setsize_buftarg_early(
        struct block_device     *bdev)
 {
        return xfs_setsize_buftarg_flags(btp,
-                        PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0);
+                        PAGE_CACHE_SIZE, bdev_logical_block_size(bdev), 0);
 }
 int
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 34eaab608e6e..5bb523d7f37e 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -41,7 +41,6 @@
 #include "xfs_itable.h"
 #include "xfs_error.h"
 #include "xfs_rw.h"
-#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_bmap.h"
 #include "xfs_buf_item.h"
@@ -899,7 +898,8 @@ xfs_ioctl_setattr(
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_trans        *tp;
        unsigned int            lock_flags = 0;
-        struct xfs_dquot        *udqp = NULL, *gdqp = NULL;
+        struct xfs_dquot        *udqp = NULL;
+        struct xfs_dquot        *gdqp = NULL;
        struct xfs_dquot        *olddquot = NULL;
        int                     code;
@@ -919,7 +919,7 @@ xfs_ioctl_setattr(
         * because the i_*dquot fields will get updated anyway.
         */
        if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) {
-                code = XFS_QM_DQVOPALLOC(mp, ip, ip->i_d.di_uid,
+                code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid,
                                         ip->i_d.di_gid, fa->fsx_projid,
                                         XFS_QMOPT_PQUOTA, &udqp, &gdqp);
                if (code)
@@ -954,10 +954,11 @@ xfs_ioctl_setattr(
         * Do a quota reservation only if projid is actually going to change.
         */
        if (mask & FSX_PROJID) {
-                if (XFS_IS_PQUOTA_ON(mp) &&
+                if (XFS_IS_QUOTA_RUNNING(mp) &&
+                    XFS_IS_PQUOTA_ON(mp) &&
                    ip->i_d.di_projid != fa->fsx_projid) {
                        ASSERT(tp);
-                        code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
+                        code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
                                                capable(CAP_FOWNER) ?
                                                XFS_QMOPT_FORCE_RES : 0);
                        if (code)       /* out of quota */
@@ -1059,8 +1060,8 @@ xfs_ioctl_setattr(
                 * in the transaction.
                 */
                if (ip->i_d.di_projid != fa->fsx_projid) {
-                        if (XFS_IS_PQUOTA_ON(mp)) {
+                        if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
-                                olddquot = XFS_QM_DQVOPCHOWN(mp, tp, ip,
+                                olddquot = xfs_qm_vop_chown(tp, ip,
                                                        &ip->i_gdquot, gdqp);
                        }
                        ip->i_d.di_projid = fa->fsx_projid;
@@ -1106,9 +1107,9 @@ xfs_ioctl_setattr(
        /*
         * Release any dquot(s) the inode had kept before chown.
         */
-        XFS_QM_DQRELE(mp, olddquot);
+        xfs_qm_dqrele(olddquot);
-        XFS_QM_DQRELE(mp, udqp);
+        xfs_qm_dqrele(udqp);
-        XFS_QM_DQRELE(mp, gdqp);
+        xfs_qm_dqrele(gdqp);
        if (code)
                return code;
@@ -1122,8 +1123,8 @@ xfs_ioctl_setattr(
        return 0;
 error_return:
-        XFS_QM_DQRELE(mp, udqp);
+        xfs_qm_dqrele(udqp);
-        XFS_QM_DQRELE(mp, gdqp);
+        xfs_qm_dqrele(gdqp);
        xfs_trans_cancel(tp, 0);
        if (lock_flags)
                xfs_iunlock(ip, lock_flags);
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 6075382336d7..58973bb46038 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -17,6 +17,7 @@
 */
 #include "xfs.h"
 #include "xfs_fs.h"
+#include "xfs_acl.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
@@ -51,6 +52,7 @@
 #include <linux/capability.h>
 #include <linux/xattr.h>
 #include <linux/namei.h>
+#include <linux/posix_acl.h>
 #include <linux/security.h>
 #include <linux/falloc.h>
 #include <linux/fiemap.h>
@@ -202,9 +204,8 @@ xfs_vn_mknod(
 {
        struct inode    *inode;
        struct xfs_inode *ip = NULL;
-        xfs_acl_t       *default_acl = NULL;
+        struct posix_acl *default_acl = NULL;
        struct xfs_name name;
-        int (*test_default_acl)(struct inode *) = _ACL_DEFAULT_EXISTS;
        int             error;
        /*
@@ -219,18 +220,14 @@ xfs_vn_mknod(
                rdev = 0;
        }
-        if (test_default_acl && test_default_acl(dir)) {
+        if (IS_POSIXACL(dir)) {
-                if (!_ACL_ALLOC(default_acl)) {
+                default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT);
-                        return -ENOMEM;
+                if (IS_ERR(default_acl))
-                }
+                        return -PTR_ERR(default_acl);
-                if (!_ACL_GET_DEFAULT(dir, default_acl)) {
-                        _ACL_FREE(default_acl);
-                        default_acl = NULL;
-                }
-        }
-        if (IS_POSIXACL(dir) && !default_acl)
+                if (!default_acl)
-                mode &= ~current_umask();
+                        mode &= ~current_umask();
+        }
        xfs_dentry_to_name(&name, dentry);
        error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
@@ -244,10 +241,10 @@ xfs_vn_mknod(
                goto out_cleanup_inode;
        if (default_acl) {
-                error = _ACL_INHERIT(inode, mode, default_acl);
+                error = -xfs_inherit_acl(inode, default_acl);
                if (unlikely(error))
                        goto out_cleanup_inode;
-                _ACL_FREE(default_acl);
+                posix_acl_release(default_acl);
        }
@@ -257,8 +254,7 @@ xfs_vn_mknod(
 out_cleanup_inode:
        xfs_cleanup_inode(dir, inode, dentry);
 out_free_acl:
-        if (default_acl)
+        posix_acl_release(default_acl);
-                _ACL_FREE(default_acl);
        return -error;
 }
@@ -488,26 +484,6 @@ xfs_vn_put_link(
                kfree(s);
 }
-#ifdef CONFIG_XFS_POSIX_ACL
-STATIC int
-xfs_check_acl(
-        struct inode            *inode,
-        int                     mask)
-{
-        struct xfs_inode        *ip = XFS_I(inode);
-        int                     error;
-        xfs_itrace_entry(ip);
-        if (XFS_IFORK_Q(ip)) {
-                error = xfs_acl_iaccess(ip, mask, NULL);
-                if (error != -1)
-                        return -error;
-        }
-        return -EAGAIN;
-}
 STATIC int
 xfs_vn_permission(
        struct inode            *inode,
@@ -515,9 +491,6 @@ xfs_vn_permission(
 {
        return generic_permission(inode, mask, xfs_check_acl);
 }
-#else
-#define xfs_vn_permission NULL
-#endif
 STATIC int
 xfs_vn_getattr(
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 9142192ccbe6..7078974a6eee 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -42,7 +42,6 @@
 #include "xfs_error.h"
 #include "xfs_itable.h"
 #include "xfs_rw.h"
-#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_inode_item.h"
 #include "xfs_buf_item.h"
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 94d9a633d3d9..cb6e2cca214f 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -50,9 +50,11 @@ xfs_fs_quota_sync(
 {
        struct xfs_mount        *mp = XFS_M(sb);
+        if (sb->s_flags & MS_RDONLY)
+                return -EROFS;
        if (!XFS_IS_QUOTA_RUNNING(mp))
                return -ENOSYS;
-        return -xfs_sync_inodes(mp, SYNC_DELWRI);
+        return -xfs_sync_data(mp, 0);
 }
 STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index bb685269f832..2e09efbca8db 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -43,7 +43,6 @@
 #include "xfs_itable.h"
 #include "xfs_fsops.h"
 #include "xfs_rw.h"
-#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_utils.h"
@@ -405,6 +404,14 @@ xfs_parseargs(
                return EINVAL;
        }
+#ifndef CONFIG_XFS_QUOTA
+        if (XFS_IS_QUOTA_RUNNING(mp)) {
+                cmn_err(CE_WARN,
+                        "XFS: quota support not available in this kernel.");
+                return EINVAL;
+        }
+#endif
        if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
            (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
                cmn_err(CE_WARN,
@@ -1063,7 +1070,18 @@ xfs_fs_put_super(
        int                     unmount_event_flags = 0;
        xfs_syncd_stop(mp);
-        xfs_sync_inodes(mp, SYNC_ATTR|SYNC_DELWRI);
+        if (!(sb->s_flags & MS_RDONLY)) {
+                /*
+                 * XXX(hch): this should be SYNC_WAIT.
+                 *
+                 * Or more likely not needed at all because the VFS is already
+                 * calling ->sync_fs after shutting down all filesystem
+                 * operations and just before calling ->put_super.
+                 */
+                xfs_sync_data(mp, 0);
+                xfs_sync_attr(mp, 0);
+        }
 #ifdef HAVE_DMAPI
        if (mp->m_flags & XFS_MOUNT_DMAPI) {
@@ -1098,21 +1116,11 @@ xfs_fs_put_super(
        xfs_freesb(mp);
        xfs_icsb_destroy_counters(mp);
        xfs_close_devices(mp);
-        xfs_qmops_put(mp);
        xfs_dmops_put(mp);
        xfs_free_fsname(mp);
        kfree(mp);
 }
-STATIC void
-xfs_fs_write_super(
-        struct super_block      *sb)
-{
-        if (!(sb->s_flags & MS_RDONLY))
-                xfs_sync_fsdata(XFS_M(sb), 0);
-        sb->s_dirt = 0;
-}
 STATIC int
 xfs_fs_sync_super(
        struct super_block      *sb,
@@ -1137,7 +1145,6 @@ xfs_fs_sync_super(
                error = xfs_quiesce_data(mp);
        else
                error = xfs_sync_fsdata(mp, 0);
-        sb->s_dirt = 0;
        if (unlikely(laptop_mode)) {
                int     prev_sync_seq = mp->m_sync_seq;
@@ -1168,6 +1175,7 @@ xfs_fs_statfs(
 {
        struct xfs_mount        *mp = XFS_M(dentry->d_sb);
        xfs_sb_t                *sbp = &mp->m_sb;
+        struct xfs_inode        *ip = XFS_I(dentry->d_inode);
        __uint64_t              fakeinos, id;
        xfs_extlen_t            lsize;
@@ -1196,7 +1204,10 @@ xfs_fs_statfs(
        statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
        spin_unlock(&mp->m_sb_lock);
-        XFS_QM_DQSTATVFS(XFS_I(dentry->d_inode), statp);
+        if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) ||
+            ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) ==
+                              (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))
+                xfs_qm_statvfs(ip, statp);
        return 0;
 }
@@ -1404,16 +1415,13 @@ xfs_fs_fill_super(
        error = xfs_dmops_get(mp);
        if (error)
                goto out_free_fsname;
-        error = xfs_qmops_get(mp);
-        if (error)
-                goto out_put_dmops;
        if (silent)
                flags |= XFS_MFSI_QUIET;
        error = xfs_open_devices(mp);
        if (error)
-                goto out_put_qmops;
+                goto out_put_dmops;
        if (xfs_icsb_init_counters(mp))
                mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
@@ -1443,7 +1451,6 @@ xfs_fs_fill_super(
        XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname);
-        sb->s_dirt = 1;
        sb->s_magic = XFS_SB_MAGIC;
        sb->s_blocksize = mp->m_sb.sb_blocksize;
        sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
@@ -1482,8 +1489,6 @@ xfs_fs_fill_super(
 out_destroy_counters:
        xfs_icsb_destroy_counters(mp);
        xfs_close_devices(mp);
- out_put_qmops:
-        xfs_qmops_put(mp);
 out_put_dmops:
        xfs_dmops_put(mp);
 out_free_fsname:
@@ -1533,7 +1538,6 @@ static struct super_operations xfs_super_operations = {
        .write_inode            = xfs_fs_write_inode,
        .clear_inode            = xfs_fs_clear_inode,
        .put_super              = xfs_fs_put_super,
-        .write_super            = xfs_fs_write_super,
        .sync_fs                = xfs_fs_sync_super,
        .freeze_fs              = xfs_fs_freeze,
        .statfs                 = xfs_fs_statfs,
@@ -1718,18 +1722,8 @@ xfs_init_zones(void)
        if (!xfs_ili_zone)
                goto out_destroy_inode_zone;
-#ifdef CONFIG_XFS_POSIX_ACL
-        xfs_acl_zone = kmem_zone_init(sizeof(xfs_acl_t), "xfs_acl");
-        if (!xfs_acl_zone)
-                goto out_destroy_ili_zone;
-#endif
        return 0;
-#ifdef CONFIG_XFS_POSIX_ACL
- out_destroy_ili_zone:
-#endif
-        kmem_zone_destroy(xfs_ili_zone);
 out_destroy_inode_zone:
        kmem_zone_destroy(xfs_inode_zone);
 out_destroy_efi_zone:
@@ -1763,9 +1757,6 @@ xfs_init_zones(void)
 STATIC void
 xfs_destroy_zones(void)
 {
-#ifdef CONFIG_XFS_POSIX_ACL
-        kmem_zone_destroy(xfs_acl_zone);
-#endif
        kmem_zone_destroy(xfs_ili_zone);
        kmem_zone_destroy(xfs_inode_zone);
        kmem_zone_destroy(xfs_efi_zone);
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index f7ba76633c29..b619d6b8ca43 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -43,166 +43,267 @@
 #include "xfs_buf_item.h"
 #include "xfs_inode_item.h"
 #include "xfs_rw.h"
+#include "xfs_quota.h"
 #include <linux/kthread.h>
 #include <linux/freezer.h>
-/*
- * Sync all the inodes in the given AG according to the
- * direction given by the flags.
- */
-STATIC int
-xfs_sync_inodes_ag(
-        xfs_mount_t     *mp,
-        int             ag,
-        int             flags)
-{
-        xfs_perag_t     *pag = &mp->m_perag[ag];
-        int             nr_found;
-        uint32_t        first_index = 0;
-        int             error = 0;
-        int             last_error = 0;
-        do {
+STATIC xfs_inode_t *
-                struct inode    *inode;
+xfs_inode_ag_lookup(
-                xfs_inode_t     *ip = NULL;
+        struct xfs_mount        *mp,
-                int             lock_flags = XFS_ILOCK_SHARED;
+        struct xfs_perag        *pag,
+        uint32_t                *first_index,
+        int                     tag)
+{
+        int                     nr_found;
+        struct xfs_inode        *ip;
-                /*
+        /*
-                 * use a gang lookup to find the next inode in the tree
+         * use a gang lookup to find the next inode in the tree
-                 * as the tree is sparse and a gang lookup walks to find
+         * as the tree is sparse and a gang lookup walks to find
-                 * the number of objects requested.
+         * the number of objects requested.
-                 */
+         */
-                read_lock(&pag->pag_ici_lock);
+        read_lock(&pag->pag_ici_lock);
+        if (tag == XFS_ICI_NO_TAG) {
                nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
-                                (void**)&ip, first_index, 1);
+                                (void **)&ip, *first_index, 1);
+        } else {
+                nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
+                                (void **)&ip, *first_index, 1, tag);
+        }
+        if (!nr_found)
+                goto unlock;
-                if (!nr_found) {
+        /*
-                        read_unlock(&pag->pag_ici_lock);
+         * Update the index for the next lookup. Catch overflows
-                        break;
+         * into the next AG range which can occur if we have inodes
-                }
+         * in the last block of the AG and we are currently
+         * pointing to the last inode.
+         */
+        *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+        if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+                goto unlock;
-                /*
+        return ip;
-                 * Update the index for the next lookup. Catch overflows
-                 * into the next AG range which can occur if we have inodes
-                 * in the last block of the AG and we are currently
-                 * pointing to the last inode.
-                 */
-                first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
-                if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
-                        read_unlock(&pag->pag_ici_lock);
-                        break;
-                }
-                /* nothing to sync during shutdown */
+unlock:
-                if (XFS_FORCED_SHUTDOWN(mp)) {
+        read_unlock(&pag->pag_ici_lock);
-                        read_unlock(&pag->pag_ici_lock);
+        return NULL;
-                        return 0;
+}
-                }
-                /*
+STATIC int
-                 * If we can't get a reference on the inode, it must be
+xfs_inode_ag_walk(
-                 * in reclaim. Leave it for the reclaim code to flush.
+        struct xfs_mount        *mp,
-                 */
+        xfs_agnumber_t          ag,
-                inode = VFS_I(ip);
+        int                     (*execute)(struct xfs_inode *ip,
-                if (!igrab(inode)) {
+                                           struct xfs_perag *pag, int flags),
-                        read_unlock(&pag->pag_ici_lock);
+        int                     flags,
-                        continue;
+        int                     tag)
-                }
+{
-                read_unlock(&pag->pag_ici_lock);
+        struct xfs_perag        *pag = &mp->m_perag[ag];
+        uint32_t                first_index;
+        int                     last_error = 0;
+        int                     skipped;
-                /* avoid new or bad inodes */
+restart:
-                if (is_bad_inode(inode) ||
+        skipped = 0;
-                    xfs_iflags_test(ip, XFS_INEW)) {
+        first_index = 0;
-                        IRELE(ip);
+        do {
-                        continue;
+                int             error = 0;
-                }
+                xfs_inode_t     *ip;
-                /*
+                ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
-                 * If we have to flush data or wait for I/O completion
+                if (!ip)
-                 * we need to hold the iolock.
+                        break;
-                 */
-                if (flags & SYNC_DELWRI) {
-                        if (VN_DIRTY(inode)) {
-                                if (flags & SYNC_TRYLOCK) {
-                                        if (xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
-                                                lock_flags |= XFS_IOLOCK_SHARED;
-                                } else {
-                                        xfs_ilock(ip, XFS_IOLOCK_SHARED);
-                                        lock_flags |= XFS_IOLOCK_SHARED;
-                                }
-                                if (lock_flags & XFS_IOLOCK_SHARED) {
-                                        error = xfs_flush_pages(ip, 0, -1,
-                                                        (flags & SYNC_WAIT) ? 0
-                                                                : XFS_B_ASYNC,
-                                                        FI_NONE);
-                                }
-                        }
-                        if (VN_CACHED(inode) && (flags & SYNC_IOWAIT))
-                                xfs_ioend_wait(ip);
-                }
-                xfs_ilock(ip, XFS_ILOCK_SHARED);
-                if ((flags & SYNC_ATTR) && !xfs_inode_clean(ip)) {
-                        if (flags & SYNC_WAIT) {
-                                xfs_iflock(ip);
-                                if (!xfs_inode_clean(ip))
-                                        error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
-                                else
-                                        xfs_ifunlock(ip);
-                        } else if (xfs_iflock_nowait(ip)) {
-                                if (!xfs_inode_clean(ip))
-                                        error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
-                                else
-                                        xfs_ifunlock(ip);
-                        }
-                }
-                xfs_iput(ip, lock_flags);
+                error = execute(ip, pag, flags);
+                if (error == EAGAIN) {
+                        skipped++;
+                        continue;
+                }
                if (error)
                        last_error = error;
                /*
                 * bail out if the filesystem is corrupted.
                 */
                if (error == EFSCORRUPTED)
-                        return XFS_ERROR(error);
+                        break;
-        } while (nr_found);
+        } while (1);
+        if (skipped) {
+                delay(1);
+                goto restart;
+        }
+        xfs_put_perag(mp, pag);
        return last_error;
 }
 int
-xfs_sync_inodes(
+xfs_inode_ag_iterator(
-        xfs_mount_t     *mp,
+        struct xfs_mount        *mp,
-        int             flags)
+        int                     (*execute)(struct xfs_inode *ip,
+                                           struct xfs_perag *pag, int flags),
+        int                     flags,
+        int                     tag)
 {
-        int             error;
+        int                     error = 0;
-        int             last_error;
+        int                     last_error = 0;
-        int             i;
+        xfs_agnumber_t          ag;
-        int             lflags = XFS_LOG_FORCE;
-        if (mp->m_flags & XFS_MOUNT_RDONLY)
+        for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
-                return 0;
+                if (!mp->m_perag[ag].pag_ici_init)
-        error = 0;
+                        continue;
-        last_error = 0;
+                error = xfs_inode_ag_walk(mp, ag, execute, flags, tag);
+                if (error) {
+                        last_error = error;
+                        if (error == EFSCORRUPTED)
+                                break;
+                }
+        }
+        return XFS_ERROR(last_error);
+}
+/* Must be called with pag_ici_lock held; drops the lock before returning. */
+int
+xfs_sync_inode_valid(
+        struct xfs_inode        *ip,
+        struct xfs_perag        *pag)
+{
+        struct inode            *inode = VFS_I(ip);
+        /* nothing to sync during shutdown */
+        if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+                read_unlock(&pag->pag_ici_lock);
+                return EFSCORRUPTED;
+        }
+        /*
+         * If we can't get a reference on the inode, it must be in reclaim.
+         * Leave it for the reclaim code to flush. Also avoid inodes that
+         * haven't been fully initialised.
+         */
+        if (!igrab(inode)) {
+                read_unlock(&pag->pag_ici_lock);
+                return ENOENT;
+        }
+        read_unlock(&pag->pag_ici_lock);
+        if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) {
+                IRELE(ip);
+                return ENOENT;
+        }
+        return 0;
+}
+STATIC int
+xfs_sync_inode_data(
+        struct xfs_inode        *ip,
+        struct xfs_perag        *pag,
+        int                     flags)
+{
+        struct inode            *inode = VFS_I(ip);
+        struct address_space *mapping = inode->i_mapping;
+        int                     error = 0;
+        error = xfs_sync_inode_valid(ip, pag);
+        if (error)
+                return error;
+        if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+                goto out_wait;
+        if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
+                if (flags & SYNC_TRYLOCK)
+                        goto out_wait;
+                xfs_ilock(ip, XFS_IOLOCK_SHARED);
+        }
+        error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
+                                0 : XFS_B_ASYNC, FI_NONE);
+        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ out_wait:
        if (flags & SYNC_WAIT)
-                lflags |= XFS_LOG_SYNC;
+                xfs_ioend_wait(ip);
+        IRELE(ip);
+        return error;
+}
-        for (i = 0; i < mp->m_sb.sb_agcount; i++) {
+STATIC int
-                if (!mp->m_perag[i].pag_ici_init)
+xfs_sync_inode_attr(
-                        continue;
+        struct xfs_inode        *ip,
-                error = xfs_sync_inodes_ag(mp, i, flags);
+        struct xfs_perag        *pag,
-                if (error)
+        int                     flags)
-                        last_error = error;
+{
-                if (error == EFSCORRUPTED)
+        int                     error = 0;
-                        break;
+        error = xfs_sync_inode_valid(ip, pag);
+        if (error)
+                return error;
+        xfs_ilock(ip, XFS_ILOCK_SHARED);
+        if (xfs_inode_clean(ip))
+                goto out_unlock;
+        if (!xfs_iflock_nowait(ip)) {
+                if (!(flags & SYNC_WAIT))
+                        goto out_unlock;
+                xfs_iflock(ip);
        }
-        if (flags & SYNC_DELWRI)
-                xfs_log_force(mp, 0, lflags);
-        return XFS_ERROR(last_error);
+        if (xfs_inode_clean(ip)) {
+                xfs_ifunlock(ip);
+                goto out_unlock;
+        }
+        error = xfs_iflush(ip, (flags & SYNC_WAIT) ?
+                           XFS_IFLUSH_SYNC : XFS_IFLUSH_DELWRI);
+ out_unlock:
+        xfs_iunlock(ip, XFS_ILOCK_SHARED);
+        IRELE(ip);
+        return error;
+}
+/*
+ * Write out pagecache data for the whole filesystem.
+ */
+int
+xfs_sync_data(
+        struct xfs_mount        *mp,
+        int                     flags)
+{
+        int                     error;
+        ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
+        error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
+                                      XFS_ICI_NO_TAG);
+        if (error)
+                return XFS_ERROR(error);
+        xfs_log_force(mp, 0,
+                      (flags & SYNC_WAIT) ?
+                       XFS_LOG_FORCE | XFS_LOG_SYNC :
+                       XFS_LOG_FORCE);
+        return 0;
+}
+/*
+ * Write out inode metadata (attributes) for the whole filesystem.
+ */
+int
+xfs_sync_attr(
+        struct xfs_mount        *mp,
+        int                     flags)
+{
+        ASSERT((flags & ~SYNC_WAIT) == 0);
+        return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
+                                     XFS_ICI_NO_TAG);
 }
 STATIC int
@@ -252,7 +353,7 @@ xfs_sync_fsdata(
         * If this is xfssyncd() then only sync the superblock if we can
         * lock it without sleeping and it is not pinned.
         */
-        if (flags & SYNC_BDFLUSH) {
+        if (flags & SYNC_TRYLOCK) {
                ASSERT(!(flags & SYNC_WAIT));
                bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
@@ -316,13 +417,13 @@ xfs_quiesce_data(
        int error;
        /* push non-blocking */
-        xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_BDFLUSH);
+        xfs_sync_data(mp, 0);
-        XFS_QM_DQSYNC(mp, SYNC_BDFLUSH);
+        xfs_qm_sync(mp, SYNC_TRYLOCK);
        xfs_filestream_flush(mp);
        /* push and block */
-        xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_WAIT|SYNC_IOWAIT);
+        xfs_sync_data(mp, SYNC_WAIT);
-        XFS_QM_DQSYNC(mp, SYNC_WAIT);
+        xfs_qm_sync(mp, SYNC_WAIT);
        /* write superblock and hoover up shutdown errors */
        error = xfs_sync_fsdata(mp, 0);
@@ -341,7 +442,7 @@ xfs_quiesce_fs(
        int     count = 0, pincount;
        xfs_flush_buftarg(mp->m_ddev_targp, 0);
-        xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
+        xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
        /*
         * This loop must run at least twice.  The first instance of the loop
@@ -350,7 +451,7 @@ xfs_quiesce_fs(
         * logged before we can write the unmount record.
         */
        do {
-                xfs_sync_inodes(mp, SYNC_ATTR|SYNC_WAIT);
+                xfs_sync_attr(mp, SYNC_WAIT);
                pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
                if (!pincount) {
                        delay(50);
@@ -433,8 +534,8 @@ xfs_flush_inodes_work(
        void            *arg)
 {
        struct inode    *inode = arg;
-        xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_TRYLOCK);
+        xfs_sync_data(mp, SYNC_TRYLOCK);
-        xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_TRYLOCK | SYNC_IOWAIT);
+        xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
        iput(inode);
 }
@@ -465,10 +566,10 @@ xfs_sync_worker(
        if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
                xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
-                xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
+                xfs_reclaim_inodes(mp, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
                /* dgc: errors ignored here */
-                error = XFS_QM_DQSYNC(mp, SYNC_BDFLUSH);
+                error = xfs_qm_sync(mp, SYNC_TRYLOCK);
-                error = xfs_sync_fsdata(mp, SYNC_BDFLUSH);
+                error = xfs_sync_fsdata(mp, SYNC_TRYLOCK);
                if (xfs_log_need_covered(mp))
                        error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE);
        }
@@ -569,7 +670,7 @@ xfs_reclaim_inode(
                        xfs_ifunlock(ip);
                        xfs_iunlock(ip, XFS_ILOCK_EXCL);
                }
-                return 1;
+                return -EAGAIN;
        }
        __xfs_iflags_set(ip, XFS_IRECLAIM);
        spin_unlock(&ip->i_flags_lock);
@@ -654,101 +755,27 @@ xfs_inode_clear_reclaim_tag(
        xfs_put_perag(mp, pag);
 }
+STATIC int
-STATIC void
+xfs_reclaim_inode_now(
-xfs_reclaim_inodes_ag(
+        struct xfs_inode        *ip,
-        xfs_mount_t     *mp,
+        struct xfs_perag        *pag,
-        int             ag,
+        int                     flags)
-        int             noblock,
-        int             mode)
 {
-        xfs_inode_t     *ip = NULL;
+        /* ignore if already under reclaim */
-        xfs_perag_t     *pag = &mp->m_perag[ag];
+        if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
-        int             nr_found;
-        uint32_t        first_index;
-        int             skipped;
-restart:
-        first_index = 0;
-        skipped = 0;
-        do {
-                /*
-                 * use a gang lookup to find the next inode in the tree
-                 * as the tree is sparse and a gang lookup walks to find
-                 * the number of objects requested.
-                 */
-                read_lock(&pag->pag_ici_lock);
-                nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
-                                        (void**)&ip, first_index, 1,
-                                        XFS_ICI_RECLAIM_TAG);
-                if (!nr_found) {
-                        read_unlock(&pag->pag_ici_lock);
-                        break;
-                }
-                /*
-                 * Update the index for the next lookup. Catch overflows
-                 * into the next AG range which can occur if we have inodes
-                 * in the last block of the AG and we are currently
-                 * pointing to the last inode.
-                 */
-                first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
-                if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
-                        read_unlock(&pag->pag_ici_lock);
-                        break;
-                }
-                /* ignore if already under reclaim */
-                if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
-                        read_unlock(&pag->pag_ici_lock);
-                        continue;
-                }
-                if (noblock) {
-                        if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
-                                read_unlock(&pag->pag_ici_lock);
-                                continue;
-                        }
-                        if (xfs_ipincount(ip) ||
-                            !xfs_iflock_nowait(ip)) {
-                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                                read_unlock(&pag->pag_ici_lock);
-                                continue;
-                        }
-                }
                read_unlock(&pag->pag_ici_lock);
+                return 0;
-                /*
-                 * hmmm - this is an inode already in reclaim. Do
-                 * we even bother catching it here?
-                 */
-                if (xfs_reclaim_inode(ip, noblock, mode))
-                        skipped++;
-        } while (nr_found);
-        if (skipped) {
-                delay(1);
-                goto restart;
        }
-        return;
+        read_unlock(&pag->pag_ici_lock);
+        return xfs_reclaim_inode(ip, 0, flags);
 }
 int
 xfs_reclaim_inodes(
        xfs_mount_t     *mp,
-        int              noblock,
        int             mode)
 {
-        int             i;
+        return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode,
+                                        XFS_ICI_RECLAIM_TAG);
-        for (i = 0; i < mp->m_sb.sb_agcount; i++) {
-                if (!mp->m_perag[i].pag_ici_init)
-                        continue;
-                xfs_reclaim_inodes_ag(mp, i, noblock, mode);
-        }
-        return 0;
 }
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 308d5bf6dfbd..2a10301c99c7 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -29,17 +29,14 @@ typedef struct xfs_sync_work {
        struct completion       *w_completion;
 } xfs_sync_work_t;
-#define SYNC_ATTR               0x0001  /* sync attributes */
+#define SYNC_WAIT               0x0001  /* wait for i/o to complete */
-#define SYNC_DELWRI             0x0002  /* look at delayed writes */
+#define SYNC_TRYLOCK            0x0002  /* only try to lock inodes */
-#define SYNC_WAIT               0x0004  /* wait for i/o to complete */
-#define SYNC_BDFLUSH            0x0008  /* BDFLUSH is calling -- don't block */
-#define SYNC_IOWAIT             0x0010  /* wait for all I/O to complete */
-#define SYNC_TRYLOCK            0x0020  /* only try to lock inodes */
 int xfs_syncd_init(struct xfs_mount *mp);
 void xfs_syncd_stop(struct xfs_mount *mp);
-int xfs_sync_inodes(struct xfs_mount *mp, int flags);
+int xfs_sync_attr(struct xfs_mount *mp, int flags);
+int xfs_sync_data(struct xfs_mount *mp, int flags);
 int xfs_sync_fsdata(struct xfs_mount *mp, int flags);
 int xfs_quiesce_data(struct xfs_mount *mp);
@@ -48,10 +45,16 @@ void xfs_quiesce_attr(struct xfs_mount *mp);
 void xfs_flush_inodes(struct xfs_inode *ip);
 int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
-int xfs_reclaim_inodes(struct xfs_mount *mp, int noblock, int mode);
+int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
 void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
 void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
 void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
                                struct xfs_inode *ip);
+int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
+int xfs_inode_ag_iterator(struct xfs_mount *mp,
+        int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
+        int flags, int tag);
 #endif
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c
index 964621fde6ed..497c7fb75cc1 100644
--- a/fs/xfs/linux-2.6/xfs_xattr.c
+++ b/fs/xfs/linux-2.6/xfs_xattr.c
@@ -29,67 +29,6 @@
 #include <linux/xattr.h>
-/*
- * ACL handling.  Should eventually be moved into xfs_acl.c
- */
-static int
-xfs_decode_acl(const char *name)
-{
-        if (strcmp(name, "posix_acl_access") == 0)
-                return _ACL_TYPE_ACCESS;
-        else if (strcmp(name, "posix_acl_default") == 0)
-                return _ACL_TYPE_DEFAULT;
-        return -EINVAL;
-}
-/*
- * Get system extended attributes which at the moment only
- * includes Posix ACLs.
- */
-static int
-xfs_xattr_system_get(struct inode *inode, const char *name,
-                void *buffer, size_t size)
-{
-        int acl;
-        acl = xfs_decode_acl(name);
-        if (acl < 0)
-                return acl;
-        return xfs_acl_vget(inode, buffer, size, acl);
-}
-static int
-xfs_xattr_system_set(struct inode *inode, const char *name,
-                const void *value, size_t size, int flags)
-{
-        int acl;
-        acl = xfs_decode_acl(name);
-        if (acl < 0)
-                return acl;
-        if (flags & XATTR_CREATE)
-                return -EINVAL;
-        if (!value)
-                return xfs_acl_vremove(inode, acl);
-        return xfs_acl_vset(inode, (void *)value, size, acl);
-}
-static struct xattr_handler xfs_xattr_system_handler = {
-        .prefix = XATTR_SYSTEM_PREFIX,
-        .get    = xfs_xattr_system_get,
-        .set    = xfs_xattr_system_set,
-};
-/*
- * Real xattr handling.  The only difference between the namespaces is
- * a flag passed to the low-level attr code.
- */
 static int
 __xfs_xattr_get(struct inode *inode, const char *name,
                void *value, size_t size, int xflags)
@@ -199,7 +138,9 @@ struct xattr_handler *xfs_xattr_handlers[] = {
        &xfs_xattr_user_handler,
        &xfs_xattr_trusted_handler,
        &xfs_xattr_security_handler,
+#ifdef CONFIG_XFS_POSIX_ACL
        &xfs_xattr_system_handler,
+#endif
        NULL
 };
@@ -310,7 +251,7 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
        /*
         * Then add the two synthetic ACL attributes.
         */
-        if (xfs_acl_vhasacl_access(inode)) {
+        if (posix_acl_access_exists(inode)) {
                error = list_one_attr(POSIX_ACL_XATTR_ACCESS,
                                strlen(POSIX_ACL_XATTR_ACCESS) + 1,
                                data, size, &context.count);
@@ -318,7 +259,7 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
                        return error;
        }
-        if (xfs_acl_vhasacl_default(inode)) {
+        if (posix_acl_default_exists(inode)) {
                error = list_one_attr(POSIX_ACL_XATTR_DEFAULT,
                                strlen(POSIX_ACL_XATTR_DEFAULT) + 1,
                                data, size, &context.count);
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index e4babcc63423..2f3f2229eaaf 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -42,7 +42,6 @@
 #include "xfs_error.h"
 #include "xfs_itable.h"
 #include "xfs_rw.h"
-#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_space.h"
@@ -1194,7 +1193,9 @@ void
 xfs_qm_dqrele(
        xfs_dquot_t     *dqp)
 {
-        ASSERT(dqp);
+        if (!dqp)
+                return;
        xfs_dqtrace_entry(dqp, "DQRELE");
        xfs_dqlock(dqp);
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index de0f402ddb4c..6533ead9b889 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -181,7 +181,6 @@ extern void		xfs_qm_adjust_dqlimits(xfs_mount_t *,
 extern int              xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
                                        xfs_dqid_t, uint, uint, xfs_dquot_t **);
 extern void             xfs_qm_dqput(xfs_dquot_t *);
-extern void             xfs_qm_dqrele(xfs_dquot_t *);
 extern void             xfs_dqlock(xfs_dquot_t *);
 extern void             xfs_dqlock2(xfs_dquot_t *, xfs_dquot_t *);
 extern void             xfs_dqunlock(xfs_dquot_t *);
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 1728f6a7c4f5..d0d4a9a0bbd7 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -42,7 +42,6 @@
 #include "xfs_error.h"
 #include "xfs_itable.h"
 #include "xfs_rw.h"
-#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_priv.h"
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 5b6695049e00..45b1bfef7388 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -42,7 +42,6 @@
 #include "xfs_error.h"
 #include "xfs_bmap.h"
 #include "xfs_rw.h"
-#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_space.h"
@@ -287,11 +286,13 @@ xfs_qm_rele_quotafs_ref(
 * Just destroy the quotainfo structure.
 */
 void
-xfs_qm_unmount_quotadestroy(
+xfs_qm_unmount(
-        xfs_mount_t     *mp)
+        struct xfs_mount        *mp)
 {
-        if (mp->m_quotainfo)
+        if (mp->m_quotainfo) {
+                xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING);
                xfs_qm_destroy_quotainfo(mp);
+        }
 }
@@ -385,8 +386,13 @@ xfs_qm_mount_quotas(
        if (error) {
                xfs_fs_cmn_err(CE_WARN, mp,
                        "Failed to initialize disk quotas.");
+                return;
        }
-        return;
+#ifdef QUOTADEBUG
+        if (XFS_IS_QUOTA_ON(mp))
+                xfs_qm_internalqcheck(mp);
+#endif
 }
 /*
@@ -774,12 +780,11 @@ xfs_qm_dqattach_grouphint(
 * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON
 * into account.
 * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed.
- * If XFS_QMOPT_ILOCKED, then inode sent is already locked EXCL.
 * Inode may get unlocked and relocked in here, and the caller must deal with
 * the consequences.
 */
 int
-xfs_qm_dqattach(
+xfs_qm_dqattach_locked(
        xfs_inode_t     *ip,
        uint            flags)
 {
@@ -787,17 +792,14 @@ xfs_qm_dqattach(
        uint            nquotas = 0;
        int             error = 0;
-        if ((! XFS_IS_QUOTA_ON(mp)) ||
+        if (!XFS_IS_QUOTA_RUNNING(mp) ||
-            (! XFS_NOT_DQATTACHED(mp, ip)) ||
+            !XFS_IS_QUOTA_ON(mp) ||
-            (ip->i_ino == mp->m_sb.sb_uquotino) ||
+            !XFS_NOT_DQATTACHED(mp, ip) ||
-            (ip->i_ino == mp->m_sb.sb_gquotino))
+            ip->i_ino == mp->m_sb.sb_uquotino ||
+            ip->i_ino == mp->m_sb.sb_gquotino)
                return 0;
-        ASSERT((flags & XFS_QMOPT_ILOCKED) == 0 ||
+        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-               xfs_isilocked(ip, XFS_ILOCK_EXCL));
-        if (! (flags & XFS_QMOPT_ILOCKED))
-                xfs_ilock(ip, XFS_ILOCK_EXCL);
        if (XFS_IS_UQUOTA_ON(mp)) {
                error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
@@ -849,8 +851,7 @@ xfs_qm_dqattach(
                xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot);
        }
-      done:
+ done:
 #ifdef QUOTADEBUG
        if (! error) {
                if (XFS_IS_UQUOTA_ON(mp))
@@ -858,15 +859,22 @@ xfs_qm_dqattach(
                if (XFS_IS_OQUOTA_ON(mp))
                        ASSERT(ip->i_gdquot);
        }
+        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 #endif
+        return error;
+}
-        if (! (flags & XFS_QMOPT_ILOCKED))
+int
-                xfs_iunlock(ip, XFS_ILOCK_EXCL);
+xfs_qm_dqattach(
+        struct xfs_inode        *ip,
+        uint                    flags)
+{
+        int                     error;
+        xfs_ilock(ip, XFS_ILOCK_EXCL);
+        error = xfs_qm_dqattach_locked(ip, flags);
+        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-#ifdef QUOTADEBUG
-        else
-                ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-#endif
        return error;
 }
@@ -896,11 +904,6 @@ xfs_qm_dqdetach(
        }
 }
-/*
- * This is called to sync quotas. We can be told to use non-blocking
- * semantics by either the SYNC_BDFLUSH flag or the absence of the
- * SYNC_WAIT flag.
- */
 int
 xfs_qm_sync(
        xfs_mount_t     *mp,
@@ -909,17 +912,13 @@ xfs_qm_sync(
        int             recl, restarts;
        xfs_dquot_t     *dqp;
        uint            flush_flags;
-        boolean_t       nowait;
        int             error;
-        if (! XFS_IS_QUOTA_ON(mp))
+        if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
                return 0;
+        flush_flags = (flags & SYNC_WAIT) ? XFS_QMOPT_SYNC : XFS_QMOPT_DELWRI;
        restarts = 0;
-        /*
-         * We won't block unless we are asked to.
-         */
-        nowait = (boolean_t)(flags & SYNC_BDFLUSH || (flags & SYNC_WAIT) == 0);
  again:
        xfs_qm_mplist_lock(mp);
@@ -939,18 +938,10 @@ xfs_qm_sync(
                 * don't 'seem' to be dirty. ie. don't acquire dqlock.
                 * This is very similar to what xfs_sync does with inodes.
                 */
-                if (flags & SYNC_BDFLUSH) {
+                if (flags & SYNC_TRYLOCK) {
-                        if (! XFS_DQ_IS_DIRTY(dqp))
+                        if (!XFS_DQ_IS_DIRTY(dqp))
                                continue;
-                }
+                        if (!xfs_qm_dqlock_nowait(dqp))
-                if (nowait) {
-                        /*
-                         * Try to acquire the dquot lock. We are NOT out of
-                         * lock order, but we just don't want to wait for this
-                         * lock, unless somebody wanted us to.
-                         */
-                        if (! xfs_qm_dqlock_nowait(dqp))
                                continue;
                } else {
                        xfs_dqlock(dqp);
@@ -967,7 +958,7 @@ xfs_qm_sync(
                /* XXX a sentinel would be better */
                recl = XFS_QI_MPLRECLAIMS(mp);
                if (!xfs_dqflock_nowait(dqp)) {
-                        if (nowait) {
+                        if (flags & SYNC_TRYLOCK) {
                                xfs_dqunlock(dqp);
                                continue;
                        }
@@ -985,7 +976,6 @@ xfs_qm_sync(
                 * Let go of the mplist lock. We don't want to hold it
                 * across a disk write
                 */
-                flush_flags = (nowait) ? XFS_QMOPT_DELWRI : XFS_QMOPT_SYNC;
                xfs_qm_mplist_unlock(mp);
                xfs_dqtrace_entry(dqp, "XQM_SYNC: DQFLUSH");
                error = xfs_qm_dqflush(dqp, flush_flags);
@@ -2319,20 +2309,20 @@ xfs_qm_write_sb_changes(
 */
 int
 xfs_qm_vop_dqalloc(
-        xfs_mount_t     *mp,
+        struct xfs_inode        *ip,
-        xfs_inode_t     *ip,
+        uid_t                   uid,
-        uid_t           uid,
+        gid_t                   gid,
-        gid_t           gid,
+        prid_t                  prid,
-        prid_t          prid,
+        uint                    flags,
-        uint            flags,
+        struct xfs_dquot        **O_udqpp,
-        xfs_dquot_t     **O_udqpp,
+        struct xfs_dquot        **O_gdqpp)
-        xfs_dquot_t     **O_gdqpp)
 {
-        int             error;
+        struct xfs_mount        *mp = ip->i_mount;
-        xfs_dquot_t     *uq, *gq;
+        struct xfs_dquot        *uq, *gq;
-        uint            lockflags;
+        int                     error;
+        uint                    lockflags;
-        if (!XFS_IS_QUOTA_ON(mp))
+        if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
                return 0;
        lockflags = XFS_ILOCK_EXCL;
@@ -2346,8 +2336,8 @@ xfs_qm_vop_dqalloc(
         * if necessary. The dquot(s) will not be locked.
         */
        if (XFS_NOT_DQATTACHED(mp, ip)) {
-                if ((error = xfs_qm_dqattach(ip, XFS_QMOPT_DQALLOC |
+                error = xfs_qm_dqattach_locked(ip, XFS_QMOPT_DQALLOC);
-                                            XFS_QMOPT_ILOCKED))) {
+                if (error) {
                        xfs_iunlock(ip, lockflags);
                        return error;
                }
@@ -2469,6 +2459,7 @@ xfs_qm_vop_chown(
        uint            bfield = XFS_IS_REALTIME_INODE(ip) ?
                                 XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
        ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount));
@@ -2508,13 +2499,13 @@ xfs_qm_vop_chown_reserve(
        xfs_dquot_t     *gdqp,
        uint            flags)
 {
-        int             error;
+        xfs_mount_t     *mp = ip->i_mount;
-        xfs_mount_t     *mp;
        uint            delblks, blkflags, prjflags = 0;
        xfs_dquot_t     *unresudq, *unresgdq, *delblksudq, *delblksgdq;
+        int             error;
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
-        mp = ip->i_mount;
        ASSERT(XFS_IS_QUOTA_RUNNING(mp));
        delblks = ip->i_delayed_blks;
@@ -2582,28 +2573,23 @@ xfs_qm_vop_chown_reserve(
 int
 xfs_qm_vop_rename_dqattach(
-        xfs_inode_t     **i_tab)
+        struct xfs_inode        **i_tab)
 {
-        xfs_inode_t     *ip;
+        struct xfs_mount        *mp = i_tab[0]->i_mount;
-        int             i;
+        int                     i;
-        int             error;
-        ip = i_tab[0];
+        if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
-        if (! XFS_IS_QUOTA_ON(ip->i_mount))
                return 0;
-        if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) {
+        for (i = 0; (i < 4 && i_tab[i]); i++) {
-                error = xfs_qm_dqattach(ip, 0);
+                struct xfs_inode        *ip = i_tab[i];
-                if (error)
+                int                     error;
-                        return error;
-        }
-        for (i = 1; (i < 4 && i_tab[i]); i++) {
                /*
                 * Watch out for duplicate entries in the table.
                 */
-                if ((ip = i_tab[i]) != i_tab[i-1]) {
+                if (i == 0 || ip != i_tab[i-1]) {
-                        if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) {
+                        if (XFS_NOT_DQATTACHED(mp, ip)) {
                                error = xfs_qm_dqattach(ip, 0);
                                if (error)
                                        return error;
@@ -2614,17 +2600,19 @@ xfs_qm_vop_rename_dqattach(
 }
 void
-xfs_qm_vop_dqattach_and_dqmod_newinode(
+xfs_qm_vop_create_dqattach(
-        xfs_trans_t     *tp,
+        struct xfs_trans        *tp,
-        xfs_inode_t     *ip,
+        struct xfs_inode        *ip,
-        xfs_dquot_t     *udqp,
+        struct xfs_dquot        *udqp,
-        xfs_dquot_t     *gdqp)
+        struct xfs_dquot        *gdqp)
 {
-        if (!XFS_IS_QUOTA_ON(tp->t_mountp))
+        struct xfs_mount        *mp = tp->t_mountp;
+        if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
                return;
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-        ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp));
+        ASSERT(XFS_IS_QUOTA_RUNNING(mp));
        if (udqp) {
                xfs_dqlock(udqp);
@@ -2632,7 +2620,7 @@ xfs_qm_vop_dqattach_and_dqmod_newinode(
                xfs_dqunlock(udqp);
                ASSERT(ip->i_udquot == NULL);
                ip->i_udquot = udqp;
-                ASSERT(XFS_IS_UQUOTA_ON(tp->t_mountp));
+                ASSERT(XFS_IS_UQUOTA_ON(mp));
                ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id));
                xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
        }
@@ -2642,8 +2630,8 @@ xfs_qm_vop_dqattach_and_dqmod_newinode(
                xfs_dqunlock(gdqp);
                ASSERT(ip->i_gdquot == NULL);
                ip->i_gdquot = gdqp;
-                ASSERT(XFS_IS_OQUOTA_ON(tp->t_mountp));
+                ASSERT(XFS_IS_OQUOTA_ON(mp));
-                ASSERT((XFS_IS_GQUOTA_ON(tp->t_mountp) ?
+                ASSERT((XFS_IS_GQUOTA_ON(mp) ?
                        ip->i_d.di_gid : ip->i_d.di_projid) ==
                                be32_to_cpu(gdqp->q_core.d_id));
                xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index a371954cae1b..495564b8af38 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -127,8 +127,6 @@ typedef struct xfs_quotainfo {
 } xfs_quotainfo_t;
-extern xfs_dqtrxops_t   xfs_trans_dquot_ops;
 extern void     xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long);
 extern int      xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *,
                        xfs_dquot_t *, xfs_dquot_t *, long, long, uint);
@@ -159,17 +157,11 @@ typedef struct xfs_dquot_acct {
 #define XFS_QM_RTBWARNLIMIT     5
 extern void             xfs_qm_destroy_quotainfo(xfs_mount_t *);
-extern void             xfs_qm_mount_quotas(xfs_mount_t *);
 extern int              xfs_qm_quotacheck(xfs_mount_t *);
-extern void             xfs_qm_unmount_quotadestroy(xfs_mount_t *);
-extern void             xfs_qm_unmount_quotas(xfs_mount_t *);
 extern int              xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
-extern int              xfs_qm_sync(xfs_mount_t *, int);
 /* dquot stuff */
 extern boolean_t        xfs_qm_dqalloc_incore(xfs_dquot_t **);
-extern int              xfs_qm_dqattach(xfs_inode_t *, uint);
-extern void             xfs_qm_dqdetach(xfs_inode_t *);
 extern int              xfs_qm_dqpurge_all(xfs_mount_t *, uint);
 extern void             xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
@@ -183,19 +175,6 @@ extern int		xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
 extern int              xfs_qm_scall_quotaon(xfs_mount_t *, uint);
 extern int              xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
-/* vop stuff */
-extern int              xfs_qm_vop_dqalloc(xfs_mount_t *, xfs_inode_t *,
-                                        uid_t, gid_t, prid_t, uint,
-                                        xfs_dquot_t **, xfs_dquot_t **);
-extern void             xfs_qm_vop_dqattach_and_dqmod_newinode(
-                                        xfs_trans_t *, xfs_inode_t *,
-                                        xfs_dquot_t *, xfs_dquot_t *);
-extern int              xfs_qm_vop_rename_dqattach(xfs_inode_t **);
-extern xfs_dquot_t *    xfs_qm_vop_chown(xfs_trans_t *, xfs_inode_t *,
-                                        xfs_dquot_t **, xfs_dquot_t *);
-extern int              xfs_qm_vop_chown_reserve(xfs_trans_t *, xfs_inode_t *,
-                                        xfs_dquot_t *, xfs_dquot_t *, uint);
 /* list stuff */
 extern void             xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *);
 extern void             xfs_qm_freelist_unlink(xfs_dquot_t *);
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index 63037c689a4b..a5346630dfae 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -42,7 +42,6 @@
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_rw.h"
-#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_qm.h"
@@ -84,7 +83,7 @@ xfs_fill_statvfs_from_dquot(
 * return a statvfs of the project, not the entire filesystem.
 * This makes such trees appear as if they are filesystems in themselves.
 */
-STATIC void
+void
 xfs_qm_statvfs(
        xfs_inode_t             *ip,
        struct kstatfs          *statp)
@@ -92,20 +91,13 @@ xfs_qm_statvfs(
        xfs_mount_t             *mp = ip->i_mount;
        xfs_dquot_t             *dqp;
-        if (!(ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) ||
-            !((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) ==
-                              (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))
-                return;
        if (!xfs_qm_dqget(mp, NULL, ip->i_d.di_projid, XFS_DQ_PROJ, 0, &dqp)) {
-                xfs_disk_dquot_t        *dp = &dqp->q_core;
+                xfs_fill_statvfs_from_dquot(statp, &dqp->q_core);
-                xfs_fill_statvfs_from_dquot(statp, dp);
                xfs_qm_dqput(dqp);
        }
 }
-STATIC int
+int
 xfs_qm_newmount(
        xfs_mount_t     *mp,
        uint            *needquotamount,
@@ -114,9 +106,6 @@ xfs_qm_newmount(
        uint            quotaondisk;
        uint            uquotaondisk = 0, gquotaondisk = 0, pquotaondisk = 0;
-        *quotaflags = 0;
-        *needquotamount = B_FALSE;
        quotaondisk = xfs_sb_version_hasquota(&mp->m_sb) &&
                                (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT);
@@ -179,66 +168,6 @@ xfs_qm_newmount(
        return 0;
 }
-STATIC int
-xfs_qm_endmount(
-        xfs_mount_t     *mp,
-        uint            needquotamount,
-        uint            quotaflags)
-{
-        if (needquotamount) {
-                ASSERT(mp->m_qflags == 0);
-                mp->m_qflags = quotaflags;
-                xfs_qm_mount_quotas(mp);
-        }
-#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
-        if (! (XFS_IS_QUOTA_ON(mp)))
-                xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on");
-        else
-                xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on");
-#endif
-#ifdef QUOTADEBUG
-        if (XFS_IS_QUOTA_ON(mp) && xfs_qm_internalqcheck(mp))
-                cmn_err(CE_WARN, "XFS: mount internalqcheck failed");
-#endif
-        return 0;
-}
-STATIC void
-xfs_qm_dqrele_null(
-        xfs_dquot_t     *dq)
-{
-        /*
-         * Called from XFS, where we always check first for a NULL dquot.
-         */
-        if (!dq)
-                return;
-        xfs_qm_dqrele(dq);
-}
-struct xfs_qmops xfs_qmcore_xfs = {
-        .xfs_qminit             = xfs_qm_newmount,
-        .xfs_qmdone             = xfs_qm_unmount_quotadestroy,
-        .xfs_qmmount            = xfs_qm_endmount,
-        .xfs_qmunmount          = xfs_qm_unmount_quotas,
-        .xfs_dqrele             = xfs_qm_dqrele_null,
-        .xfs_dqattach           = xfs_qm_dqattach,
-        .xfs_dqdetach           = xfs_qm_dqdetach,
-        .xfs_dqpurgeall         = xfs_qm_dqpurge_all,
-        .xfs_dqvopalloc         = xfs_qm_vop_dqalloc,
-        .xfs_dqvopcreate        = xfs_qm_vop_dqattach_and_dqmod_newinode,
-        .xfs_dqvoprename        = xfs_qm_vop_rename_dqattach,
-        .xfs_dqvopchown         = xfs_qm_vop_chown,
-        .xfs_dqvopchownresv     = xfs_qm_vop_chown_reserve,
-        .xfs_dqstatvfs          = xfs_qm_statvfs,
-        .xfs_dqsync             = xfs_qm_sync,
-        .xfs_dqtrxops           = &xfs_trans_dquot_ops,
-};
-EXPORT_SYMBOL(xfs_qmcore_xfs);
 void __init
 xfs_qm_init(void)
 {
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 709f5f545cf5..21b08c0396a1 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -42,7 +42,6 @@
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_rw.h"
-#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_qm.h"
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index c7b66f6506ce..4e4276b956e8 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -45,7 +45,6 @@
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_rw.h"
-#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_utils.h"
@@ -847,105 +846,55 @@ xfs_qm_export_flags(
 }
-/*
+/*
+ * Release the dquots attached to a single inode.  @flags selects which
+ * ones: XFS_UQUOTA_ACCT for i_udquot, XFS_GQUOTA_ACCT or XFS_PQUOTA_ACCT
+ * for i_gdquot.  Quota inodes are skipped after dropping pag_ici_lock.
+ * Returns 0, or the error from xfs_sync_inode_valid().
+ */
+STATIC int
- * Release all the dquots on the inodes in an AG.
+xfs_dqrele_inode(
- */
+        struct xfs_inode        *ip,
-STATIC void
+        struct xfs_perag        *pag,
-xfs_qm_dqrele_inodes_ag(
+        int                     flags)
-        xfs_mount_t     *mp,
-        int             ag,
-        uint            flags)
 {
-        xfs_inode_t     *ip = NULL;
+        int                     error;
-        xfs_perag_t     *pag = &mp->m_perag[ag];
-        int             first_index = 0;
-        int             nr_found;
-        do {
-                /*
-                 * use a gang lookup to find the next inode in the tree
-                 * as the tree is sparse and a gang lookup walks to find
-                 * the number of objects requested.
-                 */
-                read_lock(&pag->pag_ici_lock);
-                nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
-                                (void**)&ip, first_index, 1);
-                if (!nr_found) {
-                        read_unlock(&pag->pag_ici_lock);
-                        break;
-                }
-                /*
-                 * Update the index for the next lookup. Catch overflows
-                 * into the next AG range which can occur if we have inodes
-                 * in the last block of the AG and we are currently
-                 * pointing to the last inode.
-                 */
-                first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
-                if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
-                        read_unlock(&pag->pag_ici_lock);
-                        break;
-                }
-                /* skip quota inodes */
-                if (ip == XFS_QI_UQIP(mp) || ip == XFS_QI_GQIP(mp)) {
-                        ASSERT(ip->i_udquot == NULL);
-                        ASSERT(ip->i_gdquot == NULL);
-                        read_unlock(&pag->pag_ici_lock);
-                        continue;
-                }
-                /*
+        /* skip quota inodes */
-                 * If we can't get a reference on the inode, it must be
+        if (ip == XFS_QI_UQIP(ip->i_mount) || ip == XFS_QI_GQIP(ip->i_mount)) {
-                 * in reclaim. Leave it for the reclaim code to flush.
+                ASSERT(ip->i_udquot == NULL);
-                 */
+                ASSERT(ip->i_gdquot == NULL);
-                if (!igrab(VFS_I(ip))) {
-                        read_unlock(&pag->pag_ici_lock);
-                        continue;
-                }
                read_unlock(&pag->pag_ici_lock);
+                return 0;
+        }
-                /* avoid new inodes though we shouldn't find any here */
+        error = xfs_sync_inode_valid(ip, pag);
-                if (xfs_iflags_test(ip, XFS_INEW)) {
+        if (error)
-                        IRELE(ip);
+                return error;
-                        continue;
-                }
-                xfs_ilock(ip, XFS_ILOCK_EXCL);
+        xfs_ilock(ip, XFS_ILOCK_EXCL);
-                if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
+        if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
-                        xfs_qm_dqrele(ip->i_udquot);
+                xfs_qm_dqrele(ip->i_udquot);
-                        ip->i_udquot = NULL;
+                ip->i_udquot = NULL;
-                }
+        }
-                if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) &&
+        if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && ip->i_gdquot) {
-                    ip->i_gdquot) {
+                xfs_qm_dqrele(ip->i_gdquot);
-                        xfs_qm_dqrele(ip->i_gdquot);
+                ip->i_gdquot = NULL;
-                        ip->i_gdquot = NULL;
+        }
-                }
+        /*
+         * NOTE(review): xfs_iput() is assumed to drop the inode reference
+         * as well as the ILOCK, as the loop this replaces relied on after
+         * its igrab(); a further IRELE() here would release the inode
+         * twice.  Confirm xfs_sync_inode_valid() takes a single reference.
+         */
+        xfs_iput(ip, XFS_ILOCK_EXCL);
-                xfs_iput(ip, XFS_ILOCK_EXCL);
-        } while (nr_found);
+        return 0;
 }
 /*
 * Go thru all the inodes in the file system, releasing their dquots.
+ *
 * Note that the mount structure gets modified to indicate that quotas are off
- * AFTER this, in the case of quotaoff. This also gets called from
+ * AFTER this, in the case of quotaoff.
- * xfs_rootumount.
 */
 void
 xfs_qm_dqrele_all_inodes(
        struct xfs_mount *mp,
        uint             flags)
 {
-        int             i;
        ASSERT(mp->m_quotainfo);
-        for (i = 0; i < mp->m_sb.sb_agcount; i++) {
+        xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG);
-                if (!mp->m_perag[i].pag_ici_init)
-                        continue;
-                xfs_qm_dqrele_inodes_ag(mp, i, flags);
-        }
 }
 /*------------------------------------------------------------------------*/
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 447173bcf96d..97ac9640be98 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -42,7 +42,6 @@
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_rw.h"
-#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_priv.h"
@@ -111,7 +110,7 @@ xfs_trans_log_dquot(
 * Carry forward whatever is left of the quota blk reservation to
 * the spanky new transaction
 */
-STATIC void
+void
 xfs_trans_dup_dqinfo(
        xfs_trans_t     *otp,
        xfs_trans_t     *ntp)
@@ -167,19 +166,17 @@ xfs_trans_dup_dqinfo(
 /*
 * Wrap around mod_dquot to account for both user and group quotas.
 */
-STATIC void
+void
 xfs_trans_mod_dquot_byino(
        xfs_trans_t     *tp,
        xfs_inode_t     *ip,
        uint            field,
        long            delta)
 {
-        xfs_mount_t     *mp;
+        xfs_mount_t     *mp = tp->t_mountp;
-        ASSERT(tp);
-        mp = tp->t_mountp;
-        if (!XFS_IS_QUOTA_ON(mp) ||
+        if (!XFS_IS_QUOTA_RUNNING(mp) ||
+            !XFS_IS_QUOTA_ON(mp) ||
            ip->i_ino == mp->m_sb.sb_uquotino ||
            ip->i_ino == mp->m_sb.sb_gquotino)
                return;
@@ -229,6 +226,7 @@ xfs_trans_mod_dquot(
        xfs_dqtrx_t     *qtrx;
        ASSERT(tp);
+        ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp));
        qtrx = NULL;
        if (tp->t_dqinfo == NULL)
@@ -346,7 +344,7 @@ xfs_trans_dqlockedjoin(
 * Unreserve just the reservations done by this transaction.
 * dquot is still left locked at exit.
 */
-STATIC void
+void
 xfs_trans_apply_dquot_deltas(
        xfs_trans_t             *tp)
 {
@@ -357,7 +355,7 @@ xfs_trans_apply_dquot_deltas(
        long                    totalbdelta;
        long                    totalrtbdelta;
-        if (! (tp->t_flags & XFS_TRANS_DQ_DIRTY))
+        if (!(tp->t_flags & XFS_TRANS_DQ_DIRTY))
                return;
        ASSERT(tp->t_dqinfo);
@@ -531,7 +529,7 @@ xfs_trans_apply_dquot_deltas(
 * we simply throw those away, since that's the expected behavior
 * when a transaction is curtailed without a commit.
 */
-STATIC void
+void
 xfs_trans_unreserve_and_mod_dquots(
        xfs_trans_t             *tp)
 {
@@ -768,7 +766,7 @@ xfs_trans_reserve_quota_bydquots(
 {
        int             resvd = 0, error;
-        if (!XFS_IS_QUOTA_ON(mp))
+        if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
                return 0;
        if (tp && tp->t_dqinfo == NULL)
@@ -811,18 +809,17 @@ xfs_trans_reserve_quota_bydquots(
 * This doesn't change the actual usage, just the reservation.
 * The inode sent in is locked.
 */
-STATIC int
+int
 xfs_trans_reserve_quota_nblks(
-        xfs_trans_t     *tp,
+        struct xfs_trans        *tp,
-        xfs_mount_t     *mp,
+        struct xfs_inode        *ip,
-        xfs_inode_t     *ip,
+        long                    nblks,
-        long            nblks,
+        long                    ninos,
-        long            ninos,
+        uint                    flags)
-        uint            flags)
 {
-        int             error;
+        struct xfs_mount        *mp = ip->i_mount;
-        if (!XFS_IS_QUOTA_ON(mp))
+        if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
                return 0;
        if (XFS_IS_PQUOTA_ON(mp))
                flags |= XFS_QMOPT_ENOSPC;
@@ -831,7 +828,6 @@ xfs_trans_reserve_quota_nblks(
        ASSERT(ip->i_ino != mp->m_sb.sb_gquotino);
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-        ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount));
        ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
                                XFS_TRANS_DQ_RES_RTBLKS ||
               (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
@@ -840,11 +836,9 @@ xfs_trans_reserve_quota_nblks(
        /*
         * Reserve nblks against these dquots, with trans as the mediator.
         */
-        error = xfs_trans_reserve_quota_bydquots(tp, mp,
+        return xfs_trans_reserve_quota_bydquots(tp, mp,
-                                                 ip->i_udquot, ip->i_gdquot,
+                                                ip->i_udquot, ip->i_gdquot,
-                                                 nblks, ninos,
+                                                nblks, ninos, flags);
-                                                 flags);
-        return error;
 }
 /*
@@ -895,25 +889,15 @@ STATIC void
 xfs_trans_alloc_dqinfo(
        xfs_trans_t     *tp)
 {
-        (tp)->t_dqinfo = kmem_zone_zalloc(xfs_Gqm->qm_dqtrxzone, KM_SLEEP);
+        tp->t_dqinfo = kmem_zone_zalloc(xfs_Gqm->qm_dqtrxzone, KM_SLEEP);
 }
-STATIC void
+void
 xfs_trans_free_dqinfo(
        xfs_trans_t     *tp)
 {
        if (!tp->t_dqinfo)
                return;
-        kmem_zone_free(xfs_Gqm->qm_dqtrxzone, (tp)->t_dqinfo);
+        kmem_zone_free(xfs_Gqm->qm_dqtrxzone, tp->t_dqinfo);
-        (tp)->t_dqinfo = NULL;
+        tp->t_dqinfo = NULL;
 }
-xfs_dqtrxops_t  xfs_trans_dquot_ops = {
-        .qo_dup_dqinfo                  = xfs_trans_dup_dqinfo,
-        .qo_free_dqinfo                 = xfs_trans_free_dqinfo,
-        .qo_mod_dquot_byino             = xfs_trans_mod_dquot_byino,
-        .qo_apply_dquot_deltas          = xfs_trans_apply_dquot_deltas,
-        .qo_reserve_quota_nblks         = xfs_trans_reserve_quota_nblks,
-        .qo_reserve_quota_bydquots      = xfs_trans_reserve_quota_bydquots,
-        .qo_unreserve_and_mod_dquots    = xfs_trans_unreserve_and_mod_dquots,
-};
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
deleted file mode 100644
index a8cdd73999a4..000000000000
--- a/fs/xfs/xfs_acl.c
+++ /dev/null
@@ -1,874 +0,0 @@
-/*
- * Copyright (c) 2001-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_bit.h"
-#include "xfs_inum.h"
-#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_acl.h"
-#include "xfs_attr.h"
-#include "xfs_vnodeops.h"
-#include <linux/capability.h>
-#include <linux/posix_acl_xattr.h>
-STATIC int      xfs_acl_setmode(struct inode *, xfs_acl_t *, int *);
-STATIC void     xfs_acl_filter_mode(mode_t, xfs_acl_t *);
-STATIC void     xfs_acl_get_endian(xfs_acl_t *);
-STATIC int      xfs_acl_access(uid_t, gid_t, xfs_acl_t *, mode_t, cred_t *);
-STATIC int      xfs_acl_invalid(xfs_acl_t *);
-STATIC void     xfs_acl_sync_mode(mode_t, xfs_acl_t *);
-STATIC void     xfs_acl_get_attr(struct inode *, xfs_acl_t *, int, int, int *);
-STATIC void     xfs_acl_set_attr(struct inode *, xfs_acl_t *, int, int *);
-STATIC int      xfs_acl_allow_set(struct inode *, int);
-kmem_zone_t *xfs_acl_zone;
-/*
- * Test for existence of access ACL attribute as efficiently as possible.
- */
-int
-xfs_acl_vhasacl_access(
-        struct inode    *vp)
-{
-        int             error;
-        xfs_acl_get_attr(vp, NULL, _ACL_TYPE_ACCESS, ATTR_KERNOVAL, &error);
-        return (error == 0);
-}
-/*
- * Test for existence of default ACL attribute as efficiently as possible.
- */
-int
-xfs_acl_vhasacl_default(
-        struct inode    *vp)
-{
-        int             error;
-        if (!S_ISDIR(vp->i_mode))
-                return 0;
-        xfs_acl_get_attr(vp, NULL, _ACL_TYPE_DEFAULT, ATTR_KERNOVAL, &error);
-        return (error == 0);
-}
-/*
- * Convert from extended attribute representation to in-memory for XFS.
- */
-STATIC int
-posix_acl_xattr_to_xfs(
-        posix_acl_xattr_header  *src,
-        size_t                  size,
-        xfs_acl_t               *dest)
-{
-        posix_acl_xattr_entry   *src_entry;
-        xfs_acl_entry_t         *dest_entry;
-        int                     n;
-        if (!src || !dest)
-                return EINVAL;
-        if (size < sizeof(posix_acl_xattr_header))
-                return EINVAL;
-        if (src->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
-                return EOPNOTSUPP;
-        memset(dest, 0, sizeof(xfs_acl_t));
-        dest->acl_cnt = posix_acl_xattr_count(size);
-        if (dest->acl_cnt < 0 || dest->acl_cnt > XFS_ACL_MAX_ENTRIES)
-                return EINVAL;
-        /*
-         * acl_set_file(3) may request that we set default ACLs with
-         * zero length -- defend (gracefully) against that here.
-         */
-        if (!dest->acl_cnt)
-                return 0;
-        src_entry = (posix_acl_xattr_entry *)((char *)src + sizeof(*src));
-        dest_entry = &dest->acl_entry[0];
-        for (n = 0; n < dest->acl_cnt; n++, src_entry++, dest_entry++) {
-                dest_entry->ae_perm = le16_to_cpu(src_entry->e_perm);
-                if (_ACL_PERM_INVALID(dest_entry->ae_perm))
-                        return EINVAL;
-                dest_entry->ae_tag  = le16_to_cpu(src_entry->e_tag);
-                switch(dest_entry->ae_tag) {
-                case ACL_USER:
-                case ACL_GROUP:
-                        dest_entry->ae_id = le32_to_cpu(src_entry->e_id);
-                        break;
-                case ACL_USER_OBJ:
-                case ACL_GROUP_OBJ:
-                case ACL_MASK:
-                case ACL_OTHER:
-                        dest_entry->ae_id = ACL_UNDEFINED_ID;
-                        break;
-                default:
-                        return EINVAL;
-                }
-        }
-        if (xfs_acl_invalid(dest))
-                return EINVAL;
-        return 0;
-}
-/*
- * Comparison function called from xfs_sort().
- * Primary key is ae_tag, secondary key is ae_id.
- */
-STATIC int
-xfs_acl_entry_compare(
-        const void      *va,
-        const void      *vb)
-{
-        xfs_acl_entry_t *a = (xfs_acl_entry_t *)va,
-                        *b = (xfs_acl_entry_t *)vb;
-        if (a->ae_tag == b->ae_tag)
-                return (a->ae_id - b->ae_id);
-        return (a->ae_tag - b->ae_tag);
-}
-/*
- * Convert from in-memory XFS to extended attribute representation.
- */
-STATIC int
-posix_acl_xfs_to_xattr(
-        xfs_acl_t               *src,
-        posix_acl_xattr_header  *dest,
-        size_t                  size)
-{
-        int                     n;
-        size_t                  new_size = posix_acl_xattr_size(src->acl_cnt);
-        posix_acl_xattr_entry   *dest_entry;
-        xfs_acl_entry_t         *src_entry;
-        if (size < new_size)
-                return -ERANGE;
-        /* Need to sort src XFS ACL by <ae_tag,ae_id> */
-        xfs_sort(src->acl_entry, src->acl_cnt, sizeof(src->acl_entry[0]),
-                 xfs_acl_entry_compare);
-        dest->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
-        dest_entry = &dest->a_entries[0];
-        src_entry = &src->acl_entry[0];
-        for (n = 0; n < src->acl_cnt; n++, dest_entry++, src_entry++) {
-                dest_entry->e_perm = cpu_to_le16(src_entry->ae_perm);
-                if (_ACL_PERM_INVALID(src_entry->ae_perm))
-                        return -EINVAL;
-                dest_entry->e_tag  = cpu_to_le16(src_entry->ae_tag);
-                switch (src_entry->ae_tag) {
-                case ACL_USER:
-                case ACL_GROUP:
-                        dest_entry->e_id = cpu_to_le32(src_entry->ae_id);
-                                break;
-                case ACL_USER_OBJ:
-                case ACL_GROUP_OBJ:
-                case ACL_MASK:
-                case ACL_OTHER:
-                        dest_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
-                        break;
-                default:
-                        return -EINVAL;
-                }
-        }
-        return new_size;
-}
-int
-xfs_acl_vget(
-        struct inode    *vp,
-        void            *acl,
-        size_t          size,
-        int             kind)
-{
-        int                     error;
-        xfs_acl_t               *xfs_acl = NULL;
-        posix_acl_xattr_header  *ext_acl = acl;
-        int                     flags = 0;
-        if(size) {
-                if (!(_ACL_ALLOC(xfs_acl))) {
-                        error = ENOMEM;
-                        goto out;
-                }
-                memset(xfs_acl, 0, sizeof(xfs_acl_t));
-        } else
-                flags = ATTR_KERNOVAL;
-        xfs_acl_get_attr(vp, xfs_acl, kind, flags, &error);
-        if (error)
-                goto out;
-        if (!size) {
-                error = -posix_acl_xattr_size(XFS_ACL_MAX_ENTRIES);
-        } else {
-                if (xfs_acl_invalid(xfs_acl)) {
-                        error = EINVAL;
-                        goto out;
-                }
-                if (kind == _ACL_TYPE_ACCESS)
-                        xfs_acl_sync_mode(XFS_I(vp)->i_d.di_mode, xfs_acl);
-                error = -posix_acl_xfs_to_xattr(xfs_acl, ext_acl, size);
-        }
-out:
-        if(xfs_acl)
-                _ACL_FREE(xfs_acl);
-        return -error;
-}
-int
-xfs_acl_vremove(
-        struct inode    *vp,
-        int             kind)
-{
-        int             error;
-        error = xfs_acl_allow_set(vp, kind);
-        if (!error) {
-                error = xfs_attr_remove(XFS_I(vp),
-                                                kind == _ACL_TYPE_DEFAULT?
-                                                SGI_ACL_DEFAULT: SGI_ACL_FILE,
-                                                ATTR_ROOT);
-                if (error == ENOATTR)
-                        error = 0;      /* 'scool */
-        }
-        return -error;
-}
-int
-xfs_acl_vset(
-        struct inode            *vp,
-        void                    *acl,
-        size_t                  size,
-        int                     kind)
-{
-        posix_acl_xattr_header  *ext_acl = acl;
-        xfs_acl_t               *xfs_acl;
-        int                     error;
-        int                     basicperms = 0; /* more than std unix perms? */
-        if (!acl)
-                return -EINVAL;
-        if (!(_ACL_ALLOC(xfs_acl)))
-                return -ENOMEM;
-        error = posix_acl_xattr_to_xfs(ext_acl, size, xfs_acl);
-        if (error) {
-                _ACL_FREE(xfs_acl);
-                return -error;
-        }
-        if (!xfs_acl->acl_cnt) {
-                _ACL_FREE(xfs_acl);
-                return 0;
-        }
-        error = xfs_acl_allow_set(vp, kind);
-        /* Incoming ACL exists, set file mode based on its value */
-        if (!error && kind == _ACL_TYPE_ACCESS)
-                error = xfs_acl_setmode(vp, xfs_acl, &basicperms);
-        if (error)
-                goto out;
-        /*
-         * If we have more than std unix permissions, set up the actual attr.
-         * Otherwise, delete any existing attr.  This prevents us from
-         * having actual attrs for permissions that can be stored in the
-         * standard permission bits.
-         */
-        if (!basicperms) {
-                xfs_acl_set_attr(vp, xfs_acl, kind, &error);
-        } else {
-                error = -xfs_acl_vremove(vp, _ACL_TYPE_ACCESS);
-        }
-out:
-        _ACL_FREE(xfs_acl);
-        return -error;
-}
-int
-xfs_acl_iaccess(
-        xfs_inode_t     *ip,
-        mode_t          mode,
-        cred_t          *cr)
-{
-        xfs_acl_t       *acl;
-        int             rval;
-        struct xfs_name acl_name = {SGI_ACL_FILE, SGI_ACL_FILE_SIZE};
-        if (!(_ACL_ALLOC(acl)))
-                return -1;
-        /* If the file has no ACL return -1. */
-        rval = sizeof(xfs_acl_t);
-        if (xfs_attr_fetch(ip, &acl_name, (char *)acl, &rval, ATTR_ROOT)) {
-                _ACL_FREE(acl);
-                return -1;
-        }
-        xfs_acl_get_endian(acl);
-        /* If the file has an empty ACL return -1. */
-        if (acl->acl_cnt == XFS_ACL_NOT_PRESENT) {
-                _ACL_FREE(acl);
-                return -1;
-        }
-        /* Synchronize ACL with mode bits */
-        xfs_acl_sync_mode(ip->i_d.di_mode, acl);
-        rval = xfs_acl_access(ip->i_d.di_uid, ip->i_d.di_gid, acl, mode, cr);
-        _ACL_FREE(acl);
-        return rval;
-}
-STATIC int
-xfs_acl_allow_set(
-        struct inode    *vp,
-        int             kind)
-{
-        if (vp->i_flags & (S_IMMUTABLE|S_APPEND))
-                return EPERM;
-        if (kind == _ACL_TYPE_DEFAULT && !S_ISDIR(vp->i_mode))
-                return ENOTDIR;
-        if (vp->i_sb->s_flags & MS_RDONLY)
-                return EROFS;
-        if (XFS_I(vp)->i_d.di_uid != current_fsuid() && !capable(CAP_FOWNER))
-                return EPERM;
-        return 0;
-}
-/*
- * Note: cr is only used here for the capability check if the ACL test fails.
- *       It is not used to find out the credentials uid or groups etc, as was
- *       done in IRIX. It is assumed that the uid and groups for the current
- *       thread are taken from "current" instead of the cr parameter.
- */
-STATIC int
-xfs_acl_access(
-        uid_t           fuid,
-        gid_t           fgid,
-        xfs_acl_t       *fap,
-        mode_t          md,
-        cred_t          *cr)
-{
-        xfs_acl_entry_t matched;
-        int             i, allows;
-        int             maskallows = -1;        /* true, but not 1, either */
-        int             seen_userobj = 0;
-        matched.ae_tag = 0;     /* Invalid type */
-        matched.ae_perm = 0;
-        for (i = 0; i < fap->acl_cnt; i++) {
-                /*
-                 * Break out if we've got a user_obj entry or
-                 * a user entry and the mask (and have processed USER_OBJ)
-                 */
-                if (matched.ae_tag == ACL_USER_OBJ)
-                        break;
-                if (matched.ae_tag == ACL_USER) {
-                        if (maskallows != -1 && seen_userobj)
-                                break;
-                        if (fap->acl_entry[i].ae_tag != ACL_MASK &&
-                            fap->acl_entry[i].ae_tag != ACL_USER_OBJ)
-                                continue;
-                }
-                /* True if this entry allows the requested access */
-                allows = ((fap->acl_entry[i].ae_perm & md) == md);
-                switch (fap->acl_entry[i].ae_tag) {
-                case ACL_USER_OBJ:
-                        seen_userobj = 1;
-                        if (fuid != current_fsuid())
-                                continue;
-                        matched.ae_tag = ACL_USER_OBJ;
-                        matched.ae_perm = allows;
-                        break;
-                case ACL_USER:
-                        if (fap->acl_entry[i].ae_id != current_fsuid())
-                                continue;
-                        matched.ae_tag = ACL_USER;
-                        matched.ae_perm = allows;
-                        break;
-                case ACL_GROUP_OBJ:
-                        if ((matched.ae_tag == ACL_GROUP_OBJ ||
-                            matched.ae_tag == ACL_GROUP) && !allows)
-                                continue;
-                        if (!in_group_p(fgid))
-                                continue;
-                        matched.ae_tag = ACL_GROUP_OBJ;
-                        matched.ae_perm = allows;
-                        break;
-                case ACL_GROUP:
-                        if ((matched.ae_tag == ACL_GROUP_OBJ ||
-                            matched.ae_tag == ACL_GROUP) && !allows)
-                                continue;
-                        if (!in_group_p(fap->acl_entry[i].ae_id))
-                                continue;
-                        matched.ae_tag = ACL_GROUP;
-                        matched.ae_perm = allows;
-                        break;
-                case ACL_MASK:
-                        maskallows = allows;
-                        break;
-                case ACL_OTHER:
-                        if (matched.ae_tag != 0)
-                                continue;
-                        matched.ae_tag = ACL_OTHER;
-                        matched.ae_perm = allows;
-                        break;
-                }
-        }
-        /*
-         * First possibility is that no matched entry allows access.
-         * The capability to override DAC may exist, so check for it.
-         */
-        switch (matched.ae_tag) {
-        case ACL_OTHER:
-        case ACL_USER_OBJ:
-                if (matched.ae_perm)
-                        return 0;
-                break;
-        case ACL_USER:
-        case ACL_GROUP_OBJ:
-        case ACL_GROUP:
-                if (maskallows && matched.ae_perm)
-                        return 0;
-                break;
-        case 0:
-                break;
-        }
-        /* EACCES tells generic_permission to check for capability overrides */
-        return EACCES;
-}
-/*
- * ACL validity checker.
- *   This acl validation routine checks each ACL entry read in makes sense.
- */
-STATIC int
-xfs_acl_invalid(
-        xfs_acl_t       *aclp)
-{
-        xfs_acl_entry_t *entry, *e;
-        int             user = 0, group = 0, other = 0, mask = 0;
-        int             mask_required = 0;
-        int             i, j;
-        if (!aclp)
-                goto acl_invalid;
-        if (aclp->acl_cnt > XFS_ACL_MAX_ENTRIES)
-                goto acl_invalid;
-        for (i = 0; i < aclp->acl_cnt; i++) {
-                entry = &aclp->acl_entry[i];
-                switch (entry->ae_tag) {
-                case ACL_USER_OBJ:
-                        if (user++)
-                                goto acl_invalid;
-                        break;
-                case ACL_GROUP_OBJ:
-                        if (group++)
-                                goto acl_invalid;
-                        break;
-                case ACL_OTHER:
-                        if (other++)
-                                goto acl_invalid;
-                        break;
-                case ACL_USER:
-                case ACL_GROUP:
-                        for (j = i + 1; j < aclp->acl_cnt; j++) {
-                                e = &aclp->acl_entry[j];
-                                if (e->ae_id == entry->ae_id &&
-                                    e->ae_tag == entry->ae_tag)
-                                        goto acl_invalid;
-                        }
-                        mask_required++;
-                        break;
-                case ACL_MASK:
-                        if (mask++)
-                                goto acl_invalid;
-                        break;
-                default:
-                        goto acl_invalid;
-                }
-        }
-        if (!user || !group || !other || (mask_required && !mask))
-                goto acl_invalid;
-        else
-                return 0;
-acl_invalid:
-        return EINVAL;
-}
-/*
- * Do ACL endian conversion.
- */
-STATIC void
-xfs_acl_get_endian(
-        xfs_acl_t       *aclp)
-{
-        xfs_acl_entry_t *ace, *end;
-        INT_SET(aclp->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
-        end = &aclp->acl_entry[0]+aclp->acl_cnt;
-        for (ace = &aclp->acl_entry[0]; ace < end; ace++) {
-                INT_SET(ace->ae_tag, ARCH_CONVERT, ace->ae_tag);
-                INT_SET(ace->ae_id, ARCH_CONVERT, ace->ae_id);
-                INT_SET(ace->ae_perm, ARCH_CONVERT, ace->ae_perm);
-        }
-}
-/*
- * Get the ACL from the EA and do endian conversion.
- */
-STATIC void
-xfs_acl_get_attr(
-        struct inode    *vp,
-        xfs_acl_t       *aclp,
-        int             kind,
-        int             flags,
-        int             *error)
-{
-        int             len = sizeof(xfs_acl_t);
-        ASSERT((flags & ATTR_KERNOVAL) ? (aclp == NULL) : 1);
-        flags |= ATTR_ROOT;
-        *error = xfs_attr_get(XFS_I(vp),
-                                        kind == _ACL_TYPE_ACCESS ?
-                                        SGI_ACL_FILE : SGI_ACL_DEFAULT,
-                                        (char *)aclp, &len, flags);
-        if (*error || (flags & ATTR_KERNOVAL))
-                return;
-        xfs_acl_get_endian(aclp);
-}
-/*
- * Set the EA with the ACL and do endian conversion.
- */
-STATIC void
-xfs_acl_set_attr(
-        struct inode    *vp,
-        xfs_acl_t       *aclp,
-        int             kind,
-        int             *error)
-{
-        xfs_acl_entry_t *ace, *newace, *end;
-        xfs_acl_t       *newacl;
-        int             len;
-        if (!(_ACL_ALLOC(newacl))) {
-                *error = ENOMEM;
-                return;
-        }
-        len = sizeof(xfs_acl_t) -
-              (sizeof(xfs_acl_entry_t) * (XFS_ACL_MAX_ENTRIES - aclp->acl_cnt));
-        end = &aclp->acl_entry[0]+aclp->acl_cnt;
-        for (ace = &aclp->acl_entry[0], newace = &newacl->acl_entry[0];
-             ace < end;
-             ace++, newace++) {
-                INT_SET(newace->ae_tag, ARCH_CONVERT, ace->ae_tag);
-                INT_SET(newace->ae_id, ARCH_CONVERT, ace->ae_id);
-                INT_SET(newace->ae_perm, ARCH_CONVERT, ace->ae_perm);
-        }
-        INT_SET(newacl->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
-        *error = xfs_attr_set(XFS_I(vp),
-                                kind == _ACL_TYPE_ACCESS ?
-                                SGI_ACL_FILE: SGI_ACL_DEFAULT,
-                                (char *)newacl, len, ATTR_ROOT);
-        _ACL_FREE(newacl);
-}
-int
-xfs_acl_vtoacl(
-        struct inode    *vp,
-        xfs_acl_t       *access_acl,
-        xfs_acl_t       *default_acl)
-{
-        int             error = 0;
-        if (access_acl) {
-                /*
-                 * Get the Access ACL and the mode.  If either cannot
-                 * be obtained for some reason, invalidate the access ACL.
-                 */
-                xfs_acl_get_attr(vp, access_acl, _ACL_TYPE_ACCESS, 0, &error);
-                if (error)
-                        access_acl->acl_cnt = XFS_ACL_NOT_PRESENT;
-                else /* We have a good ACL and the file mode, synchronize. */
-                        xfs_acl_sync_mode(XFS_I(vp)->i_d.di_mode, access_acl);
-        }
-        if (default_acl) {
-                xfs_acl_get_attr(vp, default_acl, _ACL_TYPE_DEFAULT, 0, &error);
-                if (error)
-                        default_acl->acl_cnt = XFS_ACL_NOT_PRESENT;
-        }
-        return error;
-}
-/*
- * This function retrieves the parent directory's acl, processes it
- * and lets the child inherit the acl(s) that it should.
- */
-int
-xfs_acl_inherit(
-        struct inode    *vp,
-        mode_t          mode,
-        xfs_acl_t       *pdaclp)
-{
-        xfs_acl_t       *cacl;
-        int             error = 0;
-        int             basicperms = 0;
-        /*
-         * If the parent does not have a default ACL, or it's an
-         * invalid ACL, we're done.
-         */
-        if (!vp)
-                return 0;
-        if (!pdaclp || xfs_acl_invalid(pdaclp))
-                return 0;
-        /*
-         * Copy the default ACL of the containing directory to
-         * the access ACL of the new file and use the mode that
-         * was passed in to set up the correct initial values for
-         * the u::,g::[m::], and o:: entries.  This is what makes
-         * umask() "work" with ACL's.
-         */
-        if (!(_ACL_ALLOC(cacl)))
-                return ENOMEM;
-        memcpy(cacl, pdaclp, sizeof(xfs_acl_t));
-        xfs_acl_filter_mode(mode, cacl);
-        error = xfs_acl_setmode(vp, cacl, &basicperms);
-        if (error)
-                goto out_error;
-        /*
-         * Set the Default and Access ACL on the file.  The mode is already
-         * set on the file, so we don't need to worry about that.
-         *
-         * If the new file is a directory, its default ACL is a copy of
-         * the containing directory's default ACL.
-         */
-        if (S_ISDIR(vp->i_mode))
-                xfs_acl_set_attr(vp, pdaclp, _ACL_TYPE_DEFAULT, &error);
-        if (!error && !basicperms)
-                xfs_acl_set_attr(vp, cacl, _ACL_TYPE_ACCESS, &error);
-out_error:
-        _ACL_FREE(cacl);
-        return error;
-}
-/*
- * Set up the correct mode on the file based on the supplied ACL.  This
- * makes sure that the mode on the file reflects the state of the
- * u::,g::[m::], and o:: entries in the ACL.  Since the mode is where
- * the ACL is going to get the permissions for these entries, we must
- * synchronize the mode whenever we set the ACL on a file.
- */
-STATIC int
-xfs_acl_setmode(
-        struct inode    *vp,
-        xfs_acl_t       *acl,
-        int             *basicperms)
-{
-        struct iattr    iattr;
-        xfs_acl_entry_t *ap;
-        xfs_acl_entry_t *gap = NULL;
-        int             i, nomask = 1;
-        *basicperms = 1;
-        if (acl->acl_cnt == XFS_ACL_NOT_PRESENT)
-                return 0;
-        /*
-         * Copy the u::, g::, o::, and m:: bits from the ACL into the
-         * mode.  The m:: bits take precedence over the g:: bits.
-         */
-        iattr.ia_valid = ATTR_MODE;
-        iattr.ia_mode = XFS_I(vp)->i_d.di_mode;
-        iattr.ia_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO);
-        ap = acl->acl_entry;
-        for (i = 0; i < acl->acl_cnt; ++i) {
-                switch (ap->ae_tag) {
-                case ACL_USER_OBJ:
-                        iattr.ia_mode |= ap->ae_perm << 6;
-                        break;
-                case ACL_GROUP_OBJ:
-                        gap = ap;
-                        break;
-                case ACL_MASK:  /* more than just standard modes */
-                        nomask = 0;
-                        iattr.ia_mode |= ap->ae_perm << 3;
-                        *basicperms = 0;
-                        break;
-                case ACL_OTHER:
-                        iattr.ia_mode |= ap->ae_perm;
-                        break;
-                default:        /* more than just standard modes */
-                        *basicperms = 0;
-                        break;
-                }
-                ap++;
-        }
-        /* Set the group bits from ACL_GROUP_OBJ if there's no ACL_MASK */
-        if (gap && nomask)
-                iattr.ia_mode |= gap->ae_perm << 3;
-        return xfs_setattr(XFS_I(vp), &iattr, 0);
-}
-/*
- * The permissions for the special ACL entries (u::, g::[m::], o::) are
- * actually stored in the file mode (if there is both a group and a mask,
- * the group is stored in the ACL entry and the mask is stored on the file).
- * This allows the mode to remain automatically in sync with the ACL without
- * the need for a call-back to the ACL system at every point where the mode
- * could change.  This function takes the permissions from the specified mode
- * and places it in the supplied ACL.
- *
- * This implementation draws its validity from the fact that, when the ACL
- * was assigned, the mode was copied from the ACL.
- * If the mode did not change, therefore, the mode remains exactly what was
- * taken from the special ACL entries at assignment.
- * If a subsequent chmod() was done, the POSIX spec says that the change in
- * mode must cause an update to the ACL seen at user level and used for
- * access checks.  Before and after a mode change, therefore, the file mode
- * most accurately reflects what the special ACL entries should permit/deny.
- *
- * CAVEAT: If someone sets the SGI_ACL_FILE attribute directly,
- *         the existing mode bits will override whatever is in the
- *         ACL. Similarly, if there is a pre-existing ACL that was
- *         never in sync with its mode (owing to a bug in 6.5 and
- *         before), it will now magically (or mystically) be
- *         synchronized.  This could cause slight astonishment, but
- *         it is better than inconsistent permissions.
- *
- * The supplied ACL is a template that may contain any combination
- * of special entries.  These are treated as place holders when we fill
- * out the ACL.  This routine does not add or remove special entries, it
- * simply unites each special entry with its associated set of permissions.
- */
-STATIC void
-xfs_acl_sync_mode(
-        mode_t          mode,
-        xfs_acl_t       *acl)
-{
-        int             i, nomask = 1;
-        xfs_acl_entry_t *ap;
-        xfs_acl_entry_t *gap = NULL;
-        /*
-         * Set ACL entries. POSIX1003.1eD16 requires that the MASK
-         * be set instead of the GROUP entry, if there is a MASK.
-         */
-        for (ap = acl->acl_entry, i = 0; i < acl->acl_cnt; ap++, i++) {
-                switch (ap->ae_tag) {
-                case ACL_USER_OBJ:
-                        ap->ae_perm = (mode >> 6) & 0x7;
-                        break;
-                case ACL_GROUP_OBJ:
-                        gap = ap;
-                        break;
-                case ACL_MASK:
-                        nomask = 0;
-                        ap->ae_perm = (mode >> 3) & 0x7;
-                        break;
-                case ACL_OTHER:
-                        ap->ae_perm = mode & 0x7;
-                        break;
-                default:
-                        break;
-                }
-        }
-        /* Set the ACL_GROUP_OBJ if there's no ACL_MASK */
-        if (gap && nomask)
-                gap->ae_perm = (mode >> 3) & 0x7;
-}
-/*
- * When inheriting an Access ACL from a directory Default ACL,
- * the ACL bits are set to the intersection of the ACL default
- * permission bits and the file permission bits in mode. If there
- * are no permission bits on the file then we must not give them
- * the ACL. This is what what makes umask() work with ACLs.
- */
-STATIC void
-xfs_acl_filter_mode(
-        mode_t          mode,
-        xfs_acl_t       *acl)
-{
-        int             i, nomask = 1;
-        xfs_acl_entry_t *ap;
-        xfs_acl_entry_t *gap = NULL;
-        /*
-         * Set ACL entries. POSIX1003.1eD16 requires that the MASK
-         * be merged with GROUP entry, if there is a MASK.
-         */
-        for (ap = acl->acl_entry, i = 0; i < acl->acl_cnt; ap++, i++) {
-                switch (ap->ae_tag) {
-                case ACL_USER_OBJ:
-                        ap->ae_perm &= (mode >> 6) & 0x7;
-                        break;
-                case ACL_GROUP_OBJ:
-                        gap = ap;
-                        break;
-                case ACL_MASK:
-                        nomask = 0;
-                        ap->ae_perm &= (mode >> 3) & 0x7;
-                        break;
-                case ACL_OTHER:
-                        ap->ae_perm &= mode & 0x7;
-                        break;
-                default:
-                        break;
-                }
-        }
-        /* Set the ACL_GROUP_OBJ if there's no ACL_MASK */
-        if (gap && nomask)
-                gap->ae_perm &= (mode >> 3) & 0x7;
-}
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 642f1db4def4..63dc1f2efad5 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -18,81 +18,48 @@
 #ifndef __XFS_ACL_H__
 #define __XFS_ACL_H__
-/*
+struct inode;
- * Access Control Lists
+struct posix_acl;
- */
+struct xfs_inode;
-typedef __uint16_t      xfs_acl_perm_t;
-typedef __int32_t       xfs_acl_tag_t;
-typedef __int32_t       xfs_acl_id_t;
 #define XFS_ACL_MAX_ENTRIES 25
 #define XFS_ACL_NOT_PRESENT (-1)
-typedef struct xfs_acl_entry {
+/* On-disk XFS access control list structure */
-        xfs_acl_tag_t   ae_tag;
+struct xfs_acl {
-        xfs_acl_id_t    ae_id;
+        __be32          acl_cnt;
-        xfs_acl_perm_t  ae_perm;
+        struct xfs_acl_entry {
-} xfs_acl_entry_t;
+                __be32  ae_tag;
+                __be32  ae_id;
-typedef struct xfs_acl {
+                __be16  ae_perm;
-        __int32_t       acl_cnt;
+        } acl_entry[XFS_ACL_MAX_ENTRIES];
-        xfs_acl_entry_t acl_entry[XFS_ACL_MAX_ENTRIES];
+};
-} xfs_acl_t;
 /* On-disk XFS extended attribute names */
-#define SGI_ACL_FILE    "SGI_ACL_FILE"
+#define SGI_ACL_FILE            "SGI_ACL_FILE"
-#define SGI_ACL_DEFAULT "SGI_ACL_DEFAULT"
+#define SGI_ACL_DEFAULT         "SGI_ACL_DEFAULT"
 #define SGI_ACL_FILE_SIZE       (sizeof(SGI_ACL_FILE)-1)
 #define SGI_ACL_DEFAULT_SIZE    (sizeof(SGI_ACL_DEFAULT)-1)
-#define _ACL_TYPE_ACCESS        1
-#define _ACL_TYPE_DEFAULT       2
 #ifdef CONFIG_XFS_POSIX_ACL
+extern int xfs_check_acl(struct inode *inode, int mask);
+extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
+extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl);
+extern int xfs_acl_chmod(struct inode *inode);
+extern void xfs_inode_init_acls(struct xfs_inode *ip);
+extern void xfs_inode_clear_acls(struct xfs_inode *ip);
+extern int posix_acl_access_exists(struct inode *inode);
+extern int posix_acl_default_exists(struct inode *inode);
-struct vattr;
+extern struct xattr_handler xfs_xattr_system_handler;
-struct xfs_inode;
-extern struct kmem_zone *xfs_acl_zone;
-#define xfs_acl_zone_init(zone, name)   \
-                (zone) = kmem_zone_init(sizeof(xfs_acl_t), (name))
-#define xfs_acl_zone_destroy(zone)      kmem_zone_destroy(zone)
-extern int xfs_acl_inherit(struct inode *, mode_t mode, xfs_acl_t *);
-extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *);
-extern int xfs_acl_vtoacl(struct inode *, xfs_acl_t *, xfs_acl_t *);
-extern int xfs_acl_vhasacl_access(struct inode *);
-extern int xfs_acl_vhasacl_default(struct inode *);
-extern int xfs_acl_vset(struct inode *, void *, size_t, int);
-extern int xfs_acl_vget(struct inode *, void *, size_t, int);
-extern int xfs_acl_vremove(struct inode *, int);
-#define _ACL_PERM_INVALID(perm) ((perm) & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE))
-#define _ACL_INHERIT(c,m,d)     (xfs_acl_inherit(c,m,d))
-#define _ACL_GET_ACCESS(pv,pa)  (xfs_acl_vtoacl(pv,pa,NULL) == 0)
-#define _ACL_GET_DEFAULT(pv,pd) (xfs_acl_vtoacl(pv,NULL,pd) == 0)
-#define _ACL_ACCESS_EXISTS      xfs_acl_vhasacl_access
-#define _ACL_DEFAULT_EXISTS     xfs_acl_vhasacl_default
-#define _ACL_ALLOC(a)           ((a) = kmem_zone_alloc(xfs_acl_zone, KM_SLEEP))
-#define _ACL_FREE(a)            ((a)? kmem_zone_free(xfs_acl_zone, (a)):(void)0)
 #else
-#define xfs_acl_zone_init(zone,name)
+# define xfs_check_acl                                  NULL
-#define xfs_acl_zone_destroy(zone)
+# define xfs_get_acl(inode, type)                       NULL
-#define xfs_acl_vset(v,p,sz,t)  (-EOPNOTSUPP)
+# define xfs_inherit_acl(inode, default_acl)            0
-#define xfs_acl_vget(v,p,sz,t)  (-EOPNOTSUPP)
+# define xfs_acl_chmod(inode)                           0
-#define xfs_acl_vremove(v,t)    (-EOPNOTSUPP)
+# define xfs_inode_init_acls(ip)
-#define xfs_acl_vhasacl_access(v)       (0)
+# define xfs_inode_clear_acls(ip)
-#define xfs_acl_vhasacl_default(v)      (0)
+# define posix_acl_access_exists(inode)                 0
-#define _ACL_ALLOC(a)           (1)     /* successfully allocate nothing */
+# define posix_acl_default_exists(inode)                0
-#define _ACL_FREE(a)            ((void)0)
+#endif /* CONFIG_XFS_POSIX_ACL */
-#define _ACL_INHERIT(c,m,d)     (0)
-#define _ACL_GET_ACCESS(pv,pa)  (0)
-#define _ACL_GET_DEFAULT(pv,pd) (0)
-#define _ACL_ACCESS_EXISTS      (NULL)
-#define _ACL_DEFAULT_EXISTS     (NULL)
-#endif
 #endif  /* __XFS_ACL_H__ */
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index c8641f713caa..f24b50b68d03 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -212,6 +212,8 @@ typedef struct xfs_perag
 /*
 * tags for inode radix tree
 */
+#define XFS_ICI_NO_TAG          (-1)    /* special flag for an untagged lookup
+                                           in xfs_inode_ag_iterator */
 #define XFS_ICI_RECLAIM_TAG     0       /* inode is to be reclaimed */
 #define XFS_AG_MAXLEVELS(mp)            ((mp)->m_ag_maxlevels)
diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h
index 53d5e70d1360..0902249354a0 100644
--- a/fs/xfs/xfs_arch.h
+++ b/fs/xfs/xfs_arch.h
@@ -73,28 +73,6 @@ static inline void be64_add_cpu(__be64 *a, __s64 b)
 #endif  /* __KERNEL__ */
-/* do we need conversion? */
-#define ARCH_NOCONVERT 1
-#ifdef XFS_NATIVE_HOST
-# define ARCH_CONVERT   ARCH_NOCONVERT
-#else
-# define ARCH_CONVERT   0
-#endif
-/* generic swapping macros */
-#ifndef HAVE_SWABMACROS
-#define INT_SWAP16(type,var) ((typeof(type))(__swab16((__u16)(var))))
-#define INT_SWAP32(type,var) ((typeof(type))(__swab32((__u32)(var))))
-#define INT_SWAP64(type,var) ((typeof(type))(__swab64((__u64)(var))))
-#endif
-#define INT_SWAP(type, var) \
-    ((sizeof(type) == 8) ? INT_SWAP64(type,var) : \
-    ((sizeof(type) == 4) ? INT_SWAP32(type,var) : \
-    ((sizeof(type) == 2) ? INT_SWAP16(type,var) : \
-    (var))))
 /*
 * get and set integers from potentially unaligned locations
 */
@@ -107,16 +85,6 @@ static inline void be64_add_cpu(__be64 *a, __s64 b)
        ((__u8*)(pointer))[1] = (((value)     ) & 0xff); \
    }
-/* does not return a value */
-#define INT_SET(reference,arch,valueref) \
-    (__builtin_constant_p(valueref) ? \
-        (void)( (reference) = ( ((arch) != ARCH_NOCONVERT) ? (INT_SWAP((reference),(valueref))) : (valueref)) ) : \
-        (void)( \
-            ((reference) = (valueref)), \
-            ( ((arch) != ARCH_NOCONVERT) ? (reference) = INT_SWAP((reference),(reference)) : 0 ) \
-        ) \
-    )
 /*
 * In directories inode numbers are stored as unaligned arrays of unsigned
 * 8bit integers on disk.
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 5fde1654b430..db15feb906ff 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -45,7 +45,6 @@
 #include "xfs_error.h"
 #include "xfs_quota.h"
 #include "xfs_trans_space.h"
-#include "xfs_acl.h"
 #include "xfs_rw.h"
 #include "xfs_vnodeops.h"
@@ -249,8 +248,9 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
        /*
         * Attach the dquots to the inode.
         */
-        if ((error = XFS_QM_DQATTACH(mp, dp, 0)))
+        error = xfs_qm_dqattach(dp, 0);
-                return (error);
+        if (error)
+                return error;
        /*
         * If the inode doesn't have an attribute fork, add one.
@@ -311,7 +311,7 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
        }
        xfs_ilock(dp, XFS_ILOCK_EXCL);
-        error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, args.trans, dp, args.total, 0,
+        error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0,
                                rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
                                       XFS_QMOPT_RES_REGBLKS);
        if (error) {
@@ -501,8 +501,9 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
        /*
         * Attach the dquots to the inode.
         */
-        if ((error = XFS_QM_DQATTACH(mp, dp, 0)))
+        error = xfs_qm_dqattach(dp, 0);
-                return (error);
+        if (error)
+                return error;
        /*
         * Start our first transaction of the day.
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index ca7c6005a487..7928b9983c1d 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2691,7 +2691,7 @@ xfs_bmap_rtalloc(
                 * Adjust the disk quota also. This was reserved
                 * earlier.
                 */
-                XFS_TRANS_MOD_DQUOT_BYINO(mp, ap->tp, ap->ip,
+                xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
                        ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
                                        XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
        } else {
@@ -2995,7 +2995,7 @@ xfs_bmap_btalloc(
                 * Adjust the disk quota also. This was reserved
                 * earlier.
                 */
-                XFS_TRANS_MOD_DQUOT_BYINO(mp, ap->tp, ap->ip,
+                xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
                        ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT :
                                        XFS_TRANS_DQ_BCOUNT,
                        (long) args.len);
@@ -3066,7 +3066,7 @@ xfs_bmap_btree_to_extents(
                return error;
        xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
        ip->i_d.di_nblocks--;
-        XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+        xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
        xfs_trans_binval(tp, cbp);
        if (cur->bc_bufs[0] == cbp)
                cur->bc_bufs[0] = NULL;
@@ -3386,7 +3386,7 @@ xfs_bmap_del_extent(
         * Adjust quota data.
         */
        if (qfield)
-                XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, qfield, (long)-nblks);
+                xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
        /*
         * Account for change in delayed indirect blocks.
@@ -3523,7 +3523,7 @@ xfs_bmap_extents_to_btree(
        *firstblock = cur->bc_private.b.firstblock = args.fsbno;
        cur->bc_private.b.allocated++;
        ip->i_d.di_nblocks++;
-        XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
+        xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
        abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
        /*
         * Fill in the child block.
@@ -3690,7 +3690,7 @@ xfs_bmap_local_to_extents(
                XFS_BMAP_TRACE_POST_UPDATE("new", ip, 0, whichfork);
                XFS_IFORK_NEXT_SET(ip, whichfork, 1);
                ip->i_d.di_nblocks = 1;
-                XFS_TRANS_MOD_DQUOT_BYINO(args.mp, tp, ip,
+                xfs_trans_mod_dquot_byino(tp, ip,
                        XFS_TRANS_DQ_BCOUNT, 1L);
                flags |= xfs_ilog_fext(whichfork);
        } else {
@@ -4048,7 +4048,7 @@ xfs_bmap_add_attrfork(
                        XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT)))
                goto error0;
        xfs_ilock(ip, XFS_ILOCK_EXCL);
-        error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, blks, 0, rsvd ?
+        error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
                        XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
                        XFS_QMOPT_RES_REGBLKS);
        if (error) {
@@ -4983,10 +4983,11 @@ xfs_bmapi(
                                 * adjusted later.  We return if we haven't
                                 * allocated blocks already inside this loop.
                                 */
-                                if ((error = XFS_TRANS_RESERVE_QUOTA_NBLKS(
+                                error = xfs_trans_reserve_quota_nblks(
-                                                mp, NULL, ip, (long)alen, 0,
+                                                NULL, ip, (long)alen, 0,
                                                rt ? XFS_QMOPT_RES_RTBLKS :
-                                                     XFS_QMOPT_RES_REGBLKS))) {
+                                                     XFS_QMOPT_RES_REGBLKS);
+                                if (error) {
                                        if (n == 0) {
                                                *nmap = 0;
                                                ASSERT(cur == NULL);
@@ -5035,8 +5036,8 @@ xfs_bmapi(
                                        if (XFS_IS_QUOTA_ON(mp))
                                                /* unreserve the blocks now */
                                                (void)
-                                                XFS_TRANS_UNRESERVE_QUOTA_NBLKS(
+                                                xfs_trans_unreserve_quota_nblks(
-                                                        mp, NULL, ip,
+                                                        NULL, ip,
                                                        (long)alen, 0, rt ?
                                                        XFS_QMOPT_RES_RTBLKS :
                                                        XFS_QMOPT_RES_REGBLKS);
@@ -5691,14 +5692,14 @@ xfs_bunmapi(
                                do_div(rtexts, mp->m_sb.sb_rextsize);
                                xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
                                                (int64_t)rtexts, rsvd);
-                                (void)XFS_TRANS_RESERVE_QUOTA_NBLKS(mp,
+                                (void)xfs_trans_reserve_quota_nblks(NULL,
-                                        NULL, ip, -((long)del.br_blockcount), 0,
+                                        ip, -((long)del.br_blockcount), 0,
                                        XFS_QMOPT_RES_RTBLKS);
                        } else {
                                xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS,
                                                (int64_t)del.br_blockcount, rsvd);
-                                (void)XFS_TRANS_RESERVE_QUOTA_NBLKS(mp,
+                                (void)xfs_trans_reserve_quota_nblks(NULL,
-                                        NULL, ip, -((long)del.br_blockcount), 0,
+                                        ip, -((long)del.br_blockcount), 0,
                                        XFS_QMOPT_RES_REGBLKS);
                        }
                        ip->i_delayed_blks -= del.br_blockcount;
@@ -6085,6 +6086,7 @@ xfs_getbmap(
                        break;
        }
+        kmem_free(out);
        return error;
 }
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 0760d352586f..5c1ade06578e 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -590,7 +590,7 @@ xfs_bmbt_alloc_block(
        cur->bc_private.b.allocated++;
        cur->bc_private.b.ip->i_d.di_nblocks++;
        xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
-        XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
+        xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip,
                        XFS_TRANS_DQ_BCOUNT, 1L);
        new->l = cpu_to_be64(args.fsbno);
@@ -618,7 +618,7 @@ xfs_bmbt_free_block(
        ip->i_d.di_nblocks--;
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-        XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+        xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
        xfs_trans_binval(tp, bp);
        return 0;
 }
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 6c87c8f304ef..edf8bdf4141f 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -542,10 +542,8 @@ xfs_filestream_associate(
         * waiting for the lock because someone else is waiting on the lock we
         * hold and we cannot drop that as we are in a transaction here.
         *
-         * Lucky for us, this inversion is rarely a problem because it's a
+         * Lucky for us, this inversion is not a problem because it's a
-         * directory inode that we are trying to lock here and that means the
+         * directory inode that we are trying to lock here.
-         * only place that matters is xfs_sync_inodes() and SYNC_DELWRI is
-         * used. i.e. freeze, remount-ro, quotasync or unmount.
         *
         * So, if we can't get the iolock without sleeping then just give up
         */
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index f7c06fac8229..c4ea51b55dce 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -239,10 +239,13 @@ typedef struct xfs_fsop_resblks {
 * Minimum and maximum sizes need for growth checks
 */
 #define XFS_MIN_AG_BLOCKS       64
-#define XFS_MIN_LOG_BLOCKS      512
+#define XFS_MIN_LOG_BLOCKS      512ULL
-#define XFS_MAX_LOG_BLOCKS      (64 * 1024)
+#define XFS_MAX_LOG_BLOCKS      (1024 * 1024ULL)
-#define XFS_MIN_LOG_BYTES       (256 * 1024)
+#define XFS_MIN_LOG_BYTES       (10 * 1024 * 1024ULL)
-#define XFS_MAX_LOG_BYTES       (128 * 1024 * 1024)
+/* keep the maximum size under 2^31 by a small amount */
+#define XFS_MAX_LOG_BYTES \
+        ((2 * 1024 * 1024 * 1024ULL) - XFS_MIN_LOG_BYTES)
 /*
 * Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 89b81eedce6a..76c540f719e4 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -18,6 +18,7 @@
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_types.h"
+#include "xfs_acl.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
@@ -82,6 +83,7 @@ xfs_inode_alloc(
        memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
        ip->i_size = 0;
        ip->i_new_size = 0;
+        xfs_inode_init_acls(ip);
        /*
         * Initialize inode's trace buffers.
@@ -500,10 +502,7 @@ xfs_ireclaim(
         * ilock one but will still hold the iolock.
         */
        xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-        /*
+        xfs_qm_dqdetach(ip);
-         * Release dquots (and their references) if any.
-         */
-        XFS_QM_DQDETACH(ip->i_mount, ip);
        xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
        switch (ip->i_d.di_mode & S_IFMT) {
@@ -561,6 +560,7 @@ xfs_ireclaim(
        ASSERT(atomic_read(&ip->i_pincount) == 0);
        ASSERT(!spin_is_locked(&ip->i_flags_lock));
        ASSERT(completion_done(&ip->i_flush));
+        xfs_inode_clear_acls(ip);
        kmem_zone_free(xfs_inode_zone, ip);
 }
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 123b20c8cbf2..1f22d65fed0a 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -49,7 +49,6 @@
 #include "xfs_utils.h"
 #include "xfs_dir2_trace.h"
 #include "xfs_quota.h"
-#include "xfs_acl.h"
 #include "xfs_filestream.h"
 #include "xfs_vnodeops.h"
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index f879c1bc4b96..77016702938b 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -18,6 +18,7 @@
 #ifndef __XFS_INODE_H__
 #define __XFS_INODE_H__
+struct posix_acl;
 struct xfs_dinode;
 struct xfs_inode;
@@ -272,6 +273,11 @@ typedef struct xfs_inode {
        /* VFS inode */
        struct inode            i_vnode;        /* embedded VFS inode */
+#ifdef CONFIG_XFS_POSIX_ACL
+        struct posix_acl        *i_acl;
+        struct posix_acl        *i_default_acl;
+#endif
        /* Trace buffers per inode. */
 #ifdef XFS_INODE_TRACE
        struct ktrace           *i_trace;       /* general inode trace */
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 5aaa2d7ec155..67ae5555a30a 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -42,7 +42,6 @@
 #include "xfs_error.h"
 #include "xfs_itable.h"
 #include "xfs_rw.h"
-#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_buf_item.h"
 #include "xfs_trans_space.h"
@@ -385,7 +384,7 @@ xfs_iomap_write_direct(
         * Make sure that the dquots are there. This doesn't hold
         * the ilock across a disk read.
         */
-        error = XFS_QM_DQATTACH(ip->i_mount, ip, XFS_QMOPT_ILOCKED);
+        error = xfs_qm_dqattach_locked(ip, 0);
        if (error)
                return XFS_ERROR(error);
@@ -444,8 +443,7 @@ xfs_iomap_write_direct(
        if (error)
                goto error_out;
-        error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip,
+        error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
-                                              qblocks, 0, quota_flag);
        if (error)
                goto error1;
@@ -495,7 +493,7 @@ xfs_iomap_write_direct(
 error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
        xfs_bmap_cancel(&free_list);
-        XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag);
+        xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
 error1: /* Just cancel transaction */
        xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
@@ -582,7 +580,7 @@ xfs_iomap_write_delay(
         * Make sure that the dquots are there. This doesn't hold
         * the ilock across a disk read.
         */
-        error = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED);
+        error = xfs_qm_dqattach_locked(ip, 0);
        if (error)
                return XFS_ERROR(error);
@@ -684,7 +682,8 @@ xfs_iomap_write_allocate(
        /*
         * Make sure that the dquots are there.
         */
-        if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
+        error = xfs_qm_dqattach(ip, 0);
+        if (error)
                return XFS_ERROR(error);
        offset_fsb = XFS_B_TO_FSBT(mp, offset);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 7ba450116d4f..47da2fb45377 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1975,16 +1975,30 @@ xlog_recover_do_reg_buffer(
                error = 0;
                if (buf_f->blf_flags &
                   (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
+                        if (item->ri_buf[i].i_addr == NULL) {
+                                cmn_err(CE_ALERT,
+                                        "XFS: NULL dquot in %s.", __func__);
+                                goto next;
+                        }
+                        if (item->ri_buf[i].i_len < sizeof(xfs_dqblk_t)) {
+                                cmn_err(CE_ALERT,
+                                        "XFS: dquot too small (%d) in %s.",
+                                        item->ri_buf[i].i_len, __func__);
+                                goto next;
+                        }
                        error = xfs_qm_dqcheck((xfs_disk_dquot_t *)
                                               item->ri_buf[i].i_addr,
                                               -1, 0, XFS_QMOPT_DOWARN,
                                               "dquot_buf_recover");
+                        if (error)
+                                goto next;
                }
-                if (!error)
-                        memcpy(xfs_buf_offset(bp,
+                memcpy(xfs_buf_offset(bp,
-                                (uint)bit << XFS_BLI_SHIFT),    /* dest */
+                        (uint)bit << XFS_BLI_SHIFT),    /* dest */
-                                item->ri_buf[i].i_addr,         /* source */
+                        item->ri_buf[i].i_addr,         /* source */
-                                nbits<<XFS_BLI_SHIFT);          /* length */
+                        nbits<<XFS_BLI_SHIFT);          /* length */
+ next:
                i++;
                bit += nbits;
        }
@@ -2615,7 +2629,19 @@ xlog_recover_do_dquot_trans(
                return (0);
        recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr;
-        ASSERT(recddq);
+        if (item->ri_buf[1].i_addr == NULL) {
+                cmn_err(CE_ALERT,
+                        "XFS: NULL dquot in %s.", __func__);
+                return XFS_ERROR(EIO);
+        }
+        if (item->ri_buf[1].i_len < sizeof(xfs_dqblk_t)) {
+                cmn_err(CE_ALERT,
+                        "XFS: dquot too small (%d) in %s.",
+                        item->ri_buf[1].i_len, __func__);
+                return XFS_ERROR(EIO);
+        }
        /*
         * This type of quotas was turned off, so ignore this record.
         */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 65a99725d0cc..5c6f092659c1 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -960,6 +960,53 @@ xfs_check_sizes(xfs_mount_t *mp)
 }
 /*
+ * Clear the quotaflags in memory and in the superblock.
+ */
+int
+xfs_mount_reset_sbqflags(
+        struct xfs_mount        *mp)
+{
+        int                     error;
+        struct xfs_trans        *tp;
+        /* The in-core quota flags are cleared unconditionally. */
+        mp->m_qflags = 0;
+        /*
+         * It is OK to look at sb_qflags here in mount path,
+         * without m_sb_lock.
+         */
+        if (mp->m_sb.sb_qflags == 0)
+                return 0;
+        spin_lock(&mp->m_sb_lock);
+        mp->m_sb.sb_qflags = 0;
+        spin_unlock(&mp->m_sb_lock);
+        /*
+         * If the fs is readonly, let the incore superblock run
+         * with quotas off but don't flush the update out to disk
+         */
+        if (mp->m_flags & XFS_MOUNT_RDONLY)
+                return 0;
+#ifdef QUOTADEBUG
+        xfs_fs_cmn_err(CE_NOTE, mp, "Writing superblock quota changes");
+#endif
+        /*
+         * Push the cleared sb_qflags out through a transaction of its
+         * own.  If the reservation fails the transaction is cancelled,
+         * an alert is logged and the reservation error is returned.
+         */
+        tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
+        error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+                                      XFS_DEFAULT_LOG_COUNT);
+        if (error) {
+                xfs_trans_cancel(tp, 0);
+                xfs_fs_cmn_err(CE_ALERT, mp,
+                        "xfs_mount_reset_sbqflags: Superblock update failed!");
+                return error;
+        }
+        /* Hand XFS_SB_QFLAGS to xfs_mod_sb(); the commit status is the result. */
+        xfs_mod_sb(tp, XFS_SB_QFLAGS);
+        return xfs_trans_commit(tp, 0);
+}
+/*
 * This function does the following on an initial mount of a file system:
 *      - reads the superblock from disk and init the mount struct
 *      - if we're a 32-bit kernel, do a size check on the superblock
@@ -976,7 +1023,8 @@ xfs_mountfs(
        xfs_sb_t        *sbp = &(mp->m_sb);
        xfs_inode_t     *rip;
        __uint64_t      resblks;
-        uint            quotamount, quotaflags;
+        uint            quotamount = 0;
+        uint            quotaflags = 0;
        int             error = 0;
        xfs_mount_common(mp, sbp);
@@ -1210,9 +1258,28 @@ xfs_mountfs(
        /*
         * Initialise the XFS quota management subsystem for this mount
         */
-        error = XFS_QM_INIT(mp, &quotamount, &quotaflags);
+        if (XFS_IS_QUOTA_RUNNING(mp)) {
-        if (error)
+                error = xfs_qm_newmount(mp, &quotamount, &quotaflags);
-                goto out_rtunmount;
+                if (error)
+                        goto out_rtunmount;
+        } else {
+                ASSERT(!XFS_IS_QUOTA_ON(mp));
+                /*
+                 * If a file system had quotas running earlier, but decided to
+                 * mount without -o uquota/pquota/gquota options, revoke the
+                 * quotachecked license.
+                 */
+                if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) {
+                        cmn_err(CE_NOTE,
+                                "XFS: resetting qflags for filesystem %s",
+                                mp->m_fsname);
+                        error = xfs_mount_reset_sbqflags(mp);
+                        if (error)
+                                return error;
+                }
+        }
        /*
         * Finish recovering the file system.  This part needed to be
@@ -1228,9 +1295,19 @@ xfs_mountfs(
        /*
         * Complete the quota initialisation, post-log-replay component.
         */
-        error = XFS_QM_MOUNT(mp, quotamount, quotaflags);
+        if (quotamount) {
-        if (error)
+                ASSERT(mp->m_qflags == 0);
-                goto out_rtunmount;
+                mp->m_qflags = quotaflags;
+                xfs_qm_mount_quotas(mp);
+        }
+#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
+        if (XFS_IS_QUOTA_ON(mp))
+                xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on");
+        else
+                xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on");
+#endif
        /*
         * Now we are mounted, reserve a small amount of unused space for
@@ -1279,12 +1356,7 @@ xfs_unmountfs(
        __uint64_t              resblks;
        int                     error;
-        /*
+        xfs_qm_unmount_quotas(mp);
-         * Release dquot that rootinode, rbmino and rsumino might be holding,
-         * and release the quota inodes.
-         */
-        XFS_QM_UNMOUNT(mp);
        xfs_rtunmount_inodes(mp);
        IRELE(mp->m_rootip);
@@ -1299,12 +1371,9 @@ xfs_unmountfs(
         * need to force the log first.
         */
        xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
-        xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_ASYNC);
+        xfs_reclaim_inodes(mp, XFS_IFLUSH_ASYNC);
-        XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING);
-        if (mp->m_quotainfo)
+        xfs_qm_unmount(mp);
-                XFS_QM_DONE(mp);
        /*
         * Flush out the log synchronously so that we know for sure
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index d6a64392f983..a5122382afde 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -64,6 +64,8 @@ struct xfs_swapext;
 struct xfs_mru_cache;
 struct xfs_nameops;
 struct xfs_ail;
+struct xfs_quotainfo;
 /*
 * Prototypes and functions for the Data Migration subsystem.
@@ -107,86 +109,6 @@ typedef struct xfs_dmops {
        (*(mp)->m_dm_ops->xfs_send_unmount)(mp,ip,right,mode,rval,fl)
-/*
- * Prototypes and functions for the Quota Management subsystem.
- */
-struct xfs_dquot;
-struct xfs_dqtrxops;
-struct xfs_quotainfo;
-typedef int     (*xfs_qminit_t)(struct xfs_mount *, uint *, uint *);
-typedef int     (*xfs_qmmount_t)(struct xfs_mount *, uint, uint);
-typedef void    (*xfs_qmunmount_t)(struct xfs_mount *);
-typedef void    (*xfs_qmdone_t)(struct xfs_mount *);
-typedef void    (*xfs_dqrele_t)(struct xfs_dquot *);
-typedef int     (*xfs_dqattach_t)(struct xfs_inode *, uint);
-typedef void    (*xfs_dqdetach_t)(struct xfs_inode *);
-typedef int     (*xfs_dqpurgeall_t)(struct xfs_mount *, uint);
-typedef int     (*xfs_dqvopalloc_t)(struct xfs_mount *,
-                        struct xfs_inode *, uid_t, gid_t, prid_t, uint,
-                        struct xfs_dquot **, struct xfs_dquot **);
-typedef void    (*xfs_dqvopcreate_t)(struct xfs_trans *, struct xfs_inode *,
-                        struct xfs_dquot *, struct xfs_dquot *);
-typedef int     (*xfs_dqvoprename_t)(struct xfs_inode **);
-typedef struct xfs_dquot * (*xfs_dqvopchown_t)(
-                        struct xfs_trans *, struct xfs_inode *,
-                        struct xfs_dquot **, struct xfs_dquot *);
-typedef int     (*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *,
-                        struct xfs_dquot *, struct xfs_dquot *, uint);
-typedef void    (*xfs_dqstatvfs_t)(struct xfs_inode *, struct kstatfs *);
-typedef int     (*xfs_dqsync_t)(struct xfs_mount *, int flags);
-typedef struct xfs_qmops {
-        xfs_qminit_t            xfs_qminit;
-        xfs_qmdone_t            xfs_qmdone;
-        xfs_qmmount_t           xfs_qmmount;
-        xfs_qmunmount_t         xfs_qmunmount;
-        xfs_dqrele_t            xfs_dqrele;
-        xfs_dqattach_t          xfs_dqattach;
-        xfs_dqdetach_t          xfs_dqdetach;
-        xfs_dqpurgeall_t        xfs_dqpurgeall;
-        xfs_dqvopalloc_t        xfs_dqvopalloc;
-        xfs_dqvopcreate_t       xfs_dqvopcreate;
-        xfs_dqvoprename_t       xfs_dqvoprename;
-        xfs_dqvopchown_t        xfs_dqvopchown;
-        xfs_dqvopchownresv_t    xfs_dqvopchownresv;
-        xfs_dqstatvfs_t         xfs_dqstatvfs;
-        xfs_dqsync_t            xfs_dqsync;
-        struct xfs_dqtrxops     *xfs_dqtrxops;
-} xfs_qmops_t;
-#define XFS_QM_INIT(mp, mnt, fl) \
-        (*(mp)->m_qm_ops->xfs_qminit)(mp, mnt, fl)
-#define XFS_QM_MOUNT(mp, mnt, fl) \
-        (*(mp)->m_qm_ops->xfs_qmmount)(mp, mnt, fl)
-#define XFS_QM_UNMOUNT(mp) \
-        (*(mp)->m_qm_ops->xfs_qmunmount)(mp)
-#define XFS_QM_DONE(mp) \
-        (*(mp)->m_qm_ops->xfs_qmdone)(mp)
-#define XFS_QM_DQRELE(mp, dq) \
-        (*(mp)->m_qm_ops->xfs_dqrele)(dq)
-#define XFS_QM_DQATTACH(mp, ip, fl) \
-        (*(mp)->m_qm_ops->xfs_dqattach)(ip, fl)
-#define XFS_QM_DQDETACH(mp, ip) \
-        (*(mp)->m_qm_ops->xfs_dqdetach)(ip)
-#define XFS_QM_DQPURGEALL(mp, fl) \
-        (*(mp)->m_qm_ops->xfs_dqpurgeall)(mp, fl)
-#define XFS_QM_DQVOPALLOC(mp, ip, uid, gid, prid, fl, dq1, dq2) \
-        (*(mp)->m_qm_ops->xfs_dqvopalloc)(mp, ip, uid, gid, prid, fl, dq1, dq2)
-#define XFS_QM_DQVOPCREATE(mp, tp, ip, dq1, dq2) \
-        (*(mp)->m_qm_ops->xfs_dqvopcreate)(tp, ip, dq1, dq2)
-#define XFS_QM_DQVOPRENAME(mp, ip) \
-        (*(mp)->m_qm_ops->xfs_dqvoprename)(ip)
-#define XFS_QM_DQVOPCHOWN(mp, tp, ip, dqp, dq) \
-        (*(mp)->m_qm_ops->xfs_dqvopchown)(tp, ip, dqp, dq)
-#define XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, dq1, dq2, fl) \
-        (*(mp)->m_qm_ops->xfs_dqvopchownresv)(tp, ip, dq1, dq2, fl)
-#define XFS_QM_DQSTATVFS(ip, statp) \
-        (*(ip)->i_mount->m_qm_ops->xfs_dqstatvfs)(ip, statp)
-#define XFS_QM_DQSYNC(mp, flags) \
-        (*(mp)->m_qm_ops->xfs_dqsync)(mp, flags)
 #ifdef HAVE_PERCPU_SB
 /*
@@ -510,8 +432,6 @@ extern int	xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
 extern int      xfs_dmops_get(struct xfs_mount *);
 extern void     xfs_dmops_put(struct xfs_mount *);
-extern int      xfs_qmops_get(struct xfs_mount *);
-extern void     xfs_qmops_put(struct xfs_mount *);
 extern struct xfs_dmops xfs_dmcore_xfs;
diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c
deleted file mode 100644
index e101790ea8e7..000000000000
--- a/fs/xfs/xfs_qmops.c
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
-#include "xfs_mount.h"
-#include "xfs_quota.h"
-#include "xfs_error.h"
-STATIC struct xfs_dquot *
-xfs_dqvopchown_default(
-        struct xfs_trans        *tp,
-        struct xfs_inode        *ip,
-        struct xfs_dquot        **dqp,
-        struct xfs_dquot        *dq)
-{
-        return NULL;
-}
-/*
- * Clear the quotaflags in memory and in the superblock.
- */
-int
-xfs_mount_reset_sbqflags(xfs_mount_t *mp)
-{
-        int                     error;
-        xfs_trans_t             *tp;
-        mp->m_qflags = 0;
-        /*
-         * It is OK to look at sb_qflags here in mount path,
-         * without m_sb_lock.
-         */
-        if (mp->m_sb.sb_qflags == 0)
-                return 0;
-        spin_lock(&mp->m_sb_lock);
-        mp->m_sb.sb_qflags = 0;
-        spin_unlock(&mp->m_sb_lock);
-        /*
-         * if the fs is readonly, let the incore superblock run
-         * with quotas off but don't flush the update out to disk
-         */
-        if (mp->m_flags & XFS_MOUNT_RDONLY)
-                return 0;
-#ifdef QUOTADEBUG
-        xfs_fs_cmn_err(CE_NOTE, mp, "Writing superblock quota changes");
-#endif
-        tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
-        if ((error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
-                                      XFS_DEFAULT_LOG_COUNT))) {
-                xfs_trans_cancel(tp, 0);
-                xfs_fs_cmn_err(CE_ALERT, mp,
-                        "xfs_mount_reset_sbqflags: Superblock update failed!");
-                return error;
-        }
-        xfs_mod_sb(tp, XFS_SB_QFLAGS);
-        error = xfs_trans_commit(tp, 0);
-        return error;
-}
-STATIC int
-xfs_noquota_init(
-        xfs_mount_t     *mp,
-        uint            *needquotamount,
-        uint            *quotaflags)
-{
-        int             error = 0;
-        *quotaflags = 0;
-        *needquotamount = B_FALSE;
-        ASSERT(!XFS_IS_QUOTA_ON(mp));
-        /*
-         * If a file system had quotas running earlier, but decided to
-         * mount without -o uquota/pquota/gquota options, revoke the
-         * quotachecked license.
-         */
-        if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) {
-                cmn_err(CE_NOTE,
-                        "XFS resetting qflags for filesystem %s",
-                        mp->m_fsname);
-                error = xfs_mount_reset_sbqflags(mp);
-        }
-        return error;
-}
-static struct xfs_qmops xfs_qmcore_stub = {
-        .xfs_qminit             = (xfs_qminit_t) xfs_noquota_init,
-        .xfs_qmdone             = (xfs_qmdone_t) fs_noerr,
-        .xfs_qmmount            = (xfs_qmmount_t) fs_noerr,
-        .xfs_qmunmount          = (xfs_qmunmount_t) fs_noerr,
-        .xfs_dqrele             = (xfs_dqrele_t) fs_noerr,
-        .xfs_dqattach           = (xfs_dqattach_t) fs_noerr,
-        .xfs_dqdetach           = (xfs_dqdetach_t) fs_noerr,
-        .xfs_dqpurgeall         = (xfs_dqpurgeall_t) fs_noerr,
-        .xfs_dqvopalloc         = (xfs_dqvopalloc_t) fs_noerr,
-        .xfs_dqvopcreate        = (xfs_dqvopcreate_t) fs_noerr,
-        .xfs_dqvoprename        = (xfs_dqvoprename_t) fs_noerr,
-        .xfs_dqvopchown         = xfs_dqvopchown_default,
-        .xfs_dqvopchownresv     = (xfs_dqvopchownresv_t) fs_noerr,
-        .xfs_dqstatvfs          = (xfs_dqstatvfs_t) fs_noval,
-        .xfs_dqsync             = (xfs_dqsync_t) fs_noerr,
-};
-int
-xfs_qmops_get(struct xfs_mount *mp)
-{
-        if (XFS_IS_QUOTA_RUNNING(mp)) {
-#ifdef CONFIG_XFS_QUOTA
-                mp->m_qm_ops = &xfs_qmcore_xfs;
-#else
-                cmn_err(CE_WARN,
-                        "XFS: qouta support not available in this kernel.");
-                return EINVAL;
-#endif
-        } else {
-                mp->m_qm_ops = &xfs_qmcore_stub;
-        }
-        return 0;
-}
-void
-xfs_qmops_put(struct xfs_mount *mp)
-{
-}
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index f5d1202dde25..3ec91ac74c2a 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -197,7 +197,6 @@ typedef struct xfs_qoff_logformat {
 #define XFS_QMOPT_UMOUNTING     0x0000100 /* filesys is being unmounted */
 #define XFS_QMOPT_DOLOG         0x0000200 /* log buf changes (in quotacheck) */
 #define XFS_QMOPT_DOWARN        0x0000400 /* increase warning cnt if needed */
-#define XFS_QMOPT_ILOCKED       0x0000800 /* inode is already locked (excl) */
 #define XFS_QMOPT_DQREPAIR      0x0001000 /* repair dquot if damaged */
 #define XFS_QMOPT_GQUOTA        0x0002000 /* group dquot requested */
 #define XFS_QMOPT_ENOSPC        0x0004000 /* enospc instead of edquot (prj) */
@@ -302,69 +301,79 @@ typedef struct xfs_dqtrx {
        long            qt_delrtb_delta;  /* delayed RT blk count changes */
 } xfs_dqtrx_t;
-/*
+#ifdef CONFIG_XFS_QUOTA
- * Dquot transaction functions, used if quota is enabled.
+extern void xfs_trans_dup_dqinfo(struct xfs_trans *, struct xfs_trans *);
- */
+extern void xfs_trans_free_dqinfo(struct xfs_trans *);
-typedef void    (*qo_dup_dqinfo_t)(struct xfs_trans *, struct xfs_trans *);
+extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *,
-typedef void    (*qo_mod_dquot_byino_t)(struct xfs_trans *,
+                uint, long);
-                                struct xfs_inode *, uint, long);
+extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *);
-typedef void    (*qo_free_dqinfo_t)(struct xfs_trans *);
+extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *);
-typedef void    (*qo_apply_dquot_deltas_t)(struct xfs_trans *);
+extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *,
-typedef void    (*qo_unreserve_and_mod_dquots_t)(struct xfs_trans *);
+                struct xfs_inode *, long, long, uint);
-typedef int     (*qo_reserve_quota_nblks_t)(
+extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
-                                struct xfs_trans *, struct xfs_mount *,
+                struct xfs_mount *, struct xfs_dquot *,
-                                struct xfs_inode *, long, long, uint);
+                struct xfs_dquot *, long, long, uint);
-typedef int     (*qo_reserve_quota_bydquots_t)(
-                                struct xfs_trans *, struct xfs_mount *,
+extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint,
-                                struct xfs_dquot *, struct xfs_dquot *,
+                struct xfs_dquot **, struct xfs_dquot **);
-                                long, long, uint);
+extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
-typedef struct xfs_dqtrxops {
+                struct xfs_dquot *, struct xfs_dquot *);
-        qo_dup_dqinfo_t                 qo_dup_dqinfo;
+extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **);
-        qo_free_dqinfo_t                qo_free_dqinfo;
+extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *,
-        qo_mod_dquot_byino_t            qo_mod_dquot_byino;
+                struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *);
-        qo_apply_dquot_deltas_t         qo_apply_dquot_deltas;
+extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *,
-        qo_reserve_quota_nblks_t        qo_reserve_quota_nblks;
+                struct xfs_dquot *, struct xfs_dquot *, uint);
-        qo_reserve_quota_bydquots_t     qo_reserve_quota_bydquots;
+extern int xfs_qm_dqattach(struct xfs_inode *, uint);
-        qo_unreserve_and_mod_dquots_t   qo_unreserve_and_mod_dquots;
+extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint);
-} xfs_dqtrxops_t;
+extern void xfs_qm_dqdetach(struct xfs_inode *);
+extern void xfs_qm_dqrele(struct xfs_dquot *);
-#define XFS_DQTRXOP(mp, tp, op, args...) \
+extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *);
-                ((mp)->m_qm_ops->xfs_dqtrxops ? \
+extern int xfs_qm_sync(struct xfs_mount *, int);
-                ((mp)->m_qm_ops->xfs_dqtrxops->op)(tp, ## args) : 0)
+extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *);
+extern void xfs_qm_mount_quotas(struct xfs_mount *);
-#define XFS_DQTRXOP_VOID(mp, tp, op, args...) \
+extern void xfs_qm_unmount(struct xfs_mount *);
-                ((mp)->m_qm_ops->xfs_dqtrxops ? \
+extern void xfs_qm_unmount_quotas(struct xfs_mount *);
-                ((mp)->m_qm_ops->xfs_dqtrxops->op)(tp, ## args) : (void)0)
+#else
-#define XFS_TRANS_DUP_DQINFO(mp, otp, ntp) \
+static inline int
-        XFS_DQTRXOP_VOID(mp, otp, qo_dup_dqinfo, ntp)
+xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid,
-#define XFS_TRANS_FREE_DQINFO(mp, tp) \
+                uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp)
-        XFS_DQTRXOP_VOID(mp, tp, qo_free_dqinfo)
+{
-#define XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, field, delta) \
+        *udqp = NULL;
-        XFS_DQTRXOP_VOID(mp, tp, qo_mod_dquot_byino, ip, field, delta)
+        *gdqp = NULL;
-#define XFS_TRANS_APPLY_DQUOT_DELTAS(mp, tp) \
+        return 0;
-        XFS_DQTRXOP_VOID(mp, tp, qo_apply_dquot_deltas)
+}
-#define XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, nblks, ninos, fl) \
+#define xfs_trans_dup_dqinfo(tp, tp2)
-        XFS_DQTRXOP(mp, tp, qo_reserve_quota_nblks, mp, ip, nblks, ninos, fl)
+#define xfs_trans_free_dqinfo(tp)
-#define XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp, ud, gd, nb, ni, fl) \
+#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta)
-        XFS_DQTRXOP(mp, tp, qo_reserve_quota_bydquots, mp, ud, gd, nb, ni, fl)
+#define xfs_trans_apply_dquot_deltas(tp)
-#define XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(mp, tp) \
+#define xfs_trans_unreserve_and_mod_dquots(tp)
-        XFS_DQTRXOP_VOID(mp, tp, qo_unreserve_and_mod_dquots)
+#define xfs_trans_reserve_quota_nblks(tp, ip, nblks, ninos, flags)      (0)
+#define xfs_trans_reserve_quota_bydquots(tp, mp, u, g, nb, ni, fl)      (0)
-#define XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, nblks, ninos, flags) \
+#define xfs_qm_vop_create_dqattach(tp, ip, u, g)
-        XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, -(nblks), -(ninos), flags)
+#define xfs_qm_vop_rename_dqattach(it)                                  (0)
-#define XFS_TRANS_RESERVE_QUOTA(mp, tp, ud, gd, nb, ni, f) \
+#define xfs_qm_vop_chown(tp, ip, old, new)                              (NULL)
-        XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp, ud, gd, nb, ni, \
+#define xfs_qm_vop_chown_reserve(tp, ip, u, g, fl)                      (0)
-                                f | XFS_QMOPT_RES_REGBLKS)
+#define xfs_qm_dqattach(ip, fl)                                         (0)
-#define XFS_TRANS_UNRESERVE_QUOTA(mp, tp, ud, gd, nb, ni, f) \
+#define xfs_qm_dqattach_locked(ip, fl)                                  (0)
-        XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp, ud, gd, -(nb), -(ni), \
+#define xfs_qm_dqdetach(ip)
+#define xfs_qm_dqrele(d)
+#define xfs_qm_statvfs(ip, s)
+#define xfs_qm_sync(mp, fl)                                             (0)
+#define xfs_qm_newmount(mp, a, b)                                       (0)
+#define xfs_qm_mount_quotas(mp)
+#define xfs_qm_unmount(mp)
+#define xfs_qm_unmount_quotas(mp)                                       (0)
+#endif /* CONFIG_XFS_QUOTA */
+#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \
+        xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags)
+#define xfs_trans_reserve_quota(tp, mp, ud, gd, nb, ni, f) \
+        xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \
                                f | XFS_QMOPT_RES_REGBLKS)
 extern int xfs_qm_dqcheck(xfs_disk_dquot_t *, xfs_dqid_t, uint, uint, char *);
 extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
-extern struct xfs_qmops xfs_qmcore_xfs;
 #endif  /* __KERNEL__ */
 #endif  /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 58f85e9cd11d..b81deea0ce19 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -166,7 +166,8 @@ xfs_rename(
        /*
         * Attach the dquots to the inodes
         */
-        if ((error = XFS_QM_DQVOPRENAME(mp, inodes))) {
+        error = xfs_qm_vop_rename_dqattach(inodes);
+        if (error) {
                xfs_trans_cancel(tp, cancel_flags);
                goto std_return;
        }
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index 36f3a21c54d2..fea68615ed23 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -41,7 +41,6 @@
 #include "xfs_ialloc.h"
 #include "xfs_attr.h"
 #include "xfs_bmap.h"
-#include "xfs_acl.h"
 #include "xfs_error.h"
 #include "xfs_buf_item.h"
 #include "xfs_rw.h"
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 8570b826fedd..66b849358e62 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -297,7 +297,7 @@ xfs_trans_dup(
        tp->t_rtx_res = tp->t_rtx_res_used;
        ntp->t_pflags = tp->t_pflags;
-        XFS_TRANS_DUP_DQINFO(tp->t_mountp, tp, ntp);
+        xfs_trans_dup_dqinfo(tp, ntp);
        atomic_inc(&tp->t_mountp->m_active_trans);
        return ntp;
@@ -628,8 +628,6 @@ xfs_trans_apply_sb_deltas(
                xfs_trans_log_buf(tp, bp, offsetof(xfs_dsb_t, sb_icount),
                                  offsetof(xfs_dsb_t, sb_frextents) +
                                  sizeof(sbp->sb_frextents) - 1);
-        tp->t_mountp->m_super->s_dirt = 1;
 }
 /*
@@ -831,7 +829,7 @@ shut_us_down:
                 * means is that we have some (non-persistent) quota
                 * reservations that need to be unreserved.
                 */
-                XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(mp, tp);
+                xfs_trans_unreserve_and_mod_dquots(tp);
                if (tp->t_ticket) {
                        commit_lsn = xfs_log_done(mp, tp->t_ticket,
                                                        NULL, log_flags);
@@ -850,10 +848,9 @@ shut_us_down:
        /*
         * If we need to update the superblock, then do it now.
         */
-        if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
+        if (tp->t_flags & XFS_TRANS_SB_DIRTY)
                xfs_trans_apply_sb_deltas(tp);
-        }
+        xfs_trans_apply_dquot_deltas(tp);
-        XFS_TRANS_APPLY_DQUOT_DELTAS(mp, tp);
        /*
         * Ask each log item how many log_vector entries it will
@@ -1058,7 +1055,7 @@ xfs_trans_uncommit(
        }
        xfs_trans_unreserve_and_mod_sb(tp);
-        XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(tp->t_mountp, tp);
+        xfs_trans_unreserve_and_mod_dquots(tp);
        xfs_trans_free_items(tp, flags);
        xfs_trans_free_busy(tp);
@@ -1183,7 +1180,7 @@ xfs_trans_cancel(
        }
 #endif
        xfs_trans_unreserve_and_mod_sb(tp);
-        XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(mp, tp);
+        xfs_trans_unreserve_and_mod_dquots(tp);
        if (tp->t_ticket) {
                if (flags & XFS_TRANS_RELEASE_LOG_RES) {
@@ -1213,7 +1210,7 @@ xfs_trans_free(
        xfs_trans_t     *tp)
 {
        atomic_dec(&tp->t_mountp->m_active_trans);
-        XFS_TRANS_FREE_DQINFO(tp->t_mountp, tp);
+        xfs_trans_free_dqinfo(tp);
        kmem_zone_free(xfs_trans_zone, tp);
 }
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 79b9e5ea5359..4d88616bde91 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -166,7 +166,7 @@ xfs_dir_ialloc(
                        xfs_buf_relse(ialloc_context);
                        if (dqinfo) {
                                tp->t_dqinfo = dqinfo;
-                                XFS_TRANS_FREE_DQINFO(tp->t_mountp, tp);
+                                xfs_trans_free_dqinfo(tp);
                        }
                        *tpp = ntp;
                        *ipp = NULL;
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 19cf90a9c762..c4eca5ed5dab 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -42,6 +42,7 @@
 #include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_bmap.h"
+#include "xfs_acl.h"
 #include "xfs_attr.h"
 #include "xfs_rw.h"
 #include "xfs_error.h"
@@ -118,7 +119,7 @@ xfs_setattr(
                 */
                ASSERT(udqp == NULL);
                ASSERT(gdqp == NULL);
-                code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, ip->i_d.di_projid,
+                code = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_d.di_projid,
                                         qflags, &udqp, &gdqp);
                if (code)
                        return code;
@@ -180,10 +181,11 @@ xfs_setattr(
                 * Do a quota reservation only if uid/gid is actually
                 * going to change.
                 */
-                if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
+                if (XFS_IS_QUOTA_RUNNING(mp) &&
-                    (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
+                    ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
+                     (XFS_IS_GQUOTA_ON(mp) && igid != gid))) {
                        ASSERT(tp);
-                        code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
+                        code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
                                                capable(CAP_FOWNER) ?
                                                XFS_QMOPT_FORCE_RES : 0);
                        if (code)       /* out of quota */
@@ -217,7 +219,7 @@ xfs_setattr(
                /*
                 * Make sure that the dquots are attached to the inode.
                 */
-                code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED);
+                code = xfs_qm_dqattach_locked(ip, 0);
                if (code)
                        goto error_return;
@@ -351,21 +353,21 @@ xfs_setattr(
                 * in the transaction.
                 */
                if (iuid != uid) {
-                        if (XFS_IS_UQUOTA_ON(mp)) {
+                        if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) {
                                ASSERT(mask & ATTR_UID);
                                ASSERT(udqp);
-                                olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
+                                olddquot1 = xfs_qm_vop_chown(tp, ip,
                                                        &ip->i_udquot, udqp);
                        }
                        ip->i_d.di_uid = uid;
                        inode->i_uid = uid;
                }
                if (igid != gid) {
-                        if (XFS_IS_GQUOTA_ON(mp)) {
+                        if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) {
                                ASSERT(!XFS_IS_PQUOTA_ON(mp));
                                ASSERT(mask & ATTR_GID);
                                ASSERT(gdqp);
-                                olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
+                                olddquot2 = xfs_qm_vop_chown(tp, ip,
                                                        &ip->i_gdquot, gdqp);
                        }
                        ip->i_d.di_gid = gid;
@@ -461,13 +463,25 @@ xfs_setattr(
        /*
         * Release any dquot(s) the inode had kept before chown.
         */
-        XFS_QM_DQRELE(mp, olddquot1);
+        xfs_qm_dqrele(olddquot1);
-        XFS_QM_DQRELE(mp, olddquot2);
+        xfs_qm_dqrele(olddquot2);
-        XFS_QM_DQRELE(mp, udqp);
+        xfs_qm_dqrele(udqp);
-        XFS_QM_DQRELE(mp, gdqp);
+        xfs_qm_dqrele(gdqp);
-        if (code) {
+        if (code)
                return code;
+        /*
+         * XXX(hch): Updating the ACL entries is not atomic vs the i_mode
+         *           update.  We could avoid this with linked transactions
+         *           and passing down the transaction pointer all the way
+         *           to attr_set.  No previous user of the generic
+         *           Posix ACL code seems to care about this issue either.
+         */
+        if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) {
+                code = -xfs_acl_chmod(inode);
+                if (code)
+                        return XFS_ERROR(code);
        }
        if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) &&
@@ -482,8 +496,8 @@ xfs_setattr(
        commit_flags |= XFS_TRANS_ABORT;
        /* FALLTHROUGH */
 error_return:
-        XFS_QM_DQRELE(mp, udqp);
+        xfs_qm_dqrele(udqp);
-        XFS_QM_DQRELE(mp, gdqp);
+        xfs_qm_dqrele(gdqp);
        if (tp) {
                xfs_trans_cancel(tp, commit_flags);
        }
@@ -739,7 +753,8 @@ xfs_free_eofblocks(
                /*
                 * Attach the dquots to the inode up front.
                 */
-                if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
+                error = xfs_qm_dqattach(ip, 0);
+                if (error)
                        return error;
                /*
@@ -1181,7 +1196,8 @@ xfs_inactive(
        ASSERT(ip->i_d.di_nlink == 0);
-        if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
+        error = xfs_qm_dqattach(ip, 0);
+        if (error)
                return VN_INACTIVE_CACHE;
        tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
@@ -1307,7 +1323,7 @@ xfs_inactive(
                /*
                 * Credit the quota account(s). The inode is gone.
                 */
-                XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
+                xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
                /*
                 * Just ignore errors at this point.  There is nothing we can
@@ -1323,11 +1339,11 @@ xfs_inactive(
                        xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: "
                                "xfs_trans_commit() returned error %d", error);
        }
        /*
         * Release the dquots held by inode, if any.
         */
-        XFS_QM_DQDETACH(mp, ip);
+        xfs_qm_dqdetach(ip);
        xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 out:
@@ -1427,8 +1443,7 @@ xfs_create(
        /*
         * Make sure that we have allocated dquot(s) on disk.
         */
-        error = XFS_QM_DQVOPALLOC(mp, dp,
+        error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
-                        current_fsuid(), current_fsgid(), prid,
                        XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
        if (error)
                goto std_return;
@@ -1489,7 +1504,7 @@ xfs_create(
        /*
         * Reserve disk quota and the inode.
         */
-        error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
+        error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
        if (error)
                goto out_trans_cancel;
@@ -1561,7 +1576,7 @@ xfs_create(
         * These ids of the inode couldn't have changed since the new
         * inode has been locked ever since it was created.
         */
-        XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
+        xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
        /*
         * xfs_trans_commit normally decrements the vnode ref count
@@ -1580,8 +1595,8 @@ xfs_create(
                goto out_dqrele;
        }
-        XFS_QM_DQRELE(mp, udqp);
+        xfs_qm_dqrele(udqp);
-        XFS_QM_DQRELE(mp, gdqp);
+        xfs_qm_dqrele(gdqp);
        *ipp = ip;
@@ -1602,8 +1617,8 @@ xfs_create(
 out_trans_cancel:
        xfs_trans_cancel(tp, cancel_flags);
 out_dqrele:
-        XFS_QM_DQRELE(mp, udqp);
+        xfs_qm_dqrele(udqp);
-        XFS_QM_DQRELE(mp, gdqp);
+        xfs_qm_dqrele(gdqp);
        if (unlock_dp_on_error)
                xfs_iunlock(dp, XFS_ILOCK_EXCL);
@@ -1837,11 +1852,11 @@ xfs_remove(
                        return error;
        }
-        error = XFS_QM_DQATTACH(mp, dp, 0);
+        error = xfs_qm_dqattach(dp, 0);
        if (error)
                goto std_return;
-        error = XFS_QM_DQATTACH(mp, ip, 0);
+        error = xfs_qm_dqattach(ip, 0);
        if (error)
                goto std_return;
@@ -2028,11 +2043,11 @@ xfs_link(
        /* Return through std_return after this point. */
-        error = XFS_QM_DQATTACH(mp, sip, 0);
+        error = xfs_qm_dqattach(sip, 0);
        if (error)
                goto std_return;
-        error = XFS_QM_DQATTACH(mp, tdp, 0);
+        error = xfs_qm_dqattach(tdp, 0);
        if (error)
                goto std_return;
@@ -2205,8 +2220,7 @@ xfs_symlink(
        /*
         * Make sure that we have allocated dquot(s) on disk.
         */
-        error = XFS_QM_DQVOPALLOC(mp, dp,
+        error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
-                        current_fsuid(), current_fsgid(), prid,
                        XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
        if (error)
                goto std_return;
@@ -2248,7 +2262,7 @@ xfs_symlink(
        /*
         * Reserve disk quota : blocks and inode.
         */
-        error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
+        error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, resblks, 1, 0);
        if (error)
                goto error_return;
@@ -2288,7 +2302,7 @@ xfs_symlink(
        /*
         * Also attach the dquot(s) to it, if applicable.
         */
-        XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
+        xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
        if (resblks)
                resblks -= XFS_IALLOC_SPACE_RES(mp);
@@ -2376,8 +2390,8 @@ xfs_symlink(
                goto error2;
        }
        error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-        XFS_QM_DQRELE(mp, udqp);
+        xfs_qm_dqrele(udqp);
-        XFS_QM_DQRELE(mp, gdqp);
+        xfs_qm_dqrele(gdqp);
        /* Fall through to std_return with error = 0 or errno from
         * xfs_trans_commit     */
@@ -2401,8 +2415,8 @@ std_return:
        cancel_flags |= XFS_TRANS_ABORT;
 error_return:
        xfs_trans_cancel(tp, cancel_flags);
-        XFS_QM_DQRELE(mp, udqp);
+        xfs_qm_dqrele(udqp);
-        XFS_QM_DQRELE(mp, gdqp);
+        xfs_qm_dqrele(gdqp);
        if (unlock_dp_on_error)
                xfs_iunlock(dp, XFS_ILOCK_EXCL);
@@ -2541,7 +2555,8 @@ xfs_alloc_file_space(
        if (XFS_FORCED_SHUTDOWN(mp))
                return XFS_ERROR(EIO);
-        if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
+        error = xfs_qm_dqattach(ip, 0);
+        if (error)
                return error;
        if (len <= 0)
@@ -2628,8 +2643,8 @@ retry:
                        break;
                }
                xfs_ilock(ip, XFS_ILOCK_EXCL);
-                error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip,
+                error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
-                                                      qblocks, 0, quota_flag);
+                                                      0, quota_flag);
                if (error)
                        goto error1;
@@ -2688,7 +2703,7 @@ dmapi_enospc_check:
 error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
        xfs_bmap_cancel(&free_list);
-        XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag);
+        xfs_trans_unreserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
 error1: /* Just cancel transaction */
        xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
@@ -2827,7 +2842,8 @@ xfs_free_file_space(
        xfs_itrace_entry(ip);
-        if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
+        error = xfs_qm_dqattach(ip, 0);
+        if (error)
                return error;
        error = 0;
@@ -2953,9 +2969,9 @@ xfs_free_file_space(
                        break;
                }
                xfs_ilock(ip, XFS_ILOCK_EXCL);
-                error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
+                error = xfs_trans_reserve_quota(tp, mp,
-                                ip->i_udquot, ip->i_gdquot, resblks, 0,
+                                ip->i_udquot, ip->i_gdquot,
-                                XFS_QMOPT_RES_REGBLKS);
+                                resblks, 0, XFS_QMOPT_RES_REGBLKS);
                if (error)
                        goto error1;
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 04373c6c61ff..a9e102de71a1 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -18,6 +18,7 @@ int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags);
 #define XFS_ATTR_DMI            0x01    /* invocation from a DMI function */
 #define XFS_ATTR_NONBLOCK       0x02    /* return EAGAIN if operation would block */
 #define XFS_ATTR_NOLOCK         0x04    /* Don't grab any conflicting locks */
+#define XFS_ATTR_NOACL          0x08    /* Don't call xfs_acl_chmod */
 int xfs_readlink(struct xfs_inode *ip, char *link);
 int xfs_fsync(struct xfs_inode *ip);